count instances of duplicates in a column of a .csv file

count instances of duplicates in a column of a .csv file - python

import re, csv
import os,shutil
import io,json, collections
from collections import Counter, defaultdict,deque
# Report per-value counts for the first column of remove_duplicates.csv, plus
# the total number of rows and the number of 'pass' / 'fail' results.
sn = 0  # 1st column
p_f = 1  # 2nd column
CSV_PATH = "C:/Users/gurbir.sahota/Documents/python_csv_file_program/remove_duplicates.csv"
FINAL_PATH = "C:/Users/gurbir.sahota/Documents/python_csv_file_program/final.csv"

# final.csv is created (or truncated) here, as before; nothing is written to
# it yet. The `with` block guarantees the handle is closed.
with open(FINAL_PATH, "w", newline="") as out_fh:
    f = csv.writer(out_fh)

# Read the input once and reuse the rows, instead of reopening the file for
# every statistic and leaving the handles open.
with open(CSV_PATH, "r", newline="") as infh:
    rows = list(csv.reader(infh))

seen = defaultdict(set)
counts = Counter(
    row[sn]
    for row in rows[1:]  # skip header
    # NOTE(review): the set is filled with row[sn], not row[p_f], so the
    # "not in seen" test almost never filters anything and every non-empty
    # value in the first column is counted. Confirm whether row[p_f] was meant.
    if row[sn] and row[p_f] not in seen[row[sn]] and not seen[row[sn]].add(row[sn])
)
print(counts.most_common())
#want to count instances of the number 2 in [('VFGRP15040030', 2), ('VFGRP15370118', 2), ('VFGRP15150113', 2)]
x = len(rows)
print('# of rows including header=')
print(x)
count_pass = sum(1 for row in rows if row[1] == 'pass')
print('# of passes=')
print(count_pass)
count_fail = sum(1 for row in rows if row[1] == 'fail')
print('# of fails=')
print(count_fail)
#count_retest = ??

# to get duplicates and their frequency for a column
import csv
from collections import Counter
from operator import itemgetter

with open('data.csv', 'r', newline='') as f:
    r = csv.reader(f)
    # here we take as example column number 1
    cn = Counter(map(itemgetter(1), r))

# print item that appears more than once in the column
for k, v in cn.items():
    if v > 1:
        print(k, v)

Related

How to count the characters from the csv?

My CSV have below data
['value']
['abcd']
['def abc']
I want to count each character and list the counts in descending order; "value" is the header in the CSV file. I have written one script below. Is there a better script than this?
from collections import Counter
from csv import DictReader

# Collect every entry of the "value" column of name.csv.
with open("name.csv") as f:
    a1 = [row["value"] for row in DictReader(f)]

# Tally each character across all collected values.
counts = Counter()
for line in a1:
    counts.update(line)

# most_common() yields (character, count) pairs from most to least frequent,
# keeping first-seen order for ties.
for w, total in counts.most_common():
    print(w, total)

from collections import defaultdict

# Count every character in name.csv (header line included) and print the
# characters from most to least frequent.
path = "name.csv"
d_list = defaultdict(int)
with open(path, 'r') as fl:
    for word in fl:
        for ch in word:
            # Accumulate into d_list; the name `dd` is not defined until the
            # sort below.
            d_list[ch] += 1

# Newlines and spaces are not reported. pop() with a default avoids the
# KeyError that `del` raises when the file contains none of them.
d_list.pop('\n', None)
d_list.pop(' ', None)

dd = sorted(d_list.items(), key=lambda v: v[1], reverse=True)
# Alternative: break ties alphabetically.
#dd_lex = sorted(dd, key = lambda k: (-k[1],k[0]))
for el in dd:
    print(el[0] + ' ' + str(el[1]))

Sum values in column B for lines with matching column A values

Extract from large csv looks like this:
Description,Foo,Excl,GST,Incl
A,foo,$154.52,$15.44,$169.96
A,foo,$45.44,$4.54,$49.98
A,foo,$45.44,$4.54,$49.98
A,foo,$154.52,$15.44,$169.96
A,foo,$0.00,$0.00,$0.00
A,foo,$50.16,$5.02,$55.18
B,foo,$175.33,$15.65,$190.98
C,foo,$204.52,$15.44,$219.96
D,foo,$154.52,$15.44,$169.96
D,foo,$154.52,$15.44,$169.96
D,foo,$45.44,$4.54,$49.98
D,foo,$154.52,$15.44,$169.96
D,foo,$145.44,$14.54,$159.98
I need to strip the dollar sign and for all lines containing matching Description values (A or B or whatever it may be), sum the Excl column values separately, the GST column values separately and Incl column values separately for that Description value.
End result should be a dictionary object containing the Description column as key and the sum totals of the Excl, GST and Incl columns matching the Description, example:
{
"A": [450.08,44.98,495.06],
"B": [175.33,15.65,190.98],
"C": [204.52,15.44,219.96],
"D": [654.44,65.40,719.84]
}
I'm completely stumped on how to perform the sum operation. My code only goes as far as opening the csv and reading in values on each line. Any enlightenment is appreciated.
import csv
def getField(rowdata, index):
    """Return ``rowdata[index]``, or the string ``'-1'`` if the index is out of range.

    Args:
        rowdata: An indexable row, e.g. a list produced by ``csv.reader``.
        index: Position of the wanted field.

    Returns:
        The field at ``index``, or ``'-1'`` when the row is too short.
    """
    try:
        val = rowdata[index]
    except IndexError:
        val = '-1'
    return val
# open() needs a file path; passing the csv module itself raises TypeError.
csv_path = "Path to CSV_File.csv"  # placeholder: set this to the real CSV location
with open(csv_path, 'r', newline='') as f:
    reader = csv.reader(f)
    order_list = list(reader)

# Remove the header row in csv (slicing also copes with an empty file)
order_list = order_list[1:]
for row in order_list:
    Desc = getField(row, 0)
    Excl = getField(row, 2)
    GST = getField(row, 3)
    Incl = getField(row, 4)

This might help
import csv
import decimal
path = "Path to CSV_File.csv"
def removeSym(s):
    """Convert a currency string such as ``"$154.52"`` to a float.

    Dollar signs and thousands separators (``,``) are removed before
    conversion; surrounding whitespace is tolerated by ``float``.

    Raises:
        ValueError: If the remaining text is not a valid number.
    """
    return float(s.replace("$", "").replace(",", ""))
# Sum the numeric columns (index 2 onward) per value of the first column.
with open(path, 'r', newline='') as f:
    reader = csv.reader(f)
    order_list = list(reader)

d = {}
for i in order_list[1:]:  # Skip reading the first line
    if not i:
        continue  # csv.reader yields [] for blank lines
    # Materialise as a list: in Python 3 map() is a one-shot iterator, which
    # would leave an exhausted/opaque object for keys that occur only once.
    amounts = [removeSym(cell) for cell in i[2:]]
    if i[0] not in d:
        # First time this description is seen: start its totals.
        d[i[0]] = amounts
    else:
        # Add column-wise to the running totals, rounding to cents each step.
        d[i[0]] = [float(round(sum(k), 2)) for k in zip(d[i[0]], amounts)]
print(d)
Output:
{'A': [450.08, 44.98, 495.06], 'C': [204.52, 15.44, 219.96], 'B': [175.33, 15.65, 190.98], 'D': [654.44, 65.4, 719.84]}

python to list a file

This script is actually not working with the desired input
script:
import csv

# Load both inputs fully and close them again straight away.
with open("1.csv", newline="") as fh1, open("2.csv", newline="") as fh2:
    k, l = list(csv.reader(fh1)), list(csv.reader(fh2))
length_file1 = len(k)
length_file2 = len(l)
n = []
with open("3.csv", "w") as file3:
    # Header of 2.csv is copied through unchanged.
    file3.write(",".join(str(i) for i in l[0]) + '\n')
    for i in range(1, length_file1):
        arr = k[i][1]
        for j in range(1, length_file2):
            arr2 = l[j][1]
            if arr == arr2:
                # Second columns match: copy the first column of 1.csv into
                # the matching row of 2.csv and emit that row.
                l[j][0] = k[i][0]
                print(l[j])
                n.append(l[j])
                file3.write(",".join(str(i) for i in l[j]) + '\n')
so i want the code to be replaced

You can create a dictionary with the key:value pairs from 1.csv and compare each value in 2.csv with the keys of the dictionary. This uses Python 3; there is no need to use range or xrange here, since you can iterate over the lists directly.
import csv

with open("2.csv", 'r', newline='') as f:
    # Keep only the second column; this removes the blank item at the start of each row
    file2 = [j for _, j in csv.reader(f)]

with open("1.csv", 'r', newline='') as f:
    # Map second column -> first column of 1.csv for direct lookup
    file1 = {i: j for j, i in csv.reader(f)}

# Emit "first,second" for every value of 2.csv that also occurs in 1.csv.
toWrite = []
for i in file2:
    if i in file1:
        toWrite.append("{},{}".format(file1[i], i))

with open("bdsp_updated.csv", "w") as f:
    f.write('\n'.join(toWrite))
Content of bdsp_updated.csv:
1,99277050
10,92782013
2,71269815
3,99724582
7,92043333
4,92011116
8,99799635

Python dictionary created from CSV file should merge the value (integer) whenever the key repeats

I have a file named report_data.csv that contains the following:
user,score
a,10
b,15
c,10
a,10
a,5
b,10
I am creating a dictionary from this file using this code:
with open('report_data.csv') as f:
    f.readline()  # Skip over the column titles
    # dict() keeps only the last value seen for each repeated key
    mydict = dict(csv.reader(f, delimiter=','))
After running this code mydict is:
mydict = {'a':5,'b':10,'c':10}
But I want it to be:
mydict = {'a':25,'b':25,'c':10}
In other words, whenever a key that already exists in mydict is encountered while reading a line of the file, the new value in mydict associated with that key should be the sum of the old value and the integer that appears on that line of the file. How can I do this?

The most straightforward way is to use defaultdict or Counter from the useful collections module.
from collections import Counter

# Sum the score column per user; Counter supplies 0 for unseen users.
summary = Counter()
with open('report_data.csv') as f:
    f.readline()  # skip the header line
    for line in f:
        if not line.strip():
            continue  # a blank (e.g. trailing) line would break the unpacking below
        lbl, n = line.split(",")
        summary[lbl] += int(n)
One of the most useful features in Counter class is the most_common() function, that is absent from the plain dictionaries and from defaultdict

This should work for you:
import csv

# Sum the second column per value of the first column.
with open('report_data.csv', newline='') as f:
    f.readline()  # skip the header line
    mydict = {}
    for line in csv.reader(f, delimiter=','):
        if not line:
            continue  # csv.reader yields [] for blank lines
        # get() supplies 0 the first time a key is seen
        mydict[line[0]] = mydict.get(line[0], 0) + int(line[1])

try this.
import csv

# Sum the second column per value of the first column.
mydict = {}
with open('report_data.csv', newline='') as f:
    f.readline()  # skip the header line
    for x1 in csv.reader(f, delimiter=','):
        if not x1:
            continue  # csv.reader yields [] for blank lines
        if x1[0] in mydict:
            mydict[x1[0]] += int(x1[1])
        else:
            mydict[x1[0]] = int(x1[1])
print(mydict)

Creating a dictionary

my goal is to create a dictionary in Python. I have a .csv file which contains two columns, first one being 'word', other being 'meaning'. I am trying to read the csv file in the dictionary format and get the 'meaning' when 'word' is given.
Can you please help me by telling me how to get the value of 'word'? this is what I tried:
My code is:
>>> with open('wordlist.csv', mode = 'r') as infile:
... reader = csv.reader(infile)
... with open('wordlist.csv', mode = 'w') as outfile:
... writer = csv.writer(outfile)
... mydict = {rows[0]:rows[1] for rows in reader}
... print(mydict)
...
The result turns out to be,
{}
the next one I tried was,
>>> reader = csv.reader(open('wordlist.csv', 'r'))
>>> d = {}
>>> for row in reader:
... k, v = row
... d[k] = v
...
But when I wanted to use this, the result was like this-
>>> d['Try']
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
KeyError: 'Try'
The next code I tried was,
>>> reader = csv.DictReader(open('wordlist.csv'))
>>> result = {}
>>> for row in reader:
... key = row.pop('word')
... if key in result:
... pass
... result[key] = row
... print result
...
It didn't give me any answer at all.
>>> for row in reader:
... for column, value in row.iteritems():
... result.setdefault(column, []).append(value)
... print result
...
Neither did this give me a result.

I would use pandas. You could then use zip to create the dictionaries.
import pandas as pd

# Load the word list; the "word" and "meaning" columns are read below.
df = pd.read_csv('wordlist.csv')
words = list(df["word"])
# Pair each word with its meaning to build the lookup dictionary.
meaning = dict(zip(df["word"], df["meaning"]))
If your file doesn't have a header row, that is OK. Just print out the columns: each column is still given some name, which can then be referenced.
Alternative:
import pandas as pd

df = pd.read_csv('wordlist.csv')
dictionary = {}
# zip() over two columns yields pairs, so unpack exactly two names; the
# columns are "word" and "meaning", as in the question.
for w, m in zip(df.word, df.meaning):
    dictionary[w] = m

If "final_word.csv" looks like this:
word1, synonym1, meaning1, POS_tag1
word2, synonym2, meaning2, POS_tag2
This will read it in as a dictionary:
with open("final_word.csv", 'r') as f:
    rows = f.readlines()

# Map each word to its [synonym, meaning, POS_tag] list.
dictionary = {}
for row in rows:
    row = row.strip()
    if not row:
        continue  # a blank (e.g. trailing) line cannot be unpacked into four fields
    word, synonym, meaning, POS_tag = row.split(", ")
    dictionary[word] = [synonym, meaning, POS_tag]

print(dictionary['word1'])
#out>> ['synonym1', 'meaning1', 'POS_tag1']
print(dictionary['word2'][0])
#out>> synonym2
The strip() is used to get rid of the newline "\n" at the end of each CSV row.

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

count instances of duplicates in a column of a .csv file - python

Related

How to count the characters from the csv?

Sum values in column B for lines with matching column A values

python to list a file

Python dictionary created from CSV file should merge the value (integer) whenever the key repeats

Creating a dictionary

Categories

Resources