Sum of a particular column in a csv file - python

There is a CSV file, say A.csv, with the following content:
Place,Hotel,Food,Fare
Norway,Regal,NonVeg,5000
Poland,Jenny,Italiano,6000
Norway,Suzane,Vegeterian,4000
Norway,Regal,NonVeg,5000
I have to parse this CSV and produce the output by passing arguments at the command prompt.
Example 1:
mycode.py Place
Desired output is:
Place,Fare
Norway,14000
Poland,6000
Example 2:
mycode.py Place Hotel
Desired output is:
Place,Hotel,Fare
Norway,Regal,10000
Poland,Jenny,6000
Norway,Suzane,4000
So, as the examples show, whichever columns you pass as arguments, the output groups the rows that share those column values and sums their Fare column.
Below is my code. I am able to pass arguments and get output, but I am stuck on summing the Fare column. Can anyone help me with this?
import sys
import csv
import collections

d = collections.defaultdict(list)
Data = []
Result = []
Final = []
Argvs = []
argv_len = len(sys.argv)
index = 0
input = ''
file = open('A.csv', 'rb')
try:
    reader = csv.reader(file)
    for row in reader:
        Data.append(row)
    for x in range(1, argv_len):
        Argvs.append(sys.argv[x])
    Argvs.append('Fare')
    for input in Argvs:
        for y in range(0, len(Data[0])):
            if(input == Data[0][y]):
                for z in range(1, len(Data)):
                    Result.append(Data[z][y])
                break
        Final.append(Result)
        Result = []
    New = []
    NewFinal = []
    for x in range(0, len(Final[0])):
        for y in range(0, len(Final)):
            New.append(Final[y][x])
        NewFinal.append(New)
        New = []
    out = {}
    for a in NewFinal:
        out.setdefault(a[0], []).append(int(a[-1]))
    with open("output.csv", "wb") as csv_file:
        writer = csv.writer(csv_file, dialect='excel', delimiter=',')
        writer.writerow(Argvs)
        for k, v in out.iteritems():
            writer.writerow((k, sum(v)))
except Exception, e:
    print str(e)
finally:
    file.close()
I edited the code and tried to group the rows. Now I am able to get the aggregate of Fare, but not the desired output.
So when I am passing:
mycode.py Place Hotel
Instead of:
Place,Hotel,Fare
Norway,Regal,10000
Poland,Jenny,6000
Norway,Suzane,4000
I am getting:
Place,Hotel,Fare
Norway,14000
Poland,6000

Finally, I managed to get my desired output.
Below I am sharing the final code.
import sys
import csv

Data = []
Result = []
Final = []
Argvs = []
argv_len = len(sys.argv)
index = 0
input = ''
file = open('A.csv', 'rb')
try:
    reader = csv.reader(file)
    for row in reader:
        Data.append(row)
    for x in range(1, argv_len):
        Argvs.append(sys.argv[x])
    Argvs.append('Fare')
    for input in Argvs:
        for y in range(0, len(Data[0])):
            if(input == Data[0][y]):
                for z in range(1, len(Data)):
                    Result.append(Data[z][y])
                break
        Final.append(Result)
        Result = []
    New = []
    NewFinal = []
    for x in range(0, len(Final[0])):
        for y in range(0, len(Final)):
            New.append(Final[y][x])
        NewFinal.append(New)
        New = []
    out = {}
    for a in NewFinal:
        count_val = a[-1]
        del a[-1]
        key_val = ','.join(a)
        out.setdefault(key_val.strip('"'), []).append(int(count_val))
    with open("output.csv", "wb") as csv_file:
        writer = csv.writer(csv_file, delimiter=',', quotechar=' ')
        writer.writerow(Argvs)
        for k, v in out.iteritems():
            writer.writerow((k, sum(v)))
except Exception, e:
    print str(e)
finally:
    file.close()
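As an aside, the same grouping can be written much more compactly with csv.DictReader and a defaultdict. This is only a sketch, assuming Python 3 and the same A.csv and argument convention as above:

import sys
import csv
from collections import defaultdict

# Group rows by the columns named on the command line, summing Fare.
group_cols = sys.argv[1:]
totals = defaultdict(int)

with open('A.csv', newline='') as f:
    for row in csv.DictReader(f):
        key = tuple(row[col] for col in group_cols)
        totals[key] += int(row['Fare'])

with open('output.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(group_cols + ['Fare'])
    for key, fare in totals.items():
        writer.writerow(list(key) + [fare])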

Related

How do I find the mean

def get_mean_temperature(filename):
    with open(filename) as f:
        lst = f.read().splitlines()
    lst.pop(0)
    result = 0
    count = 0
    for element in lst:
        count += 1
        el = int(element[6:])
        result += el
        print(result)
    mn_tem = result / count
    return mmn_tem

if __name__ == "__main__":
    filename = "temp_log.txt"
    with open(filename, "w") as f:
        f.write("DATES T.\n07-01 28.0\n08-01 33.5\n09-01 27.0\n")
    mean_temperature = get_mean_temperature(filename)
    print(f"{mean_temperature:.1f}")
This is the code that I am trying to fix. What I have to do here is find the mean of the temperatures given in the text file, which in this case is "DATES T.\n07-01 28.0\n08-01 33.5\n09-01 27.0\n"
Each line is formatted as MM-DD TT.T.
Please help me get this code to work.
from statistics import mean
data = "DATES T.\n07-01 28.0\n08-01 33.5\n09-01 27.0\n"
temperatures = [float(item.split()[1]) for item in data.split("\n")[1:] if item]
temperatures_mean = mean(temperatures)
print(temperatures)
print(temperatures_mean)
Output:
[28.0, 33.5, 27.0]
29.5
Or, as your original function:
from statistics import mean

def get_mean_temperature(filepath):
    with open(filepath, "r") as f:
        data = f.read()
    temperatures = [float(item.split()[1]) for item in data.split("\n")[1:] if item]
    return mean(temperatures)
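For completeness, calling this on the temp_log.txt written by the question's __main__ block prints 29.5:

print(f"{get_mean_temperature('temp_log.txt'):.1f}")  # (28.0 + 33.5 + 27.0) / 3 = 29.5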

CS50 PSET6 - DNA - Works fine on SMALL but not for LARGE database

I'm taking CS50 and got stuck on pset6.
My code works fine on the given 'small' database, but on the 'large' one I get wrong values in my DNA sequence counts.
For example, using debug50 I found that the Albus sequence should be 15, 49, 38, ... while my counts are 21, 55, 64, ...
What's wrong? And why does it work fine on the small database but not on the large one?
Thanks for the help!
# Import ARGV and CSV library
from sys import argv, exit
import pandas as pd
import csv

# Check if argv has 3 arguments (program name, csv file and dna sequence)
while True:
    if len(argv) != 3:
        print("Usage: python dna.py data.csv sequence.txt")
        exit(1)
    else:
        break

with open(argv[2], 'r', encoding="UTF-8") as txt:
    dna_seq = txt.read()

# Find the number of STRs - AGATC,TTTTTTCT,AATG,TCTAG,GATA,TATC,GAAA,TCTG
AGATC = dna_seq.count("AGATC")
TTTTTTCT = dna_seq.count("TTTTTTCT")
AATG = dna_seq.count("AATG")
TCTAG = dna_seq.count("TCTAG")
GATA = dna_seq.count("GATA")
TATC = dna_seq.count("TATC")
GAAA = dna_seq.count("GAAA")
TCTG = dna_seq.count("TCTG")

name = 0
if argv[1] == "databases/small.csv":
    with open(argv[1], 'r') as csv_file:
        reader = csv.DictReader(csv_file)
        for row in reader:
            dna_db = row['name'], row['AGATC'], row['AATG'], row['TATC']
            dna_db = list(dna_db)
            seq = [AGATC, AATG, TATC]
            seq = [str(x) for x in seq]
            if dna_db[1:4] == seq:
                name = dna_db[:1]
                break
            else:
                name = "No match"
elif argv[1] == "databases/large.csv":
    with open(argv[1], 'r') as csv_file:
        reader = csv.DictReader(csv_file)
        for row in reader:
            dna_db = (row['name'], row['AGATC'], row['TTTTTTCT'], row['AATG'], row['TCTAG'],
                      row['GATA'], row['TATC'], row['GAAA'], row['TCTG'])
            dna_db = list(dna_db)
            seq = [AGATC, TTTTTTCT, AATG, TCTAG, GATA, TATC, GAAA, TCTG]
            seq = [str(x) for x in seq]
            if dna_db[1:9] == seq:
                name = dna_db[:1]
                break
            else:
                name = "No match"
print(name)
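For what it's worth, a likely cause: str.count totals every occurrence of each STR anywhere in the sequence, while the CS50 spec matches people on the longest run of consecutive repeats of each STR. The two can happen to agree on the short sequences in the small database and then diverge on the large one. A minimal sketch of run counting (not the asker's code, just an illustration):

def longest_run(sequence, pattern):
    # Longest number of back-to-back repeats of pattern in sequence.
    longest = 0
    for i in range(len(sequence)):
        count = 0
        # extend the run while the pattern keeps repeating contiguously
        while sequence[i + count * len(pattern):i + (count + 1) * len(pattern)] == pattern:
            count += 1
        longest = max(longest, count)
    return longest

# e.g. longest_run(dna_seq, "AGATC") instead of dna_seq.count("AGATC")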

My function to extract totals is exhausting my input file for future reading

The client includes 3 rows at the bottom of the file that contain totals for me to reconcile against in my program. The only problem is that my program exhausts the input file with readlines() before it can do anything else. Is there a way to keep the file from being exhausted during my get_recon_totals function call?
#!/usr/bin/env python
# pre_process.py
import csv
import sys

def main():
    infile = sys.argv[1]
    outfile = sys.argv[2]
    with open(infile, 'rbU') as in_obj:
        # Create reader object, get fieldnames for later on
        reader, fieldnames = open_reader(in_obj)
        nav_tot_cnt, nav_rec_cnt, nav_erec_cnt = get_recon_totals(in_obj)
        print nav_tot_cnt, nav_rec_cnt, nav_erec_cnt
        # This switches the dictionary to a sorted list... necessary??
        reader_list = sorted(reader, key=lambda key: (key['PEOPLE_ID'],
                                                      key['DON_DATE']))
        # Create a list to contain section header information
        header_list = create_header_list(reader_list)
        # Create dictionary that contains header list as the key,
        # then all rows that match as a list of dictionaries.
        master_dict = map_data(header_list, reader_list)
        # Write data to processed file, create recon counts to compare
        # to footer record
        tot_cnt, rec_cnt, erec_cnt = write_data(master_dict, outfile, fieldnames)
        print tot_cnt, rec_cnt, erec_cnt

def open_reader(file_obj):
    '''
    Uses DictReader from the csv module to take the first header line
    as the fieldnames, then applies them to each element in the file.
    Returns the DictReader object and the fieldnames being used (used
    later when data is printed out with DictWriter.)
    '''
    reader = csv.DictReader(file_obj, delimiter=',')
    return reader, reader.fieldnames

def create_header_list(in_obj):
    p_id_list = []
    for row in in_obj:
        if (row['PEOPLE_ID'], row['DON_DATE']) not in p_id_list:
            p_id_list.append((row['PEOPLE_ID'], row['DON_DATE']))
    return p_id_list

def map_data(header_list, data_obj):
    master_dict = {}
    client_section_list = []
    for element in header_list:
        for row in data_obj:
            if (row['PEOPLE_ID'], row['DON_DATE']) == element:
                client_section_list.append(row)
        element = list(element)
        element_list = [client_section_list[0]['DEDUCT_AMT'],
                        client_section_list[0]['ND_AMT'],
                        client_section_list[0]['DEDUCT_YTD'],
                        client_section_list[0]['NONDEDUCT_YTD']
                        ]
        try:
            element_list.append((float(client_section_list[0]['DEDUCT_YTD']) +
                                 float(client_section_list[0]['NONDEDUCT_YTD'])
                                 ))
        except ValueError:
            pass
        element.extend(element_list)
        element = tuple(element)
        master_dict[element] = client_section_list
        client_section_list = []
    return master_dict

def write_data(in_obj, outfile, in_fieldnames):
    with open(outfile, 'wb') as writer_outfile:
        writer = csv.writer(writer_outfile, delimiter=',')
        dict_writer = csv.DictWriter(writer_outfile,
                                     fieldnames=in_fieldnames,
                                     extrasaction='ignore')
        tot_cnt = 0
        rec_cnt = 0
        email_cnt = 0
        for k, v in in_obj.iteritems():
            writer_outfile.write(' -01- ')
            writer.writerow(k)
            rec_cnt += 1
            for i, e in enumerate(v):
                if v[i]['INT_CODE_EX0006'] != '' or v[i]['INT_CODE_EX0028'] != '':
                    email_cnt += 1
                writer_outfile.write(' -02- ')
                dict_writer.writerow(e)
                tot_cnt += 1
    return tot_cnt, rec_cnt, email_cnt

def get_recon_totals(in_obj):
    print in_obj
    client_tot_cnt = 0
    client_rec_cnt = 0
    client_erec_cnt = 0
    for line in in_obj.readlines():
        line = line.split(',')
        if line[0] == 'T' and line[1] == 'Total Amount':
            print 'Total Amount found.'
            client_tot_cnt = line[2]
        elif line[0] == 'T' and line[1] == 'Receipt Count':
            print 'Receipt Count found.'
            client_rec_cnt = line[2]
        elif line[0] == 'T' and line[1] == 'Email Receipt Count':
            print 'E-Receipt Count Found.'
            client_erec_cnt = line[2]
    return client_tot_cnt, client_rec_cnt, client_erec_cnt

if __name__ == '__main__':
    main()
If your file is not very large, you can convert the reader generator to a list of dictionaries by calling list() on the reader, and then use that list in your code instead of trying to read from the file directly.
Example -
def main():
    infile = sys.argv[1]
    outfile = sys.argv[2]
    with open(infile, 'rbU') as in_obj:
        # Create reader object, get fieldnames for later on
        reader, fieldnames = open_reader(in_obj)
        reader_list = list(reader)
        nav_tot_cnt, nav_rec_cnt, nav_erec_cnt = get_recon_totals(reader_list)
        print nav_tot_cnt, nav_rec_cnt, nav_erec_cnt
        # This switches the dictionary to a sorted list... necessary??
        reader_list = sorted(reader_list, key=lambda key: (key['PEOPLE_ID'],
                                                           key['DON_DATE']))
        .
        .

def get_recon_totals(reader_list):
    print reader_list
    client_tot_cnt = 0
    client_rec_cnt = 0
    client_erec_cnt = 0
    for line in reader_list:  # line here is a dict
        if line[<fieldname for first column>] == 'T' and line[<fieldname for second column>] == 'Total Amount':
            print 'Total Amount found.'
            client_tot_cnt = line[<fieldname for third column>]
        .
        .  # continued like above
        .
    return client_tot_cnt, client_rec_cnt, client_erec_cnt
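Another option, if materializing the whole file in memory is undesirable: the open file object is seekable, so get_recon_totals can simply rewind it. A sketch under that assumption (the next consumer must likewise seek back and skip the header before re-reading):

def get_recon_totals(in_obj):
    in_obj.seek(0)  # rewind so earlier reads by the csv reader don't matter
    client_tot_cnt = client_rec_cnt = client_erec_cnt = 0
    for line in in_obj:
        fields = line.rstrip('\r\n').split(',')
        if len(fields) < 3 or fields[0] != 'T':
            continue
        if fields[1] == 'Total Amount':
            client_tot_cnt = fields[2]
        elif fields[1] == 'Receipt Count':
            client_rec_cnt = fields[2]
        elif fields[1] == 'Email Receipt Count':
            client_erec_cnt = fields[2]
    in_obj.seek(0)  # leave the file rewound for the next consumer
    return client_tot_cnt, client_rec_cnt, client_erec_cnt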

List of tuples to an xls Python

Sorry if I do something wrong, I'm new here.
I have a problem with my Python code.
I have a sorted list built from a dictionary. The sorted list looks like:
sorted_Dict = [('158124', 26708), ('146127', 12738), ('21068', 9949),
('274186', 8255), ('189509', 6550), ('165758', 5346), ...]
I now want to write them to an xls file which should look like:
x y
'158124' 26708
I have to plot it in Excel, but I also want to plot it in Python (which is not necessary, but cool), and I don't get how to do this. Here is my whole code. Thank you for any help.
Cheers,
Sven
# -*- coding: iso-8859-1 -*-
from __future__ import division
import csv
import operator

def computeSoldProducts():
    catalog = csv.reader(open("data/catalog.csv", "r"))
    sales = csv.reader(open("data/sales_3yr.csv", "r"))
    output = open("output.csv", "a")
    catalogIDs = set()
    lineNumber = 0
    # read the catalog
    for line in catalog:
        id = line[0]
        if lineNumber <> 0:
            catalogIDs.add(eval(id))
        lineNumber = 1
    soldItems = set()
    lineNumber = 0
    # read the sales
    for line in sales:
        id = line[6]
        if lineNumber <> 0:
            soldItems.add(eval(id))
        lineNumber = 1
    print "anzahl Produkte:", len(catalogIDs)
    print "verkaufte Produkte", len(soldItems)
    notSoldIDs = catalogIDs - soldItems
    print len(notSoldIDs)
    catalog = csv.reader(open("data/catalog.csv", "r"))
    sales = csv.reader(open("data/sales_3yr.csv", "r"))
    soldDict = {}
    for k in catalog:
        soldDict[str(k[0])] = 0
    for item in sales:
        if str(item[6]) in soldDict:
            soldDict[str(item[6])] += 1
    sorted_soldDict = sorted(soldDict.iteritems(), key=operator.itemgetter(1), reverse=True)
    print sorted_soldDict
    for k in sorted_soldDict:
        output.write(sorted_soldDict[k])
    print "done"

computeSoldProducts()
Straight from the docs for the csv module
import csv

with open('text.csv', 'wb') as csvfile:
    fwriter = csv.writer(csvfile)
    for x in sorted_list:
        fwriter.writerow(x)
You can then open this csv file in excel.
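If a real Excel workbook is wanted rather than a CSV that Excel merely opens, something like this sketch using the openpyxl package would work (an assumption on my part; openpyxl is a third-party library, and sorted_Dict is the list of tuples from the question):

from openpyxl import Workbook

wb = Workbook()
ws = wb.active
ws.append(["x", "y"])            # header row
for key, value in sorted_Dict:   # the list of (id, count) tuples
    ws.append([key, value])
wb.save("output.xlsx")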
One alternative is to use my library pyexcel, documentation is here: http://pythonhosted.org//pyexcel/
import pyexcel
sorted_list_of_sets = ....
writer = pyexcel.Writer("output.csv")
writer.write_array(sorted_list_of_sets)
writer.close()
Your original solution becomes the following if pyexcel is used:
import pyexcel
import operator

def computeSoldProducts():
    catalog = pyexcel.SeriesReader("data/catalog.csv")
    sales = pyexcel.SeriesReader("data/sales_3yr.csv")
    print "anzahl Produkte:", catalog.number_of_rows()
    print "verkaufte Produkte", sales.number_of_rows()
    product_list = catalog.column_at(0)
    solditem_list = sales.column_at(6)
    soldOnes = []
    for item in solditem_list:
        if item not in soldOnes:
            soldOnes.append(item)
    notSoldIDs = catalog.number_of_rows() - len(soldOnes)
    print notSoldIDs
    print product_list
    print solditem_list
    # initialize the soldDict
    zeros_array = [0] * len(product_list)
    soldDict = dict(zip(product_list, zeros_array))
    for item in solditem_list:
        if item in product_list:
            soldDict[item] += 1
    sorted_soldDict = sorted(soldDict.iteritems(), key=operator.itemgetter(1), reverse=True)
    print sorted_soldDict
    writer = pyexcel.Writer("output.csv")
    writer.write_row(["product", "number"])
    writer.write_array(sorted_soldDict)
    writer.close()
    print "done"

computeSoldProducts()

Rewind the file pointer to the beginning of the previous line

I am doing text processing and using the readline() function as follows:
ifd = open(...)
for line in ifd:
    while (condition):
        do something...
        line = ifd.readline()
        condition = ....
        # Here, when the condition becomes false, I need to rewind the
        # pointer so that the 'for' loop reads the same line again.
ifd.seek() followed by readline() is giving me a '\n' character. How do I rewind the pointer so that the whole line is read again?
>>> ifd.seek(-1,1)
>>> line = ifd.readline()
>>> line
'\n'
Here is my code
labtestnames = sorted(tmp)

# Now read each line in the inFile and write into outFile
ifd = open(inFile, "r")
ofd = open(outFile, "w")

# Read the header. Do nothing with this line; skip it.
header = ifd.readline()

# Write the header into the output file
nl = "mrn\tspecimen_id\tlab_number\tlogin_dt\tfluid"
offset = len(nl.split("\t"))
nl = nl + "\t" + "\t".join(labtestnames)
ofd.write(nl+"\n")
lenFields = len(nl.split("\t"))
print "Reading the input file and converting into modified file for further processing (correlation analysis etc..)"

prevTup = (0, 0, 0)
rowComplete = 0
k = 0
for line in ifd:
    k = k + 1
    if (k == 200): break
    items = line.rstrip("\n").split("\t")
    if ((items[0] == '')):
        continue
    newline = list('' for i in range(lenFields))
    newline[0], newline[1], newline[3], newline[2], newline[4] = items[0], items[1], items[3], items[2], items[4]
    ltests = []
    ltvals = []
    # If the same mrn, lab_number and specimen_id, then fill the same row; else create a new row.
    while (cmp(prevTup, (items[0], items[1], items[3])) == 0):
        ltests.append(items[6])
        ltvals.append(items[7])
        pos = ifd.tell()
        line = ifd.readline()
        prevTup = (items[0], items[1], items[3])
        items = line.rstrip("\n").split("\t")
        rowComplete = 1
    if (rowComplete == 1):  # If the row is completed, prepare newline and write into outfile
        indices = [labtestnames.index(x) for x in ltests]
        j = 0
        ifd.seek(pos)
        for i in indices:
            newline[i+offset] = ltvals[j]
            j = j + 1
    if (rowComplete == 0):
        currTup = (items[0], items[1], items[3])
        ltests = items[6]
        ltvals = items[7]
        pos = ifd.tell()
        line = ifd.readline()
        items = line.rstrip("\n").split("\t")
        newTup = (items[0], items[1], items[3])
        if (cmp(currTup, newTup) == 0):
            prevTup = currTup
            ifd.seek(pos)
            continue
        else:
            indices = labtestnames.index(ltests)
            newline[indices+offset] = ltvals
            ofd.write(newline+"\n")
The problem can be handled more simply using itertools.groupby. groupby can cluster all the contiguous lines that deal with the same mrn, specimen_id, and lab_num.
The code that does this is
for key, group in IT.groupby(reader, key = mykey):
where reader iterates over the lines of the input file, and mykey is defined by
def mykey(row):
    return (row['mrn'], row['specimen_id'], row['lab_num'])
Each row from reader is passed to mykey, and all rows with the same key are clustered together in the same group.
While we're at it, we might as well use the csv module to read each line into a dict (which I call row). This frees us from having to deal with low-level string manipulation like line.rstrip("\n").split("\t") and instead of referring to columns by index numbers (e.g. row[3]) we can write code that speaks in higher-level terms such as row['lab_num'].
import itertools as IT
import csv

inFile = 'curious.dat'
outFile = 'curious.out'

def mykey(row):
    return (row['mrn'], row['specimen_id'], row['lab_num'])

fieldnames = 'mrn specimen_id date lab_num Bilirubin Lipase Calcium Magnesium Phosphate'.split()

with open(inFile, 'rb') as ifd:
    reader = csv.DictReader(ifd, delimiter='\t')
    with open(outFile, 'wb') as ofd:
        writer = csv.DictWriter(
            ofd, fieldnames, delimiter='\t', lineterminator='\n')
        writer.writeheader()
        for key, group in IT.groupby(reader, key=mykey):
            new = {}
            row = next(group)
            for key in ('mrn', 'specimen_id', 'date', 'lab_num'):
                new[key] = row[key]
            new[row['labtest']] = row['result_val']
            for row in group:
                new[row['labtest']] = row['result_val']
            writer.writerow(new)
yields
mrn specimen_id date lab_num Bilirubin Lipase Calcium Magnesium Phosphate
4419529 1614487 26.2675 5802791G 0.1
3319529 1614487 26.2675 5802791G 0.3 153 8.1 2.1 4
5713871 682571 56.0779 9732266E 4.1
This seems to be a perfect use case for yield expressions. Consider the following example that prints lines from a file, repeating some of them at random:
def buflines(fp):
    r = None
    while True:
        r = yield r or next(fp)
        if r:
            yield None

from random import randint

with open('filename') as fp:
    buf = buflines(fp)
    for line in buf:
        print line
        if randint(1, 100) > 80:
            print 'ONCE AGAIN::'
            buf.send(line)
Basically, if you want to process an item once again, you send it back to the generator. On the next iteration you will be reading the same item once again.
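A simpler alternative, if generators feel too clever: the file is seekable, so record the position with tell() before each readline() and seek() back when a line needs re-reading. The one caveat is not to mix tell()/seek() with `for line in ifd` iteration, which reads ahead into a buffer (the original code does exactly that, which is why the positions come out wrong). A minimal sketch:

with open('data.txt') as f:
    f.readline()                 # skip the header
    pos = f.tell()               # remember where the next line starts
    line = f.readline()          # consume the line
    f.seek(pos)                  # rewind to the start of that line
    assert f.readline() == line  # the same line is read again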
