get value of one column by another column in csv file python - python

I have my csv file like this:
ID Value Amount
---- ------- -------
A 3 2
A 4 4
B 3 6
C 5 5
A 3 2
B 10 1
I want the sum of column "Value" or "Amount" grouped by the column "ID". For example, for 'A' the output should be the sum of all values related to A, i.e. [3+4+3].
My Code:
import csv
file = open(datafile.csv)
rows=csv.DictReader(file)
summ=0.0
count=0
for r in rows:
summ=summ+int(r['Value'])
count=count+1
print "Mean for column Value is: ",(summ/count)
file.close()

You can use a defaultdict of list to group the data by the ID column. Then use sum() to produce the totals.
from collections import defaultdict

# Group (value, amount) pairs by the ID column.
with open('datafile.csv') as f:
    d = defaultdict(list)
    next(f)  # skip first header line
    next(f)  # skip second header line
    for line in f:
        if not line.strip():
            continue  # a blank line would break the 3-way unpacking below
        id_, value, amount = line.split()
        d[id_].append((int(value), int(amount)))

# sum and average of column Value by ID
for id_ in d:
    total = sum(t[0] for t in d[id_])
    average = total / float(len(d[id_]))
    print('{}: sum = {}, avg = {:.2f}'.format(id_, total, average))
Output for your input data:
A: sum = 10, avg = 3.33
C: sum = 5, avg = 5.00
B: sum = 13, avg = 6.50
It can also be done with a standard Python dictionary. The solution is very similar:
# Same grouping as above, using a plain dict instead of defaultdict.
with open('datafile.csv') as f:
    d = {}
    next(f)  # skip first header line
    next(f)  # skip second header line
    for line in f:
        if not line.strip():
            continue  # a blank line would break the 3-way unpacking below
        id_, value, amount = line.split()
        # setdefault appends in place; `d.get(id_, []) + [...]` rebuilt the
        # whole list for every row.
        d.setdefault(id_, []).append((int(value), int(amount)))

# sum and average of column Value by ID
for id_ in d:
    total = sum(t[0] for t in d[id_])
    average = total / float(len(d[id_]))
    print('{}: sum = {}, avg = {:.2f}'.format(id_, total, average))

Related

Python CSV sum value if they have same ID/name

I want to sum all values that have the same name / ID in a csv file
Right now I am only looking for ID with the name 'company'
csv file format:
company A, 100
company B, 200
company A, 300
The end result I am looking for is:
company A, 400
company B, 200
total: 600
My code so far:
import csv
name = ''
num = ''
total = 0
with open('xx.csv', 'r', newline='') as csvfile:
reader = csv.reader(csvfile)
next(csvfile)
for a in reader:
if a[0].__contain__('company'):
name = (a[0])
num = (a[1])
total += float(a[1])
print(str(name) + ', ' + str(num))
print('total: ' + str(total))
First, CSV typically have commas, and the delimiter for csv.reader must be a single character, so I suggest updating your file to properly use commas.
Secondly, to aggregate the companies, you need to store them as you iterate the file. Easiest way is to use a dictionary type.
Then only after you've aggregated everything, should you create a second loop to go over the aggregated values, then print the final total.
import csv
from collections import defaultdict

# Per-company running sums, keyed by the name in the first column.
totals = defaultdict(int)
with open('companies.csv') as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    # next(reader)  # enable if the file has a header; the shown file has none
    for row in reader:
        # csv.reader yields [] for blank lines; skip those and any row that
        # is not a company entry.
        if not row or not row[0].startswith('company'):
            continue
        name, value = row
        totals[name] += float(value)

# Print each aggregated company, then the grand total.
total = 0
for name, value in totals.items():
    print(f'{name},{value}')
    total += value
print(f'total: {total}')
You don't necessarily need to use csv module here. Just read every single line split them from right (rsplit) and fill a dictionary like below:
# Sum the last comma-separated field of each line, keyed by everything before it.
d = {}
with open('your_file.csv') as f:
    # next(f) - If header needs to be skipped
    for line in f:
        if not line.strip():
            continue  # a blank line has no comma and would break the unpacking
        # Split from the right so commas inside the name are preserved.
        name, value = line.rsplit(',', maxsplit=1)
        d[name] = d.get(name, 0) + int(value)

for k, v in d.items():
    print(f"{k}, {v}")
print(f"total: {sum(d.values())}")
output:
company A, 400
company B, 200
total: 600
To avoid iterating over the dictionary's values a second time just to calculate the total (the sum(d.values()) expression), you can add to the total while you are printing the items, like this:
# Same aggregation, accumulating the grand total while printing.
d = {}
with open('new.csv') as f:
    for line in f:
        if not line.strip():
            continue  # a blank line has no comma and would break the unpacking
        name, value = line.rsplit(',', maxsplit=1)
        d[name] = d.get(name, 0) + int(value)

total = 0
for k, v in d.items():
    total += v
    print(f"{k}, {v}")
print(f"total: {total}")

Python: How to read space delimited data with different length in text file and parse it

I have space delimited data in a text file look like the following:
0 1 2 3
1 2 3
3 4 5 6
1 3 5
1
2 3 5
3 5
each line has different length.
I need to read it starting from line 2 ('1 2 3')
and parse it and get the following information:
Number of unique data = (1,2,3,4,5,6)=6
Count of each data:
count data (1)=3
count data (2)=2
count data (3)=5
count data (4)=1
count data (5)=4
count data (6)=1
Number of lines=6
Sort the data in descending order:
data (3)
data (5)
data (1)
data (2)
data (4)
data (6)
I did this:
file=open('data.txt')
csvreader=csv.reader(file)
header=[]
header=next(csvreader)
print(header)
rows=[]
for row in csvreader:
rows.append(row)
print(rows)
After this step, what should I do to get the expected results?
I would do something like this:
from collections import Counter

with open('data.txt', 'r') as file:
    lines = file.readlines()
lines = lines[1:]  # skip first line

data = []
for line in lines:
    # split() with no argument ignores repeated spaces and yields nothing for
    # a blank line; split(" ") would count empty strings as data.
    data += line.split()

counter = Counter(data)
print(f'unique data: {list(counter.keys())}')
print(f'count data: {sorted(counter.most_common(), key=lambda x: x[0])}')
print(f'number of lines: {len(lines)}')
# most_common() is already ordered by descending count.
print(f'sort data: {[x[0] for x in counter.most_common()]}')
A simple brute force approach:
nums = []
counts = {}
with open('data.txt') as f:
    # Skip the first line by position; testing row[0] == '0' would also drop
    # any later line that happens to start with 0.
    next(f, None)
    for row in f:
        nums.extend(int(k) for k in row.split())
print(nums)

# Tally how often each number occurs.
for n in nums:
    counts[n] = counts.get(n, 0) + 1
print(counts)

# Most frequent first.
ordering = sorted(counts.items(), key=lambda k: -k[1])
print(ordering)
Here is another approach
def getData(infile):
    """Read the file at *infile* and return its lines from the second line on.

    The first line is dropped; the remaining lines are returned as a list
    of strings exactly as ``readlines()`` produced them (line endings kept).
    An empty or one-line file yields an empty list.
    """
    with open(infile, 'r') as data:
        lines = data.readlines()
    # Index 0 is the first line, which is skipped.
    return lines[1:]
def parseData(ld):
    """Count whitespace-separated symbols in *ld* and print a summary.

    *ld* is a list of text lines. Prints the number of distinct symbols,
    the number of lines, the occurrence count of each symbol, and the
    symbols in descending order. Symbols are compared as strings, so the
    final ordering is lexicographic, not numeric.
    """
    all_symbols = dict()
    for l in ld:
        for s in l.split():
            all_symbols[s] = all_symbols.get(s, 0) + 1
    unique_symbols = set(all_symbols)
    print(f'Number of Unique Symbols = {len(unique_symbols)}')
    print(f'Number of Lines Processed = {len(ld)}')
    # Iterating a set: the order of these per-symbol lines is arbitrary.
    for symb in unique_symbols:
        print(f'Number of {symb} = {all_symbols[symb]}')
    print(f"Descending Sort of Symbols = {', '.join(sorted(unique_symbols, reverse=True))}")
On executing:
infile = r'spaced_text.txt'
parseData(getData(infile))
Produces:
Number of Unique Symbols = 6
Number of Lines Processed = 6
Number of 2 = 2
Number of 5 = 4
Number of 3 = 5
Number of 1 = 3
Number of 6 = 1
Number of 4 = 1
Descending Sort of Symbols = 6, 5, 4, 3, 2, 1

How to extract key and value from a text file

I would like to extract key and value from an existing text file. Key in a separate variable and value in a separate variable.
The text file (sample.txt) contains the below content,
one:two
three:four
five:six
seven:eight
nine:ten
sample:demo
I am able to read the content from the text file, but i am not able to proceed further to extract key and value.
with open ("sampletxt.txt", "r") as hfile:
sp = hfile.read()
print (sp)
x=0
for line in sp:
sp.split(":")[x].strip()
x+=1
The above only extracts the value and also provides index out of range exception at the end.
If we iterate through the file, i am expecting the output as below,
Key 0 = one
Key 1 = three
Key 2 = five
Key 3 = seven
key 4 = sample
Value 0 = two
Value 1 = four
Value 2 = six
Value 3 = eight
Value 4 = ten
This should work:
with open("sampletxt.txt", "r") as hfile:
    sp = hfile.read()
print(sp)

# splitlines() does not produce the empty trailing element that
# split("\n") yields when the file ends with a newline.
for line in sp.splitlines():
    # Skip blank or malformed lines so parts[1] cannot raise IndexError.
    if ":" not in line:
        continue
    parts = line.split(":")
    print("key:[{0}], value:[{1}]".format(parts[0], parts[1]))
It can work:
key = []
value = []
# The with-statement closes the file even if parsing fails.
with open("sampletxt.txt", "r") as sp:
    for text_line in sp:
        # Drop all whitespace (including the newline), then split "key:value".
        parts = ''.join(text_line.split()).split(':')
        if len(parts) < 2:
            continue  # blank or malformed line: no value to read
        key.append(parts[0])
        value.append(parts[1])

# Print all keys first, then all values, each with its index.
for i, k in enumerate(key):
    print("Key {} = {}".format(i, k))
print("")
for i, v in enumerate(value):
    print("Value {} = {}".format(i, v))
The output is:
Key 0 = one
Key 1 = three
Key 2 = five
Key 3 = seven
Key 4 = nine
Key 5 = sample
Value 0 = two
Value 1 = four
Value 2 = six
Value 3 = eight
Value 4 = ten
Value 5 = demo
which is similar to your request
Why don't you try:
with open("sampletxt.txt", "r") as hfile:
    sp = hfile.read()
print(sp)

dictionary = {}
# Iterate over lines: looping over `sp` itself walks single characters, and
# splitting `sp` instead of the line re-splits the whole file every time.
for line in sp.splitlines():
    line_list = line.split(":")
    # Only use the indexes when split returned exactly key and value.
    if len(line_list) == 2:
        dictionary[line_list[0]] = line_list[1]
You should always check if split returns two members (or any number you expect) before using the indexes.

Sort Average In A file

I have a file with 3 scores for each person. Each person has their own row. I want to use these scores and get the average of all 3 of them. Their scores are separated by tabs and in descending order. For example:
tam 10 6 11
tom 3 7 3
tim 5 4 6
these people would come out with an average of:
tam 9
tom 5
tim 4
I want these to be able to print to the python shell, however not be saved to the file.
with open("file.txt") as file1:
d = {}
count = 0
for line in file1:
column = line.split()
names = column[0]
average = (int(column[1].strip()) + int(column[2].strip()) + int(column[3].strip()))/3
count = 0
while count < 3:
d.setdefault(names, []).append(average)
count = count + 1
for names, v in sorted(d.items()):
averages = (sum(v)/3)
print(names,average)
averageslist=[]
averageslist.append(averages)
My code only finds the first persons average and outputs it for all of them. I also want it to be descending in order of averages.
You can use the following code, which parses your file into a list of (name, average) tuples and prints every entry of that list sorted by average:
import operator

# Build a list of (name, average) tuples, one per non-empty line.
with open("file.txt") as f:
    data = []
    for line in f:
        parts = line.split()
        if not parts:
            continue  # blank line: no name to read
        name = parts[0]
        vals = parts[1:]
        avg = sum(int(x) for x in vals)/len(vals)
        data.append((name, avg))

# Highest average first.
for person in sorted(data, key=operator.itemgetter(1), reverse=True):
    print("{} {}".format(*person))
You are almost correct. You are already calculating the average in the first step, so there is no need to compute sum(v)/3 again. Try this:
# Map each name to the average of its three scores.
with open("file.txt") as file1:
    d = {}
    for line in file1:
        column = line.split()
        names = column[0]
        # split() already removed surrounding whitespace from each field.
        average = (int(column[1]) + int(column[2]) + int(column[3]))/3
        d[names] = average

# Descending by average; for increasing order use
# sorted(d.items(), key=lambda x: x[1])
for names, v in sorted(d.items(), key=lambda x: x[1], reverse=True):
    print(names,v)
#output
('tam', 9)
('tim', 5)
('tom', 4)
To sort by name
for names, v in sorted(d.items()):
print(names,v)
#output
('tam', 9)
('tim', 5)
('tom', 4)
The issue is this:
averages = (sum(v)/3)
print(names,average)
Notice that on the first line you are computing averages (with an s at the end) and on the next line you are printing average (without an s).
Try This:
from operator import itemgetter

# Map each name to the average of its three scores.
with open("file.txt") as file1:
    d = {}
    for line in file1:
        column = line.split()
        names = column[0]
        average = (int(column[1]) + int(column[2]) + int(column[3]))/3
        # Store the number itself; wrapping it in a one-element list made the
        # loop below print "[9.0]" instead of "9.0".
        d[names] = average

# Highest average first.
for names, v in sorted(d.items(), key=itemgetter(1), reverse=True):
    print(names,v)

Finding Maximum Value in CSV File

Have an assignment of finding average and maximum rainfall in file "BoulderWeatherData.csv". Have found the average using this code:
rain = open("BoulderWeatherData.csv", "r")
data = rain.readline()
print(rain)
data = rain.readlines()
total = 0
linecounter = 0
for rain in data:
linecounter = linecounter + 1
print("The number of lines is", linecounter)
for line in data:
r = line.split(",")
total = total + float(r[4])
print(total)
average = float(total / linecounter)
print("The average rainfall is ", "%.2f" % average)
However, I can't seem to find the maximum using this same process. I attempted using the max function, but the value I have is a single float, which max cannot iterate over.
Any help would be appreciated.
This is my preferred way of handling this.
#!/usr/bin/env python3
# Average and maximum of the 5th comma-separated column.
average = 0.0
total = 0
maxt = 0.0
# The with-statement closes the file when the loop is done.
with open("BoulderWeatherData.csv", "r") as rain:
    for line in rain:
        try:
            p = float(line.split(",")[4])
        except (IndexError, ValueError):
            # Header row, short row or non-numeric field: skip it. Catching
            # only these two keeps unrelated errors visible.
            continue
        average += p
        total += 1
        maxt = max(maxt, p)
average = average / float(total)
print("Average:",average)
print("Maximum:",maxt)
This will output:
Average: 0.05465272591486193
Maximum: 1.98
import csv

INPUT = "BoulderWeatherData.csv"
PRECIP = 4  # 5th column

# newline="" is what the csv module expects for its input file; the old
# "rU" mode was removed in Python 3.11 and now raises ValueError.
with open(INPUT, newline="") as inf:
    incsv = csv.reader(inf)
    header = next(incsv, None)  # skip header row
    precip = [float(row[PRECIP]) for row in incsv]

# `len(precip) or 1` is 1 for an empty list, which really prevents div-by-0;
# `1 and len(precip)` evaluated to len(precip) and did not.
avg_precip = sum(precip, 0.) / (len(precip) or 1)
# default= keeps max() from raising ValueError when there are no data rows.
max_precip = max(precip, default=0.)
print(
    "Avg precip: {:0.3f} in/day, max precip: {:0.3f} in/day"
    .format(avg_precip, max_precip)
)
returns
Avg precip: 0.055 in/day, max precip: 1.980 in/day
# Named max_rain so the built-in max() is not shadowed.
max_rain = 0
for line in data:
    r = line.split(",")
    value = float(r[4])  # convert once, compare and store the same number
    if value > max_rain:
        max_rain = value
print(max_rain)
something like that
You're already accumulating total across loop iterations.
To keep track of a maxvalue, it's basically the same thing, except instead of adding you're maxing:
total = 0
maxvalue = 0
for line in data:
    r = line.split(",")
    value = float(r[4])
    total = total + value
    # Same accumulation pattern as total, with max() instead of +.
    maxvalue = max(maxvalue, value)
print(total)
print(maxvalue)
Or, if you don't want to use the max function:
# total and maxvalue are initialised to 0 before this loop, as in the
# previous snippet.
for line in data:
    r = line.split(",")
    value = float(r[4])
    total = total + value
    if value > maxvalue:
        maxvalue = value
This code will attempt to find the maximum value, and the average value, of floats stored in the 5th position in a .csv.
rainval = []
Initializes the empty array where we will store values.
with open ("BoulderWeatherData.csv", "r") as rain:
Opens the .csv file and names it "rain".
for lines in rain:
This reads every line in rain until the end of the file.
rainval += [float(lines.strip().split(",")[4])]
We append the float value found in the fifth position (fourth index) of the line.
We repeat the above for every line located in the .csv file.
# Index -1 is the last element; index len(rainval) is one past the end and
# always raises IndexError.
print (sorted(rainval)[-1])
This sorts the values in the rainval array and then takes the last (greatest) value, and prints it. This is the maximum value; the built-in max(rainval) returns the same result, since max handles floats as well as ints.
print (sum(rainval)/len(rainval))
This prints the average rainfall.
Alternatively, if we don't want to use arrays:
# Start below every possible reading so the first value always replaces it.
maxrain = -float("inf")
total, count = 0, 0
with open ("test.txt", "r") as rain:
    for lines in rain:
        # 5th comma-separated field of the line.
        temp = float(lines.strip().split(",")[4])
        if maxrain < temp:
            maxrain = temp
        total += temp
        count += 1
print (maxrain)
print (total/count)

Categories