Merging computed file contents and display previous computed data in output - python

I am working to 2 files, oldFile.txt and newFile.txt and compute some changes between them. The newFile.txt is updated constantly and any updates will be written to oldFile.txt
I am trying to improve the snippet below by saving previous computed values and add it to a finalOutput.txt. Any idea will be very helpful to accomplish the needed output. Thank you in advance.
import pandas as pd
from time import sleep
def read_file(fn):
data = {}
with open(fn, 'r') as f:
for lines in f:
line = lines.rstrip()
pname, cnt, cat = line.split(maxsplit=2)
data.update({pname: {'pname': pname, 'cnt': int(cnt), 'cat': cat}})
return data
def process_data(oldfn, newfn):
old = read_file(oldfn)
new = read_file(newfn)
u_data = {}
for ko, vo in old.items():
if ko in new:
n = new[ko]
old_cnt = vo['cnt']
new_cnt = n['cnt']
u_cnt = old_cnt + new_cnt
tmp_old_cnt = 1 if old_cnt == 0 else old_cnt
cnt_change = 100 * (new_cnt - tmp_old_cnt) / tmp_old_cnt
u_data.update({ko: {'pname': n['pname'], 'cnt': new_cnt, 'cat': n['cat'],
'curr_change%': round(cnt_change, 0)}})
for kn, vn in new.items():
if kn not in old:
old_cnt = 1
new_cnt = vn['cnt']
cnt_change = 0
vn.update({'cnt_change': round(cnt_change, 0)})
u_data.update({kn: vn})
pd.options.display.float_format = "{:,.0f}".format
mydata = []
for _, v in u_data.items():
mydata.append(v)
df = pd.DataFrame(mydata)
df = df.sort_values(by=['cnt'], ascending=False)
# Save to text file.
with open('finalOutput.txt', 'w') as w:
w.write(df.to_string(header=None, index=False))
# Overwrite oldFile.txt
with open('oldFile.txt', 'w') as w:
w.write(df.to_string(header=None, index=False))
# Print in console.
df.insert(0, '#', range(1, 1 + len(df)))
print(df.to_string(index=False,header=True))
while True:
oldfn = './oldFile.txt'
newfn = './newFile.txt'
process_data(oldfn, newfn)
sleep(60)
oldFile.txt
e6c76e4810a464bc 1 Hello(HLL)
65b66cc4e81ac81d 2 CryptoCars (CCAR)
c42d0c924df124ce 3 GoldNugget (NGT)
ee70ad06df3d2657 4 BabySwap (BABY)
e5b7ebc589ea9ed8 8 Heroes&E... (HE)
7e7e9d75f5da2377 3 Robox (RBOX)
newfile.txt #-- content during 1st reading
e6c76e4810a464bc 34 Hello(HLL)
65b66cc4e81ac81d 43 CryptoCars (CCAR)
c42d0c924df124ce 95 GoldNugget (NGT)
ee70ad06df3d2657 15 BabySwap (BABY)
e5b7ebc589ea9ed8 37 Heroes&E... (HE)
7e7e9d75f5da2377 23 Robox (RBOX)
755507d18913a944 49 CharliesFactory
newfile.txt #-- content during 2nd reading
924dfc924df1242d 35 AeroDie (ADie)
e6c76e4810a464bc 34 Hello(HLL)
65b66cc4e81ac81d 73 CryptoCars (CCAR)
c42d0c924df124ce 15 GoldNugget (NGT)
ee70ad06df3d2657 5 BabySwap (BABY)
e5b7ebc589ea9ed8 12 Heroes&E... (HE)
7e7e9d75f5da2377 19 Robox (RBOX)
755507d18913a944 169 CharliesFactory
newfile.txt # content during 3rd reading
924dfc924df1242d 45 AeroDie (ADie)
e6c76e4810a464bc 2 Hello(HLL)
65b66cc4e81ac81d 4 CryptoCars (CCAR)
c42d0c924df124ce 7 GoldNugget (NGT)
ee70ad06df3d2657 5 BabySwap (BABY)
e5b7ebc589ea9ed8 3 Heroes&E... (HE)
7e7e9d75f5da2377 6 Robox (RBOX)
755507d18913a944 9 CharliesFactory
oldFile.txt #-- Current output that needs improvement
# pname cnt cat curr_change%
1 924dfc924df1242d 35 AeroDie (ADie) 29
2 755507d18913a944 9 CharliesFactory -95
3 c42d0c924df124ce 7 GoldNugget (NGT) -53
4 7e7e9d75f5da2377 6 Robox (RBOX) -68
5 ee70ad06df3d2657 5 BabySwap (BABY) 0
6 65b66cc4e81ac81d 4 CryptoCars (CCAR) -95
7 e5b7ebc589ea9ed8 3 Heroes&E... (HE) -75
8 e6c76e4810a464bc 2 Hello(HLL) -94
finalOutput.txt #-- Needed Improved Output with additional columns r1, r2 and so on depending on how many update readings
# curr_change% is the latest 3rd reading
# r2% is based on the 2nd reading
# r1% is based on the 1st reading
# pname cnt cat curr_change% r2% r1%
1 924dfc924df1242d 35 AeroDie (ADie) 29 0 0
2 755507d18913a944 9 CharliesFactory -95 245 0
3 c42d0c924df124ce 7 GoldNugget (NGT) -53 -84 3,067
4 7e7e9d75f5da2377 6 Robox (RBOX) -68 -17 667
5 ee70ad06df3d2657 5 BabySwap (BABY) 0 -67 275
6 65b66cc4e81ac81d 4 CryptoCars (CCAR) -95 70 2,050
7 e5b7ebc589ea9ed8 3 Heroes&E... (HE) -75 -68 362
8 e6c76e4810a464bc 2 Hello(HLL) -94 0 3,300

Updated for feedback, I made adjustments so that it would handle data that was fed to it live. Whenever new data is loaded, load the file name into process_new_file() function, and it will update the 'finalOutput.txt'.
For simplicity, I named the different files file1, file2, file3, and file4.
I'm doing most of the operations using the pandas Dataframe. I think working with Pandas DataFrames will make the task a lot easier for you.
Overall, I created one function to read the file and return a properly formatted DataFrame. I created a second function that compares the old and the new file and does the calculation you were looking for. I merge together the results of these calculations. Finally, I merge all of these calculations with the last file's data to get the output you're looking for.
import pandas as pd
global global_old_df
global results_df
global count
global_old_df = None
results_df = pd.DataFrame()
count = 0
def read_file(file_name):
rows = []
with open(file_name) as f:
for line in f:
rows.append(line.split(" ", 2))
df = pd.DataFrame(rows, columns=['pname', 'cnt', 'cat'])
df['cat'] = df['cat'].str.strip()
df['cnt'] = df['cnt'].astype(float)
return df
def compare_dfs(df_old, df_new, count):
df_ = df_old.merge(df_new, on=['pname', 'cat'], how='outer')
df_['r%s' % count] = (df_['cnt_y'] / df_['cnt_x'] - 1) * 100
df_ = df_[['pname', 'r%s' % count]]
df_ = df_.set_index('pname')
return df_
def process_new_file(file):
global global_old_df
global results_df
global count
df_new = read_file(file)
if global_old_df is None:
global_old_df = df_new
return
else:
count += 1
r_df = compare_dfs(global_old_df, df_new, count)
results_df = pd.concat([r_df, results_df], axis=1)
global_old_df = df_new
output_df = df_new.merge(results_df, left_on='pname', right_index=True)
output_df.to_csv('finalOutput.txt')
pd.options.display.float_format = "{:,.1f}".format
print(output_df.to_string())
files = ['file1.txt', 'file2.txt', 'file3.txt', 'file4.txt']
for file in files:
process_new_file(file)
This gives the output:
pname cnt cat r3 r2 r1
0 924dfc924df1242d 45.0 AeroDie (ADie) 28.6 NaN NaN
1 e6c76e4810a464bc 2.0 Hello(HLL) -94.1 0.0 3,300.0
2 65b66cc4e81ac81d 4.0 CryptoCars (CCAR) -94.5 69.8 2,050.0
3 c42d0c924df124ce 7.0 GoldNugget (NGT) -53.3 -84.2 3,066.7
4 ee70ad06df3d2657 5.0 BabySwap (BABY) 0.0 -66.7 275.0
5 e5b7ebc589ea9ed8 3.0 Heroes&E... (HE) -75.0 -67.6 362.5
6 7e7e9d75f5da2377 6.0 Robox (RBOX) -68.4 -17.4 666.7
7 755507d18913a944 9.0 CharliesFactory -94.7 244.9 NaN
So, to run it live, you'd just replace that last section with:
while True:
newfn = './newFile.txt'
process_new_file(newfn)
sleep(60)

Related

Finding mean/SD of a group of population and mean/SD of remaining population within a data frame

I have a pandas data frame that looks like this:
id age weight group
1 12 45 [10-20]
1 18 110 [10-20]
1 25 25 [20-30]
1 29 85 [20-30]
1 32 49 [30-40]
1 31 70 [30-40]
1 37 39 [30-40]
I am looking for a data frame that would look like this: (sd=standard deviation)
group group_mean_weight group_sd_weight rest_mean_weight rest_sd_weight
[10-20]
[20-30]
[30-40]
Here the second/third columns are mean and SD for that group. columns third and fourth are mean and SD for the rest of the groups combined.
Here's a way to do it:
res = df.group.to_frame().groupby('group').count()
for group in res.index:
mask = df.group==group
srGroup, srOther = df.loc[mask, 'weight'], df.loc[~mask, 'weight']
res.loc[group, ['group_mean_weight','group_sd_weight','rest_mean_weight','rest_sd_weight']] = [
srGroup.mean(), srGroup.std(), srOther.mean(), srOther.std()]
res = res.reset_index()
Output:
group group_mean_weight group_sd_weight rest_mean_weight rest_sd_weight
0 [10-20] 77.500000 45.961941 53.60 24.016661
1 [20-30] 55.000000 42.426407 62.60 28.953411
2 [30-40] 52.666667 15.821926 66.25 38.378596
An alternative way to get the same result is:
res = ( pd.DataFrame(
df.group.drop_duplicates().to_frame()
.apply(lambda x: [
df.loc[df.group==x.group,'weight'].mean(),
df.loc[df.group==x.group,'weight'].std(),
df.loc[df.group!=x.group,'weight'].mean(),
df.loc[df.group!=x.group,'weight'].std()], axis=1, result_type='expand')
.to_numpy(),
index=list(df.group.drop_duplicates()),
columns=['group_mean_weight','group_sd_weight','rest_mean_weight','rest_sd_weight'])
.reset_index().rename(columns={'index':'group'}) )
Output:
group group_mean_weight group_sd_weight rest_mean_weight rest_sd_weight
0 [10-20] 77.500000 45.961941 53.60 24.016661
1 [20-30] 55.000000 42.426407 62.60 28.953411
2 [30-40] 52.666667 15.821926 66.25 38.378596
UPDATE:
OP asked in a comment: "what if I have more than one weight column? what if I have around 10 different weight columns and I want sd for all weight columns?"
To illustrate below, I have created two weight columns (weight and weight2) and have simply provided all 4 aggregates (mean, sd, mean of other, sd of other) for each weight column.
wgtCols = ['weight','weight2']
res = ( pd.concat([ pd.DataFrame(
df.group.drop_duplicates().to_frame()
.apply(lambda x: [
df.loc[df.group==x.group,wgtCol].mean(),
df.loc[df.group==x.group,wgtCol].std(),
df.loc[df.group!=x.group,wgtCol].mean(),
df.loc[df.group!=x.group,wgtCol].std()], axis=1, result_type='expand')
.to_numpy(),
index=list(df.group.drop_duplicates()),
columns=[f'group_mean_{wgtCol}',f'group_sd_{wgtCol}',f'rest_mean_{wgtCol}',f'rest_sd_{wgtCol}'])
for wgtCol in wgtCols], axis=1)
.reset_index().rename(columns={'index':'group'}) )
Input:
id age weight weight2 group
0 1 12 45 55 [10-20]
1 1 18 110 120 [10-20]
2 1 25 25 35 [20-30]
3 1 29 85 95 [20-30]
4 1 32 49 59 [30-40]
5 1 31 70 80 [30-40]
6 1 37 39 49 [30-40]
Output:
group group_mean_weight group_sd_weight rest_mean_weight rest_sd_weight group_mean_weight2 group_sd_weight2 rest_mean_weight2 rest_sd_weight2
0 [10-20] 77.500000 45.961941 53.60 24.016661 87.500000 45.961941 63.60 24.016661
1 [20-30] 55.000000 42.426407 62.60 28.953411 65.000000 42.426407 72.60 28.953411
2 [30-40] 52.666667 15.821926 66.25 38.378596 62.666667 15.821926 76.25 38.378596

Pandas: use apply to create 2 new columns

I have a dataset where col a represent the number of total values in values e,i,d,t which are in string format separated by a "-"
a e i d t
0 4 40-80-120-150 0.5-0.3-0.2-0.2 30-32-30-32 1-1-1-1
1 4 40-40-40-40 0.1-0.1-0.1-0.1 18-18-18-18 1-2-3-4
3 4 40-80-120-150 0.5-0.3-0.2-0.2 30-32-30-32 1-1-1-1
5 4 40-40-40-40 0.1-0.1-0.1-0.1 18-18-18-18 1-2-3-4
I want to create 8 new columns, 4 representing the SUM of (e-i-d-t), 4 the product.
For example:
def funct_two_outputs(E, I, d, t, d_calib = 50):
return E+i+d+t, E*i*d*t
OUT first 2 values:
SUM_0, row0 = 40+0.5+30+1 SUM_1 = 80+0.3+32+1
The sum and product are example functions substituting my functions which are a bit more complicated.
I have written out a function **expand_on_col ** that creates separates all the e,i,d,t values into new columns:
def expand_on_col (df_, col_to_split = "namecol", sep='-', prefix="this"):
'''
Pass a df indicating on which col you want to split,
return a df with the col split with a prefix.
'''
df1 = df_[col_to_split].str.split(sep,expand=True).add_prefix(prefix)
df1 = pd.concat([df_,df1], axis=1).replace(np.nan, '-')
return df1
Now i need to create 4 new columsn that are the sum of eidt, and 4 that are the prodct.
Example output for SUM:
index a e i d t a-0 e-0 e-1 e-2 e-3 i-0 i-1 i-2 i-3 d-0 d-1 d-2 d-3 t-0 t-1 t-2 t-3 sum-0 sum-1 sum-2 sum-3
0 0 4 40-80-120-150 0.5-0.3-0.2-0.2 30-32-30-32 1-1-1-1 4 40 80 120 150 0.5 0.3 0.2 0.2 30 32 30 32 1 1 1 1 71 114 153 186
1 1 4 40-40-40-40 0.1-0.1-0.1-0.1 18-18-18-18 1-2-3-4 4 40 40 40 40 0.1 0.1 0.1 0.1 18 18 18 18 1 2 3 4 59 61 63 65
2 3 4 40-80-120-150 0.5-0.3-0.2-0.2 30-32-30-32 1-1-1-1 4 40 80 120 150 0.5 0.3 0.2 0.2 30 32 30 32 1 1 1 1 71 114 153 186
3 5 4 40-40-40-40 0.1-0.1-0.1-0.1 18-18-18-18 1-2-3-4 4 40 40 40 40 0.1 0.1 0.1 0.1 18 18 18 18 1 2 3 4 59 61 63 65
If i run the code with funct_one_output(only returns sum) it works, but wit the funct_two_outputs(suma and product) I get an error.
Here is the code:
import pandas as pd
def expand_on_col (df_, col_to_split = "namecol", sep='-', prefix="this"):
'''
Pass a df indicating on which col you want to split,
return a df with the col split with a prefix.
'''
df1 = df_[col_to_split].str.split(sep,expand=True).add_prefix(prefix)
df1 = pd.concat([df_,df1], axis=1).replace(np.nan, '-')
return df1
def funct_two_outputs(E, I, d, t, d_calib = 50): #the function i want to pass
return E+i+d+t, E*i*d*t
def funct_one_outputs(E, I, d, t, d_calib = 50): #for now i can olny use this one, cant use 2 return values.
return E+i+d+t
for col in columns:
df = expand_on_col (df_=df, col_to_split = col, sep='-', prefix=f"{col}-")
cols_ = df.columns.drop(columns)
df[cols_]= df[cols_].apply(pd.to_numeric, errors="coerce")
df["a"] = df["a"].apply(pd.to_numeric, errors="coerce")
df.reset_index(inplace=True)
for i in range (max(df["a"])):
name_1, name_2 = f"sum-{i}", f"mult-{i}"
df[name_1] = df.apply(lambda row: funct_one_outputs(E= row[f'e-{i}'], I=row[f'i-{i}'], d=row[f'd-{i}'], t=row[f"t-{i}"]), axis=1)
#if i try and fill 2 outputs it wont work
df[[name_1, name_2]] = df.apply(lambda row: funct_two_outputs(E= row[f'e-{i}'], I=row[f'i-{i}'], d=row[f'd-{i}'], t=row[f"t-{i}"]), axis=1)
OUT:
ValueError Traceback (most recent call last)
<ipython-input-306-85157b89d696> in <module>()
68 df[name_1] = df.apply(lambda row: funct_one_outputs(E= row[f'e-{i}'], I=row[f'i-{i}'], d=row[f'd-{i}'], t=row[f"t-{i}"]), axis=1)
69 #if i try and fill 2 outputs it wont work
---> 70 df[[name_1, name_2]] = df.apply(lambda row: funct_two_outputs(E= row[f'e-{i}'], I=row[f'i-{i}'], d=row[f'd-{i}'], t=row[f"t-{i}"]), axis=1)
71
72
2 frames
/usr/local/lib/python3.7/dist-packages/pandas/core/frame.py in __setitem__(self, key, value)
3039 self._setitem_frame(key, value)
3040 elif isinstance(key, (Series, np.ndarray, list, Index)):
-> 3041 self._setitem_array(key, value)
3042 else:
3043 # set column
/usr/local/lib/python3.7/dist-packages/pandas/core/frame.py in _setitem_array(self, key, value)
3074 )[1]
3075 self._check_setitem_copy()
-> 3076 self.iloc._setitem_with_indexer((slice(None), indexer), value)
3077
3078 def _setitem_frame(self, key, value):
/usr/local/lib/python3.7/dist-packages/pandas/core/indexing.py in _setitem_with_indexer(self, indexer, value)
1751 if len(ilocs) != len(value):
1752 raise ValueError(
-> 1753 "Must have equal len keys and value "
1754 "when setting with an iterable"
1755 )
ValueError: Must have equal len keys and value when setting with an iterable
Don't Use apply
If you can help it
s = pd.to_numeric(
df[['e', 'i', 'd', 't']]
.stack()
.str.split('-', expand=True)
.stack()
)
sums = s.sum(level=[0, 2]).rename('Sum')
prods = s.prod(level=[0, 2]).rename('Prod')
sums_prods = pd.concat([sums, prods], axis=1).unstack()
sums_prods.columns = [f'{o}-{i}' for o, i in sums_prods.columns]
df.join(sums_prods)
a e i d t Sum-0 Sum-1 Sum-2 Sum-3 Prod-0 Prod-1 Prod-2 Prod-3
0 4 40-80-120-150 0.5-0.3-0.2-0.2 30-32-30-32 1-1-1-1 71.5 113.3 151.2 183.2 600.0 768.0 720.0 960.0
1 4 40-40-40-40 0.1-0.1-0.1-0.1 18-18-18-18 1-2-3-4 59.1 60.1 61.1 62.1 72.0 144.0 216.0 288.0
3 4 40-80-120-150 0.5-0.3-0.2-0.2 30-32-30-32 1-1-1-1 71.5 113.3 151.2 183.2 600.0 768.0 720.0 960.0
5 4 40-40-40-40 0.1-0.1-0.1-0.1 18-18-18-18 1-2-3-4 59.1 60.1 61.1 62.1 72.0 144.0 216.0 288.0

Python : Print function is not giving excepted output

I have written below function in Python:
df = pd.DataFrame({'age': [32, 33, 33,34,44]})
def PROC_FREQ(dataset,arg1):
x= dataset.groupby(arg1)[arg1[0]].agg(({'Frequency':'count'}))
nombre=x.columns.tolist()[0]
x.rename(columns={nombre:'Freq'},inplace=True)
x['Pct']=round((x['Freq']/x.Freq.sum())*100,2)
x['Freq Acum'],x['Cumm Percent']=x.Freq.cumsum(),x.Pct.cumsum()
x.sort_values(arg1,ascending=[1],inplace=True)
pd.set_option('display.max_columns',500)
x=x.reset_index()
string_repr = x.to_string(index=False,justify='center').splitlines()
string_repr.insert(1, "-" * len(string_repr[0]))
out = '\n'.join(string_repr)
df_split = out.split('\n')
columns = shutil.get_terminal_size().columns
for i in range(len(df_split)):
print(df_split[i].center(columns))
and below is the code to call the function:
PROC_FREQ(df,['age'])
and below is the output of the function:
age Freq Pct Freq Acum Cumm Percent
-----------------------------------------
32 1 16.67 1 16.67
33 2 33.33 3 50.00
34 1 16.67 4 66.67
44 2 33.33 6 100.00
Last line the output is not aligned correctly.

Get sliced dataframe by giving two values

Given two values, how can I get all the value between those two values. Thank you
for example:
dataframe:
Quarter GDP
0 1947q1 243.1
1 1947q2 246.3
2 1947q3 250.1
3 1947q4 260.3
4 1948q1 266.2
5 1948q2 272.9
6 1948q3 279.5
7 1948q4 280.7
8 1949q1 275.4
9 1949q2 271.7
10 1949q3 273.3
11 1949q4 271.0
12 1950q1 281.2
13 1950q2 290.7
14 1950q3 308.5
15 1950q4 320.3
16 1951q1 336.4
given 1947q3 and 1948q4, I need to get all the data between(inclusive) those two values
2 1947q3 250.1
3 1947q4 260.3
4 1948q1 266.2
5 1948q2 272.9
6 1948q3 279.5
7 1948q4 280.7
This will give you the desired result
df[(df['Quarter'] >= '1947q3') & (df['Quarter'] <= '1948q4')]
Quarter GDP
2 1947q3 250.1
3 1947q4 260.3
4 1948q1 266.2
5 1948q2 272.9
6 1948q3 279.5
7 1948q4 280.7
You can also use .between
df[df['Quarter'].between('1947q3', '1948q4', inclusive=True)]
put the data in dataframe.txt
f = open('dataframe.txt', 'r')
f_r = f.readline()
data = []
while f_r:
infos = f_r.split(' ')
infos = [info.strip() for info in infos if info]
if len(infos) == 3:
data.append((infos[1], infos[2]))
f_r = f.readline()
def get_rangedata_by_quarter(quarter_s, quarter_b):
""" quarter_s is the small one
quarter_b is the big one
"""
for info in data:
quarter = info[0]
if quarter >= quarter_s and quarter <= quarter_b:
print quarter, info[1]
get_rangedata_by_quarter('1947q3', '1948q4')

Finding the averages from columns

I'm using this txt file named Gradedata.txt and it looks like this:
Sarah K.,10,9,7,9,10,20,19,19,45,92
John M.,9,9,8,9,8,20,20,18,43,95
David R.,8,7,7,9,6,18,17,17,40,83
Joan A.,9,10,10,10,10,20,19,20,47,99
Nick J.,9,7,10,10,10,20,20,19,46,98
Vicki T.,7,7,8,9,9,17,18,19,44,88
I'm looking for the averages of each column. Each column has it's own title (Homework #1, Homework #2, etc. in that order). What I am trying to do should look exactly like this:
Homework #1 8.67
Homework #2 8.17
Homework #3 8.33
Homework #4 9.33
Homework #5 8.83
Quiz #1 19.17
Quiz #2 18.83
Quiz #3 18.67
Midterm #1 44.17
Final #1 92.50
Here is my attempt at accomplishing this task:
with open("GradeData.txt", "rtU") as f:
columns = f.readline().strip().split(" ")
numRows = 0
sums = [0] * len(columns)
for line in f:
if not line.strip():
continue
values = line.split(" ")
for i in xrange(len(values)):
sums[i] += int(values[i])
numRows += 1
for index, summedRowValue in enumerate(sums):
print columns[index], 1.0 * summedRowValue / numRows
I'm getting errors and also I realize I have to name each assignment average. Need some help here. I appreciate it.
numpy can chew this up in one line:
>>> np.loadtxt('Gradedata.txt', delimiter=',', usecols=range(1,11)).mean(axis=0)
array([ 8.66666667, 8.16666667, 8.33333333, 9.33333333,
8.83333333, 19.16666667, 18.83333333, 18.66666667,
44.16666667, 92.5 ])
Just transpose and use statistics.mean to get the average, skipping the first col:
import csv
from itertools import islice
from statistics import mean
with open("in.txt") as f:
for col in islice(zip(*csv.reader(f)), 1, None):
print(mean(map(float,col)))
Which will give you:
8.666666666666666
8.166666666666666
8.333333333333334
9.333333333333334
8.833333333333334
19.166666666666668
18.833333333333332
18.666666666666668
44.166666666666664
92.5
If the columns are actually named and you want to pair them:
import csv
from itertools import islice
from statistics import mean
with open("in.txt") as f:
# get column names
cols = next(f).split(",")
for col in islice(zip(*csv.reader(f)),1 ,None):
# keys are column names, values are averages
data = dict(zip(cols[1:],mean(map(float,col))))
Or using pandas.read_csv:
import pandas as pd
df = pd.read_csv("in.txt",index_col=0,header=None)
print(df)
print(df.mean(axis=0))
1 2 3 4 5 6 7 8 9 10
0
Sarah K. 10 9 7 9 10 20 19 19 45 92
John M. 9 9 8 9 8 20 20 18 43 95
David R. 8 7 7 9 6 18 17 17 40 83
Joan A. 9 10 10 10 10 20 19 20 47 99
Nick J. 9 7 10 10 10 20 20 19 46 98
Vicki T. 7 7 8 9 9 17 18 19 44 88
1 8.666667
2 8.166667
3 8.333333
4 9.333333
5 8.833333
6 19.166667
7 18.833333
8 18.666667
9 44.166667
10 92.500000
dtype: float64

Categories