I need to find the median of all the integers associated with each key (AA, BB). The input my code reads has this basic format:
AA - 21
AA - 52
BB - 3
BB - 2
My code:
from statistics import median

def scoreData(filename):
    d = dict()
    with open(filename) as fin:
        contents = fin.readlines()
    for line in contents:
        parts = line.split()           # e.g. ['AA', '-', '21']
        score = int(parts[2])          # the integer after the dash
        if parts[0] not in d:
            d[parts[0]] = [score]      # one-element list for a new key
        else:
            d[parts[0]].append(score)
    names = list(d.keys())
    names.sort()                       # alphabetizes the names
    print("Name\tMax\tMin\tMedian")
    for name in names:                 # makes the table
        print(name, "\t", max(d[name]), "\t", min(d[name]), "\t", median(d[name]))
I'm afraid that following the same approach as with "names" and "names.sort" will completely restructure the data. I've thought about "from statistics import median", but once again I do not know how to select only the values associated with each of the same keys.
Thanks in advance
You can do it easily with pandas and numpy:
import pandas
import numpy as np
and aggregating by the first column:
score = pandas.read_csv(filename, sep=' - ', engine='python', header=None)
print(score.groupby(0).agg([np.median, np.min, np.max]))
which returns:
       1
  median amin amax
0
AA  36.5   21   52
BB   2.5    2    3
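On recent pandas/numpy versions, passing the NumPy functions to agg may emit deprecation warnings; the string aggregation names do the same job (a sketch, assuming the same two-column frame as above):
print(score.groupby(0)[1].agg(['median', 'min', 'max']))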
There are many, many ways you can go about this. But here's a 'naive' implementation that will get the job done.
Assuming your data looks like:
AA 1
BB 5
AA 2
CC 7
BB 1
You can do the following:
import numpy as np
from collections import defaultdict
def find_medians(input_file):
    result_dict = defaultdict(list)
    for line in input_file:
        key, value = line.split()
        result_dict[key].append(int(value))
    # np.median gives the median of each key's collected values
    return [(key, np.median(values)) for key, values in result_dict.items()]
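For instance, run against the sample file above (a sketch; 'scores.txt' is just a placeholder filename):
with open('scores.txt') as f:
    for key, med in find_medians(f):
        print(key, med)
# AA 1.5
# BB 3.0
# CC 7.0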
Related
I have a CSV file like this:
w syn
0 abaca http://kaiko.getalp.org/dbnary/fra/Musa_textilis
1 abaca http://kaiko.getalp.org/dbnary/fra/chanvre_de_...
2 abaca http://kaiko.getalp.org/dbnary/fra/tagal
3 abaca http://kaiko.getalp.org/dbnary/fra/Musa_textilis
4 abaca http://kaiko.getalp.org/dbnary/fra/chanvre_de_...
.. ... ...
95 abandon http://kaiko.getalp.org/dbnary/fra/apostasie
96 abandon http://kaiko.getalp.org/dbnary/fra/capitulation
97 abandon http://kaiko.getalp.org/dbnary/fra/cession_de_...
98 abandon http://kaiko.getalp.org/dbnary/fra/confiance
99 abandon http://kaiko.getalp.org/dbnary/fra/défection
[100 rows x 2 columns]
My code prints 6 (the number of unique words) and then this dictionary, with only one synonym per word:
{'abaca': 'tagal', 'abdomen': 'ventre', 'abricot': 'michemis', 'ADN': 'acide désoxyribonucléique', 'Indien': 'sauvage', 'abandon': 'défection'}
I am trying to create a dictionary with each word and its synonyms. I came up with the code below, but the final dictionary only contains one synonym for each word, even though, as you can see in the CSV file, a word can have more than one synonym.
import os
import pandas as pd

# read specific columns of csv file using Pandas
df = pd.read_csv("sparql.csv", usecols = ["w","syn"]) #usecols = ["l","f","s","w","syn","synonyme"]
print(df)

liste_mot = df['w'].tolist()
liste_mot = set(liste_mot)
print(len(liste_mot))

liste_sys = []
dict_syn = {}

for index, row in df.iterrows():
    k, v = row
    sys = os.path.basename(v)
    if "_" in sys:
        sys = sys.split("_")
        sys = " ".join(sys)
        dict_syn[k] = sys
    else:
        dict_syn[k] = sys

print(dict_syn)
What I want is each word as a key and a list of all its synonyms as the value, but so far I only get one synonym (syn) per word (w) and not all of them.
Another approach:
import os
import pandas as pd

df = pd.read_csv("sparql.csv", usecols=["w","syn"])
df["syn_new"] = df.syn.map(os.path.basename).str.replace("_", " ")
dict_syn = {
    key: group.syn_new.to_list()
    for key, group in df[["w", "syn_new"]].groupby("w")
}
Result for your sample:
{'abaca': ['Musa textilis',
           'chanvre de ...',
           'tagal',
           'Musa textilis',
           'chanvre de ...'],
 'abandon': ['apostasie',
             'capitulation',
             'cession de ...',
             'confiance',
             'défection']}
You could try whether
df["syn_new"] = df.syn.str.rsplit("/", n=1, expand=True)[1].str.replace("_", " ")
works too; it could be faster.
And maybe you don't want lists but sets as dict_syn values to avoid duplicates:
...
key: set(group.syn_new.to_list())
...
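Equivalently, groupby plus agg can build the set-valued dictionary in one line (a sketch, assuming the same df and syn_new column as above):
dict_syn = df.groupby("w").syn_new.agg(set).to_dict()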
Here's a working example based partly on your code. Synonyms are put in a list:
from io import StringIO
import pandas as pd
text = """
w syn
0 abaca http://kaiko.getalp.org/dbnary/fra/Musa_textilis
1 abaca http://kaiko.getalp.org/dbnary/fra/chanvre_de_...
2 abaca http://kaiko.getalp.org/dbnary/fra/tagal
3 abaca http://kaiko.getalp.org/dbnary/fra/Musa_textilis
4 abaca http://kaiko.getalp.org/dbnary/fra/chanvre_de_...
95 abandon http://kaiko.getalp.org/dbnary/fra/apostasie
95 abandon http://kaiko.getalp.org/dbnary/fra/apostasie
96 abandon http://kaiko.getalp.org/dbnary/fra/capitulation
97 abandon http://kaiko.getalp.org/dbnary/fra/cession_de_...
98 abandon http://kaiko.getalp.org/dbnary/fra/confiance
99 abandon http://kaiko.getalp.org/dbnary/fra/défection
"""
# read in data
df = pd.read_csv(StringIO(text), sep='\s+')
# get the synonym out of the url
df['real_syn'] = df['syn'].str.extract(r'.*/(.*)', expand=False)
# dictionary to write results to
result = {}
# loop over every row of the dataframe
for _, row in df[['w', 'real_syn']].iterrows():
    word = row['w']
    syn = row['real_syn']
    if word not in result:
        # first time we see this word: add it as a key with the synonym in a one-element list
        result[word] = [syn]
    elif syn not in result[word]:
        # word already present: only append the synonym if it is not in the list yet
        result[word].append(syn)
print(result)
I'm not sure if your CSV is actually fixed-width, or if that's just a nice printout.
If you don't need Pandas, Python's standard CSV module is up to the job.
import csv
import os
import pprint
from collections import defaultdict
def syn_splitter(s):
    syn = os.path.basename(s)
    syn = syn.replace('_', ' ')
    return syn

# So we can just start appending syns, without having to "prime" the dictionary with an empty list
word_syn_map = defaultdict(list)

with open('sample.csv', 'r', newline='') as f:
    reader = csv.reader(f)
    next(reader)  # discard header
    for row in reader:
        w, syn = row
        syn = syn_splitter(syn)
        word_syn_map[w].append(syn)

pprint.pprint(word_syn_map)
# word_syn_map = dict(word_syn_map) if you want to get rid of the defaultdict wrapper
I mocked up sample.csv:
w,syn
abaca,http://kaiko.getalp.org/dbnary/fra/Musa_textilis
abaca,http://kaiko.getalp.org/dbnary/fra/tagal
abaca,http://kaiko.getalp.org/dbnary/fra/Musa_textilis
abandon,http://kaiko.getalp.org/dbnary/fra/apostasie
abandon,http://kaiko.getalp.org/dbnary/fra/capitulation
abandon,http://kaiko.getalp.org/dbnary/fra/confiance
abandon,http://kaiko.getalp.org/dbnary/fra/défection
and I got:
defaultdict(<class 'list'>,
            {'abaca': ['Musa textilis', 'tagal', 'Musa textilis'],
             'abandon': ['apostasie',
                         'capitulation',
                         'confiance',
                         'défection']})
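If the repeated 'Musa textilis' is unwanted, one option (just a sketch) is to dedupe each list afterwards while preserving order:
word_syn_map = {w: list(dict.fromkeys(syns)) for w, syns in word_syn_map.items()}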
I'm attempting some text analytics, writing code to show the occurrences of a word each month in a given dataset. I have the following code, which outputs the frequency of the given word every month; however, I am struggling to transform this into a dataframe (columns: month, word frequency).
Appreciate any help!
import collections
import pandas as pd

df = df.set_index(df['Date'])
for u, v in df.groupby(pd.Grouper(freq="M")):
    words = sum(v['Processed'].str.split(' ').values.tolist(), [])
    c = collections.Counter(words)
    print(c['word'])
currently outputs:
0
1
0
1
1
2
1
18
6
0
0
0
You can convert your collection into a dataframe using pd.DataFrame.from_dict:
import collections
import pandas as pd

df = df.set_index(df['Date'])

results = []
for u, v in df.groupby(pd.Grouper(freq="M")):
    words = sum(v['Processed'].str.split(' ').values.tolist(), [])
    c = collections.Counter(words)
    # convert counter to dataframe
    cdf = pd.DataFrame.from_dict(c, orient='index', columns=['frequency']).reset_index()
    # add identifier to dataframe
    cdf['month'] = u
    # collect results
    results += [cdf]

# concatenate results
results = pd.concat(results)
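From there, getting the month / frequency table for a single word is one filter away (a sketch; 'word' stands in for whatever term you are tracking, and reset_index named the word column 'index'):
word_freq = results[results['index'] == 'word'][['month', 'frequency']]
print(word_freq)
Note that months where the word never occurs won't appear, since the Counter has no entry for them.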
If I have 2 csv files as follows:
csv1.csv:
1,Bob,Bob#gmail.com,baseball
2,Tom,Tom#gmail.com,football
3,Bill,Bill#gmail.com,softball
...
csv2.csv:
baseball, b1
football, f1
...
I am looking for a Python way to replace the wrong values in csv1 (the last column of csv1 matches the first column of csv2) with the right values from csv2 (its second column).
It should look like:
1,Bob,Bob#gmail.com,b1
2,Tom,Tom#gmail.com,f1
3,Bill,Bill#gmail.com,softball
My code doesn't work.
import csv

table1 = r'data.csv'
table2 = r'facebook_creo.csv'

creo_desc = dict()
with open(table2) as tbl2:
    t2 = csv.reader(tbl2, delimiter=',')
    next(t2)
    for t2row in t2:
        wrong_creo = t2row[0]
        desc = t2row[1]
        creo_desc[wrong_creo] = desc

with open(table1) as tbl1:
    t1 = csv.reader(tbl1, delimiter=',')
    for t1row in t1:
        wrong_creo = t1row[8]
        t1.writerow(t1row[8])
Pandas version:
import pandas as pd

data = pd.read_csv(r'data.csv')
creo = pd.read_csv(r'creo.csv')
adset = pd.read_csv(r'adset.csv')
campaign = pd.read_csv(r'campaign.csv')

CreoDict = pd.Series(creo.iloc[:,1].values, index=creo.iloc[:,0]).to_dict()
AdsetDict = pd.Series(adset.iloc[:,1].values, index=adset.iloc[:,0]).to_dict()
CampaignDict = pd.Series(campaign.iloc[:,1].values, index=campaign.iloc[:,0]).to_dict()

data.iloc[:,8] = data.iloc[:,8].replace(CreoDict)
data.iloc[:,6] = data.iloc[:,6].replace(AdsetDict)
data.iloc[:,4] = data.iloc[:,4].replace(CampaignDict)

data.to_csv(r'total.csv')
I'd use pandas to read in the 2 tables, using the second table as a dictionary of replacement values to remap into csv1.
import pandas as pd

# Read in the 2 csv files; neither has a header row, and skipinitialspace
# drops the space after the comma in csv2 ("baseball, b1")
csv1 = pd.read_csv('csv1.csv', header=None)
csv2 = pd.read_csv('csv2.csv', header=None, skipinitialspace=True)

# Create dictionary from csv2
replaceDict = pd.Series(csv2.iloc[:,1].values, index=csv2.iloc[:,0]).to_dict()

# Use dictionary to replace values
csv1.iloc[:,-1] = csv1.iloc[:,-1].replace(replaceDict)

# Write to file, without the pandas index and integer header
csv1.to_csv('csv1_new.csv', index=False, header=False)
Output:
print (csv1)
0 1 2 3
0 1 Bob Bob#gmail.com baseball
1 2 Tom Tom#gmail.com football
2 3 Bill Bill#gmail.com softball
print (csv2)
0 1
0 baseball b1
1 football f1
Then after replace:
print (csv1)
0 1 2 3
0 1 Bob Bob#gmail.com b1
1 2 Tom Tom#gmail.com f1
2 3 Bill Bill#gmail.com softball
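As a side note (an alternative sketch, not required for the answer above): on large frames, Series.map with a fillna fallback is often faster than replace for dictionary lookups:
last = csv1.iloc[:, -1]
csv1.iloc[:, -1] = last.map(replaceDict).fillna(last)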
It would have been better if you had included the error message; however, I guess you should use csv.writer where you want to make changes, rather than csv.reader.
import csv

table1 = r'a.csv'
table2 = r'b.csv'

creo_desc = dict()
with open(table2) as tbl2:
    t2 = csv.reader(tbl2, delimiter=',')
    for t2row in t2:
        creo_desc[t2row[0]] = t2row[1].strip()  # strip the space after the comma in b.csv
print(creo_desc)

ans = []
with open(table1, 'r') as tbl1:
    t1 = csv.reader(tbl1, delimiter=',')
    for t1row in t1:
        if t1row[-1] in creo_desc:
            t1row[-1] = creo_desc[t1row[-1]]
        ans.append(t1row)

with open(table1, 'w', newline='') as tbl1:
    writer = csv.writer(tbl1)
    writer.writerows(ans)
1) a.csv
1,Bob,Bob#gmail.com,baseball
2,Tom,Tom#gmail.com,football
3,Bill,Bill#gmail.com,softball
2) b.csv
baseball, b1
football, f1
I merged 2 csv files using pandas, with 'NAME' as the unique join key. Now I am trying to compare the 'STANCE' values against 'bipedal' and print the matching rows. In short, I would like to know how to compare any column's values with a string.
s1:
NAME LEG_LENGTH DIET
0 Hadrosaurus 1.20 herbivore
s2:
NAME STRIDE_LENGTH STANCE
3 Hadrosaurus 1.40 bipedal
merged:
NAME LEG_LENGTH DIET STRIDE_LENGTH STANCE
0 Hadrosaurus 1.20 herbivore 1.40 bipedal
Code:
import pandas as pd

csv1 = 'dataset1.csv'
csv2 = 'dataset2.csv'
g = 9.8

def splits(c1, c2):
    s1 = pd.read_csv(c1)
    s2 = pd.read_csv(c2)
    print('%s\n%s' % (s1, s2))
    merged = s1.merge(s2, on="NAME", how="outer")  # merging the two files on column NAME
    print(merged)
    return

splits(csv1, csv2)
Hey, little Pandas apprentice, try this:
df.loc[df.STANCE.str.contains('bipedal')]
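For an exact match, plain equality works too; a sketch against the merged frame from the question (assuming splits is changed to end with `return merged` so you keep a reference to it):
merged = splits(csv1, csv2)  # assumes splits returns the merged frame
bipedal = merged.loc[merged['STANCE'] == 'bipedal']
print(bipedal)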
I am reading a file which is in the format below:
0.012281001 00:1c:c4:c2:1f:fe 1 30
0.012285001 00:1c:c4:c2:1f:fe 3 40
0.012288001 00:1c:c4:c2:1f:fe 2 50
0.012292001 00:1c:c4:c2:1f:fe 4 60
0.012295001 24:1c:c4:c2:2f:ce 5 70
I intend to use the column-2 entries as keys, and columns 3 and 4 as two separate values. For each line I encounter, the respective values for that key must add up (value 1 and value 2 should aggregate separately for that key). For the example above, I need to get output like this:
'00:1c:c4:c2:1f:fe': 10 : 180, '24:1c:c4:c2:2f:ce': 5 : 70
The program I have written for the simple 1-key, 1-value case is below:
#!/usr/bin/python
import collections

result = collections.defaultdict(int)
clienthash = dict()

with open("luawrite", "r") as f:
    for line in f:
        hashes = line.split()
        ckey = hashes[1]
        val1 = float(hashes[2])
        result[ckey] += val1
print result
How can I extend this to 2 values, and how can I print them in the output format shown above? I am not getting any ideas. Please help! BTW, I am using Python 2.6.
You can store all of the values in a single dictionary, using a tuple as the stored value:
with open("luawrite", "r") as f:
for line in f:
hashes = line.split()
ckey = hashes[1]
val1 = int(hashes[2])
val2 = int(hashes[3])
a,b = result[ckey]
result[ckey] = (a+val1, b+val2)
print result
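To print in the 'key': v1 : v2 shape the question asks for, here is a Python 2.6-compatible sketch (% formatting, since str.format auto-numbering isn't available on 2.6):
print ", ".join("'%s': %d : %d" % (k, v1, v2) for k, (v1, v2) in result.iteritems())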