writing data to csv from dictionaries with multiple values per key - python

Background
I am storing data in dictionaries. The dictionaries can be of different lengths, and in a particular dictionary there could be keys with multiple values. I am trying to write the data out to a CSV file.
Problem/Solution
Image 1 is how my actual output prints out. Image 2 shows how i would want my output to actually printout. Image 2 is the desired output.
CODE
import csv
from itertools import izip_longest
e = {'Lebron':[25,10],'Ray':[40,15]}
c = {'Nba':5000}
def writeData():
with open('file1.csv', mode='w') as csv_file:
fieldnames = ['Player Name','Points','Assist','Company','Total Employes']
writer = csv.writer(csv_file)
writer.writerow(fieldnames)
for employee, company in izip_longest(e.items(), c.items()):
row = list(employee)
row += list(company) if company is not None else ['', ''] # Write empty fields if no company
writer.writerow(row)
writeData()
I am open to all solutions/suggestions that can help me get my desired output format.

For a much simpler answer, you just need to add one line of code to what you have:
row = [row[0]] + row[1]
so:
for employee, company in izip_longest(e.items(), c.items()):
row = list(employee)
row = [row[0]] + row[1]
row += list(company) if company is not None else ['', ''] # Write empty fields if no company

from collections import defaultdict
values = defaultdict(dict)
values[Name1] = {Points: [], Assist: [], Company: blah, Total_Employees: 123}
for generating the output, traverse through each item in the values to give you names, and populate other values using the key_values in the nested dict.
Again, make sure that there are no multiple entries with the same name, or choose the one with unique entries in the defaultdict.
Demo for the example-
>>> from collections import defaultdict
>>> import csv
>>> values = defaultdict(dict)
>>> vals = [["Lebron", 25, 10, "Nba", 5000], ["Ray", 40, 15]]
>>> fields = ["Name", "Points", "Assist", "Company", "Total Employes"]
>>> for item in vals:
... if len(item) == len(fields):
... details = dict()
... for j in range(1, len(fields)):
... details[fields[j]] = item[j]
... values[item[0]] = details
... elif len(item) < len(fields):
... details = dict()
... for j in range(1, len(fields)):
... if j+1 <= len(item):
... details[fields[j]] = item[j]
... else:
... details[fields[j]] = ""
... values[item[0]] = details
...
>>> values
defaultdict(<class 'dict'>, {'Lebron': {'Points': 25, 'Assist': 10, 'Company': 'Nba', 'Total Employes': 5000}, 'Ray': {'Points': 40, 'Assist': 15, 'Company': '', 'Total Employes': ''}})
>>> csv_file = open('file1.csv', 'w')
>>> writer = csv.writer(csv_file)
>>> for i in values:
... row = [i]
... for j in values[i]:
... row.append(values[i][j])
... writer.writerow(row)
...
23
13
>>> csv_file.close()
Contents of 'file1.csv':
Lebron,25,10,Nba,5000
Ray,40,15,,

Related

Dynamic list creation and append values - python

I have a input data that is parsed from a json and printing the output like this from keys like tablename,columnname,columnlength
data = ('tablename', 'abc.xyz'),('tablename','abc.xyz'),('columnname', 'xxx'),('columnname', 'yyy'),('columnlen', 55)
data[0] =
abc.xyz
abc.xyz
abc.xyz
data[1] =
xxx
yyy
zzz
data[2] =
20
30
60
data[0] represents tablename
data[1] represents columnname
data[2] represents column length
I have code below that does creating the empty list manually
TableName_list = []
ColumnName_list = []
ColumnLen_list = []
for x in data:
if x[0] == 'tablename':
TableName_list.append(data[0]])
elif x[0] == 'columnname':
ColumnName_list.append(data[1])
elif x[0] == 'columnlen':
ColumnLen_list.append(data[2])
I need to create a dynamic empty list respectively for each fields(tablename,column,columnlength) and append the data to that empty list in the dictionary
and my output is needed like this in a dictionary
dict = {'TableName':TableName_list,'ColumnName':ColumnName_list,'ColumnLen':columnLength_list }
This is probably most easily done with a defaultdict:
from collections import defaultdict
dd = defaultdict(list)
data = [
('tablename', 'abc.xyz'),('tablename','abc.xyz'),
('columnname', 'xxx'),('columnname', 'yyy'),
('columnlen', 55),('columnlen', 30)
]
for d in data:
dd[d[0]].append(d[1])
Output:
defaultdict(<class 'list'>, {
'tablename': ['abc.xyz', 'abc.xyz'],
'columnname': ['xxx', 'yyy'],
'columnlen': [55, 30]
})
If the case of the names in the result is important, you could use a dictionary to translate the incoming names:
aliases = { 'tablename' : 'TableName', 'columnname' : 'ColumnName', 'columnlen' : 'ColumnLen' }
for d in data:
dd[aliases[d[0]]].append(d[1])
Output:
defaultdict(<class 'list'>, {
'TableName': ['abc.xyz', 'abc.xyz'],
'ColumnName': ['xxx', 'yyy'],
'ColumnLen': [55, 30]
})
I suggest building the dictionary directly, something like this:
# Group the second element of every (key, value) pair in `data` under its key,
# producing {key: [value, value, ...]}.
out_dict = {}
for x in data:
    # setdefault() creates the list the first time a key is seen and returns
    # the stored list, so the value can be appended in place.
    # NOTE: list.append() returns None, so its result must never be assigned
    # back into the dict -- doing so replaces the list with None and the next
    # append for that key raises AttributeError.
    out_dict.setdefault(x[0], []).append(x[1])
using pandas:
import pandas as pd
>>> pd.DataFrame(data).groupby(0)[1].apply(list).to_dict()
Output:
{'columnlen': [55, 30],
'columnname': ['xxx', 'yyy'],
'tablename': ['abc.xyz', 'abc.xyz']}

How to group a list by value without causing an Attribute Error

I have a CSV, OutputA with format:
Position,Category,Name,Team,Points
1,A,James,Team 1,100
2,A,Mark,Team 2,95
3,A,Tom,Team 1,90
I am trying to get an output of a CSV which gets the total points for each team, the average points per team and the number of riders.
So output would be:
Team,Points,AvgPoints,NumOfRiders
Team1,190,95,2
Team2,95,95,1
I have this function to convert each row to a namedtuple:
fields = ("Position", "Category", "Name", "Team", "Points")
Results = namedtuple('CategoryResults', fields)
def csv_to_tuple(path):
with open(path, 'r', errors='ignore') as file:
reader = csv.reader(file)
for row in map(Results._make, reader):
yield row
Then this sorts the rows into a list ordered by their club:
moutputA = sorted(list(csv_to_tuple("Male/outputA.csv")), key=lambda k: k[3])
This returns a list like:
[CategoryResults(Position='13', Category='A', Name='Marek', Team='1', Points='48'), CategoryResults(Position='7', Category='A', Name='', Team='1', Points='70')]
I am confident that this so far is right although I could be wrong.
I am trying to create a new list of teams with the points (not yet added up).
For example:
[Team 1(1,2,3,4,5)]
[Team 2 (6,9,10)]
etc.
The idea is that I can find how many unique values of points there are (this equals the number of riders). However, when trying to group the list I have this code:
Clubs = []
Club_Points = []
for Names, Club in groupby(moutputA, lambda x: x[3]):
for Teams in Names:
Clubs.append(list(Teams))
for Club, Points in groupby(moutputA, lambda x: x[4]):
for Point in Clubs:
Club_Points.append(list(Point))
print(Clubs)
but this returns this error:
Teams.append(list(Team))
AttributeError: 'itertools._grouper' object has no attribute 'append'
If data.csv contains:
Position,Category,Name,Team,Points
1,A,James,Team 1,100
2,A,Mark,Team 2,95
3,A,Tom,Team 1,90
Then this script:
import csv
from collections import namedtuple
from itertools import groupby
from statistics import mean
fields = ("Position", "Category", "Name", "Team", "Points")
Results = namedtuple('CategoryResults', fields)
def csv_to_tuple(path):
with open(path, 'r', errors='ignore') as file:
next(file) # skip header
reader = csv.reader(file)
for row in map(Results._make, reader):
yield row
moutputA = sorted(csv_to_tuple("data.csv"), key=lambda k: k.Team)
out = []
for team, group in groupby(moutputA, lambda x: x.Team):
group = list(group)
d = {}
d['Team'] = team
d['Points'] = sum(int(i.Points) for i in group)
d['AvgPoints'] = mean(int(i.Points) for i in group)
d['NumOfRider'] = len(group)
out.append(d)
with open('data_out.csv', 'w', newline='') as csvfile:
fieldnames = ['Team', 'Points', 'AvgPoints', 'NumOfRider']
writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
writer.writeheader()
for row in out:
writer.writerow(row)
Produces data_out.csv:
Team,Points,AvgPoints,NumOfRider
Team 1,190,95,2
Team 2,95,95,1
Screenshot from LibreOffice:
Here's a start. You should be able to figure out how to get what you want from this.
import csv, io
from collections import namedtuple
from itertools import groupby
data = '''\
Position,Category,Name,Team,Points
1,A,James,Team 1,100
2,A,Mark,Team 2,95
3,A,Tom,Team 1,90
'''
b = io.StringIO(data)
next(b)
fields = ("Position", "Category", "Name", "Team", "Points")
Results = namedtuple('CategoryResults', fields)
def csv_to_tuple(file):
reader = csv.reader(file)
for row in map(Results._make, reader):
yield row
rows = sorted(list(csv_to_tuple(b)), key=lambda k: k[3])
for TeamName, TeamRows in groupby(rows, lambda x: x[3]):
print(TeamName)
TeamPoints = [row.Points for row in TeamRows]
print(TeamPoints)
print()
All of this would be made easier by just using pandas. Check out the code below.
import pandas as pd
import numpy as np
# Summarise a results table per team: total points, average points and
# number of riders, then write the summary to a new CSV.
# The code reads the 'Team' and 'Points' columns of the input file.
df = pd.read_csv(input_path)
teams = list(set(df['Team']))  # unique list of all the teams
num_teams = len(teams)
# One slot per team; every slot is overwritten in the loop below, so the
# uninitialised contents of np.empty are never read.
points = np.empty(shape=num_teams)
avg_points = np.empty(shape=num_teams)
num_riders = np.empty(shape=num_teams)
for i in range(num_teams):
    # find all rows where the entry in the 'Team' column
    # is the same as teams[i]
    req = df.loc[df['Team'] == teams[i]]
    points[i] = np.sum(req['Points'])
    num_riders[i] = len(req)
    # Fixed: this line previously read the undefined name `point`,
    # which raised NameError on the first iteration.
    avg_points[i] = points[i] / num_riders[i]
# Column name -> per-team values, in the same order as `teams`.
dict_out = {
    'Team': teams,
    'Points': points,
    'AvgPoints': avg_points,
    'NumOfRiders': num_riders,
}
df_out = pd.DataFrame(data=dict_out)
df_out.to_csv(output_path)

Aggregating values in one column by their corresponding value in another from two files

had a question regarding summing the multiple values of duplicate keys into one key with the aggregate total. For example:
1:5
2:4
3:2
1:4
Very basic but I'm looking for an output that looks like:
1:9
2:4
3:2
In the two files I am using, I am dealing with a list of 51 users(column 1 of user_artists.dat) who have the artistID(column 2) and how many times that user has listened to that particular artist given by the weight(column 3).
I am attempting to aggregate the total times that artist has been played, across all users and display it in a format such as:
Britney Spears (289) 2393140. Any help or input would be so appreciated.
import codecs
#from collections import defaultdict
with codecs.open("artists.dat", encoding = "utf-8") as f:
artists = f.readlines()
with codecs.open("user_artists.dat", encoding = "utf-8") as f:
users = f.readlines()
artist_list = [x.strip().split('\t') for x in artists][1:]
user_stats_list = [x.strip().split('\t') for x in users][1:]
artists = {}
for a in artist_list:
artistID, name = a[0], a[1]
artists[artistID] = name
grouped_user_stats = {}
for u in user_stats_list:
userID, artistID, weight = u
grouped_user_stats[artistID] = grouped_user_stats[artistID].astype(int)
grouped_user_stats[weight] = grouped_user_stats[weight].astype(int)
for artistID, weight in u:
grouped_user_stats.groupby('artistID')['weight'].sum()
print(grouped_user_stats.groupby('artistID')['weight'].sum())
#if userID not in grouped_user_stats:
#grouped_user_stats[userID] = { artistID: {'name': artists[artistID], 'plays': 1} }
#else:
#if artistID not in grouped_user_stats[userID]:
#grouped_user_stats[userID][artistID] = {'name': artists[artistID], 'plays': 1}
#else:
#grouped_user_stats[userID][artistID]['plays'] += 1
#print('this never happens')
#print(grouped_user_stats)
how about:
import codecs
from collections import defaultdict
# Sum the play counts ("weight") of every artist across all users and print
# one "<artist name> (<artist id>) <total>" line per artist.

# read stuff
with codecs.open("artists.dat", encoding="utf-8") as f:
    artists = f.readlines()
with codecs.open("user_artists.dat", encoding="utf-8") as f:
    users = f.readlines()

# transform artist data in a dict with "artist id" as key and "artist name"
# as value ([1:] skips the header line)
artist_repo = dict(x.strip().split('\t')[:2] for x in artists[1:])
user_stats_list = [x.strip().split('\t') for x in users][1:]

# artist id -> sum of weights; missing keys start at 0
grouped_user_stats = defaultdict(int)
for u in user_stats_list:
    # Row layout is (userID, artistID, weight). Accumulate by artistID
    # (column index 1); keying by column 0 would total plays per *user* and
    # make the artist-name lookup below miss.
    grouped_user_stats[u[1]] += int(u[2])

# extra: "fancying" the data by turning the keys of the dict into
# "<artist name> (artist id)" format
grouped_user_stats = dict(
    ("%s (%s)" % (artist_repo.get(k, "Unknown artist"), k), v)
    for k, v in grouped_user_stats.items()
)

# lastly print it; .items() and the %-formatted print() call behave the same
# on Python 2 and Python 3
for k, v in grouped_user_stats.items():
    print("%s %s" % (k, v))

Python DictReader -- find if specific key value in a row is not found

I am working on a function to pull specific rows out of a CSV. Every CSV row has a unique ID that identifies it to the function. Some IDs are missing. After iterating over the file, I want to somehow detect these invalid IDs.
Example:
(a sample CSV db_short.csv with rows 1-52 and then 99)
import csv
def get_row(csvfile, row_id):
with open(csvfile, 'rb') as csvfile:
newfile = csv.DictReader(csvfile, delimiter=',', quotechar='|')
somevalue = 'default'
for row in newfile:
if row['id'] == str(row_id):
somevalue = 'id = {}'.format(row['id'])
else:
pass
return somevalue
db = "db_short.csv"
flatlist = [1, 18, 42, 51, 53, 99]
new_entries = []
for i in flatlist:
new_entries.append(get_row(db, i))
print new_entries
Note that flatlist includes a deliberately missing ID 53. This code predictably produces output where the search for 'id' : '53' returns 'default'.
['id = 1', 'id = 18', 'id = 42', 'id = 51', 'default', 'id = 99']
I would however like to replace somevalue = 'default' with, say, a customized message alerting to a missing ID, that will only appear if DictReader went through the whole CSV and did not find any row that contains 'id' : '53' -- .
somevalue = '{} id missing!'.format(row_id)
So how do I have to change my code?

create a list of list of parameters from a file

Hi i am trying to create a list of parameters from a file
The final result should be something like
param=[[field],[units],[height],[site]]
The problem is that the information is split into lines and some of the parameters do not have all the information
#info in the file
[field1]
unit=m/s
height=70.4
site=site1
[field2]
height=20.6
site=site2
[field3]
units=m
...
so I would like to fill in all the fields in such a way that, if there is no information, it assigns 0 or ''
Final result in the example
param={field1:'m/s',70.4,'site1',field2:'',20.6,site2, field3:'m',0,''}
I know how to create a dictionary from a list of lists, but not how to set default values ('' for the string values and 0 for the numeric ones) in case some values are missing
Thanks
You could group using a defaultdict:
from collections import defaultdict
# Collect every "[name]" header under the key "fields" and every
# "key=value" line under its own key, preserving file order within each list.
with open("test.txt") as f:
    d = defaultdict(list)
    for line in map(str.rstrip, f):
        if not line:
            # Blank lines carry no data; without this guard the unpacking
            # below raises ValueError.
            continue
        if line.startswith("["):
            d["fields"].append(line.strip("[]"))
        else:
            # Split on the first "=" only, so a value that itself contains
            # "=" stays intact instead of breaking the two-name unpacking.
            # A non-blank line with no "=" at all still raises ValueError.
            k, v = line.split("=", 1)
            d[k].append(v)
Input::
[field1]
unit=m/s
height=70.4
site=site1
[field2]
height=20.6
site=site2
[field3]
unit=m
height=6.0
site=site3
Output:
defaultdict(<type 'list'>, {'fields': ['field1', 'field2', 'field3'],
'site': ['site1', 'site2', 'site3'], 'unit': ['m/s', 'm'],
'height': ['70.4', '20.6', '6.0']})
If you actually want to group by field, you can use itertools.groupby grouping on lines that start with [:
from itertools import groupby
with open("test.txt") as f:
grps, d = groupby(map(str.rstrip,f), key=lambda x: x.startswith("[")), {}
for k,v in grps:
if k:
k, v = next(v).strip("[]"), list(next(grps)[1])
d[k] = v
print(d)
Output:
{'field2': ['height=20.6', 'site=site2'],
'field3': ['unit=m', 'height=6.0', 'site=site3'],
'field1': ['unit=m/s', 'height=70.4', 'site=site1']}
Each k is a line starting with [, we then call next on the grouper object to get all the lines up to the next line starting with [ or the EOF:
This would fill in the missing information.
# Parse "[section]" / "key=value" lines from file.txt into four parallel
# lists -- section names, units, heights and sites -- and expose them as
# param = [field, units, height, site].
# A key that a section omits is filled with '' (unit, site) or 0 (height).
field, units, height, site = [], [], [], []
param = [field, units, height, site]

# One (section name, {key: value}) pair per "[section]" header, in file order.
records = []
# `with` closes the file even if parsing fails.
with open('file.txt', 'r') as f:
    for raw_line in f:
        line = raw_line.strip()
        if line.startswith('['):
            records.append((line.strip('[]'), {}))
        elif '=' in line and records:
            # Keys are matched by name rather than by position, so the order
            # of unit/height/site inside a section does not matter.
            key, _, value = line.partition('=')
            key = key.strip()
            if key == 'units':
                # The input spells this key both "unit" and "units".
                key = 'unit'
            records[-1][1][key] = value.strip()
        # Any other line (blank, "...", text before the first header) carries
        # no parameter and is ignored.

# Only real sections produce a row, so a file that ends on a complete section
# no longer gains a trailing row of defaults.
for name, values in records:
    field.append(name)
    units.append(values.get('unit', ''))
    height.append(values.get('height', 0))
    site.append(values.get('site', ''))
Output:
param:
[['field1', 'field2', 'field3'],
['m/s', '', 'm'],
['70.4', '20.6', 0],
['site1', 'site2', '']]

Categories