Default Dict / Sort by min - python

Thank you in advance for your help. I have this code so far:
# Read the (EAN, price) pair from every non-empty row of the CSV file.
# newline='' is what the csv module expects for files handed to csv.reader.
with open("clean_result.csv", "r", encoding="utf-8", errors="ignore",
          newline="") as new_data:
    reader = csv.reader(new_data, delimiter=',', quotechar='"')
    for row in reader:
        if row:  # csv.reader yields an empty list for a blank line
            columns = [row[0], row[1]]
            init_dict.append(columns)

# Collect every price seen for each EAN.
# NOTE(review): init_dict and result are created outside this snippet;
# result is presumably a defaultdict(list) - confirm.
for ean, price in init_dict:
    result[ean].append(price)
And then I get the min value for each price with this line :
maxitems = {ean : min(result[ean]) for ean in result}
Current Output : {'8714789828558': '5,51', '3326100000182': '15,00', '3286010016683': '3,93' (...) }
What I would like to do is also read row[2] to get additional info, but only for the row with the minimum price.
Desired Output : {'8714789828558': ('5,51', 'A'), '3326100000182': ('15,00', 'B'), '3286010016683': ('3,93', 'C') (...) }
I tried this :
# Second attempt: also read the third column (description) of each row.
for row in reader:
if row:
columns = [row[0], row[1], row[2]]
init_dict.append(columns)
for ean, price, desc in init_dict:
# Price and description are appended to the same flat list for the EAN ...
result[ean].append(price)
result[ean].append(desc)
# ... so min() compares prices and descriptions against each other as plain
# strings and returns a single element, never a (price, description) pair.
maxitems = {ean : min(result[ean]) for ean in result}
But the output looks like this; half of the data is missing:
{'8714789828558': 'A', '3326100000182': 'B', '3286010016683': 'C' (...) }
I have probably misunderstood something, so any help is appreciated.

from operator import itemgetter
from collections import defaultdict

# EAN -> list of (price, description) pairs. Storing them as pairs keeps each
# description attached to the price it was read with.
result = defaultdict(list)
for row in reader:
    if row:
        result[row[0]].append((row[1], row[2]))

_price_of = itemgetter(0)


def _numeric_price(pair):
    """Return the price of a (price, description) pair as a float.

    Prices are decimal-comma strings such as '5,51'. Compared as text,
    '15,00' sorts before '5,51', so min() has to compare numbers instead.
    A price that cannot be parsed ranks last, so it is selected only when
    an EAN has no parsable price at all.
    """
    try:
        return float(_price_of(pair).replace(',', '.'))
    except ValueError:
        return float('inf')


# Cheapest (price, description) pair for every EAN.
# dict.items() replaces the Python 2-only dict.iteritems().
minitems = {ean: min(prices, key=_numeric_price)
            for ean, prices in result.items()}

Related

How to find the total number of duplicate rows in a file with Python

How can I find the total number of duplicate rows in a file, and how do I write the Python code for it?
import csv

# Raw string: the backslashes in the Windows path are kept literally.
# open() in a with-block replaces the Python 2-only file() builtin and
# guarantees the file is closed.
with open(r'T:\DataDump\Book1.csv', newline='') as csv_file:
    csv_data = csv.reader(csv_file)
    next(csv_data, None)  # skip the header row; no error if the file is empty
    already_seen = set()
    for row in csv_data:
        Address = row[6]  # the address is read from the seventh column
        if Address in already_seen:
            print('{} is a duplicate Address'.format(Address))
        else:
            print('{} is a unique Address'.format(Address))
            already_seen.add(Address)
Try using pandas instead of the csv module
import pandas as pd

csv_data = pd.read_csv('T:/DataDump/Book1.csv')
shape_original = csv_data.shape  # (rows, columns) before de-duplication
print(f"Number of rows: {shape_original[0]}")

# Drop every row that is an exact copy of an earlier row, keeping the first
# occurrence of each.
csv_data_no_duplicates = csv_data.drop_duplicates(keep="first")
shape_new = csv_data_no_duplicates.shape
print(f"Number of rows without duplicates: {shape_new[0]}")

# The difference in row counts is the number of duplicate rows.
number_duplicates = shape_original[0] - shape_new[0]
print(f"Number of duplicate rows: {number_duplicates}")
I did with this example to try if it works:
# Small in-memory example: the first and third records are identical.
thisdict = dict(
    brand=["Ford", "Renault", "Ford"],
    model=["Mustang", "Laguna", "Mustang"],
    year=["1964", "1978", "1964"],
)
# One column per key, one row per list position.
data = pd.DataFrame(thisdict)
# Keep the first occurrence of every repeated row (the default behaviour).
data_no_duplicates = data.drop_duplicates()
print(data_no_duplicates.head())

How to group a list by value without causing an Attribute Error

I have a CSV, OutputA with format:
Position,Category,Name,Team,Points
1,A,James,Team 1,100
2,A,Mark,Team 2,95
3,A,Tom,Team 1,90
I am trying to get an output of a CSV which gets the total points for each team, the average points per team and the number of riders.
So output would be:
Team,Points,AvgPoints,NumOfRiders
Team1,190,95,2
Team2,95,95,1
I have this function to convert each row to a namedtuple:
# Column names of the results CSV, in file order.
fields = ("Position", "Category", "Name", "Team", "Points")
# Row type; its repr shows the typename 'CategoryResults'.
Results = namedtuple('CategoryResults', fields)
def csv_to_tuple(path):
# Generator: yields every row of the CSV at `path` as a Results namedtuple.
# No row is skipped here, so a header line in the file is yielded like any
# other row.
with open(path, 'r', errors='ignore') as file:
reader = csv.reader(file)
# Results._make builds one namedtuple from each list of field values.
for row in map(Results._make, reader):
yield row
Then this sorts the rows into a list ordered by their club:
moutputA = sorted(list(csv_to_tuple("Male/outputA.csv")), key=lambda k: k[3])
This returns a list like:
[CategoryResults(Position='13', Category='A', Name='Marek', Team='1', Points='48'), CategoryResults(Position='7', Category='A', Name='', Team='1', Points='70')]
I am confident that this so far is right although I could be wrong.
I am trying to create a new list of teams with the points (not yet added up).
For example:
[Team 1(1,2,3,4,5)]
[Team 2 (6,9,10)]
etc.
The idea is that I can find how many unique values of points there are (this equals the number of riders). However, when trying to group the list I have this code:
Clubs = []
Club_Points = []
# NOTE: groupby yields (key, group) pairs, so `Names` receives the key x[3]
# and `Club` receives the iterator over the rows of that group.
for Names, Club in groupby(moutputA, lambda x: x[3]):
for Teams in Names:
Clubs.append(list(Teams))
# NOTE: moutputA was sorted by x[3]; grouping by x[4] only merges rows whose
# x[4] values happen to be adjacent in that order.
for Club, Points in groupby(moutputA, lambda x: x[4]):
# This loop iterates the Clubs list built above, not the `Points` group.
for Point in Clubs:
Club_Points.append(list(Point))
print(Clubs)
but this returns this error:
Teams.append(list(Team))
AttributeError: 'itertools._grouper' object has no attribute 'append'
If data.csv contains:
Position,Category,Name,Team,Points
1,A,James,Team 1,100
2,A,Mark,Team 2,95
3,A,Tom,Team 1,90
Then this script:
import csv
from collections import namedtuple
from itertools import groupby
from statistics import mean

fields = ("Position", "Category", "Name", "Team", "Points")
Results = namedtuple('CategoryResults', fields)


def csv_to_tuple(path):
    """Yield each data row of the CSV at *path* as a ``Results`` namedtuple.

    The first record is treated as the header and skipped.
    """
    with open(path, 'r', errors='ignore', newline='') as file:
        reader = csv.reader(file)
        # Skip the header through the reader with a default: a bare
        # next(file) raises StopIteration on an empty file, which turns into
        # RuntimeError inside a generator.
        next(reader, None)
        for row in map(Results._make, reader):
            yield row


# groupby only merges adjacent rows, so sort by the grouping key first.
moutputA = sorted(csv_to_tuple("data.csv"), key=lambda k: k.Team)

out = []
for team, group in groupby(moutputA, lambda x: x.Team):
    group = list(group)  # the group iterator can be consumed only once
    scores = [int(i.Points) for i in group]
    out.append({
        'Team': team,
        'Points': sum(scores),
        'AvgPoints': mean(scores),
        'NumOfRider': len(group),
    })

# One summary row per team, written with a header line.
with open('data_out.csv', 'w', newline='') as csvfile:
    fieldnames = ['Team', 'Points', 'AvgPoints', 'NumOfRider']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(out)
Produces data_out.csv:
Team,Points,AvgPoints,NumOfRider
Team 1,190,95,2
Team 2,95,95,1
Screenshot from LibreOffice:
Here's a start. You should be able to figure out how to get what you want from this.
import csv
import io
from collections import namedtuple
from itertools import groupby

# Sample input kept in memory so the example runs without any file.
data = '''\
Position,Category,Name,Team,Points
1,A,James,Team 1,100
2,A,Mark,Team 2,95
3,A,Tom,Team 1,90
'''

b = io.StringIO(data)
next(b)  # skip the header line

fields = ("Position", "Category", "Name", "Team", "Points")
Results = namedtuple('CategoryResults', fields)


def csv_to_tuple(file):
    """Yield each remaining CSV row of the open text stream *file* as a
    ``Results`` namedtuple."""
    reader = csv.reader(file)
    for row in map(Results._make, reader):
        yield row


# groupby only merges adjacent rows, so sort by team before grouping.
rows = sorted(csv_to_tuple(b), key=lambda k: k.Team)
for TeamName, TeamRows in groupby(rows, lambda x: x.Team):
    print(TeamName)
    # Points are still strings here, exactly as read from the CSV.
    TeamPoints = [row.Points for row in TeamRows]
    print(TeamPoints)
    print()
All of this would be made easier by just using pandas. Check out the code below.
import pandas as pd
import numpy as np

# NOTE(review): input_path and output_path are not defined in this snippet;
# they must be set before it runs.
df = pd.read_csv(input_path)

# Unique team names in order of first appearance (a plain set() would give an
# arbitrary order).
teams = list(dict.fromkeys(df['Team']))
num_teams = len(teams)

# One slot per team, filled in by the loop below.
points = np.empty(shape=num_teams)
avg_points = np.empty(shape=num_teams)
num_riders = np.empty(shape=num_teams)

for i, team in enumerate(teams):
    # All rows whose 'Team' entry equals this team.
    req = df.loc[df['Team'] == team]
    points[i] = np.sum(req['Points'])
    num_riders[i] = len(req)
    # Average = this team's total divided by its rider count.
    avg_points[i] = points[i] / num_riders[i]

dict_out = {
    'Team': teams,
    'Points': points,
    'AvgPoints': avg_points,
    'NumOfRiders': num_riders,
}
df_out = pd.DataFrame(data=dict_out)
df_out.to_csv(output_path)

writing data to csv from dictionaries with multiple values per key

Background
I am storing data in dictionaries. The dictionaries can be of different lengths, and in a particular dictionary there could be keys with multiple values. I am trying to write the data out to a CSV file.
Problem/Solution
Image 1 is how my actual output prints out. Image 2 shows how i would want my output to actually printout. Image 2 is the desired output.
CODE
import csv
# NOTE: izip_longest is the Python 2 name; Python 3's itertools calls it
# zip_longest.
from itertools import izip_longest
# Player name -> list of two numbers (presumably points and assists, going by
# the header row below).
e = {'Lebron':[25,10],'Ray':[40,15]}
# Company name -> number (presumably the employee count).
c = {'Nba':5000}
def writeData():
# Writes a header row to file1.csv, then one row per (player, company) pair.
with open('file1.csv', mode='w') as csv_file:
fieldnames = ['Player Name','Points','Assist','Company','Total Employes']
writer = csv.writer(csv_file)
writer.writerow(fieldnames)
# izip_longest pads the shorter sequence with None, so `company` is None once
# the companies run out.
for employee, company in izip_longest(e.items(), c.items()):
# list(employee) is [name, [value, value]]: the list of values stays nested
# and ends up in a single cell.
row = list(employee)
row += list(company) if company is not None else ['', ''] # Write empty fields if no company
writer.writerow(row)
writeData()
I am open to all solutions/suggestions that can help me get my desired output format.
For a much simpler answer, you just need to add one line of code to what you have:
row = [row[0]] + row[1]
so:
for employee, company in izip_longest(e.items(), c.items()):
row = list(employee)
# Flatten [name, [v1, v2]] into [name, v1, v2] so each value gets its own
# column.
row = [row[0]] + row[1]
row += list(company) if company is not None else ['', ''] # Write empty fields if no company
from collections import defaultdict
values = defaultdict(dict)
values[Name1] = {Points: [], Assist: [], Company: blah, Total_Employees: 123}
To generate the output, iterate over the items in values to get the names, and fill in the remaining columns from the keys and values of each nested dict.
Again, make sure that there are no multiple entries with the same name, or choose the one with unique entries in the defaultdict.
Demo for the example-
>>> from collections import defaultdict
>>> import csv
>>> values = defaultdict(dict)
>>> vals = [["Lebron", 25, 10, "Nba", 5000], ["Ray", 40, 15]]
>>> fields = ["Name", "Points", "Assist", "Company", "Total Employes"]
>>> for item in vals:
... if len(item) == len(fields):
... details = dict()
... for j in range(1, len(fields)):
... details[fields[j]] = item[j]
... values[item[0]] = details
... elif len(item) < len(fields):
... details = dict()
... for j in range(1, len(fields)):
... if j+1 <= len(item):
... details[fields[j]] = item[j]
... else:
... details[fields[j]] = ""
... values[item[0]] = details
...
>>> values
defaultdict(<class 'dict'>, {'Lebron': {'Points': 25, 'Assist': 10, 'Company': 'Nba', 'Total Employes': 5000}, 'Ray': {'Points': 40, 'Assist': 15, 'Company': '', 'Total Employes': ''}})
>>> csv_file = open('file1.csv', 'w')
>>> writer = csv.writer(csv_file)
>>> for i in values:
... row = [i]
... for j in values[i]:
... row.append(values[i][j])
... writer.writerow(row)
...
23
13
>>> csv_file.close()
Contents of 'file1.csv':
Lebron,25,10,Nba,5000
Ray,40,15,,

New column for each element of the array

I use the csv library to create a table of products.
In order to then import it to the site, I need that each characteristic be written in a separate column.
Adding a new row is done using simple loop:
writer = csv.writer(csvfile)
for product in products:
# The characteristics list is passed as one single third field, so it is not
# split into separate columns.
writer.writerow((product['price'],
product['vendor_code'],
product['characteristics']))
Adding a new product:
# NOTE: this list is named `product`, while the writing loop iterates
# `products`.
product = []
# Each entry is a dict holding the price, the vendor code and the list of
# characteristics.
product.append({
'price' : price,
'vendor_code' : vendor_code,
'characteristics' : characteristics,
})
characteristics is an array that contains each characteristic as a separate element.
How do I get the output file in this form:
190$ #0172 characteristic1 characteristic2 characteristic3
characteristics - initialization:
try:
    characteristics = []
    soup_characteristics = soup.find_all('tr', {'class' : 'product_card__product_characters_item clearfix'})
    for ch in soup_characteristics:
        # Replace every run of two or more whitespace characters, and every
        # newline, with one space. Raw string: '\s' is not a valid escape in
        # a normal string literal.
        characteristics.append(re.sub(r'\s\s+|\n', ' ', ch.text))
except AttributeError:
    # Fall back to an empty value when the lookup above raises
    # AttributeError.
    characteristics = ""
Try unpacking the characteristic array:
for product in products:
    # The * spreads the characteristics list, so each characteristic becomes
    # its own CSV field after price and vendor code.
    writer.writerow((product['price'],
                     product['vendor_code'],
                     *product['characteristics']))
Here is the code I tested:
import csv

# One sample product; 'characters' holds the values to spread over columns.
products = [{
    'price': 100,
    'vendor': 123,
    'characters': [7, 8, 9],
}]

# newline='' lets the csv module control line endings; without it, extra
# blank lines can appear between rows on Windows.
with open('test.csv', 'w', newline='') as fo:
    writer = csv.writer(fo)
    for p in products:
        # The * spreads the list so each element becomes its own field.
        writer.writerow((
            p['price'],
            p['vendor'],
            *p['characters'],
        ))
Here is the content of the test.csv file:
100,123,7,8,9
You should be able to build a list to write as an entire row:
for product in products:
    # Start the row with the two fixed fields ...
    row = [product['price'], product['vendor_code']]  # [price,vendor_code]
    # ... then add every characteristic as a further element.
    row.extend(product['characteristics'])  # [price,vendor_code,characteristic1,characteristic2,...]
    writer.writerow(row)  # writes each value in the list as a new column

Python DictReader -- find if specific key value in a row is not found

I am working on a function to pull specific rows out of a CSV. Every CSV row has a unique ID that identifies it to the function. Some IDs are missing. I want to find these invalid IDs after iterating.
Example:
(a sample CSV db_short.csv with rows 1-52 and then 99)
import csv
def get_row(csvfile, row_id):
# Scans the CSV at `csvfile` for rows whose 'id' column equals str(row_id).
# Returns 'id = <id>' when a row matches, otherwise the string 'default'.
# The loop does not stop at the first match, so the last matching row wins.
# NOTE: 'rb' mode suits Python 2's csv module; Python 3's csv needs a
# text-mode file.
with open(csvfile, 'rb') as csvfile:
newfile = csv.DictReader(csvfile, delimiter=',', quotechar='|')
# Fallback result; only overwritten when a matching row is found.
somevalue = 'default'
for row in newfile:
if row['id'] == str(row_id):
somevalue = 'id = {}'.format(row['id'])
else:
pass
return somevalue
db = "db_short.csv"
# IDs to look up; the file is re-read from the start for each one.
flatlist = [1, 18, 42, 51, 53, 99]
new_entries = []
for i in flatlist:
new_entries.append(get_row(db, i))
# Python 2 print statement.
print new_entries
Note that flatlist includes a deliberately missing ID 53. This code predictably produces output where search for 'id' : 53' returns 'default'.
['id = 1', 'id = 18', 'id = 42', 'id = 51', 'default', 'id = 99']
I would however like to replace somevalue = 'default' with, say, a customized message alerting to a missing ID, that will only appear if DictReader went through the whole CSV and did not find any row that contains 'id' : '53' -- .
somevalue = '{} id missing!'.format(row_id)
So how do I have to change my code?

Categories