how to find Total number of duplicate rows in to file and how to write python code
import csv
csv_data = csv.reader(file('T:\DataDump\Book1.csv'))
next(csv_data)
already_seen = set()
for row in csv_data:
Address = row[6]
if Address in already_seen:
print('{} is a duplicate Address'.format(Address))
else:
print('{} is a unique Address'.format(Address))
already_seen.add(Address)
Try using pandas instead of the csv module
import pandas as pd
csv_data = pd.read_csv('T:/DataDump/Book1.csv')
shape_original = csv_data.shape
print(f"Number of rows: {shape_original[0]}")
#Below how to drop duplicates
csv_data_no_duplicates = csv_data.drop_duplicates(keep="first")
shape_new = csv_data_no_duplicates.shape
print(f"Number of rows: {shape_new[0]}")
number_duplicates = shape_original[0] - shape_new[0]
I did with this example to try if it works:
thisdict = {
"brand": ["Ford","Renault","Ford"],
"model": ["Mustang","Laguna","Mustang"],
"year": ["1964","1978","1964"]
}
data = pd.DataFrame.from_dict(thisdict)
data_no_duplicates = data.drop_duplicates(keep="first")
print(data_no_duplicates.head())
I have a CSV, OutputA with format:
Position,Category,Name,Team,Points
1,A,James,Team 1,100
2,A,Mark,Team 2,95
3,A,Tom,Team 1,90
I am trying to get an output of a CSV which gets the total points for each team, the average points per team and the number of riders.
So output would be:
Team,Points,AvgPoints,NumOfRiders
Team1,190,95,2
Team2,95,95,1
I have this function to convert each row to a namedtuple:
fields = ("Position", "Category", "Name", "Team", "Points")
Results = namedtuple('CategoryResults', fields)
def csv_to_tuple(path):
with open(path, 'r', errors='ignore') as file:
reader = csv.reader(file)
for row in map(Results._make, reader):
yield row
Then this sorts the rows into a sorted list by there club:
moutputA = sorted(list(csv_to_tuple("Male/outputA.csv")), key=lambda k: k[3])
This returns a list like:
[CategoryResults(Position='13', Category='A', Name='Marek', Team='1', Points='48'), CategoryResults(Position='7', Category='A', Name='', Team='1', Points='70')]
I am confident that this so far is right although I could be wrong.
I am trying to create a new list of teams with the points (not yet added up).
For example:
[Team 1(1,2,3,4,5)]
[Team 2 (6,9,10)]
etc.
The idea is that I can find how many unique values of points there are (this equals the number of riders). However, when trying to group the list I have this code:
Clubs = []
Club_Points = []
for Names, Club in groupby(moutputA, lambda x: x[3]):
for Teams in Names:
Clubs.append(list(Teams))
for Club, Points in groupby(moutputA, lambda x: x[4]):
for Point in Clubs:
Club_Points.append(list(Point))
print(Clubs)
but this retuns this error:
Teams.append(list(Team))
AttributeError: 'itertools._grouper' object has no attribute 'append'
If data.csv contains:
Position,Category,Name,Team,Points
1,A,James,Team 1,100
2,A,Mark,Team 2,95
3,A,Tom,Team 1,90
Then this script:
import csv
from collections import namedtuple
from itertools import groupby
from statistics import mean
fields = ("Position", "Category", "Name", "Team", "Points")
Results = namedtuple('CategoryResults', fields)
def csv_to_tuple(path):
with open(path, 'r', errors='ignore') as file:
next(file) # skip header
reader = csv.reader(file)
for row in map(Results._make, reader):
yield row
moutputA = sorted(csv_to_tuple("data.csv"), key=lambda k: k.Team)
out = []
for team, group in groupby(moutputA, lambda x: x.Team):
group = list(group)
d = {}
d['Team'] = team
d['Points'] = sum(int(i.Points) for i in group)
d['AvgPoints'] = mean(int(i.Points) for i in group)
d['NumOfRider'] = len(group)
out.append(d)
with open('data_out.csv', 'w', newline='') as csvfile:
fieldnames = ['Team', 'Points', 'AvgPoints', 'NumOfRider']
writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
writer.writeheader()
for row in out:
writer.writerow(row)
Produces data_out.csv:
Team,Points,AvgPoints,NumOfRider
Team 1,190,95,2
Team 2,95,95,1
Screenshot from LibreOffice:
Here's a start. You should be able to figure out how to get what you want from this.
import csv, io
from collections import namedtuple
from itertools import groupby
data = '''\
Position,Category,Name,Team,Points
1,A,James,Team 1,100
2,A,Mark,Team 2,95
3,A,Tom,Team 1,90
'''
b = io.StringIO(data)
next(b)
fields = ("Position", "Category", "Name", "Team", "Points")
Results = namedtuple('CategoryResults', fields)
def csv_to_tuple(file):
reader = csv.reader(file)
for row in map(Results._make, reader):
yield row
rows = sorted(list(csv_to_tuple(b)), key=lambda k: k[3])
for TeamName, TeamRows in groupby(rows, lambda x: x[3]):
print(TeamName)
TeamPoints = [row.Points for row in TeamRows]
print(TeamPoints)
print()
All of this would be made easier by just using pandas. Check out the code below.
import pandas as pd
import numpy as np
df = pd.read_csv(input_path)
teams = list(set(df['Team'])) # unique list of all the teams
num_teams = len(teams)
points = np.empty(shape=num_teams)
avg_points = np.empty(shape=num_teams)
num_riders = np.empty(shape=num_teams)
for i in range(num_teams):
# find all rows where the entry in the 'Team' column
# is the same as teams[i]
req = df.loc[df['Team'] == teams[i]]
points[i] = np.sum(req['Points'])
num_riders[i] = len(req)
avg_points[i] = point[i]/num_riders[i]
dict_out = {
'Team':teams,
'Points':points,
'AvgPoints':avg_points,
'NumOfRiders':num_riders
}
df_out = pd.DataFrame(data=dict_out)
df_out.to_csv(output_path)
i try to write this list of dicts into xlsx file using openpyxlsx
products= [{'id':46329',
'discription':'AD BLeu',
'marque':'AZERT',
'category':'liquid',
'family': 'ADBLEU',
'photos':'D:\\hamzawi\\hamza\\image2py\\46329_1.png'},
{dict2 ...},
{dictn...}
]
# creat a workbook
filena = "produitimage.xlsx"
workbook = Workbook()
sheet = workbook.active
#add headers
sheet.append(["Product ID", "Product Name", "Marque",
"Category", "Family", "Photos"])
for product in products:
for item in product.items():
for row, entry in enumerate(item, start=3):
sheet.cell(row=row, column=1, value=entry)
#add some images
images = [item['photos'] for item in products]
for image in images:
logo = Image(image)
#logo.height = 150
#logo.width = 150
sheet.add_image(logo)
workbook.save(filename=filena)
i got xlsx file with only headers no data
Question: append list of dict
import openpyxl
products = [{'id':46329,
'discription':'AD BLeu',
'marque':'AZERT',
'category':'liquid',
'family': 'ADBLEU',
'photos':'D:\\hamzawi\\hamza\\image2py\\46329_1.png'}
]
# Dictionarys are not in order by default
# Define a `list` of `keys` in desired order
fieldnames = ['id', 'discription', 'marque', 'category', 'family', 'photos']
# create a new workbook
wb = openpyxl.Workbook()
ws = wb.active
# append headers
ws.append(["Product ID", "Product Name", "Marque", "Category", "Family", "Photos"])
# append data
# iterate `list` of `dict`
for product in products:
# create a `generator` yield product `value`
# use the fieldnames in desired order as `key`
values = (product[k] for k in fieldnames)
# append the `generator values`
ws.append(values)
# show Worksheet Values
for row_values in ws.iter_rows(values_only=True):
for value in row_values:
print(value, end='\t')
print()
Output:
Product ID Product Name Marque Category Family Photos
46329 AD BLeu AZERT liquid ADBLEU D:\hamzawi\hamza\image2py\46329_1.png
If you want the image, instead of the image file path, change the following:
# remove 'photos' from fieldnames
fieldnames = \
['id', 'discription', 'marque', 'category', 'family']
# you need the Row index, add a `enumerate(..., 2)`
for row, product in enumerate(products,2):
values = (product[k] for k in fieldnames)
sheet.append(values)
# after append the `values` add the image
# Here, Column 'F'
ws.add_image(Image(product['photos']), 'F{}'.format(row))
There are some problems in your code.
First, you are incrementing next_row value in the loop where you setting it, so increment doesn't have effect and every iteration the value of next_row equals 3.
Second, you are trying to write a list of dict values to excel cell, but I think you want it will be written as a row. So you need to just append it as you did with header above the loops:
for product in products:
sheet.append(list(product.values()))
If you need to insert an image in last cell in a row you may rewrite loop that way:
for row_index, product in enumerate(products):
values = list(product.values())
sheet.append(values[:-1])
col_row = get_column_letter(len(values)) + str(row_index+1)
photo_path = values[-1]
sheet.add_image(Image(photo_path), col_row)
and thank you in advance for your help. I have this code so far :
with open("clean_result.csv", "r", encoding="utf-8", errors="ignore") as
new_data:
reader = csv.reader(new_data, delimiter=',', quotechar='"')
for row in reader:
if row:
columns = [row[0], row[1]]
init_dict.append(columns)
for ean, price in init_dict:
result[ean].append(price)
And then I get the min value for each price with this line :
maxitems = {ean : min(result[ean]) for ean in result}
Current Output : {'8714789828558': '5,51', '3326100000182': '15,00', '3286010016683': '3,93' (...) }
What I would like to is add row[2] and get additionnal info, but only for the minimum price value.
Desired Output : {'8714789828558': '5,51', 'A', '3326100000182': '15,00', 'B' '3286010016683': '3,93', 'C' (...) }
I tried this :
for row in reader:
if row:
columns = [row[0], row[1], row[2]]
init_dict.append(columns)
for ean, price, desc in init_dict:
result[ean].append(price)
result[ean].append(desc)
maxitems = {ean : min(result[ean]) for ean in result}
But Output is like this. Half data are missing :
{'8714789828558': 'A', '3326100000182': 'B' '3286010016683': 'C' (...) }
I probably misunderstand something so please any help appreciated
from operator import itemgetter
from collections import defaultdict
result = defaultdict(list)
for row in reader:
if row:
result[row[0]].append((row[1], row[2]))
minitems = {ean : min(prices, key = itemgetter(0)) for ean, prices in result.iteritems()}
I am working on a function to pull out of CSV specific rows. Every CSV row has a unique ID that identifies it to the function. Some IDs are missing. I want to somehow find after iterating these invalid IDs.
Example:
(a sample CSV db_short.csv with rows 1-52 and then 99)
import csv
def get_row(csvfile, row_id):
with open(csvfile, 'rb') as csvfile:
newfile = csv.DictReader(csvfile, delimiter=',', quotechar='|')
somevalue = 'default'
for row in newfile:
if row['id'] == str(row_id):
somevalue = 'id = {}'.format(row['id'])
else:
pass
return somevalue
db = "db_short.csv"
flatlist = [1, 18, 42, 51, 53, 99]
new_entries = []
for i in flatlist:
new_entries.append(get_row(db, i))
print new_entries
Note that flatlist includes a deliberately missing ID 53. This code predictably produces output where search for 'id' : 53' returns 'default'.
['id = 1', 'id = 18', 'id = 42', 'id = 51', 'default', 'id = 99']
I would however like to replace somevalue = 'default' with, say, a customized message alerting to a missing ID, that will only appear if DictReader went through the whole CSV and did not find any row that contains 'id' : '53' -- .
somevalue = '{} id missing!'.format(row_id)
So how do I have to change my code?