Comparing two CSV files and print the difference - python

I'm trying to compare two CSV files, CSVFileOne and CSVFileTwo.
My desired output is to print CSVFileOne, but only the rows that do not exist in CSVFileTwo.
My code:
# Load CSVFileOne into ABGSOne as a list of dicts, one dict per data row.
input_file = CSVFileOne
ABGSOne = []
with open(input_file, encoding='UTF-8') as fone:
rowsOne = csv.reader(fone,delimiter=",",lineterminator="\n")
# Skip the first row of the file.
next(rowsOne, None)
for rowOne in rowsOne:
abbgone = {}
# NOTE(review): the loop variable is `rowOne`, but the lines below index `row`.
abbgone['astringOne'] = row[0]
abbgone['bstringOne'] = row[1]
abbgone['cstringOne'] = row[2]
abbgone['dstringOne'] = row[3]
ABGSOne.append(abbgone)
# Same loading step for CSVFileTwo.
input_fileTwo = CSVFileTwo
ABGSTwo = []
with open(input_fileTwo, encoding='UTF-8') as ftwo:
rowsTwo = csv.reader(ftwo,delimiter=",",lineterminator="\n")
# Skip the first row of the file.
next(rowsTwo, None)
for rowTwo in rowsTwo:
abbgtwo = {}
# NOTE(review): the loop variable is `rowTwo`, but the lines below index `row`.
abbgtwo['astringTwo'] = row[0]
abbgtwo['bstringTwo'] = row[1]
abbgtwo['cstringTwo'] = row[2]
abbgtwo['dstringTwo'] = row[3]
# NOTE(review): this appends to ABGSOne, so ABGSTwo stays empty in this snippet;
# `abbgTwo` also differs in case from the `abbgtwo` built above.
ABGSOne.append(abbgTwo)
# NOTE(review): the comparison below uses a single `abbgtwo` dict rather than
# iterating over the rows of the second file - presumably the last one built.
for abbgone in ABGSOne:
if abbgone['bstringOne'] == abbgtwo['bstringTwo']:
# NOTE(review): the quotes and the parenthesis on the next line are unbalanced.
print('abbgone['bstringOne']

Try this out:
# Write the rows of CSVFileOne that do not occur anywhere in CSVFileTwo.
# A header row shared by both files is treated like any other common row and
# is therefore left out of the result.
with open('CSVFileOne.csv', 'r') as t1, open('CSVFileTwo.csv', 'r') as t2:
    fileone = t1.readlines()
    # Strip line endings before comparing so that a final line without a
    # trailing newline still matches; a set makes each lookup O(1).
    filetwo = {line.rstrip('\r\n') for line in t2}

with open('Desired.csv', 'w') as outFile:
    for line in fileone:
        if line.rstrip('\r\n') not in filetwo:
            outFile.write(line)

Related

Deleting all rows with the same email in csv file

I'd like to ask how to delete all rows with the same email in a CSV file. The idea is to pick random rows and, once a row has been picked and printed out, delete all rows with its email from the file. Right now the code deletes only the rows that are picked and printed; if there are more rows with the same email, it does not delete them. How can I fix it?
Full code: https://pastebin.com/qzHm4NSA
Data structure: https://ibb.co/wWXfL6X
# Pick five distinct random entry numbers, print the matching rows as winners
# and rewrite the entries file without those rows.
# `enterFile` and `totalEntries` are not defined in this snippet.
def generate():
global winningRows
filename = enterFile()
noOfWinners = 5
winningNumbers = []
# Keep drawing until five different numbers between 1 and totalEntries are collected.
while len(winningNumbers) < noOfWinners:
luckyNumber = random.randint(1, totalEntries)
if luckyNumber not in winningNumbers:
winningNumbers.append(luckyNumber)
with open(filename, newline='\n') as entriesCSV:
entriesDict = csv.DictReader(entriesCSV,dialect="excel")
allRows = [row for row in entriesDict]
# Rows are split purely on the "#" column; the "Email" column plays no part here,
# so other rows sharing a winner's email end up in nonWinningRows.
winningRows = [row for row in allRows if int(row["#"]) in winningNumbers]
nonWinningRows = [row for row in allRows if int(row["#"]) not in winningNumbers]
for row in winningRows:
winnerName = row["Name"]
winnerID = row["ID"]
winnerEmail = row["Email"]
print(f"The winner is {winnerName}, ID {winnerID}, email {winnerEmail}")
# Overwrite the same file with the header plus the non-winning rows.
with open(filename, "w", newline='\n') as entriesCSV:
writer = csv.DictWriter(entriesCSV, fieldnames=["#", "Name", "ID", "Email"])
writer.writeheader()
writer.writerows(nonWinningRows)
Maintain a list of emails of the picked winners and then use it to filter out rows of non winners. For that,
Just modify the code segment as follows and that will solve your problem:
def generate():
    """Draw the winners, announce them and remove their entries from the file.

    Picks up to five distinct entry numbers between 1 and ``totalEntries``,
    prints one announcement per winning email address and rewrites the
    entries file without any row that shares an email address with a winner.
    The rows whose number was drawn are stored in the global ``winningRows``.
    """
    global winningRows
    filename = enterFile()
    noOfWinners = 5
    # random.sample() returns distinct numbers in one call; capping the count
    # avoids retrying forever when there are fewer entries than winners.
    winningNumbers = set(random.sample(range(1, totalEntries + 1),
                                       min(noOfWinners, totalEntries)))

    # The csv module expects files opened with newline='' so that it can
    # handle line endings itself.
    with open(filename, newline='') as entriesCSV:
        allRows = list(csv.DictReader(entriesCSV, dialect="excel"))

    winningRows = [row for row in allRows if int(row["#"]) in winningNumbers]

    winnerEmails = set()
    for row in winningRows:
        winnerEmail = row["Email"]
        if winnerEmail in winnerEmails:
            # Same email drawn more than once: announce it only once.
            continue
        winnerEmails.add(winnerEmail)
        print(f"The winner is {row['Name']}, ID {row['ID']}, email {winnerEmail}")

    # Every winning row's email is in winnerEmails by now, so filtering on the
    # email alone removes the winning rows and every other row of that person.
    nonWinningRows = [row for row in allRows if row["Email"] not in winnerEmails]

    with open(filename, "w", newline='') as entriesCSV:
        writer = csv.DictWriter(entriesCSV, fieldnames=["#", "Name", "ID", "Email"])
        writer.writeheader()
        writer.writerows(nonWinningRows)

Issue with Request--downloading an empty image

My requests are downloading the images, but not properly: the images are empty. I know I am close to getting it right, but I am not sure what I am missing. Any help would be greatly appreciated!
# Read testData.csv, download every URL found in parentheses in column 7 and
# append a summary row (selected columns + saved file names) to outfile.csv.
fname ='testData.csv'
# NOTE(review): `s` is built but not used anywhere in this snippet.
s = "dos2unix {}".format(fname)
inner_quote_grabber = re.compile(r'("[^"]*")', flags=re.M)
# Captures the text between "(" and ")".
parenth_grabber = re.compile(r'\(([^)]*)\)', flags=re.M)
new_rows = []
# Column indexes copied from each input row into the output row.
matter = [0, 3,4]
file_counter = 0
file_prefix = 'images/'
file_out = 'outfile.csv'
with open(fname, 'r') as c:
reader = csv.reader(c, delimiter=',')
for row in reader:
t_row = []
#print( len(row), row)
for i in matter:
t_row.append(row[i])
last_row = []
print(row[7])
# Each parenthesised group in column 7 is used as the URL to fetch.
for group in parenth_grabber.findall(row[7]):
print('grabbing ', group)
file_counter += 1
click_clack = file_prefix + 'file_' + str(file_counter) +'.jpg'
print('group:', group)
# NOTE(review): the requests documentation describes reading `Response.raw`
# only for requests made with stream=True; that is not set here and is the
# likely cause of the empty files - confirm.
req = requests.get(group)
print('status_code', req.status_code)
last_row.append(click_clack)
with open(click_clack, 'wb') as f:
req.raw.decode_content = True
shutil.copyfileobj(req.raw, f)
#f.write(text)
# NOTE(review): file_counter is also incremented above, before the file name is built.
file_counter += 1
t_row.append(', '.join(last_row))
with open(file_out, 'a') as f:
writer = csv.writer(f)
writer.writerow(t_row)

Many CSV files (workbooks) are generated; I want them as a single CSV file (one workbook with a single sheet) when web crawling

In my code, many CSV files are being generated. I want them as a single CSV file: the five parties for all years should be written to one CSV file. Basically, I am doing web crawling and trying to create a time series of polls for the CDU, SPD, FDP, GRUNEN and LEFT parties.
import scrapy
import re
import csv
# Scrapy spider that starts at the forsa overview page, follows the per-year
# links found there and writes per-party CSV files for the overview table
# ("Current") and for every year page.
class VoteSpider(scrapy.Spider):
name = 'VoteSpider'
start_urls = ['https://www.wahlrecht.de/umfragen/forsa.htm']
def __init__(self):
# One dict per party; parseTable/parseLink store lists under 'Current' or a year string.
self.CDU = {}
self.SPD = {}
self.FDP = {}
self.Green = {}
self.left = {}
def parse(self, response):
# NOTE(review): `[forsa]+` is a character class (any run of those letters),
# not the literal word "forsa", and the "." before "htm" is unescaped.
regex = r"[forsa]+[\/]+[0-9]+.htm"
# NOTE(review): `#class` / `#href` are not XPath attribute syntax; these were
# presumably `@class` / `@href` in the original post - confirm.
tableBody = response.xpath('//*[#class="wilko"]//tbody')
hxs = scrapy.Selector(response)
all_links = hxs.xpath('*//a/#href').extract()
yearLinks = []
# Keep only the links that match the per-year page pattern.
for link in all_links:
matches = re.search(regex, link, re.MULTILINE)
if matches:
yearLinks.append(link)
# Schedule one request per year page; the relative link travels along in meta['name'].
for link in yearLinks:
newlink = "https://www.wahlrecht.de/umfragen/"+ link
yield scrapy.Request(url = newlink, callback=self.parseLink, meta={'name':link})
self.parseTable(tableBody)
def parseTable(self,tableBody):
CDU= []
SPD = []
FDP= []
Green= []
left= []
rows = tableBody.xpath('//tr')
# Drop the first five rows before reading values.
del rows[:5]
# Text nodes 2..6 of each row are taken as CDU, SPD, Green, FDP and left.
for row in rows:
CDU.append(row.xpath('td//text()')[2].extract())
SPD.append(row.xpath('td//text()')[3].extract())
Green.append(row.xpath('td//text()')[4].extract())
FDP.append(row.xpath('td//text()')[5].extract())
left.append(row.xpath('td//text()')[6].extract())
# NOTE(review): all five files below are written from the CDU list.
with open('CDU'+'Current'+'.csv', 'w') as csvFile:
writer = csv.writer(csvFile)
writer.writerows(CDU)
with open('SPD'+'Current'+'.csv', 'w') as csvFile:
writer = csv.writer(csvFile)
writer.writerows(CDU)
with open('left'+'Current'+'.csv', 'w') as csvFile:
writer = csv.writer(csvFile)
writer.writerows(CDU)
with open('Green'+'Current'+'.csv', 'w') as csvFile:
writer = csv.writer(csvFile)
writer.writerows(CDU)
with open('FDP'+'Current'+'.csv', 'w') as csvFile:
writer = csv.writer(csvFile)
writer.writerows(CDU)
# Store each party's list under the 'Current' key, wrapped in an outer list.
self.CDU['Current'] = []
self.SPD['Current'] = []
self.Green['Current'] = []
self.FDP['Current'] = []
self.left['Current'] = []
self.CDU['Current'].append(CDU)
self.SPD['Current'].append(SPD)
self.Green['Current'].append(Green)
self.FDP['Current'].append(FDP)
self.left['Current'].append(left)
def parseLink(self, response):
CDU= []
SPD = []
FDP= []
Green= []
left= []
name = response.meta.get('name')
# The first run of digits in the link is used as the year.
yearNumber = re.findall('\d+',name)[0]
# x is the number of leading table rows to discard; it stays 0 for years not listed below.
x = 0
if yearNumber == '2007':
x = 4
elif yearNumber == '1998':
x = 3
elif yearNumber == '1999':
x = 3
elif yearNumber == '2000':
x = 3
elif yearNumber == '2001':
x = 3
elif yearNumber == '2002':
x = 3
elif yearNumber == '2003':
x = 3
elif yearNumber == '2004':
x = 3
elif yearNumber == '2005':
x = 5
elif yearNumber == '2006':
x = 3
elif yearNumber == '2008':
x = 4
elif yearNumber == '2013':
x = 4
# NOTE(review): `#class` is presumably `@class` in the original post - confirm.
tableBody = response.xpath('//*[#class="wilko"]//tbody')
rows = tableBody.xpath('//tr')
del rows[:x]
# Text nodes 2..6 of each row are taken as CDU, SPD, Green, FDP and left.
for row in rows:
CDU.append(row.xpath('td//text()')[2].extract())
SPD.append(row.xpath('td//text()')[3].extract())
Green.append(row.xpath('td//text()')[4].extract())
# print(row.xpath('td//text()').extract())
FDP.append(row.xpath('td//text()')[5].extract())
left.append(row.xpath('td//text()')[6].extract())
# One file per party and year; NOTE(review): all five are written from the CDU list.
with open('CDU'+yearNumber+'.csv', 'w') as csvFile:
writer = csv.writer(csvFile)
writer.writerows(CDU)
with open('SPD'+yearNumber+'.csv', 'w') as csvFile:
writer = csv.writer(csvFile)
writer.writerows(CDU)
with open('left'+yearNumber+'.csv', 'w') as csvFile:
writer = csv.writer(csvFile)
writer.writerows(CDU)
with open('Green'+yearNumber+'.csv', 'w') as csvFile:
writer = csv.writer(csvFile)
writer.writerows(CDU)
with open('FDP'+yearNumber+'.csv', 'w') as csvFile:
writer = csv.writer(csvFile)
writer.writerows(CDU)
# Store each party's list under the year key, wrapped in an outer list.
self.CDU[yearNumber]= []
self.SPD[yearNumber] = []
self.Green[yearNumber] = []
self.FDP[yearNumber] = []
self.left[yearNumber] = []
self.CDU[yearNumber].append(CDU)
self.SPD[yearNumber].append(SPD)
self.Green[yearNumber].append(Green)
self.FDP[yearNumber].append(FDP)
self.left[yearNumber].append(left)
The expected output is a single CSV file containing the CDU, SPD, GRUNEN, FDP and LEFT values for all years.
Instead of opening multiple files, you can append to a single file, like so:
...
# Append to one shared file for every year and party, so repeated calls keep
# adding to the same sheet instead of creating a new file per year.
with open('ALL.csv', 'a', newline='') as csvFile:
    writer = csv.writer(csvFile)
    # Each value becomes its own "year, party, value" row. Handing a bare list
    # of strings to writerows() would split every string into characters.
    for party, values in (('CDU', CDU), ('SPD', SPD), ('left', left),
                          ('Green', Green), ('FDP', FDP)):
        writer.writerows([yearNumber, party, value] for value in values)
...

Redundant instructions in two conditions

I want to optimize the following code:
# For every file: client "C2" only reads the first two CSV lines; the other
# clients walk `rows`, taking the header from the first non-empty row and
# passing every later non-empty row to self.usecases.
# NOTE(review): `filename`, `rows`, `u_pass`, `data` and `index` are not defined in this excerpt.
for myFile in myFiles:
# NOTE(review): opens `filename`, not the loop variable `myFile` - confirm.
file = open(filename, 'rt')
# NOTE(review): no except/finally clause is shown for this try in the excerpt.
try:
if CLIENT == "C1":
head = rows[:7]
tail = rows[7:]
for row in rows:
# Skip rows whose cells are all empty strings.
if "".join(row)!= "":
if not u_pass:
# The inner `row` of the comprehension iterates over the cells of the outer `row`.
header = [ row.strip().replace(" ", "_") for row in row[3:] ]
u_pass = True
else:
self.usecases(row, data, index)
elif CLIENT == 'C2':
reader = csv.reader(file)
firstline = next(reader)
secondline = next(reader)
else:
for row in rows:
if "".join(row)!= "":
if not u_pass:
header = [ row.strip().replace(" ", "_") for row in row[3:] ]
u_pass = True
# Data retrieval
else:
self.usecases(row, data, index)
The code below appears twice in the previous code, i.e. the two branches share these common instructions:
# The duplicated part: the first non-empty row supplies the header (cells from
# index 3 on, stripped, spaces replaced by "_"); later non-empty rows go to self.usecases.
for row in rows:
if "".join(row)!= "":
if not u_pass:
header = [ row.strip().replace(" ", "_") for row in row[3:] ]
u_pass = True
else:
self.usecases(row, data, index)
Below should do the trick:
# Client "C2" only reads the first two CSV lines. Every other client shares
# one row loop; "C1" additionally splits `rows` into head/tail beforehand.
for myFile in myFiles:
    # NOTE(review): opens `filename`, not the loop variable `myFile` - confirm.
    file = open(filename, 'rt')
    try:
        if CLIENT == "C2":
            reader = csv.reader(file)
            firstline = next(reader)
            secondline = next(reader)
        else:
            if CLIENT == "C1":
                head = rows[:7]
                tail = rows[7:]
            for row in rows:
                if "".join(row) == "":
                    # Row with only empty cells: nothing to do.
                    continue
                if not u_pass:
                    # First non-empty row: build the header from cell 3 onwards.
                    header = [cell.strip().replace(" ", "_") for cell in row[3:]]
                    u_pass = True
                else:
                    self.usecases(row, data, index)
    finally:
        # Release the handle even when processing a file fails.
        file.close()

My function to extract totals is exhausting my input file for future reading

The client includes 3 rows at the bottom that contain totals for me to reconcile against in my program. Only problem is that my program is exhausting the input file with readlines() before it can do anything else. Is there a way to keep the file from being exhausted during my get_recon_total function call?
#!/usr/bin/env python
# pre_process.py
import csv
import sys
# Entry point: reads the input CSV named by argv[1], groups its rows and
# writes the processed output to argv[2], printing both sets of counts.
def main():
infile = sys.argv[1]
outfile = sys.argv[2]
with open(infile, 'rbU') as in_obj:
# Create reader object, get fieldnames for later on
reader, fieldnames = open_reader(in_obj)
# get_recon_totals() calls readlines() on the same file object that `reader`
# wraps, so the file is read to the end before `reader` is iterated below.
nav_tot_cnt, nav_rec_cnt, nav_erec_cnt = get_recon_totals(in_obj)
print nav_tot_cnt, nav_rec_cnt, nav_erec_cnt
# This switches the dictionary to a sorted list... necessary??
reader_list = sorted(reader, key=lambda key: (key['PEOPLE_ID'],
key['DON_DATE']))
# Create a list to contain section header information
header_list = create_header_list(reader_list)
# Create dictionary that contains header list as the key,
# then all rows that match as a list of dictionaries.
master_dict = map_data(header_list, reader_list)
# Write data to processed file, create recon counts to compare
# to footer record
tot_cnt, rec_cnt, erec_cnt = write_data(master_dict, outfile, fieldnames)
print tot_cnt, rec_cnt, erec_cnt
def open_reader(file_obj):
'''
Wrap an open, comma-delimited file in a csv.DictReader.

The first line of the file supplies the fieldnames, which are then
applied to every following row.

Returns a tuple of the DictReader object and its fieldnames (the
fieldnames are used later when data is written out with DictWriter).
'''
reader = csv.DictReader(file_obj, delimiter=',')
return reader, reader.fieldnames
def create_header_list(in_obj):
    '''
    Return the distinct (PEOPLE_ID, DON_DATE) pairs found in in_obj.

    in_obj is an iterable of row dictionaries. The pairs are returned as a
    list of tuples in order of first appearance.
    '''
    p_id_list = []
    # The set answers "already seen?" in constant time; the list keeps order.
    seen = set()
    for row in in_obj:
        pair = (row['PEOPLE_ID'], row['DON_DATE'])
        if pair not in seen:
            seen.add(pair)
            p_id_list.append(pair)
    return p_id_list
def map_data(header_list, data_obj):
    '''
    Group the rows of data_obj under an extended header key.

    header_list holds (PEOPLE_ID, DON_DATE) pairs and data_obj is an iterable
    of row dictionaries. For every pair that has rows, the key is the pair
    followed by DEDUCT_AMT, ND_AMT, DEDUCT_YTD and NONDEDUCT_YTD of the first
    matching row, plus the float sum of the two YTD values when both parse as
    numbers.

    Returns a dictionary mapping each key tuple to its list of rows, in their
    original order. Pairs without any matching row are left out.
    '''
    # Group all rows in a single pass instead of rescanning them per header.
    sections = {}
    for row in data_obj:
        sections.setdefault((row['PEOPLE_ID'], row['DON_DATE']), []).append(row)

    master_dict = {}
    for element in header_list:
        client_section_list = sections.get(tuple(element))
        if not client_section_list:
            continue
        first = client_section_list[0]
        key = list(element)
        key.extend([first['DEDUCT_AMT'],
                    first['ND_AMT'],
                    first['DEDUCT_YTD'],
                    first['NONDEDUCT_YTD']])
        try:
            key.append(float(first['DEDUCT_YTD']) + float(first['NONDEDUCT_YTD']))
        except ValueError:
            # Non-numeric YTD values: the key simply has no total component.
            pass
        master_dict[tuple(key)] = client_section_list
    return master_dict
# Write every section of in_obj (key tuple -> list of row dicts) to outfile
# and return the counters (tot_cnt, rec_cnt, email_cnt).
def write_data(in_obj, outfile, in_fieldnames):
with open(outfile, 'wb') as writer_outfile:
# Two writers share the file: `writer` for the key tuples, `dict_writer` for the row dicts.
writer = csv.writer(writer_outfile, delimiter=',')
dict_writer = csv.DictWriter(writer_outfile,
fieldnames=in_fieldnames,
extrasaction='ignore')
tot_cnt = 0
rec_cnt = 0
email_cnt = 0
for k, v in in_obj.iteritems():
# Section header line: ' -01- ' marker followed by the key tuple.
writer_outfile.write(' -01- ')
writer.writerow(k)
rec_cnt += 1
for i, e in enumerate(v):
# Counted when either INT_CODE_EX0006 or INT_CODE_EX0028 is non-empty.
if v[i]['INT_CODE_EX0006'] != '' or v[i]['INT_CODE_EX0028'] != '':
email_cnt += 1
# Detail line: ' -02- ' marker followed by the row itself.
writer_outfile.write(' -02- ')
dict_writer.writerow(e)
tot_cnt += 1
return tot_cnt, rec_cnt, email_cnt
# Scan the lines of the open file in_obj for the rows whose first field is
# 'T' and return the third field of the 'Total Amount', 'Receipt Count' and
# 'Email Receipt Count' rows (0 for any row that is not found).
def get_recon_totals(in_obj):
print in_obj
client_tot_cnt = 0
client_rec_cnt = 0
client_erec_cnt = 0
# readlines() reads in_obj through to the end of the file.
for line in in_obj.readlines():
# Plain comma split; the csv module is not used for these lines.
line = line.split(',')
if line[0] == 'T' and line[1] == 'Total Amount':
print 'Total Amount found.'
client_tot_cnt = line[2]
elif line[0] == 'T' and line[1] == 'Receipt Count':
print 'Receipt Count found.'
client_rec_cnt = line[2]
elif line[0] == 'T' and line[1] == 'Email Receipt Count':
print 'E-Receipt Count Found.'
client_erec_cnt = line[2]
return client_tot_cnt, client_rec_cnt, client_erec_cnt
if __name__ == '__main__':
main()
If your file is not very large, you can convert the reader generator to a list of dictionaries by calling list() on reader, and then use that list in your code instead of trying to read from the file directly.
Example -
# Variant of main() that reads all rows into a list once and hands that list
# to get_recon_totals() instead of the file object.
def main():
infile = sys.argv[1]
outfile = sys.argv[2]
with open(infile, 'rbU') as in_obj:
# Create reader object, get fieldnames for later on
reader, fieldnames = open_reader(in_obj)
# Materialise every row up front; the steps below work on this list, not on the file.
reader_list = list(reader)
nav_tot_cnt, nav_rec_cnt, nav_erec_cnt = get_recon_totals(reader_list)
print nav_tot_cnt, nav_rec_cnt, nav_erec_cnt
# This switches the dictionary to a sorted list... necessary??
reader_list = sorted(reader_list, key=lambda key: (key['PEOPLE_ID'],
key['DON_DATE']))
.
.
def get_recon_totals(reader_list):
print in_obj
client_tot_cnt = 0
client_rec_cnt = 0
client_erec_cnt = 0
for line in reader_list: #line here is a dict
if line[<fieldname for first column>] == 'T' and line[<fieldname for secondcolumn>] == 'Total Amount':
print 'Total Amount found.'
client_tot_cnt = line[<fieldname for third column>]
.
. #continued like above
.
return client_tot_cnt, client_rec_cnt, client_erec_cnt

Categories