Deleting all rows with the same email in csv file - python

I'd like to ask, how to delete all rows with the same email in csv file? Idea is to pick random rows, and if that row picked and printed out, delete all rows with it's email from file. Now code deletes only rows that are picked and printed. But if there were more rows with the same email it wouldn't delete them. How to fix it?
Full code: https://pastebin.com/qzHm4NSA
Data structure: https://ibb.co/wWXfL6X
def generate():
global winningRows
filename = enterFile()
noOfWinners = 5
winningNumbers = []
while len(winningNumbers) < noOfWinners:
luckyNumber = random.randint(1, totalEntries)
if luckyNumber not in winningNumbers:
winningNumbers.append(luckyNumber)
with open(filename, newline='\n') as entriesCSV:
entriesDict = csv.DictReader(entriesCSV,dialect="excel")
allRows = [row for row in entriesDict]
winningRows = [row for row in allRows if int(row["#"]) in winningNumbers]
nonWinningRows = [row for row in allRows if int(row["#"]) not in winningNumbers]
for row in winningRows:
winnerName = row["Name"]
winnerID = row["ID"]
winnerEmail = row["Email"]
print(f"The winner is {winnerName}, ID {winnerID}, email {winnerEmail}")
with open(filename, "w", newline='\n') as entriesCSV:
writer = csv.DictWriter(entriesCSV, fieldnames=["#", "Name", "ID", "Email"])
writer.writeheader()
writer.writerows(nonWinningRows)

Maintain a list of emails of the picked winners and then use it to filter out rows of non winners. For that,
Just modify the code segment as follows and that will solve your problem:
def generate():
global winningRows
filename = enterFile()
noOfWinners = 5
winningNumbers = []
nonWinningRows = []
winnerEmails = [] #change 1
while len(winningNumbers) < noOfWinners:
luckyNumber = random.randint(1, totalEntries)
if luckyNumber not in winningNumbers:
winningNumbers.append(luckyNumber)
with open(filename, newline='\n') as entriesCSV:
entriesDict = csv.DictReader(entriesCSV,dialect="excel")
allRows = [row for row in entriesDict]
winningRows = [row for row in allRows if int(row["#"]) in winningNumbers]
for row in winningRows:
if row["Email"] not in winnerEmails: #change 2
winnerName = row["Name"]
winnerID = row["ID"]
winnerEmail = row["Email"]
print(f"The winner is {winnerName}, ID {winnerID}, email {winnerEmail}")
winnerEmails.append(winnerEmail) #change 3
nonWinningRows = [row for row in allRows if int(row["#"]) not in winningNumbers and row["Email"] not in winnerEmails] #change 4
with open(filename, "w", newline='\n') as entriesCSV:
writer = csv.DictWriter(entriesCSV, fieldnames=["#", "Name", "ID", "Email"])
writer.writeheader()
writer.writerows(nonWinningRows)

Related

Slice data frame without pandas based on user input

I'm trying to end the code, but I have problem how to slice data frame based on user's input. Is there any option to do this without pandas?
def dataSet_read():
enter = input('Enter file path:')
csvreader = csv.reader(open(enter))
head_inp = input('Has the file headers? Select Y or N:\n').upper()
header = []
if head_inp == 'Y':
header = next(csvreader)
print('\nFile headers:\n\n', header)
elif head_inp == 'N':
print("'\nFile doesn't have headers")
else:
print('Incorrect selection!!!')
sys.exit()
with open(str(enter), "r") as csvfile:
reader_variable = csv.reader(csvfile, delimiter = ",")
rows_inp = input("\nPlease provide range which you'd like to see using ',', otherwise all dataframe will open all dataset.\n")
if rows_inp == '':
for row in reader_variable:
print(row)
else:
print("????")
cast it to list then you can slice like what it is in normal list structure.
enter = input('Enter file path:')
rows_inp = input("slice")
with open(enter , 'r') as f:
reader_variable = csv.reader(f)
reader_list= list(reader_variable)
for row in reader_list[:rows_inp]:#if you want slice the whole data
current_date = row[:rows_inp] #if you want slice per row
print(current_date)
I found the way to get what I need, maybe it's not the best approach but works :)
with open(str(enter), "r") as csvfile:
reader_variable = csv.reader(csvfile, delimiter = ",")
rows_inp = input("\nPlease provide range which you'd like to see using ',', otherwise all dataframe will open all dataset.\n")
if rows_inp == '':
for row in reader_variable:
print(row)
else:
i, j = rows_inp.split(',')
reader_list = list(reader_variable)
print(reader_list[int(i):int(j)+1])

Dictionary from csv file to group different names in a column

I am doing an assignment where I have a csvfile and I have to generate matplot lib graphs. I have a column full of genres of different movies, with the genres seperated by | (vertical bar). I have to create a dictionary of these genres, once, without repeating, to assign them to the appropriate movies. How will I go about doing that?
this is what I have so far:
import csv
from matplotlib import pyplot as plt
dp = open("C:\Users\jayan\OneDrive\Desktop\IMDB_movie_metadata_for_assignment_6.csv",'r', encoding='utf8').read()
with open("C:\Users\jayan\OneDrive\Desktop\IMDB_movie_metadata_for_assignment_6.csv", errors = 'ignore') as csvfile:
for line in csvfile:
fields = line.split(",")
newField = (fields[4]).split("|")
newerField = fields[16].strip()
movies = (fields[0])
genre_dictionary = {tuple(newField):(movies)}
print(genre_dictionary)
I will suppose your csv has two columns: genres and movies. Tell me if it's not the case. You can do something like:
def find_col_ind(columns):
ind = -1
ind_col_genres = -1
ind_col_movie = -1
for col in columns:
ind += 1
if col == 'movie_title':
ind_col_movie = ind
elif col == 'genres':
ind_col_genres = ind
return ind_col_genres, ind_col_movie
def create_dict(filename):
with open(filename, 'r') as csvfile:
data = dict()
is_first = True
for line in csvfile:
columns = line.split(',')
if is_first:
ind_col_genres, ind_col_movie = find_col_ind(columns)
is_first = False
continue
genres = columns[ind_col_genres].split('|')
movie = columns[ind_col_movie]
for genre in genres:
if genre in data:
data[genre].append(movie.strip('\nÂ\xa0 '))
else:
data[genre] = [movie.strip('\nÂ\xa0 ')]
return data
if __name__ == "__main__":
data = create_dict('test.csv')
print(data)

Is there a way to read and alter the contents of a huge csv file in PyCharm?

I'm attempting to create a program currently that can read a csv, determine if a substring is included in one of the columns of each row, and if it isn't present, rewrites certain columns to a new csv. I have the code down for this much- but the csv I need to use the program for has well over 3 million rows. I use PyCharm and currently I'm not able to process this much data. It can only view the csv in a read-only format which doesn't allow me to use it. I know pandas has a chunk size feature but I don't know how to implement this with the rest of my code.
def reading(csv_input):
originalLength = 0
rowCount = 0
with open(f'Web Report {csv_input}', 'w') as file:
writer = csv.writer(file)
writer.writerow(['Index', 'URL Category', 'User IP', 'URL'])
dropCount = 0
data = pd.read_csv(csv_input, chunksize=100000)
df = pd.DataFrame(data,
columns=['Line', 'Date', 'Hour', 'User Name', 'User IP', 'Site Name',
'URL Category', 'Action', 'Action Description'])
originalLength = len(df.index)
for line in range(originalLength):
dataLine = df.loc[line]
x = dataLine.get(key='Action')
if x == 0:
siteName = dataLine.get(key='Site Name')
if 'dbk' in siteName:
dropCount = dropCount + 1
elif 'ptc' in siteName:
dropCount = dropCount + 1
elif 'wcf' in siteName:
dropCount = dropCount + 1
elif 'google' in siteName:
dropCount = dropCount + 1
else:
writer.writerow([line, # Original Index
df.loc[line].get(key='URL Category'), # Original URL Category
df.loc[line].get(key='User IP'), # Original User IP
df.loc[line].get(key='Site Name')]) # Original Site Name
rowCount = rowCount + 1
else:
dropCount = dropCount + 1
file.close()
print("Input: " + str(csv_input))
print("Output: " + str(file.name))
print("Original Length: " + str(originalLength))
print("Current Length: " + str(rowCount))
print("Drop Count: " + str(dropCount) + "\n")
return df
If you use csv to write file then you could use it also to read row by row.
import csv
with open('input.csv') as infile, open('output.csv', 'w') as outfile:
csv_reader = csv.reader(infile)
csv_writer = csv.writer(outfile)
# copy headers
headers = next(csv_reader)
csv_writer.writerow(headers)
# process rows
for row in csv_reader: # read row by row
# keep only rows with even index
if int(row[0]) % 2 == 0:
print('--- row ---')
print(row)
csv_writer.writerow(row)
If you want to use pandas with chunk then you should use for-loop for this.
And when you write with pandas then you need append mode without headers.
import pandas as pd
first = True
for df in pd.read_csv('input.csv', chunksize=1): # read row by row
# keep only rows with even index
if df.index % 2 == 0:
print('--- row ---')
print(df)
if first:
# create new file with headers
df.to_csv('output.csv', mode='w')
first = False
else:
# append to existing file without headers
df.to_csv('output.csv', mode='a', header=False)
Minimal working code
import pandas as pd
import csv
# --- create some data ---
data = {
'A': range(0,10),
'B': range(10,20),
'C': range(20,30),
} # columns
df = pd.DataFrame(data)
df.to_csv('input.csv', index=False)
# --- read and write with `pandas` ---
first = True
for df in pd.read_csv('input.csv', chunksize=1): # read row by row
# keep only rows with even index
if df.index % 2 == 0:
print('--- row ---')
print(df)
if first:
# create empty with headers
df.to_csv('output_pandas.csv', mode='w')
first = False
else:
# append to existing file without headers
df.to_csv('output_pandas.csv', mode='a', header=False)
# --- read and write with `csv` ---
with open('input.csv') as infile, open('output.csv', 'w') as outfile:
csv_reader = csv.reader(infile)
csv_writer = csv.writer(outfile)
# copy headers
headers = next(csv_reader)
csv_writer.writerow(headers)
# process rows
for row in csv_reader:
# keep only rows with even index
if int(row[0]) % 2 == 0:
print('--- row ---')
print(row)
csv_writer.writerow(row)
Doc: read_csv(), to_csv()

Many CSV files (workbooks) are generated.I want them as a single CSV file ( under one single workbook in a single sheet).( in web crawling)

In my code many CSV files are being generated. I want them as a single csv file. The five parties of all years has to be executed under one csv file. Basically I am trying to do web crawling and trying to create a time series of polls for those (CDU, SPD, FDP, GRUNEN, LEFT)parties.
import scrapy
import re
import csv
class VoteSpider(scrapy.Spider):
name = 'VoteSpider'
start_urls = ['https://www.wahlrecht.de/umfragen/forsa.htm']
def __init__(self):
self.CDU = {}
self.SPD = {}
self.FDP = {}
self.Green = {}
self.left = {}
def parse(self, response):
regex = r"[forsa]+[\/]+[0-9]+.htm"
tableBody = response.xpath('//*[#class="wilko"]//tbody')
hxs = scrapy.Selector(response)
all_links = hxs.xpath('*//a/#href').extract()
yearLinks = []
for link in all_links:
matches = re.search(regex, link, re.MULTILINE)
if matches:
yearLinks.append(link)
for link in yearLinks:
newlink = "https://www.wahlrecht.de/umfragen/"+ link
yield scrapy.Request(url = newlink, callback=self.parseLink, meta={'name':link})
self.parseTable(tableBody)
def parseTable(self,tableBody):
CDU= []
SPD = []
FDP= []
Green= []
left= []
rows = tableBody.xpath('//tr')
del rows[:5]
for row in rows:
CDU.append(row.xpath('td//text()')[2].extract())
SPD.append(row.xpath('td//text()')[3].extract())
Green.append(row.xpath('td//text()')[4].extract())
FDP.append(row.xpath('td//text()')[5].extract())
left.append(row.xpath('td//text()')[6].extract())
with open('CDU'+'Current'+'.csv', 'w') as csvFile:
writer = csv.writer(csvFile)
writer.writerows(CDU)
with open('SPD'+'Current'+'.csv', 'w') as csvFile:
writer = csv.writer(csvFile)
writer.writerows(CDU)
with open('left'+'Current'+'.csv', 'w') as csvFile:
writer = csv.writer(csvFile)
writer.writerows(CDU)
with open('Green'+'Current'+'.csv', 'w') as csvFile:
writer = csv.writer(csvFile)
writer.writerows(CDU)
with open('FDP'+'Current'+'.csv', 'w') as csvFile:
writer = csv.writer(csvFile)
writer.writerows(CDU)
self.CDU['Current'] = []
self.SPD['Current'] = []
self.Green['Current'] = []
self.FDP['Current'] = []
self.left['Current'] = []
self.CDU['Current'].append(CDU)
self.SPD['Current'].append(SPD)
self.Green['Current'].append(Green)
self.FDP['Current'].append(FDP)
self.left['Current'].append(left)
def parseLink(self, response):
CDU= []
SPD = []
FDP= []
Green= []
left= []
name = response.meta.get('name')
yearNumber = re.findall('\d+',name)[0]
x = 0
if yearNumber == '2007':
x = 4
elif yearNumber == '1998':
x = 3
elif yearNumber == '1999':
x = 3
elif yearNumber == '2000':
x = 3
elif yearNumber == '2001':
x = 3
elif yearNumber == '2002':
x = 3
elif yearNumber == '2003':
x = 3
elif yearNumber == '2004':
x = 3
elif yearNumber == '2005':
x = 5
elif yearNumber == '2006':
x = 3
elif yearNumber == '2008':
x = 4
elif yearNumber == '2013':
x = 4
tableBody = response.xpath('//*[#class="wilko"]//tbody')
rows = tableBody.xpath('//tr')
del rows[:x]
for row in rows:
CDU.append(row.xpath('td//text()')[2].extract())
SPD.append(row.xpath('td//text()')[3].extract())
Green.append(row.xpath('td//text()')[4].extract())
# print(row.xpath('td//text()').extract())
FDP.append(row.xpath('td//text()')[5].extract())
left.append(row.xpath('td//text()')[6].extract())
with open('CDU'+yearNumber+'.csv', 'w') as csvFile:
writer = csv.writer(csvFile)
writer.writerows(CDU)
with open('SPD'+yearNumber+'.csv', 'w') as csvFile:
writer = csv.writer(csvFile)
writer.writerows(CDU)
with open('left'+yearNumber+'.csv', 'w') as csvFile:
writer = csv.writer(csvFile)
writer.writerows(CDU)
with open('Green'+yearNumber+'.csv', 'w') as csvFile:
writer = csv.writer(csvFile)
writer.writerows(CDU)
with open('FDP'+yearNumber+'.csv', 'w') as csvFile:
writer = csv.writer(csvFile)
writer.writerows(CDU)
self.CDU[yearNumber]= []
self.SPD[yearNumber] = []
self.Green[yearNumber] = []
self.FDP[yearNumber] = []
self.left[yearNumber] = []
self.CDU[yearNumber].append(CDU)
self.SPD[yearNumber].append(SPD)
self.Green[yearNumber].append(Green)
self.FDP[yearNumber].append(FDP)
self.left[yearNumber].append(left)
I want the expected output to be as, all CDU, SPD, GRUNEN, FDP, LEFT parties of all years under one CSV file
Instead of opening multiple files, you can append to a single file, like so:
...
with open('ALL'+yearNumber+'.csv', 'a+') as csvFile:
writer = csv.writer(csvFile)
writer.writerows(CDU)
writer.writerows(SPD)
writer.writerows(left)
writer.writerows(Green)
writer.writerows(FDP)
...

Sum of a particular column in a csv file

There is a csv file, say A.csv, having content:
Place,Hotel,Food,Fare
Norway,Regal,NonVeg,5000
Poland,Jenny,Italiano,6000
Norway,Suzane,Vegeterian,4000
Norway,Regal,NonVeg,5000
I have to parse this csv and obtain an output by passing arguments in command prompt.
Example 1:
mycode.py Place
Desired output is:
Place,Fare
Norway,14000
Poland,6000
Example 2:
mycode.py Place Hotel
Desired output is:
Place,Hotel,Fare
Norway,Regal,10000
Poland,Jenny,6000
Norway,Suzane,4000
So it is clear from the above example that no matter what you pass as argument it gives you the sum of the Fare header for the common ones.
Below is my code and I am able to pass arguments and get an output, but I am stuck in sum of Fare. Can any one help me with this.
import sys
import csv
import collections
d = collections.defaultdict(list)
Data = []
Result = []
Final = []
Argvs = []
argv_len = len(sys.argv)
index = 0
input = ''
file = open('A.csv', 'rb')
try:
reader = csv.reader(file)
for row in reader:
Data.append(row)
for x in range(1, argv_len):
Argvs.append(sys.argv[x])
Argvs.append('Fare')
for input in Argvs:
for y in range(0, len(Data[0])):
if(input == Data[0][y]):
for z in range(1, len(Data)):
Result.append(Data[z][y])
break
Final.append(Result)
Result = []
New = []
NewFinal = []
for x in range(0, len(Final[0])):
for y in range(0, len(Final)):
New.append(Final[y][x])
NewFinal.append(New)
New = []
out = {}
for a in NewFinal:
out.setdefault(a[0],[]).append(int(a[-1]))
with open("output.csv", "wb") as csv_file:
writer = csv.writer(csv_file, dialect='excel', delimiter=',')
writer.writerow(Argvs)
for k,v in out.iteritems():
writer.writerow((k,sum(v)))
except Exception,e:
print str(e)
finally:
file.close()
I edit the code and tried to group it. Now I am able to get the aggregate of the Fare but not the desired output.
So when I am passing:
mycode.py Place Hotel
Instead of:
Place,Hotel,Fare
Norway,Regal,10000
Poland,Jenny,6000
Norway,Suzane,4000
I am getting:
Place,Hotel,Fare
Norway,14000
Poland,6000
Finally i managed to get my desired output.
Below i am sharing the final code. \
import sys
import csv
Data = []
Result = []
Final = []
Argvs = []
argv_len = len(sys.argv)
index = 0
input = ''
file = open('A.csv', 'rb')
try:
reader = csv.reader(file)
for row in reader:
Data.append(row)
for x in range(1, argv_len):
Argvs.append(sys.argv[x])
Argvs.append('Fare')
for input in Argvs:
for y in range(0, len(Data[0])):
if(input == Data[0][y]):
for z in range(1, len(Data)):
Result.append(Data[z][y])
break
Final.append(Result)
Result = []
New = []
NewFinal = []
for x in range(0, len(Final[0])):
for y in range(0, len(Final)):
New.append(Final[y][x])
NewFinal.append(New)
New = []
out = {}
for a in NewFinal:
count_val = a[-1]
del a[-1]
key_val = ','.join(a)
out.setdefault(key_val.strip('"'),[]).append(int(count_val))
with open("output.csv", "wb") as csv_file:
writer = csv.writer(csv_file, delimiter=',',quotechar=' ')
writer.writerow(Argvs)
for k,v in out.iteritems():
writer.writerow((k,sum(v)))
except Exception,e:
print str(e)
finally:
file.close()

Categories