Dictionary from csv file to group different names in a column - python

I am doing an assignment where I have a csvfile and I have to generate matplot lib graphs. I have a column full of genres of different movies, with the genres seperated by | (vertical bar). I have to create a dictionary of these genres, once, without repeating, to assign them to the appropriate movies. How will I go about doing that?
this is what I have so far:
import csv
from matplotlib import pyplot as plt
dp = open("C:\Users\jayan\OneDrive\Desktop\IMDB_movie_metadata_for_assignment_6.csv",'r', encoding='utf8').read()
with open("C:\Users\jayan\OneDrive\Desktop\IMDB_movie_metadata_for_assignment_6.csv", errors = 'ignore') as csvfile:
for line in csvfile:
fields = line.split(",")
newField = (fields[4]).split("|")
newerField = fields[16].strip()
movies = (fields[0])
genre_dictionary = {tuple(newField):(movies)}
print(genre_dictionary)

I will suppose your csv has two columns: genres and movies. Tell me if it's not the case. You can do something like:
def find_col_ind(columns):
ind = -1
ind_col_genres = -1
ind_col_movie = -1
for col in columns:
ind += 1
if col == 'movie_title':
ind_col_movie = ind
elif col == 'genres':
ind_col_genres = ind
return ind_col_genres, ind_col_movie
def create_dict(filename):
with open(filename, 'r') as csvfile:
data = dict()
is_first = True
for line in csvfile:
columns = line.split(',')
if is_first:
ind_col_genres, ind_col_movie = find_col_ind(columns)
is_first = False
continue
genres = columns[ind_col_genres].split('|')
movie = columns[ind_col_movie]
for genre in genres:
if genre in data:
data[genre].append(movie.strip('\nÂ\xa0 '))
else:
data[genre] = [movie.strip('\nÂ\xa0 ')]
return data
if __name__ == "__main__":
data = create_dict('test.csv')
print(data)

Related

Deleting all rows with the same email in csv file

I'd like to ask, how to delete all rows with the same email in csv file? Idea is to pick random rows, and if that row picked and printed out, delete all rows with it's email from file. Now code deletes only rows that are picked and printed. But if there were more rows with the same email it wouldn't delete them. How to fix it?
Full code: https://pastebin.com/qzHm4NSA
Data structure: https://ibb.co/wWXfL6X
def generate():
global winningRows
filename = enterFile()
noOfWinners = 5
winningNumbers = []
while len(winningNumbers) < noOfWinners:
luckyNumber = random.randint(1, totalEntries)
if luckyNumber not in winningNumbers:
winningNumbers.append(luckyNumber)
with open(filename, newline='\n') as entriesCSV:
entriesDict = csv.DictReader(entriesCSV,dialect="excel")
allRows = [row for row in entriesDict]
winningRows = [row for row in allRows if int(row["#"]) in winningNumbers]
nonWinningRows = [row for row in allRows if int(row["#"]) not in winningNumbers]
for row in winningRows:
winnerName = row["Name"]
winnerID = row["ID"]
winnerEmail = row["Email"]
print(f"The winner is {winnerName}, ID {winnerID}, email {winnerEmail}")
with open(filename, "w", newline='\n') as entriesCSV:
writer = csv.DictWriter(entriesCSV, fieldnames=["#", "Name", "ID", "Email"])
writer.writeheader()
writer.writerows(nonWinningRows)
Maintain a list of emails of the picked winners and then use it to filter out rows of non winners. For that,
Just modify the code segment as follows and that will solve your problem:
def generate():
global winningRows
filename = enterFile()
noOfWinners = 5
winningNumbers = []
nonWinningRows = []
winnerEmails = [] #change 1
while len(winningNumbers) < noOfWinners:
luckyNumber = random.randint(1, totalEntries)
if luckyNumber not in winningNumbers:
winningNumbers.append(luckyNumber)
with open(filename, newline='\n') as entriesCSV:
entriesDict = csv.DictReader(entriesCSV,dialect="excel")
allRows = [row for row in entriesDict]
winningRows = [row for row in allRows if int(row["#"]) in winningNumbers]
for row in winningRows:
if row["Email"] not in winnerEmails: #change 2
winnerName = row["Name"]
winnerID = row["ID"]
winnerEmail = row["Email"]
print(f"The winner is {winnerName}, ID {winnerID}, email {winnerEmail}")
winnerEmails.append(winnerEmail) #change 3
nonWinningRows = [row for row in allRows if int(row["#"]) not in winningNumbers and row["Email"] not in winnerEmails] #change 4
with open(filename, "w", newline='\n') as entriesCSV:
writer = csv.DictWriter(entriesCSV, fieldnames=["#", "Name", "ID", "Email"])
writer.writeheader()
writer.writerows(nonWinningRows)

Python lists from csv file

I am working on an assignment for plotting graphs from a csv file with movies and they have different categories to them like genres, budget, director's name, etc. We are asked to put the genres (key) and movies (values) in a dictionary and then put the other categories into a list or a tuple to match the movies. I have already created the dictionary, by genres that match movies. How would I go about creating lists of the other categories and match them to the movies to graph?
This is what I have so far:
def find_col_ind(columns):
ind = -1
ind_col_genres = -1
ind_col_movie = -1
for col in columns:
ind += 1
if col == 'movie_title':
ind_col_movie = ind
elif col == 'genres':
ind_col_genres = ind
return ind_col_genres, ind_col_movie
def create_dict(filename):
with open('C:\\Users\\jayan\\OneDrive\\Desktop\\IMDB_movie_metadata_for_assignment_6.csv', 'r',errors='ignore') as csvfile:
data = dict()
is_first = True
for line in csvfile:
columns = line.split(',')
if is_first:
ind_col_genres, ind_col_movie = find_col_ind(columns)
is_first = False
continue
genres = columns[ind_col_genres].split('|')
movie = columns[ind_col_movie]
for genre in genres:
if genre in data:
data[genre].append(movie.strip('\nÂ\xa0 '))
else:
data[genre] = [movie.strip('\nÂ\xa0 ')]
return data
if __name__ == "__main__":
data = create_dict("C:\\Users\\jayan\\OneDrive\\Desktop\\IMDB_movie_metadata_for_assignment_6.csv")
print(data)

How to pick values for specific times in a date (large list of date, time, value)

I have a file with these columns: date, times, and value of a stock. Basically, per-minute value of stocks. I would like to calculate the difference in the value of a stock at 10 AM and 4 PM. This is the code I have so far:
fileName = "C:\\...\\US_200901_210907.csv"
with open(fileName) as f:
for line in f.readlines()[1:]:
split = line.split(";")
time = split[3]
date = split[2]
for timev in f.readlines()[1:]:
if timev == '100000':
Spot = float(split[2])
elif timev == '160000':
Close = float(split[2])
Diff = Spot - Close
print(Diff)
I am not sure if I am doing this right. But the code needs to cycle/loop through each date first, find the value of the stock at '100000' and '160000' and then calculate the difference between the two. Then move to the next day. And at the end of all days, print the differences for each day.
The "Diff = Spot - Close" line also gives me an error, says "NameError: name 'Spot' is not defined"
Any help is appreciated.
Dataset looks like this (extract):
====================
After working more on this on my own, I was able to get this to work:
import csv
filename = "C:\\...\\US_200901_210907.csv"
with open(filename, 'r') as f:
reader = csv.reader(f, delimiter=';')
next(reader, None) # skip header
rows = list(reader)
listOfDates = []
index = 0
for row in rows:
if rows[index][2] not in listOfDates:
listOfDates.append(rows[index][2])
index = index + 1
print(listOfDates)
startPrice = 0
endPrice = 0
index = 0
startPriceSet = False
endPriceSet = False
for date in listOfDates:
for row in rows:
if rows[index][2] == date:
# print(rows[index][2])
# print(date)
if rows[index][3] == '100000':
startPrice = float(rows[index][7])
startPriceSet = True
elif rows[index][3] == '160000':
endPrice = float(rows[index][7])
endPriceSet = True
index = index + 1
if startPriceSet and endPriceSet:
print(date, startPrice, endPrice, startPrice - endPrice)
startPriceSet = False
endPriceSet = False
Why not leverage a pandas DataFrame for this calculation -
import pandas as pd
df = pd.read_csv("C:\\...\\US_200901_210907.csv")
# give appropriate column names before or after loading the data
# assuming we have the columns 'time', 'date' & 'stockvalue' in df
# might have to use pandas.to_datetime
print(df[(df['time']=='time1') && (df['date']=='date1')]['stockvalue']-df[(df['time']=='time2') && (df['date']=='date1')]['stockvalue'])
Also, why do you have an embedded for loop?
One of the approach with the sheet you have provided:
import pandas as pd
from collections import defaultdict
df = pd.read_excel("Data.xlsx", header=None, dtype='str')
out = defaultdict(lambda: defaultdict(float))
for rowindex, row in df.iterrows():
date = row[2]
name = row[0]
if row[3] == "100000":
out[name]['DATE'] = row[2]
out[name]['START'] = float(row[4])
if row[3] == "160000":
out[name]['END'] = float(row[4])
for stock, data in out.items():
print (stock+': DATE: '+data['DATE']+' START: '+data['START']+' END:'+data['END']+' diff = '+str(int(data['END']-data['START'])))

Creating a histogram with .txt file using Python?

Here's the code I tried. It gave me a syntax error highlighting 'data'. Any help? The .txt file has 4 columns if that's of any help.
def file():
file = open('hsp.txt', 'r')
col = [] data = file.readlines()
for i in range(1,len(data)-1):
col.append(int(float(data[i].split(',')[5])))
return col
def hist(col):
handspan = []
for i in range(11):
handspan.append(0)
for i in (col):
handspan[i] += 1
return handspan
col = file()
handspan = hist(col)
print(col)
print(handspan)
It is because your line
col = [] data = file.readlines()
should be on two separate lines:
col = []
data = file.readlines()
You can try this, it worked for me.
Hence it is a histogram it yields a dictionary.
Better answers are welcome!
import string
def list_from_file(filename):
myfile = open(filename, 'r')
data = myfile.read().split()
col = []
for word in data:
col.append(word)
return col
def myhist(col):
hist = {}
for word in col:
word = word.lower()
word = word.strip(string.punctuation + string.whitespace)
hist[word] = hist.get(word, 0)+1
return hist
col = list_from_file('em.txt')
colf = myhist(col)
print(colf)

how to align strings in python without creating a table

I have a problem. I'm trying to print a serie of lists in python to have it with a vertical align. My code is:
def show():
book = "data.txt"
f = open(book,'r')
line = f.readlines()
f.close()
x=0
z = ''
l = []
x = []
i = 0
starting = '{:>4} {:>15} {:>15}'.format('Name', "Gender", "Year")
print(starting)
for p in line:
p = p.replace(',',' ')
x = p.index(' ')
name = p[0:x]
a = p.index('e 1')
gender = p[x:a+1]
year = p[(a+2):]
if len(name) == 3:
line_new = '{:>2} {:>15} {:>15}'.format(name, gender, year)
else:
line_new = '{:>5} {:>15} {:>15}'.format(name, gender, year)
print(line_new)
The problem is that I'm trying to have something like:
I want to put all the names of the left (and I don't have problems) then, under Gender, I want to create an equal list of Genders all on the same vertical and same thing for year
Untested, but try this:
import itertools
with open("data.txt") as data:
pep = [line.strip().split(',') for line in data]
widths = [len(max(r, key=len)) for r in itertools.izip_longest(*pep, fillvalue="")]
print "%-{0}%s%-{1}%s%-{2}%s".format(widths[0], widths[1], widths[2])\
%("Name", "Gender", "Year")
print "\n".join(["%-{0}%s%-{1}%s%-{2}%s".format(widths[0], widths[1], widths[2])\
%(attr[0], attr[1], attr[2]) for attr in pep])

Categories