Python lists from csv file - python

I am working on an assignment for plotting graphs from a csv file with movies and they have different categories to them like genres, budget, director's name, etc. We are asked to put the genres (key) and movies (values) in a dictionary and then put the other categories into a list or a tuple to match the movies. I have already created the dictionary, by genres that match movies. How would I go about creating lists of the other categories and match them to the movies to graph?
This is what I have so far:
def find_col_ind(columns):
ind = -1
ind_col_genres = -1
ind_col_movie = -1
for col in columns:
ind += 1
if col == 'movie_title':
ind_col_movie = ind
elif col == 'genres':
ind_col_genres = ind
return ind_col_genres, ind_col_movie
def create_dict(filename):
with open('C:\\Users\\jayan\\OneDrive\\Desktop\\IMDB_movie_metadata_for_assignment_6.csv', 'r',errors='ignore') as csvfile:
data = dict()
is_first = True
for line in csvfile:
columns = line.split(',')
if is_first:
ind_col_genres, ind_col_movie = find_col_ind(columns)
is_first = False
continue
genres = columns[ind_col_genres].split('|')
movie = columns[ind_col_movie]
for genre in genres:
if genre in data:
data[genre].append(movie.strip('\nÂ\xa0 '))
else:
data[genre] = [movie.strip('\nÂ\xa0 ')]
return data
if __name__ == "__main__":
data = create_dict("C:\\Users\\jayan\\OneDrive\\Desktop\\IMDB_movie_metadata_for_assignment_6.csv")
print(data)

Related

convert nested XML to CSV with Python

¿Could anybody help me with the xml transformation to csv?
I have a nested xml file with around 200 columns and I need to choose some of them (id, email, date etc.) With the code below I obtain a csv file but only with 20 columns and it takes only the information from the first row.
data = s3.get_object(Bucket=source_bucket, Key=file_key)
contents = data['Body'].read()
tree = ET.fromstring(contents)
root = tree.findall(".")
# open a temp file for writing
csv_data = open('/tmp/converted.csv', 'w')
# create the csv writer object
csvwriter = csv.writer(csv_data)
csv_head = []
csv_body = []
count = 0
for i in range(len(root[0])):
for x in root[0][i]:
if count == 0:
csv_head.append(x.tag)
csv_body.append(x.text)
print("***********")
print(x.text)
count = count + 1
csv_body.append("\n")
csvwriter.writerow(csv_head)
csvwriter.writerow(csv_body)
csv_data.close()
I also tried the next code, but it does not work:
csv_head = ['order_id', 'date', 'email']
csv_body = []
for i in root:
order_id = i.find('order_id')
if order_id != None:
order_id = order_id.text
date = i.find('date')
if date != None:
date = date.text
email = i.find('email')
if email != None:
email = email.text
csv_body.append({'order_id': order_id, 'date': date, 'email': email})
print("***********")
csvwriter.writerow(csv_head)
csvwriter.writerow(csv_body)
csv_data.close()

Dictionary from csv file to group different names in a column

I am doing an assignment where I have a csvfile and I have to generate matplot lib graphs. I have a column full of genres of different movies, with the genres seperated by | (vertical bar). I have to create a dictionary of these genres, once, without repeating, to assign them to the appropriate movies. How will I go about doing that?
this is what I have so far:
import csv
from matplotlib import pyplot as plt
dp = open("C:\Users\jayan\OneDrive\Desktop\IMDB_movie_metadata_for_assignment_6.csv",'r', encoding='utf8').read()
with open("C:\Users\jayan\OneDrive\Desktop\IMDB_movie_metadata_for_assignment_6.csv", errors = 'ignore') as csvfile:
for line in csvfile:
fields = line.split(",")
newField = (fields[4]).split("|")
newerField = fields[16].strip()
movies = (fields[0])
genre_dictionary = {tuple(newField):(movies)}
print(genre_dictionary)
I will suppose your csv has two columns: genres and movies. Tell me if it's not the case. You can do something like:
def find_col_ind(columns):
ind = -1
ind_col_genres = -1
ind_col_movie = -1
for col in columns:
ind += 1
if col == 'movie_title':
ind_col_movie = ind
elif col == 'genres':
ind_col_genres = ind
return ind_col_genres, ind_col_movie
def create_dict(filename):
with open(filename, 'r') as csvfile:
data = dict()
is_first = True
for line in csvfile:
columns = line.split(',')
if is_first:
ind_col_genres, ind_col_movie = find_col_ind(columns)
is_first = False
continue
genres = columns[ind_col_genres].split('|')
movie = columns[ind_col_movie]
for genre in genres:
if genre in data:
data[genre].append(movie.strip('\nÂ\xa0 '))
else:
data[genre] = [movie.strip('\nÂ\xa0 ')]
return data
if __name__ == "__main__":
data = create_dict('test.csv')
print(data)

How to pick values for specific times in a date (large list of date, time, value)

I have a file with these columns: date, times, and value of a stock. Basically, per-minute value of stocks. I would like to calculate the difference in the value of a stock at 10 AM and 4 PM. This is the code I have so far:
fileName = "C:\\...\\US_200901_210907.csv"
with open(fileName) as f:
for line in f.readlines()[1:]:
split = line.split(";")
time = split[3]
date = split[2]
for timev in f.readlines()[1:]:
if timev == '100000':
Spot = float(split[2])
elif timev == '160000':
Close = float(split[2])
Diff = Spot - Close
print(Diff)
I am not sure if I am doing this right. But the code needs to cycle/loop through each date first, find the value of the stock at '100000' and '160000' and then calculate the difference between the two. Then move to the next day. And at the end of all days, print the differences for each day.
The "Diff = Spot - Close" line also gives me an error, says "NameError: name 'Spot' is not defined"
Any help is appreciated.
Dataset looks like this (extract):
====================
After working more on this on my own, I was able to get this to work:
import csv
filename = "C:\\...\\US_200901_210907.csv"
with open(filename, 'r') as f:
reader = csv.reader(f, delimiter=';')
next(reader, None) # skip header
rows = list(reader)
listOfDates = []
index = 0
for row in rows:
if rows[index][2] not in listOfDates:
listOfDates.append(rows[index][2])
index = index + 1
print(listOfDates)
startPrice = 0
endPrice = 0
index = 0
startPriceSet = False
endPriceSet = False
for date in listOfDates:
for row in rows:
if rows[index][2] == date:
# print(rows[index][2])
# print(date)
if rows[index][3] == '100000':
startPrice = float(rows[index][7])
startPriceSet = True
elif rows[index][3] == '160000':
endPrice = float(rows[index][7])
endPriceSet = True
index = index + 1
if startPriceSet and endPriceSet:
print(date, startPrice, endPrice, startPrice - endPrice)
startPriceSet = False
endPriceSet = False
Why not leverage a pandas DataFrame for this calculation -
import pandas as pd
df = pd.read_csv("C:\\...\\US_200901_210907.csv")
# give appropriate column names before or after loading the data
# assuming we have the columns 'time', 'date' & 'stockvalue' in df
# might have to use pandas.to_datetime
print(df[(df['time']=='time1') && (df['date']=='date1')]['stockvalue']-df[(df['time']=='time2') && (df['date']=='date1')]['stockvalue'])
Also, why do you have an embedded for loop?
One of the approach with the sheet you have provided:
import pandas as pd
from collections import defaultdict
df = pd.read_excel("Data.xlsx", header=None, dtype='str')
out = defaultdict(lambda: defaultdict(float))
for rowindex, row in df.iterrows():
date = row[2]
name = row[0]
if row[3] == "100000":
out[name]['DATE'] = row[2]
out[name]['START'] = float(row[4])
if row[3] == "160000":
out[name]['END'] = float(row[4])
for stock, data in out.items():
print (stock+': DATE: '+data['DATE']+' START: '+data['START']+' END:'+data['END']+' diff = '+str(int(data['END']-data['START'])))

Having trouble reading CSV files with rows of different column count

I am having trouble reading through a file where the rows have different lengths. Specifically, I know that the file is 13 rows long and that rows 1 and 13 have 2 values in them where the rest (2-12) have 4. I want to get one value from row 1 and one value from row 13, and one value from each of rows 2-12 depending on whether or not their preceding value is equal to "credit" or "debit". Since the rows have different lengths I get 'index out of range' errors. Any help would be greatly appreciated. Thanks!
class Checkbook:
"""Checkbook class for list of check transactions"""
def __init__(self, filename):
"""initializer for Checkbook class"""
self.name = filename
self.debitList = []
self.creditList = []
self.startAmt = 0
self.endAmt = 0
self.shouldBeBal = 0
with open(filename) as csvFile:
readCSV = csv.reader(csvFile, delimiter = ',')
#rowCount = sum(1 for row in readCSV) - 1
#print(rowCount)
next(csvFile)
#in range(1, rowCount, 1):
for row in readCSV:
if (row[2] == " debit"):
debitAmt = row[3]
self.debitList.append(debitAmt)
elif (row[2] == " credit"):
creditAmt = row[3]
self.creditList.append(creditAmt)
Well, you have to either avoid the IndexError
for row in readCSV:
if len(row) > 2: # make sure the row is long enough
if (row[2] == " debit"): # now this can't fail
# ...
elif (row[2] == " credit"):
# ...
or handle it:
for row in readCSV:
try:
if (row[2] == " debit"):
# ...
elif (row[2] == " credit"):
# ...
except IndexError:
pass # do nothing

how to align strings in python without creating a table

I have a problem. I'm trying to print a serie of lists in python to have it with a vertical align. My code is:
def show():
book = "data.txt"
f = open(book,'r')
line = f.readlines()
f.close()
x=0
z = ''
l = []
x = []
i = 0
starting = '{:>4} {:>15} {:>15}'.format('Name', "Gender", "Year")
print(starting)
for p in line:
p = p.replace(',',' ')
x = p.index(' ')
name = p[0:x]
a = p.index('e 1')
gender = p[x:a+1]
year = p[(a+2):]
if len(name) == 3:
line_new = '{:>2} {:>15} {:>15}'.format(name, gender, year)
else:
line_new = '{:>5} {:>15} {:>15}'.format(name, gender, year)
print(line_new)
The problem is that I'm trying to have something like:
I want to put all the names of the left (and I don't have problems) then, under Gender, I want to create an equal list of Genders all on the same vertical and same thing for year
Untested, but try this:
import itertools
with open("data.txt") as data:
pep = [line.strip().split(',') for line in data]
widths = [len(max(r, key=len)) for r in itertools.izip_longest(*pep, fillvalue="")]
print "%-{0}%s%-{1}%s%-{2}%s".format(widths[0], widths[1], widths[2])\
%("Name", "Gender", "Year")
print "\n".join(["%-{0}%s%-{1}%s%-{2}%s".format(widths[0], widths[1], widths[2])\
%(attr[0], attr[1], attr[2]) for attr in pep])

Categories