convert nested XML to CSV with Python

Could anybody help me with transforming XML to CSV?
I have a nested XML file with around 200 columns and I need to pick out some of them (id, email, date, etc.). With the code below I obtain a CSV file, but it only has 20 columns and it only takes the information from the first row.
data = s3.get_object(Bucket=source_bucket, Key=file_key)
contents = data['Body'].read()
tree = ET.fromstring(contents)
root = tree.findall(".")

# open a temp file for writing
csv_data = open('/tmp/converted.csv', 'w')
# create the csv writer object
csvwriter = csv.writer(csv_data)

csv_head = []
csv_body = []
count = 0
for i in range(len(root[0])):
    for x in root[0][i]:
        if count == 0:
            csv_head.append(x.tag)
        csv_body.append(x.text)
        print("***********")
        print(x.text)
    count = count + 1
    csv_body.append("\n")
csvwriter.writerow(csv_head)
csvwriter.writerow(csv_body)
csv_data.close()
I also tried the following code, but it does not work:
csv_head = ['order_id', 'date', 'email']
csv_body = []
for i in root:
    order_id = i.find('order_id')
    if order_id != None:
        order_id = order_id.text
    date = i.find('date')
    if date != None:
        date = date.text
    email = i.find('email')
    if email != None:
        email = email.text
    csv_body.append({'order_id': order_id, 'date': date, 'email': email})
    print("***********")
csvwriter.writerow(csv_head)
csvwriter.writerow(csv_body)
csv_data.close()
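For reference, a minimal sketch of how the selected fields could be written with csv.DictWriter is shown below; the element names ('order_id', 'date', 'email'), the per-record iteration over the root's children, and the input file path are assumptions for illustration, not details taken from the actual 200-column file:

import csv
import xml.etree.ElementTree as ET

contents = open('orders.xml', 'rb').read()      # or the S3 body from the question
root = ET.fromstring(contents)
wanted = ['order_id', 'date', 'email']          # columns to keep out of ~200

with open('/tmp/converted.csv', 'w', newline='') as f:
    writer = csv.DictWriter(f, fieldnames=wanted)
    writer.writeheader()
    for record in root:                         # one CSV row per child of the root
        row = {}
        for field in wanted:
            el = record.find('.//' + field)     # './/' also searches nested children
            row[field] = el.text if el is not None else ''
        writer.writerow(row)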

Related

Dictionary from csv file to group different names in a column

I am doing an assignment where I have a CSV file and I have to generate matplotlib graphs. I have a column full of genres for different movies, with the genres separated by | (vertical bar). I have to create a dictionary of these genres, each appearing once without repetition, so I can assign them to the appropriate movies. How would I go about doing that?
This is what I have so far:
import csv
from matplotlib import pyplot as plt

dp = open("C:\Users\jayan\OneDrive\Desktop\IMDB_movie_metadata_for_assignment_6.csv", 'r', encoding='utf8').read()
with open("C:\Users\jayan\OneDrive\Desktop\IMDB_movie_metadata_for_assignment_6.csv", errors='ignore') as csvfile:
    for line in csvfile:
        fields = line.split(",")
        newField = (fields[4]).split("|")
        newerField = fields[16].strip()
        movies = (fields[0])
        genre_dictionary = {tuple(newField): (movies)}
        print(genre_dictionary)
I will assume your CSV has two columns: genres and movies; tell me if that's not the case. You can do something like:
def find_col_ind(columns):
    ind = -1
    ind_col_genres = -1
    ind_col_movie = -1
    for col in columns:
        ind += 1
        if col == 'movie_title':
            ind_col_movie = ind
        elif col == 'genres':
            ind_col_genres = ind
    return ind_col_genres, ind_col_movie

def create_dict(filename):
    with open(filename, 'r') as csvfile:
        data = dict()
        is_first = True
        for line in csvfile:
            columns = line.split(',')
            if is_first:
                ind_col_genres, ind_col_movie = find_col_ind(columns)
                is_first = False
                continue
            genres = columns[ind_col_genres].split('|')
            movie = columns[ind_col_movie]
            for genre in genres:
                if genre in data:
                    data[genre].append(movie.strip('\nÂ\xa0 '))
                else:
                    data[genre] = [movie.strip('\nÂ\xa0 ')]
        return data

if __name__ == "__main__":
    data = create_dict('test.csv')
    print(data)
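As a side note, a shorter sketch of the same grouping is possible with csv.DictReader and collections.defaultdict, assuming the header really contains movie_title and genres columns as used above:

import csv
from collections import defaultdict

def create_genre_dict(filename):
    data = defaultdict(list)
    with open(filename, newline='', encoding='utf8', errors='ignore') as csvfile:
        for row in csv.DictReader(csvfile):
            movie = row['movie_title'].strip('\nÂ\xa0 ')
            for genre in row['genres'].split('|'):
                data[genre].append(movie)
    return dict(data)

print(create_genre_dict('test.csv'))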

How to pick values for specific times in a date (large list of date, time, value)

I have a file with these columns: date, times, and value of a stock. Basically, per-minute value of stocks. I would like to calculate the difference in the value of a stock at 10 AM and 4 PM. This is the code I have so far:
fileName = "C:\\...\\US_200901_210907.csv"
with open(fileName) as f:
    for line in f.readlines()[1:]:
        split = line.split(";")
        time = split[3]
        date = split[2]
        for timev in f.readlines()[1:]:
            if timev == '100000':
                Spot = float(split[2])
            elif timev == '160000':
                Close = float(split[2])
Diff = Spot - Close
print(Diff)
I am not sure if I am doing this right, but the code needs to loop through each date first, find the value of the stock at '100000' and '160000', calculate the difference between the two, and then move on to the next day. At the end of all days, it should print the difference for each day.
The Diff = Spot - Close line also gives me an error: "NameError: name 'Spot' is not defined".
Any help is appreciated.
Dataset looks like this (extract): (screenshot in the original post, not reproduced here)
====================
After working more on this on my own, I was able to get this to work:
import csv

filename = "C:\\...\\US_200901_210907.csv"
with open(filename, 'r') as f:
    reader = csv.reader(f, delimiter=';')
    next(reader, None)  # skip header
    rows = list(reader)

listOfDates = []
index = 0
for row in rows:
    if rows[index][2] not in listOfDates:
        listOfDates.append(rows[index][2])
    index = index + 1
print(listOfDates)

startPrice = 0
endPrice = 0
index = 0
startPriceSet = False
endPriceSet = False
for date in listOfDates:
    for row in rows:
        if rows[index][2] == date:
            # print(rows[index][2])
            # print(date)
            if rows[index][3] == '100000':
                startPrice = float(rows[index][7])
                startPriceSet = True
            elif rows[index][3] == '160000':
                endPrice = float(rows[index][7])
                endPriceSet = True
            index = index + 1
    if startPriceSet and endPriceSet:
        print(date, startPrice, endPrice, startPrice - endPrice)
        startPriceSet = False
        endPriceSet = False
Why not leverage a pandas DataFrame for this calculation?
import pandas as pd

df = pd.read_csv("C:\\...\\US_200901_210907.csv")
# give appropriate column names before or after loading the data
# assuming we have the columns 'time', 'date' & 'stockvalue' in df
# might have to use pandas.to_datetime
print(df[(df['time'] == 'time1') & (df['date'] == 'date1')]['stockvalue']
      - df[(df['time'] == 'time2') & (df['date'] == 'date1')]['stockvalue'])
Also, why do you have a nested for loop?
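For instance, here is a rough sketch of the per-day 10 AM vs 4 PM difference using a pivot, assuming the file is semicolon-separated and (matching the indices used in the code above) column 2 holds the date, column 3 the time and column 7 the price:

import pandas as pd

df = pd.read_csv("US_200901_210907.csv", sep=';', header=0, dtype=str)
df.columns = [f'c{i}' for i in range(len(df.columns))]   # positional column names

# keep only the 10:00 and 16:00 rows, then put them side by side per date
subset = df[df['c3'].isin(['100000', '160000'])].copy()
subset['c7'] = subset['c7'].astype(float)
pivot = subset.pivot_table(index='c2', columns='c3', values='c7', aggfunc='first')

# difference between the 10 AM and 4 PM values for each date
print((pivot['100000'] - pivot['160000']).dropna())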
One approach, using the sheet you have provided:
import pandas as pd
from collections import defaultdict

df = pd.read_excel("Data.xlsx", header=None, dtype='str')
out = defaultdict(lambda: defaultdict(float))
for rowindex, row in df.iterrows():
    date = row[2]
    name = row[0]
    if row[3] == "100000":
        out[name]['DATE'] = row[2]
        out[name]['START'] = float(row[4])
    if row[3] == "160000":
        out[name]['END'] = float(row[4])

for stock, data in out.items():
    print(stock + ': DATE: ' + data['DATE'] + ' START: ' + str(data['START'])
          + ' END: ' + str(data['END']) + ' diff = ' + str(int(data['END'] - data['START'])))

Remove row from the CSV file if condition met

I am trying to scrape pickles.com.au.
I am trying to update the pickels_dataset.csv file: if the link is the same but the price is not the same, I remove the old row and insert the new row into the CSV file. However, it doesn't remove the old entry from the CSV file.
What would be the best way to remove and update the row in the CSV file?
Below is my code...
import requests
from scrapy.selector import Selector
import csv
import re
from tqdm import tqdm
from time import sleep

with open('pickels_dataset.csv', 'a+', newline='', encoding='utf-8') as auction_csv_file:
    auction_csv_writer = csv.writer(auction_csv_file)
    live_auctions_api = 'https://www.pickles.com.au/PWR-Web/services/api/sales/future'
    api_request = requests.get(url=live_auctions_api)
    for auctions in api_request.json():
        auction_link = auctions.get('viewSaleListingLink')
        if 'cars/item/search/-/listing/listSaleItems/' in auction_link:
            auction_request = requests.get(url=auction_link)
            response = Selector(text=auction_request.text)
            sales_id_re = response.xpath('//script[contains(text(), "Product_Type_Sequence")]/text() | //script[contains(text(), "lot_number_suffix_sequence")]/text()').get()
            sales_id = re.findall(r'"Product_Type_Sequence";var n="(.*?)"', sales_id_re) or re.findall(r'"lot_number_suffix_sequence";var n="(.*?)"', sales_id_re)
            if sales_id == []:
                continue
            auction_sale_link = f'https://www.pickles.com.au/v4/caradvert/saleid-{sales_id[0]}-public?count=true&inav=Car%7Cbc%7Cha%7Cu&q=(And.ProductType.Vehicles._.Year.range(2010..2021).)&sr=%7Clot_number_suffix_sequence%7C0%7C30'
            auction_sale_link_requests = requests.get(url=auction_sale_link)
            auctions_data = auction_sale_link_requests.json().get('SearchResults')
            if auctions_data == []:
                print("NO RESULTS")
            for auction_data in auctions_data:
                if int(auction_data.get('MinimumBid')) > 0:
                    ids = auction_data.get('TargetId')
                    main_title = auction_data.get('Title')
                    short_title = str(auction_data.get('Year')) + ' ' + str(auction_data.get('Make')) + ' ' + str(auction_data.get('Model'))
                    make = auction_data.get('Make')
                    model = auction_data.get('Model')
                    variant = auction_data.get('Series')
                    transmission = auction_data.get('Transmission')
                    odometer = auction_data.get('Odometer')
                    state = auction_data.get('Location').get('State')
                    sale_price = auction_data.get('MinimumBid')
                    link_path = main_title.replace(' ', '-').replace('/', '-').replace(',', '-') + '/' + str(ids)
                    link = f'https://www.pickles.com.au/cars/item/-/details/{link_path}'
                    sale_date = auction_data.get('SaleEndString')
                    auction_values = [
                        main_title, short_title, make,
                        model, variant, transmission, odometer,
                        state, "${:,.2f}".format(sale_price).strip(),
                        link, sale_date
                    ]
                    with open('pickels_dataset.csv', 'r+') as csv_read:
                        auction_reader = list(csv.reader(csv_read))
                        for each in auction_reader:
                            if link in each:
                                each_link, each_price = each[9], each[0]
                                if (link == each_link) and (sale_price != each_price):
                                    auction_reader.clear()
                                    print('New list found, old list deleted')
                                    auction_csv_writer.writerow(auction_values)
                                    print('New value added')
                                    continue
                                elif (link == each[9]) and (sale_price == each[0]):
                                    print('Same result already exist in the file')
                                    continue
                            else:
                                auction_csv_writer.writerow(auction_values)
                                print('Unique result found and added.')
                                break
Your current script is opening your auction CSV file for appending, and then whilst it is still open, attempting to open it again for reading. This is probably why it is not updating as expected.
A better approach would be to first read the entire contents of your existing saved auction file into a dictionary. The key could be the link which would then make it easy to determine if you have already seen an existing auction.
Next scrape the current auctions and update the saved_auctions dictionary as needed.
Finally at the end, write the contents of saved_auctions back to the CSV file.
For example:
import requests
from scrapy.selector import Selector
import csv
import re

auction_filename = 'pickels_dataset.csv'

# Load existing auctions into a dictionary with link as key
saved_auctions = {}
with open(auction_filename, newline='', encoding='utf-8') as f_auction_file:
    for row in csv.reader(f_auction_file):
        saved_auctions[row[9]] = row  # dictionary key is link

live_auctions_api = 'https://www.pickles.com.au/PWR-Web/services/api/sales/future'
api_request = requests.get(url=live_auctions_api)
for auctions in api_request.json():
    auction_link = auctions.get('viewSaleListingLink')
    if 'cars/item/search/-/listing/listSaleItems/' in auction_link:
        auction_request = requests.get(url=auction_link)
        response = Selector(text=auction_request.text)
        sales_id_re = response.xpath('//script[contains(text(), "Product_Type_Sequence")]/text() | //script[contains(text(), "lot_number_suffix_sequence")]/text()').get()
        sales_id = re.findall(r'"Product_Type_Sequence";var n="(.*?)"', sales_id_re) or re.findall(r'"lot_number_suffix_sequence";var n="(.*?)"', sales_id_re)
        if sales_id == []:
            continue
        auction_sale_link = f'https://www.pickles.com.au/v4/caradvert/saleid-{sales_id[0]}-public?count=true&inav=Car%7Cbc%7Cha%7Cu&q=(And.ProductType.Vehicles._.Year.range(2010..2021).)&sr=%7Clot_number_suffix_sequence%7C0%7C30'
        auction_sale_link_requests = requests.get(url=auction_sale_link)
        auctions_data = auction_sale_link_requests.json().get('SearchResults')
        if auctions_data == []:
            print("NO RESULTS")
        for auction_data in auctions_data:
            if int(auction_data.get('MinimumBid')) > 0:
                ids = auction_data.get('TargetId')
                main_title = auction_data.get('Title')
                short_title = str(auction_data.get('Year')) + ' ' + str(auction_data.get('Make')) + ' ' + str(auction_data.get('Model'))
                make = auction_data.get('Make')
                model = auction_data.get('Model')
                variant = auction_data.get('Series')
                transmission = auction_data.get('Transmission')
                odometer = auction_data.get('Odometer')
                state = auction_data.get('Location').get('State')
                minimum_bid = auction_data.get('MinimumBid')
                sale_price = "${:,.2f}".format(minimum_bid).strip()
                link_path = main_title.replace(' ', '-').replace('/', '-').replace(',', '-') + '/' + str(ids)
                link = f'https://www.pickles.com.au/cars/item/-/details/{link_path}'
                sale_date = auction_data.get('SaleEndString')
                auction_values = [
                    main_title, short_title, make,
                    model, variant, transmission, odometer,
                    state, sale_price,
                    link, sale_date
                ]
                if link in saved_auctions:
                    if saved_auctions[link][8] == sale_price:
                        print('Same result already exists in the file')
                    else:
                        print('New value updated')
                        saved_auctions[link] = auction_values  # Update the entry
                else:
                    print('New auction added')
                    saved_auctions[link] = auction_values

# Update the saved auction file
with open(auction_filename, 'w', newline='', encoding='utf-8') as f_auction_file:
    csv_auction_file = csv.writer(f_auction_file)
    csv_auction_file.writerows(saved_auctions.values())
If you want to also remove auctions that are no longer active, then it would probably be best to simply ignore the saved file and just write all current entries as is.
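If you go that route, one minimal sketch (reusing the variable names from the example above) is to collect only the rows built during the current run into a separate dictionary and overwrite the file with just those:

# rows seen during this run only; anything no longer listed simply drops out
current_auctions = {}

# inside the `for auction_data in auctions_data:` loop, after building
# `auction_values`, store the row keyed by its link:
#     current_auctions[link] = auction_values

with open(auction_filename, 'w', newline='', encoding='utf-8') as f_auction_file:
    csv.writer(f_auction_file).writerows(current_auctions.values())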

JSONDecodeError: Expecting value: line 1 column 1 (char 0) when using Pushshift API to scrape Reddit Data

import pandas as pd
import requests
import json
import datetime
import csv

def get_pushshift_data(after, before, sub):
    url = 'https://api.pushshift.io/reddit/search/submission/?&after=' + str(after) + '&before=' + str(before) + '&subreddit=' + str(sub) + '&sort=asc&sort_type=created_utc&size=400'
    print(url)
    r = requests.get(url).json()
    # data = json.loads(r.text, strict=False)
    return r['data']

def collect_subData(subm):
    subData = list()  # list to store data points
    title = subm['title']
    url = subm['url']
    try:
        flair = subm['link_flair_text']
    except KeyError:
        flair = "NaN"
    try:
        # returns the body of the posts
        body = subm['selftext']
    except KeyError:
        body = ''
    author = subm['author']
    subId = subm['id']
    score = subm['score']
    created = datetime.datetime.fromtimestamp(subm['created_utc'])  # 1520561700.0
    numComms = subm['num_comments']
    permalink = subm['permalink']
    subData.append((subId, title, body, url, author, score, created, numComms, permalink, flair))
    subStats[subId] = subData

def update_subFile():
    upload_count = 0
    location = "subreddit_data_uncleaned/"
    print("Input filename of submission file, please add .csv")
    filename = input()
    file = location + filename
    with open(file, 'w', newline='', encoding='utf-8') as file:
        a = csv.writer(file, delimiter=',')
        headers = ["Post ID", "Title", "Body", "Url", "Author", "Score", "Publish Date", "Total No. of Comments", "Permalink", "Flair"]
        a.writerow(headers)
        for sub in subStats:
            a.writerow(subStats[sub][0])
            upload_count += 1
        print(str(upload_count) + " submissions have been uploaded into a csv file")

# global dictionary to hold 'subData'
subStats = {}
# tracks no. of submissions
subCount = 0
# Subreddit to query
sub = 'politics'
# Unix timestamp of date to crawl from.
before = int(datetime.datetime(2021,5,17,0,0).timestamp())
after = int(datetime.datetime(2014,1,1,0,0).timestamp())

data = get_pushshift_data(after, before, sub)
while len(data) > 0:
    for submission in data:
        collect_subData(submission)
        subCount += 1
    # Calls getPushshiftData() with the created date of the last submission
    print(len(data))
    print(str(datetime.datetime.fromtimestamp(data[-1]['created_utc'])))
    after = data[-1]['created_utc']
    data = get_pushshift_data(after, before, sub)

print(len(data))
update_subFile()
The first time I call the get_pushshift_data(after, before, sub) function to scrape the data there is no error. But when I do the same thing again with a different value for the after variable (type: int), the program raises JSONDecodeError: Expecting value: line 1 column 1 (char 0).
(Screenshots illustrating the call and the error traceback were attached in the original post.)
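That traceback generally just means the response body was not valid JSON, for example an empty body or an HTML error page when the API rate-limits the request; that is an assumption here, not something visible in the post. A hedged sketch of a more defensive get_pushshift_data, with a status check and simple retries, might look like:

import time
import requests

def get_pushshift_data(after, before, sub, retries=5):
    url = ('https://api.pushshift.io/reddit/search/submission/'
           '?&after=' + str(after) + '&before=' + str(before) +
           '&subreddit=' + str(sub) + '&sort=asc&sort_type=created_utc&size=400')
    for attempt in range(retries):
        r = requests.get(url)
        if r.status_code == 200:
            try:
                return r.json()['data']
            except ValueError:          # body was not JSON after all
                pass
        time.sleep(2 ** attempt)        # back off and try again
    return []                           # give up; the caller's while-loop will stop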

Multi-threading list iterating for loop

This function reads from a text file, re-formats the contents, and then writes the contents to a CSV. I'm trying to use threading to multi-thread the "for i in lines" loop; this is the longest part of a larger script and takes up most of the run time, because the list lines contains thousands of elements. Can someone help me straighten this out? Doing this synchronously instead of in parallel takes a lot of time. I have seen many other answers to similar questions, but I have yet to understand them and implement them correctly.
def sheets(i):
    # time format for spreadsheet
    dt_time = datetime.now().strftime('%m/%d|%H:%M')
    # for league name (NFL, NBA, NHL etc.) in list containing league names
    for league_name in leagues2:
        league_name = league_name.split('|')[0]
        with open(final_stats_path, 'r+') as lines:
            lines = lines.readlines()
        # i = one long string containing details about the event in the loop, eg. sport, game day, game id, home team name
        for i in lines:
            i = i.split(',')
            minprice = i[6]
            totaltix = i[5]
            event_date = i[2]
            try:
                dayofweek = datetime.strptime(event_date, '%Y-%m-%d').strftime('%A')
            except:
                continue
            event_date = i[2][2:]
            event_date = str(event_date).split('-')
            event_date = event_date[1] + '/' + event_date[2]
            sport = i[4]
            event = i[1].replace('Basketball', '').replace('\n', '')
            away = i[8].replace('Basketball', '').replace('\n', '')
            eventid = i[0]
            event_home = i[9].replace('Basketball', '').replace('\n', '')
            event = event.split(' at ')[0]
            tixdata = str(totaltix)
            eventid = 'https://pro.stubhub.com/simweb/sim/services/priceanalysis?eventId=' + str(eventid) + '&sectionId=0'
            directory = root + '\data' + '\\' + sport + '\\'
            report = directory + 'report.xlsx'
            fname = directory + 'teams.txt'
            eventleague = sport
            f = open(directory + 'acronym.txt', 'r+')
            lines_2 = f.readlines()
            for qt in lines_2:
                qt = qt.split('-')
                compare = qt[1]
                if event_home in compare:
                    event_home = qt[0]
                else:
                    pass
            troop = []
            d = {
                'ID': eventid,
                'Date': event_date,
                'Day': dayofweek,
                'Away': away,
            }
            s = {
                'time': tixdata
            }
            numbers = event_home + '.txt'
            numbers_new = 'bk\\bk_' + numbers
            with open(directory + numbers_new, 'a+') as y:
                pass
            with open(directory + numbers, 'a+') as o:
                pass
            with open(directory + numbers, 'r+') as g:
                for row in g:
                    if str(eventid) in row:
                        # print('the event is in the list')
                        row_update = row.replace('}', ", '" + dt_time + "': '" + tixdata + "'}")
                        with open(directory + numbers_new, 'a+') as y:
                            y.write(row_update)
                        break
                else:
                    with open(directory + numbers, 'a+') as p:
                        # print('the event is not in the list')
                        p.write(str(d) + '\n')
                    with open(directory + numbers_new, 'a+') as n:
                        n.write(str(d) + '\n')
            sizefile = os.path.getsize(directory + numbers_new)
            if sizefile > 0:
                shutil.copy(directory + numbers_new, directory + numbers)
                open(directory + numbers_new, 'w').close()
            else:
                pass
            df = []
            with open(directory + numbers, 'r+') as t:
                for row in t:
                    b = eval(row)
                    dfs = df.append(b)
            df = pd.DataFrame(df)
            yark = list(df.columns)[:-5]
            zed = ['ID', 'Date', 'Day', 'Away']
            columns = zed + yark
            try:
                df = df[columns]
            except:
                pass
            df.index = range(1, 2*len(df)+1, 2)
            df = df.reindex(index=range(2*len(df)))
            writer = pd.ExcelWriter(directory + event_home + '.xlsx', engine='xlsxwriter')
            try:
                df.to_excel(writer, sheet_name=event_home)
            except:
                continue
            workbook = writer.book
            worksheet = writer.sheets[event_home]
            format1 = workbook.add_format({'num_format': '#,##0.00'})
            worksheet.set_column('A:ZZ', 18, format1)
            writer.save()

if __name__ == "__main__":
    pool = ThreadPool(8)  # Make the Pool of workers
    results = pool.map(sheets)  # Open the urls in their own threads
    pool.close()  # close the pool and wait for the work to finish
    pool.join()
    ## get_numbers()
    ## stats_to_csv()
    ## stats_to_html()
    # sheets()
Try changing the following line:
results = pool.map(sheets)
to:
results = pool.map(sheets,range(8))
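As a general illustration of the map pattern (a sketch, not a drop-in fix for this script), the usual way to parallelise a per-line loop is to move the loop body into a function that takes one line and hand the whole list to the pool; the worker body and the input filename below are placeholders:

from multiprocessing.pool import ThreadPool

def process_line(line):
    # placeholder for the per-line work done inside the "for i in lines:" loop
    fields = line.split(',')
    return fields[0]

if __name__ == "__main__":
    with open('final_stats.txt') as f:     # hypothetical input file
        lines = f.readlines()
    with ThreadPool(8) as pool:
        results = pool.map(process_line, lines)   # one call per line, 8 at a time
    print(len(results))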
