I would like to extract all the data of the row named "Nb B" on this page: https://www.coteur.com/cotes-foot.php
Here is my Python script:
#!/usr/bin/python3
# -*- coding: utf-8 -*-
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
options = Options()
options.headless = True
driver = webdriver.Chrome(options=options)
driver.get('https://www.coteur.com/cotes-foot.php')
#Store url associated with the soccer games
url_links = []
for i in driver.find_elements_by_xpath('//a[contains(@href, "match/cotes-")]'):
    url_links.append(i.get_attribute('href'))
print(len(url_links), '\n')
nb_bookies = []
for i in driver.find_elements_by_xpath('//td[contains(@class, " odds")][contains(@style, "")]'):
    nb_bookies.append(i.text)
print(nb_bookies)
And here is the output:
25
['1.80', '3.55', '4.70', '95%', '', '1.40', '4.60', '8.00', '94.33%', '', '2.35', '3.42', '2.63', '90.18%', '', '3.20', '3.60', '2.05', '92.19%', '', '7.00', '4.80', '1.35', '90.81%', '', '5.30', '4.30', '1.70', '99.05%', '', '2.15', '3.55', '3.65', '97.92%', '', '2.90', '3.20', '2.20', '88.81%', '', '3.95', '3.40', '2.10', '97.65%', '', '2.00', '3.80', '3.90', '98.04%', '', '2.40', '3.05', '3.50', '96.98%', '', '3.70', '3.20', '2.00', '91.72%', '', '2.75', '2.52', '3.05', '91.17%', '', '4.20', '3.05', '1.69', '84.23%', '', '1.22', '5.10', '10.00', '88.42%', '', '1.54', '4.60', '5.10', '93.72%', '', '3.00', '3.10', '2.45', '93.59%', '', '2.40', '3.50', '2.55', '90.55%', '', '1.76', '3.50', '4.20', '90.8%', '', '11.50', '5.30', '1.36', '98.91%', '', '3.00', '3.50', '2.20', '92.64%', '', '1.72', '3.42', '5.00', '92.62%', '', '1.08', '9.25', '19.00', '91.33%', '', '9.75', '5.75', '1.36', '98.82%', '', '5.70', '4.50', '1.63', '98.88%', '']
All the data in the table is extracted, and you can see '' for the last row, whereas I just want that last row.
To get the data from the last column only, fix your XPath so it selects only the last td of each data row:
nb_bookies = []
for i in driver.find_elements_by_xpath('//tr[@id and @role="row"]/td[last()]'):
    nb_bookies.append(i.text)
Output:
['12', '12', '1', '9', '11', '12', '12', '12', '12', '12', '11', '2', '11', '11', '9', '12', '11', '12', '12', '12', '12', '12', '10', '5', '12']
Your code is perfectly fine; the problem has to do with the window size spawned in headless mode. The default window size in headless mode is 800x600 on all platforms.
The developers of the site have set the header to appear only when the window is wider than 1030px; only then is the display: none; removed from the DOM. You can test this for yourself by shrinking and expanding the window.
Keep in mind that if an element carries style="display: none;", the element is hidden and Selenium won't be able to interact with it: if a user can't see it, neither can Selenium.
Simply adding this line to enlarge your window in headless mode will solve your problem:
options.add_argument("window-size=1400,800")
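For reference, a minimal sketch of the complete headless setup with the enlarged window (the rest of the script stays exactly as you wrote it):

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

options = Options()
options.headless = True
# the default headless window is 800x600; widen it past 1030px so the hidden column is rendered
options.add_argument("window-size=1400,800")
driver = webdriver.Chrome(options=options)
driver.get('https://www.coteur.com/cotes-foot.php')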
I am trying to grab some stock data from a website. The German website onvista.de has all the information I need. I tried to get the stock data into a pandas dataframe, like this:
import pandas as pd

url = 'https://www.onvista.de/aktien/fundamental/ADLER-REAL-ESTATE-AG-Aktie-DE0005008007'
onvista_table = pd.read_html(url)
This works fine for other websites. But the onvista site has a nested 'span' element in the th elements, which carries text. How do I get rid of the span element in the th element, so I get a proper dataframe without that text?
So I tried it with BeautifulSoup to get rid of the 'span' element:
import requests
from bs4 import BeautifulSoup

url = 'https://www.onvista.de/aktien/fundamental/ADLER-REAL-ESTATE-AG-Aktie-DE0005008007'
r = requests.get(url)
soup = BeautifulSoup(r.text, 'html.parser')
onvista_table = soup
clean_data = []
for i in range(0, len(onvista_table.find_all('table'))):
    table = onvista_table.find_all('table')[i]
    # remove the tooltip spans nested inside the th elements
    for tool_tip_span in table.find_all('span', {"class": "INFO_LAYER_CONTAINER"}):
        tool_tip_span.decompose()
    rows = table.find_all('tr')
    for row in rows:
        raw_data = []
        for cell in row.find_all(['td', 'th']):
            raw_data.append(cell.get_text().strip())
        if len(raw_data) < 9:
            print(raw_data)
The result looks like this:
['Gewinn', '2020e', '2019e', '2018e', '2017', '2016', '2015', '2014']
['Gewinn pro Aktie in EUR', '-', '1,20', '0,89', '1,91', '2,11', '1,83', '4,65']
['KGV', '-', '12,52', '16,79', '6,95', '6,24', '7,06', '1,45']
['Gewinnwachstum', '-', '+45,18%', '-60,00%', '-9,47%', '+15,30%', '-60,64%', '+80,93%']
['PEG', '-', '-', '0,49', '-0,13', '-0,65', '0,46', '-0,02']
['Dividende', '2020e', '2019e', '2018e', '2017', '2016', '2015', '2014']
['Dividende (netto) in EUR', '-', '0,00', '0,00', '0,00', '0,00', '0,00', '0,00']
['Dividendenrendite', '-', '0,00', '0,05', '0,00', '0,00', '0,00', '0,00']
['Cash-Flow', '2020e', '2019e', '2018e', '2017', '2016', '2015', '2014']
['Cashflow pro Aktie in EUR', '-', '1,63', '2,38', '0,63', '2,11', '0,54', '0,52']
['Kurs-Cashflow Verhältnis (KCV)', '-', '9,25', '6,32', '21,08', '6,24', '23,94', '13,00']
['Umsatz', '', '', '', '2017', '2016', '2015', '2014']
['Umsatz in Mio. EUR', '', '', '', '299,30', '412,80', '384,80', '140,70']
['Umsatzwachstum', '', '', '', '-27,49%', '+7,27%', '+173,48%', '+632,81%']
['Umsatz pro Mitarbeiter in EUR', '', '', '', '598.600,00', '1.294.043,88', '1.485.714,28', '1.851.315,78']
['Buchwert', '', '', '', '2017', '2016', '2015', '2014']
['Buchwert pro Aktie in EUR', '', '', '', '18,03', '19,16', '16,87', '9,76']
['Kurs-Buchwert-Verhältnis', '', '', '', '0,73', '0,75', '0,84', '0,76']
['Bilanz', '', '', '', '2017', '2016', '2015', '2014']
['Bilanzsumme in Mio. EUR', '', '', '', '3.779,00', '3.430,50', '3.076,20', '1.416,50']
['Eigenkapitalquote', '', '', '', '+29,48%', '+28,71%', '+27,19%', '+23,36%']
['Verschuldungsgrad', '', '', '', '+239,10%', '+248,20%', '+267,74%', '+327,94%']
['dynam. Verschuldungsgrad', '', '', '', '+7.340,49%', '+2.430,71%', '+8.958,80%', '+6.500,00%']
['Bilanzierungsmethode', '', '', '', 'IFRS', 'IFRS', 'IFRS', 'IFRS']
['Marktkapitalisierung', '', '', '', '2017', '2016', '2015', '2014']
['Marktkapitalisierung in Mio. EUR', '', '', '', '764,52', '691,20', '655,58', '237,00']
['Marktkapitalisierung/Umsatz', '', '', '', '2,55', '1,67', '1,70', '1,68']
['Marktkapitalisierung/Mitarbeiter in EUR', '', '', '', '1.529.050,36', '2.166.794,35', '2.531.214,90', '3.118.461,26']
['Marktkapitalisierung/EBITDA', '', '', '', '2,44', '2,19', '3,69', '1,37']
['Rentabilität', '', '', '', '2017', '2016', '2015', '2014']
['Cashflow-Marge', '', '', '', '+12,12%', '+24,37%', '+6,49%', '+11,86%']
['EBIT-Marge', '', '', '', '+104,17%', '+75,82%', '+45,79%', '+122,45%']
['EBITDA-Marge', '', '', '', '+104,57%', '+76,11%', '+46,04%', '+122,81%']
['Eigenkapitalrendite', '', '', '', '+11,37%', '+12,27%', '+8,61%', '+32,87%']
['Gesamtkapitalrendite', '', '', '', '+7,55%', '+7,26%', '+5,08%', '+10,58%']
['Cashflow Return on Investment', '', '', '', '+0,96%', '+2,93%', '+0,81%', '+1,17%']
['Steuerquote', '', '', '', '+9,97%', '+28,65%', '+17,40%', '+15,96%']
This is exactly what I want, only as a pandas dataframe. Can someone please tell me how to do this?
Kind regards,
Hoh
Once you have each table in a list of lists, you can load it into a new data frame. Example data:
raw_data = [
    ['Gewinn', '2020e', '2019e', '2018e', '2017', '2016', '2015', '2014'],
    ['Gewinn pro Aktie in EUR', '-', '1,20', '0,89', '1,91', '2,11', '1,83', '4,65'],
    ['KGV', '-', '12,52', '16,79', '6,95', '6,24', '7,06', '1,45'],
    ['Gewinnwachstum', '-', '+45,18%', '-60,00%', '-9,47%', '+15,30%', '-60,64%', '+80,93%'],
    ['PEG', '-', '-', '0,49', '-0,13', '-0,65', '0,46', '-0,02']
]
Create the data frame like so:
import pandas as pd

# use the first list as the headers
headers = raw_data.pop(0)
df_gewinn = pd.DataFrame(raw_data, columns=headers)
Then repeat this for each table (Dividende, Cash-Flow, Umsatz, etc.).
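A hedged sketch of what that repetition could look like when driven straight from the scraped tables, assuming each section ('Gewinn', 'Dividende', 'Cash-Flow', ...) sits in its own table element, as your printed output suggests (the dataframes dict is an illustrative name):

import pandas as pd

dataframes = {}
for table in soup.find_all('table'):
    # same tooltip cleanup as in your loop
    for tool_tip_span in table.find_all('span', {"class": "INFO_LAYER_CONTAINER"}):
        tool_tip_span.decompose()
    rows = [[cell.get_text().strip() for cell in tr.find_all(['td', 'th'])]
            for tr in table.find_all('tr')]
    rows = [r for r in rows if 0 < len(r) < 9]  # same filter as in the question
    if not rows:
        continue
    headers = rows.pop(0)
    # key each section by its first header cell, e.g. 'Gewinn'
    dataframes[headers[0]] = pd.DataFrame(rows, columns=headers)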
I have a list of lists of length 42, and each inner list has about 16 items in it. I have noticed that copying the list to Excel using xlwings only works for up to 25 lists; anything after that doesn't work, or works only intermittently. The complete list and code are below if anyone would like to reproduce the issue.
import xlwings as xw
data = [['1st', '(6)', '29.9', '407m', '22/05/2017', 'GRAC', 'M', '23.76', '23.76', '23.13', '8.62', '0.50', 'Supreme Flash', '1111', '', '$6.60'], ['8th', '(5)', '29.8', '407m', '29/05/2017', 'GRAC', '5', '24.64', '23.52', '23.15', '9.02', '16.00', 'Vision Time', '1788', '', '$17.80'], ['5th', '(3)', '30.3', '305m', '12/06/2017', 'GRAC', '5', '18.25', '17.84', '17.81', '3.30', '5.75', 'Red Red Wine', '7835', '', '$21.60'], ['2nd', '(2)', '30.1', '407m', '07/07/2017', 'GRAC', 'MX', '23.62', '23.57', '22.89', '8.60', '0.75', 'Tictac Cloud', '3222', '', '$24.10'], ['4th', '(4)', '29.9', '407m', '14/07/2017', 'GRAC', '5', '23.58', '23.44', '22.98', '8.67', '2.00', 'Kooringa Theo', '2434', '', '$7.00'], ['8th', '(4)', '29.9', '407m', '24/07/2017', 'GRAC', '5', '24.44', '23.75', '23.03', '8.88', '9.75', 'Myraki', '3458', '', '$10.20'], ['1st', '(1)', '30.4', '407m', '07/08/2017', 'GRAC', '5', '23.41', '23.41', '23.12', '8.52', '3.00', 'Myraki', '11', '', '$8.10'], ['1st', '(7)', '30.4', '407m', '14/08/2017', 'GRAF', '5', '23.53', '23.53', '23.18', '8.62', '0.75', 'Gee Tee Bee', '11', '', '$26.40'], ['4th', '(6)', '30.6', '420m', '22/08/2017', 'LISM', '5', '24.58', '23.97', '23.88', '', '8.75', 'Bazaar Mckenzie', '5444', '', '$12.20'], ['5th', '(8)', '31.7', '407m', '23/10/2017', 'GRAC', '5', '23.86', '23.55', '23.27', '8.71', '4.25', 'Hidden Sniper', '1755', '', '$8.50'], ['3rd', '(8)', '31.3', '407m', '30/10/2017', 'GRAC', '5', '23.68', '23.40', '23.13', '8.63', '4.00', 'Hidden Sniper', '1763', '', '$10.20'], ['1st', '(8)', '30.4', '420m', '14/11/2017', 'LISC', '5', '24.19', '24.19', '23.93', '9.82', '1.50', 'Pavlova Cloud', '2211', '', '$3.60'], ['3rd', '(1)', '30.3', '420m', '21/11/2017', 'LISM', '5', '24.34', '24.12', '24.10', '9.78', '3.00', 'Senor Izmir', '3333', '', '$5.50'], ['6th', '(6)', '30.2', '420m', '28/11/2017', 'LISM', '5', '24.98', '24.16', '24.01', '10.17', '11.75', 'Ace Gambler', '7666', '', '$3.80'], ['5th', '(8)', '30.2', '407m', '04/12/2017', 'GRAF', '5', '23.68', '23.11', '23.11', '8.80', '8.25', 'Slippery Valley', '1665', '', '$12.80'], ['1st', '(8)', '30.1', '411m', '08/12/2017', 'CASC', '4/5', '23.55', '23.55', '23.34', '', '2.25', 'Plane Spotter', '1111', '', '$3.40'], ['1st', '(2)', '30.3', '411m', '15/12/2017', 'CASO', '4/5', '23.29', '23.29', '23.29', '', '2.25', 'Benne Fortuna', '1111', '', '$5.10'], ['3rd', '(5)', '30.4', '407m', '01/01/2018', 'GRAF', '5', '23.68', '23.52', '22.94', '8.66', '2.25', 'Bella Lyndan', '1433', '', '$3.80'], ['5th', '(3)', '30.1', '420m', '09/01/2018', 'LISM', '5', '24.37', '24.00', '23.90', '9.82', '5.25', 'Brightest Star', '4555', '', '$4.30'], ['4th', '(2)', '30.4', '420m', '16/01/2018', 'LISM', '5', '24.60', '24.11', '24.04', '10.28', '7.00', 'Lucky Call', '7644', '', '$6.30'], ['1st', '(1)', '30.2', '407m', '22/01/2018', 'GRAC', '4/5', '23.21', '23.21', '23.20', '8.68', '6.75', 'Soltador', '7211', '', '$3.30'], ['2nd', '(2)', '29.9', '407m', '29/01/2018', 'GRAC', '4/5', '23.36', '23.25', '23.24', '8.59', '1.50', 'Slippery Valley', '7322', '', '$3.60'], ['4th', '(6)', '29.8', '407m', '05/02/2018', 'GRAF', '5', '23.69', '23.18', '23.18', '8.61', '7.25', 'Karaoke Cloud', '1444', '', '$3.10'], ['3rd', '(6)', '30.0', '420m', '13/02/2018', 'LISM', '5', '24.18', '24.01', '24.01', '9.80', '2.25', 'Tranquil Invader', '4333', '', '$5.90'], ['3rd', '(1)', '30.0', '420m', '20/02/2018', 'LISM', '5', '24.23', '24.10', '23.95', '9.86', '1.75', 'Benne Fortuna', '3333', '', '$3.30'], ['2nd', '(4)', '30.0', '420m', '27/02/2018', 'LISM', '5', 
'24.18', '23.91', '23.91', '9.75', '3.75', 'Oh So Fabio', '3322', '\n$4.70'], ['6th', '(4)', '30.0', '407m', '05/03/2018', 'GRAF', '5', '24.57', '23.63', '23.36', '8.63', '13.25', 'Star Billing', '2676', '', '$5.90'], ['1st', '(4)', '29.8', '407m', '12/03/2018', 'GRAC', '4/5', '23.27', '23.27', '23.08', '8.57', '0.50', 'Senor Izmir', '3321', '', '$8.50'], ['3rd', '(8)', '30.4', '407m', '19/03/2018', 'GRAC', '4/5', '23.24', '23.02', '23.02', '8.58', '3.00', "Freddy's Back", '1633', '', '$17.40'], ['6th', '(5)', '30.6', '420m', '27/03/2018', 'LISM', '5', '24.88', '24.25', '23.97', '10.31', '9.00', 'Kingsbrae Steve', '7666', '', '$4.00'], ['1st', '(3)', '30.4', '407m', '02/04/2018', 'GRAF', '5', '23.17', '23.17', '23.15', '8.54', '1.25', 'Whistler Valley', '2221', '', '$5.60'], ['3rd', '(1)', '30.3', '407m', '09/04/2018', 'GRAC', 'NG', '23.41', '23.13', '23.13', '8.53', '4.00', 'Orara Sal', '4323', '', '$3.60'], ['5th', '(3)', '30.0', '520m', '17/04/2018', 'LISM', '4/5', '30.67', '30.30', '30.06', '4.53', '5.25', 'Kulu Turkey', '2455', '', '$4.70'], ['5th', '(5)', '30.2', '411m', '27/04/2018', 'CASO', '5', '24.26', '23.86', '23.18', '', '5.75', 'Our Cavalier', '5555', '', '$4.30'], ['6th', '(3)', '31.4', '305m', '13/08/2018', 'GRAC', '4/5', '18.29', '17.79', '17.31', '3.31', '7.00', "Here's Molly", '8856', '', '$7.60'], ['1st', '(6)', '31.6', '305m', '20/08/2018', 'GRAC', '5', '17.66', '17.66', '17.66', '3.19', '1.25', 'Sandler', '1111', '', '$3.30'], ['1st', '(3)', '31.6', '420m', '28/08/2018', 'LISM', '4/5', '24.46', '24.46', '24.05', '9.95', '2.00', "Don't Seamus", '1111', '', '$2.00'], ['7th', '(7)', '31.6', '407m', '03/09/2018', 'GRAF', '4/5', '24.05', '23.48', '23.39', '8.72', '8.25', 'Kooringa Molly', '4667', '', '$6.50'], ['6th', '(4)', '31.4', '411m', '07/09/2018', 'CASC', '5', '23.90', '23.49', '23.15', '', '5.75', 'Nitro Beach', '6566', '', '$5.70'], ['4th', '(3)', '31.1', '420m', '11/09/2018', 'LISM', '4/5', '24.33', '23.91', '23.80', '9.78', '6.00', 'Blue Max', '4444', '', '$10.10'], ['5th', '(3)', '31.3', '411m', '14/09/2018', 'CASO', '5', '24.01', '23.25', '22.97', '', '10.75', 'Kingsbrae Steve', '7755', '\n$3.60']]
wb = xw.Book('example.xlsm')
sht = wb.sheets["Sheet1"]
sht.clear()
sht.range('A1').value = data[1:26]
The above code works and copies each list to a successive row. However, it doesn't work when I change the 26 to any higher number. The code also doesn't work if my starting index is 0, for example sht.range('A1').value = data[0:5]. How can I get this working properly?
OK, I've realised xlwings struggles and is unpredictable with plain lists. For anyone having this issue: simply convert the list to a dataframe and it works as expected. Sample code below:
import xlwings as xw
import pandas as pd
data = [['1st', '(6)',...]] #View complete list above
wb = xw.Book('example.xlsm')
sht = wb.sheets["Sheet1"]
sht.clear()
df = pd.DataFrame(data)
sht.range("A1").value = df
All lists/tuples that represent rows must be of the same length. It's a known limitation, and an appropriate error message should ship with one of the next releases; see the issue.
Your workaround works because numpy arrays and pandas DataFrames are always regular (rectangular) arrays.
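Given that constraint, a hedged alternative to the DataFrame conversion is to pad every row to a common length before assigning the raw list (a minimal sketch):

max_len = max(len(row) for row in data)
padded = [row + [''] * (max_len - len(row)) for row in data]
sht.range('A1').value = padded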
Here's my code:
import urllib.request
import bs4 as bs

source = urllib.request.urlopen('http://nflcombineresults.com/nflcombinedata_expanded.php?year=2015&pos=&college=').read()
soup = bs.BeautifulSoup(source, 'lxml')
table = soup.table
table = soup.find(id='datatable')
table_rows = table.find_all('tr')
#print(table_rows)
year = []
name = []
college = []
pos = []
height = []
weight = []
hand_size = []
arm_length = []
wonderlic = []
fortyyrd = []
for row in table_rows[1:]:
    col = row.find_all('td')
    #row = [i.text for i in td]
    #print(col[4])
    # Create a variable of the string inside each <td> tag pair,
    column_1 = col[0].string.strip()
    # and append it to each variable
    year.append(column_1)
    column_2 = col[1].string.strip()
    name.append(column_2)
    column_3 = col[2].string.strip()
    college.append(column_3)
    column_4 = col[3].string.strip()
    pos.append(column_4)
    #print(col[4])
    column_5 = col[4].string.strip()
    height.append(column_5)
There are several more columns in the table I want to add, but whenever I try to run these last two lines, I get an error saying:
"AttributeError: 'NoneType' object has no attribute 'strip'"
When I print col[4] right above this line, I get:
<td><div align="center">69</div></td>
I originally thought this was due to missing data, but the first instance of missing data in the original table on the website is in the 9th column (Wonderlic) of the first row, not the 4th column.
There are several other columns not included in this snippet that I want to add to my dataframe, and I'm getting the NoneType error with them as well, despite there being an entry in that cell.
I'm fairly new to parsing tables from a site using BeautifulSoup, so this could be a stupid question, but why is this object NoneType, and how can I fix it so I can put this table into a pandas dataframe?
Alternatively, if you want to try it with pandas, you can do it like so:
import pandas as pd
df = pd.read_html("http://nflcombineresults.com/nflcombinedata_expanded.php?year=2015&pos=&college=")[0]
df.head()
This parses the page and loads the table straight into a DataFrame.
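If the page exposes more than one table, read_html can also be pointed at the one you want via its id (attrs is a standard read_html parameter; 'datatable' comes from your own snippet):

df = pd.read_html("http://nflcombineresults.com/nflcombinedata_expanded.php?year=2015&pos=&college=", attrs={'id': 'datatable'})[0]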
As for the error itself:
AttributeError: 'NoneType' object has no attribute 'strip'
It is raised on the last row of the table, which has a single spanning cell; here is its HTML:
<tr style="background-color:#333333;"><td colspan="15"> </td></tr>
Just slice it:
for row in table_rows[1:-1]:
As far as improving the overall quality of the code goes, you can/should follow @宏杰李's answer.
import requests
from bs4 import BeautifulSoup

r = requests.get('http://nflcombineresults.com/nflcombinedata_expanded.php?year=2015&pos=&college=')
soup = BeautifulSoup(r.text, 'lxml')
for tr in soup.table.find_all('tr'):
    row = [td.text for td in tr.find_all('td')]
    print(row)
out:
['Year', 'Name', 'College', 'POS', 'Height (in)', 'Weight (lbs)', 'Hand Size (in)', 'Arm Length (in)', 'Wonderlic', '40 Yard', 'Bench Press', 'Vert Leap (in)', 'Broad Jump (in)', 'Shuttle', '3Cone', '60Yd Shuttle']
['2015', 'Ameer Abdullah', 'Nebraska', 'RB', '69', '205', '8.63', '30.00', '', '4.60', '24', '42.5', '130', '3.95', '6.79', '11.18']
['2015', 'Nelson Agholor', 'Southern California', 'WR', '73', '198', '9.25', '32.25', '', '4.42', '12', '', '', '', '', '']
['2015', 'Malcolm Agnew', 'Southern Illinois', 'RB', '70', '202', '', '', '', '*4.59', '', '', '', '', '', '']
['2015', 'Jay Ajayi', 'Boise State', 'RB', '73', '221', '10.00', '32.00', '24', '4.57', '19', '39.0', '121', '4.10', '7.10', '11.10']
['2015', 'Brandon Alexander', 'Central Florida', 'FS', '74', '195', '', '', '', '*4.59', '', '', '', '', '', '']
['2015', 'Kwon Alexander', 'Louisiana State', 'OLB', '73', '227', '9.25', '30.25', '', '4.55', '24', '36.0', '121', '4.20', '7.14', '']
['2015', 'Mario Alford', 'West Virginia', 'WR', '68', '180', '9.38', '31.25', '', '4.43', '13', '34.0', '121', '4.07', '6.64', '11.22']
['2015', 'Detric Allen', 'East Carolina', 'CB', '73', '200', '', '', '', '*4.59', '', '', '', '', '', '']
['2015', 'Javorius Allen', 'Southern California', 'RB', '73', '221', '9.38', '31.75', '12', '4.53', '11', '35.5', '121', '4.28', '6.96', '']
As you can see, there are a lot of empty fields in the table. The better way is to put all the fields of a row in a list and then unpack them, or use a namedtuple.
This will improve your code's stability.
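For instance, a hedged sketch of the namedtuple route, continuing from the soup object above (the field names are illustrative, matched to the 16 header columns, and the single-cell footer row is skipped):

from collections import namedtuple

Player = namedtuple('Player', ['year', 'name', 'college', 'pos', 'height',
                               'weight', 'hand_size', 'arm_length', 'wonderlic',
                               'forty', 'bench', 'vert', 'broad', 'shuttle',
                               'cone3', 'shuttle60'])

players = []
for tr in soup.table.find_all('tr')[1:]:
    row = [td.text for td in tr.find_all('td')]
    if len(row) == 16:  # skips the single-cell footer row
        players.append(Player(*row))

print(players[0].name, players[0].height)  # Ameer Abdullah 69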
My CSV file is here
Here is my data format:
1763212493,zhangniuzhang,male,,yes，i do,hubei wuhan,1763212493,69,86,316,,,tp2.sinaimg.cn/1763212493/50/5613831962/1,0,"free,house,trip,80","1863415082,1752861352"
(the comma inside "yes，i do" is a Chinese full-width comma, not an ASCII one)
and my code :
import csv
with open("test.csv", "r") as f:
reader = csv.DictReader(f)
for row in reader:
print row
It's very simple, but I got the following:
{'mn': '316', 'ci': '', 'sx': 'male', 'ei': '', 'ad': 'hubei;"wuhan', 'vi': '', 'fui;': 'house', 'de': 'yes\xef\xbc\x8ci do', 'iu': 'tp2.sinaimg.cn/1763212493/50/5613831962/1', 'an': '69', 'un': '1763212493', 'iv': '0', 'sn': 'zhangniuzhang', None: ['trip', '80""', '1863415082', '1752861352"""'], 'tg': 'free', '_id': '1763212493', 'fn': '86'}
{'mn': '1104', 'ci': '', 'sx': 'femail', 'ei': '', 'ad': 'jilin;"changchun', 'vi': '', 'fui;': 'art', 'de': '', 'iu': 'tp2.sinaimg.cn/1854635021/50/1289455604/0', 'an': '71', 'un': '1854635021', 'iv': '0', 'sn': 'ladywang', None: ['reading', 'music""', '1949520584', '1288127940', '1193111400"""'], 'tg': 'life', '_id': '1854635021', 'fn': '258'}
For the first record, ad equals hubei;"wuhan, but in the original file there is no "; that value sits in a different column.
Many fields have the wrong value. For the first record:
1763212493,zhangniuzhang,male,,yes，i do,hubei wuhan,1763212493,69,86,316,,,tp2.sinaimg.cn/1763212493/50/5613831962/1,0,"free,house,trip,80","1863415082,1752861352"
The output should be:
{'mn': '316', 'ci': '', 'sx': 'male', 'ei': '', 'ad': 'hubei wuhan', 'vi': '', 'fui': '1863415082,1752861352', 'de': 'yes\xef\xbc\x8ci do', 'iu': 'tp2.sinaimg.cn/1763212493/50/5613831962/1', 'an': '69', 'un': '1763212493', 'iv': '0', 'sn': 'zhangniuzhang', 'tg': 'free,house,trip,80', '_id': '1763212493', 'fn': '86'}
But the output is a mess: neither the right order nor the right values.
Any suggestions?
You can try it like this, where filepath is the path of your test.csv:
fdata = open(filepath)
fread = [l for l in fdata.readlines() if l.strip()]
Now you can iterate over fread.
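From there, a hedged sketch: feed the non-blank lines back into csv.DictReader so the quoting of fields is still honoured (this assumes the first line of the file holds the column names):

import csv

with open(filepath) as fdata:
    lines = [l for l in fdata if l.strip()]

for row in csv.DictReader(lines):
    print(row)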