Using Python to Read Non-Strict CSV File - python

My CSV file is here
Here is my data format:
1763212493,zhangniuzhang,male,,yes,(this is chinese comma,not in english)i
do,hubei wuhan,1763212493,69,86,316,,,tp2.sinaimg.cn/1763212493/50/5613831962/1,0,"free,house,trip,80","1863415082,1752861352"
and my code :
import csv

# Read test.csv and print each data row as a dict keyed by the header row.
# (The original paste had lost its indentation and used the Python 2
# `print` statement; `print(row)` behaves the same for a single argument.)
with open("test.csv", "r") as f:
    reader = csv.DictReader(f)
    for row in reader:
        print(row)
It's very simple, but I got like following:
{'mn': '316', 'ci': '', 'sx': 'male', 'ei': '', 'ad': 'hubei;"wuhan', 'vi': '', 'fui;': 'house', 'de': 'yes\xef\xbc\x8ci do', 'iu': 'tp2.sinaimg.cn/1763212493/50/5613831962/1', 'an': '69', 'un': '1763212493', 'iv': '0', 'sn': 'zhangniuzhang', None: ['trip', '80""', '1863415082', '1752861352"""'], 'tg': 'free', '_id': '1763212493', 'fn': '86'}
{'mn': '1104', 'ci': '', 'sx': 'femail', 'ei': '', 'ad': 'jilin;"changchun', 'vi': '', 'fui;': 'art', 'de': '', 'iu': 'tp2.sinaimg.cn/1854635021/50/1289455604/0', 'an': '71', 'un': '1854635021', 'iv': '0', 'sn': 'ladywang', None: ['reading', 'music""', '1949520584', '1288127940', '1193111400"""'], 'tg': 'life', '_id': '1854635021', 'fn': '258'}
For the first record, ad equals hubei;"wuhan, but in the original file there is no "; it is in a different column.
Many fields have wrong value. For the first record:
1763212493,zhangniuzhang,male,,yes,i
do,hubei wuhan,1763212493,69,86,316,,,tp2.sinaimg.cn/1763212493/50/5613831962/1,0,"free,house,trip,80","1863415082,1752861352"
The output should be:
{'mn': '316', 'ci': '', 'sx': 'male', 'ei': '', 'ad': 'hubei wuhan', 'vi': '', 'fui': '1863415082,1752861352', 'de': 'yes\xef\xbc\x8ci do', 'iu': 'tp2.sinaimg.cn/1763212493/50/5613831962/1', 'an': '69', 'un': '1763212493', 'iv': '0', 'sn': 'zhangniuzhang', 'tg': 'free,house,trip,80', '_id': '1763212493', 'fn': '86'}
But the output is a mess — neither the right order nor the right values.
Any suggestions?

You can try it like this.
filepath is the path of your test.csv
# Use a context manager so the handle is closed deterministically
# (the original `open()` was never closed).
with open(filepath) as fdata:
    # Keep only non-blank lines; strip() also drops whitespace-only lines.
    fread = [l for l in fdata.readlines() if l.strip()]
now you can iterate fread

Related

Better way to remap elements in each json object in json array

I have a json array that looks like this:
{'1': {'ID': '1', ' ': 'Aaron K###', 'Distributor': 'National Energy', 'State': 'BC', 'Brand': 'Trane', 'Cell': '778-###-####', 'email address': '', 'Notes': '', '': ''}, '2': {'ID': '2', ' ': 'Martin', 'Distributor': 'Pierce Phelps', 'State': 'PA', 'Brand': 'Bryant/Carrier', 'Cell': '267-###-####', 'email address': '', 'Notes': '', '': ''},...
and I wanted to reconfigure it so that it matched django's serialization format.
I wrote this function to do it:
def re_serialize_reg_json(d=None):
    """Re-shape raw registration records into Django's serialization format.

    Each record gains 'pk', 'model' and a nested 'fields' dict, and the raw
    flat columns are removed from the top level.

    Parameters:
        d: mapping of id -> record dict. When None (the default, matching the
           original call signature), the data is loaded via load_file(),
           which is defined elsewhere in this module.

    Returns:
        The same mapping, mutated in place.
    """
    if d is None:
        d = load_file()
    for i in d:
        rec = d[i]
        # The source data stores the person's name under the ' ' column.
        rec['Name'] = rec[' ']
        rec['pk'] = rec['ID']
        rec['model'] = 'homepage.territorymanager'
        rec['fields'] = {
            'Name': rec['Name'],
            'Cell': rec['Cell'],
            'Email Address': rec['email address'],
            'Notes': rec['Notes'],
            'Distributor': rec['Distributor'],
            'State': rec['State'],
            'Brand': rec['Brand'],
        }
        # Drop the raw columns now folded into 'fields' (plus the junk '' key).
        for key in (' ', 'ID', 'Name', 'Cell', 'email address', 'Notes',
                    'Distributor', 'State', 'Brand', ''):
            del rec[key]
    return d
and it works fine:
output:
{'1': {'pk': '1', 'model': 'homepage.territorymanager', 'fields': {'Name': 'Aaron Kirkus', 'Cell': '778-875-4983', 'Email Address': '', 'Notes': '', 'Distributor': 'National Energy', 'State': 'BC', 'Brand': 'Trane'}}, '2': {'pk': '2', 'model': 'homepage.territorymanager', 'fields': {'Name': 'Aaron Martin ', 'Cell': '267-246-0522', 'Email Address': '', 'Notes': '', 'Distributor': 'Pierce Phelps', 'State': 'PA', 'Brand': 'Bryant/Carrier'}},...
But I feel like it's not a very effective method for achieving this. Any ideas appreciated.
A list to store some keys, and some loop, would make the code a lot nicer
def re_serialize_reg_json(d):
    """Re-shape records in `d` (id -> row dict) into Django's pk/model/fields
    serialization format, mutating and returning `d`."""
    for key, values in d.items():
        # Promote the odd ' ' column to 'Name' and add Django bookkeeping keys.
        values.update({'Name': values[' '], 'pk': values['ID'],
                       'model': 'homepage.territorymanager'})
        f = ['Name', 'Cell', 'email address', 'Notes', 'Distributor', 'State', 'Brand']
        # Title-case multi-word keys ('email address' -> 'Email Address').
        values['fields'] = {' '.join(w.capitalize() for w in k.split()): values[k]
                            for k in f}
        # Remove the now-redundant flat columns (plus the junk ' ', '' and 'ID').
        for k in f + [' ', '', 'ID']:
            del values[k]
    return d

How to use Python xlwings to copy a large list of lists to Excel

I have a lists of lists of length 42, and each list has about 16 items in it. I have noticed that copying the list to excel using xlwings only works for up to 25 lists and anything after that doesn't work, or works sometimes and sometimes doesn't. I have the complete list and code below if anyone would like to reproduce the issue.
import xlwings as xw
data = [['1st', '(6)', '29.9', '407m', '22/05/2017', 'GRAC', 'M', '23.76', '23.76', '23.13', '8.62', '0.50', 'Supreme Flash', '1111', '', '$6.60'], ['8th', '(5)', '29.8', '407m', '29/05/2017', 'GRAC', '5', '24.64', '23.52', '23.15', '9.02', '16.00', 'Vision Time', '1788', '', '$17.80'], ['5th', '(3)', '30.3', '305m', '12/06/2017', 'GRAC', '5', '18.25', '17.84', '17.81', '3.30', '5.75', 'Red Red Wine', '7835', '', '$21.60'], ['2nd', '(2)', '30.1', '407m', '07/07/2017', 'GRAC', 'MX', '23.62', '23.57', '22.89', '8.60', '0.75', 'Tictac Cloud', '3222', '', '$24.10'], ['4th', '(4)', '29.9', '407m', '14/07/2017', 'GRAC', '5', '23.58', '23.44', '22.98', '8.67', '2.00', 'Kooringa Theo', '2434', '', '$7.00'], ['8th', '(4)', '29.9', '407m', '24/07/2017', 'GRAC', '5', '24.44', '23.75', '23.03', '8.88', '9.75', 'Myraki', '3458', '', '$10.20'], ['1st', '(1)', '30.4', '407m', '07/08/2017', 'GRAC', '5', '23.41', '23.41', '23.12', '8.52', '3.00', 'Myraki', '11', '', '$8.10'], ['1st', '(7)', '30.4', '407m', '14/08/2017', 'GRAF', '5', '23.53', '23.53', '23.18', '8.62', '0.75', 'Gee Tee Bee', '11', '', '$26.40'], ['4th', '(6)', '30.6', '420m', '22/08/2017', 'LISM', '5', '24.58', '23.97', '23.88', '', '8.75', 'Bazaar Mckenzie', '5444', '', '$12.20'], ['5th', '(8)', '31.7', '407m', '23/10/2017', 'GRAC', '5', '23.86', '23.55', '23.27', '8.71', '4.25', 'Hidden Sniper', '1755', '', '$8.50'], ['3rd', '(8)', '31.3', '407m', '30/10/2017', 'GRAC', '5', '23.68', '23.40', '23.13', '8.63', '4.00', 'Hidden Sniper', '1763', '', '$10.20'], ['1st', '(8)', '30.4', '420m', '14/11/2017', 'LISC', '5', '24.19', '24.19', '23.93', '9.82', '1.50', 'Pavlova Cloud', '2211', '', '$3.60'], ['3rd', '(1)', '30.3', '420m', '21/11/2017', 'LISM', '5', '24.34', '24.12', '24.10', '9.78', '3.00', 'Senor Izmir', '3333', '', '$5.50'], ['6th', '(6)', '30.2', '420m', '28/11/2017', 'LISM', '5', '24.98', '24.16', '24.01', '10.17', '11.75', 'Ace Gambler', '7666', '', '$3.80'], ['5th', '(8)', '30.2', '407m', '04/12/2017', 
'GRAF', '5', '23.68', '23.11', '23.11', '8.80', '8.25', 'Slippery Valley', '1665', '', '$12.80'], ['1st', '(8)', '30.1', '411m', '08/12/2017', 'CASC', '4/5', '23.55', '23.55', '23.34', '', '2.25', 'Plane Spotter', '1111', '', '$3.40'], ['1st', '(2)', '30.3', '411m', '15/12/2017', 'CASO', '4/5', '23.29', '23.29', '23.29', '', '2.25', 'Benne Fortuna', '1111', '', '$5.10'], ['3rd', '(5)', '30.4', '407m', '01/01/2018', 'GRAF', '5', '23.68', '23.52', '22.94', '8.66', '2.25', 'Bella Lyndan', '1433', '', '$3.80'], ['5th', '(3)', '30.1', '420m', '09/01/2018', 'LISM', '5', '24.37', '24.00', '23.90', '9.82', '5.25', 'Brightest Star', '4555', '', '$4.30'], ['4th', '(2)', '30.4', '420m', '16/01/2018', 'LISM', '5', '24.60', '24.11', '24.04', '10.28', '7.00', 'Lucky Call', '7644', '', '$6.30'], ['1st', '(1)', '30.2', '407m', '22/01/2018', 'GRAC', '4/5', '23.21', '23.21', '23.20', '8.68', '6.75', 'Soltador', '7211', '', '$3.30'], ['2nd', '(2)', '29.9', '407m', '29/01/2018', 'GRAC', '4/5', '23.36', '23.25', '23.24', '8.59', '1.50', 'Slippery Valley', '7322', '', '$3.60'], ['4th', '(6)', '29.8', '407m', '05/02/2018', 'GRAF', '5', '23.69', '23.18', '23.18', '8.61', '7.25', 'Karaoke Cloud', '1444', '', '$3.10'], ['3rd', '(6)', '30.0', '420m', '13/02/2018', 'LISM', '5', '24.18', '24.01', '24.01', '9.80', '2.25', 'Tranquil Invader', '4333', '', '$5.90'], ['3rd', '(1)', '30.0', '420m', '20/02/2018', 'LISM', '5', '24.23', '24.10', '23.95', '9.86', '1.75', 'Benne Fortuna', '3333', '', '$3.30'], ['2nd', '(4)', '30.0', '420m', '27/02/2018', 'LISM', '5', '24.18', '23.91', '23.91', '9.75', '3.75', 'Oh So Fabio', '3322', '\n$4.70'], ['6th', '(4)', '30.0', '407m', '05/03/2018', 'GRAF', '5', '24.57', '23.63', '23.36', '8.63', '13.25', 'Star Billing', '2676', '', '$5.90'], ['1st', '(4)', '29.8', '407m', '12/03/2018', 'GRAC', '4/5', '23.27', '23.27', '23.08', '8.57', '0.50', 'Senor Izmir', '3321', '', '$8.50'], ['3rd', '(8)', '30.4', '407m', '19/03/2018', 'GRAC', '4/5', '23.24', '23.02', '23.02', 
'8.58', '3.00', "Freddy's Back", '1633', '', '$17.40'], ['6th', '(5)', '30.6', '420m', '27/03/2018', 'LISM', '5', '24.88', '24.25', '23.97', '10.31', '9.00', 'Kingsbrae Steve', '7666', '', '$4.00'], ['1st', '(3)', '30.4', '407m', '02/04/2018', 'GRAF', '5', '23.17', '23.17', '23.15', '8.54', '1.25', 'Whistler Valley', '2221', '', '$5.60'], ['3rd', '(1)', '30.3', '407m', '09/04/2018', 'GRAC', 'NG', '23.41', '23.13', '23.13', '8.53', '4.00', 'Orara Sal', '4323', '', '$3.60'], ['5th', '(3)', '30.0', '520m', '17/04/2018', 'LISM', '4/5', '30.67', '30.30', '30.06', '4.53', '5.25', 'Kulu Turkey', '2455', '', '$4.70'], ['5th', '(5)', '30.2', '411m', '27/04/2018', 'CASO', '5', '24.26', '23.86', '23.18', '', '5.75', 'Our Cavalier', '5555', '', '$4.30'], ['6th', '(3)', '31.4', '305m', '13/08/2018', 'GRAC', '4/5', '18.29', '17.79', '17.31', '3.31', '7.00', "Here's Molly", '8856', '', '$7.60'], ['1st', '(6)', '31.6', '305m', '20/08/2018', 'GRAC', '5', '17.66', '17.66', '17.66', '3.19', '1.25', 'Sandler', '1111', '', '$3.30'], ['1st', '(3)', '31.6', '420m', '28/08/2018', 'LISM', '4/5', '24.46', '24.46', '24.05', '9.95', '2.00', "Don't Seamus", '1111', '', '$2.00'], ['7th', '(7)', '31.6', '407m', '03/09/2018', 'GRAF', '4/5', '24.05', '23.48', '23.39', '8.72', '8.25', 'Kooringa Molly', '4667', '', '$6.50'], ['6th', '(4)', '31.4', '411m', '07/09/2018', 'CASC', '5', '23.90', '23.49', '23.15', '', '5.75', 'Nitro Beach', '6566', '', '$5.70'], ['4th', '(3)', '31.1', '420m', '11/09/2018', 'LISM', '4/5', '24.33', '23.91', '23.80', '9.78', '6.00', 'Blue Max', '4444', '', '$10.10'], ['5th', '(3)', '31.3', '411m', '14/09/2018', 'CASO', '5', '24.01', '23.25', '22.97', '', '10.75', 'Kingsbrae Steve', '7755', '\n$3.60']]
# Open the workbook and write rows 2-26 of `data` starting at A1; xlwings
# expands a list of lists across rows and columns automatically.
# NOTE(review): rows 26 and 42 of `data` above have only 15 items instead of
# 16 (a field fused as '\n$4.70'), and xlwings requires all rows of a 2-D
# write to be the same length -- that is why slices covering those rows fail.
wb = xw.Book('example.xlsm')
sht = wb.sheets["Sheet1"]
sht.clear()
sht.range('A1').value = data[1:26]
The above code works and copies each list to a successive row. However it doesn't work when I change the 26 to any higher number. Also the code doesn't work if my starting index is 0, for example sht.range('A1').value = data[0:5]. How can I get this working properly?
Ok I've realised xlwings certainly struggles and is unpredictable with lists. For anyone having this issue, simply convert the list to a dataframe and it works as expected. Sample code below:
import xlwings as xw
import pandas as pd
# Placeholder: `...` stands in for the full list of lists shown earlier.
data = [['1st', '(6)',...]] #View complete list above
wb = xw.Book('example.xlsm')
sht = wb.sheets["Sheet1"]
sht.clear()
# Routing the data through a DataFrame pads short rows with NaN, so every
# row written to Excel has the same width (which plain lists did not).
df = pd.DataFrame(data)
sht.range("A1").value = df
All lists/tuples that represent rows must be of the same length. It's a known limitation and there should be an appropriate error message with one of the next releases, see the issue.
Your answer works as numpy arrays or pandas dataframes are always regular arrays.

Turn a text file into a dictionary with Python

I have a text file with a pattern:
[Badges_373382]
Deleted=0
Button2=0 1497592154
Button1=0 1497592154
ProxReader=0
StartProc=100 1509194246 ""
NextStart=0
LastSeen=1509194246
Enabled=1
Driver=Access Control
Program=AccessProxBadge
LocChg=1509120279
Name=asd
Neuron=7F0027BF2D
Owner=373381
LostSince=1509120774
Index1=218
Photo=unknown.jpg
LastProxReader=0
Temp=0
LastTemp=0
LastMotionless=0
LastMotion=1497592154
BatteryLow=0
PrevReader=10703
Reader=357862
SuspendTill=0
SuspendSince=0
Status=1001
ConvertUponDownload=0
AXSFlags=0
Params=10106
Motion=1
USER_DATA_CreationDate=6/15/2017 4:48:15 PM
OwnerOldName=asd
[Badges_373384]
Deleted=0
Button2=0 1497538610
Button1=0 1497538610
ProxReader=0
StartProc=100 1509194246 ""
NextStart=0
LastSeen=1513872678
Enabled=1
Driver=Access Control
Program=AccessProxBadge
LocChg=1513872684
Name=dsa
Neuron=7F0027CC1C
Owner=373383
LostSince=1513872723
Index1=219
Photo=unknown.jpg
LastProxReader=0
Temp=0
LastTemp=0
LastMotionless=0
LastMotion=1497538610
BatteryLow=0
PrevReader=357874
Reader=357873
SuspendTill=0
SuspendSince=0
Status=1001
ConvertUponDownload=0
AXSFlags=0
Params=10106
Motion=1
USER_DATA_CreationDate=6/15/2017 4:48:51 PM
OwnerOldName=dsa
[Badges_373386]
Deleted=0
Button2=0 1497780768
Button1=0 1497780768
ProxReader=0
StartProc=100 1509194246 ""
NextStart=0
LastSeen=1514124910
Enabled=1
Driver=Access Control
Program=AccessProxBadge
LocChg=1514124915
Name=ss
Neuron=7F0027B5FD
Owner=373385
LostSince=1514124950
Index1=220
Photo=unknown.jpg
LastProxReader=0
Temp=0
LastTemp=0
LastMotionless=0
LastMotion=1497780768
BatteryLow=0
PrevReader=357872
Reader=357871
SuspendTill=0
SuspendSince=0
Status=1001
ConvertUponDownload=0
AXSFlags=0
Params=10106
Motion=1
USER_DATA_CreationDate=6/15/2017 4:49:24 PM
OwnerOldName=ss
Each new "Badge" section starts with [Badges_number] and ends with a blank line.
Using Python 3.6, I would like to turn this file into a dictionary so that I could easily access that information.
It should look like this:
content = {"Badges_373382:{"Deleted:0,.."},"Badges_371231":{"Deleted":0,..}"}
I'm pretty confused on how to do that, I'd love to get some help.
Thanks!
This is basically an INI file, and Python provides the configparser module to parse such files.
import configparser

# The badge file is INI-formatted ([section] headers + key=value lines),
# which configparser handles natively.
config = configparser.ConfigParser()
# read() takes a filename and manages the file handle itself; the original
# readfp(open(...)) leaked the handle, and readfp() was deprecated and
# removed in Python 3.12.
config.read('badges.ini')
# Flatten the parser into a plain dict-of-dicts keyed by section name.
r = {section: dict(config[section]) for section in config.sections()}
You can loop through each line and keep track if you have seen a header in the format [Badges_373382]:
import re
import itertools

with open('filename.txt') as f:
    # Strip newlines and drop the blank separator lines.
    stripped = filter(lambda x: x, [i.strip('\n') for i in f])
    # groupby keyed on "is this a [Badges_NNN] header?" yields alternating
    # header-run / body-run groups.
    new_data = [(a, list(b)) for a, b in itertools.groupby(
        stripped, key=lambda x: bool(re.findall(r'\[[a-zA-Z]+_+\d+\]', x)))]
    # Pair each header group with the body group that follows it; split each
    # body line on the FIRST '=' only so values containing '=' stay intact.
    final_data = {new_data[i][-1][-1]: dict(c.split('=', 1) for c in new_data[i + 1][-1])
                  for i in range(0, len(new_data), 2)}
Output:
{'[Badges_373384]': {'OwnerOldName': 'dsa', 'LastMotionless': '0', 'NextStart': '0', 'Driver': 'Access Control', 'LastTemp': '0', 'USER_DATA_CreationDate': '6/15/2017 4:48:51 PM', 'Program': 'AccessProxBadge', 'LocChg': '1513872684', 'Reader': '357873', 'LostSince': '1513872723', 'LastMotion': '1497538610', 'Status': '1001', 'Deleted': '0', 'SuspendTill': '0', 'ProxReader': '0', 'LastSeen': '1513872678', 'BatteryLow': '0', 'Index1': '219', 'Name': 'dsa', 'Temp': '0', 'Enabled': '1', 'StartProc': '100 1509194246 ""', 'Motion': '1', 'Button2': '0 1497538610', 'Button1': '0 1497538610', 'SuspendSince': '0', 'ConvertUponDownload': '0', 'PrevReader': '357874', 'AXSFlags': '0', 'LastProxReader': '0', 'Photo': 'unknown.jpg', 'Neuron': '7F0027CC1C', 'Owner': '373383', 'Params': '10106'}, '[Badges_373382]': {'OwnerOldName': 'asd', 'LastMotionless': '0', 'NextStart': '0', 'Driver': 'Access Control', 'LastTemp': '0', 'USER_DATA_CreationDate': '6/15/2017 4:48:15 PM', 'Program': 'AccessProxBadge', 'LocChg': '1509120279', 'Reader': '357862', 'LostSince': '1509120774', 'LastMotion': '1497592154', 'Status': '1001', 'Deleted': '0', 'SuspendTill': '0', 'ProxReader': '0', 'LastSeen': '1509194246', 'BatteryLow': '0', 'Index1': '218', 'Name': 'asd', 'Temp': '0', 'Enabled': '1', 'StartProc': '100 1509194246 ""', 'Motion': '1', 'Button2': '0 1497592154', 'Button1': '0 1497592154', 'SuspendSince': '0', 'ConvertUponDownload': '0', 'PrevReader': '10703', 'AXSFlags': '0', 'LastProxReader': '0', 'Photo': 'unknown.jpg', 'Neuron': '7F0027BF2D', 'Owner': '373381', 'Params': '10106'}, '[Badges_373386]': {'OwnerOldName': 'ss', 'LastMotionless': '0', 'NextStart': '0', 'Driver': 'Access Control', 'LastTemp': '0', 'USER_DATA_CreationDate': '6/15/2017 4:49:24 PM', 'Program': 'AccessProxBadge', 'LocChg': '1514124915', 'Reader': '357871', 'LostSince': '1514124950', 'LastMotion': '1497780768', 'Status': '1001', 'Deleted': '0', 'SuspendTill': '0', 'ProxReader': '0', 'LastSeen': '1514124910', 'BatteryLow': 
'0', 'Index1': '220', 'Name': 'ss', 'Temp': '0', 'Enabled': '1', 'StartProc': '100 1509194246 ""', 'Motion': '1', 'Button2': '0 1497780768', 'Button1': '0 1497780768', 'SuspendSince': '0', 'ConvertUponDownload': '0', 'PrevReader': '357872', 'AXSFlags': '0', 'LastProxReader': '0', 'Photo': 'unknown.jpg', 'Neuron': '7F0027B5FD', 'Owner': '373385', 'Params': '10106'}}
You can just go through each line of the file and add what you need. Their are three cases of lines you can come across:
1. The line is a header; it will be a key in the final dictionary. You can just check if a line starts with "[Badges" here, and store the current header in a temporary variable while reading the file.
2. The line is a blank line, marking the end of the current badge data being read. All you need to do here is add the information collected from the current badge and add it to the dictionary, with the correct corresponding key. Depending on your implementation, you can delete these beforehand, or keep them when reading the lines.
3. Otherwise, the line has some info that needs to be stored. You first need to split this info on "=", and store it in your dictionary.
With these suggestions, you can write something like this to accomplish this task:
from collections import defaultdict

# section name -> {key: value} for every badge block in the file.
data = defaultdict(dict)
with open('pattern.txt') as file:
    lines = [line.strip('\n') for line in file]
# Most recently seen [Badges_...] section name.
header = None
# Case 2: blank lines only separate sections, so drop them up front.
valid_lines = [line for line in lines if line]
for line in valid_lines:
    if line.startswith('[Badges'):
        # Case 1: a section header; remember it without the square brackets.
        header = line.replace('[', '').replace(']', '')
    else:
        # Case 3: a "key=value" data row.  Split only on the first '=' so
        # values that themselves contain '=' are preserved whole.
        key, value = line.split('=', 1)
        data[header][key] = value
print(dict(data))
Which outputs:
{'Badges_373382': {'Deleted': '0', 'Button2': '0 1497592154', 'Button1': '0 1497592154', 'ProxReader': '0', 'StartProc': '100 1509194246 ""', 'NextStart': '0', 'LastSeen': '1509194246', 'Enabled': '1', 'Driver': 'Access Control', 'Program': 'AccessProxBadge', 'LocChg': '1509120279', 'Name': 'asd', 'Neuron': '7F0027BF2D', 'Owner': '373381', 'LostSince': '1509120774', 'Index1': '218', 'Photo': 'unknown.jpg', 'LastProxReader': '0', 'Temp': '0', 'LastTemp': '0', 'LastMotionless': '0', 'LastMotion': '1497592154', 'BatteryLow': '0', 'PrevReader': '10703', 'Reader': '357862', 'SuspendTill': '0', 'SuspendSince': '0', 'Status': '1001', 'ConvertUponDownload': '0', 'AXSFlags': '0', 'Params': '10106', 'Motion': '1', 'USER_DATA_CreationDate': '6/15/2017 4:48:15 PM', 'OwnerOldName': 'asd'}, 'Badges_373384': {'Deleted': '0', 'Button2': '0 1497538610', 'Button1': '0 1497538610', 'ProxReader': '0', 'StartProc': '100 1509194246 ""', 'NextStart': '0', 'LastSeen': '1513872678', 'Enabled': '1', 'Driver': 'Access Control', 'Program': 'AccessProxBadge', 'LocChg': '1513872684', 'Name': 'dsa', 'Neuron': '7F0027CC1C', 'Owner': '373383', 'LostSince': '1513872723', 'Index1': '219', 'Photo': 'unknown.jpg', 'LastProxReader': '0', 'Temp': '0', 'LastTemp': '0', 'LastMotionless': '0', 'LastMotion': '1497538610', 'BatteryLow': '0', 'PrevReader': '357874', 'Reader': '357873', 'SuspendTill': '0', 'SuspendSince': '0', 'Status': '1001', 'ConvertUponDownload': '0', 'AXSFlags': '0', 'Params': '10106', 'Motion': '1', 'USER_DATA_CreationDate': '6/15/2017 4:48:51 PM', 'OwnerOldName': 'dsa'}, 'Badges_373386': {'Deleted': '0', 'Button2': '0 1497780768', 'Button1': '0 1497780768', 'ProxReader': '0', 'StartProc': '100 1509194246 ""', 'NextStart': '0', 'LastSeen': '1514124910', 'Enabled': '1', 'Driver': 'Access Control', 'Program': 'AccessProxBadge', 'LocChg': '1514124915', 'Name': 'ss', 'Neuron': '7F0027B5FD', 'Owner': '373385', 'LostSince': '1514124950', 'Index1': '220', 'Photo': 'unknown.jpg', 
'LastProxReader': '0', 'Temp': '0', 'LastTemp': '0', 'LastMotionless': '0', 'LastMotion': '1497780768', 'BatteryLow': '0', 'PrevReader': '357872', 'Reader': '357871', 'SuspendTill': '0', 'SuspendSince': '0', 'Status': '1001', 'ConvertUponDownload': '0', 'AXSFlags': '0', 'Params': '10106', 'Motion': '1', 'USER_DATA_CreationDate': '6/15/2017 4:49:24 PM', 'OwnerOldName': 'ss'}}
Note: The above code is just a possibility, feel free to adapt it to your needs, or even improve it.
I also used collections.defaultdict to add the data, since it's easier to use. You can also wrap it in dict() at the end to convert it to a normal dictionary, which is optional.
You can try regex and split the result of output:
import re

# One badge record spans from its "[Badges..." header line through its
# "OwnerOldName=..." line; DOTALL lets .+? cross newlines and MULTILINE
# anchors ^ at each line start.
pattern = r'^\[Badges.+?OwnerOldName=\w+'
with open('file.txt', 'r') as f:
    match = re.finditer(pattern, f.read(), re.DOTALL | re.MULTILINE)
    new = [kk.group() for kk in match if kk.group() != '\n']
# The first whitespace-separated token of each record is the [Badges_NNN]
# header; everything after it becomes the value list.
print({i.split()[0]: i.split()[1:] for i in new})
output:
{'[Badges_373384]': ['Deleted=0', 'Button2=0', '1497538610', 'Button1=0', '1497538610', 'ProxReader=0', 'StartProc=100', '1509194246', '""', 'NextStart=0', 'LastSeen=1513872678', 'Enabled=1', 'Driver=Access', 'Control', 'Program=AccessProxBadge', 'LocChg=1513872684', 'Name=dsa', 'Neuron=7F0027CC1C', 'Owner=373383', 'LostSince=1513872723', 'Index1=219', 'Photo=unknown.jpg', 'LastProxReader=0', 'Temp=0', 'LastTemp=0', 'LastMotionless=0', 'LastMotion=1497538610', 'BatteryLow=0', 'PrevReader=357874', 'Reader=357873', 'SuspendTill=0', 'SuspendSince=0', 'Status=1001', 'ConvertUponDownload=0', 'AXSFlags=0', 'Params=10106', 'Motion=1', 'USER_DATA_CreationDate=6/15/2017', '4:48:51', 'PM', 'OwnerOldName=dsa'], '[Badges_373382]': ['Deleted=0', 'Button2=0', '1497592154', 'Button1=0', '1497592154', 'ProxReader=0', 'StartProc=100', '1509194246', '""', 'NextStart=0', 'LastSeen=1509194246', 'Enabled=1', 'Driver=Access', 'Control', 'Program=AccessProxBadge', 'LocChg=1509120279', 'Name=asd', 'Neuron=7F0027BF2D', 'Owner=373381', 'LostSince=1509120774', 'Index1=218', 'Photo=unknown.jpg', 'LastProxReader=0', 'Temp=0', 'LastTemp=0', 'LastMotionless=0', 'LastMotion=1497592154', 'BatteryLow=0', 'PrevReader=10703', 'Reader=357862', 'SuspendTill=0', 'SuspendSince=0', 'Status=1001', 'ConvertUponDownload=0', 'AXSFlags=0', 'Params=10106', 'Motion=1', 'USER_DATA_CreationDate=6/15/2017', '4:48:15', 'PM', 'OwnerOldName=asd'], '[Badges_373386]': ['Deleted=0', 'Button2=0', '1497780768', 'Button1=0', '1497780768', 'ProxReader=0', 'StartProc=100', '1509194246', '""', 'NextStart=0', 'LastSeen=1514124910', 'Enabled=1', 'Driver=Access', 'Control', 'Program=AccessProxBadge', 'LocChg=1514124915', 'Name=ss', 'Neuron=7F0027B5FD', 'Owner=373385', 'LostSince=1514124950', 'Index1=220', 'Photo=unknown.jpg', 'LastProxReader=0', 'Temp=0', 'LastTemp=0', 'LastMotionless=0', 'LastMotion=1497780768', 'BatteryLow=0', 'PrevReader=357872', 'Reader=357871', 'SuspendTill=0', 'SuspendSince=0', 'Status=1001', 
'ConvertUponDownload=0', 'AXSFlags=0', 'Params=10106', 'Motion=1', 'USER_DATA_CreationDate=6/15/2017', '4:49:24', 'PM', 'OwnerOldName=ss']}

NoneType Error when trying to parse Table using BeautifulSoup

Here's my code:
# Download the combine-results page and pull the rows out of the
# table with id="datatable".  (The original URL contained a stray space
# before '?year', and indentation had been lost in the paste.)
source = urllib.request.urlopen(
    'http://nflcombineresults.com/nflcombinedata_expanded.php?year=2015&pos=&college=').read()
soup = bs.BeautifulSoup(source, 'lxml')
table = soup.find(id='datatable')
table_rows = table.find_all('tr')

# One list per column of interest; filled positionally below.
year = []
name = []
college = []
pos = []
height = []
weight = []
hand_size = []
arm_length = []
wonderlic = []
fortyyrd = []

# Skip the header row AND the table's trailing spacer row: that last row is
# a single <td colspan="15"> cell whose .string is None, which is what
# raised "AttributeError: 'NoneType' object has no attribute 'strip'".
for row in table_rows[1:-1]:
    col = row.find_all('td')
    year.append(col[0].string.strip())
    name.append(col[1].string.strip())
    college.append(col[2].string.strip())
    pos.append(col[3].string.strip())
    height.append(col[4].string.strip())
There are several more columns in the table I want to add, but whenever I try and run these last two lines, I get an error saying:
"AttributeError: 'NoneType' object has no attribute 'strip'"
when I print col[4] right above this line, I get:
<td><div align="center">69</div></td>
I originally thought this is due to missing data, but the first instance of missing data in the original table on the website is in the 9th column (Wonderlic) of the first row, not the 4th column.
There are several other columns not included in this snippet of code that I want to add to my dataframe and I'm getting the NoneType error with them as well despite there being an entry in that cell.
I'm fairly new to parsing tables from a site using BeautifulSoup and so this could be a stupid question, but why is this object NoneType how can I fix this so I can put this table into a pandas dataframe?
Alternately if you want to try it with pandas, you can do it like so:
import pandas as pd
# read_html fetches the page and returns a list of DataFrames, one per HTML
# <table>; [0] selects the first (the combine-results table).
df = pd.read_html("http://nflcombineresults.com/nflcombinedata_expanded.php?year=2015&pos=&college=")[0]
# Preview the first five rows.
df.head()
Output:
AttributeError: 'NoneType' object has no attribute 'strip'
The actual error is happening on the last row of the table, which has a single cell; here is its HTML:
<tr style="background-color:#333333;"><td colspan="15"> </td></tr>
Just slice it:
for row in table_rows[1:-1]:
As far as improving the overall quality of the code, you can/should follow #宏杰李's answer.
import requests
from bs4 import BeautifulSoup

# Fetch the page and print every table row as a list of its cell texts.
# Empty cells yield '' rather than being dropped, so all rows come out the
# same length.  (The original paste had lost the loop body's indentation.)
r = requests.get('http://nflcombineresults.com/nflcombinedata_expanded.php?year=2015&pos=&college=')
soup = BeautifulSoup(r.text, 'lxml')
for tr in soup.table.find_all('tr'):
    row = [td.text for td in tr.find_all('td')]
    print(row)
out:
['Year', 'Name', 'College', 'POS', 'Height (in)', 'Weight (lbs)', 'Hand Size (in)', 'Arm Length (in)', 'Wonderlic', '40 Yard', 'Bench Press', 'Vert Leap (in)', 'Broad Jump (in)', 'Shuttle', '3Cone', '60Yd Shuttle']
['2015', 'Ameer Abdullah', 'Nebraska', 'RB', '69', '205', '8.63', '30.00', '', '4.60', '24', '42.5', '130', '3.95', '6.79', '11.18']
['2015', 'Nelson Agholor', 'Southern California', 'WR', '73', '198', '9.25', '32.25', '', '4.42', '12', '', '', '', '', '']
['2015', 'Malcolm Agnew', 'Southern Illinois', 'RB', '70', '202', '', '', '', '*4.59', '', '', '', '', '', '']
['2015', 'Jay Ajayi', 'Boise State', 'RB', '73', '221', '10.00', '32.00', '24', '4.57', '19', '39.0', '121', '4.10', '7.10', '11.10']
['2015', 'Brandon Alexander', 'Central Florida', 'FS', '74', '195', '', '', '', '*4.59', '', '', '', '', '', '']
['2015', 'Kwon Alexander', 'Louisiana State', 'OLB', '73', '227', '9.25', '30.25', '', '4.55', '24', '36.0', '121', '4.20', '7.14', '']
['2015', 'Mario Alford', 'West Virginia', 'WR', '68', '180', '9.38', '31.25', '', '4.43', '13', '34.0', '121', '4.07', '6.64', '11.22']
['2015', 'Detric Allen', 'East Carolina', 'CB', '73', '200', '', '', '', '*4.59', '', '', '', '', '', '']
['2015', 'Javorius Allen', 'Southern California', 'RB', '73', '221', '9.38', '31.75', '12', '4.53', '11', '35.5', '121', '4.28', '6.96', '']
As you can see, there are a lot of empty fields in the table; the better way is to put all the fields in a list, then unpack them or use a namedtuple.
This will improve your code stability.

Reading CSV data as headers and value pairs

I am trying to read a CSV file and take the headers as key and the body as the value pairs.
Here is my Python file:
from pymongo import MongoClient
import requests
import csv
import sys

# loanNumber -> {header: value} rows loaded from the CSV.
data = {}


def readCsv(csvFile, column):
    """Load csvFile into `data`, keyed by the value at index `column`.

    The first CSV row is the header; each subsequent row becomes a dict of
    header -> value (the key column itself is excluded via row[1:]).
    """
    with open(csvFile, "r") as infile:
        reader = csv.reader(infile)
        headers = next(reader)[1:]
        for row in reader:
            # Blank or short rows (e.g. a trailing newline in the file) made
            # row[column] raise IndexError; skip them instead.
            if len(row) > column:
                data[row[column]] = {key: value
                                     for key, value in zip(headers, row[1:])}


def readCalllogs():
    """POST each loaded call-log record to the local API."""
    for loanNumber in data:
        logDetail = data[loanNumber]
        print(logDetail)
        resp = requests.post('http://localhost:3500/api/v1/call_logs',
                             json=logDetail)
        if resp.status_code != 201:
            print(resp.status_code)
        else:
            print(logDetail)


def writeYAMLFile():
    # NOTE(review): `loans` is not defined anywhere in this file; calling
    # this function raises NameError -- confirm where `loans` comes from.
    for loan in loans:
        print(loan["assignedTo"])


def loadCalllogsFromCsv():
    """Load calllogs.csv (keyed by column 0) and push the records."""
    readCsv("calllogs.csv", 0)
    readCalllogs()


def main():
    loadCalllogsFromCsv()


if __name__ == "__main__":
    main()
But I am getting an Index error:
File "./load_calllogs.py", line 16, in readCsv
data[row[column]] = {key: value for key, value in zip(headers, row[1:])}
IndexError: list index out of range
Here is my CSV file:
loanNumber,date,contactStatus,contactRelation,contactName,response,tenantId,actionDate,action,assignedTo,remarks,caller
1,,CONNECTED,SELF,NAME1,RESPONSE1,,,FIELD_PTP,ASSIGN1,REMARK1,CALLER1
2,,WRONG_NUMBER,SELF,NAME2,RESPONSE2,,,MEET_GUARANTOR,ASSIGN2,REMARK2,CALLER2
3,,CONNECTED,WIFE,NAME3,RESPONSE3,,,FIELD_PTP,ASSIGN3,REMARK3,CALLER3
4,,NO_RESPONSE,HUSBAND,NAME4,RESPONSE4,,,MEET_GUARANTOR,ASSIGN4,REMARK4,CALLER4
5,,CONNECTED,SON,NAME5,RESPONSE5,,,VISIT_CUSTOMER,ASSIGN5,REMARK5,CALLER5
6,,CONNECTED,SON,NAME6,RESPONSE6,,,VISIT_CUSTOMER,ASSIGN6,REMARK6,CALLER6
Try out this code.
# NOTE(review): splitting on bare "," breaks on quoted fields that contain
# commas -- the stdlib csv module handles those correctly.
# (Renamed `file` -> `csv_path` to avoid shadowing the builtin name.)
csv_path = "your_file.csv"
my_list = []
with open(csv_path, mode='r') as input_file:
    # One list of cell strings per line, newline stripped.
    rows = [row.rstrip('\n').split(",") for row in input_file]
# First row is the header; zip it against every following row.
keys = rows[0]
for values in rows[1:]:
    my_list.append(dict(zip(keys, values)))
Here is the output (a list containing dicts):
[{'actionDate': '', 'caller': 'CALLER1', 'contactStatus': 'CONNECTED', 'contactName': 'NAME1', 'tenantId': '', 'loanNumber': '1', 'action': 'FIELD_PTP', 'contactRelation': 'SELF', 'assignedTo': 'ASSIGN1', 'remarks': 'REMARK1', 'date': '', 'response': 'RESPONSE1'}, {'actionDate': '', 'caller': 'CALLER2', 'contactStatus': 'WRONG_NUMBER', 'contactName': 'NAME2', 'tenantId': '', 'loanNumber': '2', 'action': 'MEET_GUARANTOR', 'contactRelation': 'SELF', 'assignedTo': 'ASSIGN2', 'remarks': 'REMARK2', 'date': '', 'response': 'RESPONSE2'}, {'actionDate': '', 'caller': 'CALLER3', 'contactStatus': 'CONNECTED', 'contactName': 'NAME3', 'tenantId': '', 'loanNumber': '3', 'action': 'FIELD_PTP', 'contactRelation': 'WIFE', 'assignedTo': 'ASSIGN3', 'remarks': 'REMARK3', 'date': '', 'response': 'RESPONSE3'}, {'actionDate': '', 'caller': 'CALLER4', 'contactStatus': 'NO_RESPONSE', 'contactName': 'NAME4', 'tenantId': '', 'loanNumber': '4', 'action': 'MEET_GUARANTOR', 'contactRelation': 'HUSBAND', 'assignedTo': 'ASSIGN4', 'remarks': 'REMARK4', 'date': '', 'response': 'RESPONSE4'}, {'actionDate': '', 'caller': 'CALLER5', 'contactStatus': 'CONNECTED', 'contactName': 'NAME5', 'tenantId': '', 'loanNumber': '5', 'action': 'VISIT_CUSTOMER', 'contactRelation': 'SON', 'assignedTo': 'ASSIGN5', 'remarks': 'REMARK5', 'date': '', 'response': 'RESPONSE5'}, {'actionDate': '', 'caller': 'CALLER6', 'contactStatus': 'CONNECTED', 'contactName': 'NAME6', 'tenantId': '', 'loanNumber': '6', 'action': 'VISIT_CUSTOMER', 'contactRelation': 'SON', 'assignedTo': 'ASSIGN6', 'remarks': 'REMARK6', 'date': '', 'response': 'RESPONSE6'}]
A DictReader would help you. It automatically reads the header in and then converts each following row into a dictionary based on that row. Columns are then accessed by their name, rather than their position:
import csv

# loanNumber -> full row dict.
data = {}


def readCsv(csvFile, column):
    """Read csvFile with DictReader and index each row dict by the header
    name given in `column` (e.g. 'loanNumber')."""
    # Text mode with newline='' is the documented way to feed files to csv
    # on Python 3 (the original opened in "rb", which only worked on
    # Python 2), and `with` closes the handle deterministically.
    with open(csvFile, "r", newline='') as infile:
        reader = csv.DictReader(infile)
        for row in reader:
            data[row[column]] = row


readCsv("calllogs.csv", 'loanNumber')
print(data)
This would give you:
{'1': {'actionDate': '', 'loanNumber': '1', 'assignedTo': 'ASSIGN1', 'caller': 'CALLER1', 'tenantId': '', 'action': 'FIELD_PTP', 'remarks': 'REMARK1', 'contactName': 'NAME1', 'contactRelation': 'SELF', 'date': '', 'response': 'RESPONSE1', 'contactStatus': 'CONNECTED'}, '3': {'actionDate': '', 'loanNumber': '3', 'assignedTo': 'ASSIGN3', 'caller': 'CALLER3', 'tenantId': '', 'action': 'FIELD_PTP', 'remarks': 'REMARK3', 'contactName': 'NAME3', 'contactRelation': 'WIFE', 'date': '', 'response': 'RESPONSE3', 'contactStatus': 'CONNECTED'}, '2': {'actionDate': '', 'loanNumber': '2', 'assignedTo': 'ASSIGN2', 'caller': 'CALLER2', 'tenantId': '', 'action': 'MEET_GUARANTOR', 'remarks': 'REMARK2', 'contactName': 'NAME2', 'contactRelation': 'SELF', 'date': '', 'response': 'RESPONSE2', 'contactStatus': 'WRONG_NUMBER'}, '5': {'actionDate': '', 'loanNumber': '5', 'assignedTo': 'ASSIGN5', 'caller': 'CALLER5', 'tenantId': '', 'action': 'VISIT_CUSTOMER', 'remarks': 'REMARK5', 'contactName': 'NAME5', 'contactRelation': 'SON', 'date': '', 'response': 'RESPONSE5', 'contactStatus': 'CONNECTED'}, '4': {'actionDate': '', 'loanNumber': '4', 'assignedTo': 'ASSIGN4', 'caller': 'CALLER4', 'tenantId': '', 'action': 'MEET_GUARANTOR', 'remarks': 'REMARK4', 'contactName': 'NAME4', 'contactRelation': 'HUSBAND', 'date': '', 'response': 'RESPONSE4', 'contactStatus': 'NO_RESPONSE'}, '6': {'actionDate': '', 'loanNumber': '6', 'assignedTo': 'ASSIGN6', 'caller': 'CALLER6', 'tenantId': '', 'action': 'VISIT_CUSTOMER', 'remarks': 'REMARK6', 'contactName': 'NAME6', 'contactRelation': 'SON', 'date': '', 'response': 'RESPONSE6', 'contactStatus': 'CONNECTED'}}
You will note that the loadNumber field is used as the key which is also left in the dictionary itself.

Categories