Python read XML file (near 50 MB)

I'm parsing an XML string into a CSV string, but it's going very slowly:
import copy
import xml.etree.ElementTree as ET

INDEX_COLUMN = "{urn:schemas-microsoft-com:office:spreadsheet}Index"
CELL_ELEMENT = "Cell"
DATA_ELEMENT = "Data"

def parse_to_csv_string(xml):
    print('parse_to_csv_string')
    csv = []
    parsed_data = serialize_xml(xml)
    rows = list(parsed_data[1][0])
    header = get_cells_text(rows[0])
    rows.pop(0)
    csv.append(",".join(header))
    for row in rows:
        values = get_cells_text(row)
        csv.append(",".join(values))
    return "\n".join(csv)

def serialize_xml(xml):
    return ET.fromstring(xml)

def get_cells_text(row):
    keys = []
    cells = normalize_row_cells(row)
    for elm in cells:
        keys.append(elm[0].text or "")
    while len(keys) < 92:
        keys.append("")
    return keys

def normalize_row_cells(row):
    cells = list(row)
    updated_cells = copy.deepcopy(cells)
    pos = 1
    for elm in cells:
        strIndexAttr = elm.get(INDEX_COLUMN)
        index = int(strIndexAttr) if strIndexAttr else pos
        while index > pos:
            empty_elm = ET.Element(CELL_ELEMENT)
            child = ET.SubElement(empty_elm, DATA_ELEMENT)
            child.text = ""
            updated_cells.insert(pos - 1, empty_elm)
            pos += 1
        pos += 1
    return updated_cells
The XML string sometimes misses a few columns, and I need to iterate over it to fill in the missing ones: every row must have 92 columns. That's why I have some helper functions to manipulate the XML.
Right now I'm running my function as a Lambda with 4 GB of memory and I'm still getting a timeout :(
Any idea how to improve performance?

normalize_row_cells constructs ElementTree Element instances, but get_cells_text is only interested in each instance's child's text attribute, so I would consider changing normalize_row_cells to just return the text. It also performs deep copies and calls list.insert: inserting elements into the middle of a list can be expensive, because every element after the insertion point must be moved.
Something like this (untested code) avoids the copies and insertions and returns only the required text, making get_cells_text redundant.
def normalize_row_cells(row):
    cells = list(row)
    updated_cells = []
    cell_idx = 0
    for pos in range(1, 93):  # output columns 1..92
        if cell_idx < len(cells):
            elm = cells[cell_idx]
            strIndexAttr = elm.get(INDEX_COLUMN)
            index = int(strIndexAttr) if strIndexAttr else pos
            if index == pos:
                updated_cells.append(elm[0].text or "")
                cell_idx += 1
                continue
        updated_cells.append("")  # missing column: pad with an empty string
    return updated_cells
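As background for the list.insert point above, here is a quick illustrative micro-benchmark (not from the original answer; run it yourself rather than trusting any particular numbers):

import timeit

# Inserting at the front of a list is O(n) per call, because every element
# after the insertion point has to be shifted; appending is amortized O(1).
front_insert = timeit.timeit("for i in range(1000): lst.insert(0, i)", setup="lst = []", number=100)
append = timeit.timeit("for i in range(1000): lst.append(i)", setup="lst = []", number=100)
print(front_insert, append)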
If you can match your cells to their header names then using csv.DictWriter from the standard library might be even better (you need to profile to be sure).
import csv
import io

def parse_to_csv_string(xml):
    print('parse_to_csv_string')
    parsed_data = serialize_xml(xml)
    rows = list(parsed_data[1][0])
    header = get_cells_text(rows[0])
    with io.StringIO() as f:
        writer = csv.DictWriter(f, fieldnames=header)
        writer.writeheader()
        for row in rows[1:]:  # skip the header row itself
            writer.writerow(get_cells_text(row))
        f.seek(0)
        data = f.read()
    return data

def get_cells_text(row):
    row_dict = {}
    for cell in row:
        column_name = get_column_name(cell)  # <- can this be done?
        row_dict[column_name] = cell[0].text or ""
    return row_dict
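get_column_name is the open question in this sketch: a SpreadsheetML Cell element only carries an optional ss:Index attribute, not the header name. One hypothetical workaround (my assumption, not part of the answer above) is to keep header as the plain list of 92 column names that the question's original get_cells_text(rows[0]) returns, and pair each normalized row with it by position:

def get_cells_dict(row, header):
    # Hypothetical helper: zip the position-normalized values against the header names.
    values = normalize_row_cells(row)  # 92 values, gaps already filled with ""
    return dict(zip(header, values))

# ...and inside parse_to_csv_string:
#     writer.writerow(get_cells_dict(row, header))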


How to convert a csv-file to a dictionary of lists with python?

I'm trying to get this kind of result: a dictionary of lists.
Here is the csv file:
OsmID,NewName,IdLocal
1020287758,NN1,Id0001
1021229973,NN2,Id0002
1025409497,NN3,Id0003
I'm using the code below:
import csv

input = r'C:\Users\_M92\csvFiles\csv0001.csv'
fileRead = open(input, 'r')
with open(input, 'r') as csv:
    headerLine = fileRead.readline()
    header = headerLine.split(",")
    #print(header)
    nameIndex = header.index("OsmID")
    output = {}
    for line in fileRead.readlines():
        values = line.split(",")
        output[values[nameIndex]] = values
    print(output)
And it results in the following error:
File "c:\Users\_M92\Scripts\CsvToDict.py",
line 19, in <module>
nameIndex = header.index("OsmID")
ValueError: 'OsmID' is not in list
Instead of manually splitting each line by commas, use the CSV module that you've imported. This module contains a DictReader class that will yield dictionaries for each row. Then, you just need to add this to your output dictionary.
import csv

# Create an empty dictionary
# We will add keys to this as needed
output = {}

# Keep track of number of rows, so we can add an empty column if needed
row_count = 0

# This function adds a row to the output dictionary
def add_row(row_dict):
    global row_count  # Need to declare this as global because we're assigning to the variable in this function
    if not row_dict: return  # If row is empty, do nothing
    for k, v in row_dict.items():
        # Loop over all key-value pairs in the row to add
        if k not in output:  # If the output doesn't contain this column, create a blank column
            output[k] = [None] * row_count
        output[k].append(v)  # Append the value to the correct column in output
    row_count += 1

input_file = r'C:\Users\_M92\csvFiles\csv0001.csv'
with open(input_file, 'r') as fh:
    reader = csv.DictReader(fh)  # Create a DictReader
    for row in reader:
        add_row(row)  # Add every row to the output
This gives the following output:
{'OsmID': ['1020287758', '1021229973', '1025409497'],
'NewName': ['NN1', 'NN2', 'NN3'],
'IdLocal': ['Id0001', 'Id0002', 'Id0003']}
Note: I removed the blank lines in the input csv you provided, but it doesn't make a difference to the program, since a blank line will yield an empty dictionary from DictReader, and add_row doesn't do anything with empty dicts.
Note 2: You could discard the row_count variable if you dynamically count the number of rows like so:
def add_row(row_dict):
    row_count = 0
    for first_key, first_val in output.items():
        row_count = len(first_val)
        break  # We can just break out here because all keys should have the same number of values

    # Create keys that do not yet exist in output but do exist in the new row
    existing_keys = set(output.keys())
    new_row_keys = set(row_dict.keys())
    keys_to_create = new_row_keys - existing_keys
    for key in keys_to_create:
        output[key] = [None] * row_count

    # Append to each column in output
    for key in output:
        output[key].append(row_dict.get(key, None))  # If the key doesn't exist in the current row, append None
You could use Pandas
import pandas as pd
f = r'C:\Users\_M92\csvFiles\csv0001.csv'
df = pd.read_csv(f).to_dict('list')
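A small usage sketch (assuming the same csv0001.csv as above). Note that read_csv infers numeric dtypes, so pass dtype=str if you want the values kept as strings like the other answers produce:

import pandas as pd

f = r'C:\Users\_M92\csvFiles\csv0001.csv'
# dtype=str keeps every column as text instead of letting pandas parse OsmID as integers
result = pd.read_csv(f, dtype=str).to_dict('list')
print(result['OsmID'])  # ['1020287758', '1021229973', '1025409497']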
Here is a snippet to start from. This is the 'from scratch' method; please use a library to do it properly!
import os

input_path = r'test.csv'
header_line = 0
sep_csv_line = "\n\n"
sep_csv_column = ","

with open(os.path.join(os.path.dirname(__file__), input_path), 'r') as csv:
    content = csv.read()

split = content.split(sep_csv_line)
columns = split[header_line].split(sep_csv_column)
print(f"{columns = }")

output = {}
for column in columns:
    output[column] = []

for line in split[header_line+1:]:
    print(f"{line = }")
    elements = line.split(sep_csv_column)
    print(f"{elements = }")
    for i, column in enumerate(columns):
        element = elements[i]
        print(f"{element = }")
        output[column].append(element)

print(f"{output = }")
print(f"{output['OsmID'] = }")
Here is the output console:
columns = ['OsmID', 'NewName', 'IdLocal']
line = '1020287758,NN1,Id0001'
elements = ['1020287758', 'NN1', 'Id0001']
element = '1020287758'
element = 'NN1'
element = 'Id0001'
line = '1021229973,NN2,Id0002'
elements = ['1021229973', 'NN2', 'Id0002']
element = '1021229973'
element = 'NN2'
element = 'Id0002'
line = '1025409497,NN3,Id0003'
elements = ['1025409497', 'NN3', 'Id0003']
element = '1025409497'
element = 'NN3'
element = 'Id0003'
output = {'OsmID': ['1020287758', '1021229973', '1025409497'], 'NewName': ['NN1', 'NN2', 'NN3'], 'IdLocal': ['Id0001', 'Id0002', 'Id0003']}
output['OsmID'] = ['1020287758', '1021229973', '1025409497']

"UnboundLocalError: local variable '?' referenced before assignment " error on large number of iteration

In Python, I'm trying to iterate over 6000 URLs and download them.
When I try with a small number of iterations (4 URLs), everything works as expected.
with open("SECmasterURLs.txt",'r') as f:
byte_data = f.read()
count = 0
masterurls = byte_data.splitlines()
createFolder(r"/Users/egecikrikci/Desktop/SEC Scrape/ParsedDatas")
createFolder(r"/Users/egecikrikci/Desktop/SEC Scrape/MasterDatas")
ParsedFolder = (r"/Users/egecikrikci/Desktop/SEC Scrape/ParsedDatas/")
MasterFolder = (r"/Users/egecikrikci/Desktop/SEC Scrape/MasterDatas/")
for line in masterurls:
DataDownloader(line, ParsedFolder, MasterFolder)
process = psutil.Process(os.getpid())
__memoryusage__ = (process.memory_info().rss) # in bytes
print (__memoryusage__ / 1000000)
As output it creates 2 files as expected and downloads my 4 files from the URLs listed in SECmasterURLs.txt.
But when I try with 6000 URLs, it returns an error:
UnboundLocalError                         Traceback (most recent call last)
<ipython-input-32-cc04452d2aa1> in <module>
     11 for line in xx:
     12
---> 13     DataDownloader(line, ParsedFolder, MasterFolder)
     14     process = psutil.Process(os.getpid())
     15     __memoryusage__ = (process.memory_info().rss) # in bytes

<ipython-input-27-1ffb4717a449> in DataDownloader(file_url, folderforparsed, folderformaster)
     25
     26     # define a new dataset with out the header info.
---> 27     data_format = data[start_ind + 1:]
     28
     29     master_data = []

UnboundLocalError: local variable 'start_ind' referenced before assignment
and here is the code inside DataDownloader:
def DataDownloader(file_url, folderforparsed, folderformaster):
    urlsplit = file_url.split('/')
    urlsplit2 = urlsplit[8].split('.')
    filenamebuilder = '{}{}'.format(urlsplit2[0], urlsplit2[1] + '.txt')
    MasterFiles = open(folderforparsed + 'parsed' + filenamebuilder, 'w')
    content = requests.get(file_url).content
    count = 0
    with open(folderformaster + filenamebuilder, 'wb') as f:
        f.write(content)

    # let's open it and we will now have a byte stream to play with.
    with open(folderformaster + filenamebuilder, 'rb') as f:
        byte_data = f.read()

    # Now that we loaded the data, we have a byte stream that needs to be decoded and then split by -------.
    data = byte_data.decode("utf-8").split('----')

    # We need to remove the headers, so look for the end of the header and grab it's index
    for index, item in enumerate(data):
        if "ftp://ftp.sec.gov/edgar/" in item:
            start_ind = index

    # define a new dataset with out the header info.
    data_format = data[start_ind + 1:]

    master_data = []

    # now we need to break the data into sections, this way we can move to the final step of getting each row value.
    for index, item in enumerate(data_format):
        # if it's the first index, it won't be even so treat it differently
        if index == 0:
            clean_item_data = item.replace('\n', '|').split('|')
            clean_item_data = clean_item_data[8:]
        else:
            clean_item_data = item.replace('\n', '|').split('|')

        for index, row in enumerate(clean_item_data):
            # when you find the text file.
            if '.txt' in row:
                # grab the values that belong to that row. It's 4 values before and one after.
                mini_list = clean_item_data[(index - 4): index + 1]
                if len(mini_list) != 0:
                    mini_list[4] = "https://www.sec.gov/Archives/" + mini_list[4]
                    master_data.append(mini_list)

    # loop through each document in the master list.
    for index, document in enumerate(master_data):
        # create a dictionary for each document in the master list
        document_dict = {}
        document_dict['cik_number'] = document[0]
        document_dict['company_name'] = document[1]
        document_dict['form_id'] = document[2]
        document_dict['date'] = document[3]
        document_dict['file_url'] = document[4]
        master_data[index] = document_dict

    for document_dict in master_data:
        # if it's a 10-K document pull the url and the name.
        if document_dict['form_id'] == '10-K':
            # get the components
            comp_name = document_dict['company_name']
            docu_url = document_dict['file_url']
            form_type = document_dict['form_id']
            print('-' * 100)
            print(comp_name)
            print(docu_url)
            print('Form Type is: {}'.format(form_type))
            MasterFiles.write('-' * 75)
            MasterFiles.write('\n')
            MasterFiles.write(comp_name)
            MasterFiles.write('\n')
            MasterFiles.write(docu_url)
            MasterFiles.write('\n')
            MasterFiles.write(form_type)
            MasterFiles.write('\n')
            count = count + 1
What is happening is that when your code hits this line:
data_format = data[start_ind + 1:]
it blows up, because start_ind has not been initialized.
That variable is initialized via these lines:
for index, item in enumerate(data):
    if "ftp://ftp.sec.gov/edgar/" in item:
        start_ind = index
So if the data does not include that string, start_ind will never be initialized. For some subset of the 6000 URLs you're processing, that string must not be there.
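A minimal sketch of one way to make that failure explicit inside DataDownloader (my suggestion, assuming you would rather skip a file that lacks the marker than crash):

start_ind = None
for index, item in enumerate(data):
    if "ftp://ftp.sec.gov/edgar/" in item:
        start_ind = index

if start_ind is None:
    # The marker string was not found in this file: report it and skip,
    # instead of hitting UnboundLocalError further down.
    print('No EDGAR header marker found in {}'.format(file_url))
    return

# define a new dataset without the header info.
data_format = data[start_ind + 1:]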

Python iterating/look up dictionary unexpected behavior

I have a problem when I try to look up data in a CSV dictionary. A list of dates and times is in one CSV, and the code should look up the data for a specific date and time in a second CSV. I look for an exact match plus the 22 following records. The problem is that it only fetches the first date and time; the rest are not found even though I can see they're there. I feel like this has a very easy solution, but I can't think of anything. It must be a problem in my iteration code.
Code:
import csv

csv_eph = open("G:\\db.csv")
csv_reader_eph = csv.reader(csv_eph, delimiter=",")
csv_dict_eph = csv.DictReader(csv_eph)

csv_matches = open("G:\\query.csv")
csv_reader_matches = csv.reader(csv_matches, delimiter=",")
csv_dict_matches = csv.DictReader(csv_matches)

result = []
var = 0

for row in csv_dict_matches:
    datum = row["Date"]
    cas = row["Time"]
    result.append('\n')
    result.append(row)
    for eph in csv_dict_eph:
        if str(eph["datum"]) == str(datum) and str(eph["cas"]) == str(cas):
            var = 23
        if var > 0:
            result.append(eph)
            var = var - 1

with open("G:\\compiled.txt", "w") as output:
    for item in result:
        output.write(str(item))
        output.write('\n')
SOLUTION!
I implemented jasonharper's solution and it works flawlessly, many thanks. It was indeed a problem with the dictionary reader being exhausted. Now fixed, it looks like this and works as intended:
import csv

csv_eph = open("G:\\db.csv")
csv_reader_eph = csv.reader(csv_eph, delimiter=",")
csv_dict_eph = csv.DictReader(csv_eph)

csv_matches = open("G:\\query.csv")
csv_reader_matches = csv.reader(csv_matches, delimiter=",")
csv_dict_matches = csv.DictReader(csv_matches)

#jasonharper
eph_list = []
for eph in csv_dict_eph:
    eph_list.append(eph)
print(eph_list)

result = []
var = 0

for row in csv_dict_matches:
    print(row)
    datum = row["Date"]
    cas = row["Time"]
    result.append('\n')
    result.append(row)
    for eph in eph_list:
        if str(eph["datum"]) == str(datum) and str(eph["cas"]) == str(cas):
            var = 23
        if var > 0:
            result.append(eph)
            var = var - 1

with open("G:\\compiled.txt", "w") as output:
    for item in result:
        output.write(str(item))
        output.write('\n')
I believe changing:
csv_dict_eph = csv.DictReader (csv_eph)
to:
csv_dict_eph = list(csv.DictReader(csv_eph))
will fix the problem.
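A tiny self-contained demonstration of why the list(...) matters: a DictReader is a one-shot iterator tied to the underlying file, so a second pass over it yields nothing (illustrative data, not the asker's files):

import csv
import io

data = io.StringIO("datum,cas\n2021-01-01,12:00\n2021-01-02,13:00\n")
reader = csv.DictReader(data)
print(list(reader))  # two row dicts
print(list(reader))  # [] -- the reader is exhausted after the first pass
# Materializing it once with list(csv.DictReader(...)) lets you loop over it repeatedly.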

Merging two CSV files into a list of dictionaries

I have a task to do and I got stuck because whatever I do, it doesn't seem to work.
So I have two CSV files.
The first, called persons_file, contains the header line: id, name, surname.
And visits_file contains: id, person_id, site.
I have to write a function called merge that takes the two files as arguments (both StringIO type) and returns a list of dictionaries with the number of visits for each user:
[
    {
        "id": (person's id),
        "name": (person's name),
        "surname": (person's surname),
        "visits": (number of visits)
    }
]
I came up with this, and I don't know where my mistake is.
import io

def merge(persons_file, visits_file):
    line_counter = 0
    return_list = []
    list_of_person_ids = []
    visits = 0

    for row in visits_file:
        if line_counter == 0:
            line_counter += 1
            continue
        list_of_person_ids.append(row.split(',')[1])

    line_counter = 0
    for row in persons_file:
        if line_counter == 0:
            line_counter += 1
            continue
        help_dict = {}
        split_row = row.split(',')
        help_dict['id'] = split_row[0]
        help_dict['name'] = split_row[1]
        help_dict['surname'] = split_row[2][:len(split_row[2])-1]
        if split_row[0] in list_of_person_ids:
            visits = list_of_person_ids.count(split_row[0])
        help_dict['visits'] = str(visits)
        return_list.append(help_dict)
        visits = 0

    return return_list

file1 = open('persons_file.csv', mode='r')
file2 = open('visits_file.csv', mode='r')
persons_file_arg = io.StringIO(file1.read())
visits_file_arg = io.StringIO(file2.read())

list_of_visits = merge(persons_file_arg, visits_file_arg)
for i in list_of_visits:
    print(i)

file1.close()
file2.close()
I would be glad if anyone could help me.
What is the issue? Is it that the output is not what you expected, or are you getting an exception? Your code seems like it should achieve the result you want, but I have a couple of suggestions that could simplify things.
Look into collections.Counter: you could call count_of_visits_by_person_id = Counter(list_of_person_ids) to get a result of the form {person_id: number_of_visits, ...}. You could then use this to simply look up the number of visits in your next for loop, e.g.:
from collections import Counter

...

count_of_visits_by_person_id = Counter(list_of_person_ids)
for row in persons_file:
    if line_counter == 0:
        line_counter += 1
        continue
    help_dict = {}
    split_row = row.split(',')
    help_dict['id'] = split_row[0]
    help_dict['name'] = split_row[1]
    help_dict['surname'] = split_row[2][:-1]
    # [:len(split_row[2]) - 1] is equivalent to [:-1]
    # I assume you are stripping whitespace from the right side,
    # which can also be accomplished using split_row[2].rstrip()
    if split_row[0] in count_of_visits_by_person_id:
        visits = count_of_visits_by_person_id[split_row[0]]
    else:
        visits = 0
    help_dict['visits'] = str(visits)
    return_list.append(help_dict)
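Putting the Counter idea together with csv.DictReader, a complete merge could look something like this (just a sketch, assuming the header fields are exactly id,name,surname and id,person_id,site with no stray spaces):

import csv
from collections import Counter

def merge(persons_file, visits_file):
    # Count how many visit rows reference each person_id
    visit_counts = Counter(row['person_id'] for row in csv.DictReader(visits_file))
    return [
        {
            'id': person['id'],
            'name': person['name'],
            'surname': person['surname'],
            'visits': str(visit_counts.get(person['id'], 0)),
        }
        for person in csv.DictReader(persons_file)
    ]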
The generally simpler and safer way to open files is using the with statement. Here is an example:
with open('visits_file.csv', mode='r') as visits_file:
    next(visits_file)  # skip the header line
    for row in visits_file:
        if row.strip():  # guard against trailing blank lines
            list_of_person_ids.append(row.split(',')[1])

Rewind the file pointer to the beginning of the previous line

I am doing text processing and using 'readline()' function as follows:
ifd = open(...)
for line in ifd:
    while (condition):
        do something...
        line = ifd.readline()
        condition = ....
    # Here, when the condition becomes false, I need to rewind the pointer so that the 'for' loop reads the same line again.
ifd.seek() followed by readline() is giving me a '\n' character. How do I rewind the pointer so that the whole line is read again?
>>> ifd.seek(-1,1)
>>> line = ifd.readline()
>>> line
'\n'
Here is my code:
labtestnames = sorted(tmp)

#Now read each line in the inFile and write into outFile
ifd = open(inFile, "r")
ofd = open(outFile, "w")

#read the header
header = ifd.readline() #Do nothing with this line. Skip

#Write header into the output file
nl = "mrn\tspecimen_id\tlab_number\tlogin_dt\tfluid"
offset = len(nl.split("\t"))
nl = nl + "\t" + "\t".join(labtestnames)
ofd.write(nl+"\n")

lenFields = len(nl.split("\t"))
print "Reading the input file and converting into modified file for further processing (correlation analysis etc..)"

prevTup = (0,0,0)
rowComplete = 0
k = 0
for line in ifd:
    k = k+1
    if (k==200): break
    items = line.rstrip("\n").split("\t")
    if((items[0] =='')):
        continue
    newline = list('' for i in range(lenFields))
    newline[0],newline[1],newline[3],newline[2],newline[4] = items[0], items[1], items[3], items[2], items[4]

    ltests = []
    ltvals = []
    while(cmp(prevTup, (items[0], items[1], items[3])) == 0): # If the same mrn, lab_number and specimen_id then fill the same row. else create a new row.
        ltests.append(items[6])
        ltvals.append(items[7])
        pos = ifd.tell()
        line = ifd.readline()
        prevTup = (items[0], items[1], items[3])
        items = line.rstrip("\n").split("\t")
        rowComplete = 1

    if (rowComplete == 1): #If the row is completed, prepare newline and write into outfile
        indices = [labtestnames.index(x) for x in ltests]
        j = 0
        ifd.seek(pos)
        for i in indices:
            newline[i+offset] = ltvals[j]
            j = j+1

    if (rowComplete == 0): #
        currTup = (items[0], items[1], items[3])
        ltests = items[6]
        ltvals = items[7]
        pos = ifd.tell()
        line = ifd.readline()
        items = line.rstrip("\n").split("\t")
        newTup = (items[0], items[1], items[3])
        if(cmp(currTup, newTup) == 0):
            prevTup = currTup
            ifd.seek(pos)
            continue
        else:
            indices = labtestnames.index(ltests)
            newline[indices+offset] = ltvals

    ofd.write(newline+"\n")
The problem can be handled more simply using itertools.groupby. groupby can cluster all the contiguous lines that deal with the same mrn, specimen_id, and lab_num.
The code that does this is
for key, group in IT.groupby(reader, key = mykey):
where reader iterates over the lines of the input file, and mykey is defined by
def mykey(row):
    return (row['mrn'], row['specimen_id'], row['lab_num'])
Each row from reader is passed to mykey, and all rows with the same key are clustered together in the same group.
While we're at it, we might as well use the csv module to read each line into a dict (which I call row). This frees us from having to deal with low-level string manipulation like line.rstrip("\n").split("\t"), and instead of referring to columns by index numbers (e.g. row[3]), we can write code that speaks in higher-level terms such as row['lab_num'].
import itertools as IT
import csv

inFile = 'curious.dat'
outFile = 'curious.out'

def mykey(row):
    return (row['mrn'], row['specimen_id'], row['lab_num'])

fieldnames = 'mrn specimen_id date lab_num Bilirubin Lipase Calcium Magnesium Phosphate'.split()
with open(inFile, 'rb') as ifd:
    reader = csv.DictReader(ifd, delimiter='\t')
    with open(outFile, 'wb') as ofd:
        writer = csv.DictWriter(
            ofd, fieldnames, delimiter='\t', lineterminator='\n', )
        writer.writeheader()
        for key, group in IT.groupby(reader, key=mykey):
            new = {}
            row = next(group)
            for key in ('mrn', 'specimen_id', 'date', 'lab_num'):
                new[key] = row[key]
            new[row['labtest']] = row['result_val']
            for row in group:
                new[row['labtest']] = row['result_val']
            writer.writerow(new)
yields
mrn specimen_id date lab_num Bilirubin Lipase Calcium Magnesium Phosphate
4419529 1614487 26.2675 5802791G 0.1
3319529 1614487 26.2675 5802791G 0.3 153 8.1 2.1 4
5713871 682571 56.0779 9732266E 4.1
This seems to be a perfect use case for yield expressions. Consider the following example that prints lines from a file, repeating some of them at random:
def buflines(fp):
    r = None
    while True:
        r = yield r or next(fp)
        if r:
            yield None

from random import randint

with open('filename') as fp:
    buf = buflines(fp)
    for line in buf:
        print line
        if randint(1, 100) > 80:
            print 'ONCE AGAIN::'
            buf.send(line)
Basically, if you want to process an item once again, you send it back to the generator. On the next iteration you will be reading the same item once again.
