Speed up extracting data from large XML files using Python - python

Hello, I am not a strong Python user, but I need to extract values from an XML file.
I am using a for loop to get attribute values from an 'xml.dom.minidom' document.
Both the x/y/z positions and the TEMP values are collected with for loops, and since the file has half a million values it takes a long time.
I tried using lxml, but it raised an error:
module 'lxml' has no attribute 'parse' or 'Xpath'
The XML file has the following format:
<?xml version="1.0" encoding="utf-8"?>
<variable_output>
<!--version : 1-->
<!--object title : Volume (1)-->
<!--scalar variable : Temperature (TEMP)-->
<POINT>
<Vertex>
<Position x="-0.176300004" y="-0.103100002" z="-0.153699994"/>
<Scalar TEMP="84.192421"/>
</Vertex>
</POINT>
<POINT>
<Vertex>
<Position x="-0.173557162" y="-0.103100002" z="-0.153699994"/>
<Scalar TEMP="83.9050522"/>
</Vertex>
</POINT>
<POINT>
<Vertex>
<Position x="-0.170814306" y="-0.103100002" z="-0.153699994"/>
<Scalar TEMP="83.7506332"/>
</Vertex>
</POINT>
</variable_output>
The following code takes a long time for bigger files.
from xml.dom.minidom import parse
import xml.dom.minidom
import csv
import pandas as pd
import numpy as np
import os
import glob
import time
from lxml import etree
v = []
doc = parse("document.xml")
Val = doc.getElementsByTagName("Scalar")
t0 = time.time()
for s in Val:
    v = np.append(v, float(s.attributes['TEMP'].value))
res = np.array([v])
t1 = time.time()
total = (t1-t0)
print('Time for Value', str(total))
# Using lxml
doc2=etree.parse("document.xml")
# try using Xpath
t0 = time.time()
temp=doc2.Xpath("/POINT/Vertex/Scaler/#TEMP")
t1 = time.time()
total2 = t1-t0
print('Time for Value', str(total2))
# save data as csv from xml
pd.DataFrame(res.T).to_csv(('Data.csv'),index=False,header=False) #write timestep as csv
The error when using Xpath to get the values of TEMP, or x, y, z:
In [12]: temp=doc2.Xpath("/POINT/Vertex/Scaler/#TEMP")
Traceback (most recent call last):
File "<ipython-input-12-bbd832a3074e>", line 1, in <module>
temp=doc2.Xpath("/POINT/Vertex/Scaler/#TEMP")
AttributeError: 'lxml.etree._ElementTree' object has no attribute 'Xpath'
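For what it is worth, the lxml attempt as written fails for three small reasons: the method is lowercase xpath(), attribute values are selected with @ rather than #, and the element is Scalar, not Scaler. A minimal sketch of the corrected call against the file shown above (it also builds the NumPy array once at the end, which is much cheaper than calling np.append inside a loop, since np.append copies the whole array on every call):

from lxml import etree
import numpy as np

doc2 = etree.parse("document.xml")
# xpath() returns the matched attribute values as a list of strings
temps = doc2.xpath("/variable_output/POINT/Vertex/Scalar/@TEMP")
xs = doc2.xpath("/variable_output/POINT/Vertex/Position/@x")
# convert to a float array in one step instead of growing it element by element
res = np.array(temps, dtype=float)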

I recommend iterparse() for large xml files:
import timeit
import os, psutil
import datetime
import pandas as pd
import xml.etree.ElementTree as ET
class parse_xml:
    def __init__(self, path):
        self.xml = os.path.split(path)[1]
        print(self.xml)

        columns = ["Pos_x", "Pos_y", "Pos_z", "Scalar_Temp"]
        data = []
        for event, elem in ET.iterparse(self.xml, events=("end",)):
            if elem.tag == "Position":
                x = elem.get("x")
                y = elem.get("y")
                z = elem.get("z")
            if elem.tag == "Scalar":
                row = (x, y, z, elem.get("TEMP"))
                data.append(row)
            elem.clear()

        df = pd.DataFrame(data, columns=columns)
        print(df)

def main():
    xml_file = r"D:\Daten\Programmieren\stackoverflow\document.xml"
    parse_xml(xml_file)

if __name__ == "__main__":
    now = datetime.datetime.now()
    starttime = timeit.default_timer()
    main()
    process = psutil.Process(os.getpid())
    print('\nFinished')
    print(f"{now:%Y-%m-%d %H:%M}")
    print('Runtime:', timeit.default_timer() - starttime)
    print(f'RAM: {process.memory_info().rss/1000**2} MB')
Output:
document.xml
Pos_x Pos_y Pos_z Scalar_Temp
0 -0.176300004 -0.103100002 -0.153699994 84.192421
1 -0.173557162 -0.103100002 -0.153699994 83.9050522
2 -0.170814306 -0.103100002 -0.153699994 83.7506332
Finished
2022-11-29 23:51
Runtime: 0.007375300000000029
RAM: 55.619584 MB
If the output is too large, you can write it to a SQLite database with df.to_sql().
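A minimal sketch of that, assuming df is the DataFrame built above and using a hypothetical local database file and table name:

import sqlite3

con = sqlite3.connect("data.db")                      # hypothetical file name
df.to_sql("temperature", con, if_exists="replace", index=False)
con.close()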

Related

Why aren't packages being recognised in this function?

I am running the following script:
# STEP 1: import packages, declare tindexes
import pandas as pd
import yfinance as yf
import datetime as dt
bnpl_tindex = ["APT.AX","Z1P.AX","LFS.AX","SZL.AX","HUM.AX","SPT.AX","OPY.AX","IOU.AX","LBY.AX","DOU.AX"]
target_dates = pd.date_range(start=dt.date.today() - dt.timedelta(days=365), end=dt.date.today(), freq="M").strftime('%Y-%m-%d')
target_dates = target_dates.append(pd.Index([dt.date.today().strftime('%Y-%m-%d')]))
target_dates.sort_values()
#STEP 2: source functions
from collect_index_data import collect_index_data
#DELETE LATER... TESTING!
collect_index_data(bnpl_tindex, target_dates)
#
collect_index_data is as follows:
def collect_index_data(ticker_list, date_list):
    if (bool(ticker_list) and all(isinstance(elem, str) for elem in ticker_list)) == False:
        sys.exit('Input should be a list or a single string.')
    else:
        print("Components of Index: ")
        # initialise dictionary
        d = {}
        # loop through ticker list
        for x in ticker_list:
            d["DF.{0}".format(x)] = yf.Ticker(x)
            # testing
            print(x)
        # testing
        print(d)
and I get the following error message
Components of Index:
Traceback (most recent call last):
File "C:\Users\thoma\Desktop\Files\Programming\Python\run_tindex_data.py", line 27, in <module>
collect_index_data(bnpl_tindex, target_dates)
File "C:\Users\thoma\Desktop\Files\Programming\Python\collect_index_data.py", line 12, in collect_index_data
d["DF.{0}".format(x)] = yf.Ticker("MSFT")
NameError: name 'yf' is not defined
My question is: why is the yfinance package not being recognised in my function?
I could import it inside the function, but I plan to run the function multiple times in a script, so this would be computationally wasteful.
Thanks!
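One note on the likely cause, as a sketch rather than a definitive fix: imports are module-scoped in Python, so the import yfinance as yf at the top of the run script is not visible inside collect_index_data.py; the module that uses yf needs its own import. Re-importing is not computationally wasteful either, because after the first load Python serves repeat imports from the sys.modules cache.

# collect_index_data.py
import sys
import yfinance as yf   # module-level import; loaded once, cached afterwards

def collect_index_data(ticker_list, date_list):
    if not (bool(ticker_list) and all(isinstance(elem, str) for elem in ticker_list)):
        sys.exit('Input should be a list or a single string.')
    d = {}
    for x in ticker_list:
        d["DF.{0}".format(x)] = yf.Ticker(x)
    return d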

Final piece of code missing for fundamental data extract from TWS IB

I took the code below from one of the answered questions on Stack Overflow (unfortunately I cannot give full credit, as I can no longer locate the page). I changed it a bit to fit my purpose.
I want to extract historical Reuters data (fundamentalData) for a list of tickers. The code below works fine, but it only grabs the last ticker's data. I know I need to build a while loop, but I have tried many times and none worked. I'm sure this is a quick fix, but since I am new to coding and Python in general I just can't find the solution. Any help would be appreciated!
# Import all libraries
from ib.opt import ibConnection, message
from time import sleep
import lxml.etree
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from IPython.core.debugger import set_trace
from ibapi import wrapper
from ibapi.client import EClient
from ibapi.common import *
from ibapi.contract import Contract
#upload excel file of list of company tickers you want to review
us_list= pd.read_excel(r'C:\TWS API\Request Reuters data\Quant Project IB TWS_test.xlsx', engine='openpyxl')
stocksList = us_list[['TICKER']]
stocksList
def fundamentalData_handler(msg):
    global imported
    imported = msg.data

def error_handler(msg):
    pass

# processing of the lines in financial statements
def extractVauleFromLineItem(fiscalperiod, code):
    stage1 = fiscalperiod.find(name='lineitem', coacode=code)
    if (not stage1 is None):
        stage2 = stage1.get_text()
        if (stage2 == stage2):
            stage3 = float(stage2)
            if (stage3 == stage3):
                return (stage3)
    else:
        return (0.0)
result = pd.DataFrame(columns=['Year', 'Ticker', 'control', 'TotalRevenue', 'GrossProfit', 'CommonSharesOutstanding', 'DilutedNormalizedEPS', 'totalCash', 'TotalDebt', 'Dividends'])
outcomes = []

for i, row in stocksList.iterrows():
    contract = Contract()
    contract.symbol = row['TICKER']
    contract.secType = "STK"
    contract.currency = "USD"
    contract.exchange = "SMART"

    tws = ibConnection("127.0.0.1", port=7497, clientId=901)
    tws.register(error_handler, message.Error)
    tws.register(fundamentalData_handler, message.fundamentalData)
    tws.connect()
    tws.reqFundamentalData(1, contract, 'ReportsFinStatements')
    sleep(1)
    tws.disconnect()
    print(contract.symbol)

    soup = BeautifulSoup(imported)  # library for processing of the obtained XML data
    data = []
    print(soup.find(name='issueid', type="Ticker").get_text())
    print(soup.find(name='coid', type="CompanyName").get_text())
    # I found that IB API is not very stable.
    # Sometimes it returns data of the wrong company.
    # So the control is important
    print('Control -', contract.symbol == soup.find(name='issueid', type="Ticker").get_text())
    print()
    for fiscalperiod in soup.find_all(name="fiscalperiod", type="Annual"):
        year = fiscalperiod['fiscalyear']
        TotalRevenue = extractVauleFromLineItem(fiscalperiod, 'RTLR')
        GrossProfit = extractVauleFromLineItem(fiscalperiod, 'SGRP')
        CommonSharesOutstanding = extractVauleFromLineItem(fiscalperiod, 'QTCO')
        DilutedNormalizedEPS = extractVauleFromLineItem(fiscalperiod, 'VDES')
        totalCash = extractVauleFromLineItem(fiscalperiod, 'OTLO')
        TotalDebt = extractVauleFromLineItem(fiscalperiod, 'STLD')
        Dividends = extractVauleFromLineItem(fiscalperiod, 'FCDP')
        thisYearData = (year, contract.symbol, (contract.symbol == soup.find(name='issueid', type="Ticker").get_text()), TotalRevenue, GrossProfit, CommonSharesOutstanding, totalCash, TotalDebt, Dividends)
        data.append(thisYearData)

df_data = pd.DataFrame(data, columns=['Year', 'control', 'TotalRevenue', 'GrossProfit', 'CommonSharesOutstanding', 'DilutedNormalizedEPS', 'totalCash', 'TotalDebt', 'Dividends'])
df_data = df_data.sort_values(by=['Year'])
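A likely reason only the last ticker survives is that data = [] is re-created inside the ticker loop while the DataFrame is only built after the loop, so every iteration throws away the previous rows. A minimal, self-contained sketch of the accumulation pattern (the ticker list, years and zero values are placeholders standing in for the IB request and parsing code above):

import pandas as pd

tickers = ["AAA", "BBB", "CCC"]              # placeholder ticker list
all_rows = []                                # created once, before the loop

for ticker in tickers:
    # ...request and parse the fundamental data for this ticker here...
    for year in (2019, 2020):                # stands in for the fiscalperiod loop
        all_rows.append((year, ticker, 0.0)) # append; never reset inside the loop

df_data = pd.DataFrame(all_rows, columns=['Year', 'Ticker', 'TotalRevenue'])
df_data = df_data.sort_values(by=['Year'])
print(df_data)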

convert xml to csv python

New to Python, I am presently in the process of converting XML to CSV using Python 3.6.1.
The input file is file1.xml:
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<Package>
<name>AllFeatureRules</name>
<pkgId>13569656</pkgId>
<pkgMetadata>
<creator>rsikhapa</creator>
<createdDate>13-05-2018 10:07:16</createdDate>
<pkgVersion>3.0.29</pkgVersion>
<application>All</application>
<icType>Feature</icType>
<businessService>Common</businessService>
<technology>All,NA</technology>
<runTimeFormat>RBML</runTimeFormat>
<inputForTranslation></inputForTranslation>
<pkgDescription></pkgDescription>
</pkgMetadata>
<rules>
<rule>
<name>ip_slas_scheduling</name>
<ruleId>46288</ruleId>
<ruleVersion>1.3.0</ruleVersion>
<ruleVersionId>1698132</ruleVersionId>
<nuggetId>619577</nuggetId>
<nuggetVersionId>225380</nuggetVersionId>
<icType>Feature</icType>
<creator>paws</creator>
<customer></customer>
</rule>
</rules>
<versionChanges>
<rulesAdded/>
<rulesModified/>
<rulesDeleted/>
</versionChanges>
</Package>
python code:
import xml.etree.ElementTree as ET
import pandas as pd
tree = ET.parse("file1.xml")
root = tree.getroot()
get_range = lambda col: range(len(col))
l = [{r[i].tag:r[i].text for i in get_range(r)} for r in root]
df = pd.DataFrame.from_dict(l)
df.to_csv('ABC.csv')
The Python code is written as above.
The problem is that the CSV conversion only covers the parent element (pkgMetadata) and not the child elements (rules); it is not converting the whole XML file into CSV. Please let me know the solution.
To iterate over every entry, you can use the element tree's iter() function:
try:
    import xml.etree.cElementTree as ET
except ImportError:
    import xml.etree.ElementTree as ET
import pandas as pd

tree = ET.parse("file1.xml")
root = tree.getroot()

iter_root = root.iter()
l = {}
for elem in iter_root:
    l[str(elem.tag)] = str(elem.text)

df = pd.DataFrame.from_dict(l, orient="index")
df.to_csv('ABC.csv')
producing a csv:
;0
Package;"
"
name;ip_slas_scheduling
pkgId;13569656
pkgMetadata;"
"
creator;paws
createdDate;13-05-2018 10:07:16
pkgVersion;3.0.29
application;All
icType;Feature
businessService;Common
technology;All,NA
runTimeFormat;RBML
inputForTranslation;None
pkgDescription;None
rules;"
"
rule;"
"
ruleId;46288
ruleVersion;1.3.0
ruleVersionId;1698132
nuggetId;619577
nuggetVersionId;225380
customer;None
versionChanges;"
"
rulesAdded;None
rulesModified;None
rulesDeleted;None
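Note that root.iter() flattens every tag into a single dictionary key, so repeated tags overwrite each other; that is why the CSV above shows the rule's name (ip_slas_scheduling) and creator (paws) instead of the package's. If the goal is one row per rule alongside the package metadata, a sketch along these lines could work (the rule_ prefix is only a choice to keep the colliding tags apart):

import xml.etree.ElementTree as ET
import pandas as pd

tree = ET.parse("file1.xml")
root = tree.getroot()

# package-level fields, shared by every output row
meta = {child.tag: child.text for child in root.find("pkgMetadata")}
meta["package"] = root.findtext("name")
meta["pkgId"] = root.findtext("pkgId")

# one row per <rule>, with rule fields prefixed to avoid clashing with meta
rows = []
for rule in root.iter("rule"):
    row = dict(meta)
    row.update({"rule_" + child.tag: child.text for child in rule})
    rows.append(row)

pd.DataFrame(rows).to_csv("ABC.csv", index=False)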

Multithreading/Multiprocessing to parse single XML file? [duplicate]

This question already has answers here:
Parsing Very Large XML Files Using Multiprocessing
(2 answers)
Closed 5 years ago.
Can someone tell me how to assign jobs to multiple threads to speed up parsing time? For example, I have an XML file with 200k lines; I would like to assign 50k lines to each of 4 threads and parse them using a SAX parser. What I have done so far is 4 threads each parsing all 200k lines, which means 200k*4 = 800k duplicated results.
Any help is appreciated.
test.xml:
<?xml version="1.0" encoding="utf-8"?>
<votes>
<row Id="1" PostId="1" VoteTypeId="2" CreationDate="2014-05-13T00:00:00.000" />
<row Id="2" PostId="1" VoteTypeId="2" CreationDate="2014-05-13T00:00:00.000" />
<row Id="3" PostId="3" VoteTypeId="2" CreationDate="2014-05-13T00:00:00.000" />
<row Id="5" PostId="3" VoteTypeId="2" CreationDate="2014-05-13T00:00:00.000" />
</votes>
My source code:
import json
import xmltodict
from lxml import etree
import xml.etree.ElementTree as ElementTree
import threading
import time
def sax_parsing():
    t = threading.currentThread()
    for event, element in etree.iterparse("/home/xiang/Downloads/FYP/parallel-python/test.xml"):
        # below codes read the attributes in an element specified
        if element.tag == 'row':
            print("Thread: %s" % t.getName())
            row_id = element.attrib.get('Id')
            row_post_id = element.attrib.get('PostId')
            row_vote_type_id = element.attrib.get('VoteTypeId')
            row_user_id = element.attrib.get('UserId')
            row_creation_date = element.attrib.get('CreationDate')
            print('ID: %s, PostId: %s, VoteTypeID: %s, UserId: %s, CreationDate: %s' % (row_id, row_post_id, row_vote_type_id, row_user_id, row_creation_date))
        element.clear()
    return
if __name__ == "__main__":
    start = time.time()  # calculate execution time
    main_thread = threading.currentThread()
    no_threads = 4
    for i in range(no_threads):
        t = threading.Thread(target=sax_parsing)
        t.start()
    for t in threading.enumerate():
        if t is main_thread:
            continue
        t.join()
    end = time.time()  # calculate execution time
    exec_time = end - start
    print('Execution time: %fs' % (exec_time))
The simplest way: you could extend your parse function to receive a start row and an end row, like so:
def sax_parsing(start, end):
and then when starting the threads:
t = threading.Thread(target=sax_parsing, args=(i*50, (i+1)*50))
and change if element.tag == 'row': to if element.tag == 'row' and start <= int(element.attrib.get('Id')) < end: (the Id attribute comes back as a string, so convert it before comparing),
so each thread checks just the rows it was given in its range
(didn't actually check this, so play around)
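A runnable sketch of that idea (assuming the Id attribute is numeric and test.xml is in the working directory; every thread still parses the whole file, so this removes the duplicated output rather than cutting the parse time, and the slice size is only an assumption):

import threading
from lxml import etree

def sax_parsing(start, end):
    # each thread scans the whole file but keeps only its own Id range
    name = threading.current_thread().name
    for event, element in etree.iterparse("test.xml"):
        if element.tag == 'row':
            row_id = int(element.attrib.get('Id'))   # Id is a string in the XML
            if start <= row_id < end:
                print('Thread %s -> Id %d' % (name, row_id))
        element.clear()

if __name__ == "__main__":
    rows_per_thread = 50000                          # assumed slice size
    threads = []
    for i in range(4):
        t = threading.Thread(target=sax_parsing,
                             args=(i * rows_per_thread, (i + 1) * rows_per_thread))
        t.start()
        threads.append(t)
    for t in threads:
        t.join()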

Splitting file based on data comparison

I've been recently using a Garmin GPS path tracker which produces files like this:
<?xml version="1.0" encoding="UTF-8"?>
<gpx version="1.1" creator="GPS Track Editor" xmlns="http://www.topografix.com/GPX/1/1" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:gte="http://www.gpstrackeditor.com/xmlschemas/General/1" xmlns:gpxtpx="http://www.garmin.com/xmlschemas/TrackPointExtension/v1" xmlns:gpxx="http://www.garmin.com/xmlschemas/GpxExtensions/v3" targetNamespace="http://www.topografix.com/GPX/1/1" elementFormDefault="qualified" xsi:schemaLocation="http://www.topografix.com/GPX/1/1 http://www.topografix.com/GPX/1/1/gpx.xsd">
<metadata>
<name>Ślad_16-SIE-15 190121.gpx</name>
<link href="http://www.garmin.com">
<text>Garmin International</text>
</link>
</metadata>
<trk>
<name>16-SIE-15 19:01:21</name>
<trkseg>
<trkpt lat="55.856890" lon="-4.250866">
<ele>9.27</ele>
<time>2015-08-16T08:32:13Z</time>
</trkpt>
<trkpt lat="55.856904" lon="-4.250904">
<ele>6.39</ele>
<time>2015-08-16T08:32:15Z</time>
</trkpt>
...
<trkpt lat="55.876979" lon="-4.286995">
<ele>46.28</ele>
<time>2015-08-16T17:22:14Z</time>
</trkpt>
<extensions>
<gte:name>#1</gte:name>
<gte:color>#fbaf00</gte:color>
</extensions>
</trkseg>
</trk>
</gpx>
The thing is that sometimes the device loses signal (in an inner city, for example), which causes the footpath to be interpolated in an unpleasant manner:
[footpath image]
I would like to split the footpath file into three separate files (to avoid these long arrows - see the picture).
I ended up with the following decomposition of the problem:
Read the original file's latitude (lat) and longitude (lon) values.
Compare 2 consecutive lat and lon values until the assumed difference is met, while saving them to file one.
Add the ending to file one, add the pre-data tags to file two, and continue with the comparison.
Since I'm trying to learn Python 2.X, I'm stuck with this:
gpxFile = open('track.gpx', 'r')
with open("track.gpx", "r") as gpxFile:
    data = gpxFile.read()
    print data
    for subString in data:
        subString = data[data.find("<trkpt")+12:data.find("lon")-2] + " " + data[data.find("lon")+5:data.find("<ele>")-6]
Can anybody help me with that, or at least give me a heads-up on what to look for in the documentation or tutorials?
Thanks.
Cheers!
This isn't perfect, but it should do what you want. If not, it should serve as a good starting point. It works by reading in the XML file, extracting all of the track points, and then finding the gaps based on the timestamps. For each group of points, it outputs a new file named original_N.gpx (N = 0,1,2,...) where the input file is original.gpx. It could be modified to use distance between points, but time seemed a little easier. Look at delta_too_large(pt1, pt2) to change the gap detection, currently two seconds.
GitHub (Public Domain)
#!/usr/bin/env python
# Copyright (C) 2015 Harvey Chapman <hchapman#3gfp.com>
# Public Domain
# Use at your own risk.
"""
Splits a gpx file with breaks in the track into separate files.

Based on: http://stackoverflow.com/q/33803614/47078
"""
import sys
import re
import os
from datetime import datetime, timedelta
from itertools import izip
from xml.etree import ElementTree

ns = {'gpx': 'http://www.topografix.com/GPX/1/1'}

def iso8601_to_datetime(datestring):
    d = datetime(*map(int, re.split('\D', datestring)[:-1]))
    # intentionally ignoring timezone info (for now)
    # d = d.replace(tzinfo=UTC)
    return d

def datetime_from_trkpt(trkpt):
    datestring = trkpt.find('gpx:time', ns).text
    return iso8601_to_datetime(datestring)

def delta_too_large(trkpt1, trkpt2):
    delta = datetime_from_trkpt(trkpt2) - datetime_from_trkpt(trkpt1)
    return delta > timedelta(seconds=2)

def trkpt_groups(trkpts):
    last_index = 0
    for n, (a, b) in enumerate(izip(trkpts[:-1], trkpts[1:]), start=1):
        if delta_too_large(a, b):
            yield last_index, n
            last_index = n
    yield last_index, len(trkpts)

def remove_all_trkpts_from_trkseg(trkseg):
    trkpts = trkseg.findall('gpx:trkpt', ns)
    for trkpt in trkpts:
        trkseg.remove(trkpt)
    return trkpts

def add_trkpts_to_trkseg(trkseg, trkpts):
    # not sure if this will be slow or not...
    for trkpt in reversed(trkpts):
        trkseg.insert(0, trkpt)

def save_xml(filename, index, tree):
    filename_parts = os.path.splitext(filename)
    new_filename = '{1}_{0}{2}'.format(index, *filename_parts)
    with open(new_filename, 'wb') as f:
        tree.write(f,
                   xml_declaration=True,
                   encoding='utf-8',
                   method='xml')

def get_trkseg(tree):
    trk = tree.getroot().findall('gpx:trk', ns)
    if len(trk) > 1:
        raise Exception("Don't know how to parse multiple tracks!")
    trkseg = trk[0].findall('gpx:trkseg', ns)
    if len(trkseg) > 1:
        raise Exception("Don't know how to parse multiple track segment lists!")
    return trkseg[0]

def split_gpx_file(filename):
    ElementTree.register_namespace('', ns['gpx'])
    tree = ElementTree.parse(filename)
    trkseg = get_trkseg(tree)
    trkpts = remove_all_trkpts_from_trkseg(trkseg)
    for n, (start, end) in enumerate(trkpt_groups(trkpts)):
        # Remove all points and insert only the ones for this group
        remove_all_trkpts_from_trkseg(trkseg)
        add_trkpts_to_trkseg(trkseg, trkpts[start:end])
        save_xml(filename, n, tree)

if __name__ == '__main__':
    if len(sys.argv) < 2:
        print >> sys.stderr, "Usage: {} file.gpx".format(sys.argv[0])
        sys.exit(-1)
    split_gpx_file(sys.argv[1])
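If you would rather detect gaps by distance than by time, delta_too_large() can be swapped for a haversine-based test like the sketch below (the 50-metre threshold is only a guess to tune; lat and lon are attributes on each trkpt element):

import math

def distance_too_large(trkpt1, trkpt2, max_metres=50.0):
    # great-circle (haversine) distance between two consecutive track points
    lat1, lon1 = float(trkpt1.get('lat')), float(trkpt1.get('lon'))
    lat2, lon2 = float(trkpt2.get('lat')), float(trkpt2.get('lon'))
    phi1, phi2 = math.radians(lat1), math.radians(lat2)
    dphi = math.radians(lat2 - lat1)
    dlam = math.radians(lon2 - lon1)
    a = math.sin(dphi / 2) ** 2 + math.cos(phi1) * math.cos(phi2) * math.sin(dlam / 2) ** 2
    return 6371000.0 * 2 * math.asin(math.sqrt(a)) > max_metres

Using it just means calling distance_too_large(a, b) instead of delta_too_large(a, b) inside trkpt_groups().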
