convert xml to csv python - python

I am new to Python and am currently converting an XML file to CSV using Python 3.6.1.
Input file is file1.xml file:
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<Package>
<name>AllFeatureRules</name>
<pkgId>13569656</pkgId>
<pkgMetadata>
<creator>rsikhapa</creator>
<createdDate>13-05-2018 10:07:16</createdDate>
<pkgVersion>3.0.29</pkgVersion>
<application>All</application>
<icType>Feature</icType>
<businessService>Common</businessService>
<technology>All,NA</technology>
<runTimeFormat>RBML</runTimeFormat>
<inputForTranslation></inputForTranslation>
<pkgDescription></pkgDescription>
</pkgMetadata>
<rules>
<rule>
<name>ip_slas_scheduling</name>
<ruleId>46288</ruleId>
<ruleVersion>1.3.0</ruleVersion>
<ruleVersionId>1698132</ruleVersionId>
<nuggetId>619577</nuggetId>
<nuggetVersionId>225380</nuggetVersionId>
<icType>Feature</icType>
<creator>paws</creator>
<customer></customer>
</rule>
</rules>
<versionChanges>
<rulesAdded/>
<rulesModified/>
<rulesDeleted/>
</versionChanges>
</Package>
python code:
import xml.etree.ElementTree as ET
import pandas as pd
# Parse the document and take its root element (<Package> in file1.xml).
tree = ET.parse("file1.xml")
root = tree.getroot()
get_range = lambda col: range(len(col))
# One dict per direct child of the root, built from that child's own direct
# children only. Elements nested deeper (for example the fields inside
# <rules>/<rule>) are never visited, so they never become columns.
l = [{r[i].tag:r[i].text for i in get_range(r)} for r in root]
df = pd.DataFrame.from_dict(l)
df.to_csv('ABC.csv')
That is the Python code I have written.
The problem is that it converts only the parent element (pkgMetadata) to CSV, not the child elements (rules).
It does not convert the whole XML file into CSV. Please let me know a solution.

To iterate over every element in the tree, you can use ElementTree's Element.iter() method (root.iter() below).
# xml.etree.ElementTree picks up its C accelerator automatically, so the
# separate cElementTree import (removed in Python 3.9) is not needed.
import xml.etree.ElementTree as ET
import pandas as pd

tree = ET.parse("file1.xml")
root = tree.getroot()

# Walk every element of the tree, at any depth, in document order.
# NOTE: the dict is keyed by tag name, so a tag that occurs more than once
# (name, creator and icType in file1.xml) keeps only its last value.
text_by_tag = {}
for elem in root.iter():
    text_by_tag[str(elem.tag)] = str(elem.text)

# orient="index" turns each tag into a row label with its text in column 0.
df = pd.DataFrame.from_dict(text_by_tag, orient="index")
df.to_csv('ABC.csv')
producing a csv:
;0
Package;"
"
name;ip_slas_scheduling
pkgId;13569656
pkgMetadata;"
"
creator;paws
createdDate;13-05-2018 10:07:16
pkgVersion;3.0.29
application;All
icType;Feature
businessService;Common
technology;All,NA
runTimeFormat;RBML
inputForTranslation;None
pkgDescription;None
rules;"
"
rule;"
"
ruleId;46288
ruleVersion;1.3.0
ruleVersionId;1698132
nuggetId;619577
nuggetVersionId;225380
customer;None
versionChanges;"
"
rulesAdded;None
rulesModified;None
rulesDeleted;None

Related

Speed up extracting data from larger XML files using Python

Hello, I am not a strong Python user, but I need to extract values from an XML file.
I am using a for loop to get attribute values from an 'xml.dom.minidom.Document'.
Both the x/y/z positions and the temperatures are read with a for loop; since the file has half a million values, this takes a long time.
I tried using lxml, but got an error:
module 'lxml' has no attribute 'parse' or 'Xpath'
The xml file has following format
<?xml version="1.0" encoding="utf-8"?>
<variable_output>
<!--version : 1-->
<!--object title : Volume (1)-->
<!--scalar variable : Temperature (TEMP)-->
<POINT>
<Vertex>
<Position x="-0.176300004" y="-0.103100002" z="-0.153699994"/>
<Scalar TEMP="84.192421"/>
</Vertex>
</POINT>
<POINT>
<Vertex>
<Position x="-0.173557162" y="-0.103100002" z="-0.153699994"/>
<Scalar TEMP="83.9050522"/>
</Vertex>
</POINT>
<POINT>
<Vertex>
<Position x="-0.170814306" y="-0.103100002" z="-0.153699994"/>
<Scalar TEMP="83.7506332"/>
</Vertex>
</POINT>
</variable_output>
The following code takes a long time for bigger files.
from xml.dom.minidom import parse
import xml.dom.minidom
import csv
import pandas as pd
import numpy as np
import os
import glob
import time
from lxml import etree
v=[]
doc =parse("document.xml")
Val = doc.getElementsByTagName("Scalar")
t0 = time.time()
# Collect the TEMP attribute of every <Scalar> element as a float.
# NOTE(review): np.append returns a new array on every call, so growing v
# this way copies all earlier values on each iteration.
for s in Val:
v=np.append(v,float(s.attributes['TEMP'].value))
res=np.array([v])
t1 = time.time()
total = (t1-t0)
print('Time for Value', str(total))
# Using lxml
doc2=etree.parse("document.xml")
# try using Xpath
t0 = time.time()
# NOTE(review): this is the call that raises the AttributeError quoted below.
# lxml spells the method .xpath() in lowercase. XPath also selects
# attributes with "@", and the element in the sample file is named "Scalar".
temp=doc2.Xpath("/POINT/Vertex/Scaler/#TEMP")
t1 = time.time()
total2 = t1-t0
print('Time for Value', str(total2))
# save data as csv from xml
pd.DataFrame(res.T).to_csv(('Data.csv'),index=False,header=False) #write timestep as csv
The error while using the Xpath to get the values of Temp,or x,y,z:
In [12]: temp=doc2.Xpath("/POINT/Vertex/Scaler/#TEMP")
Traceback (most recent call last):
File "<ipython-input-12-bbd832a3074e>", line 1, in <module>
temp=doc2.Xpath("/POINT/Vertex/Scaler/#TEMP")
AttributeError: 'lxml.etree._ElementTree' object has no attribute 'Xpath'
I recommend iterparse() for large xml files:
import timeit
import os, psutil
import datetime
import pandas as pd
import xml.etree.ElementTree as ET
class parse_xml:
    """Stream a point XML file into a pandas DataFrame.

    The file is read incrementally with ``ET.iterparse``. Every ``<Scalar>``
    element produces one row that combines its ``TEMP`` attribute with the
    ``x``/``y``/``z`` attributes of the most recently closed ``<Position>``.

    Attributes:
        xml: Base name of the parsed file (printed for reference).
        df: DataFrame with the columns Pos_x, Pos_y, Pos_z and Scalar_Temp.
            All values are the attribute strings as found in the file.
    """

    def __init__(self, path):
        """Parse the file at *path*, print the result and keep it in ``self.df``."""
        # Only the display name is shortened; the file itself is opened
        # through the full path so it is found from any working directory.
        self.xml = os.path.split(path)[1]
        print(self.xml)

        columns = ["Pos_x", "Pos_y", "Pos_z", "Scalar_Temp"]
        data = []
        # Stay None until a <Position> has been seen, so a <Scalar> that
        # comes first yields empty coordinates instead of a NameError.
        x = y = z = None
        for _event, elem in ET.iterparse(path, events=("end",)):
            if elem.tag == "Position":
                x = elem.get("x")
                y = elem.get("y")
                z = elem.get("z")
            elif elem.tag == "Scalar":
                data.append((x, y, z, elem.get("TEMP")))
            # The values were copied out above, so the element's attributes
            # and children can be released right away.
            elem.clear()

        self.df = pd.DataFrame(data, columns=columns)
        print(self.df)
def main():
    """Run the parser on the sample document at its hard-coded location."""
    parse_xml(r"D:\Daten\Programmieren\stackoverflow\document.xml")
if __name__ == "__main__":
    # Both the wall-clock stamp and the timer are taken before the work starts.
    started_at = datetime.datetime.now()
    t_start = timeit.default_timer()
    main()
    # Handle on the current process, used for the resident-memory figure below.
    proc = psutil.Process(os.getpid())
    print('\nFinished')
    print(f"{started_at:%Y-%m-%d %H:%M}")
    print('Runtime:', timeit.default_timer() - t_start)
    print(f'RAM: {proc.memory_info().rss / 1000**2} MB')
Output:
document.xml
Pos_x Pos_y Pos_z Scalar_Temp
0 -0.176300004 -0.103100002 -0.153699994 84.192421
1 -0.173557162 -0.103100002 -0.153699994 83.9050522
2 -0.170814306 -0.103100002 -0.153699994 83.7506332
Finished
2022-11-29 23:51
Runtime: 0.007375300000000029
RAM: 55.619584 MB
If the output will be too large you can write it to a sqlite3 database with df.to_sql().

How can I parse a VCARD in a XML file

I'm trying to parse an XML file in which there is some VCARD. I need the info: FN, NOTE (SIREN and A) and print them as a list as FN, SIREN_A. I would also like to add them in a list if the string in the description equals "diviseur" only
I've tried different things (vobject, finditer), but none of them work. For my parser I'm using the xml.etree.ElementTree library and pandas, which usually cause some incompatibilities.
code python :
import xml.etree.ElementTree as ET
import vobject
newlist=[]
data=[]
data.append(newlist)
diviseur=[]
tree=ET.parse('test_oc.xml')
root=tree.getroot()
newlist=[]
# Walk lifeCycle -> contribute -> entity using fully qualified LOM tag names.
for lifeCycle in root.findall('{http://ltsc.ieee.org/xsd/LOM}lifeCycle'):
for contribute in lifeCycle.findall('{http://ltsc.ieee.org/xsd/LOM}contribute'):
for entity in contribute.findall('{http://ltsc.ieee.org/xsd/LOM}entity'):
# NOTE(review): this is the call the traceback below points at. entity is an
# Element, while the traceback shows vobject calling .read() on its argument.
vcard = vobject.readOne(entity)
# The commas make siren a 3-tuple (note, ":", fn), not a string.
siren = vcard.contents['note'].value,":",vcard.contents['fn'].value
# NOTE(review): a tuple has no .text attribute.
print ('siren',siren.text)
for date in contribute.findall('{http://ltsc.ieee.org/xsd/LOM}date'):
for description in date.findall('{http://ltsc.ieee.org/xsd/LOM}description'):
entite=description.find('{http://ltsc.ieee.org/xsd/LOM}string')
print ('Type entité:', entite.text)
newlist.append(entite)
j=0
# NOTE(review): entite is a single Element here, so len(entite) counts its
# child elements and entite[j] is a child Element, never the string "diviseur".
for j in range(len(entite)-1):
if entite[j]=="diviseur":
diviseur.append(siren[j])
print('diviseur:', diviseur)
newlist.append(diviseur)
data.append(newlist)
print(data)
xml file to parse:
<?xml version="1.0" encoding="UTF-8"?>
<lom:lom xmlns:lom="http://ltsc.ieee.org/xsd/LOM" xmlns:lomfr="http://www.lom-fr.fr/xsd/LOMFR" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://ltsc.ieee.org/xsd/LOM">
<lom:version uniqueElementName="version">
<lom:string language="http://id.loc.gov/vocabulary/iso639-2/fre">V4.1</lom:string>
</lom:version>
<lom:lifeCycle uniqueElementName="lifeCycle">
<lom:contribute>
<lom:entity><![CDATA[
BEGIN:VCARD
VERSION:4.0
FN:Cailler
N:;Valérie;;Mr;
ORG:Veoli
NOTE:SIREN=203025106
NOTE :ISNI=0000000000000000
END:VCARD
]]></lom:entity>
<lom:date uniqueElementName="date">
<lom:dateTime uniqueElementName="dateTime">2019-07-10</lom:dateTime>
<lom:description uniqueElementName="description">
<lom:string>departure</lom:string>
</lom:description>
</lom:date>
</lom:contribute>
<lom:contribute>
<lom:entity><![CDATA[
BEGIN:VCARD
VERSION:4.0
FN:Besnard
N:;Ugo;;Mr;
ORG:MG
NOTE:SIREN=501 025 205
NOTE :A=0000 0000
END:VCARD
]]></lom:entity>
<lom:date uniqueElementName="date">
<lom:dateTime uniqueElementName="dateTime">2019-07-10</lom:dateTime>
<lom:description uniqueElementName="description">
<lom:string>diviseur</lom:string>
</lom:description>
</lom:date>
</lom:contribute>
</lom:lifeCycle>
</lom:lom>
Traceback (most recent call last):
File "parser_export_csv_V2.py", line 73, in
vcard = vobject.readOne(entity)
File "C:\Users\b\AppData\Local\Programs\Python\Python36-32\lib\site-packages\vobject\base.py", line 1156, in readOne
allowQP))
File "C:\Users\b\AppData\Local\Programs\Python\Python36-32\lib\site-packages\vobject\base.py", line 1089, in readComponents
for line, n in getLogicalLines(stream, allowQP):
File "C:\Users\b\AppData\Local\Programs\Python\Python36-32\lib\site-packages\vobject\base.py", line 869, in getLogicalLines
val = fp.read(-1)
AttributeError: 'xml.etree.ElementTree.Element' object has no attribute 'read'
There are a few problems here.
entity is an Element instance, and vCard is a plain text data format. vobject.readOne() expects text.
There is unwanted whitespace adjacent to the vCard properties in the XML file.
NOTE :ISNI=0000000000000000 is invalid; it should be NOTE:ISNI=0000000000000000 (space removed).
vcard.contents['note'] is a list and does not have a value property.
Here is code that probably doesn't produce exactly what you want, but I hope it helps:
import xml.etree.ElementTree as ET
import vobject
# Prefix -> URI mapping so the paths below can use the short "lom:" prefix.
NS = {"lom": "http://ltsc.ieee.org/xsd/LOM"}

tree = ET.parse('test_oc.xml')
for contribute in tree.findall('.//lom:contribute', NS):
    # First <lom:string> anywhere below this <lom:contribute>.
    desc_string = contribute.find('.//lom:string', NS)
    print(desc_string.text)

    entity = contribute.find('lom:entity', NS)
    # vobject wants the vCard as plain text. Removing every space gets rid of
    # the padding around the properties and of the invalid "NOTE :", but it
    # also strips spaces inside values ("501 025 205" becomes "501025205").
    txt = entity.text.replace(" ", "")
    vcard = vobject.readOne(txt)

    # contents maps a lower-case property name to a list of properties.
    for key in ("note", "fn"):
        for p in vcard.contents[key]:
            print(p.name, p.value)
    print()
Output:
departure
NOTE SIREN=203025106
NOTE ISNI=0000000000000000
FN Cailler
diviseur
NOTE SIREN=501025205
NOTE A=00000000
FN Besnard

Python XML findall does not work

I am trying to use findall to select some XML elements, but I can't get any results.
import xml.etree.ElementTree as ET
import sys
# The storefront suffix from the command line selects the input and output files.
storefront = sys.argv[1]
xmlFileName = 'promotions{0}.xml'
xmlFile = xmlFileName.format(storefront)
csvFileName = 'hrz{0}.csv'
csvFile = csvFileName.format(storefront)
# NOTE(review): per the answer below, register_namespace() only influences how
# the tree is written out; it does not change how findall() matches tags.
ET.register_namespace('', "http://www.demandware.com/xml/impex/promotion/2008-01-31")
tree = ET.parse(xmlFile)
root = tree.getroot()
print('------------------Generate test-------------\n')
# NOTE(review): in the code shown, this file is opened but never written to or closed.
csv = open(csvFile,'w')
n = 0
# NOTE(review): 'campaign' carries no namespace, while every element in the
# sample file is in the default namespace declared on <promotions>.
for child in root.findall('campaign'):
print(child.attrib['campaign-id'])
print(n)
n+=1
The XML looks something like this:
<?xml version="1.0" encoding="UTF-8"?>
<promotions xmlns="http://www.demandware.com/xml/impex/promotion/2008-01-31">
<campaign campaign-id="10off-310781">
<enabled-flag>true</enabled-flag>
<campaign-scope>
<applicable-online/>
</campaign-scope>
<customer-groups match-mode="any">
<customer-group group-id="Everyone"/>
</customer-groups>
</campaign>
<campaign campaign-id="MNT-deals">
<enabled-flag>true</enabled-flag>
<campaign-scope>
<applicable-online/>
</campaign-scope>
<start-date>2017-07-03T22:00:00.000Z</start-date>
<end-date>2017-07-31T22:00:00.000Z</end-date>
<customer-groups match-mode="any">
<customer-group group-id="Everyone"/>
</customer-groups>
</campaign>
<campaign campaign-id="black-friday">
<enabled-flag>true</enabled-flag>
<campaign-scope>
<applicable-online/>
</campaign-scope>
<start-date>2017-11-23T23:00:00.000Z</start-date>
<end-date>2017-11-24T23:00:00.000Z</end-date>
<customer-groups match-mode="any">
<customer-group group-id="Everyone"/>
</customer-groups>
<custom-attributes>
<custom-attribute attribute-id="expires_date">2017-11-29</custom-attribute>
</custom-attributes>
</campaign>
<promotion-campaign-assignment promotion-id="winter17-new-bubble" campaign-id="winter17-new-bubble">
<qualifiers match-mode="any">
<customer-groups/>
<source-codes/>
<coupons/>
</qualifiers>
<rank>100</rank>
</promotion-campaign-assignment>
<promotion-campaign-assignment promotion-id="xmas" campaign-id="xmas">
<qualifiers match-mode="any">
<customer-groups/>
<source-codes/>
<coupons/>
</qualifiers>
</promotion-campaign-assignment>
</promotions>
Any ideas what i am doing wrong?
I have tried different solutions that i found on stackoverflow but nothing seems to work for me(from the things i have tried).
The list is empty.
Sorry if it is something very obvious i am new to python.
As mentioned here by @MartijnPieters, etree's .findall() takes a namespaces argument, while .register_namespace() only affects the XML output of the tree. Therefore, consider mapping the default namespace to an explicit prefix. The code below uses doc, but the prefix can be any name, even cosmin.
Additionally, consider using with, enumerate() and even the csv module as better handlers for your print and CSV outputs.
import csv
...
root = tree.getroot()
print('------------------Generate test-------------\n')

# findall() resolves the "doc:" prefix through this mapping; the prefix name
# is arbitrary, only the URI has to match the document's default namespace.
NAMESPACES = {'doc': 'http://www.demandware.com/xml/impex/promotion/2008-01-31'}

# The with-block closes the CSV file even if an error occurs part-way through.
with open(csvFile, 'w') as f:
    c = csv.writer(f, lineterminator='\n')
    for n, child in enumerate(root.findall('doc:campaign', namespaces=NAMESPACES)):
        print(child.attrib['campaign-id'])
        print(n)
        c.writerow([child.attrib['campaign-id']])
# ------------------Generate test-------------
# 10off-310781
# 0
# MNT-deals
# 1
# black-friday
# 2

Parsing xml tree attributes (file has no elements)

I have been trying to use minidom but have no real preference. For some reason lxml will not install on my machine.
I would like to parse an xml file:
<?xml version="1.
-<transfer frmt="1" vtl="0" serial_number="E5XX-0822" date="2016-10-03 16:34:53.000" style="startstop">
-<plateInfo>
<plate barcode="E0122326" name="384plate" type="source"/>
<plate barcode="A1234516" name="1536plateD" type="destination"/>
</plateInfo>
-<printmap total="1387">
<w reason="" cf="13" aa="1.779" eo="299.798" tof="32.357" sv="1565.311" ct="1.627" ft="1.649" fc="88.226" memt="0.877" fldu="Percent" fld="DMSO" dy="0" dx="0" region="-1" tz="18989.481" gy="72468.649" gx="55070.768" avt="50" vt="50" vl="3.68" cvl="3.63" t="16:30:47.703" dc="0" dr="0" dn="A1" c="0" r="0" n="A1"/>
<w reason="" cf="13" aa="1.779" eo="299.798" tof="32.357" sv="1565.311" ct="1.627" ft="1.649" fc="88.226" memt="0.877" fldu="Percent" fld="DMSO" dy="0" dx="0" region="-1" tz="18989.481" gy="72468.649" gx="55070.768" avt="50" vt="50" vl="3.68" cvl="3.63" t="16:30:47.703" dc="0" dr="0" dn="A1" c="1" r="0" n="A2"/>
</printmap>
</transfer>
The files do not have any element details, as you can see. All the information is contained in the attributes. In trying to adapt another SO post, I have this - but it seems to be geared more toward elements. I am also failing at a good way to "browse" the xml information, i.e. I would like to say "dir(xml_file)" and have a list of all the methods I can carry out on my tree structure, or see all the attributes. I know this was a lot and potentially different directions, but thank you in advance!
# For each path in files, parse it with minidom, then take the first
# <transfer> element and the first <plateInfo> element inside it.
# NOTE(review): as shown, nothing is returned or stored, and minidom must
# already be imported by the surrounding module.
def parse(files):
for xml_file in files:
xmldoc = minidom.parse(xml_file)
transfer = xmldoc.getElementsByTagName('transfer')[0]
plateInfo = transfer.getElementsByTagName('plateInfo')[0]
With minidom you can access the attributes of a particular element through its attributes property, which can then be treated like a dictionary; this example iterates over and prints the attributes of the element transfer[0]:
from xml.dom.minidom import parse, parseString
# Sample document held inline so the example runs without a file on disk.
xml_file='''<?xml version="1.0" encoding="UTF-8"?>
<transfer frmt="1" vtl="0" serial_number="E5XX-0822" date="2016-10-03 16:34:53.000" style="startstop">
<plateInfo>
<plate barcode="E0122326" name="384plate" type="source"/>
<plate barcode="A1234516" name="1536plateD" type="destination"/>
</plateInfo>
<printmap total="1387">
<w reason="" cf="13" aa="1.779" eo="299.798" tof="32.357" sv="1565.311" ct="1.627" ft="1.649" fc="88.226" memt="0.877" fldu="Percent" fld="DMSO" dy="0" dx="0" region="-1" tz="18989.481" gy="72468.649" gx="55070.768" avt="50" vt="50" vl="3.68" cvl="3.63" t="16:30:47.703" dc="0" dr="0" dn="A1" c="0" r="0" n="A1"/>
<w reason="" cf="13" aa="1.779" eo="299.798" tof="32.357" sv="1565.311" ct="1.627" ft="1.649" fc="88.226" memt="0.877" fldu="Percent" fld="DMSO" dy="0" dx="0" region="-1" tz="18989.481" gy="72468.649" gx="55070.768" avt="50" vt="50" vl="3.68" cvl="3.63" t="16:30:47.703" dc="0" dr="0" dn="A1" c="1" r="0" n="A2"/>
</printmap>
</transfer>'''
xmldoc = parseString(xml_file)
transfer = xmldoc.getElementsByTagName('transfer')

# .attributes maps each attribute name to an attribute node whose .name and
# .value hold the two halves of name="value".
attrs = transfer[0].attributes
for key in attrs.keys():
    # print() as a function, so the example also runs on Python 3.
    print(attrs[key].name, attrs[key].value)
you can find more information here:
http://www.diveintopython.net/xml_processing/attributes.html

lxml use namespace instead of ns0, ns1,

I have just started with lxml basics and I am stuck with namespaces: I need to generate an xml like this:
<CityModel
xmlns:bldg="http://www.opengis.net/citygml/building/2.0"
<cityObjectMember>
<bldg:Building>
<bldg:function>1000</bldg:function>
</bldg:Building>
</cityObjectMember>
</CityModel>
By using the following code:
from lxml import etree
cityModel = etree.Element("cityModel")
cityObject = etree.SubElement(cityModel, "cityObjectMember")
# In the "{...}name" form the braces hold the namespace URI and the rest is
# the element name, so this creates an element called "bldg" in the
# building.xsd namespace; with no prefix mapped to that URI the output below
# shows it as ns0.
bldg = etree.SubElement(cityObject, "{http://schemas.opengis.net/citygml/building/2.0/building.xsd}bldg")
# Here "bldg:" is taken as a namespace URI rather than a prefix, which is why
# the output below carries xmlns:ns1="bldg:".
function = etree.SubElement(bldg, "{bldg:}function")
function.text = "1000"
print etree.tostring(cityModel, pretty_print=True)
I get this:
<cityModel>
<cityObjectMember>
<ns0:bldg xmlns:ns0="http://schemas.opengis.net/citygml/building/2.0/building.xsd">
<ns1:function xmlns:ns1="bldg:">1000</ns1:function>
</ns0:bldg>
</cityObjectMember>
</cityModel>
which is quite different from what I want, and my software doesn't parse it.
How to get the correct xml?
from lxml import etree
# Namespace URI shared by every bldg:* element.
ns_bldg = "http://www.opengis.net/citygml/building/2.0"
# Prefix -> URI mapping declared on the root, so descendants in this
# namespace are serialized with the "bldg" prefix instead of ns0/ns1.
nsmap = {
    'bldg': ns_bldg,
}

cityModel = etree.Element("cityModel", nsmap=nsmap)
cityObject = etree.SubElement(cityModel, "cityObjectMember")
# Qualified names are written as "{namespace-uri}localname".
bldg = etree.SubElement(cityObject, "{%s}Building" % ns_bldg)
function = etree.SubElement(bldg, "{%s}function" % ns_bldg)
function.text = "1000"

# encoding="unicode" makes tostring() return text rather than bytes, so the
# Python 3 print() shows the XML itself instead of a b'...' literal.
print(etree.tostring(cityModel, pretty_print=True, encoding="unicode"))
prints
<cityModel xmlns:bldg="http://www.opengis.net/citygml/building/2.0">
<cityObjectMember>
<bldg:Building>
<bldg:function>1000</bldg:function>
</bldg:Building>
</cityObjectMember>
</cityModel>
See lxml.etree Tutorial - Namespaces.

Categories