How to create xml file with multiple element in python - python

I want to create an XML file using python
like this:
<?xml version="1.0" encoding="utf-8"?>
<vehicle id="m0">
<timestep pos="2.3000" angle="11.1766" lane="-250709918#7_0" speed="0.0000" time="8.0"/>
</vehicle>
<vehicle id="m1">
<timestep pos="2.3000" angle="11.1766" lane="-250709918#7_0" speed="0.0000" time="8.0"/>
</vehicle>
........
my code:
doc = xml.dom.minidom.Document()
root = doc.createElement('vehicle')
for veh in veh_dict:
root.setAttribute('id', veh)
doc.appendChild(root)
for index, value in enumerate(veh_dict[veh]):
nodeManager = doc.createElement('timestep')
nodeManager.setAttribute('time', str(veh_dict[veh][index]['time']))
nodeManager.setAttribute('angle', str(veh_dict[veh][index]['angle']))
nodeManager.setAttribute('lane', str(veh_dict[veh][index]['lane']))
nodeManager.setAttribute(' pos', str(veh_dict[veh][index]['pos']))
nodeManager.setAttribute('speed', str(veh_dict[veh][index]['speed']))
nodeManager.setAttribute('type', str(veh_dict[veh][index]['type']))
nodeManager.setAttribute('x', str(veh_dict[veh][index]['x']))
nodeManager.setAttribute('y', str(veh_dict[veh][index]['y']))
root.appendChild(nodeManager)
fp = open('Manager.xml', 'w')
doc.writexml(fp, indent='\t', addindent='\t', newl='\n', encoding="utf-8")
My output has all the data, but it is all written into a single 'vehicle' element,
like this:
<vehicle id="m2.9">
<timestep pos="2.3000" angle="11.1766" lane="-250709918#7_0" speed="0.0000" time="8.0" type="custom_moto" x="469.2605" y="5896.8761"/>
<timestep pos="3.3001" angle="12.9664" lane="-250709918#7_0" speed="1.0001" time="9.0" type="custom_moto" x="470.1134" y="5907.0132"/>
<timestep pos="6.4467" angle="12.2144" lane="-250709918#7_0" speed="3.1466" time="10.0" type="custom_moto" x="470.849" y="5900.3489"/>
<timestep pos="12.7147" angle="11.8696" lane="-250709918#7_0" speed="6.2681" time="11.0"
.......
Is the root always being overwritten?
How can I solve it?

Add the root element inside the loop:
import xml.dom.minidom

# Attributes copied from every timestep record onto its <timestep> element.
timestep_attrs = ('time', 'angle', 'lane', 'pos', 'speed', 'type', 'x', 'y')

doc = xml.dom.minidom.Document()

# A document may only have ONE document element, so every <vehicle> is
# attached to this single top-level <vehicles> element instead of to `doc`.
topElem = doc.createElement('vehicles')
doc.appendChild(topElem)

for veh in veh_dict:
    # One fresh <vehicle> per key; reusing a single element would only
    # overwrite its id and pile every timestep into the same node.
    root = doc.createElement('vehicle')
    root.setAttribute('id', veh)
    topElem.appendChild(root)

    # Each record of this vehicle becomes one <timestep> child.
    for step in veh_dict[veh]:
        nodeManager = doc.createElement('timestep')
        for attr in timestep_attrs:
            nodeManager.setAttribute(attr, str(step[attr]))
        root.appendChild(nodeManager)

# The context manager closes (and flushes) the file; the explicit encoding
# keeps the bytes on disk consistent with the XML declaration.
with open('Manager.xml', 'w', encoding='utf-8') as fp:
    doc.writexml(fp, indent='\t', addindent='\t', newl='\n', encoding="utf-8")

Consider using a top-level root above <vehicle> elements as required for well-formed XML documents. Also, avoid the repetitious lines and use the inner dictionary keys as the iterator variable. Finally, use context manager, with, to write built XML to file.
import xml.dom.minidom

# LIST OF DICTS
veh_dicts = [{'x': '469.2605', 'y': '5896.8761', 'time': 8.0, 'lane': '-250709918#7_0',
              'angle': '11.1766', 'pos': '2.3000', 'speed': '0.0000', 'type': 'custom_moto'},
             {'x': '470.1134', 'y': '5907.0132', 'time': 9.0, 'lane': '-250709918#7_0',
              'angle': '12.9664', 'pos': '3.3001', 'speed': '1.0001', 'type': 'custom_moto'}]

doc = xml.dom.minidom.Document()
root = doc.createElement('vehicles')  # TOP-LEVEL ROOT
doc.appendChild(root)

# ITERATE THROUGH EACH DICT
for i, veh in enumerate(veh_dicts, start=1):
    vehicleElem = doc.createElement('vehicle')
    vehicleElem.setAttribute('id', f'm{i}')  # USES F-STRING (Python 3.6+)
    root.appendChild(vehicleElem)

    # ONE <timestep> PER DICT, EVERY KEY/VALUE PAIR BECOMES AN ATTRIBUTE
    nodeManager = doc.createElement('timestep')
    for k, v in veh.items():
        nodeManager.setAttribute(k, str(v))
    vehicleElem.appendChild(nodeManager)

# CONTEXT MANAGER (NO close() NEEDED); THE FILE ENCODING IS SET EXPLICITLY SO
# IT MATCHES THE encoding WRITTEN INTO THE XML DECLARATION
with open('MiniDomXMLBuild.xml', 'w', encoding='utf-8') as fp:
    doc.writexml(fp, addindent='\t', newl='\n', encoding="utf-8")
Output
<?xml version="1.0" encoding="utf-8"?>
<vehicles>
<vehicle id="m1">
<timestep angle="11.1766" lane="-250709918#7_0" pos="2.3000" speed="0.0000" time="8.0" type="custom_moto" x="469.2605" y="5896.8761"/>
</vehicle>
<vehicle id="m2">
<timestep angle="12.9664" lane="-250709918#7_0" pos="3.3001" speed="1.0001" time="9.0" type="custom_moto" x="470.1134" y="5907.0132"/>
</vehicle>
</vehicles>

Related

How to extract specific values from xml file using python xml.etree.ElementTree iterating until an id is found inside a hidden child node?

I need to iterate over the tag ObjectHeader and when the tag ObjectType/Id is equal to 1424 I need to extract all the values inside the following tags ObjectVariant/ObjectValue/Characteristic/Name and ObjectVariant/ObjectValue/PropertyValue/Value and put them in a dictionary. The expected output will be like this:
{"Var1": 10.4,
"Var2": 15.6}
Here is a snippet from the XML that I'm working with which has 30k lines (Hint: Id 1424 only appears once in the whole XML file).
<ObjectContext>
<ObjectHeader>
<ObjectType>
<Id>1278</Id>
<Name>ID_NAME</Name>
</ObjectType>
<ObjectVariant>
<ObjectValue>
<Characteristic>
<Name>Var1</Name>
<Description>Something about the name</Description>
</Characteristic>
<PropertyValue>
<Value>10.6</Value>
<Description>Something about the value</Description>
</PropertyValue>
</ObjectValue>
</ObjectVariant>
</ObjectHeader>
<ObjectHeader>
<ObjectType>
<Id>1424</Id>
<Name>ID_NAME</Name>
</ObjectType>
<ObjectVariant>
<ObjectValue>
<Characteristic>
<Name>Var1</Name>
<Description>Something about the name</Description>
</Characteristic>
<PropertyValue>
<Value>10.4</Value>
<Description>Something about the value</Description>
</PropertyValue>
</ObjectValue>
<ObjectValue>
<Characteristic>
<Name>Var2</Name>
<CharacteristicType>Something about the name</CharacteristicType>
</Characteristic>
<PropertyValue>
<Value>15.6</Value>
<Description>Something about the value</Description>
</PropertyValue>
</ObjectValue>
</ObjectVariant>
</ObjectHeader>
</ObjectContext>
Here is one possibility: write everything to pandas and then filter the interesting values:
import pandas as pd
import xml.etree.ElementTree as ET

tree = ET.parse("xml_to_dict.xml")
root = tree.getroot()

columns = ["id", "name", "value"]
row_list = []

for objHead in root.findall('.//ObjectHeader'):
    # iter() walks the header in document order, so the most recently seen
    # <Id> and <Name> are the ones that belong to the <Value> that follows.
    for elem in objHead.iter():
        if elem.tag == 'Id':
            obj_id = elem.text  # not named `id`, which would shadow the builtin
        elif elem.tag == 'Name':
            name = elem.text
        elif elem.tag == 'Value':
            # A <Value> completes one (id, name, value) record.
            row_list.append((obj_id, name, elem.text))

df = pd.DataFrame(row_list, columns=columns)
# Keep only the rows of the object type we are interested in.
dff = df.query('id == "1424"')
print("Dictionary:", dict(zip(dff['name'], dff['value'])))
Output:
Dictionary: {'Var1': '10.4', 'Var2': '15.6'}

Python re.findall organize list

I have a text file with entries like this:
<soap:Envelope xmlns:soap="http://www.w3.org/2003/05/soap-envelope" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:xsd="http://www.w3.org/2001/XMLSchema">
<soap:Body>
<Applications_GetResponse xmlns="http://www.country.com">
<Applications>
<CS_Application>
<Name>Spain</Name>
<Key>2345364564</Key>
<Status>NORMAL</Status>
<Modules>
<CS_Module>
<Name>zaragoza</Name>
<Key>8743249725</Key>
<DevelopmentEffort>0</DevelopmentEffort>
<LogicalDBConnections/>
</CS_Module>
<CS_Module>
<Name>malaga</Name>
<Key>8743249725</Key>
<DevelopmentEffort>0</DevelopmentEffort>
<LogicalDBConnections/>
</CS_Module>
</Modules>
<CreatedBy>7</CreatedBy>
</CS_Application>
<CS_Application>
<Name>UK</Name>
<Key>2345364564</Key>
<Status>NORMAL</Status>
<Modules>
<CS_Module>
<Name>london</Name>
<Key>8743249725</Key>
<DevelopmentEffort>0</DevelopmentEffort>
<LogicalDBConnections/>
</CS_Module>
<CS_Module>
<Name>liverpool</Name>
<Key>8743249725</Key>
<DevelopmentEffort>0</DevelopmentEffort>
<LogicalDBConnections/>
</CS_Module>
</Modules>
<CreatedBy>7</CreatedBy>
</CS_Application>
</Applications>
</Applications_GetResponse>
</soap:Body>
</soap:Envelope>
I would like to analyze it and obtain the name of the country in the sequence of the cities.
I tried some things with Python's re.findall, but I didn't get anything like that:
print("HERE APPLICATIONS")
applications = re.findall('<CS_Application><Name>(.*?)</Name>', response_apply.text)
print(applications)
print("HERE MODULES")
modules = re.findall('<CS_Module><Name>(.*?)</Name>', response_apply.text)
print(modules)
return:
host-10$ sudo python3 capture.py
HERE APPLICATIONS
['Spain', 'UK']
HERE MODULES
['zaragoza', 'malaga', 'london', 'liverpool']
The expected result is, I would like the result to be like this:
HERE
The Country: Spain - Cities: zaragoza,malaga
The Country: UK - Cities: london,liverpool
Regex is not a good tool for parsing XML; it is better to use an XML parser.
If you still want a regex solution, I hope the code below helps you.
import re
s = """\n<soap:Envelope xmlns:soap="http://www.w3.org/2003/05/soap-envelope" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:xsd="http://www.w3.org/2001/XMLSchema">\n <soap:Body>\n <Applications_GetResponse xmlns="http://www.country.com">\n <Applications>\n <CS_Application>\n <Name>Spain</Name>\n <Key>2345364564</Key>\n <Status>NORMAL</Status>\n <Modules>\n <CS_Module>\n <Name>zaragoza</Name>\n <Key>8743249725</Key>\n <DevelopmentEffort>0</DevelopmentEffort>\n <LogicalDBConnections/>\n </CS_Module>\n <CS_Module>\n <Name>malaga</Name>\n <Key>8743249725</Key>\n <DevelopmentEffort>0</DevelopmentEffort>\n <LogicalDBConnections/>\n </CS_Module>\n </Modules>\n <CreatedBy>7</CreatedBy>\n </CS_Application>\n <CS_Application>\n <Name>UK</Name>\n <Key>2345364564</Key>\n <Status>NORMAL</Status>\n <Modules>\n <CS_Module>\n <Name>london</Name>\n <Key>8743249725</Key>\n <DevelopmentEffort>0</DevelopmentEffort>\n <LogicalDBConnections/>\n </CS_Module>\n <CS_Module>\n <Name>liverpool</Name>\n <Key>8743249725</Key>\n <DevelopmentEffort>0</DevelopmentEffort>\n <LogicalDBConnections/>\n </CS_Module>\n </Modules>\n <CreatedBy>7</CreatedBy>\n </CS_Application>\n </Applications>\n </Applications_GetResponse>\n </soap:Body>\n</soap:Envelope>\n"""
# One whole <CS_Application> element; [\s\S] also matches newlines and the
# lazy quantifier stops at the first closing tag.
pattern1 = re.compile(r'<CS_Application>([\s\S]*?)</CS_Application>')
# Lazy as well: a greedy group would run from the first <Name> to the LAST
# </Name> whenever several elements share a line (e.g. unformatted responses).
pattern2 = re.compile(r'<Name>(.*?)</Name>')

for m in pattern1.finditer(s):
    # The first <Name> in an application block is the country, the remaining
    # ones are the names of its modules (cities).
    res = pattern2.findall(m.group(1))
    if not res:
        continue  # nothing to report for an application without any <Name>
    print("The Country: " + res[0] + " - Cities: " + ",".join(res[1:]))

python lxml - loop/iterate through excel rows and save each row as one xml

The problem is that the 2nd XML file also contains the data from the first iteration (the first Excel row), and the third XML file contains all the data from the first and 2nd rows.
I have been working on this for hours and can't figure it out.
from lxml import etree
import openpyxl
# Create root element with namespace information
xmlns = "http://xml.datev.de/bedi/tps/ledger/v040"
xsi = "http://www.w3.org/2001/XMLSchema-instance"
schemaLocation = "http://xml.datev.de/bedi/tps/ledger/v040 Belegverwaltung_online_ledger_import_v040.xsd"
version = "4.0"
generator_info = "DATEV Musterdaten"
generating_system = "DATEV manuell"
xmlRoot = etree.Element(
"{" + xmlns + "}LedgerImport",
version=version,
attrib={"{" + xsi + "}schemaLocation": schemaLocation},
generator_info=generator_info,
generating_system=generating_system,
nsmap={'xsi': xsi, None: xmlns}
)
####open excel file speadsheet
wb = openpyxl.load_workbook('import_spendesk_datev.xlsx')
sheet = wb['Import']
# build the xml tree
for i in range(2,6):
consolidate = etree.SubElement(xmlRoot, 'consolidate', attrib={'consolidatedAmount': str(sheet.cell(row=i,column=16).value),'consolidatedDate': str(sheet.cell(row=i,column=2).value), 'consolidatedInvoiceId': str(sheet.cell(row=i,column=13).value), 'consolidatedCurrencyCode': str(sheet.cell(row=i,column=12).value) })
accountsPayableLedger = etree.SubElement(consolidate, 'accountsPayableLedger')
account = etree.SubElement(accountsPayableLedger, 'bookingText')
account.text = sheet.cell(row=i,column=21).value
invoice = etree.SubElement(accountsPayableLedger, 'invoiceId')
invoice.text = sheet.cell(row=i,column=13).value
date = etree.SubElement(accountsPayableLedger, 'date')
date.text = sheet.cell(row=i,column=2).value
amount = etree.SubElement(accountsPayableLedger, 'amount')
amount.text = sheet.cell(row=i,column=16).value
account_no = etree.SubElement(accountsPayableLedger, 'accountNo')
account_no.text = sheet.cell(row=i,column=19).value
cost1 = etree.SubElement(accountsPayableLedger, 'costCategoryId')
cost1.text = sheet.cell(row=i,column=15).value
currency_code = etree.SubElement(accountsPayableLedger, 'currencyCode')
currency_code.text = sheet.cell(row=i,column=12).value
party_id = etree.SubElement(accountsPayableLedger, 'partyId')
party_id.text = sheet.cell(row=i,column=20).value
bpaccount = etree.SubElement(accountsPayableLedger, 'bpAccountNo')
bpaccount.text = sheet.cell(row=i,column=20).value
doc = etree.ElementTree(xmlRoot)
doc.write( str(sheet.cell(row=i,column=13).value)+".xml", xml_declaration=True, encoding='utf-8', pretty_print=True)
as described
this for every single excel row and for each row one .xml file
<?xml version='1.0' encoding='UTF-8'?>
<LedgerImport xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns="http://xml.datev.de/bedi/tps/ledger/v040" generating_system="DATEV manuell" generator_info="DATEV Musterdaten" version="4.0" xsi:schemaLocation="http://xml.datev.de/bedi/tps/ledger/v040 Belegverwaltung_online_ledger_import_v040.xsd">
<consolidate consolidatedAmount="1337.01">
<accountsPayableLedger>
<bookingText>amazon</bookingText>
<invoiceId>1</invoiceId>
</accountsPayableLedger>
</consolidate>
</LedgerImport>
The same xmlRoot object is reused several times. You need to create a new root element for each iteration in the for loop.
The code that creates the root element can be put in a function. Here is a simplified example:
from lxml import etree


def makeroot():
    """Return a brand-new, empty <LedgerImport> root element."""
    fresh_root = etree.Element("LedgerImport")
    return fresh_root


for i in range(2, 6):
    # Every output document starts from its own root, so nothing written
    # for an earlier row can show up in a later file.
    xmlRoot = makeroot()
    amount_attrs = {'consolidatedAmount': str(i)}
    consolidate = etree.SubElement(xmlRoot, 'consolidate', attrib=amount_attrs)
    doc = etree.ElementTree(xmlRoot)
    target = str(i) + ".xml"
    doc.write(target, xml_declaration=True, encoding='utf-8', pretty_print=True)
After @mzjn pointed out your basic mistake, here is a thing I made for fun - you can create nested XML with a declarative mapping, instead of laboriously calling etree.SubElement yourself.
Here is how. Assume this as the basic situation:
from lxml import etree
import openpyxl
ns = {
None: 'http://xml.datev.de/bedi/tps/ledger/v040',
'xsi': 'http://www.w3.org/2001/XMLSchema-instance',
}
mapping = {
'_tag': '{' + ns[None] + '}LedgerImport',
'attrib': {
'version': '4.0',
'{' + ns['xsi'] + '}schemaLocation': 'http://xml.datev.de/bedi/tps/ledger/v040 Belegverwaltung_online_ledger_import_v040.xsd',
'generator_info': 'DATEV Musterdaten',
'generating_system': 'DATEV manuell',
},
'nsmap': ns,
'_children': [{
'_tag': 'consolidate',
'attrib': {
'consolidatedAmount': lambda: sheet.cell(i, 16).value,
'consolidatedDate': lambda: sheet.cell(i, 2).value,
'consolidatedInvoiceId': lambda: sheet.cell(i, 13).value,
'consolidatedCurrencyCode': lambda: sheet.cell(i, 12).value,
},
'_children': [{
'_tag': 'accountsPayableLedger',
'_children': [
{'_tag': 'bookingText', '_text': lambda: sheet.cell(i, 21).value},
{'_tag': 'invoiceId', '_text': lambda: sheet.cell(i, 13).value},
{'_tag': 'date', '_text': lambda: sheet.cell(i, 2).value},
{'_tag': 'amount', '_text': lambda: sheet.cell(i, 16).value},
{'_tag': 'accountNo', '_text': lambda: sheet.cell(i, 19).value},
{'_tag': 'costCategoryId', '_text': lambda: sheet.cell(i, 15).value},
{'_tag': 'currencyCode', '_text': lambda: sheet.cell(i, 12).value},
{'_tag': 'partyId', '_text': lambda: sheet.cell(i, 20).value},
{'_tag': 'bpAccountNo', '_text': lambda: sheet.cell(i, 20).value},
]
}]
}],
}
The nested dict resembles your final XML document. Its keys also resemble the parameters that etree.Element() and etree.SubElement() take, with the addition of _text and _children.
Now we can define a single recursive helper function that takes this input tree and transforms it into a nested XML tree of the same configuration. As a bonus we can execute the lambda functions, which allows us to dynamically calculate attribute values and text:
def build_tree(template, parent=None):
    """Recursively build an lxml element tree from a nested template dict.

    The template is never modified, so the same mapping can be reused to
    build any number of documents.

    Args:
        template: dict whose keys are forwarded as keyword arguments to
            etree.Element()/etree.SubElement() (e.g. '_tag', 'attrib',
            'nsmap'), plus two pseudo-keys handled here: '_text' (node text)
            and '_children' (list of child templates). Attribute values and
            '_text' may be zero-argument callables; they are called on every
            build.
        parent: element to attach the new node to, or None to create a
            stand-alone root element.

    Returns:
        The element created for `template`, with all descendants attached.
    """
    # prepare a dict for calling etree.Element()/etree.SubElement()
    params = {k: v for k, v in template.items() if k not in ('_children', '_text')}

    # calculate any dynamic attribute values into a NEW dict: `params` is only
    # a shallow copy, so writing the results back in place would replace the
    # callables inside the template and freeze the first call's values.
    if 'attrib' in params:
        params['attrib'] = {
            name: str(value() if callable(value) else value)
            for name, value in params['attrib'].items()
        }

    if parent is None:
        node = etree.Element(**params)
    else:
        node = etree.SubElement(_parent=parent, **params)

    # calculate (if necessary) and set the node text
    if '_text' in template:
        text = template['_text']
        if callable(text):
            text = text()
        # None means "no text" (e.g. an empty cell); everything else, including
        # falsy values such as 0, is converted because lxml only accepts strings
        node.text = None if text is None else str(text)

    # recurse into children, if any
    for child in template.get('_children', []):
        build_tree(child, node)
    return node
We can call this in a loop:
wb = openpyxl.load_workbook('import_spendesk_datev.xlsx')
sheet = wb['Import']
for i in range(2,6):
root = build_tree(mapping)
doc = etree.ElementTree(root)
name = "%s.xml" % sheet.cell(i, 13).value
doc.write(name, xml_declaration=True, encoding='utf-8', pretty_print=True)
This should generate a couple of nicely nested XML documents, and it should be a lot easier to manage if your XML structure changes or gets more complicated.
Alternatively, consider XSLT, the special-purpose declarative language designed to transform XML files, which lxml does support. Specifically, pass parameters from Python to the stylesheet to transform a template XML (not unlike passing parameters to a prepared SQL statement):
XML template (includes all top-level namespaces)
<?xml version='1.0' encoding='UTF-8'?>
<LedgerImport xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xmlns="http://xml.datev.de/bedi/tps/ledger/v040"
generating_system="DATEV manuell"
generator_info="DATEV Musterdaten" version="4.0"
xsi:schemaLocation="http://xml.datev.de/bedi/tps/ledger/v040 Belegverwaltung_online_ledger_import_v040.xsd">
<consolidate consolidatedAmount="???">
<accountsPayableLedger>
<bookingText>???</bookingText>
<invoiceId>???</invoiceId>
<date>???</date>
<amount>???</amount>
<accountNo>???</accountNo>
<costCategoryId>???</costCategoryId>
<currencyCode>???</currencyCode>
<partyId>???</partyId>
<bpAccountNo>???</bpAccountNo>
</accountsPayableLedger>
</consolidate>
</LedgerImport>
XSLT (save as .xsl file, a little longer due to default namespace in XML)
<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
xmlns:doc="http://xml.datev.de/bedi/tps/ledger/v040">
<xsl:output indent="yes"/>
<xsl:strip-space elements="*"/>
<!-- INITIALIZE PARAMETERS -->
<xsl:param name="prm_consolidate" />
<xsl:param name="prm_bookingText" />
<xsl:param name="prm_invoiceId" />
<xsl:param name="prm_date" />
<xsl:param name="prm_amount" />
<xsl:param name="prm_accountNo" />
<xsl:param name="prm_costCategoryId" />
<xsl:param name="prm_currencyCode" />
<xsl:param name="prm_partyId" />
<xsl:param name="prm_bpAccountNo" />
<!-- IDENTITY TRANSFORM -->
<xsl:template match="@*|node()">
<xsl:copy>
<xsl:apply-templates select="@*|node()"/>
</xsl:copy>
</xsl:template>
<!-- REWRITE TITLE TEXT -->
<xsl:template match="doc:accountsPayableLedger">
<xsl:copy>
<xsl:element name="consolidate" namespace="http://xml.datev.de/bedi/tps/ledger/v040">
<xsl:attribute name="consolidatedAmount"><xsl:value-of select="$prm_consolidate"/></xsl:attribute>
</xsl:element>
<xsl:element name="bookingText" namespace="http://xml.datev.de/bedi/tps/ledger/v040"><xsl:value-of select="$prm_bookingText"/></xsl:element>
<xsl:element name="invoiceId" namespace="http://xml.datev.de/bedi/tps/ledger/v040"><xsl:value-of select="$prm_invoiceId"/></xsl:element>
<xsl:element name="date" namespace="http://xml.datev.de/bedi/tps/ledger/v040"><xsl:value-of select="$prm_date"/></xsl:element>
<xsl:element name="amount" namespace="http://xml.datev.de/bedi/tps/ledger/v040"><xsl:value-of select="$prm_amount"/></xsl:element>
<xsl:element name="accountNo" namespace="http://xml.datev.de/bedi/tps/ledger/v040"><xsl:value-of select="$prm_accountNo"/></xsl:element>
<xsl:element name="costCategoryId" namespace="http://xml.datev.de/bedi/tps/ledger/v040"><xsl:value-of select="$prm_costCategoryId"/></xsl:element>
<xsl:element name="currencyCode" namespace="http://xml.datev.de/bedi/tps/ledger/v040"><xsl:value-of select="$prm_currencyCode"/></xsl:element>
<xsl:element name="partyId" namespace="http://xml.datev.de/bedi/tps/ledger/v040"><xsl:value-of select="$prm_partyId"/></xsl:element>
<xsl:element name="bpAccountNo" namespace="http://xml.datev.de/bedi/tps/ledger/v040"><xsl:value-of select="$prm_bpAccountNo"/></xsl:element>
</xsl:copy>
</xsl:template>
</xsl:stylesheet>
Python (no DOM element building)
import lxml.etree as et
import openpyxl

# LOAD XML AND XSL
xml = et.parse('/path/to/Template.xml')
xsl = et.parse('/path/to/XSLTScript.xsl')

# COMPILE THE STYLESHEET ONCE; THE TRANSFORMER IS REUSED FOR EVERY ROW
transform = et.XSLT(xsl)

### OPEN EXCEL SPREADSHEET
wb = openpyxl.load_workbook('import_spendesk_datev.xlsx')
sheet = wb['Import']

# STYLESHEET PARAMETER -> SPREADSHEET COLUMN
# (keys must match the <xsl:param name="..."/> declarations exactly)
param_columns = {
    'prm_consolidate': 16,
    'prm_bookingText': 21,
    'prm_invoiceId': 13,
    'prm_date': 2,
    'prm_amount': 16,
    'prm_accountNo': 19,
    'prm_costCategoryId': 15,
    'prm_currencyCode': 12,
    'prm_partyId': 20,
    'prm_bpAccountNo': 20,
}

# LOOP THROUGH ROWS
for i in range(2, 6):
    # strparam() only accepts strings, while cells may hold numbers or dates,
    # so every value is converted with str() before being quoted for XSLT
    params = {
        name: et.XSLT.strparam(str(sheet.cell(row=i, column=column).value))
        for name, column in param_columns.items()
    }

    # PASS PARAMETERS TO XSLT
    result = transform(xml, **params)

    # SAVE XML TO FILE
    with open('/path/to/Output_Row{}.xml'.format(i), 'wb') as f:
        f.write(result)

Converting an xml doc into a specific dot-expanded json structure

I have the following XML document:
<Item ID="288917">
<Main>
<Platform>iTunes</Platform>
<PlatformID>353736518</PlatformID>
</Main>
<Genres>
<Genre FacebookID="6003161475030">Comedy</Genre>
<Genre FacebookID="6003172932634">TV-Show</Genre>
</Genres>
<Products>
<Product Country="CA">
<URL>https://itunes.apple.com/ca/tv-season/id353187108?i=353736518</URL>
<Offers>
<Offer Type="HDBUY">
<Price>3.49</Price>
<Currency>CAD</Currency>
</Offer>
<Offer Type="SDBUY">
<Price>2.49</Price>
<Currency>CAD</Currency>
</Offer>
</Offers>
</Product>
<Product Country="FR">
<URL>https://itunes.apple.com/fr/tv-season/id353187108?i=353736518</URL>
<Rating>Tout public</Rating>
<Offers>
<Offer Type="HDBUY">
<Price>2.49</Price>
<Currency>EUR</Currency>
</Offer>
<Offer Type="SDBUY">
<Price>1.99</Price>
<Currency>EUR</Currency>
</Offer>
</Offers>
</Product>
</Products>
</Item>
Currently, to get it into json format I'm doing the following:
parser = etree.XMLParser(recover=True)
node = etree.fromstring(s, parser=parser)
data = xmltodict.parse(etree.tostring(node))
Of course the xmltodict is doing the heavy lifting. However, it gives me a format that is not ideal for what I'm trying to accomplish. Here is what I'd like the end data to look like:
{
"Item[#ID]": 288917, # if no preceding element, use the root node tag
"Main.Platform": "iTunes",
"Main.PlatformID": "353736518",
"Genres.Genre": ["Comedy", "TV-Show"] # list of elements if repeated
"Genres.Genre[#FacebookID]": ["6003161475030", "6003161475030"],
"Products.Product[#Country]": ["CA", "FR"],
"Products.Product.URL": ["https://itunes.apple.com/ca/tv-season/id353187108?i=353736518", "https://itunes.apple.com/fr/tv-season/id353187108?i=353736518"],
"Products.Product.Offers.Offer[#Type]": ["HDBUY", "SDBUY", "HDBUY", "SDBUY"],
"Products.Product.Offers.Offer.Price": ["3.49", "2.49", "2.49", "1.99"],
"Products.Product.Offers.Offer.Currency": "EUR"
}
This is a bit verbose, but it wasn't too hard to format this as a flat dict. Here is an example:
node = etree.fromstring(file_data.encode('utf-8'), parser=parser)
data = OrderedDict()
nodes = [(node, ''),] # format is (node, prefix)
while nodes:
for sub, prefix in nodes:
# remove the prefix tag unless its for the first attribute
tag_prefix = '.'.join(prefix.split('.')[1:]) if ('.' in prefix) else ''
atr_prefix = sub.tag if (sub == node) else tag_prefix
# tag
if sub.text.strip():
_prefix = tag_prefix + '.' + sub.tag
_value = sub.text.strip()
if data.get(_prefix): # convert it to a list if multiple values
if not isinstance(data[_prefix], list): data[_prefix] = [data[_prefix],]
data[_prefix].append(_value)
else:
data[_prefix] = _value
# atr
for k, v in sub.attrib.items():
_prefix = atr_prefix + '[#%s]' % k
_value = v
if data.get(_prefix): # convert it to a list if multiple values
if not isinstance(data[_prefix], list): data[_prefix] = [data[_prefix],]
data[_prefix].append(_value)
else:
data[_prefix] = _value
nodes.remove((sub, prefix))
for s in sub.getchildren():
_prefix = (prefix + '.' + sub.tag).strip('.')
nodes.append((s, _prefix))
if not nodes: break
You can use recursion here. One way is to store the paths progressively as you recurse the XML document, and return a result dictionary at the end, which can be serialized to JSON.
The below demo uses the standard library xml.etree.ElementTree for parsing XML documents.
Demo:
from xml.etree.ElementTree import ElementTree
from pprint import pprint
# Setup XML tree for parsing
tree = ElementTree()
tree.parse("sample.xml")
root = tree.getroot()
def collect_xml_paths(root, path=None, result=None):
    """Collect XML paths into a dictionary.

    Keys are dot-separated tag paths below ``root`` ("Main.Platform");
    attributes use the owning element's path plus "[#name]". Attributes of
    ``root`` itself are keyed by the root tag ("Item[#ID]").

    Args:
        root: element whose descendants are collected.
        path: list of tag names to prefix to every key (default: no prefix).
        result: optional dict to accumulate into; a fresh dict is used for
            every call when omitted, so calls never see each other's data.

    Returns:
        A new dict mapping each key to a single string when it occurred
        once, or to a list of strings (document order) when repeated.
    """
    path = [] if path is None else list(path)
    if result is None:
        result = {}

    # First collect root items (only when starting from an empty result)
    if not result:
        for name, value in root.attrib.items():
            result.setdefault(root.tag + "[#%s]" % name, []).append(value)

    _collect_child_paths(root, path, result)

    # Separate single values from list values
    return {
        k: v[0] if isinstance(v, list) and len(v) == 1 else v
        for k, v in result.items()
    }


def _collect_child_paths(node, path, result):
    """Append the attributes and text of every descendant of node to result."""
    for child in node:
        # Elements without text content have text=None
        text = (child.text or "").strip()
        # Build a new list so sibling subtrees never share a path object
        new_path = path + [child.tag]
        key = ".".join(new_path)
        # Add each attribute to result
        for k, v in child.attrib.items():
            result.setdefault(key + "[#%s]" % k, []).append(v)
        # Add text if it exists
        if text:
            result.setdefault(key, []).append(text)
        # Recurse with the SAME accumulator, passed explicitly
        _collect_child_paths(child, new_path, result)
pprint(collect_xml_paths(root))
Output:
{'Genres.Genre': ['Comedy', 'TV-Show'],
'Genres.Genre[#FacebookID]': ['6003161475030', '6003172932634'],
'Item[#ID]': '288917',
'Main.Platform': 'iTunes',
'Main.PlatformID': '353736518',
'Products.Product.Offers.Offer.Currency': ['CAD', 'CAD', 'EUR', 'EUR'],
'Products.Product.Offers.Offer.Price': ['3.49', '2.49', '2.49', '1.99'],
'Products.Product.Offers.Offer[#Type]': ['HDBUY', 'SDBUY', 'HDBUY', 'SDBUY'],
'Products.Product.Rating': 'Tout public',
'Products.Product.URL': ['https://itunes.apple.com/ca/tv-season/id353187108?i=353736518',
'https://itunes.apple.com/fr/tv-season/id353187108?i=353736518'],
'Products.Product[#Country]': ['CA', 'FR']}
If you want to serialize this dictionary to JSON, you can use json.dumps():
from json import dumps
print(dumps(collect_xml_paths(root)))
# {"Item[#ID]": "288917", "Main.Platform": "iTunes", "Main.PlatformID": "353736518", "Genres.Genre[#FacebookID]": ["6003161475030", "6003172932634"], "Genres.Genre": ["Comedy", "TV-Show"], "Products.Product[#Country]": ["CA", "FR"], "Products.Product.URL": ["https://itunes.apple.com/ca/tv-season/id353187108?i=353736518", "https://itunes.apple.com/fr/tv-season/id353187108?i=353736518"], "Products.Product.Offers.Offer[#Type]": ["HDBUY", "SDBUY", "HDBUY", "SDBUY"], "Products.Product.Offers.Offer.Price": ["3.49", "2.49", "2.49", "1.99"], "Products.Product.Offers.Offer.Currency": ["CAD", "CAD", "EUR", "EUR"], "Products.Product.Rating": "Tout public"}

Convert XML into dictionary

I need to convert XML file into the dictionary (later on it will be converted into JSON).
A sample of XML script looks like:
<?xml version="1.0" encoding="UTF-8"?>
<osm version="0.6" generator="Overpass API 0.7.55.3 9da5e7ae">
<note>The data included in this document is from www.openstreetmap.org. The data is made available under ODbL.</note>
<meta osm_base="2018-06-17T15:31:02Z"/>
...
<node id="2188497873" lat="52.5053306" lon="13.4360114">
<tag k="alt_name" v="Spreebalkon"/>
<tag k="name" v="Brommybalkon"/>
<tag k="tourism" v="viewpoint"/>
<tag k="wheelchair" v="yes"/>
</node>
...
</osm>
With the simple code I have already filtered all the values that I needed for my dictionary:
Code
import xml.etree.ElementTree as ET
input_file = r"D:\berlin\trial_xml\berlin_viewpoint_locations.xml"
tree = ET.parse(input_file)
root = tree.getroot()
lst1 = tree.findall("./node")
for item1 in lst1:
print('id:',item1.get('id'))
print('lat:',item1.get('lat'))
print('lon:',item1.get('lon'))
for item1_tags_and_nd in item1.iter('tag'):
print(item1_tags_and_nd.get('k') + ":", item1_tags_and_nd.get('v'))
Result
id: 2188497873
lat: 52.5053306
lon: 13.4360114
alt_name: Spreebalkon
name: Brommybalkon
tourism: viewpoint
wheelchair: yes
Can you help me, please to append properly and efficiently these values into a dictionary?
I want it to look like:
{'id': '2188497873', 'lat': 52.5053306, 'lon': 13.4360114, 'alt_name': 'Spreebalkon', 'name': 'Brommybalkon', 'tourism': 'viewpoint', 'wheelchair': 'yes'}
I have tried with
dictionary = {}
dictionary['id'] = []
dictionary['lat'] = []
dictionary['lon'] = []
lst1 = tree.findall("./node")
for item1 in lst1:
dictionary['id'].append(item1.get('id'))
dictionary['lat'].append(item1.get('lat'))
dictionary['lon'].append(item1.get('lon'))
for item1_tags_and_nd in item1.iter('tag'):
dictionary[item1_tags_and_nd.get('k')] = item1_tags_and_nd.get('v')
but it does not work so far.
I suggest you construct a list of dicts, instead of a dict of lists like:
result_list = []
for item in tree.findall("./node"):
dictionary = {}
dictionary['id'] = item.get('id')
dictionary['lat'] = item.get('lat')
dictionary['lon'] = item.get('lon')
result_list.append(dictionary)
Or as a couple of comprehensions like:
result_list = [{k: item.get(k) for k in ('id', 'lat', 'lon')}
for item in tree.findall("./node")]
And for the nested case:
result_list = [{k: (item.get(k) if k != 'tags' else
{i.get('k'): i.get('v') for i in item.iter('tag')})
for k in ('id', 'lat', 'lon', 'tags')}
for item in tree.findall("./node")]
Results:
{
'id': '2188497873',
'lat': '52.5053306',
'lon': '13.4360114',
'tags': {
'alt_name': 'Spreebalkon',
'name': 'Brommybalkon',
'tourism': 'viewpoint',
'wheelchair': 'yes'
}
}

Categories