Converting an xml doc into a specific dot-expanded json structure - python

I have the following XML document:
<Item ID="288917">
<Genre FacebookID="6003161475030">Comedy</Genre>
<Genre FacebookID="6003172932634">TV-Show</Genre>
<Product Country="CA">
<Offer Type="HDBUY">
<Offer Type="SDBUY">
<Product Country="FR">
<Rating>Tout public</Rating>
<Offer Type="HDBUY">
<Offer Type="SDBUY">
Currently, to get it into json format I'm doing the following:
parser = etree.XMLParser(recover=True)
node = etree.fromstring(s, parser=parser)
data = xmltodict.parse(etree.tostring(node))
Of course the xmltodict is doing the heavy lifting. However, it gives me a format that is not ideal for what I'm trying to accomplish. Here is what I'd like the end data to look like:
"Item[#ID]": 288917, # if no preceding element, use the root node tag
"Main.Platform": "iTunes",
"Main.PlatformID": "353736518",
"Genres.Genre": ["Comedy", "TV-Show"] # list of elements if repeated
"Genres.Genre[#FacebookID]": ["6003161475030", "6003161475030"],
"Products.Product[#Country]": ["CA", "FR"],
"Products.Product.URL": ["", ""],
"Products.Product.Offers.Offer[#Type]": ["HDBUY", "SDBUY", "HDBUY", "SDBUY"],
"Products.Product.Offers.Offer.Price": ["3.49", "2.49", "2.49", "1.99"],
"Products.Product.Offers.Offer.Currency": "EUR"

This is a bit verbose, but it wasn't too hard to format this as a flat dict. Here is an example:
node = etree.fromstring(file_data.encode('utf-8'), parser=parser)
data = OrderedDict()


def _store(key, value):
    """Record value under key; a repeated key grows into a list of values."""
    if key not in data:
        data[key] = value
    elif isinstance(data[key], list):
        data[key].append(value)
    else:
        data[key] = [data[key], value]


# Work queue of (node, prefix) pairs, where prefix is the dot-joined tags of
# the node's ancestors. Items are taken from the front instead of removing
# entries from the list while a for-loop is iterating over it, which made the
# loop skip every other pending node.
nodes = [(node, '')]
while nodes:
    sub, prefix = nodes.pop(0)

    # remove the prefix tag unless its for the first attribute
    tag_prefix = '.'.join(prefix.split('.')[1:]) if ('.' in prefix) else ''
    # NOTE(review): for non-root elements the attribute key is built from the
    # ancestors' path only (the element's own tag is not included) -- confirm
    # that this matches the intended "Genres.Genre[#FacebookID]" shape.
    atr_prefix = sub.tag if (sub == node) else tag_prefix

    # tag: elements without text have sub.text == None, so guard before strip()
    text = (sub.text or '').strip()
    if text:
        _store(tag_prefix + '.' + sub.tag, text)

    # atr
    for k, v in sub.attrib.items():
        _store(atr_prefix + '[#%s]' % k, v)

    # queue the children with this element's tag added to their prefix
    child_prefix = (prefix + '.' + sub.tag).strip('.')
    for s in sub:
        nodes.append((s, child_prefix))

You can use recursion here. One way is to store the paths progressively as you recurse through the XML document, and return a result dictionary at the end, which can be serialized to JSON.
The below demo uses the standard library xml.etree.ElementTree for parsing XML documents.
from xml.etree.ElementTree import ElementTree
from pprint import pprint
# Setup XML tree for parsing
# An ElementTree created without an element or file has no root yet
# (getroot() would return None), so the document must be parsed first.
tree = ElementTree()
tree.parse("file.xml")  # NOTE(review): placeholder path -- point this at the real XML document
root = tree.getroot()
def collect_xml_paths(root, path=None, result=None):
    """Collect XML paths into a dictionary.

    Walks every descendant of ``root`` and builds a flat mapping:

    * element text is stored under the dot-joined tags leading to the element
      (the root tag itself is not part of the path), e.g. ``"Genres.Genre"``;
    * attributes are stored under ``<path>[#<name>]``, e.g.
      ``"Genres.Genre[#FacebookID]"``;
    * attributes of ``root`` are stored under ``<root tag>[#<name>]``, but only
      when ``result`` is still empty (i.e. for the outermost call).

    Args:
        root: Element whose subtree is collected.
        path: Tags already walked to reach ``root``; defaults to an empty path.
        result: Accumulator mapping each key to a list of values. It is
            updated in place when supplied; a fresh dict is used otherwise, so
            separate calls never share state.

    Returns:
        A new dict in which keys seen once map to the single value and keys
        seen several times map to the list of values in document order.
    """
    path = [] if path is None else path
    if result is None:
        result = {}

    # First collect root items (every attribute; none is fine too)
    if not result:
        for name, value in root.attrib.items():
            result.setdefault(root.tag + "[#%s]" % name, []).append(value)

    _collect_children(root, path, result)

    # Separate single values from list values
    return {
        k: v[0] if isinstance(v, list) and len(v) == 1 else v
        for k, v in result.items()
    }


def _collect_children(node, path, result):
    """Add the text and attributes of every descendant of node to result."""
    for child in node:
        # child.text is None for elements without text (e.g. pure containers)
        text = (child.text or "").strip()

        # Extend a copy of the path so siblings do not see each other's tags
        new_path = path + [child.tag]
        key = ".".join(new_path)

        for k, v in child.attrib.items():
            result.setdefault(key + "[#%s]" % k, []).append(v)

        if text:
            result.setdefault(key, []).append(text)

        _collect_children(child, new_path, result)
{'Genres.Genre': ['Comedy', 'TV-Show'],
'Genres.Genre[#FacebookID]': ['6003161475030', '6003172932634'],
'Item[#ID]': '288917',
'Main.Platform': 'iTunes',
'Main.PlatformID': '353736518',
'Products.Product.Offers.Offer.Currency': ['CAD', 'CAD', 'EUR', 'EUR'],
'Products.Product.Offers.Offer.Price': ['3.49', '2.49', '2.49', '1.99'],
'Products.Product.Offers.Offer[#Type]': ['HDBUY', 'SDBUY', 'HDBUY', 'SDBUY'],
'Products.Product.Rating': 'Tout public',
'Products.Product.URL': ['',
'Products.Product[#Country]': ['CA', 'FR']}
If you want to serialize this dictionary to JSON, you can use json.dumps():
from json import dumps
# {"Item[#ID]": "288917", "Main.Platform": "iTunes", "Main.PlatformID": "353736518", "Genres.Genre[#FacebookID]": ["6003161475030", "6003172932634"], "Genres.Genre": ["Comedy", "TV-Show"], "Products.Product[#Country]": ["CA", "FR"], "Products.Product.URL": ["", ""], "Products.Product.Offers.Offer[#Type]": ["HDBUY", "SDBUY", "HDBUY", "SDBUY"], "Products.Product.Offers.Offer.Price": ["3.49", "2.49", "2.49", "1.99"], "Products.Product.Offers.Offer.Currency": ["CAD", "CAD", "EUR", "EUR"], "Products.Product.Rating": "Tout public"}


How to extract specific values from an XML file using Python's xml.etree.ElementTree, iterating until an id is found inside a hidden child node?

I need to iterate over the tag ObjectHeader and when the tag ObjectType/Id is equal to 1424 I need to extract all the values inside the following tags ObjectVariant/ObjectValue/Characteristic/Name and ObjectVariant/ObjectValue/PropertyValue/Value and put them in a dictionary. The expected output will be like this:
{"Var1": 10.4,
"Var2": 15.6}
Here is a snippet from the XML that I'm working with which has 30k lines (Hint: Id 1424 only appears once in the whole XML file).
<Description>Something about the name</Description>
<Description>Something about the value</Description>
<Description>Something about the name</Description>
<Description>Something about the value</Description>
<CharacteristicType>Something about the name</CharacteristicType>
<Description>Something about the value</Description>
Here is one possibility: write everything to a pandas DataFrame and then filter for the interesting values:
import pandas as pd
import xml.etree.ElementTree as ET
tree = ET.parse("xml_to_dict.xml")
root = tree.getroot()

columns = ["id", "name", "value"]
row_list = []

# Walk each ObjectHeader in document order, remembering the most recent Id and
# Name; every Value element then closes one (id, name, value) row.
for objHead in root.findall('.//ObjectHeader'):
    obj_id = name = None  # do not carry values over from the previous header
    for elem in objHead.iter():
        if elem.tag == 'Id':
            obj_id = elem.text
        elif elem.tag == 'Name':
            name = elem.text
        elif elem.tag == 'Value':
            row_list.append((obj_id, name, elem.text))

df = pd.DataFrame(row_list, columns=columns)
# Element text is always a string, hence the quoted "1424" in the query
dff = df.query('id == "1424"')
print("Dictionary:", dict(zip(dff['name'], dff['value'])))
Dictionary: {'Var1': '10.4', 'Var2': '15.6'}

Transform Nested XML

I am currently looking to parse out a nested XML into a pandas Datatable so I can generate a CSV with each column being an element name and the value of that being the element text but I am having some issues parsing the information out. Below is an example of the nested XML and what I have tried.
The below XML can be quite large with hundreds of different records. This is what I tried:
##Import modules
import xml.etree.ElementTree as ET
import pandas as pd
from lxml import etree
tree = ET.parse("File.xml")
root = tree.getroot()
for subelement in root:
for subsub in subelement:
print(subsub.tag,",", subsub.text, subsub.attrib, subsub.items())
for subelement in root:
for subsub in subelement:
for subsubsub in subsub:
print(subsubsub.tag,",", subsubsub.text, subsubsub.attrib)
<?xml version="1.0" encoding="utf-16"?>
<test1 xmlns="test.xsd">
<test2 ID="123123123" test3="123123">
<test6 ID="123123">
<test7>123 street</test7>
<test10 ID="434234">
<test3>type of work</test3>
<test9>test work</test9>
<test12 ID="234234234">
<test12 ID="123123">
<test3>Something Here</test3>
<test13>Some date</test13>
<test16 ID="6456456456">
<test3>Something Something</test3>
<test2 ID="353453245" test3="list of something">
<Comments>Some comment</Comments>
<test6 ID="567456756">
<test3>Not today</test3>
<test17>Some Info</test17>
<test12 ID="456436346">
<test12 ID="4364356">
<test3> ID</test3>
<test12 ID="123123123443">
<test3>Other ID</test3>
<test16 ID="34252345">
<test3>None test</test3>
Update So would the full code look something like this?
with open("file.csv", "w", newline='') as fout:
header = ['test3','test4','test7','test9','test13','test14','test17','test18','test19','Comments']
csvout = csv.DictWriter(fout, fieldnames=header)
row = {}
for _, elem in ET.iterparse('file.xml'):
# strip the namespace from the element tag name; e.g. {Test.xsd}test14 > test14
tag = re.sub("^{.*?}", "", elem.tag)
if tag == 'test2':
if len(row) != 0:
row = {}
if len(elem) == 0:
text = elem.text
old = row.get(tag)
if old is None:
# first occurrence of the tag
row[tag] = text
elif isinstance(old, str):
# second occurrence of the tag
row[tag] = [old, text]
# already a list
For nested XML you can use the iterparse() function to iterate over all elements in the XML. You would then need logic that handles each element depending on its tag and adds it to a dictionary object to export as a row.
for _, elem in ET.iterparse('file.xml'):
    if len(elem) == 0:
        # leaf element (no children): its text is the interesting part
        print(f'{elem.tag} {elem.attrib} text={elem.text}')
    else:
        # container element: only tag and attributes are shown
        print(f'{elem.tag} {elem.attrib}')
To create a row in a CSV file from the element text, you can do something like this. If, for example, the "test2" element marks the beginning of a new record, then it can be used to write the record to a new row and clear the dictionary for the next record.
If you want to output all or some attributes, then you need to add a few lines of code for that. If attribute names are the same as element names, or multiple elements have the same attribute (e.g. ID), then you need to address that in your code.
import xml.etree.ElementTree as ET
import re
import csv
with open("out.csv", "w", newline='') as fout:
    header = ['test3','test4','test7','test9','test13','test14','test17','test18','test19','Comments']
    csvout = csv.DictWriter(fout, fieldnames=header)
    row = {}
    for _, elem in ET.iterparse('test.xml'):
        # strip the namespace from the element tag name; e.g. {Test.xsd}test14 > test14
        tag = re.sub("^{.*?}", "", elem.tag)
        if tag == 'test2':
            # iterparse reports an element once its end tag has been read, so
            # by now every leaf inside this test2 record has been collected:
            # write the record out before starting the next one.
            if len(row) != 0:
                print(row)
                csvout.writerow(row)
                row = {}
        elif len(elem) == 0:
            # leaf element: its text becomes the value of the column named
            # after the tag (a repeated tag overwrites the earlier value)
            row[tag] = elem.text
{'test3': 'Something Something', 'test4': 'AA', 'Comments': 'BB', 'test7': '123 street', 'test9': 'test work', 'test14': '746745636', 'test13': 'Some date'}
{'test3': 'None test', 'test4': 'Someone', 'Comments': 'Some comment', 'test7': '5634643643', 'test17': 'Some Info', 'test19': 'Somewhere', 'test18': '63243333', 'test14': '456436436346', 'test13': '54234532452345'}
CSV Output:
Something Something,AA,123 street,test work,Some date,746745636,,,,BB
None test,Someone,5634643643,,54234532452345,456436436346,Some Info,63243333,Somewhere,Some comment
If you want to handle duplicate tags and create a list of values, then try something like this:
if len(elem) == 0:
text = elem.text
old = row.get(tag)
if old is None:
# first occurrence
row[tag] = text
elif isinstance(old, str):
# second occurrence > create list
row[tag] = [old, text]

Convert XML into dictionary

I need to convert XML file into the dictionary (later on it will be converted into JSON).
A sample of XML script looks like:
<?xml version="1.0" encoding="UTF-8"?>
<osm version="0.6" generator="Overpass API 9da5e7ae">
<note>The data included in this document is from The data is made available under ODbL.</note>
<meta osm_base="2018-06-17T15:31:02Z"/>
<node id="2188497873" lat="52.5053306" lon="13.4360114">
<tag k="alt_name" v="Spreebalkon"/>
<tag k="name" v="Brommybalkon"/>
<tag k="tourism" v="viewpoint"/>
<tag k="wheelchair" v="yes"/>
With the simple code I have already filtered all the values that I needed for my dictionary:
import xml.etree.ElementTree as ET
input_file = r"D:\berlin\trial_xml\berlin_viewpoint_locations.xml"
tree = ET.parse(input_file)
root = tree.getroot()
lst1 = tree.findall("./node")
for item1 in lst1:
for item1_tags_and_nd in item1.iter('tag'):
print(item1_tags_and_nd.get('k') + ":", item1_tags_and_nd.get('v'))
id: 2188497873
lat: 52.5053306
lon: 13.4360114
alt_name: Spreebalkon
name: Brommybalkon
tourism: viewpoint
wheelchair: yes
Can you help me, please to append properly and efficiently these values into a dictionary?
I want it to look like:
{'id': '2188497873', 'lat': 52.5053306, 'lon': 13.4360114, 'alt_name': 'Spreebalkon', 'name': 'Brommybalkon', 'tourism': 'viewpoint', 'wheelchair': 'yes'}
I have tried with
dictionary = {}
dictionary['id'] = []
dictionary['lat'] = []
dictionary['lon'] = []
lst1 = tree.findall("./node")
for item1 in lst1:
for item1_tags_and_nd in item1.iter('tag'):
dictionary[item1_tags_and_nd.get('k')] = item1_tags_and_nd.get('v')
but it does not work so far.
I suggest you construct a list of dicts, instead of a dict of lists like:
result_list = []
for item in tree.findall("./node"):
    # one dict per <node> element, holding the attributes of interest
    dictionary = {}
    dictionary['id'] = item.get('id')
    dictionary['lat'] = item.get('lat')
    dictionary['lon'] = item.get('lon')
    # without this the per-node dict is thrown away and result_list stays empty
    result_list.append(dictionary)
Or as a couple of comprehensions like:
result_list = [{k: item.get(k) for k in ('id', 'lat', 'lon')}
for item in tree.findall("./node")]
And for the nested case:
result_list = [{k: (item.get(k) if k != 'tags' else
{i.get('k'): i.get('v') for i in item.iter('tag')})
for k in ('id', 'lat', 'lon', 'tags')}
for item in tree.findall("./node")]
'id': '2188497873',
'lat': '52.5053306',
'lon': '13.4360114',
'tags': {
'alt_name': 'Spreebalkon',
'name': 'Brommybalkon',
'tourism': 'viewpoint',
'wheelchair': 'yes'

How to get path of all elements in lxml with attribute

I have the following code:
tree = etree.ElementTree(new_xml)
for e in new_xml.iter():
print tree.getpath(e), e.text
This will give me something like the following:
/Item/Purchases/Purchase[1]/Rating R
/Item/Purchases/Purchase[2]/Rating R
However, I need to get the path not of the list element but of the attribute. Here is what the xml looks like:
<Purchase Country="US">
<Purchase Country="CA">
How would I get the following path instead?
/Item/Purchases/Purchase[#Country="US"]/Rating R
/Item/Purchases/Purchase[#Country="CA"]/Rating R
Not pretty, but it does the job.
# Maps the positional path of each Purchase element (".../Purchase[1]") to the
# same path with the index swapped for a Country predicate.
replacements = {}
for e in tree.iter():
    path = tree.getpath(e)
    if re.search(r'/Purchase\[\d+\]$', path):
        new_predicate = '[#Country="' + e.attrib['Country'] + '"]'
        new_path = re.sub(r'\[\d+\]$', new_predicate, path)
        replacements[path] = new_path
    # Rewrite the Purchase part of this element's path. Only a whole-step
    # prefix is replaced: a plain str.replace() would also hit "Purchase[10]"
    # when the key ends in "Purchase[1]".
    for key, replacement in replacements.items():
        if path == key or path.startswith(key + '/'):
            path = replacement + path[len(key):]
    # e.text is None for elements without text; the %-formatted single
    # argument prints the same under Python 2 and Python 3
    print('%s %s' % (path, (e.text or '').strip()))
prints this for me:
/Item/Purchases/Purchase[#Country="US"]/Rating R
/Item/Purchases/Purchase[#Country="CA"]/Rating R

ParseError parsing empty valued xml doc

There are lots of articles pertaining to parsing xml with elementtree. I've gone through a bunch of them and read through the docs but I can't come up with a solution that works for me. I'm trying to supplement info thats created by another app in a nfo file but i need to preserve the conventions in the file.
Here is an example of how the file is laid out
<name>Test Name</name>
<alt name />
<file local="C:\file\file1.doc" type="word">http://filestore/file1.doc</file>
<file local="" type="excel">http://filestore/file2.xls</file>
<file local="C:\file\file3.xls" type="excel" />
<file local="" type="ppt" />
Note: Elements are not closed properly e.g...
<alt name /> should be <alt name></alt name>
This is what I'm running...
import xml.etree.ElementTree as ET
tree = ET.parse('file.nfo')
root = tree.getroot()
The error I'm getting is...
xml.etree.ElementTree.ParseError: not well-formed (invalid token):
I've tried...
myparser = ET.XMLParser(encoding='UTF-8')
tree = ET.parse('file.nfo', myparser)
Also tried, xmlparser, opening with codecs but i'm pretty sure its the formatting. I'm guessing the immediate issue is non-escaped > but i suspect ET needs opening/closing?
I'm sure i could open this file and go through it with regex but i was hoping to use ElementTree.
The end goal is to have the details from the nfo as a dictionary that looks like...
dict = {'title': [{'name': 'Test Name',
'alt name': '',
'file': [{'local': 'C:\file\file1.doc', 'type': 'word', 'url': 'http://filestore/file1.doc'},
{'local': '', 'type': 'excel', 'url': 'http://filestore/file2.xls'},
{'local': 'C:\file\file3.xls', 'type': 'excel', 'url': ''},
{'local': '', 'type': 'ppt', 'url': ''}]
I'm sure there is a better (more pythonic) way to do this but I'm pretty new to python.
Any help would be appreciated
EDIT: I'm also trying to avoid using 3rd party libraries if possible
So I ended up creating a custom parser of sorts; it's not ideal, but it works. It was suggested to me that lxml and html.parser may parse malformed XML better, but I just went with this.
I'm also still very interested in any feedback whether it be on this or using any other method.
import re
def merge_dicts(*dict_args):
    """Return a new dict holding the entries of every dict in dict_args.

    The arguments are applied left to right, so on a key clash the value from
    the later dict wins. The input dicts are not modified; calling with no
    arguments returns an empty dict.
    """
    result = {}
    for dictionary in dict_args:
        result.update(dictionary)
    return result
def make_dict(str_arg, op):
    """Build a dict from a comma-separated string of key/value pairs.

    Each comma-separated piece is split on the first occurrence of ``op``
    only, so a value may itself contain ``op`` (e.g. a URL with ``=`` in its
    query string). Blank pieces are skipped, which makes an empty ``str_arg``
    yield an empty dict.

    Args:
        str_arg: Text such as ``"a=1,b=2"``.
        op: Separator between a key and its value, e.g. ``"="``.

    Returns:
        Dict mapping each key to its value; both are kept exactly as written
        (no stripping of quotes or whitespace).

    Raises:
        ValueError: If a non-blank piece does not contain ``op``.
    """
    pairs = (piece.split(op, 1) for piece in str_arg.split(",") if piece.strip())
    return dict(pairs)
lst = r' <name>Test Name</name>'
lst = r' <alt name />'
lst = r' <file local="C:\file\file1.doc" type="word">http://filestore/file1.doc</file>'
lst = r' <file local="" type="excel">http://filestore/file2.xls</file>'
lst = r' <file local="C:\file\file3.xls" type="excel" />'
lst = r' <file local="" type="ppt" />'
def match_pattern(file_str):
# Try four hand-written verbose regular expressions (pattern1..pattern4, in
# that order) against file_str with re.match and build return_dict according
# to which pattern number matched. The sample line each pattern targets is
# quoted in the comment directly above it.
# NOTE(review): this listing looks damaged by extraction -- indentation, the
# closing triple quote of each pattern and the call in front of each
# 'tag_...') fragment are missing. Restore them from the original before running.
#<description>desc blah</description>'
pattern1 = r'''(?x)
\s* # cut leading whitespace
< (?P<tag_open> (\w+?|\w*\s\w+?)+) \b # word boundary, so we can
> # skip attributes
(?P<tag_body> .+? ) # insides
</ (?P<tag_close> (\w+?|\w*\s\w+?)+) > # closing tag, nothing interesting
#<alt name />
pattern2 = r'''(?x)
< (?P<tag_open> (\w+?|\w*\s\w+?)+) \b
#<file local="C:\file\file1.doc" type="word">http://filestore/file1.doc</file>'
pattern3 = r'''(?x)
< (?P<tag_open> (\w+?|\w*\s\w+!=?)+) \b
(?P<tag_attrib1> (\w*\=.*?)) # 1st attribute
(?P<tag_attrib2> (\w*\=.*)) # 2nd attribute
.*? >
(?P<tag_body> .+? )
</ (?P<tag_close> (\w+?|\w*\s\w+?)+) >
#<file local="" type="ppt" />
pattern4 = r'''(?x)
< (?P<tag_open> (\w+?|\w*\s\w+!=?)+) \b
(?P<tag_attrib1> (\w*\=.*?)) # 1st attribute
(?P<tag_attrib2> (\w*\=.*)) # 2nd attribute
# The name of the pattern variable is assembled as text ('pattern' + number)
# and resolved with eval(); only these fixed names are ever evaluated here.
# NOTE(review): a tuple of the four patterns would make eval() unnecessary.
# NOTE(review): re.L (LOCALE) is combined with str patterns, which Python 3's
# re module rejects -- confirm the intended Python version or drop the flag.
pat_str = 'pattern'
pat_val = 1
return_dict = {}
while (pat_val <= 4):
pattern = pat_str+str(pat_val)
matchObj = re.match(eval(pattern), file_str, re.L|re.M)
if matchObj:
#for k, v in matchObj.groupdict().items():
# print('{!r}) == {!r}'.format(k, v))
# Pattern 1 builds a tag/body entry, pattern 2 a tag with an empty string;
# patterns 3 and 4 turn the two attribute groups into dicts with make_dict,
# add a 'url' entry (empty for pattern 4) and merge them with merge_dicts.
if pat_val == 1:
body ='tag_body')
return_dict = {'tag_open'): body}
elif pat_val == 2:
return_dict = {'tag_open'): ''}
elif pat_val == 3:
attr1 = make_dict('tag_attrib1'), '=')
attr2 = make_dict('tag_attrib2'), '=')
body = {'url':'tag_body')}
attrib = merge_dicts(attr1, attr2, body)
return_dict = {'tag_open'): attrib}
elif pat_val == 4:
attr1 = make_dict('tag_attrib1'), '=')
attr2 = make_dict('tag_attrib2'), '=')
body = {'url': ''}
attrib = merge_dicts(attr1, attr2, body)
return_dict = {'tag_open'): attrib}
return return_dict
pat_val = pat_val + 1
# pat_val only exceeds 4 after all four patterns have been tried
if pat_val > 4:
print("No match!!")
def in_file(file):
# Open the given path for reading, run the entries of data through
# match_pattern and merge the per-line dicts into result, which is returned.
# NOTE(review): this listing looks damaged by extraction -- indentation is
# gone and the data = ( expression is cut off, so how data is produced from
# the file (and the exact if/else nesting below) must be restored from the original.
result = {}
# the with-statement rebinds the name file from the path to the open handle
with open(file, "r") as file:
data = (
for d in data:
# data.index(d) gives the position of the first entry equal to d, so this
# singles out the first and last entries.
# NOTE(review): a repeated line would be attributed to its first position -- confirm.
if data.index(d) == 0 or data.index(d) == len(data)-1:
if data.index(d) == 0:
# first entry: printed with the characters <, / and > removed
print(re.sub('<|/|>', '', d))
elif d:
lst = []
dct = {}
# match_pattern(d) is re-evaluated at each use below rather than stored once
if 'file' in match_pattern(d).keys():
for i in match_pattern(d).items():
# 'file' values are kept as a list under result['file']
if 'file' in result.keys():
lst = result['file']
dct = {i[0]: lst}
result = merge_dicts(result, dct)
dct = {i[0]: [i[1]]}
result = merge_dicts(result, dct)
result = merge_dicts(result, match_pattern(d))
print('else', match_pattern(d))
return result
NOTE: I dropped the top most dictionary from the original post
