Convert XML file to dict in Python [duplicate]

Convert XML file to dict in Python [duplicate] - python

I have a program that reads an XML document from a socket. I have the XML document stored in a string which I would like to convert directly to a Python dictionary, the same way it is done in Django's simplejson library.
Take as an example:
str ="<?xml version="1.0" ?><person><name>john</name><age>20</age></person"
dic_xml = convert_to_dic(str)
Then dic_xml would look like {'person' : { 'name' : 'john', 'age' : 20 } }

xmltodict (full disclosure: I wrote it) does exactly that:
xmltodict.parse("""
<?xml version="1.0" ?>
<person>
<name>john</name>
<age>20</age>
</person>""")
# {u'person': {u'age': u'20', u'name': u'john'}}

This is a great module that someone created. I've used it several times.
http://code.activestate.com/recipes/410469-xml-as-dictionary/
Here is the code from the website just in case the link goes bad.
from xml.etree import cElementTree as ElementTree
class XmlListConfig(list):
def __init__(self, aList):
for element in aList:
if element:
# treat like dict
if len(element) == 1 or element[0].tag != element[1].tag:
self.append(XmlDictConfig(element))
# treat like list
elif element[0].tag == element[1].tag:
self.append(XmlListConfig(element))
elif element.text:
text = element.text.strip()
if text:
self.append(text)
class XmlDictConfig(dict):
'''
Example usage:
>>> tree = ElementTree.parse('your_file.xml')
>>> root = tree.getroot()
>>> xmldict = XmlDictConfig(root)
Or, if you want to use an XML string:
>>> root = ElementTree.XML(xml_string)
>>> xmldict = XmlDictConfig(root)
And then use xmldict for what it is... a dict.
'''
def __init__(self, parent_element):
if parent_element.items():
self.update(dict(parent_element.items()))
for element in parent_element:
if element:
# treat like dict - we assume that if the first two tags
# in a series are different, then they are all different.
if len(element) == 1 or element[0].tag != element[1].tag:
aDict = XmlDictConfig(element)
# treat like list - we assume that if the first two tags
# in a series are the same, then the rest are the same.
else:
# here, we put the list in dictionary; the key is the
# tag name the list elements all share in common, and
# the value is the list itself
aDict = {element[0].tag: XmlListConfig(element)}
# if the tag has attributes, add those to the dict
if element.items():
aDict.update(dict(element.items()))
self.update({element.tag: aDict})
# this assumes that if you've got an attribute in a tag,
# you won't be having any text. This may or may not be a
# good idea -- time will tell. It works for the way we are
# currently doing XML configuration files...
elif element.items():
self.update({element.tag: dict(element.items())})
# finally, if there are no child tags and no attributes, extract
# the text
else:
self.update({element.tag: element.text})
Example usage:
tree = ElementTree.parse('your_file.xml')
root = tree.getroot()
xmldict = XmlDictConfig(root)
//Or, if you want to use an XML string:
root = ElementTree.XML(xml_string)
xmldict = XmlDictConfig(root)

The following XML-to-Python-dict snippet parses entities as well as attributes following this XML-to-JSON "specification". It is the most general solution handling all cases of XML.
from collections import defaultdict
def etree_to_dict(t):
d = {t.tag: {} if t.attrib else None}
children = list(t)
if children:
dd = defaultdict(list)
for dc in map(etree_to_dict, children):
for k, v in dc.items():
dd[k].append(v)
d = {t.tag: {k:v[0] if len(v) == 1 else v for k, v in dd.items()}}
if t.attrib:
d[t.tag].update(('#' + k, v) for k, v in t.attrib.items())
if t.text:
text = t.text.strip()
if children or t.attrib:
if text:
d[t.tag]['#text'] = text
else:
d[t.tag] = text
return d
It is used:
from xml.etree import cElementTree as ET
e = ET.XML('''
<root>
<e />
<e>text</e>
<e name="value" />
<e name="value">text</e>
<e> <a>text</a> <b>text</b> </e>
<e> <a>text</a> <a>text</a> </e>
<e> text <a>text</a> </e>
</root>
''')
from pprint import pprint
pprint(etree_to_dict(e))
The output of this example (as per above-linked "specification") should be:
{'root': {'e': [None,
'text',
{'#name': 'value'},
{'#text': 'text', '#name': 'value'},
{'a': 'text', 'b': 'text'},
{'a': ['text', 'text']},
{'#text': 'text', 'a': 'text'}]}}
Not necessarily pretty, but it is unambiguous, and simpler XML inputs result in simpler JSON. :)
Update
If you want to do the reverse, emit an XML string from a JSON/dict, you can use:
try:
basestring
except NameError: # python3
basestring = str
def dict_to_etree(d):
def _to_etree(d, root):
if not d:
pass
elif isinstance(d, basestring):
root.text = d
elif isinstance(d, dict):
for k,v in d.items():
assert isinstance(k, basestring)
if k.startswith('#'):
assert k == '#text' and isinstance(v, basestring)
root.text = v
elif k.startswith('#'):
assert isinstance(v, basestring)
root.set(k[1:], v)
elif isinstance(v, list):
for e in v:
_to_etree(e, ET.SubElement(root, k))
else:
_to_etree(v, ET.SubElement(root, k))
else:
raise TypeError('invalid type: ' + str(type(d)))
assert isinstance(d, dict) and len(d) == 1
tag, body = next(iter(d.items()))
node = ET.Element(tag)
_to_etree(body, node)
return ET.tostring(node)
pprint(dict_to_etree(d))

This lightweight version, while not configurable, is pretty easy to tailor as needed, and works in old pythons. Also it is rigid - meaning the results are the same regardless of the existence of attributes.
import xml.etree.ElementTree as ET
from copy import copy
def dictify(r,root=True):
if root:
return {r.tag : dictify(r, False)}
d=copy(r.attrib)
if r.text:
d["_text"]=r.text
for x in r.findall("./*"):
if x.tag not in d:
d[x.tag]=[]
d[x.tag].append(dictify(x,False))
return d
So:
root = ET.fromstring("<erik><a x='1'>v</a><a y='2'>w</a></erik>")
dictify(root)
Results in:
{'erik': {'a': [{'x': '1', '_text': 'v'}, {'y': '2', '_text': 'w'}]}}

Disclaimer:
This modified XML parser was inspired by Adam Clark
The original XML parser works for most of simple cases. However, it didn't work for some complicated XML files. I debugged the code line by line and finally fixed some issues. If you find some bugs, please let me know. I am glad to fix it.
class XmlDictConfig(dict):
'''
Note: need to add a root into if no exising
Example usage:
>>> tree = ElementTree.parse('your_file.xml')
>>> root = tree.getroot()
>>> xmldict = XmlDictConfig(root)
Or, if you want to use an XML string:
>>> root = ElementTree.XML(xml_string)
>>> xmldict = XmlDictConfig(root)
And then use xmldict for what it is... a dict.
'''
def __init__(self, parent_element):
if parent_element.items():
self.updateShim( dict(parent_element.items()) )
for element in parent_element:
if len(element):
aDict = XmlDictConfig(element)
# if element.items():
# aDict.updateShim(dict(element.items()))
self.updateShim({element.tag: aDict})
elif element.items(): # items() is specialy for attribtes
elementattrib= element.items()
if element.text:
elementattrib.append((element.tag,element.text )) # add tag:text if there exist
self.updateShim({element.tag: dict(elementattrib)})
else:
self.updateShim({element.tag: element.text})
def updateShim (self, aDict ):
for key in aDict.keys(): # keys() includes tag and attributes
if key in self:
value = self.pop(key)
if type(value) is not list:
listOfDicts = []
listOfDicts.append(value)
listOfDicts.append(aDict[key])
self.update({key: listOfDicts})
else:
value.append(aDict[key])
self.update({key: value})
else:
self.update({key:aDict[key]}) # it was self.update(aDict)

The most recent versions of the PicklingTools libraries (1.3.0 and 1.3.1) support tools for converting from XML to a Python dict.
The download is available here: PicklingTools 1.3.1
There is quite a bit of documentation for the converters here: the documentation describes in detail all of the decisions and issues that will arise when converting between XML and Python dictionaries (there are a number of edge cases: attributes, lists, anonymous lists, anonymous dicts, eval, etc. that most converters don't handle). In general, though,
the converters are easy to use. If an 'example.xml' contains:
<top>
<a>1</a>
<b>2.2</b>
<c>three</c>
</top>
Then to convert it to a dictionary:
>>> from xmlloader import *
>>> example = file('example.xml', 'r') # A document containing XML
>>> xl = StreamXMLLoader(example, 0) # 0 = all defaults on operation
>>> result = xl.expect XML()
>>> print result
{'top': {'a': '1', 'c': 'three', 'b': '2.2'}}
There are tools for converting in both C++ and Python: the C++ and Python do indentical conversion, but the C++ is about 60x faster

You can do this quite easily with lxml. First install it:
[sudo] pip install lxml
Here is a recursive function I wrote that does the heavy lifting for you:
from lxml import objectify as xml_objectify
def xml_to_dict(xml_str):
""" Convert xml to dict, using lxml v3.4.2 xml processing library """
def xml_to_dict_recursion(xml_object):
dict_object = xml_object.__dict__
if not dict_object:
return xml_object
for key, value in dict_object.items():
dict_object[key] = xml_to_dict_recursion(value)
return dict_object
return xml_to_dict_recursion(xml_objectify.fromstring(xml_str))
xml_string = """<?xml version="1.0" encoding="UTF-8"?><Response><NewOrderResp>
<IndustryType>Test</IndustryType><SomeData><SomeNestedData1>1234</SomeNestedData1>
<SomeNestedData2>3455</SomeNestedData2></SomeData></NewOrderResp></Response>"""
print xml_to_dict(xml_string)
The below variant preserves the parent key / element:
def xml_to_dict(xml_str):
""" Convert xml to dict, using lxml v3.4.2 xml processing library, see http://lxml.de/ """
def xml_to_dict_recursion(xml_object):
dict_object = xml_object.__dict__
if not dict_object: # if empty dict returned
return xml_object
for key, value in dict_object.items():
dict_object[key] = xml_to_dict_recursion(value)
return dict_object
xml_obj = objectify.fromstring(xml_str)
return {xml_obj.tag: xml_to_dict_recursion(xml_obj)}
If you want to only return a subtree and convert it to dict, you can use Element.find() to get the subtree and then convert it:
xml_obj.find('.//') # lxml.objectify.ObjectifiedElement instance
See the lxml docs here. I hope this helps!

I wrote a simple recursive function to do the job:
from xml.etree import ElementTree
root = ElementTree.XML(xml_to_convert)
def xml_to_dict_recursive(root):
if len(root.getchildren()) == 0:
return {root.tag:root.text}
else:
return {root.tag:list(map(xml_to_dict_recursive, root.getchildren()))}

def xml_to_dict(node):
u'''
#param node:lxml_node
#return: dict
'''
return {'tag': node.tag, 'text': node.text, 'attrib': node.attrib, 'children': {child.tag: xml_to_dict(child) for child in node}}

The code from http://code.activestate.com/recipes/410469-xml-as-dictionary/ works well, but if there are multiple elements that are the same at a given place in the hierarchy it just overrides them.
I added a shim between that looks to see if the element already exists before self.update(). If so, pops the existing entry and creates a lists out of the existing and the new. Any subsequent duplicates are added to the list.
Not sure if this can be handled more gracefully, but it works:
import xml.etree.ElementTree as ElementTree
class XmlDictConfig(dict):
def __init__(self, parent_element):
if parent_element.items():
self.updateShim(dict(parent_element.items()))
for element in parent_element:
if len(element):
aDict = XmlDictConfig(element)
if element.items():
aDict.updateShim(dict(element.items()))
self.updateShim({element.tag: aDict})
elif element.items():
self.updateShim({element.tag: dict(element.items())})
else:
self.updateShim({element.tag: element.text.strip()})
def updateShim (self, aDict ):
for key in aDict.keys():
if key in self:
value = self.pop(key)
if type(value) is not list:
listOfDicts = []
listOfDicts.append(value)
listOfDicts.append(aDict[key])
self.update({key: listOfDicts})
else:
value.append(aDict[key])
self.update({key: value})
else:
self.update(aDict)

#dibrovsd: Solution will not work if the xml have more than one tag with same name
On your line of thought, I have modified the code a bit and written it for general node instead of root:
from collections import defaultdict
def xml2dict(node):
d, count = defaultdict(list), 1
for i in node:
d[i.tag + "_" + str(count)]['text'] = i.findtext('.')[0]
d[i.tag + "_" + str(count)]['attrib'] = i.attrib # attrib gives the list
d[i.tag + "_" + str(count)]['children'] = xml2dict(i) # it gives dict
return d

From #K3---rnc response (the best for me) I've added a small modifications to get an OrderedDict from an XML text (some times order matters):
def etree_to_ordereddict(t):
d = OrderedDict()
d[t.tag] = OrderedDict() if t.attrib else None
children = list(t)
if children:
dd = OrderedDict()
for dc in map(etree_to_ordereddict, children):
for k, v in dc.iteritems():
if k not in dd:
dd[k] = list()
dd[k].append(v)
d = OrderedDict()
d[t.tag] = OrderedDict()
for k, v in dd.iteritems():
if len(v) == 1:
d[t.tag][k] = v[0]
else:
d[t.tag][k] = v
if t.attrib:
d[t.tag].update(('#' + k, v) for k, v in t.attrib.iteritems())
if t.text:
text = t.text.strip()
if children or t.attrib:
if text:
d[t.tag]['#text'] = text
else:
d[t.tag] = text
return d
Following #K3---rnc example, you can use it:
from xml.etree import cElementTree as ET
e = ET.XML('''
<root>
<e />
<e>text</e>
<e name="value" />
<e name="value">text</e>
<e> <a>text</a> <b>text</b> </e>
<e> <a>text</a> <a>text</a> </e>
<e> text <a>text</a> </e>
</root>
''')
from pprint import pprint
pprint(etree_to_ordereddict(e))
Hope it helps ;)

An alternative (builds a lists for the same tags in hierarchy):
from xml.etree import cElementTree as ElementTree
def xml_to_dict(xml, result):
for child in xml:
if len(child) == 0:
result[child.tag] = child.text
else:
if child.tag in result:
if not isinstance(result[child.tag], list):
result[child.tag] = [result[child.tag]]
result[child.tag].append(xml_to_dict(child, {}))
else:
result[child.tag] = xml_to_dict(child, {})
return result
xmlTree = ElementTree.parse('my_file.xml')
xmlRoot = xmlTree.getroot()
dictRoot = xml_to_dict(xmlRoot, {})
result = {xmlRoot.tag: dictRoot}

Here's a link to an ActiveState solution - and the code in case it disappears again.
==================================================
xmlreader.py:
==================================================
from xml.dom.minidom import parse
class NotTextNodeError:
pass
def getTextFromNode(node):
"""
scans through all children of node and gathers the
text. if node has non-text child-nodes, then
NotTextNodeError is raised.
"""
t = ""
for n in node.childNodes:
if n.nodeType == n.TEXT_NODE:
t += n.nodeValue
else:
raise NotTextNodeError
return t
def nodeToDic(node):
"""
nodeToDic() scans through the children of node and makes a
dictionary from the content.
three cases are differentiated:
- if the node contains no other nodes, it is a text-node
and {nodeName:text} is merged into the dictionary.
- if the node has the attribute "method" set to "true",
then it's children will be appended to a list and this
list is merged to the dictionary in the form: {nodeName:list}.
- else, nodeToDic() will call itself recursively on
the nodes children (merging {nodeName:nodeToDic()} to
the dictionary).
"""
dic = {}
for n in node.childNodes:
if n.nodeType != n.ELEMENT_NODE:
continue
if n.getAttribute("multiple") == "true":
# node with multiple children:
# put them in a list
l = []
for c in n.childNodes:
if c.nodeType != n.ELEMENT_NODE:
continue
l.append(nodeToDic(c))
dic.update({n.nodeName:l})
continue
try:
text = getTextFromNode(n)
except NotTextNodeError:
# 'normal' node
dic.update({n.nodeName:nodeToDic(n)})
continue
# text node
dic.update({n.nodeName:text})
continue
return dic
def readConfig(filename):
dom = parse(filename)
return nodeToDic(dom)
def test():
dic = readConfig("sample.xml")
print dic["Config"]["Name"]
print
for item in dic["Config"]["Items"]:
print "Item's Name:", item["Name"]
print "Item's Value:", item["Value"]
test()
==================================================
sample.xml:
==================================================
<?xml version="1.0" encoding="UTF-8"?>
<Config>
<Name>My Config File</Name>
<Items multiple="true">
<Item>
<Name>First Item</Name>
<Value>Value 1</Value>
</Item>
<Item>
<Name>Second Item</Name>
<Value>Value 2</Value>
</Item>
</Items>
</Config>
==================================================
output:
==================================================
My Config File
Item's Name: First Item
Item's Value: Value 1
Item's Name: Second Item
Item's Value: Value 2

At one point I had to parse and write XML that only consisted of elements without attributes so a 1:1 mapping from XML to dict was possible easily. This is what I came up with in case someone else also doesnt need attributes:
def xmltodict(element):
if not isinstance(element, ElementTree.Element):
raise ValueError("must pass xml.etree.ElementTree.Element object")
def xmltodict_handler(parent_element):
result = dict()
for element in parent_element:
if len(element):
obj = xmltodict_handler(element)
else:
obj = element.text
if result.get(element.tag):
if hasattr(result[element.tag], "append"):
result[element.tag].append(obj)
else:
result[element.tag] = [result[element.tag], obj]
else:
result[element.tag] = obj
return result
return {element.tag: xmltodict_handler(element)}
def dicttoxml(element):
if not isinstance(element, dict):
raise ValueError("must pass dict type")
if len(element) != 1:
raise ValueError("dict must have exactly one root key")
def dicttoxml_handler(result, key, value):
if isinstance(value, list):
for e in value:
dicttoxml_handler(result, key, e)
elif isinstance(value, basestring):
elem = ElementTree.Element(key)
elem.text = value
result.append(elem)
elif isinstance(value, int) or isinstance(value, float):
elem = ElementTree.Element(key)
elem.text = str(value)
result.append(elem)
elif value is None:
result.append(ElementTree.Element(key))
else:
res = ElementTree.Element(key)
for k, v in value.items():
dicttoxml_handler(res, k, v)
result.append(res)
result = ElementTree.Element(element.keys()[0])
for key, value in element[element.keys()[0]].items():
dicttoxml_handler(result, key, value)
return result
def xmlfiletodict(filename):
return xmltodict(ElementTree.parse(filename).getroot())
def dicttoxmlfile(element, filename):
ElementTree.ElementTree(dicttoxml(element)).write(filename)
def xmlstringtodict(xmlstring):
return xmltodict(ElementTree.fromstring(xmlstring).getroot())
def dicttoxmlstring(element):
return ElementTree.tostring(dicttoxml(element))

I have modified one of the answers to my taste and to work with multiple values with the same tag for example consider the following xml code saved in XML.xml file
<A>
<B>
<BB>inAB</BB>
<C>
<D>
<E>
inABCDE
</E>
<E>value2</E>
<E>value3</E>
</D>
<inCout-ofD>123</inCout-ofD>
</C>
</B>
<B>abc</B>
<F>F</F>
</A>
and in python
import xml.etree.ElementTree as ET
class XMLToDictionary(dict):
def __init__(self, parentElement):
self.parentElement = parentElement
for child in list(parentElement):
child.text = child.text if (child.text != None) else ' '
if len(child) == 0:
self.update(self._addToDict(key= child.tag, value = child.text.strip(), dict = self))
else:
innerChild = XMLToDictionary(parentElement=child)
self.update(self._addToDict(key=innerChild.parentElement.tag, value=innerChild, dict=self))
def getDict(self):
return {self.parentElement.tag: self}
class _addToDict(dict):
def __init__(self, key, value, dict):
if not key in dict:
self.update({key: value})
else:
identical = dict[key] if type(dict[key]) == list else [dict[key]]
self.update({key: identical + [value]})
tree = ET.parse('./XML.xml')
root = tree.getroot()
parseredDict = XMLToDictionary(root).getDict()
print(parseredDict)
the output is
{'A': {'B': [{'BB': 'inAB', 'C': {'D': {'E': ['inABCDE', 'value2', 'value3']}, 'inCout-ofD': '123'}}, 'abc'], 'F': 'F'}}

Updated method posted by firelion.cis (since getchildren is deprecated):
from xml.etree import ElementTree
root = ElementTree.XML(xml_to_convert)
def xml_to_dict_recursive(root):
if len(list(root)) == 0:
return {root.tag:root.text}
else:
return {root.tag:list(map(xml_to_dict_recursive, list(root)))}

import xml.etree.ElementTree as ET
root = ET.parse(xml_filepath).getroot()
def parse_xml(node):
ans = {}
for child in node:
if len(child) == 0:
ans[child.tag] = child.text
elif child.tag not in ans:
ans[child.tag] = parse_xml(child)
elif not isinstance(ans[child.tag], list):
ans[child.tag] = [ans[child.tag]]
ans[child.tag].append(parse_xml(child))
else:
ans[child.tag].append(parse_xml(child))
return ans
it merges same field into list and squeezes fields containing one child.

Super simple code
#Follow this, its easy and nothing required, convert the XML into a string and use the find command to find the word that you are looking for as following
#hope this is easy and simple
def xml_key(key, text1):
tx1 = "<" + key + ">"
tx2 = "</" + key + ">"
tx = text1.find(tx1)
ty = text1.find(tx2)
tx = tx + len(tx1)
tw = text1[tx:ty]
return(tw)
text1 = "<person><name>john</name><age>20</age></person>"
dict1 = {"name": xml_key("name",text1),"age":xml_key("age",text1)}
print(dict1)
output :
{'name': 'john'}

I have a recursive method to get a dictionary from a lxml element
def recursive_dict(element):
return (element.tag.split('}')[1],
dict(map(recursive_dict, element.getchildren()),
**element.attrib))

Related

Output CSV is in binary with python 3.10

I am trying to create a CSV file from a OSM file. However, every time I run my code I'm getting a "B" in the output, like this,
b'id',b'lat',b'lon',b'user',b'uid',b'version',b'changeset',b'timestamp'
I can NOT figure out what I'm doing wrong.
Code is below.
def get_element(osm_file, tags=('node', 'way', 'relation')):
"""Yield element if it is the right type of tag"""
context = ET.iterparse(osm_file, events=('start', 'end'))
_, root = next(context)
for event, elem in context:
if event == 'end' and elem.tag in tags:
yield elem
root.clear()
def validate_element(element, validator, schema=SCHEMA):
"""Raise ValidationError if element does not match schema"""
if validator.validate(element, schema) is not True:
field, errors = next(validator.errors.iteritems())
message_string = "\nElement of type '{0}' has the following errors:\n{1}"
error_string = pprint.pformat(errors)
raise Exception(message_string.format(field, error_string))
class UnicodeDictWriter(csv.DictWriter, object):
"""Extend csv.DictWriter to handle Unicode input"""
def writerow(self, row):
super(UnicodeDictWriter, self).writerow({
k: (v.encode('utf-8') if isinstance(v, str) else v) for k, v in row.items()
})
def writerows(self, rows):
for row in rows:
self.writerow(row)
# ================================================== #
# Main Function #
# ================================================== #
def process_map(file_in, validate):
"""Iteratively process each XML element and write to csv(s)"""
with codecs.open(NODES_PATH, 'w') as nodes_file, \
codecs.open(NODE_TAGS_PATH, 'w') as nodes_tags_file, \
codecs.open(WAYS_PATH, 'w') as ways_file, \
codecs.open(WAY_NODES_PATH, 'w') as way_nodes_file, \
codecs.open(WAY_TAGS_PATH, 'w') as way_tags_file:
nodes_writer = UnicodeDictWriter(nodes_file, NODE_FIELDS)
node_tags_writer = UnicodeDictWriter(nodes_tags_file, NODE_TAGS_FIELDS)
ways_writer = UnicodeDictWriter(ways_file, WAY_FIELDS)
way_nodes_writer = UnicodeDictWriter(way_nodes_file, WAY_NODES_FIELDS)
way_tags_writer = UnicodeDictWriter(way_tags_file, WAY_TAGS_FIELDS)
nodes_writer.writeheader()
node_tags_writer.writeheader()
ways_writer.writeheader()
way_nodes_writer.writeheader()
way_tags_writer.writeheader()
validator = cerberus.Validator()
for element in get_element(file_in, tags=('node', 'way')):
el = shape_element(element)
if el:
if validate is True:
validate_element(el, validator)
if element.tag == 'node':
nodes_writer.writerow(el['node'])
node_tags_writer.writerows(el['node_tags'])
elif element.tag == 'way':
ways_writer.writerow(el['way'])
way_nodes_writer.writerows(el['way_nodes'])
way_tags_writer.writerows(el['way_tags'])

Because of this line in UnicodeDictWriter.writerow():
k: (v.encode('utf-8') if isinstance(v, str) else v) for k, v in row.items()
# ^^^^^^^^^^^^^^^^^ specifically, this
You are encoding the values in the UTF-8 codec, which outputs bytes objects. These then get written to your CSV file.
As Mark Tolonen pointed out, strings in Python 3 are already Unicode, so there's no need to subclass csv.DictWriter for this purpose.

Convert a dot notation string to object to access lxml objects python

I am trying to update the xml using object notation using lxml objectify.
<xml>
<fruit>
<citrus>
<lemon />
</citrus>
</fruit>
</xml>
I am trying to add another fruit called mango using lxml objectify like
root = lxml.objectify.fromstring(xml_string)
root.fruit.citrus = 'orange'
def update(path, value):
// code
update('fruit.citrus', 'orange')
I would like to pass a string like 'fruit.citrus' because I cannot pass an object fruit.citrus.
How do I achieve this in Python ie how do I execute the code 'root.fruit.citrus = 'orange' inside the update function. How to convert string to object?

Try Below solution:
import lxml.objectify, lxml.etree
xml = '<xml> <fruit> <citrus> <lemon /> </citrus> </fruit> </xml>'
root = lxml.objectify.fromstring(xml)
print("Before:")
print(lxml.etree.tostring(root))
def update(path, value):
parent = None
lst = path.split('.')
while lst:
ele = lst.pop(0)
parent = getattr(root, ele) if parent is None else getattr(parent, ele)
lxml.etree.SubElement(parent, value)
update('fruit.citrus', 'orange')
print("After:")
print(lxml.etree.tostring(root))
Output:
Before:
b'<xml><fruit><citrus><lemon/></citrus></fruit></xml>'
After:
b'<xml><fruit><citrus><lemon/><orange/></citrus></fruit></xml>'

If you insist on using objectify, you may not like this, but I think this is a pretty clean solution using lxml etree:
from lxml import etree
doc = etree.fromstring("""<xml>
<fruit>
<citrus>
<lemon />
</citrus>
</fruit>
</xml>""")
def update(root, path, item):
elems = root.xpath(path)
for elem in elems:
elem.append(etree.Element(item))
update(doc, 'fruit/citrus', 'orange')
print(etree.tostring(doc).decode())

The above answers were partially correct. It did not have the ability to handle indexes. The below code handles all the cases with ObjectPath (https://lxml.de/objectify.html).
import lxml.objectify, lxml.etree
from robot.api.deco import keyword
class ConfigXML(object):
def get_xml(self, filename):
self.root = lxml.objectify.fromstring(open(filename).read())
def add_attribute(self, path, **kwargs):
path_obj = lxml.objectify.ObjectPath(path)
for key in kwargs:
path_obj.find(self.root).set(key, kwargs[key])
def add_value(self, path, value):
path_obj = lxml.objectify.ObjectPath(path)
path_obj.setattr(self.root, value)
def add_tag(self, path, tag):
path_obj = lxml.objectify.ObjectPath(path)
lxml.objectify.SubElement(path_obj.find(self.root), tag)
def generate_xml(self):
lxml.objectify.deannotate(self.root, cleanup_namespaces=True, xsi_nil=True)
return lxml.etree.tostring(self.root).decode('utf-8')

Extracting nested XML elements of different sizes into Pandas

Lets assume we have an arbitrary XML document like below
<?xml version="1.0" encoding="UTF-8"?>
<programs xmlns="http://something.org/schema/s/program">
<program xmlns:xsd="http://www.w3.org/2001/XMLSchema"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://something.org/schema/s/program http://something.org/schema/s/program.xsd">
<orgUnitId>Organization 1</orgUnitId>
<requiredLevel>academic bachelor</requiredLevel>
<requiredLevel>academic master</requiredLevel>
<programDescriptionText xml:lang="nl">Here is some text; blablabla</programDescriptionText>
<searchword xml:lang="nl">Scrum master</searchword>
</program>
<program xmlns:xsd="http://www.w3.org/2001/XMLSchema"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://something.org/schema/s/program http://something.org/schema/s/program.xsd">
<requiredLevel>bachelor</requiredLevel>
<requiredLevel>academic master</requiredLevel>
<requiredLevel>academic bachelor</requiredLevel>
<orgUnitId>Organization 2</orgUnitId>
<programDescriptionText xml:lang="nl">Text from another organization about some stuff.</programDescriptionText>
<searchword xml:lang="nl">Excutives</searchword>
</program>
<program xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
<orgUnitId>Organization 3</orgUnitId>
<programDescriptionText xml:lang="nl">Also another huge text description from another organization.</programDescriptionText>
<searchword xml:lang="nl">Negotiating</searchword>
<searchword xml:lang="nl">Effective leadership</searchword>
<searchword xml:lang="nl">negotiating techniques</searchword>
<searchword xml:lang="nl">leadership</searchword>
<searchword xml:lang="nl">strategic planning</searchword>
</program>
</programs>
Currently I'm looping over the elements I need by using their absolute paths, since I'm not able to use any of the get or find methods in ElementTree. As such, my code looks like below:
import pandas as pd
import xml.etree.ElementTree as ET
import numpy as np
import itertools
tree = ET.parse('data.xml')
root = tree.getroot()
root.tag
dfcols=['organization','description','level','keyword']
organization=[]
description=[]
level=[]
keyword=[]
for node in root:
for child in
node.findall('.//{http://something.org/schema/s/program}orgUnitId'):
organization.append(child.text)
for child in node.findall('.//{http://something.org/schema/s/program}programDescriptionText'):
description.append(child.text)
for child in node.findall('.//{http://something.org/schema/s/program}requiredLevel'):
level.append(child.text)
for child in node.findall('.//{http://something.org/schema/s/program}searchword'):
keyword.append(child.text)
The goal, of course, is to create one dataframe. However, since each node in the XML file contains one or multiple elements, such as requiredLevel or searchword I'm currently losing data when I'm casting it to a dataframe by either:
df=pd.DataFrame(list(itertools.zip_longest(organization,
description,level,searchword,
fillvalue=np.nan)),columns=dfcols)
or using pd.Series as given here or another solution which I don't seem to get it fit from here
My best bet is not to use Lists at all, since they don't seem to index the data correctly. That is, I lose data from the 2nd to Xth child node. But right now I'm stuck, and don't see any other options.
What my end result should look like is this:
organization description level keyword
Organization 1 .... academic bachelor, Scrum master
academic master
Organization 2 .... bachelor, Executives
academic master,
academic bachelor
Organization 3 .... Negotiating,
Effective leadership,
negotiating techniques,
....

Consider building a list of dictionaries with comma-collapsed text values. Then pass list into the pandas.DataFrame constructor:
dicts = []
for node in root:
orgs = ", ".join([org.text for org in node.findall('.//{http://something.org/schema/s/program}orgUnitId')])
desc = ", ".join([desc.text for desc in node.findall('.//{http://something.org/schema/s/program}programDescriptionText')])
lvls = ", ".join([lvl.text for lvl in node.findall('.//{http://something.org/schema/s/program}requiredLevel')])
wrds = ", ".join([wrd.text for wrd in node.findall('.//{http://something.org/schema/s/program}searchword')])
dicts.append({'organization': orgs, 'description': desc, 'level': lvls, 'keyword': wrds})
final_df = pd.DataFrame(dicts, columns=['organization','description','level','keyword'])
Output
print(final_df)
# organization description level keyword
# 0 Organization 1 Here is some text; blablabla academic bachelor, academic master Scrum master
# 1 Organization 2 Text from another organization about some stuff. bachelor, academic master, academic bachelor Excutives
# 2 Organization 3 Also another huge text description from anothe... Negotiating, Effective leadership, negotiating...

A lightweight xml_to_dict converter can be found here. It can be improved by this to handle namespaces.
def xml_to_dict(xml='', remove_namespace=True):
"""Converts an XML string into a dict
Args:
xml: The XML as string
remove_namespace: True (default) if namespaces are to be removed
Returns:
The XML string as dict
Examples:
>>> xml_to_dict('<text><para>hello world</para></text>')
{'text': {'para': 'hello world'}}
"""
def _xml_remove_namespace(buf):
# Reference: https://stackoverflow.com/a/25920989/1498199
it = ElementTree.iterparse(buf)
for _, el in it:
if '}' in el.tag:
el.tag = el.tag.split('}', 1)[1]
return it.root
def _xml_to_dict(t):
# Reference: https://stackoverflow.com/a/10077069/1498199
from collections import defaultdict
d = {t.tag: {} if t.attrib else None}
children = list(t)
if children:
dd = defaultdict(list)
for dc in map(_xml_to_dict, children):
for k, v in dc.items():
dd[k].append(v)
d = {t.tag: {k: v[0] if len(v) == 1 else v for k, v in dd.items()}}
if t.attrib:
d[t.tag].update(('#' + k, v) for k, v in t.attrib.items())
if t.text:
text = t.text.strip()
if children or t.attrib:
if text:
d[t.tag]['#text'] = text
else:
d[t.tag] = text
return d
buffer = io.StringIO(xml.strip())
if remove_namespace:
root = _xml_remove_namespace(buffer)
else:
root = ElementTree.parse(buffer).getroot()
return _xml_to_dict(root)
So let s be the string which holds your xml. We can convert it to a dict via
d = xml_to_dict(s, remove_namespace=True)
Now the solution is straight forward:
rows = []
for program in d['programs']['program']:
cols = []
cols.append(program['orgUnitId'])
cols.append(program['programDescriptionText']['#text'])
try:
cols.append(','.join(program['requiredLevel']))
except KeyError:
cols.append('')
try:
searchwords = program['searchword']['#text']
except TypeError:
searchwords = []
for searchword in program['searchword']:
searchwords.append(searchword['#text'])
searchwords = ','.join(searchwords)
cols.append(searchwords)
rows.append(cols)
df = pd.DataFrame(rows, columns=['organization', 'description', 'level', 'keyword'])

Extract attributes and certain tag values from xml using python script

I want to parse an XML content and return a dictionary which contains only the name attribute and its values as dictionary. For example:
<ecmaarray>
<number name="xyz1">123.456</number>
<ecmaarray name="xyz2">
<string name="str1">aaa</string>
<number name="num1">55</number>
</ecmaarray>
<strictarray name="xyz3">
<string>aaa</string>
<number>55</number>
</strictarray>
</ecmaarray>
The output has to be in a dictionary something like this..
Dict:{ 'xyz1': 123.456,
'xyz2': {'str1':'aaa', 'num1': '55'},
'xyz3': ['aaa','55']
}
Can any one suggest a recursive solution for this ?

Assuming situation like this:
<strictarray name="xyz4">
<string>aaa</string>
<number name="num1">55</number>
</strictarray>
is not possible, here's a sample code using lxml:
from lxml import etree
tree = etree.parse('test.xml')
result = {}
for element in tree.xpath('/ecmaarray/*'):
name = element.attrib["name"]
text = element.text
childs = element.getchildren()
if not childs:
result[name] = text
else:
child_dict = {}
child_list = []
for child in childs:
child_name = child.attrib.get('name')
child_text = child.text
if child_name:
child_dict[child_name] = child_text
else:
child_list.append(child_text)
if child_dict:
result[name] = child_dict
else:
result[name] = child_list
print result
prints:
{'xyz3': ['aaa', '55'],
'xyz2': {'str1': 'aaa', 'num1': '55'},
'xyz1': '123.456'}
You may want to improve the code - it's just a hint on where to go.
Hope that helps.

python: serialize a dictionary into a simple html output

using app engine - yes i know all about django templates and other template engines.
Lets say i have a dictionary or a simple object, i dont know its structure and i want to serialize it into html.
so if i had
{'data':{'id':1,'title':'home','address':{'street':'some road','city':'anycity','postal':'somepostal'}}}
want i want is that rendered in some form of readable html using lists or tables;
data:
id:1
title:home
address:
street: some road
city: anycity
postal:somepostal
now i know i can do
for key in dict.items
print dict[key]
but that wont dive into the child values and list each key, value pair when the key/value is a dictionary - ie the address dict.
Is their a module for python that is lightweight/fast that will do this nicely. or does anyone have any simple code they can paste that might do this.
Solution
All the solutions here were useful. pprint is no doubt the more stable means of printing the dictionary, though it falls short of returning anything near html. Though still printable.
I ended up with this for now:
def printitems(dictObj, indent=0):
p=[]
p.append('<ul>\n')
for k,v in dictObj.iteritems():
if isinstance(v, dict):
p.append('<li>'+ k+ ':')
p.append(printitems(v))
p.append('</li>')
else:
p.append('<li>'+ k+ ':'+ v+ '</li>')
p.append('</ul>\n')
return '\n'.join(p)
It converts the dict into unordered lists which is ok for now. some css and perhaps a little tweaking should make it readable.
Im going to reward the answer to the person that wrote the above code, i made a couple of small changes as the unordered lists were not nesting. I hope all agree that many of the solutions offered proved useful, But the above code renders a true html representation of a dictionary, even if crude.

The example made by pyfunc could easily be modified to generate simple nested html lists.
z = {'data':{'id':1,'title':'home','address':{'street':'some road','city':'anycity','postal':'somepostal'}}}
def printItems(dictObj, indent):
print ' '*indent + '<ul>\n'
for k,v in dictObj.iteritems():
if isinstance(v, dict):
print ' '*indent , '<li>', k, ':', '</li>'
printItems(v, indent+1)
else:
print ' '*indent , '<li>', k, ':', v, '</li>'
print ' '*indent + '</ul>\n'
printItems(z,0)
Not terribly pretty of course, but somewhere to start maybe. If all you want to do is visualize data, the pprint module really is good enough. You could just use the "pre" tag on the result from pprint and put that on your web page.
the pprint version would look something like this:
import pprint
z = {'data':{'id':1,'title':'home','address':{'street':'some road','city':'anycity','postal':'somepostal'}}}
print '<pre>', pprint.pformat(z), '</pre>'
And the html output look something like this:
{'data': {'address': {'city': 'anycity',
'postal': 'somepostal',
'street': 'some road'},
'id': 1,
'title': 'home'}}
Which isn't that pretty, but it at least shows the data in a more structured way.

import pprint
pprint.pprint(yourDict)
Well, no HTML, but similar to your for/print approach.
EDIT: or use:
niceText = pprint.pformat(yourDict)
this will give you the same nice output with all indents, etc. Now you can iterate over lines and format it into HTML:
htmlLines = []
for textLine in pprint.pformat(yourDict).splitlines():
htmlLines.append('<br/>%s' % textLine) # or something even nicer
htmlText = '\n'.join(htmlLines)

Here's my simple solution, It can handle any level of nested dictionary.
import json
temp_text = {'decision': {'date_time': None, 'decision_type': None},
'not_received': {'date_time': '2019-04-15T19:18:43.825766'},
'received': {'date_time': None},
'rfi': {'date_time': None},
'under_review': {'date_time': None}}
dict_text_for_html = json.dumps(
temp_text, indent=4
).replace(' ', '&nbsp').replace(',\n', ',<br>').replace('\n', '<br>')
html view of python dict

I needed something similar, but also wanted to pretty print lists, and lists inside the dict. Here's what I came up:
def format(self, obj, indent = 1):
if isinstance(obj, list):
htmls = []
for k in obj:
htmls.append(self.format(k,indent+1))
return '[<div style="margin-left: %dem">%s</div>]' % (indent, ',<br>'.join(htmls))
if isinstance(obj, dict):
htmls = []
for k,v in obj.iteritems():
htmls.append("<span style='font-style: italic; color: #888'>%s</span>: %s" % (k,self.format(v,indent+1)))
return '{<div style="margin-left: %dem">%s</div>}' % (indent, ',<br>'.join(htmls))
return str(obj)
Then, if you're using webapp on appengine, you can just do the following:
self.response.out.write(self.format(obj))
This is an example of the output:

Look at my implementation:
def pretty_items(r, d, nametag="<strong>%s: </strong>", itemtag='<li>%s</li>',
valuetag="%s", blocktag=('<ul>', '</ul>')):
if isinstance(d, dict):
r.append(blocktag[0])
for k, v in d.iteritems():
name = nametag % k
if isinstance(v, dict) or isinstance(v, list):
r.append(itemtag % name)
pretty_items(r, v)
else:
value = valuetag % v
r.append(itemtag % (name + value))
r.append(blocktag[1])
elif isinstance(d, list):
r.append(blocktag[0])
for i in d:
if isinstance(i, dict) or isinstance(i, list):
r.append(itemtag % " - ")
pretty_items(r, i)
else:
r.append(itemtag % i)
r.append(blocktag[1])
Will output all items in HTML format using <ul> and <li> tags. And is also optional to change the tags. And then, just use CSS to handle with the indentation.

None of the above examples give good results, so I wrote two of my own functions that create beautiful looking html output for dictionaries.
def dict_to_html(dd, level=0):
"""
Convert dict to html using basic html tags
"""
import simplejson
text = ''
for k, v in dd.iteritems():
text += '<br>' + ' '*(4*level) + '<b>%s</b>: %s' % (k, dict_to_html(v, level+1) if isinstance(v, dict) else (simplejson.dumps(v) if isinstance(v, list) else v))
return text
def dict_to_html_ul(dd, level=0):
"""
Convert dict to html using ul/li tags
"""
import simplejson
text = '<ul>'
for k, v in dd.iteritems():
text += '<li><b>%s</b>: %s</li>' % (k, dict_to_html_ul(v, level+1) if isinstance(v, dict) else (simplejson.dumps(v) if isinstance(v, list) else v))
text += '</ul>'
return text

You could use pretty print (pprint)
or if you want to do some further processing of display then you have to run through the dict yourself.
Be warned that the code is crude and will require numerous refinements. Solution uses recursion too, which is bad, if the recursion depth is higher.
z = {'data':{'id':1,'title':'home','address':{'street':'some road','city':'anycity','postal':'somepostal', 'telephone':{'home':'xxx','offie':'yyy'}}}}
def printItems(dictObj, indent):
it = dictObj.iteritems()
for k,v in it:
if isinstance(v, dict):
print ' '*indent , k, ':'
printItems(v, indent+1)
else:
print ' '*indent , k, ':', v
printItems(z,0)
Output:
data :
address :
city : anycity
postal : somepostal
street : some road
telephone :
home : xxx
offie : yyy
id : 1
title : home

Here is my version with support of lists (labels are verbose names of keys in dictionary):
def render_value(value, labels):
if isinstance(value, (list, tuple)):
return render_list(value, labels)
elif isinstance(value, dict):
return render_dict(value, labels)
else:
return value
def render_list(lst, labels):
items = [
'<li>%s</li>' % render_value(value, labels)
for value in lst
]
return '\n'.join(['\n<ul>'] + items + ['</ul>\n'])
def render_dict(dct, labels):
items = []
for key, value in dct.items():
if not value: continue
key = labels.get(key, key)
value = render_value(value, labels)
items.append('<li><b>%s</b>: %s</li>' % (key, value))
return '\n'.join(['\n<ul>'] + items + ['</ul>\n'])

imagine we have this :{name: "a", children:[{name: "b", children: [] },{..},{..}]
def ConvertDictToUlLi():
jsonResult = GetSomeRecursiveDict()
def CreateHtml(DictItem, output):
output = "<li>"+DictItem["name"] if jsonResult.has_key("name") else " "
if len(DictItem["children"]) > 0:
output = output + "<ul>"
for item in DictItem["children"]:
output = output + " "+CreateHtml(item, output)+" "
output = output + "</ul>"
return output+"</li>"
result = "<ul class='tree'>"+CreateHtml(jsonResult, "")+"</ul>"
return result

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

Convert XML file to dict in Python [duplicate] - python

xmltodict (full disclosure: I wrote it) does exactly that: xmltodict.parse(""" <?xml version="1.0" ?> <person> <name>john</name> <age>20</age> </person>""") # {u'person': {u'age': u'20', u'name': u'john'}}

I wrote a simple recursive function to do the job: from xml.etree import ElementTree root = ElementTree.XML(xml_to_convert) def xml_to_dict_recursive(root): if len(root.getchildren()) == 0: return {root.tag:root.text} else: return {root.tag:list(map(xml_to_dict_recursive, root.getchildren()))}

def xml_to_dict(node): u''' #param node:lxml_node #return: dict ''' return {'tag': node.tag, 'text': node.text, 'attrib': node.attrib, 'children': {child.tag: xml_to_dict(child) for child in node}}

Updated method posted by firelion.cis (since getchildren is deprecated): from xml.etree import ElementTree root = ElementTree.XML(xml_to_convert) def xml_to_dict_recursive(root): if len(list(root)) == 0: return {root.tag:root.text} else: return {root.tag:list(map(xml_to_dict_recursive, list(root)))}

I have a recursive method to get a dictionary from a lxml element def recursive_dict(element): return (element.tag.split('}')[1], dict(map(recursive_dict, element.getchildren()), **element.attrib))

Related

Output CSV is in binary with python 3.10

Convert a dot notation string to object to access lxml objects python

Extracting nested XML elements of different sizes into Pandas

Extract attributes and certain tag values from xml using python script

python: serialize a dictionary into a simple html output

Categories

Resources