I am trying to create a CSV file from a OSM file. However, every time I run my code I'm getting a "B" in the output, like this,
b'id',b'lat',b'lon',b'user',b'uid',b'version',b'changeset',b'timestamp'
I can NOT figure out what I'm doing wrong.
Code is below.
def get_element(osm_file, tags=('node', 'way', 'relation')):
"""Yield element if it is the right type of tag"""
context = ET.iterparse(osm_file, events=('start', 'end'))
_, root = next(context)
for event, elem in context:
if event == 'end' and elem.tag in tags:
yield elem
root.clear()
def validate_element(element, validator, schema=SCHEMA):
"""Raise ValidationError if element does not match schema"""
if validator.validate(element, schema) is not True:
field, errors = next(validator.errors.iteritems())
message_string = "\nElement of type '{0}' has the following errors:\n{1}"
error_string = pprint.pformat(errors)
raise Exception(message_string.format(field, error_string))
class UnicodeDictWriter(csv.DictWriter, object):
"""Extend csv.DictWriter to handle Unicode input"""
def writerow(self, row):
super(UnicodeDictWriter, self).writerow({
k: (v.encode('utf-8') if isinstance(v, str) else v) for k, v in row.items()
})
def writerows(self, rows):
for row in rows:
self.writerow(row)
# ================================================== #
# Main Function #
# ================================================== #
def process_map(file_in, validate):
"""Iteratively process each XML element and write to csv(s)"""
with codecs.open(NODES_PATH, 'w') as nodes_file, \
codecs.open(NODE_TAGS_PATH, 'w') as nodes_tags_file, \
codecs.open(WAYS_PATH, 'w') as ways_file, \
codecs.open(WAY_NODES_PATH, 'w') as way_nodes_file, \
codecs.open(WAY_TAGS_PATH, 'w') as way_tags_file:
nodes_writer = UnicodeDictWriter(nodes_file, NODE_FIELDS)
node_tags_writer = UnicodeDictWriter(nodes_tags_file, NODE_TAGS_FIELDS)
ways_writer = UnicodeDictWriter(ways_file, WAY_FIELDS)
way_nodes_writer = UnicodeDictWriter(way_nodes_file, WAY_NODES_FIELDS)
way_tags_writer = UnicodeDictWriter(way_tags_file, WAY_TAGS_FIELDS)
nodes_writer.writeheader()
node_tags_writer.writeheader()
ways_writer.writeheader()
way_nodes_writer.writeheader()
way_tags_writer.writeheader()
validator = cerberus.Validator()
for element in get_element(file_in, tags=('node', 'way')):
el = shape_element(element)
if el:
if validate is True:
validate_element(el, validator)
if element.tag == 'node':
nodes_writer.writerow(el['node'])
node_tags_writer.writerows(el['node_tags'])
elif element.tag == 'way':
ways_writer.writerow(el['way'])
way_nodes_writer.writerows(el['way_nodes'])
way_tags_writer.writerows(el['way_tags'])
Because of this line in UnicodeDictWriter.writerow():
k: (v.encode('utf-8') if isinstance(v, str) else v) for k, v in row.items()
# ^^^^^^^^^^^^^^^^^ specifically, this
You are encoding the values in the UTF-8 codec, which outputs bytes objects. These then get written to your CSV file.
As Mark Tolonen pointed out, strings in Python 3 are already Unicode, so there's no need to subclass csv.DictWriter for this purpose.
Related
The following code works on python without any issue. It opens the JSON file, and replace all bananas to apples:
import json
replacements = "banana" : "apple"
with open(mycodepath, 'r') as file:
data = file.read()
for old, new in replacements.items():
data = data.replace(old, new)
However, I want to replace "ArmReoriented", "visible": true with "ArmReoriented", "visible": false,
I tried using triple quotes, but it does not work.
replacements = """ArmReoriented", "visible": true""" : """ArmReoriented", "visible": false,"""
How I replace a text containing quotes on JSON using Python?
Using """ is basically a comment or a doc-string and would not work.
Put double quote between a single quote string like below.
replacements = {'"ArmReoriented", "visible": true' : '"ArmReoriented", "visible": false'}
Try switching to single quotes in your replacements dict:
replacements = {'"ArmReoriented", "visible": true': '"ArmReoriented", "visible": false,'}
data = 'asd asd asd "ArmReoriented", "visible": true asd asd'
for old, new in replacements.items():
data = data.replace(old, new)
print(data)
You also may escape double quotes both from key and from value in your replacement dict:
replacements = {"\"ArmReoriented\", \"visible\": true": "\"ArmReoriented\", \"visible\": false,"}
If we know what key the value ArmReoriented is attached to, we can do this the right way, processing your content structured data rather than as a string:
import json, copy
def hideReorientedArm(obj):
if isinstance(obj, dict):
if obj.get("LastEvent") == "ArmReoriented" and obj.get("visible") is True:
obj["visible"] = False
return obj
def walk(obj, updateFn):
if isinstance(obj, list):
obj = [walk(elem, updateFn) for elem in obj]
elif isinstance(obj, dict):
obj = {k: walk(v, updateFn) for k, v in obj.items()}
return updateFn(obj)
with open(mycodepath, 'r') as file:
data = json.load(file)
data = walk(data, hideReorientedArm)
See this running at https://ideone.com/Zr6546
Even if you don't know the name of the specific key having that value, you can still search all of them if that's necessary for some reason, replacing the shorter hideReorientedArm definition above with something more like the following:
def hideReorientedArm(obj):
if isinstance(obj, dict):
if obj.get("visible") is True:
foundArmReoriented = False
for (k,v) in obj.items():
if v == "ArmReoriented":
foundArmReoriented = True
break
if foundArmReoriented:
obj["visible"] = False
return obj
...see this version running at https://ideone.com/z34Mx1
I am trying to convert a Json file that looks like
{
# "item_1":"value_11",
# "item_2":"value_12",
# "item_3":"value_13",
# "item_4":["sub_value_14", "sub_value_15"],
# "item_5":{
# "sub_item_1":"sub_item_value_11",
# "sub_item_2":["sub_item_value_12", "sub_item_value_13"]
# }
# }
TO something that looks like this:
{
# "node_item_1":"value_11",
# "node_item_2":"value_12",
# "node_item_3":"value_13",
# "node_item_4_0":"sub_value_14",
# "node_item_4_1":"sub_value_15",
# "node_item_5_sub_item_1":"sub_item_value_11",
# "node_item_5_sub_item_2_0":"sub_item_value_12",
# "node_item_5_sub_item_2_0":"sub_item_value_13"
# }
I am aware that you can't maintain the order of the Json file when converted to CSV. I am considering to do a workaround by loading the JSON data into OrderedDic objects (which cause them to be added in the order that the input document lists them. However, I am new to working with JSON files, as well as OrderedDic function.
To split items into subgroups i used:
def reduce_item(key, value):
global reduced_item
#Reduction Condition 1
if type(value) is list:
i=0
for sub_item in value:
reduce_item(key+'_'+to_string(i), sub_item)
i=i+1
#Reduction Condition 2
elif type(value) is dict:
sub_keys = value.keys()
for sub_key in sub_keys:
reduce_item(key+'_'+to_string(sub_key), value[sub_key])
#Base Condition
else:
reduced_item[to_string(key)] = to_string(value)
But how do I use the orderedDic along with the above code to show this output:
{
# "node_item_1":"value_11",
# "node_item_2":"value_12",
# "node_item_3":"value_13",
# "node_item_4_0":"sub_value_14",
# "node_item_4_1":"sub_value_15",
# "node_item_5_sub_item_1":"sub_item_value_11",
# "node_item_5_sub_item_2_0":"sub_item_value_12",
# "node_item_5_sub_item_2_0":"sub_item_value_13"
# }
I have the below code as well but it does not split each in subgroups based on the conditions of the subtring code above:
import json
from collections import OrderedDict
with open("/home/file/official.json", 'r') as fp:
metrics_types = json.load(fp, object_pairs_hook=OrderedDict)
print(metrics_types)
That shows:
Any suggestions?
You can use a function that iterates through the given dict or list items and merges the keys from the dict output of the recursive calls:
def flatten(d):
if not isinstance(d, (dict, list)):
return d
out = {}
for k, v in d.items() if isinstance(d, dict) else enumerate(d):
f = flatten(v)
if isinstance(f, dict):
out.update({'%s_%s' % (k, i): s for i, s in f.items()})
else:
out[k] = f
return out
so that given:
d = {
"item_1":"value_11",
"item_2":"value_12",
"item_3":"value_13",
"item_4":["sub_value_14", "sub_value_15"],
"item_5":{
"sub_item_1":"sub_item_value_11",
"sub_item_2":["sub_item_value_12", "sub_item_value_13"]
}
}
flatten(d) returns:
{'item_1': 'value_11',
'item_2': 'value_12',
'item_3': 'value_13',
'item_4_0': 'sub_value_14',
'item_4_1': 'sub_value_15',
'item_5_sub_item_1': 'sub_item_value_11',
'item_5_sub_item_2_0': 'sub_item_value_12',
'item_5_sub_item_2_1': 'sub_item_value_13'}
The above assumes that you're using Python 3.7 or later, where dict keys are guaranteed to be ordered. If you're using earlier versions, you can use OrderedDict in place of a regular dict.
I would like to build a dictionary of abreviations.
I have a text file with a lot of abreviations. The text file looks like this(after import)
with open('abreviations.txt') as ab:
ab_words = ab.read().splitlines()
An extract:
'ACE',
'Access Control Entry',
'ACK',
'Acknowledgement',
'ACORN',
'A Completely Obsessive Really Nutty person',
Now I want to build the dictionnary, where I have every uneven line as a dictionary key and every even line as the dictionary value.
Hence I should be able to write at the end:
ab_dict['ACE']
and get the result:
'Access Control Entry'
Also, How can I make it case-insensitive ?
ab_dict['ace']
should yield the same result
'Access Control Entry'
In fact, it would be perfect, if the output would also be lower case:
'access control entry'
Here is a link to the text file: https://www.dropbox.com/s/91afgnupk686p9y/abreviations.txt?dl=0
Complete solution with custom ABDict class and Python's generator functionality:
class ABDict(dict):
''' Class representing a dictionary of abbreviations'''
def __getitem__(self, key):
v = dict.__getitem__(self, key.upper())
return v.lower() if key.islower() else v
with open('abbreviations.txt') as ab:
ab_dict = ABDict()
while True:
try:
k = next(ab).strip() # `key` line
v = next(ab).strip() # `value` line
ab_dict[k] = v
except StopIteration:
break
Now, testing (with case-relative access):
print(ab_dict['ACE'])
print(ab_dict['ace'])
print('*' * 10)
print(ab_dict['WYTB'])
print(ab_dict['wytb'])
The output(consecutively):
Access Control Entry
access control entry
**********
Wish You The Best
wish you the best
Here's another solution based on the pairwise function from this solution:
from requests.structures import CaseInsensitiveDict
def pairwise(iterable):
"s -> (s0, s1), (s2, s3), (s4, s5), ..."
a = iter(iterable)
return zip(a, a)
with open('abreviations.txt') as reader:
abr_dict = CaseInsensitiveDict()
for abr, full in pairwise(reader):
abr_dict[abr.strip()] = full.strip()
Here is an answer that also allows sentences to be replaced with words from the dictionary:
import re
from requests.structures import CaseInsensitiveDict
def read_file_dict(filename):
"""
Reads file data into CaseInsensitiveDict
"""
# lists for keys and values
keys = []
values = []
# case sensitive dict
data = CaseInsensitiveDict()
# count used for deciding which line we're on
count = 1
with open(filename) as file:
temp = file.read().splitlines()
for line in temp:
# if the line count is even, a value is being read
if count % 2 == 0:
values.append(line)
# otherwise, a key is being read
else:
keys.append(line)
count += 1
# Add to dictionary
# perhaps some error checking here would be good
for key, value in zip(keys, values):
data[key] = value
return data
def replace_word(ab_dict, sentence):
"""
Replaces sentence with words found in dictionary
"""
# not necessarily words, but you get the idea
words = re.findall(r"[\w']+|[.,!?; ]", sentence)
new_words = []
for word in words:
# if word is in dictionary, replace it and add it to resulting list
if word in ab_dict:
new_words.append(ab_dict[word])
# otherwise add it as normally
else:
new_words.append(word)
# return sentence with replaced words
return "".join(x for x in new_words)
def main():
ab_dict = read_file_dict("abreviations.txt")
print(ab_dict)
print(ab_dict['ACE'])
print(ab_dict['Ace'])
print(ab_dict['ace'])
print(replace_word(ab_dict, "The ACE is not easy to understand"))
if __name__ == '__main__':
main()
Which outputs:
{'ACE': 'Access Control Entry', 'ACK': 'Acknowledgement', 'ACORN': 'A Completely Obsessive Really Nutty person'}
Access Control Entry
Access Control Entry
Access Control Entry
The Access Control Entry is not easy to understand
I have a program that reads an XML document from a socket. I have the XML document stored in a string which I would like to convert directly to a Python dictionary, the same way it is done in Django's simplejson library.
Take as an example:
str ="<?xml version="1.0" ?><person><name>john</name><age>20</age></person"
dic_xml = convert_to_dic(str)
Then dic_xml would look like {'person' : { 'name' : 'john', 'age' : 20 } }
xmltodict (full disclosure: I wrote it) does exactly that:
xmltodict.parse("""
<?xml version="1.0" ?>
<person>
<name>john</name>
<age>20</age>
</person>""")
# {u'person': {u'age': u'20', u'name': u'john'}}
This is a great module that someone created. I've used it several times.
http://code.activestate.com/recipes/410469-xml-as-dictionary/
Here is the code from the website just in case the link goes bad.
from xml.etree import cElementTree as ElementTree
class XmlListConfig(list):
def __init__(self, aList):
for element in aList:
if element:
# treat like dict
if len(element) == 1 or element[0].tag != element[1].tag:
self.append(XmlDictConfig(element))
# treat like list
elif element[0].tag == element[1].tag:
self.append(XmlListConfig(element))
elif element.text:
text = element.text.strip()
if text:
self.append(text)
class XmlDictConfig(dict):
'''
Example usage:
>>> tree = ElementTree.parse('your_file.xml')
>>> root = tree.getroot()
>>> xmldict = XmlDictConfig(root)
Or, if you want to use an XML string:
>>> root = ElementTree.XML(xml_string)
>>> xmldict = XmlDictConfig(root)
And then use xmldict for what it is... a dict.
'''
def __init__(self, parent_element):
if parent_element.items():
self.update(dict(parent_element.items()))
for element in parent_element:
if element:
# treat like dict - we assume that if the first two tags
# in a series are different, then they are all different.
if len(element) == 1 or element[0].tag != element[1].tag:
aDict = XmlDictConfig(element)
# treat like list - we assume that if the first two tags
# in a series are the same, then the rest are the same.
else:
# here, we put the list in dictionary; the key is the
# tag name the list elements all share in common, and
# the value is the list itself
aDict = {element[0].tag: XmlListConfig(element)}
# if the tag has attributes, add those to the dict
if element.items():
aDict.update(dict(element.items()))
self.update({element.tag: aDict})
# this assumes that if you've got an attribute in a tag,
# you won't be having any text. This may or may not be a
# good idea -- time will tell. It works for the way we are
# currently doing XML configuration files...
elif element.items():
self.update({element.tag: dict(element.items())})
# finally, if there are no child tags and no attributes, extract
# the text
else:
self.update({element.tag: element.text})
Example usage:
tree = ElementTree.parse('your_file.xml')
root = tree.getroot()
xmldict = XmlDictConfig(root)
//Or, if you want to use an XML string:
root = ElementTree.XML(xml_string)
xmldict = XmlDictConfig(root)
The following XML-to-Python-dict snippet parses entities as well as attributes following this XML-to-JSON "specification". It is the most general solution handling all cases of XML.
from collections import defaultdict
def etree_to_dict(t):
d = {t.tag: {} if t.attrib else None}
children = list(t)
if children:
dd = defaultdict(list)
for dc in map(etree_to_dict, children):
for k, v in dc.items():
dd[k].append(v)
d = {t.tag: {k:v[0] if len(v) == 1 else v for k, v in dd.items()}}
if t.attrib:
d[t.tag].update(('#' + k, v) for k, v in t.attrib.items())
if t.text:
text = t.text.strip()
if children or t.attrib:
if text:
d[t.tag]['#text'] = text
else:
d[t.tag] = text
return d
It is used:
from xml.etree import cElementTree as ET
e = ET.XML('''
<root>
<e />
<e>text</e>
<e name="value" />
<e name="value">text</e>
<e> <a>text</a> <b>text</b> </e>
<e> <a>text</a> <a>text</a> </e>
<e> text <a>text</a> </e>
</root>
''')
from pprint import pprint
pprint(etree_to_dict(e))
The output of this example (as per above-linked "specification") should be:
{'root': {'e': [None,
'text',
{'#name': 'value'},
{'#text': 'text', '#name': 'value'},
{'a': 'text', 'b': 'text'},
{'a': ['text', 'text']},
{'#text': 'text', 'a': 'text'}]}}
Not necessarily pretty, but it is unambiguous, and simpler XML inputs result in simpler JSON. :)
Update
If you want to do the reverse, emit an XML string from a JSON/dict, you can use:
try:
basestring
except NameError: # python3
basestring = str
def dict_to_etree(d):
def _to_etree(d, root):
if not d:
pass
elif isinstance(d, basestring):
root.text = d
elif isinstance(d, dict):
for k,v in d.items():
assert isinstance(k, basestring)
if k.startswith('#'):
assert k == '#text' and isinstance(v, basestring)
root.text = v
elif k.startswith('#'):
assert isinstance(v, basestring)
root.set(k[1:], v)
elif isinstance(v, list):
for e in v:
_to_etree(e, ET.SubElement(root, k))
else:
_to_etree(v, ET.SubElement(root, k))
else:
raise TypeError('invalid type: ' + str(type(d)))
assert isinstance(d, dict) and len(d) == 1
tag, body = next(iter(d.items()))
node = ET.Element(tag)
_to_etree(body, node)
return ET.tostring(node)
pprint(dict_to_etree(d))
This lightweight version, while not configurable, is pretty easy to tailor as needed, and works in old pythons. Also it is rigid - meaning the results are the same regardless of the existence of attributes.
import xml.etree.ElementTree as ET
from copy import copy
def dictify(r,root=True):
if root:
return {r.tag : dictify(r, False)}
d=copy(r.attrib)
if r.text:
d["_text"]=r.text
for x in r.findall("./*"):
if x.tag not in d:
d[x.tag]=[]
d[x.tag].append(dictify(x,False))
return d
So:
root = ET.fromstring("<erik><a x='1'>v</a><a y='2'>w</a></erik>")
dictify(root)
Results in:
{'erik': {'a': [{'x': '1', '_text': 'v'}, {'y': '2', '_text': 'w'}]}}
Disclaimer:
This modified XML parser was inspired by Adam Clark
The original XML parser works for most of simple cases. However, it didn't work for some complicated XML files. I debugged the code line by line and finally fixed some issues. If you find some bugs, please let me know. I am glad to fix it.
class XmlDictConfig(dict):
'''
Note: need to add a root into if no exising
Example usage:
>>> tree = ElementTree.parse('your_file.xml')
>>> root = tree.getroot()
>>> xmldict = XmlDictConfig(root)
Or, if you want to use an XML string:
>>> root = ElementTree.XML(xml_string)
>>> xmldict = XmlDictConfig(root)
And then use xmldict for what it is... a dict.
'''
def __init__(self, parent_element):
if parent_element.items():
self.updateShim( dict(parent_element.items()) )
for element in parent_element:
if len(element):
aDict = XmlDictConfig(element)
# if element.items():
# aDict.updateShim(dict(element.items()))
self.updateShim({element.tag: aDict})
elif element.items(): # items() is specialy for attribtes
elementattrib= element.items()
if element.text:
elementattrib.append((element.tag,element.text )) # add tag:text if there exist
self.updateShim({element.tag: dict(elementattrib)})
else:
self.updateShim({element.tag: element.text})
def updateShim (self, aDict ):
for key in aDict.keys(): # keys() includes tag and attributes
if key in self:
value = self.pop(key)
if type(value) is not list:
listOfDicts = []
listOfDicts.append(value)
listOfDicts.append(aDict[key])
self.update({key: listOfDicts})
else:
value.append(aDict[key])
self.update({key: value})
else:
self.update({key:aDict[key]}) # it was self.update(aDict)
The most recent versions of the PicklingTools libraries (1.3.0 and 1.3.1) support tools for converting from XML to a Python dict.
The download is available here: PicklingTools 1.3.1
There is quite a bit of documentation for the converters here: the documentation describes in detail all of the decisions and issues that will arise when converting between XML and Python dictionaries (there are a number of edge cases: attributes, lists, anonymous lists, anonymous dicts, eval, etc. that most converters don't handle). In general, though,
the converters are easy to use. If an 'example.xml' contains:
<top>
<a>1</a>
<b>2.2</b>
<c>three</c>
</top>
Then to convert it to a dictionary:
>>> from xmlloader import *
>>> example = file('example.xml', 'r') # A document containing XML
>>> xl = StreamXMLLoader(example, 0) # 0 = all defaults on operation
>>> result = xl.expect XML()
>>> print result
{'top': {'a': '1', 'c': 'three', 'b': '2.2'}}
There are tools for converting in both C++ and Python: the C++ and Python do indentical conversion, but the C++ is about 60x faster
You can do this quite easily with lxml. First install it:
[sudo] pip install lxml
Here is a recursive function I wrote that does the heavy lifting for you:
from lxml import objectify as xml_objectify
def xml_to_dict(xml_str):
""" Convert xml to dict, using lxml v3.4.2 xml processing library """
def xml_to_dict_recursion(xml_object):
dict_object = xml_object.__dict__
if not dict_object:
return xml_object
for key, value in dict_object.items():
dict_object[key] = xml_to_dict_recursion(value)
return dict_object
return xml_to_dict_recursion(xml_objectify.fromstring(xml_str))
xml_string = """<?xml version="1.0" encoding="UTF-8"?><Response><NewOrderResp>
<IndustryType>Test</IndustryType><SomeData><SomeNestedData1>1234</SomeNestedData1>
<SomeNestedData2>3455</SomeNestedData2></SomeData></NewOrderResp></Response>"""
print xml_to_dict(xml_string)
The below variant preserves the parent key / element:
def xml_to_dict(xml_str):
""" Convert xml to dict, using lxml v3.4.2 xml processing library, see http://lxml.de/ """
def xml_to_dict_recursion(xml_object):
dict_object = xml_object.__dict__
if not dict_object: # if empty dict returned
return xml_object
for key, value in dict_object.items():
dict_object[key] = xml_to_dict_recursion(value)
return dict_object
xml_obj = objectify.fromstring(xml_str)
return {xml_obj.tag: xml_to_dict_recursion(xml_obj)}
If you want to only return a subtree and convert it to dict, you can use Element.find() to get the subtree and then convert it:
xml_obj.find('.//') # lxml.objectify.ObjectifiedElement instance
See the lxml docs here. I hope this helps!
I wrote a simple recursive function to do the job:
from xml.etree import ElementTree
root = ElementTree.XML(xml_to_convert)
def xml_to_dict_recursive(root):
if len(root.getchildren()) == 0:
return {root.tag:root.text}
else:
return {root.tag:list(map(xml_to_dict_recursive, root.getchildren()))}
def xml_to_dict(node):
u'''
#param node:lxml_node
#return: dict
'''
return {'tag': node.tag, 'text': node.text, 'attrib': node.attrib, 'children': {child.tag: xml_to_dict(child) for child in node}}
The code from http://code.activestate.com/recipes/410469-xml-as-dictionary/ works well, but if there are multiple elements that are the same at a given place in the hierarchy it just overrides them.
I added a shim between that looks to see if the element already exists before self.update(). If so, pops the existing entry and creates a lists out of the existing and the new. Any subsequent duplicates are added to the list.
Not sure if this can be handled more gracefully, but it works:
import xml.etree.ElementTree as ElementTree
class XmlDictConfig(dict):
def __init__(self, parent_element):
if parent_element.items():
self.updateShim(dict(parent_element.items()))
for element in parent_element:
if len(element):
aDict = XmlDictConfig(element)
if element.items():
aDict.updateShim(dict(element.items()))
self.updateShim({element.tag: aDict})
elif element.items():
self.updateShim({element.tag: dict(element.items())})
else:
self.updateShim({element.tag: element.text.strip()})
def updateShim (self, aDict ):
for key in aDict.keys():
if key in self:
value = self.pop(key)
if type(value) is not list:
listOfDicts = []
listOfDicts.append(value)
listOfDicts.append(aDict[key])
self.update({key: listOfDicts})
else:
value.append(aDict[key])
self.update({key: value})
else:
self.update(aDict)
#dibrovsd: Solution will not work if the xml have more than one tag with same name
On your line of thought, I have modified the code a bit and written it for general node instead of root:
from collections import defaultdict
def xml2dict(node):
d, count = defaultdict(list), 1
for i in node:
d[i.tag + "_" + str(count)]['text'] = i.findtext('.')[0]
d[i.tag + "_" + str(count)]['attrib'] = i.attrib # attrib gives the list
d[i.tag + "_" + str(count)]['children'] = xml2dict(i) # it gives dict
return d
From #K3---rnc response (the best for me) I've added a small modifications to get an OrderedDict from an XML text (some times order matters):
def etree_to_ordereddict(t):
d = OrderedDict()
d[t.tag] = OrderedDict() if t.attrib else None
children = list(t)
if children:
dd = OrderedDict()
for dc in map(etree_to_ordereddict, children):
for k, v in dc.iteritems():
if k not in dd:
dd[k] = list()
dd[k].append(v)
d = OrderedDict()
d[t.tag] = OrderedDict()
for k, v in dd.iteritems():
if len(v) == 1:
d[t.tag][k] = v[0]
else:
d[t.tag][k] = v
if t.attrib:
d[t.tag].update(('#' + k, v) for k, v in t.attrib.iteritems())
if t.text:
text = t.text.strip()
if children or t.attrib:
if text:
d[t.tag]['#text'] = text
else:
d[t.tag] = text
return d
Following #K3---rnc example, you can use it:
from xml.etree import cElementTree as ET
e = ET.XML('''
<root>
<e />
<e>text</e>
<e name="value" />
<e name="value">text</e>
<e> <a>text</a> <b>text</b> </e>
<e> <a>text</a> <a>text</a> </e>
<e> text <a>text</a> </e>
</root>
''')
from pprint import pprint
pprint(etree_to_ordereddict(e))
Hope it helps ;)
An alternative (builds a lists for the same tags in hierarchy):
from xml.etree import cElementTree as ElementTree
def xml_to_dict(xml, result):
for child in xml:
if len(child) == 0:
result[child.tag] = child.text
else:
if child.tag in result:
if not isinstance(result[child.tag], list):
result[child.tag] = [result[child.tag]]
result[child.tag].append(xml_to_dict(child, {}))
else:
result[child.tag] = xml_to_dict(child, {})
return result
xmlTree = ElementTree.parse('my_file.xml')
xmlRoot = xmlTree.getroot()
dictRoot = xml_to_dict(xmlRoot, {})
result = {xmlRoot.tag: dictRoot}
Here's a link to an ActiveState solution - and the code in case it disappears again.
==================================================
xmlreader.py:
==================================================
from xml.dom.minidom import parse
class NotTextNodeError:
pass
def getTextFromNode(node):
"""
scans through all children of node and gathers the
text. if node has non-text child-nodes, then
NotTextNodeError is raised.
"""
t = ""
for n in node.childNodes:
if n.nodeType == n.TEXT_NODE:
t += n.nodeValue
else:
raise NotTextNodeError
return t
def nodeToDic(node):
"""
nodeToDic() scans through the children of node and makes a
dictionary from the content.
three cases are differentiated:
- if the node contains no other nodes, it is a text-node
and {nodeName:text} is merged into the dictionary.
- if the node has the attribute "method" set to "true",
then it's children will be appended to a list and this
list is merged to the dictionary in the form: {nodeName:list}.
- else, nodeToDic() will call itself recursively on
the nodes children (merging {nodeName:nodeToDic()} to
the dictionary).
"""
dic = {}
for n in node.childNodes:
if n.nodeType != n.ELEMENT_NODE:
continue
if n.getAttribute("multiple") == "true":
# node with multiple children:
# put them in a list
l = []
for c in n.childNodes:
if c.nodeType != n.ELEMENT_NODE:
continue
l.append(nodeToDic(c))
dic.update({n.nodeName:l})
continue
try:
text = getTextFromNode(n)
except NotTextNodeError:
# 'normal' node
dic.update({n.nodeName:nodeToDic(n)})
continue
# text node
dic.update({n.nodeName:text})
continue
return dic
def readConfig(filename):
dom = parse(filename)
return nodeToDic(dom)
def test():
dic = readConfig("sample.xml")
print dic["Config"]["Name"]
print
for item in dic["Config"]["Items"]:
print "Item's Name:", item["Name"]
print "Item's Value:", item["Value"]
test()
==================================================
sample.xml:
==================================================
<?xml version="1.0" encoding="UTF-8"?>
<Config>
<Name>My Config File</Name>
<Items multiple="true">
<Item>
<Name>First Item</Name>
<Value>Value 1</Value>
</Item>
<Item>
<Name>Second Item</Name>
<Value>Value 2</Value>
</Item>
</Items>
</Config>
==================================================
output:
==================================================
My Config File
Item's Name: First Item
Item's Value: Value 1
Item's Name: Second Item
Item's Value: Value 2
At one point I had to parse and write XML that only consisted of elements without attributes so a 1:1 mapping from XML to dict was possible easily. This is what I came up with in case someone else also doesnt need attributes:
def xmltodict(element):
if not isinstance(element, ElementTree.Element):
raise ValueError("must pass xml.etree.ElementTree.Element object")
def xmltodict_handler(parent_element):
result = dict()
for element in parent_element:
if len(element):
obj = xmltodict_handler(element)
else:
obj = element.text
if result.get(element.tag):
if hasattr(result[element.tag], "append"):
result[element.tag].append(obj)
else:
result[element.tag] = [result[element.tag], obj]
else:
result[element.tag] = obj
return result
return {element.tag: xmltodict_handler(element)}
def dicttoxml(element):
if not isinstance(element, dict):
raise ValueError("must pass dict type")
if len(element) != 1:
raise ValueError("dict must have exactly one root key")
def dicttoxml_handler(result, key, value):
if isinstance(value, list):
for e in value:
dicttoxml_handler(result, key, e)
elif isinstance(value, basestring):
elem = ElementTree.Element(key)
elem.text = value
result.append(elem)
elif isinstance(value, int) or isinstance(value, float):
elem = ElementTree.Element(key)
elem.text = str(value)
result.append(elem)
elif value is None:
result.append(ElementTree.Element(key))
else:
res = ElementTree.Element(key)
for k, v in value.items():
dicttoxml_handler(res, k, v)
result.append(res)
result = ElementTree.Element(element.keys()[0])
for key, value in element[element.keys()[0]].items():
dicttoxml_handler(result, key, value)
return result
def xmlfiletodict(filename):
return xmltodict(ElementTree.parse(filename).getroot())
def dicttoxmlfile(element, filename):
ElementTree.ElementTree(dicttoxml(element)).write(filename)
def xmlstringtodict(xmlstring):
return xmltodict(ElementTree.fromstring(xmlstring).getroot())
def dicttoxmlstring(element):
return ElementTree.tostring(dicttoxml(element))
I have modified one of the answers to my taste and to work with multiple values with the same tag for example consider the following xml code saved in XML.xml file
<A>
<B>
<BB>inAB</BB>
<C>
<D>
<E>
inABCDE
</E>
<E>value2</E>
<E>value3</E>
</D>
<inCout-ofD>123</inCout-ofD>
</C>
</B>
<B>abc</B>
<F>F</F>
</A>
and in python
import xml.etree.ElementTree as ET
class XMLToDictionary(dict):
def __init__(self, parentElement):
self.parentElement = parentElement
for child in list(parentElement):
child.text = child.text if (child.text != None) else ' '
if len(child) == 0:
self.update(self._addToDict(key= child.tag, value = child.text.strip(), dict = self))
else:
innerChild = XMLToDictionary(parentElement=child)
self.update(self._addToDict(key=innerChild.parentElement.tag, value=innerChild, dict=self))
def getDict(self):
return {self.parentElement.tag: self}
class _addToDict(dict):
def __init__(self, key, value, dict):
if not key in dict:
self.update({key: value})
else:
identical = dict[key] if type(dict[key]) == list else [dict[key]]
self.update({key: identical + [value]})
tree = ET.parse('./XML.xml')
root = tree.getroot()
parseredDict = XMLToDictionary(root).getDict()
print(parseredDict)
the output is
{'A': {'B': [{'BB': 'inAB', 'C': {'D': {'E': ['inABCDE', 'value2', 'value3']}, 'inCout-ofD': '123'}}, 'abc'], 'F': 'F'}}
Updated method posted by firelion.cis (since getchildren is deprecated):
from xml.etree import ElementTree
root = ElementTree.XML(xml_to_convert)
def xml_to_dict_recursive(root):
if len(list(root)) == 0:
return {root.tag:root.text}
else:
return {root.tag:list(map(xml_to_dict_recursive, list(root)))}
import xml.etree.ElementTree as ET
root = ET.parse(xml_filepath).getroot()
def parse_xml(node):
ans = {}
for child in node:
if len(child) == 0:
ans[child.tag] = child.text
elif child.tag not in ans:
ans[child.tag] = parse_xml(child)
elif not isinstance(ans[child.tag], list):
ans[child.tag] = [ans[child.tag]]
ans[child.tag].append(parse_xml(child))
else:
ans[child.tag].append(parse_xml(child))
return ans
it merges same field into list and squeezes fields containing one child.
Super simple code
#Follow this, its easy and nothing required, convert the XML into a string and use the find command to find the word that you are looking for as following
#hope this is easy and simple
def xml_key(key, text1):
tx1 = "<" + key + ">"
tx2 = "</" + key + ">"
tx = text1.find(tx1)
ty = text1.find(tx2)
tx = tx + len(tx1)
tw = text1[tx:ty]
return(tw)
text1 = "<person><name>john</name><age>20</age></person>"
dict1 = {"name": xml_key("name",text1),"age":xml_key("age",text1)}
print(dict1)
output :
{'name': 'john'}
I have a recursive method to get a dictionary from a lxml element
def recursive_dict(element):
return (element.tag.split('}')[1],
dict(map(recursive_dict, element.getchildren()),
**element.attrib))
I have created a simple GET only api for my Django application using tastypie. I need to deliver flat-tabular CSV data, but my database structure is normalized. Per the documentation I have implement a customized Serializer class with a to_csv() method as below.
def to_csv(self, data, options=None):
options = options or {}
data = self.to_simple(data, options)
raw_data = StringIO.StringIO()
writer = csv.writer(raw_data, quotechar="'", quoting=csv.QUOTE_NONNUMERIC)
if "meta" in data.keys():#if multiple objects are returned
objects = data.get("objects")
writer.writerow(objects[0].keys())
for object in objects:
test = object.values()
writer.writerow(test)
else:
writer.writerow(data.values())
CSVContent=raw_data.getvalue()
return CSVContent
This works great, except any resources get rendered by default as JSON (when I include full = True in the ModelResource ForeignKey specification), so I wind up with CSV data containing nested JSON data that looks like this.
foodID,foodName,related_details
1,"apricot","{'type':'fruit', 'cost':'medium'}"
2,"beef","{'type':'animal', 'cost':'high'}"
3,"celery","{'type':'vegetable', 'cost':'low'}"
My desired output is
foodID,foodName,type,cost
1,"apricot","fruit","medium"
2,"beef","animal","high"
3,"celery","vegetable","low"
I have an idea that I will need to apply my serializer recursively, and then to combine the results before writing to CSV, but have so far been unsuccessful.
def to_csv(self, data, options=None):
options = options or {}
data = self.to_simple(data, options)
raw_data = StringIO.StringIO()
first = True
if "meta" in data.keys():#if multiple objects are returned
objects = data.get("objects")
for value in objects:
test = {}
self.flatten(value, test)
if first:
writer = csv.DictWriter(raw_data, test.keys(), quotechar="'", quoting=csv.QUOTE_NONNUMERIC)
writer.writeheader()
writer.writerow(test)
first=False
else:
writer.writerow(test)
else:
test = {}
self.flatten(data, test)
if first:
writer = csv.DictWriter(raw_data, test.keys(), quotechar="'", quoting=csv.QUOTE_NONNUMERIC)
writer.writeheader()
writer.writerow(test)
first=False
else:
writer.writerow(test)
CSVContent=raw_data.getvalue()
return CSVContent
def flatten(self, data, odict = {}):
if isinstance(data, list):
for value in data:
self.flatten(value, odict)
elif isinstance(data, dict):
for (key, value) in data.items():
if not isinstance(value, (dict, list)):
odict[key] = value
else:
self.flatten(value, odict)
You can use the following function to_list on each line you have:
def to_list(line):
idx = -1
for i, l in enumerate(line):
if type(l) is str and '{' in l and '}' in l:
idx = i
break
if idx != -1:
result = line[:idx] + eval(line[idx]).values() + line[idx+1:]
else:
result = line
return result
if __name__ == "__main__":
lst = [[1,"apricot","{'type':'fruit', 'cost':'medium'}"],
["beef","{'type':'animal', 'cost':'high'}", 3],
["meat", "sugar"],
["{'type':'car', 'cost':'nothing'}", "something"]]
for line in lst:
print to_list(line)
So for the following lists:
1, "apricot", "{'type':'fruit', 'cost':'medium'}"
"beef", "{'type':'animal', 'cost':'high'}", 3
"meat", "sugar"
"{'type':'car', 'cost':'nothing'}", "something"
You will get:
1, 'apricot', 'medium', 'fruit'
'beef', 'high', 'animal', 3
'meat', 'sugar'
'nothing', 'car', 'something'
As you can see it is not depending on the number of elements neither on the position of the JSON string.