Identify if yaml key is anchor or pointer - python

I use ruamel.yaml in order to parse YAML files and I'd like to identify if the key is the anchor itself or just a pointer. Given the following:
foo: &some_anchor
  bar: 1
baz: *some_anchor
I'd like to understand that foo is the actual anchor and baz is a pointer. From what I can see, there's an anchor property on the node (and also yaml_anchor method), but both baz and foo show that their anchor is some_anchor - meaning that I cannot differentiate.
How can I get this info?

Since PyYAML and ruamel.yaml load an alias node as a reference to the object loaded from the corresponding anchor node, you can traverse the object tree and check whether each node is a reference to a previously visited object.
The following is a simple example that only checks dictionaries.
from ruamel.yaml import YAML

root = YAML().load('''
foo: &some_anchor
  bar: 1
baz: *some_anchor
''')

dict_ids = set()

def visit(parent):
    if isinstance(parent, dict):
        i = id(parent)
        print(parent, ', is_alias:', i in dict_ids)
        dict_ids.add(i)
        for k, v in parent.items():
            visit(v)
    elif isinstance(parent, list):
        for e in parent:
            visit(e)

visit(root)
This will output the following.
ordereddict([('foo', ordereddict([('bar', 1)])), ('baz', ordereddict([('bar', 1)]))]) , is_alias: False
ordereddict([('bar', 1)]) , is_alias: False
ordereddict([('bar', 1)]) , is_alias: True

In your example &some_anchor is the anchor for the single-element mapping bar: 1, and
*some_anchor is the alias. Writing "foo is the actual anchor and baz is a pointer" is
IMO both incorrect terminology and a confusion of keys with their (anchored/aliased) values. If you had a YAML document:
- 3
- 5
- 9
- &some_anchor
  bar: 1
- 42
- *some_anchor
would you actually say, probably after carefully counting,
that 4 is the anchor and 6 is the pointer (or 3 and 5, depending on
where you start counting)?
If you want to test whether a key of a dict has a value that was an anchored node in YAML, or whether that
value was an aliased node, you'll have to look at the value, and you'll find that it is the same Python data structure
for the keys foo and baz.
Which key's value gets the anchor on dumping, and which key's (or keys') value(s) are dumped as an alias,
is entirely determined by which gets dumped first, as the YAML specification states that an anchor has to come before its use as an alias (an
anchor can come after an alias if it is re-defined).
As @relent95 describes, you should recursively walk over the
data structure you loaded (to see which key gets there first) and, in both ruamel.yaml and PyYAML, look at the id().
But for PyYAML that only works for complex data (dict, list, objects), as it throws away anchoring information and will
not find the same id() on e.g. an anchored integer value.
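For example, in ruamel.yaml the anchored value and the aliased value are literally the same object, so an identity check works (a minimal sketch using the document from the question):
from ruamel.yaml import YAML

data = YAML().load('''\
foo: &some_anchor
  bar: 1
baz: *some_anchor
''')
print(data['foo'] is data['baz'])            # True: the alias loads as the same object
print(id(data['foo']) == id(data['baz']))    # True: the same check expressed with id()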
The alternative to using id() is to look at the actual anchor name that ruamel.yaml stores in the attribute/property anchor.
If you know up front that your YAML document is as simple as your example (anchored/aliased nodes are values of
the root-level mapping), you can do:
import sys
import ruamel.yaml

yaml_str = """\
foo: &some_anchor
  bar: 1
baz: *some_anchor
oof: 42
"""

def is_keys_value_anchor(key, data, verbose=0):
    anchor_found = set()
    for k, v in data.items():
        res = None
        try:
            anchor = v.anchor.value
            if anchor is not None:
                res = anchor not in anchor_found
                anchor_found.add(anchor)
        except AttributeError:
            pass
        if k == key:
            break
    if verbose > 0:
        print(f'key "{key}" {res}')
    return res

yaml = ruamel.yaml.YAML()
data = yaml.load(yaml_str)
is_keys_value_anchor('foo', data, verbose=1)
is_keys_value_anchor('baz', data, verbose=1)
is_keys_value_anchor('oof', data, verbose=1)
which gives:
key "foo" True
key "baz" False
key "oof" None
But this is inefficient for root mappings with lots of keys, and it won't find anchors/aliases that are nested deeply
in the document. A more generic approach is to recursively walk the data structure once and create a dict with
the anchor used as key, and as value a list of "paths". A path itself is a list of keys/indices with which
you can traverse the data structure starting at the root, the first path in the list being the anchor, the rest aliases:
import sys
import ruamel.yaml

yaml_str = """\
foo: &some_anchor
- bar: 1
- klm: &anchored_num 42
baz:
  xyz:
  - *some_anchor
oof: [1, 2, c: 13, magic: [*anchored_num]]
"""

def find_anchor_alias_paths(data, path=None, res=None):
    def check_add_anchor(d, path, anchors):
        # returns False when an alias is found, to prevent recursing into a node twice.
        try:
            anchor = d.anchor.value
            if anchor is not None:
                tmp = anchors.setdefault(anchor, [])
                tmp.append(path)
                return len(tmp) == 1
        except AttributeError:
            pass
        return True

    if path is None:
        path = []
    if res is None:
        res = {}
    if isinstance(data, dict):
        for k, v in data.items():
            next_path = path.copy()
            next_path.append(k)
            if check_add_anchor(v, next_path, res):
                find_anchor_alias_paths(v, next_path, res)
    elif isinstance(data, list):
        for idx, elem in enumerate(data):
            next_path = path.copy()
            next_path.append(idx)
            if check_add_anchor(elem, next_path, res):
                find_anchor_alias_paths(elem, next_path, res)
    return res

yaml = ruamel.yaml.YAML()
data = yaml.load(yaml_str)
anchor_alias_paths = find_anchor_alias_paths(data)
for anchor, paths in anchor_alias_paths.items():
    print(f'anchor: "{anchor}", anchor_path: {paths[0]}, alias_path(s): {paths[1:]}')
print('value for last anchor/alias found', data.mlget(paths[-1], list_ok=True))
which gives:
anchor: "some_anchor", anchor_path: ['foo'], alias_path(s): [['baz', 'xyz', 0]]
anchor: "anchored_num", anchor_path: ['foo', 1, 'klm'], alias_path(s): [['oof', 3, 'magic', 0]]
value for last anchor/alias found 42
You can then test the paths you are interested in against the values returned by find_anchor_alias_paths,
or test the key against the final elements of such paths.
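For example, a small helper on top of that result can classify a path (a sketch; anchor_alias_paths is the dict returned above, whose first path per anchor is the anchor itself):
def classify_path(anchor_alias_paths, path):
    # 'anchor' if this is where the anchor was defined, 'alias' if it points to an alias, else None
    for anchor, paths in anchor_alias_paths.items():
        if path == paths[0]:
            return 'anchor'
        if path in paths[1:]:
            return 'alias'
    return None

print(classify_path(anchor_alias_paths, ['foo']))              # anchor
print(classify_path(anchor_alias_paths, ['baz', 'xyz', 0]))    # alias
print(classify_path(anchor_alias_paths, ['oof']))              # None (no anchor/alias involved)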

Related

Keeping comments in ruamel.yaml

I'm trying to manipulate some YAML files using ruamel.yaml, notably removing specific keys. This seems to work, but in the process all comment lines and empty lines that follow the key, up to the next key, are also removed. Minimal example:
from ruamel.yaml import YAML
import sys
yaml_str = """\
# Our app configuration
# foo is our main variable
foo: bar
# foz is also important
foz: quz
# And finally we have our optional configs.
# These are not really mandatory, they can be
# considere as "would be nice".
opt1: foqz
"""
yaml = YAML(typ='rt')
data = yaml.load(yaml_str)
data.pop('foz', None)
yaml.dump(data, sys.stdout)
which gives:
# Our app configuration
# foo is our main variable
foo: bar
# foz is also important
opt1: foqz
Is there maybe a way to avoid this and only remove the key itself, and any inline comment?
The ruamel.yaml documentation states
This preservation is normally not broken unless you severely alter
the structure of a component (delete a key in a dict, remove list entries).
so this behavior should not have come as a surprise.
What you can do is
extend the comment on the key before the one you delete with the comment on the key
you are going to delete (you can actually do it afterwards, as the comment is
still there, there is just no longer a key while dumping to associate it with).
import sys
import ruamel.yaml
yaml_str = """\
# Our app configuration
# foo is our main variable
foo: bar
# foz is also important
foz: quz
# And finally we have our optional configs.
# These are not really mandatory, they can be
# considere as "would be nice".
opt1: foqz
"""
undefined = object()
def my_pop(self, key, default=undefined):
    if key not in self:
        if default is undefined:
            raise KeyError(key)
        return default
    keys = list(self.keys())
    idx = keys.index(key)
    if key in self.ca.items:
        if idx == 0:
            raise NotImplementedError('cannot handle moving comment when popping the first key', key)
        prev = keys[idx-1]
        # print('prev', prev, self.ca)
        comment = self.ca.items.pop(key)[2]
        if prev in self.ca.items:
            self.ca.items[prev][2].value += comment.value
        else:
            self.ca.items[prev] = self.ca.items.pop(key)
    res = self.__getitem__(key)
    self.__delitem__(key)
    return res
ruamel.yaml.comments.CommentedMap.pop = my_pop
yaml = ruamel.yaml.YAML()
data = yaml.load(yaml_str)
data.pop('foz', None)
yaml.dump(data, sys.stdout)
which gives:
# Our app configuration
# foo is our main variable
foo: bar
# foz is also important
# And finally we have our optional configs.
# These are not really mandatory, they can be
# considere as "would be nice".
opt1: foqz
If you need to be able to pop the first key in a commented map, you need to inspect self.ca
and handle its comment attribute, which is somewhat more complicated.
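If you want to explore that, here is a small inspection sketch (it just prints the internal comment bookkeeping the code above manipulates; the attribute layout may differ between ruamel.yaml versions):
import ruamel.yaml

yaml = ruamel.yaml.YAML()
data = yaml.load(yaml_str)   # yaml_str as defined above
print(data.ca)               # the Comment object attached to the root mapping
print(data.ca.items)         # per-key comment tokens (what my_pop moves around)
print(data.ca.comment)       # comments not associated with a specific key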
As always when using this kind of internals, you should pin the version of ruamel.yaml that you are using;
these internals will change.

is this a lambda in python? [duplicate]

This question already has answers here:
What are type hints in Python 3.5?
(5 answers)
What does -> mean in Python function definitions?
(11 answers)
Closed 2 years ago.
I am using Python 3.7 and I have just started my own open-source project. Some time ago a very skilled software developer decided to help, but then he didn't have enough time to continue, so I am taking his work back to develop new features for the project. He designed a script to manage reading text from PDF and DOC files. He developed it very well, but there is something I don't understand:
@classmethod
def extract_document_data(cls, file_path: str) -> DocumentData:
    """
    Entry point of the module, it extracts the data from the document
    whose path is passed as input.
    The extraction strategy is automatically chosen based on the MIME type
    of the file.
    @type file_path: str
    @param file_path: The path of the document to be parsed.
    @rtype: DocumentData
    @returns: An object containing the data of the parsed document.
    """
    mime = magic.Magic(mime=True)
    mime_type = mime.from_file(file_path)
    document_type = DocumentType.get_instance(mime_type)
    strategy = cls.strategies[document_type]
    return strategy.extract_document_data(file_path)
This: -> DocumentData is very obscure to me; if it were a lambda it should be included in the method's arguments as a callback, shouldn't it? What meaning does it have in this position?
I can paste the whole class if you need a more verbose insight:
from enum import Enum
import json
import magic
import docx
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams, LTContainer, LTTextContainer
from pdfminer.pdfdocument import PDFDocument, PDFNoOutlines
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser
class DocumentType(Enum):
"""
Defines the handled document types.
Each value is associated to a MIME type.
"""
def __init__(self, mime_type):
self.mime_type = mime_type
@classmethod
def get_instance(cls, mime_type : str):
values = [e for e in cls]
for value in values:
if value.mime_type == mime_type:
return value
raise MimeNotValidError(mime_type)
PDF = 'application/pdf'
DOCX = 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
class MimeNotValidError(Exception):
"""
Exception to be raised when a not valid MIME type is processed.
"""
pass
class DocumentData:
"""
Wrapper for the extracted document data (TOC and contents).
"""
def __init__(self, toc : list = [], pages : list = [], document_text : str = None):
self.toc = toc
self.pages = pages
if document_text is not None:
self.document_text = document_text
else:
self.document_text = ' '.join([page.replace('\n', ' ') for page in pages])
def toc_as_json(self) -> str:
return json.dumps(self.toc)
class ExtractionStrategy:
"""
Base class for the extraction strategies.
"""
@staticmethod
def extract_document_data(file_path : str) -> DocumentData:
pass
class DOCXExtractionStrategy(ExtractionStrategy):
"""
It implements the TOC and contents extraction from a DOCX document.
"""
@staticmethod
def extract_document_data(file_path : str) -> DocumentData:
document = docx.Document(file_path)
body_elements = document._body._body
# Selecting only the <w:t> elements from DOCX XML,
# as they're the only to contain some text.
text_elems = body_elements.xpath('.//w:t')
return DocumentData(document_text = ' '.join([elem.text for elem in text_elems]))
class PDFExtractionStrategy(ExtractionStrategy):
"""
It implements the TOC and contents extraction from a PDF document.
"""
@staticmethod
def parse_toc(doc : PDFDocument) -> list:
raw_toc = []
try:
outlines = doc.get_outlines()
for (level, title, dest, a, se) in outlines:
raw_toc.append((level, title))
except PDFNoOutlines:
pass
return PDFExtractionStrategy.build_toc_tree(raw_toc)
@staticmethod
def build_toc_tree(items : list) -> list:
"""
Builds the TOC tree from a list of TOC items.
@type items: list
@param items: The TOC items.
Each item must have the following format: (<item depth>, <item description>).
E.g: [(1, 'Contents'), (2, 'Chapter 1'), (2, 'Chapter 2')]
@rtype: list
@returns: The TOC tree. The tree hasn't a root element, therefore it
actually is a list.
"""
toc = []
if items is None or len(items) == 0:
return toc
current_toc_level = toc
# Using an explicit stack containing the lists corresponding to
# the various levels of the TOC, to simulate the recursive building
# of the TOC tree in a more efficient way
toc_levels_stack = []
toc_levels_stack.append(current_toc_level)
# Each TOC item can be inserted into the current TOC level as
# string (just the item description) or as dict, where the key is
# the item description and the value is a list containing the
# children TOC items.
# To correctly determine how to insert the current item into
# the current level, a kind of look-ahead is needed, that is
# the depth of the next item has to be considered.
# Initializing the variables related to the previous item.
prev_item_depth, prev_item_desc = items[0]
# Adding a fake final item in order to handle all the TOC items
# inside the cycle.
items.append((-1, ''))
for i in range(1, len(items)):
# In fact each iteration handles the item of the previous
# one, using the current item to determine how to insert
# the previous item into the current TOC level,
# as explained before.
curr_item = items[i]
curr_item_depth = curr_item[0]
if curr_item_depth == prev_item_depth:
# The depth of the current item is the same
# as the previous one.
# Inserting the previous item into the current TOC level
# as string.
current_toc_level.append(prev_item_desc)
elif curr_item_depth == prev_item_depth + 1:
# The depth of the current item is increased by 1 compared to
# the previous one.
# Inserting the previous item into the current TOC level
# as dict.
prev_item_dict = { prev_item_desc : [] }
current_toc_level.append(prev_item_dict)
# Updating the current TOC level with the newly created one
# which contains the children of the previous item.
current_toc_level = prev_item_dict[prev_item_desc]
toc_levels_stack.append(current_toc_level)
elif curr_item_depth < prev_item_depth:
# The depth of the current item is lesser than
# the previous one.
# Inserting the previous item into the current TOC level
# as string.
current_toc_level.append(prev_item_desc)
if i < len(items)-1:
# Executing these steps for all the items except the last one
depth_diff = prev_item_depth - curr_item_depth
# Removing from the stack as many TOC levels as the difference
# between the depth of the previous item and the depth of the
# current one.
for i in range(0, depth_diff):
toc_levels_stack.pop()
# Updating the current TOC level with the one contained in
# the head of the stack.
current_toc_level = toc_levels_stack[-1]
# Updating the previous item with the current one
prev_item_depth, prev_item_desc = curr_item
return toc
@staticmethod
def from_bytestring(s) -> str:
"""
If the input string is a byte-string, converts it to a string using
UTF-8 as encoding.
@param s: A string or a byte-string.
@rtype: str
@returns: The potentially converted string.
"""
if s:
if isinstance(s, str):
return s
else:
return s.encode('utf-8')
@staticmethod
def parse_layout_nodes(container : LTContainer) -> str:
"""
Recursively extracts the text from all the nodes contained in the
input PDF layout tree/sub-tree.
@type container: LTContainer
@param container: The PDF layout tree/sub-tree from which to extract the text.
@rtype: str
@returns: A string containing the extracted text.
"""
text_content = []
# The iterator returns the children nodes.
for node in container:
if isinstance(node, LTTextContainer):
# Only nodes of type LTTextContainer contain text.
text_content.append(PDFExtractionStrategy.from_bytestring(node.get_text()))
elif isinstance(node, LTContainer):
# Recursively calling the method on the current node, which is a container itself.
text_content.append(PDFExtractionStrategy.parse_layout_nodes(node))
else:
# Ignoring all the other node types.
pass
# Joining all the extracted text chunks with a new line character.
return "\n".join(text_content)
@staticmethod
def parse_pages(doc : PDFDocument) -> list:
rsrcmgr = PDFResourceManager()
laparams = LAParams()
device = PDFPageAggregator(rsrcmgr, laparams=laparams)
interpreter = PDFPageInterpreter(rsrcmgr, device)
text_content = []
for i, page in enumerate(PDFPage.create_pages(doc)):
interpreter.process_page(page)
layout = device.get_result()
# Extracts the text from all the nodes of the PDF layout tree of each page
text_content.append(PDFExtractionStrategy.parse_layout_nodes(layout))
return text_content
@staticmethod
def parse_pdf(file_path : str) -> (list, list):
toc = []
pages = []
try:
fp = open(file_path, 'rb')
parser = PDFParser(fp)
doc = PDFDocument(parser)
parser.set_document(doc)
if doc.is_extractable:
toc = PDFExtractionStrategy.parse_toc(doc)
pages = PDFExtractionStrategy.parse_pages(doc)
fp.close()
except IOError:
pass
return (toc, pages)
@staticmethod
def extract_document_data(file_path : str) -> DocumentData:
toc, pages = PDFExtractionStrategy.parse_pdf(file_path)
return DocumentData(toc, pages = pages)
class DocumentDataExtractor:
"""
Main class of the module.
It's responsible for actually executing the text extraction.
The output is constituted by the following items:
-table of contents (TOC);
-pages contents.
"""
# Dictionary containing the extraction strategies for the different
# document types, indexed by the corresponding DocumentType enum values.
strategies = {
DocumentType.DOCX : DOCXExtractionStrategy(),
DocumentType.PDF : PDFExtractionStrategy()
}
@classmethod
def extract_document_data(cls, file_path : str) -> DocumentData:
"""
Entry point of the module, it extracts the data from the document
whose path is passed as input.
The extraction strategy is automatically chosen based on the MIME type
of the file.
@type file_path: str
@param file_path: The path of the document to be parsed.
@rtype: DocumentData
@returns: An object containing the data of the parsed document.
"""
mime = magic.Magic(mime=True)
mime_type = mime.from_file(file_path)
document_type = DocumentType.get_instance(mime_type)
strategy = cls.strategies[document_type]
return strategy.extract_document_data(file_path)
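As the linked duplicates explain, -> DocumentData is not a lambda; it is a return type annotation (function annotations, see PEP 3107 and PEP 484). It documents that extract_document_data is expected to return a DocumentData instance and is ignored at runtime. A minimal sketch with made-up names:
def double(n: int) -> int:      # "-> int" annotates the return type, nothing more
    return 2 * n

print(double(21))               # 42; the annotation is not enforced at runtime
print(double.__annotations__)   # {'n': <class 'int'>, 'return': <class 'int'>}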

How to Parse YAML Using PyYAML if there are '!' within the YAML

I have a YAML file from which I'd like to parse only the description variable; however, I know that the exclamation points in my CloudFormation template (YAML file) are giving PyYAML trouble.
I am receiving the following error:
yaml.constructor.ConstructorError: could not determine a constructor for the tag '!Equals'
The file has many !Ref and !Equals tags. How can I ignore these constructors and get the specific variable I'm looking for -- in this case, the description variable?
If you have to deal with a YAML document with multiple different tags, and
are only interested in a subset of them, you should still
handle them all. If the elements you are interested in are nested
within other tagged constructs, you at least need to handle all of the "enclosing" tags
properly.
There is however no need to handle all of the tags individually; you
can write a constructor routine that can handle mappings, sequences
and scalars, and register that with PyYAML's SafeLoader using:
import yaml

inp = """\
MyEIP:
  Type: !Join [ "::", [AWS, EC2, EIP] ]
  Properties:
    InstanceId: !Ref MyEC2Instance
"""

description = []

def any_constructor(loader, tag_suffix, node):
    if isinstance(node, yaml.MappingNode):
        return loader.construct_mapping(node)
    if isinstance(node, yaml.SequenceNode):
        return loader.construct_sequence(node)
    return loader.construct_scalar(node)

yaml.add_multi_constructor('', any_constructor, Loader=yaml.SafeLoader)

data = yaml.safe_load(inp)
print(data)
which gives:
{'MyEIP': {'Type': ['::', ['AWS', 'EC2', 'EIP']], 'Properties': {'InstanceId': 'MyEC2Instance'}}}
(inp can also be a file opened for reading).
As you can see, the above will also continue to work if an unexpected !Join tag shows up in your code,
as well as any other tag like !Equals. The tags are just dropped.
Since there are no variables in YAML, it is a bit of guesswork what
you mean by "like to parse the description variable only". If that has
an explicit tag (e.g. !Description), you can filter out the values by adding 2-3 lines
to the any_constructor, by matching the tag_suffix parameter.
    if tag_suffix == u'!Description':
        description.append(loader.construct_scalar(node))
It is however more likely that there is some key in a mapping that is a scalar description,
and that you are interested in the value associated with that key.
    if isinstance(node, yaml.MappingNode):
        d = loader.construct_mapping(node)
        for k in d:
            if k == 'description':
                description.append(d[k])
        return d
If you know the exact position in the data hierarchy, you can of
course also walk the data structure and extract anything you need
based on keys or list positions. Especially in that case you'd be better off
using my ruamel.yaml, as this can load tagged YAML in round-trip mode without
extra effort (assuming the above inp):
from ruamel.yaml import YAML
with YAML() as yaml:
    data = yaml.load(inp)
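A sketch of such a walk over the round-trip loaded data, collecting the values of a hypothetical Description key (the key name is an assumption; the small inp document above doesn't contain one):
def collect_key(d, wanted='Description', found=None):
    # recursively gather the values stored under a given mapping key
    if found is None:
        found = []
    if isinstance(d, dict):
        for k, v in d.items():
            if k == wanted:
                found.append(v)
            collect_key(v, wanted, found)
    elif isinstance(d, list):
        for item in d:
            collect_key(item, wanted, found)
    return found

print(collect_key(data))   # [] for the inp above, as it has no Description key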
You can define custom constructors using a custom yaml.SafeLoader:
import yaml

doc = '''
Conditions:
  CreateNewSecurityGroup: !Equals [!Ref ExistingSecurityGroup, NONE]
'''

class Equals(object):
    def __init__(self, data):
        self.data = data

    def __repr__(self):
        return "Equals(%s)" % self.data

class Ref(object):
    def __init__(self, data):
        self.data = data

    def __repr__(self):
        return "Ref(%s)" % self.data

def create_equals(loader, node):
    value = loader.construct_sequence(node)
    return Equals(value)

def create_ref(loader, node):
    value = loader.construct_scalar(node)
    return Ref(value)

class Loader(yaml.SafeLoader):
    pass

yaml.add_constructor(u'!Equals', create_equals, Loader)
yaml.add_constructor(u'!Ref', create_ref, Loader)

a = yaml.load(doc, Loader)
print(a)
Outputs:
{'Conditions': {'CreateNewSecurityGroup': Equals([Ref(ExistingSecurityGroup), 'NONE'])}}
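If, as is usual for CloudFormation templates, the description lives under a top-level Description key (an assumption about your file; it is not in the small doc above), you can load the full template with this Loader and read it directly, provided the template only uses the tags registered above:
with open('template.yaml') as fp:            # hypothetical path to your CloudFormation template
    template = yaml.load(fp, Loader)
print(template.get('Description'))           # None if the template has no Description key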

XML generation Python reading from DB

I have to generate an XML document that takes a param value from the model.
The resulting XML should look like this: <read><test><value string/></test></read>
root = etree.Element('read')
node = etree.Element('test')
value = etree.SubElement(node, "value")
for dict in Listing.objects.values('string'):
    value.text = dict['string']
I get this: <read><test><value>string</value></test></read>
How should I add the string to the value tag rather than as TEXT?
Firstly, as @parfait mentioned, the XML result you want (<read><test><value string/></test></read>) is not valid XML.
Also, using the code you provided, the node test doesn't get added to the root node read.
1. Code to get your current result:
If you want something like this as the result: <read><test><value>string</value></test></read> (which you said you don't), then the code for it would be:
>>> root = etree.Element('read')
>>> node = etree.SubElement(root, 'test') # `node` should be a `SubElement` of root
>>> # or create as `Element` and use `root.append(node)` to make it a `SubElement` later
... value = etree.SubElement(node, 'value')
>>> value.text = 'string' # or populate based on your `Listing.objects`
>>> etree.dump(root)
<read><test><value>string</value></test></read>
2. "string" as a value (text) of "test":
If you want 'string' to be the value of test and not as a node 'value' under 'test', then you should set 'string' as the .text attribute of 'test':
>>> root = etree.Element('read')
>>> node = etree.SubElement(root, 'test')
>>> node.text = 'string'
>>> etree.dump(root)
<read><test>string</test></read>
3. "value" as an attribute of "test" with the value "string":
The one I think you're trying to get:
>>> root = etree.Element('read')
>>> node = etree.SubElement(root, 'test')
>>> node.attrib # the current attributes of the node, nothing, empty dict
{}
>>> node.attrib['value'] = 'string' # this is how you set an attribute
>>> etree.dump(root)
<read><test value="string" /></read>
Btw, in terms of XML good-ness, the 2nd option is nicer than the 3rd; but both are valid XML.

Convert XML file to dict in Python [duplicate]

I have a program that reads an XML document from a socket. I have the XML document stored in a string which I would like to convert directly to a Python dictionary, the same way it is done in Django's simplejson library.
Take as an example:
str ="<?xml version="1.0" ?><person><name>john</name><age>20</age></person"
dic_xml = convert_to_dic(str)
Then dic_xml would look like {'person' : { 'name' : 'john', 'age' : 20 } }
xmltodict (full disclosure: I wrote it) does exactly that:
xmltodict.parse("""
<?xml version="1.0" ?>
<person>
<name>john</name>
<age>20</age>
</person>""")
# {u'person': {u'age': u'20', u'name': u'john'}}
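xmltodict also works in the other direction; its unparse function turns such a dictionary back into XML (a short sketch):
import xmltodict

doc = {'person': {'name': 'john', 'age': '20'}}
print(xmltodict.unparse(doc, pretty=True))
# prints an XML document equivalent to <person><name>john</name><age>20</age></person>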
This is a great module that someone created. I've used it several times.
http://code.activestate.com/recipes/410469-xml-as-dictionary/
Here is the code from the website just in case the link goes bad.
from xml.etree import cElementTree as ElementTree
class XmlListConfig(list):
def __init__(self, aList):
for element in aList:
if element:
# treat like dict
if len(element) == 1 or element[0].tag != element[1].tag:
self.append(XmlDictConfig(element))
# treat like list
elif element[0].tag == element[1].tag:
self.append(XmlListConfig(element))
elif element.text:
text = element.text.strip()
if text:
self.append(text)
class XmlDictConfig(dict):
'''
Example usage:
>>> tree = ElementTree.parse('your_file.xml')
>>> root = tree.getroot()
>>> xmldict = XmlDictConfig(root)
Or, if you want to use an XML string:
>>> root = ElementTree.XML(xml_string)
>>> xmldict = XmlDictConfig(root)
And then use xmldict for what it is... a dict.
'''
def __init__(self, parent_element):
if parent_element.items():
self.update(dict(parent_element.items()))
for element in parent_element:
if element:
# treat like dict - we assume that if the first two tags
# in a series are different, then they are all different.
if len(element) == 1 or element[0].tag != element[1].tag:
aDict = XmlDictConfig(element)
# treat like list - we assume that if the first two tags
# in a series are the same, then the rest are the same.
else:
# here, we put the list in dictionary; the key is the
# tag name the list elements all share in common, and
# the value is the list itself
aDict = {element[0].tag: XmlListConfig(element)}
# if the tag has attributes, add those to the dict
if element.items():
aDict.update(dict(element.items()))
self.update({element.tag: aDict})
# this assumes that if you've got an attribute in a tag,
# you won't be having any text. This may or may not be a
# good idea -- time will tell. It works for the way we are
# currently doing XML configuration files...
elif element.items():
self.update({element.tag: dict(element.items())})
# finally, if there are no child tags and no attributes, extract
# the text
else:
self.update({element.tag: element.text})
Example usage:
tree = ElementTree.parse('your_file.xml')
root = tree.getroot()
xmldict = XmlDictConfig(root)
# Or, if you want to use an XML string:
root = ElementTree.XML(xml_string)
xmldict = XmlDictConfig(root)
The following XML-to-Python-dict snippet parses entities as well as attributes following this XML-to-JSON "specification". It is the most general solution handling all cases of XML.
from collections import defaultdict

def etree_to_dict(t):
    d = {t.tag: {} if t.attrib else None}
    children = list(t)
    if children:
        dd = defaultdict(list)
        for dc in map(etree_to_dict, children):
            for k, v in dc.items():
                dd[k].append(v)
        d = {t.tag: {k: v[0] if len(v) == 1 else v for k, v in dd.items()}}
    if t.attrib:
        d[t.tag].update(('@' + k, v) for k, v in t.attrib.items())
    if t.text:
        text = t.text.strip()
        if children or t.attrib:
            if text:
                d[t.tag]['#text'] = text
        else:
            d[t.tag] = text
    return d
It is used:
from xml.etree import cElementTree as ET
e = ET.XML('''
<root>
<e />
<e>text</e>
<e name="value" />
<e name="value">text</e>
<e> <a>text</a> <b>text</b> </e>
<e> <a>text</a> <a>text</a> </e>
<e> text <a>text</a> </e>
</root>
''')
from pprint import pprint
pprint(etree_to_dict(e))
The output of this example (as per above-linked "specification") should be:
{'root': {'e': [None,
                'text',
                {'@name': 'value'},
                {'#text': 'text', '@name': 'value'},
                {'a': 'text', 'b': 'text'},
                {'a': ['text', 'text']},
                {'#text': 'text', 'a': 'text'}]}}
Not necessarily pretty, but it is unambiguous, and simpler XML inputs result in simpler JSON. :)
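And since the result follows that XML-to-JSON convention, turning it into actual JSON is a one-liner (using the element e parsed above):
import json

print(json.dumps(etree_to_dict(e), indent=2))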
Update
If you want to do the reverse, emit an XML string from a JSON/dict, you can use:
try:
    basestring
except NameError:  # python3
    basestring = str

def dict_to_etree(d):
    def _to_etree(d, root):
        if not d:
            pass
        elif isinstance(d, basestring):
            root.text = d
        elif isinstance(d, dict):
            for k, v in d.items():
                assert isinstance(k, basestring)
                if k.startswith('#'):
                    assert k == '#text' and isinstance(v, basestring)
                    root.text = v
                elif k.startswith('@'):
                    assert isinstance(v, basestring)
                    root.set(k[1:], v)
                elif isinstance(v, list):
                    for e in v:
                        _to_etree(e, ET.SubElement(root, k))
                else:
                    _to_etree(v, ET.SubElement(root, k))
        else:
            raise TypeError('invalid type: ' + str(type(d)))
    assert isinstance(d, dict) and len(d) == 1
    tag, body = next(iter(d.items()))
    node = ET.Element(tag)
    _to_etree(body, node)
    return ET.tostring(node)
pprint(dict_to_etree(d))
This lightweight version, while not configurable, is pretty easy to tailor as needed, and works in old pythons. Also it is rigid - meaning the results are the same regardless of the existence of attributes.
import xml.etree.ElementTree as ET
from copy import copy

def dictify(r, root=True):
    if root:
        return {r.tag: dictify(r, False)}
    d = copy(r.attrib)
    if r.text:
        d["_text"] = r.text
    for x in r.findall("./*"):
        if x.tag not in d:
            d[x.tag] = []
        d[x.tag].append(dictify(x, False))
    return d
So:
root = ET.fromstring("<erik><a x='1'>v</a><a y='2'>w</a></erik>")
dictify(root)
Results in:
{'erik': {'a': [{'x': '1', '_text': 'v'}, {'y': '2', '_text': 'w'}]}}
Disclaimer:
This modified XML parser was inspired by Adam Clark
The original XML parser works for most simple cases. However, it didn't work for some complicated XML files. I debugged the code line by line and finally fixed some issues. If you find any bugs, please let me know; I am glad to fix them.
class XmlDictConfig(dict):
'''
Note: you need to add a root node if none exists
Example usage:
>>> tree = ElementTree.parse('your_file.xml')
>>> root = tree.getroot()
>>> xmldict = XmlDictConfig(root)
Or, if you want to use an XML string:
>>> root = ElementTree.XML(xml_string)
>>> xmldict = XmlDictConfig(root)
And then use xmldict for what it is... a dict.
'''
def __init__(self, parent_element):
if parent_element.items():
self.updateShim( dict(parent_element.items()) )
for element in parent_element:
if len(element):
aDict = XmlDictConfig(element)
# if element.items():
# aDict.updateShim(dict(element.items()))
self.updateShim({element.tag: aDict})
elif element.items(): # items() is specialy for attribtes
elementattrib= element.items()
if element.text:
elementattrib.append((element.tag,element.text )) # add tag:text if there exist
self.updateShim({element.tag: dict(elementattrib)})
else:
self.updateShim({element.tag: element.text})
def updateShim (self, aDict ):
for key in aDict.keys(): # keys() includes tag and attributes
if key in self:
value = self.pop(key)
if type(value) is not list:
listOfDicts = []
listOfDicts.append(value)
listOfDicts.append(aDict[key])
self.update({key: listOfDicts})
else:
value.append(aDict[key])
self.update({key: value})
else:
self.update({key:aDict[key]}) # it was self.update(aDict)
The most recent versions of the PicklingTools libraries (1.3.0 and 1.3.1) support tools for converting from XML to a Python dict.
The download is available here: PicklingTools 1.3.1
There is quite a bit of documentation for the converters here: the documentation describes in detail all of the decisions and issues that will arise when converting between XML and Python dictionaries (there are a number of edge cases: attributes, lists, anonymous lists, anonymous dicts, eval, etc. that most converters don't handle). In general, though,
the converters are easy to use. If an 'example.xml' contains:
<top>
<a>1</a>
<b>2.2</b>
<c>three</c>
</top>
Then to convert it to a dictionary:
>>> from xmlloader import *
>>> example = file('example.xml', 'r') # A document containing XML
>>> xl = StreamXMLLoader(example, 0) # 0 = all defaults on operation
>>> result = xl.expectXML()
>>> print result
{'top': {'a': '1', 'c': 'three', 'b': '2.2'}}
There are tools for converting in both C++ and Python: the C++ and Python versions do identical conversions, but the C++ is about 60x faster.
You can do this quite easily with lxml. First install it:
[sudo] pip install lxml
Here is a recursive function I wrote that does the heavy lifting for you:
from lxml import objectify as xml_objectify

def xml_to_dict(xml_str):
    """ Convert xml to dict, using lxml v3.4.2 xml processing library """
    def xml_to_dict_recursion(xml_object):
        dict_object = xml_object.__dict__
        if not dict_object:
            return xml_object
        for key, value in dict_object.items():
            dict_object[key] = xml_to_dict_recursion(value)
        return dict_object
    return xml_to_dict_recursion(xml_objectify.fromstring(xml_str))

xml_string = """<?xml version="1.0" encoding="UTF-8"?><Response><NewOrderResp>
<IndustryType>Test</IndustryType><SomeData><SomeNestedData1>1234</SomeNestedData1>
<SomeNestedData2>3455</SomeNestedData2></SomeData></NewOrderResp></Response>"""

print xml_to_dict(xml_string)
The below variant preserves the parent key / element:
def xml_to_dict(xml_str):
    """ Convert xml to dict, using lxml v3.4.2 xml processing library, see http://lxml.de/ """
    def xml_to_dict_recursion(xml_object):
        dict_object = xml_object.__dict__
        if not dict_object:  # if empty dict returned
            return xml_object
        for key, value in dict_object.items():
            dict_object[key] = xml_to_dict_recursion(value)
        return dict_object
    xml_obj = xml_objectify.fromstring(xml_str)  # the module was imported as xml_objectify above
    return {xml_obj.tag: xml_to_dict_recursion(xml_obj)}
If you want to only return a subtree and convert it to dict, you can use Element.find() to get the subtree and then convert it:
xml_obj.find('.//') # lxml.objectify.ObjectifiedElement instance
See the lxml docs here. I hope this helps!
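For instance, to convert only a subtree (a sketch built on the xml_string and xml_to_dict defined above; SomeData is one of its elements):
from lxml import etree, objectify

root = objectify.fromstring(xml_string.encode())   # encode: lxml rejects str input with an encoding declaration
subtree = root.find('.//SomeData')                 # lxml.objectify.ObjectifiedElement
print(xml_to_dict(etree.tostring(subtree)))        # convert just that subtree
print(subtree.SomeNestedData1.text)                # objectify also allows attribute-style access: '1234'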
I wrote a simple recursive function to do the job:
from xml.etree import ElementTree

root = ElementTree.XML(xml_to_convert)

def xml_to_dict_recursive(root):
    if len(root.getchildren()) == 0:
        return {root.tag: root.text}
    else:
        return {root.tag: list(map(xml_to_dict_recursive, root.getchildren()))}
def xml_to_dict(node):
    u'''
    @param node: lxml node
    @return: dict
    '''
    return {'tag': node.tag, 'text': node.text, 'attrib': node.attrib,
            'children': {child.tag: xml_to_dict(child) for child in node}}
The code from http://code.activestate.com/recipes/410469-xml-as-dictionary/ works well, but if there are multiple elements with the same tag at a given place in the hierarchy it just overwrites them.
I added a shim that checks whether the element already exists before self.update(). If so, it pops the existing entry and creates a list out of the existing and the new one. Any subsequent duplicates are added to the list.
Not sure if this can be handled more gracefully, but it works:
import xml.etree.ElementTree as ElementTree
class XmlDictConfig(dict):
def __init__(self, parent_element):
if parent_element.items():
self.updateShim(dict(parent_element.items()))
for element in parent_element:
if len(element):
aDict = XmlDictConfig(element)
if element.items():
aDict.updateShim(dict(element.items()))
self.updateShim({element.tag: aDict})
elif element.items():
self.updateShim({element.tag: dict(element.items())})
else:
self.updateShim({element.tag: element.text.strip()})
def updateShim (self, aDict ):
for key in aDict.keys():
if key in self:
value = self.pop(key)
if type(value) is not list:
listOfDicts = []
listOfDicts.append(value)
listOfDicts.append(aDict[key])
self.update({key: listOfDicts})
else:
value.append(aDict[key])
self.update({key: value})
else:
self.update(aDict)
@dibrovsd: The solution will not work if the XML has more than one tag with the same name.
On your line of thought, I have modified the code a bit and written it for a general node instead of the root:
from collections import defaultdict
def xml2dict(node):
d, count = defaultdict(list), 1
for i in node:
d[i.tag + "_" + str(count)]['text'] = i.findtext('.')[0]
d[i.tag + "_" + str(count)]['attrib'] = i.attrib # attrib gives the list
d[i.tag + "_" + str(count)]['children'] = xml2dict(i) # it gives dict
return d
From @K3---rnc's response (the best for me) I've added some small modifications to get an OrderedDict from an XML text (sometimes order matters):
def etree_to_ordereddict(t):
d = OrderedDict()
d[t.tag] = OrderedDict() if t.attrib else None
children = list(t)
if children:
dd = OrderedDict()
for dc in map(etree_to_ordereddict, children):
for k, v in dc.iteritems():
if k not in dd:
dd[k] = list()
dd[k].append(v)
d = OrderedDict()
d[t.tag] = OrderedDict()
for k, v in dd.iteritems():
if len(v) == 1:
d[t.tag][k] = v[0]
else:
d[t.tag][k] = v
if t.attrib:
d[t.tag].update(('@' + k, v) for k, v in t.attrib.iteritems())
if t.text:
text = t.text.strip()
if children or t.attrib:
if text:
d[t.tag]['#text'] = text
else:
d[t.tag] = text
return d
Following @K3---rnc's example, you can use it:
from xml.etree import cElementTree as ET
e = ET.XML('''
<root>
<e />
<e>text</e>
<e name="value" />
<e name="value">text</e>
<e> <a>text</a> <b>text</b> </e>
<e> <a>text</a> <a>text</a> </e>
<e> text <a>text</a> </e>
</root>
''')
from pprint import pprint
pprint(etree_to_ordereddict(e))
Hope it helps ;)
An alternative (builds lists for repeated tags in the hierarchy):
from xml.etree import cElementTree as ElementTree

def xml_to_dict(xml, result):
    for child in xml:
        if len(child) == 0:
            result[child.tag] = child.text
        else:
            if child.tag in result:
                if not isinstance(result[child.tag], list):
                    result[child.tag] = [result[child.tag]]
                result[child.tag].append(xml_to_dict(child, {}))
            else:
                result[child.tag] = xml_to_dict(child, {})
    return result

xmlTree = ElementTree.parse('my_file.xml')
xmlRoot = xmlTree.getroot()
dictRoot = xml_to_dict(xmlRoot, {})
result = {xmlRoot.tag: dictRoot}
Here's a link to an ActiveState solution - and the code in case it disappears again.
==================================================
xmlreader.py:
==================================================
from xml.dom.minidom import parse
class NotTextNodeError:
pass
def getTextFromNode(node):
"""
scans through all children of node and gathers the
text. if node has non-text child-nodes, then
NotTextNodeError is raised.
"""
t = ""
for n in node.childNodes:
if n.nodeType == n.TEXT_NODE:
t += n.nodeValue
else:
raise NotTextNodeError
return t
def nodeToDic(node):
"""
nodeToDic() scans through the children of node and makes a
dictionary from the content.
three cases are differentiated:
- if the node contains no other nodes, it is a text-node
and {nodeName:text} is merged into the dictionary.
- if the node has the attribute "method" set to "true",
then it's children will be appended to a list and this
list is merged to the dictionary in the form: {nodeName:list}.
- else, nodeToDic() will call itself recursively on
the nodes children (merging {nodeName:nodeToDic()} to
the dictionary).
"""
dic = {}
for n in node.childNodes:
if n.nodeType != n.ELEMENT_NODE:
continue
if n.getAttribute("multiple") == "true":
# node with multiple children:
# put them in a list
l = []
for c in n.childNodes:
if c.nodeType != n.ELEMENT_NODE:
continue
l.append(nodeToDic(c))
dic.update({n.nodeName:l})
continue
try:
text = getTextFromNode(n)
except NotTextNodeError:
# 'normal' node
dic.update({n.nodeName:nodeToDic(n)})
continue
# text node
dic.update({n.nodeName:text})
continue
return dic
def readConfig(filename):
dom = parse(filename)
return nodeToDic(dom)
def test():
dic = readConfig("sample.xml")
print dic["Config"]["Name"]
print
for item in dic["Config"]["Items"]:
print "Item's Name:", item["Name"]
print "Item's Value:", item["Value"]
test()
==================================================
sample.xml:
==================================================
<?xml version="1.0" encoding="UTF-8"?>
<Config>
<Name>My Config File</Name>
<Items multiple="true">
<Item>
<Name>First Item</Name>
<Value>Value 1</Value>
</Item>
<Item>
<Name>Second Item</Name>
<Value>Value 2</Value>
</Item>
</Items>
</Config>
==================================================
output:
==================================================
My Config File
Item's Name: First Item
Item's Value: Value 1
Item's Name: Second Item
Item's Value: Value 2
At one point I had to parse and write XML that consisted only of elements without attributes, so a 1:1 mapping from XML to dict was easily possible. This is what I came up with, in case someone else also doesn't need attributes:
def xmltodict(element):
if not isinstance(element, ElementTree.Element):
raise ValueError("must pass xml.etree.ElementTree.Element object")
def xmltodict_handler(parent_element):
result = dict()
for element in parent_element:
if len(element):
obj = xmltodict_handler(element)
else:
obj = element.text
if result.get(element.tag):
if hasattr(result[element.tag], "append"):
result[element.tag].append(obj)
else:
result[element.tag] = [result[element.tag], obj]
else:
result[element.tag] = obj
return result
return {element.tag: xmltodict_handler(element)}
def dicttoxml(element):
if not isinstance(element, dict):
raise ValueError("must pass dict type")
if len(element) != 1:
raise ValueError("dict must have exactly one root key")
def dicttoxml_handler(result, key, value):
if isinstance(value, list):
for e in value:
dicttoxml_handler(result, key, e)
elif isinstance(value, basestring):
elem = ElementTree.Element(key)
elem.text = value
result.append(elem)
elif isinstance(value, int) or isinstance(value, float):
elem = ElementTree.Element(key)
elem.text = str(value)
result.append(elem)
elif value is None:
result.append(ElementTree.Element(key))
else:
res = ElementTree.Element(key)
for k, v in value.items():
dicttoxml_handler(res, k, v)
result.append(res)
result = ElementTree.Element(element.keys()[0])
for key, value in element[element.keys()[0]].items():
dicttoxml_handler(result, key, value)
return result
def xmlfiletodict(filename):
return xmltodict(ElementTree.parse(filename).getroot())
def dicttoxmlfile(element, filename):
ElementTree.ElementTree(dicttoxml(element)).write(filename)
def xmlstringtodict(xmlstring):
return xmltodict(ElementTree.fromstring(xmlstring).getroot())
def dicttoxmlstring(element):
return ElementTree.tostring(dicttoxml(element))
I have modified one of the answers to my taste and to work with multiple values with the same tag. For example, consider the following XML code saved in the file XML.xml:
<A>
<B>
<BB>inAB</BB>
<C>
<D>
<E>
inABCDE
</E>
<E>value2</E>
<E>value3</E>
</D>
<inCout-ofD>123</inCout-ofD>
</C>
</B>
<B>abc</B>
<F>F</F>
</A>
and in python
import xml.etree.ElementTree as ET
class XMLToDictionary(dict):
def __init__(self, parentElement):
self.parentElement = parentElement
for child in list(parentElement):
child.text = child.text if (child.text != None) else ' '
if len(child) == 0:
self.update(self._addToDict(key= child.tag, value = child.text.strip(), dict = self))
else:
innerChild = XMLToDictionary(parentElement=child)
self.update(self._addToDict(key=innerChild.parentElement.tag, value=innerChild, dict=self))
def getDict(self):
return {self.parentElement.tag: self}
class _addToDict(dict):
def __init__(self, key, value, dict):
if not key in dict:
self.update({key: value})
else:
identical = dict[key] if type(dict[key]) == list else [dict[key]]
self.update({key: identical + [value]})
tree = ET.parse('./XML.xml')
root = tree.getroot()
parseredDict = XMLToDictionary(root).getDict()
print(parseredDict)
the output is
{'A': {'B': [{'BB': 'inAB', 'C': {'D': {'E': ['inABCDE', 'value2', 'value3']}, 'inCout-ofD': '123'}}, 'abc'], 'F': 'F'}}
Updated method posted by firelion.cis (since getchildren is deprecated):
from xml.etree import ElementTree

root = ElementTree.XML(xml_to_convert)

def xml_to_dict_recursive(root):
    if len(list(root)) == 0:
        return {root.tag: root.text}
    else:
        return {root.tag: list(map(xml_to_dict_recursive, list(root)))}
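For example, a quick sketch of what the updated function returns for a small document:
from xml.etree import ElementTree

xml_to_convert = '<person><name>john</name><age>20</age></person>'
root = ElementTree.XML(xml_to_convert)
print(xml_to_dict_recursive(root))
# {'person': [{'name': 'john'}, {'age': '20'}]}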
import xml.etree.ElementTree as ET

root = ET.parse(xml_filepath).getroot()

def parse_xml(node):
    ans = {}
    for child in node:
        if len(child) == 0:
            ans[child.tag] = child.text
        elif child.tag not in ans:
            ans[child.tag] = parse_xml(child)
        elif not isinstance(ans[child.tag], list):
            ans[child.tag] = [ans[child.tag]]
            ans[child.tag].append(parse_xml(child))
        else:
            ans[child.tag].append(parse_xml(child))
    return ans
It merges repeated fields into a list and squeezes fields containing a single child.
Super simple code
# Follow this, it's easy and nothing extra is required: convert the XML into a string and use the find method to locate the word you are looking for, as follows.
# Hope this is easy and simple.
def xml_key(key, text1):
    tx1 = "<" + key + ">"
    tx2 = "</" + key + ">"
    tx = text1.find(tx1)
    ty = text1.find(tx2)
    tx = tx + len(tx1)
    tw = text1[tx:ty]
    return(tw)

text1 = "<person><name>john</name><age>20</age></person>"
dict1 = {"name": xml_key("name", text1), "age": xml_key("age", text1)}
print(dict1)
output:
{'name': 'john', 'age': '20'}
I have a recursive method to get a dictionary from an lxml element:
def recursive_dict(element):
    return (element.tag.split('}')[1],
            dict(map(recursive_dict, element.getchildren()),
                 **element.attrib))
