Is there a way to skip nodes/elements with iterparse lxml? - python

Is there a way using lxml iterparse to skip an element without checking the tag? Take this xml for example:
<root>
<sample>
<tag1>text1</tag1>
<tag2>text2</tag2>
<tag3>text3</tag3>
<tag4>text4</tag4>
</sample>
<sample>
<tag1>text1</tag1>
<tag2>text2</tag2>
<tag3>text3</tag3>
<tag4>text4</tag4>
</sample>
</root>
If I care about tag1 and tag4, checking tag2 and tag3 will eat up some time. If the file isn't big, it doesn't really matter, but if I have a million <sample> nodes, I could reduce search time if I don't have to check tag2 and tag3. They're always there and I never need them.
Here's what I'm using with iterparse in lxml:
from lxml import etree

xmlfile = 'myfile.xml'
my_list = []

context = etree.iterparse(xmlfile, events=('end',), tag='sample')
for event, elem in context:
    for child in elem:
        if child.tag == 'tag1':
            my_list.append(child.text)
            # HERE I'd like to advance the loop twice without checking tag2 and tag3 at all
            # something like:
            # next(child)
            # next(child)
        elif child.tag == 'tag4':
            my_list.append(child.text)

If you use the tag arg in iterchildren like you do in iterparse, you can "skip" elements other than tag1 and tag4.
Example...
from lxml import etree

xmlfile = "myfile.xml"
my_list = []

for event, elem in etree.iterparse(xmlfile, tag="sample"):
    for child in elem.iterchildren(tag=["tag1", "tag4"]):
        if child.tag == "tag1":
            my_list.append(child.text)
        elif child.tag == "tag4":
            my_list.append(child.text)

print(my_list)
Printed output...
['text1', 'text4', 'text1', 'text4']
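Since both branches in that loop do the same thing, and memory matters once you have a million <sample> nodes, here is a slightly tighter variation of the same approach (a sketch, assuming a reasonably recent lxml; the elem.clear() / sibling-deletion idiom is the usual iterparse cleanup recipe):
from lxml import etree

xmlfile = "myfile.xml"
my_list = []

for event, elem in etree.iterparse(xmlfile, tag="sample"):
    # tag1 and tag4 both just contribute their text, so one append covers both.
    for child in elem.iterchildren(tag=["tag1", "tag4"]):
        my_list.append(child.text)
    # Free the processed <sample> and any already-handled siblings
    # so memory stays flat over millions of nodes.
    elem.clear()
    while elem.getprevious() is not None:
        del elem.getparent()[0]

print(my_list)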

Related

Add an element before text with lxml

I have some XML where I want to insert a new element before the text.
I tried:
from lxml import etree

xml = "<root><foo>some text</foo></root>"
root = etree.fromstring(xml)
foo = root.find("foo")
foo.insert(0, etree.Element("bar"))
etree.tostring(foo)
and the result was
<foo>some text<bar/></foo>
when I was hoping for
<foo><bar/>some text</foo>
Bearing in mind that the foo element may actually be quite complicated.
The best I could come up with was
def insert_before(elem, child):
    elem.insert(0, child)
    child.tail, elem.text = elem.text, None
But is there a function or argument in the API that I missed?
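For what it's worth, a quick sanity check of that helper on the example from the question (a sketch; as far as I know lxml has no single built-in call for inserting an element before an element's leading text):
from lxml import etree

def insert_before(elem, child):
    # Move elem's leading text onto the new child's tail so the child ends up first.
    elem.insert(0, child)
    child.tail, elem.text = elem.text, None

xml = "<root><foo>some text</foo></root>"
root = etree.fromstring(xml)
foo = root.find("foo")
insert_before(foo, etree.Element("bar"))
print(etree.tostring(foo))  # b'<foo><bar/>some text</foo>'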

Python lxml: how to fetch XML tag names with xpath selector?

I'm trying to parse the following XML using Python and lxml:
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="/bind9.xsl"?>
<isc version="1.0">
<bind>
<statistics version="2.2">
<memory>
<summary>
<TotalUse>1232952256
</TotalUse>
<InUse>835252452
</InUse>
<BlockSize>598212608
</BlockSize>
<ContextSize>52670016
</ContextSize>
<Lost>0
</Lost>
</summary>
</memory>
</statistics>
</bind>
</isc>
The goal is to extract the tag name and text of every element under bind/statistics/memory/summary in order to produce the following mapping:
TotalUse: 1232952256
InUse: 835252452
BlockSize: 598212608
ContextSize: 52670016
Lost: 0
I've managed to extract the element values, but I can't figure out the xpath expression to get the element tag names.
A sample script:
from lxml import etree as et

def main():
    xmlfile = "bind982.xml"
    location = "bind/statistics/memory/summary/*"
    label_selector = "??????"  ## what to put here...?
    value_selector = "text()"
    with open(xmlfile, "r") as data:
        xmldata = et.parse(data)
    etree = xmldata.getroot()
    statlist = etree.xpath(location)
    for stat in statlist:
        label = stat.xpath(label_selector)[0]
        value = stat.xpath(value_selector)[0]
        print "{0}: {1}".format(label, value)

if __name__ == '__main__':
    main()
I know I could use stat.tag instead of stat.xpath(), but the script must be sufficiently generic to also process other pieces of XML where the label selector is different.
What xpath selector would return an element's tag name?
Simply use XPath's name(), and remove the zero index, since it returns a string and not a list.
from lxml import etree as et

def main():
    xmlfile = "ExtractXPathTagName.xml"
    location = "bind/statistics/memory/summary/*"
    label_selector = "name()"
    value_selector = "text()"
    with open(xmlfile, "r") as data:
        xmldata = et.parse(data)
    etree = xmldata.getroot()
    statlist = etree.xpath(location)
    for stat in statlist:
        label = stat.xpath(label_selector)
        value = stat.xpath(value_selector)[0]
        print("{0}: {1}".format(label, value).strip())

if __name__ == '__main__':
    main()
Output
TotalUse: 1232952256
InUse: 835252452
BlockSize: 598212608
ContextSize: 52670016
Lost: 0
I don't think you need XPath for the two values; the element nodes have tag and text properties, so you can use, for instance, a list comprehension:
[(element.tag, element.text) for element in etree.xpath(location)]
Or, if you really want to use XPath:
result = [(element.xpath('name()'), element.xpath('string()')) for element in etree.xpath(location)]
You could of course also construct a list of dictionaries:
result = [{ element.tag : element.text } for element in etree.xpath(location)]
or
result = [{ element.xpath('name()') : element.xpath('string()') } for element in etree.xpath(location)]
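If what you actually want is a single mapping rather than a list of one-entry dictionaries, a plain dict comprehension over the same result of etree.xpath(location) works too (a sketch reusing the etree root and location from above; strip() removes the trailing newline inside each element):
result = {element.tag: element.text.strip() for element in etree.xpath(location)}
# e.g. {'TotalUse': '1232952256', 'InUse': '835252452', ...}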

Python LXML iterparse function: memory not getting freed while parsing a huge XML

I am parsing big XMLs (~500 MB) with the help of the LXML library in Python. I have used BeautifulSoup with the lxml-xml parser for small files, but when I came across huge XMLs it was inefficient, as it reads the whole file at once and then parses it.
I need to parse an XML to get root-to-leaf paths (except the outermost tag).
eg.
<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE A>
<A>
<B>
<C>
abc
</C>
<D>
abd
</D>
</B>
</A>
The XML above should give the following keys and values as output (root-to-leaf paths):
A.B.C = abc
A.B.D = abd
Here's the code that I've written to parse it:
(ignore1 and ignore2 are the tags that need to be ignored, and tu.clean_text() is the function which will remove unnecessary characters)
def fast_parser(filename, keys, values, ignore1, ignore2):
    context = etree.iterparse(filename, events=('start', 'end',))
    path = list()
    i = 0
    lastevent = ""
    for event, elem in context:
        i += 1
        tag = elem.tag if "}" not in elem.tag else elem.tag.split('}', 1)[1]
        if tag == ignore1 or tag == ignore2:
            pass
        elif event == "start":
            path.append(tag)
        elif event == "end":
            if lastevent == "start":
                keys.append(".".join(path))
                values.append(tu.clean_text(elem.text))
            # free memory
            elem.clear()
            while elem.getprevious() is not None:
                del elem.getparent()[0]
            if len(path) > 0:
                path.pop()
        lastevent = event
    del context
    return keys, values
I have already referred to the following article for parsing a large file: ibm.com/developerworks/xml/library/x-hiperfparse/#listing4
A screenshot of the top command shows memory usage going beyond 2 GB for a ~500 MB XML file. I suspect that memory is not getting freed.
I have already gone through a few StackOverflow questions, but they didn't help. Please advise.
I took the code from https://stackoverflow.com/a/7171543/131187, chopped out comments and print statements, and added a suitable func to get this. I wouldn't like to guess how much time it would take to process a 500 MB file!
Even in writing func I have done nothing original, having adopted the original authors' use of the xpath expression, 'ancestor-or-self::*', to provide the absolute path that you want.
However, since this code conforms more closely to the original scripts it might not leak memory.
import lxml.etree as ET

input_xml = 'temp.xml'
for line in open(input_xml).readlines():
    print(line[:-1])

def mod_fast_iter(context, func, *args, **kwargs):
    for event, elem in context:
        func(elem, *args, **kwargs)
        elem.clear()
        for ancestor in elem.xpath('ancestor-or-self::*'):
            while ancestor.getprevious() is not None:
                del ancestor.getparent()[0]
    del context

def func(elem):
    content = '' if not elem.text else elem.text.strip()
    if content:
        ancestors = elem.xpath('ancestor-or-self::*')
        print('%s=%s' % ('.'.join([_.tag for _ in ancestors]), content))

print('\nResult:\n')
context = ET.iterparse(open(input_xml, 'rb'), events=('end',))
mod_fast_iter(context, func)
Output:
<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE A>
<A>
<B>
<C>
abc
</C>
<D>
abd
</D>
</B>
</A
Result:
A.B.C=abc
A.B.D=abd
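If you need the keys and values lists from your original fast_parser rather than printed output, the callback can simply append to them instead (a sketch reusing mod_fast_iter, ET and input_xml from above; tu.clean_text is assumed to be your own helper):
keys, values = [], []

def collect(elem):
    # Only leaf elements with non-empty text contribute a path/value pair.
    content = '' if not elem.text else elem.text.strip()
    if content:
        keys.append('.'.join(a.tag for a in elem.xpath('ancestor-or-self::*')))
        values.append(content)  # or tu.clean_text(elem.text), if that cleaning is still wanted

context = ET.iterparse(open(input_xml, 'rb'), events=('end',))
mod_fast_iter(context, collect)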

Getting subelements using lxml and iterparse

I am trying to write a parsing algorithm to efficiently pull data from an XML document. I am currently rolling through the document based on elements and children, but would like to use iterparse instead. One issue is that I have a list of elements whose child data I want to pull when they are found, but it seems like with iterparse my options are either to filter on a single element name or to get every single element.
Example xml:
<?xml version="1.0" encoding="UTF-8"?>
<data_object xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
<source id="0">
<name>Office Issues</name>
<datetime>2012-01-13T16:09:15</datetime>
<data_id>7</data_id>
</source>
<event id="125">
<date>2012-11-06</date>
<state_id>7</state_id>
</event>
<state id="7">
<name>Washington</name>
</state>
<locality id="2">
<name>Olympia</name>
<state_id>7</state_id>
<type>City</type>
</locality>
<locality id="3">
<name>Town</name>
<state_id>7</state_id>
<type>Town</type>
</locality>
</data_object>
Code example:
from lxml import etree

fname = "test.xml"
ELEMENT_LIST = ["source", "event", "state", "locality"]

with open(fname) as xml_doc:
    context = etree.iterparse(xml_doc, events=("start", "end"))
    context = iter(context)
    event, root = context.next()
    base = False
    b_name = ""
    for event, elem in context:
        if event == "start" and elem.tag in ELEMENT_LIST:
            base = True
            b_name = elem.tag
            children = elem.getchildren()
            child_list = []
            for child in children:
                child_list.append(child.tag)
            print b_name + ":" + str(child_list)
        elif event == "end" and elem.tag in ELEMENT_LIST:
            base = False
            root.clear()
With iterparse you cannot limit parsing to a set of tags; you can do this only for a single tag (by passing the tag argument). However, it is easy to do manually what you would like to achieve, as in the following snippet:
from lxml import etree

fname = "test.xml"
ELEMENT_LIST = ["source", "event", "state", "locality"]

with open(fname) as xml_doc:
    context = etree.iterparse(xml_doc, events=("start", "end"))
    for event, elem in context:
        if event == "start" and elem.tag in ELEMENT_LIST:
            print "this elem is interesting, do some processing: %s: [%s]" % (elem.tag, ", ".join(child.tag for child in elem))
        elem.clear()
you limit your search to the interesting tags only. An important part of using iterparse is elem.clear(), which frees memory once an element is no longer needed. That is why it is memory efficient; see http://lxml.de/parsing.html#modifying-the-tree
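As a side note, if your lxml version is recent enough (3.x or later, if I remember correctly), iterparse itself accepts a sequence of tag names, so the filtering can be pushed into the parser instead of the loop (a sketch under that assumption):
from lxml import etree

fname = "test.xml"
ELEMENT_LIST = ["source", "event", "state", "locality"]

# With a tag sequence, only the interesting elements are ever delivered to the loop.
for event, elem in etree.iterparse(fname, events=("end",), tag=ELEMENT_LIST):
    print("%s: [%s]" % (elem.tag, ", ".join(child.tag for child in elem)))
    elem.clear()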
I would use XPath instead. It's much more elegant than walking the document on your own and, I assume, certainly more efficient.
Use tag='{http://www.sitemaps.org/schemas/sitemap/0.9}url'
A similar question with the right answer: https://stackoverflow.com/a/7019273/1346222
#!/usr/bin/python
# coding: utf-8
""" Parsing xml file. Basic example """
from StringIO import StringIO
from lxml import etree
import urllib2

sitemap = urllib2.urlopen(
    'http://google.com/sitemap.xml',
    timeout=10
).read()

NS = {
    'x': 'http://www.sitemaps.org/schemas/sitemap/0.9',
    'x2': 'http://www.google.com/schemas/sitemap-mobile/1.0'
}

res = []
urls = etree.iterparse(StringIO(sitemap), tag='{http://www.sitemaps.org/schemas/sitemap/0.9}url')
for event, url in urls:
    t = url.xpath('.//x:loc/text() | .//x:priority/text()', namespaces=NS)
    t.append(url.xpath('boolean(.//x2:mobile)', namespaces=NS))
    res.append(t)

Getting unique value when the same tag is in children's tree in XML with Python

I have a getElementText function, shown below, which works pretty well with [0] because the XML I'm working on doesn't have duplicate tags.
from xml.dom import minidom

def getElementText(element, tagName):
    return str(element.getElementsByTagName(tagName)[0].firstChild.data)

doc = minidom.parse("/Users/smcho/Desktop/hello.xml")
outputTree = doc.getElementsByTagName("Output")[0]
print getElementText(outputTree, "Number")
However, when I parse the following XML, getElementText(outputTree, "Number") doesn't give me the outer <Number>0</Number> but the one inside <ConnectedTerminal> (<Number>1</Number>), because the function returns the first of the two elements with the tag "Number".
<Output>
<ConnectedTerminal>
<Node>5</Node>
<Number>1</Number>
</ConnectedTerminal>
<Type>int8</Type>
<Number>0</Number>
</Output>
Any solution to this problem? Is there any way to get only <Number>0</Number> or only <ConnectedTerminal><Number>1</Number></ConnectedTerminal>?
If lxml is an option (it's much nicer than minidom) you can do:
from lxml import etree
doc = etree.fromstring(xml)
node = doc.find('Number')
print node.text # 0
node = doc.xpath('//ConnectedTerminal/Number')[0]
print node.text # 1
Also see the xpath tutorial.
There's not a direct DOM method to do this, no. But it's fairly easy to write one:
def getChildElementsByTagName(element, tag):
    children = []
    for child in element.childNodes:
        if child.nodeType == child.ELEMENT_NODE and tag in (child.tagName, '*'):
            children.append(child)
    return children
Plus here's a safer text-getting function, so you don't have to worry about multiple nodes, missing nodes due to blank strings, or CDATA sections.
def getTextContent(element):
    texts = []
    for child in element.childNodes:
        if child.nodeType == child.ELEMENT_NODE:
            texts.append(getTextContent(child))
        elif child.nodeType == child.TEXT_NODE:
            texts.append(child.data)
    return u''.join(texts)
then just:
>>> getTextContent(getChildElementsByTagName(getChildElementsByTagName(doc, u'Output')[0], u'Number')[0])
u'0'
>>> getTextContent(getChildElementsByTagName(doc, u'Output')[0].getElementsByTagName(u'Number')[0])
u'1'