Parsing XML file etree module - python

I'm reading an XML file using the ElementTree module, and the following code prints the values of the <page> and <title> tags. It works fine, but I'd like one small change: if the id attribute exists on <page id='...'>, print the value of the tag. Is that possible? Thanks.
import xml.etree.cElementTree as etree
from pprint import pprint

tree = etree.parse('find_title.xml')
for value in tree.getiterator(tag='title'):
    print value.text
for value in tree.getiterator(tag='page'):
    pprint(value.attrib)
Here is my XML file:
<mediawiki>
  <siteinfo>
    <sitename>Wiki</sitename>
    <namespaces>
      <namespace key="-2" case="first-letter">Media</namespace>
    </namespaces>
  </siteinfo>
  <page id="31239628" orglength="6822" newlength="4524" stub="0" categories="0" outlinks="1" urls="10">
    <title>Title</title>
    <categories></categories>
    <links>15099779</links>
    <urls>
    </urls>
    <text>
      Books
    </text>
  </page>
</mediawiki>

for el in tree.getiterator(tag='page'):
    page_id = el.get('id', None)  # returns the second arg if id does not exist
    if page_id:
        print page_id, el.find('title').text
    else:
        pprint(el.attrib)
Edit: Updated for the comment: "Thanks can i print page_id and title at same time? Means 31239628 - Title"

The element.get() method is used to retrieve optional attribute values from a tag:
>>> page_id = tree.find('page').get('id')
>>> if page_id:
...     print page_id
...
31239628
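
If you want the exact "31239628 - Title" format from the comment, a one-line variation of the loop above should do it:

for el in tree.getiterator(tag='page'):
    page_id = el.get('id')
    if page_id:
        print '%s - %s' % (page_id, el.find('title').text)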

Related

How to loop over tags in XML file using python

My XML file structure is:
<tp:Package xml:lang='en-US' xmlns:tp='http://myorg.org/2016/mypackage'>
  <tp:identifier>http://www.myweb.com/</tp:identifier>
  <tp:name>MyName</tp:name>
  <tp:description xml:lang='en-US'>My Description</tp:description>
  <tp:version>2020-01-01</tp:version>
  <tp:license href='http://www.myweb.com/terms/TermsConditions.html' name='Terms and Conditions' />
  <tp:publisher>MyPublisher</tp:publisher>
  <tp:publisherURL>http://www.mypublisherurl.com/</tp:publisherURL>
  <tp:publisherCountry>US</tp:publisherCountry>
  <tp:publicationDate>2020-01-01</tp:publicationDate>
  <tp:entryPoints>
    <tp:entryPoint>
      <tp:name>Form A</tp:name>
      <tp:description>This is Form A.</tp:description>
      <tp:version>v313</tp:version>
      <tp:entryPointDocument href='http://www.myweb.com/myfile.xsd' />
      <tp:formType>1</tp:formType>
    </tp:entryPoint>
    <tp:entryPoint>
      <tp:name>Form B</tp:name>
      <tp:description>This is Form B.</tp:description>
      <tp:version>v313</tp:version>
      <tp:entryPointDocument href='http://www.myweb.com/myfile.xsd' />
      <tp:formType>2</tp:formType>
    </tp:entryPoint>
  </tp:entryPoints>
</tp:Package>
How do I read this file using etree, loop over each tag, and print the values of the elements tp:name, tp:description, tp:version, tp:entryPointDocument, and tp:formType?
Here is my partial Python code:
from lxml import etree

tree = etree.parse(xmlfilepath)
root = tree.getroot()
for elt in root.xpath("//tp:entryPoints", namespaces={'tp': 'http://myorg.org/2016/mypackage'}):
    print(elt)
Try this:
package = """your xml above"""

from lxml import etree

tree = etree.fromstring(package)
for elt in tree.xpath("//tp:entryPoints//*", namespaces={'tp': 'http://myorg.org/2016/mypackage'}):
    print(elt.text)
Output:
Form A
This is Form A.
v313
None
1
Form B
This is Form B.
v313
None
2
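
Note that tp:entryPointDocument prints as None above because its value lives in the href attribute, not in element text. If you want the five fields explicitly, including the document URL, a sketch along these lines (reusing the same namespace mapping) should work:

ns = {'tp': 'http://myorg.org/2016/mypackage'}
for ep in tree.xpath("//tp:entryPoint", namespaces=ns):
    print(ep.findtext("tp:name", namespaces=ns))
    print(ep.findtext("tp:description", namespaces=ns))
    print(ep.findtext("tp:version", namespaces=ns))
    doc = ep.find("tp:entryPointDocument", namespaces=ns)
    if doc is not None:
        print(doc.get("href"))  # the URL is an attribute, not text
    print(ep.findtext("tp:formType", namespaces=ns))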

How can I parse a Wikipedia XML dump with Python?

I have:
import xml.etree.ElementTree as ET

def strip_tag_name(t):
    t = elem.tag
    idx = k = t.rfind("}")
    if idx != -1:
        t = t[idx + 1:]
    return t

events = ("start", "end")

title = None
for event, elem in ET.iterparse('data/enwiki-20190620-pages-articles-multistream.xml', events=events):
    tname = strip_tag_name(elem.tag)
    if event == 'end':
        if tname == 'title':
            title = elem.text
        elif tname == 'page':
            print(title, elem.text)
This seems to give the title just fine, but the page text always seems blank. What am I missing?
I haven't been able to open the file (it's huge), but I think this is an accurate snippet:
<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.10/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.10/ http://www.mediawiki.org/xml/export-0.10.xsd" version="0.10" xml:lang="en">
  <siteinfo>
    <sitename>Wikipedia</sitename>
    <dbname>enwiki</dbname>
    <base>https://en.wikipedia.org/wiki/Main_Page</base>
    <generator>MediaWiki 1.29.0-wmf.12</generator>
    <case>first-letter</case>
    <namespaces>
      ...
    </namespaces>
  </siteinfo>
  <page>
    <title>AccessibleComputing</title>
    <ns>0</ns>
    <id>10</id>
    <redirect title="Computer accessibility" />
    <revision>
      <id>631144794</id>
      <parentid>381202555</parentid>
      <timestamp>2014-10-26T04:50:23Z</timestamp>
      <contributor>
        <username>Paine Ellsworth</username>
        <id>9092818</id>
      </contributor>
      <comment>add [[WP:RCAT|rcat]]s</comment>
      <model>wikitext</model>
      <format>text/x-wiki</format>
      <text xml:space="preserve">#REDIRECT [[Computer accessibility]]
{{Redr|move|from CamelCase|up}}</text>
      <sha1>4ro7vvppa5kmm0o1egfjztzcwd0vabw</sha1>
    </revision>
  </page>
  <page>
    <title>Anarchism</title>
    <ns>0</ns>
    <id>12</id>
    <revision>
      <id>766348469</id>
      <parentid>766047928</parentid>
      <timestamp>2017-02-19T18:08:07Z</timestamp>
      <contributor>
        <username>GreenC bot</username>
        <id>27823944</id>
      </contributor>
      <minor />
      <comment>Reformat 1 archive link. [[User:Green Cardamom/WaybackMedic_2.1|Wayback Medic 2.1]]</comment>
      <model>wikitext</model>
      <format>text/x-wiki</format>
      <text xml:space="preserve">
        ...
      </text>
    </revision>
  </page>
</mediawiki>
The best approach is to use the MWXML Python package, part of the Mediawiki Utilities (installable with pip3 install mwxml). MWXML is designed to solve this specific problem and is widely used. It was created by research staff at the Wikimedia Foundation and is maintained by researchers inside and outside the foundation.
Here's a code example adapted from an example notebook distributed with the library that prints out page IDs, revision IDs, timestamp, and the length of the text:
import mwxml
import glob

paths = glob.glob('/public/dumps/public/nlwiki/20151202/nlwiki-20151202-pages-meta-history*.xml*.bz2')

def process_dump(dump, path):
    for page in dump:
        for revision in page:
            yield page.id, revision.id, revision.timestamp, len(revision.text)

for page_id, rev_id, rev_timestamp, rev_textlength in mwxml.map(process_dump, paths):
    print("\t".join(str(v) for v in [page_id, rev_id, rev_timestamp, rev_textlength]))
The full example from which this is adapted reports the number of added and removed image links within each revision. It is fully documented but includes only 25 lines of code.
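If you have a single, already-decompressed dump file rather than a set of .bz2 splits, a simpler sketch (assuming mwxml's documented Dump.from_file constructor and the file path from the question) would be:

import mwxml

dump = mwxml.Dump.from_file(open('data/enwiki-20190620-pages-articles-multistream.xml'))
for page in dump:
    for revision in page:
        # revision.text may be None for some revisions, so guard the len()
        print(page.id, page.title, len(revision.text or ''))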
The text property refers to the text between the element tags (i.e. <tag>text</tag>), not to all the child elements. Thus, in the case of the title element one has:
<title>AccessibleComputing</title>
and the text between the tags is AccessibleComputing.
In the case of the page element, the only text defined is '\n ' and there are other child elements (see below), including the title element:
<page>
  <title>Anarchism</title>
  <ns>0</ns>
  <id>12</id>
  ...
</page>
See the w3schools XML page for more details.
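To make the distinction concrete, here is a small self-contained sketch (using an inline snippet rather than the dump):

import xml.etree.ElementTree as ET

page = ET.fromstring('<page>\n  <title>Anarchism</title>\n  <ns>0</ns>\n</page>')
print(repr(page.text))          # '\n  ' -- only the whitespace before <title>
print(page.find('title').text)  # 'Anarchism' -- the text of the child element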
If you want to parse the file, I would recommend using either the findall method:
from lxml import etree
from lxml.etree import tostring

tree = etree.parse('data/enwiki-20190620-pages-articles-multistream.xml')
root = tree.getroot()

# iterate through all the titles
for title in root.findall(".//title", namespaces=root.nsmap):
    print(tostring(title))
    print(title.text)
which generates this output:
b'<title xmlns="http://www.mediawiki.org/xml/export-0.10/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">AccessibleComputing</title>\n '
AccessibleComputing
b'<title xmlns="http://www.mediawiki.org/xml/export-0.10/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">Anarchism</title>\n '
Anarchism
or the xpath method:
nsmap = root.nsmap
nsmap['x'] = root.nsmap[None]
nsmap.pop(None)

# iterate through all the pages
for page in root.findall(".//x:page", namespaces=nsmap):
    print(page)
    print(repr(page.text))  # which prints '\n '
    print('number of children: %i' % len(page.getchildren()))
and the output is:
<Element {http://www.mediawiki.org/xml/export-0.10/}page at 0x7ff75cc610c8>
'\n '
number of children: 5
<Element {http://www.mediawiki.org/xml/export-0.10/}page at 0x7ff75cc71bc8>
'\n '
number of children: 5
Please see the lxml tutorial for more details.
You are trying to get the content of the text property of the <page> element, but that is just whitespace.
To get the text of the <text> element, just change
elif tname == 'page':
to
elif tname == 'text':
For XML parsing I use the untangle package from PyPI, which presents a complete document view. Then you have:
import untangle

doc = untangle.parse('data/enwiki-20190620-pages-articles-multistream.xml')
for page in doc.mediawiki.page:
    print(page.title.cdata)
    for text in page.revision.text:
        print(text.cdata)
To get the Wikipedia article, you need to access the content of the text property of the <text> element, and not the <page> element.
Here is the corrected version of your code:
import xml.etree.ElementTree as ET

def strip_tag_name(t):
    idx = t.rfind("}")
    if idx != -1:
        t = t[idx + 1:]
    return t

events = ("start", "end")

title = None
for event, elem in ET.iterparse('data/enwiki-20190620-pages-articles-multistream.xml', events=events):
    tname = strip_tag_name(elem.tag)
    if event == 'end':
        if tname == 'title':
            title = elem.text
        elif tname == 'text':
            print(title, elem.text)
        elem.clear()
Since the Wikipedia dump is quite large, don't forget the elem.clear() at the end of the for loop.
As mentioned in mzjn's answer, the content of the text property of the <page> element is just whitespace.

extract tag inside text element of an xml tag

Suppose I have an XML document of the following form
<root>
  <foos>
    <foo>the quick <bar>brown </bar>fox</foo>
  </foos>
  <!-- Lots more <foo></foo> -->
</root>
How do I extract the full text string the quick fox as well as the string brown?
import xml.etree.ElementTree as ET

doc = ET.parse(xmldocument).getroot()
foos = doc.find('foos')
for foo in foos:
    print foo.text  # This will print 'the quick '
Not sure how to solve this problem.
You can also try something like this, which iterates over all nested tags automatically:
foos = doc.find('foos')
for foo in foos:
    for text in foo.itertext():
        print text.strip(),
    print
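itertext() yields every piece ('the quick ', 'brown ', 'fox'), so the loop above prints 'the quick brown fox'. If you specifically want 'the quick fox' without the <bar> content, you can combine the element's .text with the child's .tail; a sketch, assuming a single <bar> child as in the example:

for foo in foos:
    bar = foo.find('bar')
    print (foo.text or '') + (bar.tail or '')  # 'the quick fox'
    print bar.text.strip()                     # 'brown'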
from scrapy.selector import XmlXPathSelector

xml = """
<root>
  <foos>
    <foo>the quick <bar>brown </bar>fox</foo>
  </foos>
</root>
"""

hxs = XmlXPathSelector(text=xml)
foos = hxs.select('//foos')
for one in foos:
    text = one.select('./foo//text()').extract()
    text = ''.join(text)
    print text

Print XML tag attributes using element tree

I'm using the following statements to print the value of the title tag. It's working fine. But I also want to print <page id='...' ....... Is it possible? Thanks.
<mediawiki>
  <siteinfo>
    <sitename>Wiki</sitename>
    <namespaces>
      <namespace key="-2" case="first-letter">Media</namespace>
    </namespaces>
  </siteinfo>
  <page id="31239628" orglength="6822" newlength="4524" stub="0" categories="0" outlinks="1" urls="10">
    <title>Title</title>
    <categories></categories>
    <links>15099779</links>
    <urls>
    </urls>
    <text>
      Books
    </text>
  </page>
</mediawiki>
Here is my working code, which prints the title tag values:
import xml.etree.cElementTree as etree

tree = etree.parse('find_title.xml')
for value in tree.getiterator(tag='title'):
    print value.text
You can try the following:
import xml.etree.cElementTree as etree
from pprint import pprint

tree = etree.parse('find_title.xml')
for value in tree.getiterator(tag='title'):
    print value.text
for value in tree.getiterator(tag='page'):
    pprint(value.attrib)
It should output something like this:
$ python file.py
Title
{'categories': '0',
 'id': '31239628',
 'newlength': '4524',
 'orglength': '6822',
 'outlinks': '1',
 'stub': '0',
 'urls': '10'}

XML writing tools for Python

I'm currently trying ElementTree and it looks fine; it escapes HTML entities, and so on and so forth. Am I missing something truly wonderful I haven't heard of?
This is similar to what I'm actually doing:
import xml.etree.ElementTree as ET

root = ET.Element('html')
head = ET.SubElement(root, 'head')
script = ET.SubElement(head, 'script')
script.set('type', 'text/javascript')
script.text = "var a = 'I love á letters'"
body = ET.SubElement(root, 'body')
h1 = ET.SubElement(body, 'h1')
h1.text = "And I like the fact that 3 > 1"
tree = ET.ElementTree(root)
tree.write('foo.xhtml')
# more foo.xhtml
<html><head><script type="text/javascript">var a = 'I love &aacute;
letters'</script></head><body><h1>And I like the fact that 3 > 1</h1>
</body></html>
Another way is to use the E factory builder from lxml (also available in ElementTree):
>>> from lxml import etree
>>> from lxml.builder import E
>>> def CLASS(*args): # class is a reserved word in Python
... return {"class":' '.join(args)}
>>> html = page = (
... E.html( # create an Element called "html"
... E.head(
... E.title("This is a sample document")
... ),
... E.body(
... E.h1("Hello!", CLASS("title")),
... E.p("This is a paragraph with ", E.b("bold"), " text in it!"),
... E.p("This is another paragraph, with a", "\n ",
... E.a("link", href="http://www.python.org"), "."),
... E.p("Here are some reserved characters: <spam&egg>."),
... etree.XML("<p>And finally an embedded XHTML fragment.</p>"),
... )
... )
... )
>>> print(etree.tostring(page, pretty_print=True))
<html>
<head>
<title>This is a sample document</title>
</head>
<body>
<h1 class="title">Hello!</h1>
<p>This is a paragraph with <b>bold</b> text in it!</p>
<p>This is another paragraph, with a
link.</p>
<p>Here are some reserved characters: &lt;spam&amp;egg&gt;.</p>
<p>And finally an embedded XHTML fragment.</p>
</body>
</html>
There's always SimpleXMLWriter, part of the ElementTree toolkit. The interface is dead simple.
Here's an example:
from elementtree.SimpleXMLWriter import XMLWriter
import sys
w = XMLWriter(sys.stdout)
html = w.start("html")
w.start("head")
w.element("title", "my document")
w.element("meta", name="generator", value="my application 1.0")
w.end()
w.start("body")
w.element("h1", "this is a heading")
w.element("p", "this is a paragraph")
w.start("p")
w.data("this is ")
w.element("b", "bold")
w.data(" and ")
w.element("i", "italic")
w.data(".")
w.end("p")
w.close(html)
I assume that you're actually creating an XML DOM tree, because you want to validate that what goes into this file is valid XML, since otherwise you'd just write a static string to a file. If validating your output is indeed your goal, then I'd suggest
from xml.dom.minidom import parseString

doc = parseString("""<html>
<head>
<script type="text/javascript">
var a = 'I love &#225; letters'
</script>
</head>
<body>
<h1>And I like the fact that 3 > 1</h1>
</body>
</html>""")
with open("foo.xhtml", "w") as f:
    f.write(doc.toxml())
This lets you just write the XML you want to output, validate that it's correct (since parseString will raise an exception if it's invalid) and have your code look much nicer.
Presumably you're not just writing the same static XML every time and want some substitution. In this case I'd have lines like
var a = '%(message)s'
and then use the % operator to do the substitution, like
</html>""" % {"message": "I love &#225; letters"})
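
Putting those fragments together, here is a minimal sketch of the substitute-then-validate pattern (the template and message value are hypothetical):

from xml.dom.minidom import parseString

template = """<html>
<head><script type="text/javascript">var a = '%(message)s'</script></head>
<body><h1>A heading</h1></body>
</html>"""

# parseString raises ExpatError if the substituted result is not well-formed.
# Note that named HTML entities such as &aacute; are not predefined in XML,
# so numeric references like &#225; are safer in substituted values.
doc = parseString(template % {"message": "I love &#225; letters"})
with open("foo.xhtml", "wb") as f:
    f.write(doc.toxml(encoding="utf-8"))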
https://github.com/galvez/xmlwitch:
import xmlwitch

xml = xmlwitch.Builder(version='1.0', encoding='utf-8')
with xml.feed(xmlns='http://www.w3.org/2005/Atom'):
    xml.title('Example Feed')
    xml.updated('2003-12-13T18:30:02Z')
    with xml.author:
        xml.name('John Doe')
    xml.id('urn:uuid:60a76c80-d399-11d9-b93C-0003939e0af6')
    with xml.entry:
        xml.title('Atom-Powered Robots Run Amok')
        xml.id('urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a')
        xml.updated('2003-12-13T18:30:02Z')
        xml.summary('Some text.')
print(xml)
Don't you actually want something like:
html(head(script(type='text/javascript', content='var a = ...')),
body(h1('And I like the fact that 3 < 1'), p('just some paragraph'))
I think I saw something like that somewhere. This would be wonderful.
EDIT: Actually, I went and wrote a library today to do just that: magictree
You can use it like this:
import xml.etree.ElementTree as ET
from magictree import html, head, script, body, h1, p

root = html(
    head(
        script("var a = 'I love &aacute; letters'",
               type='text/javascript')),
    body(
        h1('And I like the fact that 3 > 1')))

# root is a plain Element object, like those created with ET.Element...
# so you can write it out using ElementTree :)
tree = ET.ElementTree(root)
tree.write('foo.xhtml')
The magic in magictree lies in how the importing works: the Element factories are created when needed. Have a look at the source; it is based on an answer to another StackOverflow question.
I ended up using saxutils.escape(str) to generate valid XML strings and then validating the result with Eli's approach to be sure I didn't miss any tag:

from xml.sax import saxutils
from xml.dom.minidom import parseString
from xml.parsers.expat import ExpatError

xml = ('<?xml version="1.0" encoding="%s"?>\n'
       '<contents title="%s" crawl_date="%s" in_text_date="%s" url="%s">\n'
       '<main_post>%s</main_post>\n</contents>'
       % (self.encoding, saxutils.escape(title), saxutils.escape(time),
          saxutils.escape(date), saxutils.escape(url), saxutils.escape(contents)))
try:
    minidoc = parseString(xml)
except ExpatError:
    print "Invalid xml"
For anyone encountering this now, there's actually a way to do this hidden away in Python's standard library in xml.sax.saxutils.XMLGenerator. Here's an example of it in action:
>>> from xml.sax.saxutils import XMLGenerator
>>> import StringIO
>>> out = StringIO.StringIO()
>>> w = XMLGenerator(out, 'utf-8')
>>> w.startDocument()
>>> w.startElement("test", {'bar': 'baz'})
>>> w.characters("Foo")
>>> w.endElement("test")
>>> w.endDocument()
>>> print out.getvalue()
<?xml version="1.0" encoding="utf-8"?>
<test bar="baz">Foo</test>
Try http://uche.ogbuji.net/tech/4suite/amara. It is quite complete and has a straightforward set of access tools, normal Unicode support, etc.
#
# Output the XML entry
#
def genFileOLD(out, label, term, idval):
    filename = entryTime() + ".html"
    writer = MarkupWriter(out, indent=u"yes")
    writer.startDocument()
    # Test element and attribute writing
    ans = namespace = u'http://www.w3.org/2005/Atom'
    xns = namespace = u'http://www.w3.org/1999/xhtml'
    writer.startElement(u'entry',
                        ans,
                        extraNss={u'x': u'http://www.w3.org/1999/xhtml',
                                  u'dc': u'http://purl.org/dc/elements/1.1'})
    # u'a': u'http://www.w3.org/2005/Atom',
    # writer.attribute(u'xml:lang', unicode("en-UK"))
    writer.simpleElement(u'title', ans, content=unicode(label))
    # writer.simpleElement(u'a:subtitle', ans, content=u' ')
    id = unicode("http://www.dpawson.co.uk/nodesets/" + afn.split(".")[0])
    writer.simpleElement(u'id', ans, content=id)
    writer.simpleElement(u'updated', ans, content=unicode(dtime()))
    writer.startElement(u'author', ans)
    writer.simpleElement(u'name', ans, content=u'Dave ')
    writer.simpleElement(u'uri', ans,
                         content=u'http://www.dpawson.co.uk/nodesets/' + afn + ".xml")
    writer.endElement(u'author')
    writer.startElement(u'category', ans)
    if (prompt):
        label = unicode(raw_input("Enter label "))
    writer.attribute(u'label', unicode(label))
    if (prompt):
        term = unicode(raw_input("Enter term to use "))
    writer.attribute(u'term', unicode(term))
    writer.endElement(u'category')
    writer.simpleElement(u'rights', ans, content=u'\u00A9 Dave 2005-2008')
    writer.startElement(u'link', ans)
    writer.attribute(u'href',
                     unicode("http://www.dpawson.co.uk/nodesets/entries/" + afn + ".html"))
    writer.attribute(u'rel', unicode("alternate"))
    writer.endElement(u'link')
    writer.startElement(u'published', ans)
    dt = dtime()
    dtu = unicode(dt)
    writer.text(dtu)
    writer.endElement(u'published')
    writer.simpleElement(u'summary', ans, content=unicode(label))
    writer.startElement(u'content', ans)
    writer.attribute(u'type', unicode("xhtml"))
    writer.startElement(u'div', xns)
    writer.simpleElement(u'h3', xns, content=unicode(label))
    writer.endElement(u'div')
    writer.endElement(u'content')
    writer.endElement(u'entry')
