Read XML on Python - python

I'm using minidom to read an XML document. I need to find the "Member" elements and, if the "Name" attribute starts with "W" or "F", read the text inside the "MultiLanguageText" element.
Is it possible?
I tried this:
mydoc = minidom.parse('DB.xml')
items = mydoc.getElementsByTagName('Member')
for elem in items:
valor = elem.attributes['Name'].value
if (valor[0] == "F" or valor[0] == "W") and len(valor)<4:
print(len(valor))
print(valor)
texto = elem.SelectNodes('MultiLanguageText')
print(texto.localName)
XML text:
<Member Name="F00" Datatype="Bool">
<Comment>
<MultiLanguageText Lang="en-GB">
BottomSidePanels_Feeding_Zone_41 Fa 00. Material Jam. Material too much time going out Fw
</MultiLanguageText>
</Comment>
Thanks!

Using minidom it could be done as follows:
from xml.dom.minidom import parseString
xml = '''<root>
<Member Name="F00" Datatype="Bool">
<Comment>
<MultiLanguageText Lang="en-GB">
BottomSidePanels_Feeding_Zone_41 Fa 00. Material Jam. Material too much time going out Fw
</MultiLanguageText>
</Comment>
</Member>
</root>'''
root = parseString(xml)
items = root.getElementsByTagName('Member')
for elem in items:
valor = elem.attributes['Name'].value
if (valor[0] == "F" or valor[0] == "W") and len(valor) < 4:
texts = elem.getElementsByTagName('MultiLanguageText')
for text in texts:
print(text.firstChild.nodeValue)
It can be done a bit more simply with lxml, since it supports XPath expressions:
import lxml.etree as etree
root = etree.fromstring(xml)
text_elements = root.xpath('.//Member[starts-with(@Name, "F") or starts-with(@Name, "W")]//MultiLanguageText')
for text_element in text_elements:
print(text_element.text.strip())

Related

XML tag not found

After extracting XML from CDATA I can't find tags in the extracted XML. If I convert to string and then back to an ElementTree I can find the tags I'm looking for (un-comment the lines marked "UNCOMMENT ME"). Looking for a better / more correct way.
import xml.etree.ElementTree as ElementTree
XML = '''<?xml version="1.0" encoding="UTF-8"?>
<Catalog>
<Data><![CDATA[
<Book>
<Author>George Orwell</Author>
<Title>1984</Title>
</Book>
]]></Data>
</Catalog>
'''
def get_cdata_xml(xml_str: str) -> ElementTree:
xml_root = ElementTree.fromstring(xml_str)
cdata_xml = xml_root.find('.//Data')
return cdata_xml
if __name__ == '__main__':
cdata_xml = get_cdata_xml(XML)
#xml_str = cdata_xml.text # UNCOMMENT ME
#cdata_xml = ElementTree.fromstring(xml_str) #UNCOMMENT ME
# type(cdata_xml) = xml.etree.ElementTree.Element
author = cdata_xml.find('.//Author')
print(author.text)
A CDATA block is just a string; it's not XML content. You would need to parse the CDATA content with another call to `ElementTree.fromstring`:
cdata_xml = get_cdata_xml(XML)
book = ElementTree.fromstring(cdata_xml.text)
author = book.find(".//Author")
print(author.text)

Python lxml: how to fetch XML tag names with xpath selector?

I'm trying to parse the following XML using Python and lxml:
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="/bind9.xsl"?>
<isc version="1.0">
<bind>
<statistics version="2.2">
<memory>
<summary>
<TotalUse>1232952256
</TotalUse>
<InUse>835252452
</InUse>
<BlockSize>598212608
</BlockSize>
<ContextSize>52670016
</ContextSize>
<Lost>0
</Lost>
</summary>
</memory>
</statistics>
</bind>
</isc>
The goal is to extract the tag name and text of every element under bind/statistics/memory/summary in order to produce the following mapping:
TotalUse: 1232952256
InUse: 835252452
BlockSize: 598212608
ContextSize: 52670016
Lost: 0
I've managed to extract the element values, but I can't figure out the xpath expression to get the element tag names.
A sample script:
from lxml import etree as et
def main():
xmlfile = "bind982.xml"
location = "bind/statistics/memory/summary/*"
label_selector = "??????" ## what to put here...?
value_selector = "text()"
with open(xmlfile, "r") as data:
xmldata = et.parse(data)
etree = xmldata.getroot()
statlist = etree.xpath(location)
for stat in statlist:
label = stat.xpath(label_selector)[0]
value = stat.xpath(value_selector)[0]
print "{0}: {1}".format(label, value)
if __name__ == '__main__':
main()
I know I could use value = stat.tag instead of stat.xpath(), but the script must be sufficiently generic to also process other pieces of XML where the label selector is different.
What xpath selector would return an element's tag name?
Simply use XPath's name(), and remove the zero index, since this returns a string and not a list.
from lxml import etree as et
def main():
xmlfile = "ExtractXPathTagName.xml"
location = "bind/statistics/memory/summary/*"
label_selector = "name()" ## what to put here...?
value_selector = "text()"
with open(xmlfile, "r") as data:
xmldata = et.parse(data)
etree = xmldata.getroot()
statlist = etree.xpath(location)
for stat in statlist:
label = stat.xpath(label_selector)
value = stat.xpath(value_selector)[0]
print("{0}: {1}".format(label, value).strip())
if __name__ == '__main__':
main()
Output
TotalUse: 1232952256
InUse: 835252452
BlockSize: 598212608
ContextSize: 52670016
Lost: 0
I think you don't need XPath for the two values, the element nodes have properties tag and text so use for instance a list comprehension:
[(element.tag, element.text) for element in etree.xpath(location)]
Or if you really want to use XPath
result = [(element.xpath('name()'), element.xpath('string()')) for element in etree.xpath(location)]
You could of course also construct a list of dictionaries:
result = [{ element.tag : element.text } for element in root.xpath(location)]
or
result = [{ element.xpath('name()') : element.xpath('string()') } for element in etree.xpath(location)]

Finding element in xml with python

I am trying to parse XML before converting its content into lists and then into CSV. Unfortunately, I think my search terms for finding the initial element are failing, causing subsequent searches further down the hierarchy to fail as well. I am new to XML, so I've tried variations on namespace dictionaries and including the namespace references... The simplified XML is given below:
<?xml version="1.0" encoding="utf-8"?>
<StationList xmlns:xsd="http://www.w3.org/2001/XMLSchema"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xmlns:add="http://www.govtalk.gov.uk/people/AddressAndPersonalDetails"
xmlns:com="http://nationalrail.co.uk/xml/common" xsi:schemaLocation="http://internal.nationalrail.co.uk/xml/XsdSchemas/External/Version4.0/nre-station-v4-0.xsd"
xmlns="http://nationalrail.co.uk/xml/station">
<Station xsi:schemaLocation="http://internal.nationalrail.co.uk/xml/XsdSchemas/External/Version4.0/nre-station-v4-0.xsd">
<ChangeHistory>
<com:ChangedBy>spascos</com:ChangedBy>
<com:LastChangedDate>2018-11-07T00:00:00.000Z</com:LastChangedDate>
</ChangeHistory>
<Name>Aber</Name>
</Station>​
The Code I am using to try to extract the com/...xml/station / ChangedBy element is below
tree = ET.parse(rootfilepath + "NRE_Station_Dataset_2019_raw.xml")
root = tree.getroot()
#get at the tags and their data
#for elem in tree.iter():
# print(f"this the tag {elem.tag} and this is the data: {elem.text}")
#open file for writing
station_data = open(rootfilepath + 'station_data.csv','w')
csvwriter = csv.writer(station_data)
station_head = []
count = 0
#inspiration for this code: http://blog.appliedinformaticsinc.com/how-to-parse-and-convert-xml-to-csv-using-python/
#this is where it goes wrong; some combination of the namespace and the tag can't find anything in line 27, 'StationList'
for member in root.findall('{http://nationalrail.co.uk/xml/station}Station'):
station = []
if count == 0:
changedby = member.find('{http://nationalrail.co.uk/xml/common}ChangedBy').tag
station_head.append(changedby)
name = member.find('{http://nationalrail.co.uk/xml/station}Name').tag
station_head.append(name)
count = count+1
changedby = member.find('{http://nationalrail.co.uk/xml/common}ChangedBy').text
station.append(changedby)
name = member.find('{http://nationalrail.co.uk/xml/station}Name').text
station.append(name)
csvwriter.writerow(station)
I have tried:
using dictionaries of namespaces but that results in nothing being found at all
using hard-coded namespaces, but that results in "AttributeError: 'NoneType' object has no attribute 'tag'"
Thanks in advance for all and any assistance.
First of all, your XML is invalid (the closing </StationList> tag is missing at the end of the file).
Assuming you have valid XML file:
<?xml version="1.0" encoding="utf-8"?>
<StationList xmlns:xsd="http://www.w3.org/2001/XMLSchema"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xmlns:add="http://www.govtalk.gov.uk/people/AddressAndPersonalDetails"
xmlns:com="http://nationalrail.co.uk/xml/common" xsi:schemaLocation="http://internal.nationalrail.co.uk/xml/XsdSchemas/External/Version4.0/nre-station-v4-0.xsd"
xmlns="http://nationalrail.co.uk/xml/station">
<Station xsi:schemaLocation="http://internal.nationalrail.co.uk/xml/XsdSchemas/External/Version4.0/nre-station-v4-0.xsd">
<ChangeHistory>
<com:ChangedBy>spascos</com:ChangedBy>
<com:LastChangedDate>2018-11-07T00:00:00.000Z</com:LastChangedDate>
</ChangeHistory>
<Name>Aber</Name>
</Station>​
</StationList>
Then you can convert your XML to a Python dictionary (a JSON-like structure) and simply access the required value:
import xmltodict
with open('file.xml', 'r') as f:
data = xmltodict.parse(f.read())
changed_by = data['StationList']['Station']['ChangeHistory']['com:ChangedBy']
Output:
spascos
Try lxml:
#!/usr/bin/env python3
from lxml import etree
ns = {"com": "http://nationalrail.co.uk/xml/common"}
with open("so.xml") as f:
tree = etree.parse(f)
for t in tree.xpath("//com:ChangedBy/text()", namespaces=ns):
print(t)
Output:
spascos
You can use BeautifulSoup, which is an HTML and XML parser:
from bs4 import BeautifulSoup
fd = open(rootfilepath + "NRE_Station_Dataset_2019_raw.xml")
soup = BeautifulSoup(fd,'lxml-xml')
for i in soup.findAll('ChangeHistory'):
print(i.ChangedBy.text)

prettify adding extra lines in xml

I'm using a prettify function to make my XML file readable. I am adding some new info to an existing XML file, but when I save it to a file I get extra blank lines in between the lines. Is there a way of removing these lines? Below is the code I'm using:
import xml.etree.ElementTree as xml
import xml.dom.minidom as minidom
from lxml import etree
def prettify(elem):
rough_string = xml.tostring(elem, 'utf-8')
reparsed = minidom.parseString(rough_string)
return reparsed.toprettyxml(indent="\t")
cid = "[123,123,123,123,123]"
doc = xml.parse('test.xml')
root = doc.getroot()
root.getchildren().index(root.find('card'))
e = xml.Element('card')
e.set('id', cid)
n = xml.SubElement(e, "name")
n.text = "FOLDER"
r = xml.SubElement(e, "red")
r.text = "FILE.AVI"
g = xml.SubElement(e, "green")
g.text = "FILE.AVI"
b = xml.SubElement(e, "blue")
b.text = "FILE.AVI"
root.insert(0, e)
doc2 = prettify(root)
with open("testnew.xml", "w") as f:
f.write(doc2)
Below is what I get in the file:
<data>
<card id="[123,123,123,123,123]">
<name>FOLDER</name>
<red>FILE.AVI</red>
<green>FILE.AVI</green>
<blue>FILE.AVI</blue>
</card>
<card id="[000,000,000,000,000]">
<name>Colours</name>
<red>/media/usb/cow.avi</red>
<green>/media/usb/pig.avi</green>
<blue>/media/usb/cat.avi</blue>
</card>
</data>
input file "test.xml" looks like
<data>
<card id="[000,000,000,000,000]">
<name>Colours</name>
<red>/media/usb/cow.avi</red>
<green>/media/usb/pig.avi</green>
<blue>/media/usb/cat.avi</blue>
</card>
</data>
The new content added is being printed fine. Removing any "prettification" of the existing text solves the issue
Add
for elem in root.iter('*'):
if elem == e:
print "Added XML node does not need to be stripped"
continue
if elem.text is not None:
elem.text = elem.text.strip()
if elem.tail is not None:
elem.tail = elem.tail.strip()
before calling
doc2 = prettify(root)
Related answer: Python how to strip white-spaces from xml text nodes

Getting subelements using lxml and iterparse

I am trying to write a parsing algorithm to efficiently pull data from an XML document. I am currently walking through the document based on elements and children, but would like to use iterparse instead. One issue is that I have a list of element names and, whenever one of them is found, I want to pull the child data from it; however, it seems that with iterparse my only options are to filter on a single element name or to get every single element.
Example xml:
<?xml version="1.0" encoding="UTF-8"?>
<data_object xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
<source id="0">
<name>Office Issues</name>
<datetime>2012-01-13T16:09:15</datetime>
<data_id>7</data_id>
</source>
<event id="125">
<date>2012-11-06</date>
<state_id>7</state_id>
</event>
<state id="7">
<name>Washington</name>
</state>
<locality id="2">
<name>Olympia</name>
<state_id>7</state_id>
<type>City</type>
</locality>
<locality id="3">
<name>Town</name>
<state_id>7</state_id>
<type>Town</type>
</locality>
</data_object>
Code example:
from lxml import etree
fname = "test.xml"
ELEMENT_LIST = ["source", "event", "state", "locality"]
with open(fname) as xml_doc:
context = etree.iterparse(xml_doc, events=("start", "end"))
context = iter(context)
event, root = context.next()
base = False
b_name = ""
for event, elem in context:
if event == "start" and elem.tag in ELEMENT_LIST:
base = True
bname = elem.tag
children = elem.getchildren()
child_list = []
for child in children:
child_list.append(child.tag)
print bname + ":" + str(child_list)
elif event == "end" and elem.tag in ELEMENT_LIST:
base = False
root.clear()
With iterparse you cannot limit parsing to several types of tags; you can do this only for a single tag (by passing the tag argument). However, it is easy to do manually what you would like to achieve. In the following snippet:
from lxml import etree
fname = "test.xml"
ELEMENT_LIST = ["source", "event", "state", "locality"]
with open(fname) as xml_doc:
context = etree.iterparse(xml_doc, events=("start", "end"))
for event, elem in context:
if event == "start" and elem.tag in ELEMENT_LIST:
print "this elem is interesting, do some processing: %s: [%s]" % (elem.tag, ", ".join(child.tag for child in elem))
elem.clear()
you limit your search to the interesting tags only. An important part of using iterparse is the elem.clear() call, which frees memory once an item is no longer needed. That is why it is memory efficient; see http://lxml.de/parsing.html#modifying-the-tree
I would use XPath instead. It's much more elegant than walking the document on your own and, I assume, more efficient.
Use tag='{http://www.sitemaps.org/schemas/sitemap/0.9}url'
A similar question with the correct answer: https://stackoverflow.com/a/7019273/1346222
#!/usr/bin/python
# coding: utf-8
""" Parsing xml file. Basic example """
from StringIO import StringIO
from lxml import etree
import urllib2
sitemap = urllib2.urlopen(
'http://google.com/sitemap.xml',
timeout=10
).read()
NS = {
'x': 'http://www.sitemaps.org/schemas/sitemap/0.9',
'x2': 'http://www.google.com/schemas/sitemap-mobile/1.0'
}
res = []
urls = etree.iterparse(StringIO(sitemap), tag='{http://www.sitemaps.org/schemas/sitemap/0.9}url')
for event, url in urls:
t = []
t = url.xpath('.//x:loc/text() | .//x:priority/text()', namespaces=NS)
t.append(url.xpath('boolean(.//x2:mobile)', namespaces=NS))
res.append(t)

Categories