element attributes missing when parsing XML with iterparse/lxml/python 2

element attributes missing when parsing XML with iterparse/lxml/python 2 - python

Here's my use case:
I have a potentially large XML file, and I want to output the frequency of all the unique structural variations of a given element type. Element attributes should be included as part of the uniqueness test. The output should sort the variations by frequency.
Here's a trivial input example, with 4 entries for automobile:
<automobile>
<mileage>20192</mileage>
<year>2005</year>
<user_defined name="color">red</user_defined>
</automobile>
<automobile>
<mileage>1098</mileage>
<year>2018</year>
<user_defined name="color">blue</user_defined>
</automobile>
<automobile>
<mileage>17964</mileage>
<year>2012</year>
<user_defined name="title_status">salvage</user_defined>
</automobile>
<automobile>
<mileage>198026</mileage>
<year>1990</year>
</automobile>
The output I expect would look like this:
<automobile automobile_frequency="2">
<mileage />
<year />
<user_defined name="color" />
</automobile>
<automobile automobile_frequency="1">
<mileage />
<year />
<user_defined name="title_status" />
</automobile>
<automobile automobile_frequency="1">
<mileage />
<year />
</automobile>
I've implemented the code using iterparse, but when it's processing the elements, the attributes do not exist in the element. The code logic appears to be correct, but attributes simply don't exist; they are not written in the output, and they are not present for the uniqueness test. Per the above input example, this is what I get on output:
<root>
<automobile automobile_frequency="3">
<mileage/>
<year/>
<user_defined/>
</automobile>
<automobile automobile_frequency="1">
<mileage/>
<year/>
</automobile>
</root>
The usage is:
xplore.py input.xml node_to_explore
In the above example, I used:
xplore.py trivial.xml automobile
Here's the source:
from lxml import etree
import sys
import re
from datetime import datetime
# global node signature map
structure_map = {}
# global code frequency map
frequency_map = {}
# output tree
tmp_root = etree.Element("tmp_root")
def process_element(el):
    """Count one ``target`` element under its structural signature.

    Elements whose tag differs from the module-level ``target`` are ignored.
    Otherwise the element is reduced to a skeleton by ``get_structure`` and
    the pretty-printed serialization of that skeleton is used as the key into
    the module-level ``structure_map`` (key -> skeleton) and ``frequency_map``
    (key -> number of occurrences).

    Args:
        el: the element delivered by the parser.
    """
    # `target` is only read and the maps are only mutated in place, so no
    # `global` declarations are required.
    if el.tag != target:
        return
    structure = get_structure(el)
    structure_key = etree.tostring(structure, pretty_print=True)
    # Test membership on the dict itself: in Python 2 `.keys()` builds a new
    # list on every call, which turns each lookup into a linear scan.
    if structure_key not in structure_map:
        # first occurrence of this shape: remember the skeleton ...
        structure_map[structure_key] = structure
        # ... attach it to the scratch output tree ...
        tmp_root.append(structure)
        # ... and start counting
        frequency_map[structure_key] = 1
    else:
        frequency_map[structure_key] += 1
def get_structure(el):
    """Return a new element mirroring the structure of *el*.

    The result has the same tag and the same attributes as *el* and,
    recursively, a structural copy of every child in document order.  Text
    and tail content are not copied.  The caller serializes the returned
    element to obtain a signature that identifies the element's shape,
    attributes included.

    Args:
        el: the element to mirror; it is not modified.

    Returns:
        A freshly created element that shares no nodes with *el*.
    """
    ret = etree.Element(el.tag)
    # copy attributes
    for attribute_key, attribute_value in el.attrib.items():
        ret.set(attribute_key, attribute_value)
    # mirror children recursively
    for child in el:
        ret.append(get_structure(child))
    return ret
if len(sys.argv) < 3:
print "Must specify an XML file for processing, as well as an element type!"
exit(0)
# Get XML file
xml = sys.argv[1]
# Get output file name
output_file = xml[0:xml.rindex(".")]+".txt"
# get target element type to evaluate
target = sys.argv[2]
# mark start
startTime = datetime.now()
# Parse XML
print '==========================='
print 'Parsing XML'
print '==========================='
context = etree.iterparse(xml, events=('end',))
for event, element in context:
process_element(element)
element.clear()
# create tree sorted by frequency
ranked = sorted(frequency_map.items(), key=lambda x: x[1], reverse=True)
root = etree.Element("root")
for item in ranked:
structure = structure_map[item[0]]
structure.set(target+"_frequency", str(item[1]))
root.append(structure)
# pretty print root
out = open(output_file, 'w')
out.write(etree.tostring(root, pretty_print=True))
# output run time
time = datetime.now() - startTime
reg3 = re.compile("\\d+:\\d(\\d:\\d+\\.\\d{4})")
time = re.search(reg3, unicode(time))
time = "Runtime: %ss" % (time.group(1).encode("utf-8"))
print(time)
In the debugger, I can clearly see that the attributes are missing from elements in the calls to get_structure. Can anyone tell me why this is the case?

The data:
<root>
<automobile>
<mileage>20192</mileage>
<year>2005</year>
<user_defined name="color">red</user_defined>
</automobile>
<automobile>
<mileage>1098</mileage>
<year>2018</year>
<user_defined name="color">blue</user_defined>
</automobile>
<automobile>
<mileage>17964</mileage>
<year>2012</year>
<user_defined name="title_status">salvage</user_defined>
</automobile>
<automobile>
<mileage>198026</mileage>
<year>1990</year>
</automobile>
</root>
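The code (in the original, element.clear() ran on every 'end' event; because children end before their parent, each child had already lost its attributes by the time the automobile element was processed. Here the clearing is deferred until after the target element has been processed):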
from lxml import etree
import sys
import re
from datetime import datetime
# global node signature map
structure_map = {}
# global code frequency map
frequency_map = {}
# output tree
tmp_root = etree.Element("tmp_root")
def process_element(el):
    """Count one element under its structural signature.

    The element is reduced to a skeleton by ``get_structure`` and the
    pretty-printed serialization of that skeleton is used as the key into the
    module-level ``structure_map`` (key -> skeleton) and ``frequency_map``
    (key -> number of occurrences).

    Args:
        el: the element whose shape is to be counted.
    """
    structure = get_structure(el)
    structure_key = etree.tostring(structure, pretty_print=True)
    # The maps are only mutated in place, so no `global` declarations are
    # required.  Test membership on the dict itself: in Python 2 `.keys()`
    # builds a new list on every call, making each lookup a linear scan.
    if structure_key not in structure_map:
        # first occurrence of this shape: remember the skeleton ...
        structure_map[structure_key] = structure
        # ... attach it to the scratch output tree ...
        tmp_root.append(structure)
        # ... and start counting
        frequency_map[structure_key] = 1
    else:
        frequency_map[structure_key] += 1
def get_structure(el):
    """Return a new element mirroring the structure of *el*.

    The result has the same tag and the same attributes as *el* and,
    recursively, a structural copy of every child in document order.  Text
    and tail content are not copied.  The caller serializes the returned
    element to obtain a signature that identifies the element's shape,
    attributes included.

    Args:
        el: the element to mirror; it is not modified.

    Returns:
        A freshly created element that shares no nodes with *el*.
    """
    ret = etree.Element(el.tag)
    # copy attributes
    for attribute_key, attribute_value in el.attrib.items():
        ret.set(attribute_key, attribute_value)
    # mirror children recursively
    for child in el:
        ret.append(get_structure(child))
    return ret
if len(sys.argv) < 3:
print "Must specify an XML file for processing, as well as an element type!"
exit(0)
# Get XML file
xml = sys.argv[1]
# Get output file name
output_file = xml[0:xml.rindex(".")]+".txt"
# get target element type to evaluate
target = sys.argv[2]
# mark start
startTime = datetime.now()
# Parse XML
print '==========================='
print 'Parsing XML'
print '==========================='
context = etree.iterparse(xml, events=('end',))
element_to_clear = []
for event, element in context:
element_to_clear.append(element)
global target
if element.tag == target:
process_element(element)
for ele in element_to_clear:
ele.clear()
element_to_clear = []
# create tree sorted by frequency
ranked = sorted(frequency_map.items(), key=lambda x: x[1], reverse=True)
root = etree.Element("root")
for item in ranked:
structure = structure_map[item[0]]
structure.set(target+"_frequency", str(item[1]))
root.append(structure)
# pretty print root
out = open(output_file, 'w')
out.write(etree.tostring(root, pretty_print=True))
# output run time
time = datetime.now() - startTime
reg3 = re.compile("\\d+:\\d(\\d:\\d+\\.\\d{4})")
time = re.search(reg3, unicode(time))
time = "Runtime: %ss" % (time.group(1).encode("utf-8"))
print(time)
The command: xplore.py trivial.xml automobile

Related

Python ElementTree adding a child

I have an xml file which looks like this:
<keyboard>
</keyboard>
I want to make it look like this:
<keyboard>
<keybind key="W-c-a"><action name="Execute"><command>sudo shutdown now</command></action></keybind>
</keyboard>
I have a function to add this which has parameters that will change the key and the command. Is this possible to do? If yes, how can I do this?
(The function):
def add_keybinding(self, keys, whatToExec):
keybinding = "<keybind key=\"%s\"><action name=\"Execute\"><command>%s</command><action></keybind>" % (keys, whatToExec)
f = open("/etc/xdg/openbox/rc.xml", "a")
try:
# I want to append the keybinding variable to the <keyboard>
except IOError as e:
print(e)

From the doc, you can try the following:
def add_keybinding(keys, whatToExec, filename):
    """Append an "Execute" key binding to the root element of an XML file.

    Builds ``<keybind key=KEYS><action name="Execute"><command>CMD</command>
    </action></keybind>``, appends it to the root element of *filename* and
    writes the document back to the same file.

    Args:
        keys: value for the ``key`` attribute of the new ``<keybind>``.
        whatToExec: text placed inside the ``<command>`` element.
        filename: path of the XML file to read and overwrite.
    """
    # build the new <keybind> subtree
    keybind = ET.Element('keybind')
    keybind.set("key", keys)
    action = ET.SubElement(keybind, 'action')
    action.set("name", "Execute")
    command = ET.SubElement(action, 'command')
    command.text = whatToExec
    # attach it to the existing document and save in place
    tree = ET.parse(filename)
    tree.getroot().append(keybind)
    tree.write(filename)
Explanation:
Create the keybind tag using xml.etree.ElementTree.Element : keybind = ET.Element('keybind')
Set a property using set: keybind.set("key", keys)
Create the action tag as a sub element of keybind using
xml.etree.ElementTree.SubElement: action = ET.SubElement(keybind, 'action')
Set the property as at step 2: action.set("name", "Execute")
Create the command tag as a sub element of action: command = ET.SubElement(action, 'command')
Set command tag content using .text: command.text = whatToExec
Read file using xml.etree.ElementTree.parse: tree = ET.parse(filename)
Append keybind tag to the doc root element using append*
Export new xml to file using write
Full example:
import xml.etree.ElementTree as ET
from xml.dom import minidom
def add_keybinding(keys, whatToExec, filename):
    """Append an "Execute" key binding to an XML file and return the tree.

    Builds ``<keybind key=KEYS><action name="Execute"><command>CMD</command>
    </action></keybind>``, appends it to the root element of *filename* and
    writes the document back to the same file.

    Args:
        keys: value for the ``key`` attribute of the new ``<keybind>``.
        whatToExec: text placed inside the ``<command>`` element.
        filename: path of the XML file to read and overwrite.

    Returns:
        The parsed ``ElementTree``, including the newly appended element.
    """
    # build the new <keybind> subtree
    keybind = ET.Element('keybind')
    keybind.set("key", keys)
    action = ET.SubElement(keybind, 'action')
    action.set("name", "Execute")
    command = ET.SubElement(action, 'command')
    command.text = whatToExec
    # attach it to the existing document and save in place
    tree = ET.parse(filename)
    tree.getroot().append(keybind)
    tree.write(filename)
    return tree
def prettify(elem):
    """Return *elem* serialized as an indented XML document string.

    The element is serialized to UTF-8 with ElementTree and re-parsed with
    minidom, whose ``toprettyxml`` produces the indented text (including an
    XML declaration).

    Args:
        elem: the ElementTree element to render.
    """
    rough_string = ET.tostring(elem, 'utf-8')
    return minidom.parseString(rough_string).toprettyxml(indent=" ")
filename = "test.xml"
for i in range(3):
tree = add_keybinding(str(i), "whatToExec " + str(i), filename)
print(prettify(tree.getroot()))
Output:
<?xml version="1.0" ?>
<keyboard>
<keybind key="0">
<action name="Execute">
<command>whatToExec 0</command>
</action>
</keybind>
<keybind key="1">
<action name="Execute">
<command>whatToExec 1</command>
</action>
</keybind>
<keybind key="2">
<action name="Execute">
<command>whatToExec 2</command>
</action>
</keybind>
</keyboard>

How convert xml to csv file using python (in row)?

I want to convert this XML document to CSV. I tried, but it does not work, and I do not know what I'm doing wrong. I'm new to this.
Here is the XML that I want to convert:
<?xml version="1.0" encoding="UTF-8"?>
<Shot
Shotcode = "30AA"
ShotDate = "4/2/2000">
<Images>
<Image
ImageNumber="103"
RawFileName="18_Shot_30AA.jpg" />
<Image
ImageNumber="104"
RawFileName="17_Shot_30AA.jpg" />
<Image
ImageNumber="105"
RawFileName="14_Shot_30AA" />
</Images>
<Metrics>
<Metric
Name = "30AA"
TypeId = "163"
Value = "0" />
<Metric
Name = "Area"
TypeId = "10"
Value = "63" />
</Metrics>
</Shot>
I wrote the code below as an example; it is not the complete program, but it shows what I'm doing.
import xml.etree.ElementTree as ET
import csv
tree = ET.parse("30AA.xml")
root = tree.getroot()
30AA = open('30AA.csv', 'w+')
csvwriter = csv.writer(30AA)
head = []
count = 0 #loops
for member in root.findall('Shot'):
Shot = []
if count == 0:
ShotCode = member.find('ShotCode').tag
head.append(ShotCode)
ShotDate = member.find('ShotDate').tag
head.append(ShotDate)
csvwriter.writerow(head)
count = count + 1
ShotCode = member.find('ShotCode').txt
Shot.append(ShotCode)
ShotDate = member.find('ShotDate').txt
Shot.append(ShotDate)
30AA.close()
the result that i expect is
Shotcode 30AA
ShotDate 4/2/2000
Imagen 103
Imagen 104
Imagen 105
Name TypeId Value
30AA 163 0
area 10 63

Okay, I think I see what's going wrong: the major problem is mostly in reading the XML; it just looks like it's a CSV problem.
The root of your XML is a Shot tag, so you can't use root.findall('Shot') to get all the tags: root is already the Shot element, and it doesn't have any Shot elements inside it.
So that's why you're not getting anything in your output.
Also, when you want to get the attributes of a tag, you use .attrib['name_of_attribute']; so, for example, instead of member.find('ShotCode').tag it should be root.attrib['Shotcode'] (the attribute is spelled Shotcode in your XML).
That changes the rest of the script quite a bit but you then need to do something like this:
root = tree.getroot()
_30AA = open('30AA.csv', 'w+')
csvwriter = csv.writer(_30AA)
head = []
ShotCode = root.attrib['Shotcode']
csvwriter.writerow(['ShotCode', ShotCode])
head.append(ShotCode)
ShotDate = root.attrib['ShotDate']
csvwriter.writerow(['ShotDate', ShotDate])
# member is going to be the <Images> and <Metrics>
for member in root.getchildren():
submembers = member.getchildren()
# Write the names of the attributes as headings
keys = submembers[0].attrib.keys()
csvwriter.writerow(keys)
for submember in submembers:
row_data = [submember.attrib[k] for k in keys]
csvwriter.writerow(row_data )
_30AA.close()
Will give you what you want

Python ElementTree: replacing elements in a loop

I'm trying to create a script that loops creating an xml file, with incrementing values for two elements. (an IP address using netaddr, and the tag/member element that increments, tag01 - tag10)
from netaddr import IPNetwork
import xml.dom.minidom
import lxml.etree as etree
import xml.etree.cElementTree as ET
ip = IPNetwork('10.10.10.0/24')
count = 1
tag = range(1,10)
uid = ET.Element("message")
type = ET.SubElement(uid, "type").text = "update"
payload = ET.SubElement(uid, "payload")
register = ET.SubElement(payload, "register")
entry = ET.SubElement(register, "entry", ip="11.11.11.11")
tag = ET.SubElement(entry, "tag")
ET.SubElement(tag, "member").text = "tag1"
tree = ET.ElementTree(uid)
while count <= 10:
elemtag = tree.findall(".//member")
for elemt in elemtag:
elemt.text = 'tag{}'.format(tag)
elemip = tree.findall(".//entry")
for elemi in elemip:
elemi.text = 'ip="{}"'.format(ip)
count += 1
ET.dump(uid)
print(count)
#tree.write("tmp.xml")
#x = etree.parse("tmp.xml")
#print etree.tostring(x, pretty_print=True)
#etree.parse("tmp.xml").write("pretty.xml", encoding="utf-8", pretty_print=True)
#os.system('tool.py -f pretty.xml')
I figured out how to create the xml I needed using ElementTree, and if I comment out my loop and write the resulting xml, it's correct, looks good, and works with the tool that consumes it (yay!).
<message>
<type>update</type>
<payload>
<register>
<entry ip="11.11.11.11">
<tag>
<member>tag1</member>
</tag>
</entry>
</register>
</payload>
</message>
However, when I add my loop to replace the values for the two elements, I just can't seem to get it right, and I'm clobbering the tags/elements in the tree.
<message>
<type>update</type>
<payload>
<register>
<entry ip="11.11.11.11">ip="10.10.10.0/24"<tag><member>tag<Element 'tag' at
0x7f7b29d66c90></member></tag></entry>
</register>
</payload>
</message>
I keep trying different things to replace the elements, but they just end up as different permutations of wrong, and I just can't seem to get it right. Hoping someone can help me figure out what I'm missing. Thanks in advance!

< Entry > it's an element and "ip" it's an attribute of that element.
We can change the attribute for the element in the following way:
tree.find('.//entry').attrib['ip'] = "22.22.22.22"
"tag1" it's .text inside of the < member > element but we reassigned the tag variable at this line:
tag = ET.SubElement(entry, "tag")
As a result, the tag Element object itself, rather than a tag number, was formatted into the member element's text.
I updated your code. This solution now prints a new message XML document on each iteration of the "count" loop. I hope it helps.
from netaddr import IPNetwork
import xml.etree.cElementTree as ET
ip = IPNetwork('10.10.10.0/24')
tag_lst = list(range(1, 11))
count = 1
uid = ET.Element("message")
type = ET.SubElement(uid, "type").text = "update"
payload = ET.SubElement(uid, "payload")
register = ET.SubElement(payload, "register")
entry = ET.SubElement(register, "entry", ip="11.11.11.11")
tag = ET.SubElement(entry, "tag")
ET.SubElement(tag, "member").text = "tag1"
tree = ET.ElementTree(uid)
while count <= 10:
tree.find('.//member').text = "tag"+str(count)
tree.find('.//entry').attrib['ip'] = format(ip[count])
count += 1
ET.dump(uid)
print(count)
XML output is:
<message><type>update</type><payload><register><entry ip="10.10.10.1"><tag><member>tag1</member></tag></entry></register></payload></message>
<message><type>update</type><payload><register><entry ip="10.10.10.2"><tag><member>tag2</member></tag></entry></register></payload></message>
<message><type>update</type><payload><register><entry ip="10.10.10.3"><tag><member>tag3</member></tag></entry></register></payload></message>
<message><type>update</type><payload><register><entry ip="10.10.10.4"><tag><member>tag4</member></tag></entry></register></payload></message>
<message><type>update</type><payload><register><entry ip="10.10.10.5"><tag><member>tag5</member></tag></entry></register></payload></message>
<message><type>update</type><payload><register><entry ip="10.10.10.6"><tag><member>tag6</member></tag></entry></register></payload></message>
<message><type>update</type><payload><register><entry ip="10.10.10.7"><tag><member>tag7</member></tag></entry></register></payload></message>
<message><type>update</type><payload><register><entry ip="10.10.10.8"><tag><member>tag8</member></tag></entry></register></payload></message>
<message><type>update</type><payload><register><entry ip="10.10.10.9"><tag><member>tag9</member></tag></entry></register></payload></message>
<message><type>update</type><payload><register><entry ip="10.10.10.10"><tag><member>tag10</member></tag></entry></register></payload></message>

Opening file by file in a folder

I'm new at programming with Python, but I've been given the task of writing a script that writes down all IDs where type=0 or type=1 occurs. The input is an XML file that looks like this example:
<root>
<bla1 type="0" id = "1001" pvalue:="djdjd"/>
<bla2 type="0" id = "1002" pvalue:="djdjd" />
<bla3 type="0" id = "1003" pvalue:="djdjd"/>
<bla4 type="0" id = "1004" pvalue:="djdjd"/>
<bla5 type="0" id = "1005" pvalue:="djdjd"/>
<bla6 type="1" id = "1006" pvalue:="djdjd"/>
<bla7 type="0" id = "1007" pvalue:="djdjd"/>
<bla8 type="0" id = "1008" pvalue:="djdjd"/>
<bla9 type="1" id = "1009" pvalue:="djdjd"/>
<bla10 type="0" id = "1010" pvalue:="djdjd"/>
<bla11 type="0" id = "1011" pvalue:="djdjd"/>
<bla12 type="0" id = "1009" pvalue:="djdjd"/>
<root>
So the first thing the code does is replace ':=' with '=', because the ':=' causes errors when the XML is loaded. It then writes down the IDs where the type is 0 and the IDs where the type is 1. This works perfectly for one XML file. Unfortunately, I have more than just one file, and I need something like a loop that opens the next XML file (the files have different names) in the folder and adds the newly found IDs to the IDs found in the previous XML files. So basically, it should always add the IDs found in each new XML file.
import xml.etree.cElementTree as ET # required import
XmlFile = 'ID3.xml' # insert here the name of the XML-file, which needs to be inside the same folder as the .py file
my_file = open('%s' % XmlFile, "r+") # open the XML-file
Xml2String = my_file.readlines() # convert the file into a list strings
XmlFile_new = [] # new list, which is filled with the modified strings
L = len(Xml2String) # length of the string-list
for i in range(1, L): # Increment starts at 0, therefore, the first line is ignored
if ':=' in Xml2String[i]:
XmlFile_new.append(Xml2String[i].replace(':=', '=')) # get rid of colon
else:
XmlFile_new.append(Xml2String[i])
tree = ET.ElementTree(XmlFile_new)
root = tree.getroot()
id_0 = [] # list for id="0"
id_1 = [] # list for id="1"
id_one2zero = [] # list for ids, that occur twice
for i in range(len(root)):
if 'type="0"' in root[i]: # check for type
a = root[i].index("id") + 5 # search index of id
b = a+6
id_0.append((root[i][a:b])) # the id is set via index slicing
elif 'type="1"' in root[i]: # check for type
a = root[i].index("id") + 5
b = a+6
id_1.append((root[i][a:b]))
else:
print("Unknown type occurred") # If there's a line without type="0" or type="1", this message gets printed
# (Remember: first line of the xml-file is ignored)
for i in range(len(id_0)): # check for ids, that occur twice
for j in range(len(id_1)):
if id_0[i] == id_1[j]:
id_one2zero.append(id_0[i])
print(id_0)
print(id_1)
f = open('write.xml','w')
print >>f, 'whatever'
print('<end>')

An easy way to solve this is using the os.walk() function. With it you can open all files in one directory or even recursively.
Here is an example how to use it:
for root, dirs, files in os.walk("your/path"):
for file in files:
# process your file
If you also have other files than xml-files in your directory you can make sure you only process xml-files with the file.endswith(".xml").

get value from variable which contains xml

I have the following Jenkins API script:
import jenkins
import json
import re
server = jenkins.Jenkins('https://jenkins_url', username, password)
nodes = json.dumps(server.get_nodes())
nodes = re.sub('\"offline\"|[:{} \[\]]|true,|false,|\"name\"|\"','',nodes).split(',')
for label in nodes:
if label != 'master':
print label
node_config = server.get_node_config(label)
print node_config
node_config contains, for example, the following XML text:
<?xml version="1.0" encoding="UTF-8"?>
<slave>
<name>test.server</name>
<description></description>
<remoteFS>/var/lib/jenkins</remoteFS>
<numExecutors>1</numExecutors>
<mode>EXCLUSIVE</mode>
<retentionStrategy class="hudson.slaves.RetentionStrategy$Always"/>
<launcher class="hudson.plugins.sshslaves.SSHLauncher" plugin="ssh-slaves#1.10">
<host>test.server</host>
<port>7777</port>
<credentialsId>d0970a8f-d124</credentialsId>
<maxNumRetries>0</maxNumRetries>
<retryWaitTime>0</retryWaitTime>
</launcher>
<label>BuildServer</label>
<nodeProperties/>
<userId>test</userId>
</slave>
I want to get the value of each tag, so that the output contains e.g. test.server, etc.
Could you please help me with it?

xml_text = """<?xml version="1.0" encoding="UTF-8"?>
<slave>
<name>test.server</name>
<description></description>
<remoteFS>/var/lib/jenkins</remoteFS>
<numExecutors>1</numExecutors>
<mode>EXCLUSIVE</mode>
<retentionStrategy class="hudson.slaves.RetentionStrategy$Always"/>
<launcher class="hudson.plugins.sshslaves.SSHLauncher" plugin="ssh-slaves#1.10">
<host>test.server</host>
<port>7777</port>
<credentialsId>d0970a8f-d124</credentialsId>
<maxNumRetries>0</maxNumRetries>
<retryWaitTime>0</retryWaitTime>
</launcher>
<label>BuildServer</label>
<nodeProperties/>
<userId>test</userId>
</slave>
"""
import xml.etree.ElementTree
root = xml.etree.ElementTree.fromstring(xml_text)
# show only a particular tag
for name in root.findall('name'):
print(name.text)
# show all children at first level
for child in root:
print('%s: %s' % (child.tag, child.text))
# build a dict (will only get last of any duplicate tags, and no children)
slave = {child.tag: child.text for child in root}
# build a dict (will only get last of any duplicate tags)
def xml_todict(xml_node):
    """Convert an element into nested dicts keyed by tag and attribute name.

    Each child is stored under its tag (a later sibling with the same tag
    replaces an earlier one); the element's attributes are then merged in and
    replace a child entry of the same name.  An element with neither children
    nor attributes collapses to its text (``None`` when it has none).
    Otherwise, non-blank text is stored under ``'text'`` unless that key is
    already taken by a child or attribute.

    Args:
        xml_node: the element to convert.

    Returns:
        A dict for elements with children or attributes, else the text.
    """
    dict_ = {child.tag: xml_todict(child) for child in xml_node}
    dict_.update(xml_node.attrib)
    if not dict_:
        return xml_node.text
    # The text of a container element is usually just the newline/indentation
    # before its first child; that is layout, not data, so skip it.
    if xml_node.text and xml_node.text.strip() and 'text' not in dict_:
        dict_['text'] = xml_node.text
    return dict_
slave = xml_todict(root)

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

element attributes missing when parsing XML with iterparse/lxml/python 2 - python

Related

Python ElementTree adding a child

How convert xml to csv file using python (in row)?

Python ElementTree: replacing elements in a loop

Opening file by file in a folder

get value from variable which contains xml

Categories

Resources