How convert xml to csv file using python (in row)? - python

I want to convert this XML document to CSV. I tried, but it does not work, and I do not know what I'm doing wrong. I'm new to this.
There is the xml that i want to convert
<?xml version="1.0" encoding="UTF-8"?>
<Shot
Shotcode = "30AA"
ShotDate = "4/2/2000">
<Images>
<Image
ImageNumber="103"
RawFileName="18_Shot_30AA.jpg" />
<Image
ImageNumber="104"
RawFileName="17_Shot_30AA.jpg" />
<Image
ImageNumber="105"
RawFileName="14_Shot_30AA" />
</Images>
<Metrics>
<Metric
Name = "30AA"
TypeId = "163"
Value = "0" />
<Metric
Name = "Area"
TypeId = "10"
Value = "63" />
</Metrics>
</Shot>
I wrote the code below by following an example; it is not the complete program, but it shows what I'm doing.
import xml.etree.ElementTree as ET
import csv
tree = ET.parse("30AA.xml")
root = tree.getroot()
30AA = open('30AA.csv', 'w+')
csvwriter = csv.writer(30AA)
head = []
count = 0 #loops
for member in root.findall('Shot'):
Shot = []
if count == 0:
ShotCode = member.find('ShotCode').tag
head.append(ShotCode)
ShotDate = member.find('ShotDate').tag
head.append(ShotDate)
csvwriter.writerow(head)
count = count + 1
ShotCode = member.find('ShotCode').txt
Shot.append(ShotCode)
ShotDate = member.find('ShotDate').txt
Shot.append(ShotDate)
30AA.close()
the result that i expect is
Shotcode 30AA
ShotDate 4/2/2000
Imagen 103
Imagen 104
Imagen 105
Name TypeId Value
30AA 163 0
area 10 63

Okay, I think I see what's going wrong. The major problem is in reading the XML, even though it looks like a CSV problem.
The root of your XML is the Shot tag itself, so you can't use root.findall('Shot') to get the tags: root is already the Shot element, and it doesn't have any Shot elements inside it.
That's why you're not getting anything in your output.
Also, when you want to get an attribute of a tag you use .attrib['name_of_attribute']; so, for example, member.find('ShotCode').tag should instead be root.attrib['Shotcode'] (attribute names are case-sensitive, and the XML spells it Shotcode).
That changes the rest of the script quite a bit but you then need to do something like this:
# Assumes `csv` is imported and `tree` is the ElementTree parsed earlier
# with ET.parse("30AA.xml").
root = tree.getroot()

# newline='' is what the csv module expects for files it writes to; without
# it, extra blank rows can appear on platforms that translate line endings.
# The `with` block also guarantees the file is closed if a row fails.
with open('30AA.csv', 'w', newline='') as csv_file:
    csvwriter = csv.writer(csv_file)

    # Shotcode and ShotDate are attributes of the root <Shot> element.
    csvwriter.writerow(['ShotCode', root.attrib['Shotcode']])
    csvwriter.writerow(['ShotDate', root.attrib['ShotDate']])

    # member is going to be the <Images> and <Metrics>
    for member in root:
        # Element.getchildren() was removed in Python 3.9; list(element)
        # is the documented replacement.
        submembers = list(member)
        if not submembers:
            # An empty container has no attribute names to use as headings.
            continue

        # Write the names of the attributes as headings
        keys = list(submembers[0].attrib)
        csvwriter.writerow(keys)
        for submember in submembers:
            # .get() keeps the columns aligned if a sibling lacks an attribute.
            csvwriter.writerow([submember.attrib.get(k, '') for k in keys])
Will give you what you want

Related

How to add Subelements and its in xml

I have a xml file which has subelements:-
.......
.......
<EnabledFeatureListForUsers>
<FeatureEntitlementDetail>
<UserName>xyz#xyz.com</UserName>
<FeatureList>
<FeatureDetail>
<FeatureId>X</FeatureId>
</FeatureDetail>
</FeatureList>
</FeatureEntitlementDetail>
</EnabledFeatureListForUsers>
.....
.....
I want to add a new FeatureEntitlementDetail sub-element with all of its children (UserName, FeatureList, FeatureDetail, FeatureId). I tried using the SubElement function, but it only adds an empty <FeatureEntitlementDetail />. The code I used was:
import xml.etree.ElementTree as ET
filename = "XYZ.xml"
xmlTree = ET.parse(filename)
root = xmlTree.getroot()
for element in root.iter('EnabledFeatureListForUsers'):
ET.SubElement(element,"FeatureEntitlementDetail")
Any help is appreciated.
See below
import xml.etree.ElementTree as ET

# Sample document: one existing <FeatureEntitlementDetail> entry.
xml = """
<EnabledFeatureListForUsers>
<FeatureEntitlementDetail>
<UserName>xyz#xyz.com</UserName>
<FeatureList>
<FeatureDetail>
<FeatureId>X</FeatureId>
</FeatureDetail>
</FeatureList>
</FeatureEntitlementDetail>
</EnabledFeatureListForUsers>
"""
root = ET.fromstring(xml)

# SubElement only creates the one element it is asked for, so the nested
# children are built explicitly, top-down, each attached to its parent.
new_detail = ET.SubElement(root, 'FeatureEntitlementDetail')
ET.SubElement(new_detail, 'UserName').text = 'abc.zz.net'
feature_list = ET.SubElement(new_detail, 'FeatureList')
feature_detail = ET.SubElement(feature_list, 'FeatureDetail')
ET.SubElement(feature_detail, 'FeatureId').text = 'Z'

# Print the resulting tree to stdout.
ET.dump(root)
output
<?xml version="1.0" encoding="UTF-8"?>
<EnabledFeatureListForUsers>
<FeatureEntitlementDetail>
<UserName>xyz#xyz.com</UserName>
<FeatureList>
<FeatureDetail>
<FeatureId>X</FeatureId>
</FeatureDetail>
</FeatureList>
</FeatureEntitlementDetail>
<FeatureEntitlementDetail>
<UserName>abc.zz.net</UserName>
<FeatureList>
<FeatureDetail>
<FeatureId>Z</FeatureId>
</FeatureDetail>
</FeatureList>
</FeatureEntitlementDetail>
</EnabledFeatureListForUsers>

How to add new nodes into XML tree, reading from a list in Python?

I am trying to read from a list and add the values as new nodes into an XML in Python
list = ['163','164','165']
and after appending list values into the node trackingnumbers,
The xml should looks like this :
<?xml version="1.0" encoding="UTF-8"?>
<trackingrequest>
<user>TAIL</user>
<password>20</password>
<trackingnumbers>
<trackingnumber>163</trackingnumber>
<trackingnumber>164</trackingnumber>
<trackingnumber>165</trackingnumber>
</trackingnumbers>
</trackingrequest>
I have got it this far but i am stuck at creating dynamic variables inside a loop, which creates new nodes inside trackingnumbers
def GenerateXML():
root = ET.Element("trackingrequest")
m1 = ET.Element("user")
root.append(m1)
m1.text = 'TAIL'
m2 = ET.Element("password")
root.append(m2)
m2.text = '20'
m3 = ET.Element("trackingnumbers")
root.append(m3)
d = {}
for i in range(list):
d["trackingid_{0}".format(i)] = ET.SubElement(m3, "trackingnumber")
d['trackingid_1'].text = i*2
tree = ET.ElementTree(root)
The idea is to find trackingnumbers and add the required sub elements
import xml.etree.ElementTree as ET

lst = ['163', '164', '165']
xml = '''<?xml version="1.0" encoding="UTF-8"?>
<trackingrequest>
<user>TAIL</user>
<password>20</password>
<trackingnumbers>
</trackingnumbers>
</trackingrequest>'''
root = ET.fromstring(xml)

# Locate the container once; every new node is attached to it.
tracking_numbers = root.find('.//trackingnumbers')
if tracking_numbers is None:
    # Fail with a clear message instead of an opaque error inside SubElement.
    raise ValueError("document has no <trackingnumbers> element")

# One <trackingnumber> child per list entry. No dynamically named variables
# are needed, because SubElement already attaches the node to its parent.
for num in lst:
    tn = ET.SubElement(tracking_numbers, 'trackingnumber')
    tn.text = num

ET.dump(root)
output
<?xml version="1.0" encoding="UTF-8"?>
<trackingrequest>
<user>TAIL</user>
<password>20</password>
<trackingnumbers>
<trackingnumber>163</trackingnumber>
<trackingnumber>164</trackingnumber>
<trackingnumber>165</trackingnumber>
</trackingnumbers>
</trackingrequest>

How to get the content of specific grandchild from xml file through python

Hi I am very new to python programming. I have an xml file of structure:
<?xml version="1.0" encoding="UTF-8"?>
-<LidcReadMessage xsi:schemaLocation="http://www.nih.gov http://troll.rad.med.umich.edu/lidc/LidcReadMessage.xsd"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xmlns="http://www.nih.gov" uid="1.3.6.1.4.1.14519.5.2.1.6279.6001.1307390687803.0">
-<ResponseHeader>
<Version>1.8.1</Version>
<MessageId>-421198203</MessageId>
<DateRequest>2007-11-01</DateRequest>
<TimeRequest>12:30:44</TimeRequest>
<RequestingSite>removed</RequestingSite>
<ServicingSite>removed</ServicingSite>
<TaskDescription>Second unblinded read</TaskDescription>
<CtImageFile>removed</CtImageFile>
<SeriesInstanceUid>1.3.6.1.4.1.14519.5.2.1.6279.6001.179049373636438705059720603192</SeriesInstanceUid>
<DateService>2008-08-18</DateService>
<TimeService>02:05:51</TimeService>
<ResponseDescription>1 - Reading complete</ResponseDescription>
<StudyInstanceUID>1.3.6.1.4.1.14519.5.2.1.6279.6001.298806137288633453246975630178</StudyInstanceUID>
</ResponseHeader>
-<readingSession>
<annotationVersion>3.12</annotationVersion>
<servicingRadiologistID>540461523</servicingRadiologistID>
-<unblindedReadNodule>
<noduleID>Nodule 001</noduleID>
-<characteristics>
<subtlety>5</subtlety>
<internalStructure>1</internalStructure>
<calcification>6</calcification>
<sphericity>3</sphericity>
<margin>3</margin>
<lobulation>3</lobulation>
<spiculation>4</spiculation>
<texture>5</texture>
<malignancy>5</malignancy>
</characteristics>
-<roi>
<imageZposition>-125.000000 </imageZposition>
<imageSOP_UID>1.3.6.1.4.1.14519.5.2.1.6279.6001.110383487652933113465768208719</imageSOP_UID>
......
There are four <readingSession> elements, each of which contains multiple <roi> elements. Each <roi> contains an <imageSOP_UID>. I need to extract the information in <imageSOP_UID> from all of these headers.
Right now I am doing this:
import xml.etree.ElementTree as ET
tree = ET.parse('069.xml')
root = tree.getroot()
#lst = []
for readingsession in root.iter('readingSession'):
for roi in readingsession.findall('roi'):
id = roi.findtext('imageSOP_UID')
print(id)
but it outputs only this:
Process finished with exit code 0.
If anyone can help.
The real problem has been with the namespace. I tried with and without it, but it didn't work with the code above; the following code qualifies every tag with the namespace:
# Assumes `import pydicom` and `import xml.etree.ElementTree as ET` appear
# earlier in the script.

# The document declares xmlns="http://www.nih.gov", so every tag name that
# ElementTree reports is prefixed with this namespace and must be looked up
# with it.
NS = '{http://www.nih.gov}'

ds = pydicom.dcmread("000071.dcm")
uid = ds.SOPInstanceUID

tree = ET.parse("069.xml")
root = tree.getroot()
for child in root:
    print(child.tag)
    if child.tag == NS + 'readingSession':
        # Only the first <unblindedReadNodule> of each session is examined.
        read = child.find(NS + 'unblindedReadNodule')
        # Compare against None explicitly; an Element's truth value depends
        # on its child count, not on whether find() succeeded.
        if read is not None:
            nodule_id = read.find(NS + 'noduleID').text
            # Only the first <roi> of the nodule is examined.
            roi = read.find(NS + 'roi')
            xml_uid = roi.find(NS + 'imageSOP_UID').text
            if xml_uid == uid:
                print(xml_uid, "=", uid)
                print(roi)
This works completely fine: it gets a UID from a DICOM image of the LIDC/IDRI dataset and then finds the same UID in the XML file to get its region of interest.

Python ElementTree: replacing elements in a loop

I'm trying to create a script that loops creating an xml file, with incrementing values for two elements. (an IP address using netaddr, and the tag/member element that increments, tag01 - tag10)
from netaddr import IPNetwork
import xml.dom.minidom
import lxml.etree as etree
import xml.etree.cElementTree as ET
ip = IPNetwork('10.10.10.0/24')
count = 1
tag = range(1,10)
uid = ET.Element("message")
type = ET.SubElement(uid, "type").text = "update"
payload = ET.SubElement(uid, "payload")
register = ET.SubElement(payload, "register")
entry = ET.SubElement(register, "entry", ip="11.11.11.11")
tag = ET.SubElement(entry, "tag")
ET.SubElement(tag, "member").text = "tag1"
tree = ET.ElementTree(uid)
while count <= 10:
elemtag = tree.findall(".//member")
for elemt in elemtag:
elemt.text = 'tag{}'.format(tag)
elemip = tree.findall(".//entry")
for elemi in elemip:
elemi.text = 'ip="{}"'.format(ip)
count += 1
ET.dump(uid)
print(count)
#tree.write("tmp.xml")
#x = etree.parse("tmp.xml")
#print etree.tostring(x, pretty_print=True)
#etree.parse("tmp.xml").write("pretty.xml", encoding="utf-8", pretty_print=True)
#os.system('tool.py -f pretty.xml')
I figured out how to create the xml I needed using ElementTree, and if I comment out my loop and write the resulting xml, it's correct, looks good, and works with the tool that consumes it (yay!).
<message>
<type>update</type>
<payload>
<register>
<entry ip="11.11.11.11">
<tag>
<member>tag1</member>
</tag>
</entry>
</register>
</payload>
</message>
However, when I add my loop to replace the values for the two elements, I just cant seem to get it right, and I'm clobbering the tags/elements in the tree.
<message>
<type>update</type>
<payload>
<register>
<entry ip="11.11.11.11">ip="10.10.10.0/24"<tag><member>tag<Element 'tag' at
0x7f7b29d66c90></member></tag></entry>
</register>
</payload>
</message>
I keep trying different things to replace the elements, but they just end up as different permutations of wrong, and I just can't seem to get it right. Hoping someone can help me figure out what I'm missing. Thanks in advance!
<entry> is an element, and "ip" is an attribute of that element.
We can change the attribute for the element in the following way:
tree.find('.//entry').attrib['ip'] = "22.22.22.22"
"tag1" is the .text inside the <member> element, but the tag variable was reassigned at this line:
tag = ET.SubElement(entry, "tag")
Because of that, the <tag> Element object itself was formatted into the <member> element's text instead of a tag number.
I updated your code. This solution now generates a new XML <message> at each "count" iteration. I hope it helps.
from netaddr import IPNetwork
# xml.etree.cElementTree was removed in Python 3.9; ElementTree is the
# supported module and picks up the C accelerator by itself.
import xml.etree.ElementTree as ET

ip = IPNetwork('10.10.10.0/24')
count = 1

# Build the message skeleton once; the loop below only rewrites two values.
uid = ET.Element("message")
# The result is deliberately not bound to a name: the original chained
# assignment stored the string "update" in a variable called `type`,
# shadowing the builtin.
ET.SubElement(uid, "type").text = "update"
payload = ET.SubElement(uid, "payload")
register = ET.SubElement(payload, "register")
entry = ET.SubElement(register, "entry", ip="11.11.11.11")
tag = ET.SubElement(entry, "tag")
ET.SubElement(tag, "member").text = "tag1"
tree = ET.ElementTree(uid)

while count <= 10:
    # <member> carries the tag number as its text...
    tree.find('.//member').text = "tag" + str(count)
    # ...while the address is the "ip" attribute of <entry>, not its text.
    tree.find('.//entry').attrib['ip'] = format(ip[count])
    count += 1
    # Dump inside the loop so that one <message> is emitted per iteration.
    ET.dump(uid)
print(count)
XML output is:
<message><type>update</type><payload><register><entry ip="10.10.10.1"><tag><member>tag1</member></tag></entry></register></payload></message>
<message><type>update</type><payload><register><entry ip="10.10.10.2"><tag><member>tag2</member></tag></entry></register></payload></message>
<message><type>update</type><payload><register><entry ip="10.10.10.3"><tag><member>tag3</member></tag></entry></register></payload></message>
<message><type>update</type><payload><register><entry ip="10.10.10.4"><tag><member>tag4</member></tag></entry></register></payload></message>
<message><type>update</type><payload><register><entry ip="10.10.10.5"><tag><member>tag5</member></tag></entry></register></payload></message>
<message><type>update</type><payload><register><entry ip="10.10.10.6"><tag><member>tag6</member></tag></entry></register></payload></message>
<message><type>update</type><payload><register><entry ip="10.10.10.7"><tag><member>tag7</member></tag></entry></register></payload></message>
<message><type>update</type><payload><register><entry ip="10.10.10.8"><tag><member>tag8</member></tag></entry></register></payload></message>
<message><type>update</type><payload><register><entry ip="10.10.10.9"><tag><member>tag9</member></tag></entry></register></payload></message>
<message><type>update</type><payload><register><entry ip="10.10.10.10"><tag><member>tag10</member></tag></entry></register></payload></message>

element attributes missing when parsing XML with iterparse/lxml/python 2

Here's my use case:
I have a potentially large XML file, and I want to output the frequency of all the unique structural variations of a given element type. Element attributes should be included as part of the uniqueness test. The output should sort the variations by frequency.
Here's a trivial input example, with 4 entries for automobile:
<automobile>
<mileage>20192</mileage>
<year>2005</year>
<user_defined name="color">red</user_defined>
</automobile>
<automobile>
<mileage>1098</mileage>
<year>2018</year>
<user_defined name="color">blue</user_defined>
</automobile>
<automobile>
<mileage>17964</mileage>
<year>2012</year>
<user_defined name="title_status">salvage</user_defined>
</automobile>
<automobile>
<mileage>198026</mileage>
<year>1990</year>
</automobile>
The output I expect would look like this:
<automobile automobile_frequency="2">
<mileage />
<year />
<user_defined name="color" />
</automobile>
<automobile automobile_frequency="1">
<mileage />
<year />
<user_defined name="title_status" />
</automobile>
<automobile automobile_frequency="1">
<mileage />
<year />
</automobile>
I've implemented the code using iterparse, but when it's processing the elements, the attributes do not exist in the element. The code logic appears to be correct, but attributes simply don't exist; they are not written in the output, and they are not present for the uniqueness test. Per the above input example, this is what I get on output:
<root>
<automobile automobile_frequency="3">
<mileage/>
<year/>
<user_defined/>
</automobile>
<automobile automobile_frequency="1">
<mileage/>
<year/>
</automobile>
</root>
The usage is:
xplore.py input.xml node_to_explore
In the above example, I used:
xplore.py trivial.xml automobile
Here's the source:
from lxml import etree
import sys
import re
from datetime import datetime
# global node signature map
structure_map = {}
# global code frequency map
frequency_map = {}
# output tree
tmp_root = etree.Element("tmp_root")
def process_element(el):
global target
if el.tag != target:
return
# get the structure of the element
structure = get_structure(el)
global structure_map
structure_key = etree.tostring(structure, pretty_print=True)
if structure_key not in structure_map.keys():
# add signature to structure map
structure_map[structure_key] = structure
# add node to output
global tmp_root
tmp_root.append(structure)
# add signature to frequency map
frequency_map[structure_key] = 1
else:
# increment frequency map
frequency_map[structure_key] += 1
# returns a unique string representing the structure of the node
# including attributes
def get_structure(el):
# create new element for the return value
ret = etree.Element(el.tag)
# get attributes
attribute_keys = el.attrib.keys()
for attribute_key in attribute_keys:
ret.set(attribute_key, el.get(attribute_key))
# check for children
children = list(el)
for child in children:
ret.append(get_structure(child))
return ret
if len(sys.argv) < 3:
print "Must specify an XML file for processing, as well as an element type!"
exit(0)
# Get XML file
xml = sys.argv[1]
# Get output file name
output_file = xml[0:xml.rindex(".")]+".txt"
# get target element type to evaluate
target = sys.argv[2]
# mark start
startTime = datetime.now()
# Parse XML
print '==========================='
print 'Parsing XML'
print '==========================='
context = etree.iterparse(xml, events=('end',))
for event, element in context:
process_element(element)
element.clear()
# create tree sorted by frequency
ranked = sorted(frequency_map.items(), key=lambda x: x[1], reverse=True)
root = etree.Element("root")
for item in ranked:
structure = structure_map[item[0]]
structure.set(target+"_frequency", str(item[1]))
root.append(structure)
# pretty print root
out = open(output_file, 'w')
out.write(etree.tostring(root, pretty_print=True))
# output run time
time = datetime.now() - startTime
reg3 = re.compile("\\d+:\\d(\\d:\\d+\\.\\d{4})")
time = re.search(reg3, unicode(time))
time = "Runtime: %ss" % (time.group(1).encode("utf-8"))
print(time)
In the debugger, I can clearly see that the attributes are missing from elements in the calls to get_structure. Can anyone tell me why this is the case?
The data:
<root>
<automobile>
<mileage>20192</mileage>
<year>2005</year>
<user_defined name="color">red</user_defined>
</automobile>
<automobile>
<mileage>1098</mileage>
<year>2018</year>
<user_defined name="color">blue</user_defined>
</automobile>
<automobile>
<mileage>17964</mileage>
<year>2012</year>
<user_defined name="title_status">salvage</user_defined>
</automobile>
<automobile>
<mileage>198026</mileage>
<year>1990</year>
</automobile>
</root>
The code:
from lxml import etree
import sys
import re
from datetime import datetime
# global node signature map
structure_map = {}
# global code frequency map
frequency_map = {}
# output tree
tmp_root = etree.Element("tmp_root")
def process_element(el):
    """Count one occurrence of the structural signature of ``el``.

    The signature is the serialized form of ``get_structure(el)``. The first
    time a signature is seen, its structure element is stored in
    ``structure_map``, appended to ``tmp_root`` and given a count of 1 in
    ``frequency_map``; later occurrences only increment the count.
    """
    # get the structure of the element
    structure = get_structure(el)
    structure_key = etree.tostring(structure, pretty_print=True)

    # The module-level maps are only mutated, never rebound, so no `global`
    # declarations are needed here.
    if structure_key not in structure_map:
        # add signature to structure map
        structure_map[structure_key] = structure
        # add node to output
        tmp_root.append(structure)
        # add signature to frequency map
        frequency_map[structure_key] = 1
    else:
        # increment frequency map
        frequency_map[structure_key] += 1
# Builds a skeleton copy of a node: same tags, same attributes, same nesting,
# but without any text content.
def get_structure(el):
    """Return a new element mirroring the structure of ``el``.

    The copy has the tag and attributes of ``el`` and, recursively, of each of
    its children in document order. Text is not copied, so two nodes that
    differ only in text produce identical structures. ``el`` is not modified.
    """
    # create new element for the return value
    ret = etree.Element(el.tag)
    # copy attributes
    for attribute_key, attribute_value in el.attrib.items():
        ret.set(attribute_key, attribute_value)
    # recurse into children
    for child in el:
        ret.append(get_structure(child))
    return ret
# NOTE: single-argument print(...) calls and str() are used throughout so the
# script runs unchanged on both Python 2 and Python 3.
if len(sys.argv) < 3:
    print("Must specify an XML file for processing, as well as an element type!")
    sys.exit(0)

# Get XML file
xml = sys.argv[1]
# Get output file name
output_file = xml[0:xml.rindex(".")] + ".txt"
# get target element type to evaluate
target = sys.argv[2]

# mark start
startTime = datetime.now()

# Parse XML
print('===========================')
print('Parsing XML')
print('===========================')
context = etree.iterparse(xml, events=('end',))

# An 'end' event fires for each child before it fires for the parent. If
# every element were cleared as soon as it ended, the children of a target
# element would already be stripped of their attributes by the time the
# target itself is processed. So finished elements are queued and only
# cleared once the enclosing target element has been handled.
element_to_clear = []
for event, element in context:
    element_to_clear.append(element)
    # `target` is already a module-level name; declaring it `global` here is
    # unnecessary and is a SyntaxError on Python 3.
    if element.tag == target:
        process_element(element)
        for ele in element_to_clear:
            ele.clear()
        element_to_clear = []

# create tree sorted by frequency
ranked = sorted(frequency_map.items(), key=lambda x: x[1], reverse=True)
root = etree.Element("root")
for structure_key, frequency in ranked:
    structure = structure_map[structure_key]
    structure.set(target + "_frequency", str(frequency))
    root.append(structure)

# pretty print root; etree.tostring returns bytes, hence binary mode, and the
# `with` block makes sure the file is flushed and closed.
with open(output_file, 'wb') as out:
    out.write(etree.tostring(root, pretty_print=True))

# output run time
elapsed = datetime.now() - startTime
# str(timedelta) omits the fractional part when microseconds are zero, in
# which case the pattern does not match; fall back to the full string.
match = re.search(r"\d+:\d(\d:\d+\.\d{4})", str(elapsed))
runtime = match.group(1) if match else str(elapsed)
print("Runtime: %ss" % runtime)
The command: xplore.py trivial.xml automobile

Categories