xml to csv conversion in python - python

Here I need to parse the XML and get the values. I need to get attribute values such as the person's id ("01"), which I couldn't get with this code. I also need to fetch the values of grandchild nodes; here those are the name tags under "siblings", but I can't hard-code "siblings" to fetch them. On top of that, I need to handle multiple attributes and join them to form a unique key, which will become a column in the final table.
import xml.dom
import xml.dom.minidom
doc = xml.dom.minidom.parseString('''
<root>
<person id="01">
<name> abc</name>
<age>32</age>
<address>addr123</address>
<siblings>
<name></name>
</siblings>
</person>
<person id="02">
<name> def</name>
<age>22</age>
<address>addr456</address>
<siblings>
<name></name>
<name></name>
</siblings>
</person>
</root>
''')
innerlist=[]
outerlist=[]
def innerHtml(root):
    """Return the concatenated text of every text node beneath ``root``.

    Args:
        root: A DOM node (for example one produced by ``xml.dom.minidom``).

    Returns:
        The text of all descendant text nodes joined in document order,
        or ``''`` when there are none.
    """
    parts = []
    stack = [root]
    while stack:
        node = stack.pop()
        if node.nodeType == xml.dom.Node.TEXT_NODE:
            parts.append(node.wholeText)
        else:
            # Push children reversed so that pop() visits them first-to-last;
            # a plain extend() would emit the text back to front.
            stack.extend(reversed(node.childNodes))
    return ''.join(parts)
# Build one row of "tag text" strings per <person> and key it by the person's
# id attribute in the same pass, so ids and rows cannot drift apart.
attrlist = []
dictionary = {}
for statusNode in doc.getElementsByTagName('person'):
    innerlist = []
    for childNode in statusNode.childNodes:
        if childNode.nodeType != xml.dom.Node.ELEMENT_NODE:
            continue
        text = innerHtml(childNode).strip()  # computed once per child
        if text:
            innerlist.append(childNode.nodeName + " " + text)
    outerlist.append(innerlist)
    # A <person> without an id still gets a row in outerlist but no key.
    # Zipping two separately built lists would instead shift every later id
    # onto the wrong row.
    if statusNode.hasAttribute('id'):
        person_id = statusNode.getAttribute('id')
        attrlist.append(person_id)
        dictionary[person_id] = innerlist
innerlist = []
print(dictionary)

Comment: I have stored it in a dictionary: {'01': ['name abc', 'age 32', 'address addr123'], '02': ['name def', 'age 22', 'address addr456']}.
You can't write such a dict to CSV!
ValueError: dict contains fields not in fieldnames: '01'
Do you REALLY want to convert to CSV?
Read about CSV File Reading and Writing
Comment: Here I need to get the siblings tag as another inner list as well.
CSV doesn't support such an inner list.
Edit your question and show the expected CSV output!
Question: xml to csv conversion
Solution with xml.etree.ElementTree.
Note: I don't understand how you want to handle the grandchild node values,
so this solution writes them as a list of dicts in one column.
import csv
import xml.etree.ElementTree as ET
# ET.fromstring expects the XML as text, so `doc` must hold the XML string.
root = ET.fromstring(doc)

# Collect every row first so the header can cover the keys of ALL persons.
rows = []
for p in root.findall('person'):
    person = {'_id': p.attrib['id']}
    for element in p:
        if len(element) >= 1:
            # Element with children: keep them as a list of {tag: text}
            # dicts, which lands in a single CSV column as that list's repr.
            person[element.tag] = [{sub_e.tag: sub_e.text} for sub_e in element]
        else:
            person[element.tag] = element.text
    rows.append(person)

# Taking the header from the first person only makes DictWriter raise
# ValueError as soon as a later person carries a tag the first one lacks.
fieldnames = sorted(set().union(*rows)) if rows else None

# newline='' lets the csv module control line endings itself.
with open('doc.csv', 'w', newline='') as fh:
    if rows:
        w = csv.DictWriter(fh, fieldnames=fieldnames)
        w.writeheader()
        w.writerows(rows)
Output:
_id,address,age,name,siblings
01,addr123,32, abc,[{'name': 'sib1'}]
02,addr456,, def,"[{'name': 'sib2'}, {'name': 'sib3'}]"
Tested with Python: 3.4.2

Related

Listing path and data from a xml file to store in a dataframe

Here is a xml file :
<SOAP-ENV:Envelope xmlns:SOAP-ENV="http://schemas.xmlsoap.org/soap/envelope/">
<SOAP-ENV:Header />
<SOAP-ENV:Body>
<ADD_LandIndex_001>
<CNTROLAREA>
<BSR>
<status>ADD</status>
<NOUN>LandIndex</NOUN>
<REVISION>001</REVISION>
</BSR>
</CNTROLAREA>
<DATAAREA>
<LandIndex>
<reportId>AMI100031</reportId>
<requestKey>R3278458</requestKey>
<SubmittedBy>EN4871</SubmittedBy>
<submittedOn>2015/01/06 4:20:11 PM</submittedOn>
<LandIndex>
<agreementdetail>
<agreementid>001 4860</agreementid>
<agreementtype>NATURAL GAS</agreementtype>
<currentstatus>
<status>ACTIVE</status>
<statuseffectivedate>1965/02/18</statuseffectivedate>
<termdate>1965/02/18</termdate>
</currentstatus>
<designatedrepresentative></designatedrepresentative>
</agreementdetail>
</LandIndex>
</LandIndex>
</DATAAREA>
</ADD_LandIndex_001>
</SOAP-ENV:Body>
</SOAP-ENV:Envelope>
I want to save in a dataframe: 1) the path and 2) the text of the element corresponding to that path. To build this dataframe, I am thinking of using a dictionary to store both. So first I would like to get a dictionary like this (where each value is associated with its corresponding path):
{'/Envelope/Body/ADD_LandIndex_001/CNTROLAREA/BSR/status': 'ADD', '/Envelope/Body/ADD_LandIndex_001/CNTROLAREA/BSR/NOUN': 'LandIndex', ...}
That way I just have to call df=pd.DataFrame() to create a dataframe that I can export to an Excel sheet. I already have the part that lists the paths, but I cannot get the text for those paths. I do not understand how the lxml library works. I tried the functions .text() and text_content(), but I get an error.
Here is my code :
from lxml import etree
import xml.etree.ElementTree as et
from bs4 import BeautifulSoup
import pandas as pd
filename = 'file_try.xml'
with open(filename, 'r') as f:
    soap = f.read()
root = etree.XML(soap.encode())
tree = etree.ElementTree(root)
mylist_path = []
mylist_data = []
mydico = {}
mylist = []
for target in root.xpath('//text()'):
    # Skip the whitespace-only text between elements.
    if not target.strip():
        continue
    # Each text() result is itself a string (with getparent() added), so it
    # has no .text() method to call; convert it to a plain str instead.
    value = str(target)
    path = tree.getpath(target.getparent()).replace('SOAP-ENV:', '')
    # NOTE: a path that occurs more than once keeps only its last value here.
    mydico[path] = value
    mylist_path.append(path)
    mylist_data.append(value)
# One dict -> one DataFrame row whose columns are the element paths.
mylist.append(mydico)
df = pd.DataFrame(mylist)
df.to_excel("data_xml.xlsx")
print(mylist_path)
print(mylist_data)
Thank you for the help !
Here is an example of traversing an XML tree. A recursive function is needed for this; fortunately, lxml provides all the functionality required.
from lxml import etree as et
from collections import defaultdict
import pandas as pd
d = defaultdict(list)
root = et.fromstring(xml)
tree = et.ElementTree(root)


def traverse(el, d):
    """Collect the text of every leaf element under ``el`` into ``d``.

    Args:
        el: Element to start from; its children are visited recursively.
        d: Mapping of element path -> list of texts, appended to in place.
            The path comes from the module-level ``tree.getelementpath``.
    """
    if len(el):
        for child in el:
            traverse(child, d)
    elif el.text is not None:
        d[tree.getelementpath(el)].append(el.text)


traverse(root, d)
# Wrap each list in a Series so that columns of different lengths are padded
# with NaN; passing the raw lists fails when they are not all the same length.
df = pd.DataFrame({path: pd.Series(texts) for path, texts in d.items()})
df.head()
Output:
{
'{http://schemas.xmlsoap.org/soap/envelope/}Body/ADD_LandIndex_001/CNTROLAREA/BSR/status': ['ADD'],
'{http://schemas.xmlsoap.org/soap/envelope/}Body/ADD_LandIndex_001/CNTROLAREA/BSR/NOUN': ['LandIndex'],
'{http://schemas.xmlsoap.org/soap/envelope/}Body/ADD_LandIndex_001/CNTROLAREA/BSR/REVISION': ['001'],
'{http://schemas.xmlsoap.org/soap/envelope/}Body/ADD_LandIndex_001/DATAAREA/LandIndex/reportId': ['AMI100031'],
'{http://schemas.xmlsoap.org/soap/envelope/}Body/ADD_LandIndex_001/DATAAREA/LandIndex/requestKey': ['R3278458'],
'{http://schemas.xmlsoap.org/soap/envelope/}Body/ADD_LandIndex_001/DATAAREA/LandIndex/SubmittedBy': ['EN4871'],
'{http://schemas.xmlsoap.org/soap/envelope/}Body/ADD_LandIndex_001/DATAAREA/LandIndex/submittedOn': ['2015/01/06 4:20:11 PM'],
'{http://schemas.xmlsoap.org/soap/envelope/}Body/ADD_LandIndex_001/DATAAREA/LandIndex/LandIndex/agreementdetail/agreementid': ['001 4860'],
'{http://schemas.xmlsoap.org/soap/envelope/}Body/ADD_LandIndex_001/DATAAREA/LandIndex/LandIndex/agreementdetail/agreementtype': ['NATURAL GAS'],
'{http://schemas.xmlsoap.org/soap/envelope/}Body/ADD_LandIndex_001/DATAAREA/LandIndex/LandIndex/agreementdetail/currentstatus/status': ['ACTIVE'],
'{http://schemas.xmlsoap.org/soap/envelope/}Body/ADD_LandIndex_001/DATAAREA/LandIndex/LandIndex/agreementdetail/currentstatus/statuseffectivedate': ['1965/02/18'],
'{http://schemas.xmlsoap.org/soap/envelope/}Body/ADD_LandIndex_001/DATAAREA/LandIndex/LandIndex/agreementdetail/currentstatus/termdate': ['1965/02/18']
}
Please note that the dictionary d contains lists as values. That's because elements can be repeated in XML, and otherwise the last value would override the previous ones. If that's not the case for your particular XML, use a regular dict instead of a defaultdict (d = {}) and use assignment instead of appending: d[tree.getelementpath(el)] = el.text.
The same when reading from file:
d = defaultdict(list)
with open('output.xml', 'rb') as file:
    root = et.parse(file).getroot()
tree = et.ElementTree(root)


def traverse(el, d):
    """Collect the text of every leaf element under ``el`` into ``d``.

    Args:
        el: Element to start from; its children are visited recursively.
        d: Mapping of element path -> list of texts, appended to in place.
            The path comes from the module-level ``tree.getelementpath``.
    """
    if len(el):
        for child in el:
            traverse(child, d)
    elif el.text is not None:
        d[tree.getelementpath(el)].append(el.text)


traverse(root, d)
# Wrap each list in a Series so that columns of different lengths are padded
# with NaN; passing the raw lists fails when they are not all the same length.
df = pd.DataFrame({path: pd.Series(texts) for path, texts in d.items()})
print(d)

Extracting nested XML elements of different sizes into Pandas

Lets assume we have an arbitrary XML document like below
<?xml version="1.0" encoding="UTF-8"?>
<programs xmlns="http://something.org/schema/s/program">
<program xmlns:xsd="http://www.w3.org/2001/XMLSchema"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://something.org/schema/s/program http://something.org/schema/s/program.xsd">
<orgUnitId>Organization 1</orgUnitId>
<requiredLevel>academic bachelor</requiredLevel>
<requiredLevel>academic master</requiredLevel>
<programDescriptionText xml:lang="nl">Here is some text; blablabla</programDescriptionText>
<searchword xml:lang="nl">Scrum master</searchword>
</program>
<program xmlns:xsd="http://www.w3.org/2001/XMLSchema"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://something.org/schema/s/program http://something.org/schema/s/program.xsd">
<requiredLevel>bachelor</requiredLevel>
<requiredLevel>academic master</requiredLevel>
<requiredLevel>academic bachelor</requiredLevel>
<orgUnitId>Organization 2</orgUnitId>
<programDescriptionText xml:lang="nl">Text from another organization about some stuff.</programDescriptionText>
<searchword xml:lang="nl">Excutives</searchword>
</program>
<program xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
<orgUnitId>Organization 3</orgUnitId>
<programDescriptionText xml:lang="nl">Also another huge text description from another organization.</programDescriptionText>
<searchword xml:lang="nl">Negotiating</searchword>
<searchword xml:lang="nl">Effective leadership</searchword>
<searchword xml:lang="nl">negotiating techniques</searchword>
<searchword xml:lang="nl">leadership</searchword>
<searchword xml:lang="nl">strategic planning</searchword>
</program>
</programs>
Currently I'm looping over the elements I need by using their absolute paths, since I'm not able to use any of the get or find methods in ElementTree. As such, my code looks like below:
import pandas as pd
import xml.etree.ElementTree as ET
import numpy as np
import itertools
tree = ET.parse('data.xml')
root = tree.getroot()
root.tag
dfcols = ['organization', 'description', 'level', 'keyword']
# Every element lives in the document's default namespace, so each tag has to
# be qualified with it when searching.
NS = '{http://something.org/schema/s/program}'
organization = []
description = []
level = []
keyword = []
for node in root:
    for child in node.findall('.//' + NS + 'orgUnitId'):
        organization.append(child.text)
    for child in node.findall('.//' + NS + 'programDescriptionText'):
        description.append(child.text)
    for child in node.findall('.//' + NS + 'requiredLevel'):
        level.append(child.text)
    for child in node.findall('.//' + NS + 'searchword'):
        keyword.append(child.text)
The goal, of course, is to create one dataframe. However, since each node in the XML file contains one or multiple elements, such as requiredLevel or searchword I'm currently losing data when I'm casting it to a dataframe by either:
# The four flat lists are zipped positionally and padded with NaN; `keyword`
# is the list filled from the <searchword> elements.
df = pd.DataFrame(list(itertools.zip_longest(organization,
                                             description, level, keyword,
                                             fillvalue=np.nan)), columns=dfcols)
or using pd.Series as given here or another solution which I don't seem to get it fit from here
My best bet is probably not to use lists at all, since they don't keep the data aligned per program: I lose the association for the 2nd to Xth child node. But right now I'm stuck and don't see any other options.
What my end result should look like is this:
organization description level keyword
Organization 1 .... academic bachelor, Scrum master
academic master
Organization 2 .... bachelor, Executives
academic master,
academic bachelor
Organization 3 .... Negotiating,
Effective leadership,
negotiating techniques,
....
Consider building a list of dictionaries with comma-collapsed text values. Then pass list into the pandas.DataFrame constructor:
NS = '{http://something.org/schema/s/program}'


def _joined_text(node, tag):
    """Return the ", "-joined text of every ``tag`` element below ``node``."""
    # An empty element has text None, which str.join rejects; skip those.
    return ", ".join(
        el.text for el in node.findall('.//' + NS + tag) if el.text is not None
    )


# One dict per <program>; repeated child elements collapse into one string.
dicts = [
    {
        'organization': _joined_text(node, 'orgUnitId'),
        'description': _joined_text(node, 'programDescriptionText'),
        'level': _joined_text(node, 'requiredLevel'),
        'keyword': _joined_text(node, 'searchword'),
    }
    for node in root
]
final_df = pd.DataFrame(dicts, columns=['organization', 'description', 'level', 'keyword'])
Output
print(final_df)
# organization description level keyword
# 0 Organization 1 Here is some text; blablabla academic bachelor, academic master Scrum master
# 1 Organization 2 Text from another organization about some stuff. bachelor, academic master, academic bachelor Excutives
# 2 Organization 3 Also another huge text description from anothe... Negotiating, Effective leadership, negotiating...
A lightweight xml_to_dict converter can be found here. It can be improved by this to handle namespaces.
def xml_to_dict(xml='', remove_namespace=True):
    """Converts an XML string into a dict

    Attributes are stored under keys prefixed with ``#``. The text of an
    element that also has attributes or children is stored under ``#text``.
    Child tags that occur more than once are collected into a list, and an
    element with no attributes, children or text maps to ``None``.

    Args:
        xml: The XML as string
        remove_namespace: True (default) if namespaces are to be removed
            from the element tags

    Returns:
        The XML string as dict

    Raises:
        xml.etree.ElementTree.ParseError: If the string is not well-formed
            XML (this includes the empty default).

    Examples:
        >>> xml_to_dict('<text><para>hello world</para></text>')
        {'text': {'para': 'hello world'}}
    """
    # Imported locally so the function works wherever it is pasted, and only
    # once per call rather than once per recursion step.
    import io
    from collections import defaultdict
    from xml.etree import ElementTree

    def _xml_remove_namespace(buf):
        # Reference: https://stackoverflow.com/a/25920989/1498199
        it = ElementTree.iterparse(buf)
        for _, el in it:
            if '}' in el.tag:
                # Tags look like '{namespace}local'; keep the local part.
                el.tag = el.tag.split('}', 1)[1]
        # The root is only available once the iterator is exhausted.
        return it.root

    def _xml_to_dict(t):
        # Reference: https://stackoverflow.com/a/10077069/1498199
        d = {t.tag: {} if t.attrib else None}
        children = list(t)
        if children:
            dd = defaultdict(list)
            for dc in map(_xml_to_dict, children):
                for k, v in dc.items():
                    dd[k].append(v)
            # A tag seen once stays a scalar; repeated tags become a list.
            d = {t.tag: {k: v[0] if len(v) == 1 else v for k, v in dd.items()}}
        if t.attrib:
            d[t.tag].update(('#' + k, v) for k, v in t.attrib.items())
        if t.text:
            text = t.text.strip()
            if children or t.attrib:
                if text:
                    d[t.tag]['#text'] = text
            else:
                d[t.tag] = text
        return d

    buffer = io.StringIO(xml.strip())
    if remove_namespace:
        root = _xml_remove_namespace(buffer)
    else:
        root = ElementTree.parse(buffer).getroot()
    return _xml_to_dict(root)
So let s be the string which holds your xml. We can convert it to a dict via
d = xml_to_dict(s, remove_namespace=True)
Now the solution is straight forward:
def _as_list(value):
    """Wrap a lone value in a list; pass lists through unchanged."""
    return value if isinstance(value, list) else [value]


def _text_of(value):
    """Return the text of a converted element, with or without attributes."""
    if isinstance(value, dict):
        return value.get('#text', '')
    return value or ''


rows = []
# xml_to_dict yields a scalar for a tag that occurs once and a list for a tag
# that repeats, so normalise everything to lists before joining. Joining a
# lone string directly would split it into single characters.
for program in _as_list(d['programs']['program']):
    levels = [_text_of(lvl) for lvl in _as_list(program.get('requiredLevel', []))]
    searchwords = [_text_of(word) for word in _as_list(program.get('searchword', []))]
    rows.append([
        _text_of(program['orgUnitId']),
        _text_of(program['programDescriptionText']),
        ','.join(levels),
        ','.join(searchwords),
    ])
df = pd.DataFrame(rows, columns=['organization', 'description', 'level', 'keyword'])

How can I parse a XML file to a dictionary in Python?

I am trying to parse an XML file using the Python library minidom (I have also tried the xml.etree.ElementTree API).
My XML (resource.xml)
<?xml version='1.0'?>
<quota_result xmlns="https://some_url">
</quota_rule>
<quota_rule name='max_mem_per_user/5'>
<users>user1</users>
<limit resource='mem' limit='1550' value='921'/>
</quota_rule>
<quota_rule name='max_mem_per_user/6'>
<users>user2</users>
<limit resource='mem' limit='2150' value='3'/>
</quota_rule>
</quota_result>
I would like to parse this file, store the information in a dictionary of the following form, and be able to access it:
dict = {'user1': [resource, limit, value], 'user2': [resource, limit, value]}
So far I have only been able to do things like:
docXML = minidom.parse("resource.xml")
# Print the `value` attribute of every <limit> element in the document.
for node in docXML.getElementsByTagName('limit'):
    print(node.getAttribute('value'))
You can use getElementsByTagName and getAttribute to trace the result:
dict_users = dict()
docXML = parse('mydata.xml')
users = docXML.getElementsByTagName("quota_rule")
for node in users:
    # Read the three attributes from the rule's first <limit> element once.
    limit_node = node.getElementsByTagName("limit")[0]
    resource = limit_node.getAttribute("resource")
    limit = limit_node.getAttribute("limit")
    value = limit_node.getAttribute("value")

    # A rule may lack the <users> tag; such rules are filed under 'None'.
    tag_user = node.getElementsByTagName("users")
    if not tag_user:
        print("tag <users> is not exist")
        user = 'None'
        key = 'None'
    else:
        user = tag_user[0]
        key = user.firstChild.data
    dict_users[key] = [resource, limit, value]
# The output shown below was produced after removing <users>user1</users>
# from the XML.
print(dict_users)
Output:
tag <users> is not exist
{'None': [u'mem', u'1550', u'921'], u'user2': [u'mem', u'2150', u'3']}

How to associate values of tags with label of the tag the using ElementTree in a Pythonic way

I have some xml files I am trying to process.
Here is a derived sample from one of the files
fileAsString = """
<?xml version="1.0" encoding="utf-8"?>
<eventDocument>
<schemaVersion>X2</schemaVersion>
<eventTable>
<eventTransaction>
<eventTitle>
<value>Some Event</value>
</eventTitle>
<eventDate>
<value>2003-12-31</value>
</eventDate>
<eventCoding>
<eventType>47</eventType>
<eventCode>A</eventCode>
<footnoteId id="F1"/>
<footnoteId id="F2"/>
</eventCoding>
<eventCycled>
<value></value>
</eventCycled>
<eventAmounts>
<eventVoltage>
<value>40000</value>
</eventVoltage>
</eventAmounts>
</eventTransaction>
</eventTable>
</eventDocument>"""
Note that there can be many eventTables in each document, and events can have more details than just the ones I have isolated.
My goal is to create a dictionary in the following form
{'eventTitle':'Some Event', 'eventDate':'2003-12-31','eventType':'47',\
'eventCode':'A', 'eventCoding_FTNT_1':'F1','eventCoding_FTNT_2':'F2',\
'eventCycled':'', 'eventVoltage':'40000'}
I am actually reading these in from files but assuming I have a string my code to get the text for the elements right below the eventTransaction element where the text is inside a value tag is as follows
import xml.etree.cElementTree as ET
# Strip first: the XML declaration must be the very first thing in the
# document, and the triple-quoted literal begins with a newline.
myXML = ET.fromstring(fileAsString.strip())
eventTransactions = list(myXML.iter('eventTransaction'))
testTransaction = eventTransactions[0]
my_dict = {}
for child_of in testTransaction:
    grand_children_tags = [e.tag for e in child_of]
    # Only handle children whose sole content is a single <value> element.
    if grand_children_tags == ['value']:
        my_dict[child_of.tag] = child_of[0].text
>>> my_dict
{'eventTitle': 'Some Event', 'eventCycled': None, 'eventDate': '2003-12-31'}
This seems wrong because I am not really taking advantage of the XML structure; instead I am using brute force, but I have not been able to find an example of a better way.
Is there a clearer and more pythonic way to create the output I am looking for?
Use XPath to pull out the elements you're interested in.
The following code creates a list of lists of dicts (i.e. tables/transactions/info):
tables = []
# Strip first: the XML declaration must be the very first thing in the
# document, and the triple-quoted literal begins with a newline.
myXML = ET.fromstring(fileAsString.strip())
for table in myXML.findall('./eventTable'):
    transactions = []
    tables.append(transactions)
    for transaction in table.findall('./eventTransaction'):
        info = {}
        # Search inside this transaction only. Searching the whole table
        # would copy every transaction's values into each info dict.
        for element in transaction.findall('.//*[value]'):
            info[element.tag] = element.find('./value').text or ''
        coding = transaction.find('./eventCoding')
        if coding is not None:
            for tag in 'eventType', 'eventCode':
                element = coding.find('./%s' % tag)
                if element is not None:
                    info[tag] = element.text or ''
            # Footnotes are numbered from 0, in document order.
            for index, element in enumerate(coding.findall('./footnoteId')):
                info['eventCoding_FTNT_%d' % index] = element.get('id', '')
        if info:
            transactions.append(info)
Output:
[[{'eventCode': 'A',
'eventCoding_FTNT_0': 'F1',
'eventCoding_FTNT_1': 'F2',
'eventCycled': '',
'eventDate': '2003-12-31',
'eventTitle': 'Some Event',
'eventType': '47',
'eventVoltage': '40000'}]]

Extract attributes and certain tag values from xml using python script

I want to parse an XML content and return a dictionary which contains only the name attribute and its values as dictionary. For example:
<ecmaarray>
<number name="xyz1">123.456</number>
<ecmaarray name="xyz2">
<string name="str1">aaa</string>
<number name="num1">55</number>
</ecmaarray>
<strictarray name="xyz3">
<string>aaa</string>
<number>55</number>
</strictarray>
</ecmaarray>
The output has to be in a dictionary something like this..
Dict:{ 'xyz1': 123.456,
'xyz2': {'str1':'aaa', 'num1': '55'},
'xyz3': ['aaa','55']
}
Can anyone suggest a recursive solution for this?
Assuming situation like this:
<strictarray name="xyz4">
<string>aaa</string>
<number name="num1">55</number>
</strictarray>
is not possible, here's a sample code using lxml:
from lxml import etree
tree = etree.parse('test.xml')
result = {}
for element in tree.xpath('/ecmaarray/*'):
    name = element.attrib["name"]
    # list(element) yields the direct children; getchildren() is deprecated.
    childs = list(element)
    if not childs:
        # Leaf element: store its text as-is.
        result[name] = element.text
        continue
    child_dict = {}
    child_list = []
    for child in childs:
        child_name = child.attrib.get('name')
        child_text = child.text
        if child_name:
            child_dict[child_name] = child_text
        else:
            child_list.append(child_text)
    # Named children produce a dict, unnamed children a list; a mix of both
    # is assumed not to occur.
    result[name] = child_dict if child_dict else child_list
print(result)
prints:
{'xyz3': ['aaa', '55'],
'xyz2': {'str1': 'aaa', 'num1': '55'},
'xyz1': '123.456'}
You may want to improve the code - it's just a hint on where to go.
Hope that helps.

Categories