How can I export the XML file structure into pandas - python

The structure of the code is as shown below:
This is an xml file
<ROOT>
<data>
<record>
<field name="Country or Area">Afghanistan</field>
<field name="Year">2020</field>
<field name="Item">Gross Domestic Product (GDP)</field>
<field name="Value">508.453721937094</field>
</record>
<record>
<field name="Country or Area">Afghanistan</field>
<field name="Year">2019</field>
<field name="Item">Gross Domestic Product (GDP)</field>
<field name="Value">496.940552822825</field>
</record>
</data>
</ROOT>
I have tried the following, as well as other methods, but with no luck:
from lxml import objectify
xml = objectify.parse('GDP_pc.xml')
root = xml.getroot()
data=[]
for i in range(len(root.getchildren())):
data.append([child.text for child in root.getchildren()[i].getchildren()])
df = pd.DataFrame(data)
df.columns = ['Country or Area', 'Year', 'Item', 'Value',]

Have you tried the pandas method pd.read_xml()?
It reads an XML file and transforms it into a DataFrame.
Just do the following:
df = pd.read_xml('GDP_pc.xml')
You can read more about it on the official documentation

See below
import xml.etree.ElementTree as ET
import pandas as pd

# Sample document: each <record> holds <field name="..."> children; the
# "name" attribute is the column label and the element text is the cell value.
xml = '''<ROOT>
<data>
<record>
<field name="Country or Area">Afghanistan</field>
<field name="Year">2020</field>
<field name="Item">Gross Domestic Product (GDP)</field>
<field name="Value">508.453721937094</field>
</record>
<record>
<field name="Country or Area">Afghanistan</field>
<field name="Year">2019</field>
<field name="Item">Gross Domestic Product (GDP)</field>
<field name="Value">496.940552822825</field>
</record>
</data>
</ROOT>'''

root = ET.fromstring(xml)
# Build one dict per <record>; pandas uses the dict keys as column names.
# Values stay as the raw element text (strings), exactly as found in the XML.
data = [
    {field.attrib['name']: field.text for field in rec.findall('field')}
    for rec in root.findall('.//record')
]
df = pd.DataFrame(data)
print(df)
output
Country or Area Year Item Value
0 Afghanistan 2020 Gross Domestic Product (GDP) 508.453721937094
1 Afghanistan 2019 Gross Domestic Product (GDP) 496.940552822825

Related

how to parse xml output of mysql in python

I know there are several XML parsers for Python, but I don't know which one would be good for parsing the XML output of MySQL; I haven't been successful yet. The output looks like:
<resultset statement="select * from table where id > 5" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
<row>
<field name="name">first</field>
<field name="login">2021-08-16 13:44:35</field>
</row>
<row>
<field name="name">second</field>
<field name="login">2021-08-18 13:44:35</field>
</row>
</resultset>
Because the structure is quite simple here, I was about to write my own parser, but I would guess there is already something that covers this case.
Output should be a list of dicts with columns as keys and the value as the content of the row/column
see below
import xml.etree.ElementTree as ET

# Sample of the XML that `mysql --xml` emits: one <row> per result row, one
# <field name="column"> per column.
xml = '''<resultset statement="select * from table where id > 5" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
<row>
<field name="name">first</field>
<field name="login">2021-08-16 13:44:35</field>
</row>
<row>
<field name="name">second</field>
<field name="login">2021-08-18 13:44:35</field>
</row>
</resultset>'''

root = ET.fromstring(xml)
# Each <row> becomes a list of (column name, value) tuples, in document order.
# Passing an inner list to dict() gives a column -> value mapping for that row.
data = [
    [(field.attrib['name'], field.text) for field in row.findall('field')]
    for row in root.findall('.//row')
]
print(data)
output
[[('name', 'first'), ('login', '2021-08-16 13:44:35')], [('name', 'second'), ('login', '2021-08-18 13:44:35')]]

Remove node from XML node based an attribute value in Python

I am trying to remove a node based on its attribute value:
from xml.etree import ElementTree as ET
groups = ET.fromstring("""<groups>
<group>
<group_components>
<id item="1">14742</id>
<id item="1">121727</id>
<id item="0">541971</id>
</group_components>
</group>
<group>
<group_components>
<id item="1">10186</id>
<id item="1">10553</id>
<id item="1">10644</id>
<id item="0">434639</id>
</group_components>
</group>
</groups>
""")
fnodes = groups.findall('group')
for first in fnodes:
bnode = first.find("group_components")
for child in bnode:
items = child.attrib.get('item')
if items == "1":
bnode.remove(child)
xmlstr = ET.tostring(groups, encoding="utf-8", method="xml")
print(xmlstr.decode("utf-8"))
The above code removes only a single node. Every id node whose item attribute equals "1" should be removed.
# Substrings that mark a line for removal.
to_remove = ['<id item="1">']

# Copy xmlfile.xml to newfile.xml line by line, dropping every line that
# contains one of the markers. This is a plain text filter: it matches on
# whole lines, not on parsed XML elements.
with open('xmlfile.xml') as xmlfile, open('newfile.xml', 'w') as newfile:
    for line in xmlfile:
        if not any(remo in line for remo in to_remove):
            newfile.write(line)
You can put your xml file and get the new xml file with <id item="1"> removed. No need of element tree here I guess.
See below
from xml.etree import ElementTree as ET

xml = """<groups>
<group>
<group_components>
<id item="1">14742</id>
<id item="1">121727</id>
<id item="0">541971</id>
</group_components>
</group>
<group>
<group_components>
<id item="1">10186</id>
<id item="1">10553</id>
<id item="1">10644</id>
<id item="0">434639</id>
</group_components>
</group>
</groups>
"""

root = ET.fromstring(xml)
for grp_comp in root.findall('.//group_components'):
    # Iterate over a snapshot (list(...)) of the children: removing from the
    # live element while iterating it would skip the sibling that follows
    # each removed node.
    for _id in list(grp_comp):
        # .get() returns None when the attribute is missing, so children
        # without an "item" attribute are simply kept.
        if _id.get('item') == "1":
            grp_comp.remove(_id)
ET.dump(root)
output
<groups>
<group>
<group_components>
<id item="0">541971</id>
</group_components>
</group>
<group>
<group_components>
<id item="0">434639</id>
</group_components>
</group>
</groups>

How to fill XFA form with iText: Modifying not the datasets, but an other data group in the XDP

I have an XFA form, which I want to fill automatically with either Python or C# or both.
It would be easy if the form data was in datasets like in usual XFA pdfs, but it is not. Here is an example of some data in the datasets:
<?xml version="1.0" encoding="UTF-8"?>
<topmostSubform
><Effacer
/><Rangée3 xmlns:xfa="http://www.xfa.org/schema/xfa-data/1.0/" xfa:dataNode="dataGroup"
/><table2
><Rangée1
><colG
><positioner xmlns:xfa="http://www.xfa.org/schema/xfa-data/1.0/" xfa:dataNode="dataGroup"
/></colG
><colD xmlns:xfa="http://www.xfa.org/schema/xfa-data/1.0/" xfa:dataNode="dataGroup"
/></Rangée1
></table2
><positioner1 xmlns:xfa="http://www.xfa.org/schema/xfa-data/1.0/" xfa:dataNode="dataGroup"
/><table2
><Rangée1
><colG
><positioner xmlns:xfa="http://www.xfa.org/schema/xfa-data/1.0/" xfa:dataNode="dataGroup"
/></colG
It points to another data group, which is at position 15 in the XDP.
Using python, I could easily access the XDP, here is its configuration:
[xdp:xdp 115 0 R config 2 0 R template 3 0 R datasets 116 0 R localeSet 5 0 R xmpmeta 6 0 R xfdf 7 0 R form 117 0 R </xdp:xdp> 118 0 R ]
I found all the form data in the "form" stream, here is a part of it:
<form xmlns="http://www.xfa.org/schema/xfa-form/2.8/" checksum="8cn7XfZQ5SK/27lJmAiqCnPmd4M=">
<subform name="topmostSubform">
<field name="pMultilineModified">
<value>
<text>N</text>
</value>
<assist>
<toolTip/>
</assist>
</field>
<instanceManager name="_page1"/>
<subform name="page1">
<instanceManager name="_sf0"/>
<subform name="sf0">
<instanceManager name="_container1"/>
<subform name="container1">
<instanceManager name="_positioner"/>
<subform name="positioner">
<instanceManager name="_sf_numeroEvenement"/>
<subform name="sf_numeroEvenement">
<instanceManager name="_container"/>
<subform name="container">
<instanceManager name="_Figure"/>
<subform name="Figure">
<field name="NumeroEvenement">
<assist>
<toolTip/>
</assist>
<value override="1">
<text>12234445522</text>
</value>
</field>
</subform>
</subform>
</subform>
<field name="txt0_UnitePlaignante">
<assist>
<toolTip/>
</assist>
<value override="1">
<text>200</text>
</value>
</field>
<field name="Effacer">
<assist>
<toolTip/>
</assist>
</field>
</subform>
</subform>
So the real question is:
How can I with iText7 get this "form" stream in position 15 of the XDP PdfObject, modify it to put it back in the XDP and put the XDP back in the pdf?
To replicate what I'm trying to do, one could try getting any other element from the XDP than datasets, modifying it or not, putting it back in the pdf after this. I wasn't able to.
I tried PyPDF2, PDFNet (Python and C#), and iText7.
I am desperate, I've been trying for weeks now with no solution. I obviously can't use iText FillXfaForm method, since it modifies the datasets and I want to modify the form.

How do you properly fetch from this nested XML?

I have the following XML:
<?xml version="1.0" encoding="UTF-8"?>
<data>
<columns>
<Leftover index="5">Leftover</Leftover>
<NODE5 index="6"></NODE5>
<NODE6 index="7"></NODE6>
<NODE8 index="9"></NODE8>
<Nomenk__Nr_ index="2">Nomenk.
Nr.</Nomenk__Nr_>
<Year index="8">2020</Year>
<Name index="1">Name</Name>
<Value_code index="3">Value code</Value_code>
</columns>
<records>
<record index="1">
<Leftover>Leftover</Leftover>
<NODE5>Test1</NODE5>
<NODE6>Test2</NODE6>
<NODE8>Test3</NODE8>
<Nomenk__Nr_></Nomenk__Nr_>
<Name></Name>
<Value_code></Value_code>
</record>
... (it repeats itself with different values and the index value increments)
My code is:
import lxml
import lxml.etree as et
xml = open('C:\outputfile.xml', 'rb')
xml_content = xml.read()
tree = et.fromstring(xml_content)
for bad in tree.xpath("//records[#index=\'*\']/NODE5"):
bad.getparent().remove(bad) # here I grab the parent of the element to call the remove directly on it
result = (et.tostring(tree, pretty_print=True, xml_declaration=True))
f = open( 'outputxml.xml', 'w' )
f.write( str(result) )
f.close()
What I need to do is remove the NODE5, NODE6 and NODE8 elements. I tried using a wildcard and then specifying one of the nodes (see line 6), but that does not seem to have worked. I'm also getting a syntax error on the first character right after the loop, although the code executes.
My problem is also that the encoding by lxml is set to ASCII afterwards when the file is "exported".
UPDATE
I am getting this error on line 8:
return = ...
^
SyntaxError: invalid syntax
I took some code from https://stackoverflow.com/a/7981894/1987598
What I need to do is to remove the NODE5, NODE6, NODE8.
below
import xml.etree.ElementTree as ET

xml = '''<?xml version="1.0" encoding="UTF-8"?>
<data>
<columns>
<Leftover index="5">Leftover</Leftover>
<NODE5 index="6" />
<NODE6 index="7" />
<NODE8 index="9" />
<Nomenk__Nr_ index="2">Nomenk.
Nr.</Nomenk__Nr_>
<Year index="8">2020</Year>
<Name index="1">Name</Name>
<Value_code index="3">Value code</Value_code>
</columns>
<records>
<record index="1">
<Leftover>Leftover</Leftover>
<NODE5>Test1</NODE5>
<NODE6>Test2</NODE6>
<NODE8>Test3</NODE8>
<Nomenk__Nr_ />
<Name />
<Value_code />
</record>
<record index="21">
<Leftover>Leftover</Leftover>
<NODE5>Test11</NODE5>
<NODE6>Test21</NODE6>
<NODE8>Test39</NODE8>
<Nomenk__Nr_ />
<Name />
<Value_code />
</record>
</records>
</data>'''

root = ET.fromstring(xml)
# Numeric suffixes of the NODE<n> elements to drop.
unwanted = ['5', '6', '8']

col = root.find('./columns')
records = root.find('./records')
records_lst = records.findall('./record')

# The NODE<n> elements are direct children of <columns> and of every
# <record>, so the same removal applies to each of those parents.
for parent in [col] + records_lst:
    for x in unwanted:
        # findall() returns a list, so removing from the parent while
        # looping over the result is safe.
        for node in parent.findall('./NODE{}'.format(x)):
            parent.remove(node)
ET.dump(root)
output
<data>
<columns>
<Leftover index="5">Leftover</Leftover>
<Nomenk__Nr_ index="2">Nomenk.
Nr.</Nomenk__Nr_>
<Year index="8">2020</Year>
<Name index="1">Name</Name>
<Value_code index="3">Value code</Value_code>
</columns>
<records>
<record index="1">
<Leftover>Leftover</Leftover>
<Nomenk__Nr_ />
<Name />
<Value_code />
</record>
<record index="21">
<Leftover>Leftover</Leftover>
<Nomenk__Nr_ />
<Name />
<Value_code />
</record>
</records>
</data>

How to read the following XML and get values for "host","status","owner","user-template-01" and "test-id"?

XML = """<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<Entities TotalResults="101" PageSize="100" PageNumber="1">
<Entity Type="run">
<Fields>
<Field Name="host">
<Value>osdc-vw64</Value>
</Field>
<Field Name="status">
<Value>Passed</Value>
</Field>
<Field Name="owner">
<Value>Aspeg</Value>
</Field>
<Field Name="user-template-01">
<Value>1941896</Value>
</Field>
<Field Name="test-id">
<Value>72769</Value>
</Field>
</Fields>
</Entity>
<Entity Type="run">
<Fields>
<Field Name="host">
<Value>osdc-57</Value>
</Field>
<Field Name="status">
<Value>Passed</Value>
</Field>
<Field Name="owner">
<Value>spana</Value>
</Field>
<Field Name="user-template-01">
<Value>1941896</Value>
</Field>
<Field Name="test-id">
<Value>72769</Value>
</Field>
</Fields>
</Entity>
</Entities>"""
I have used :
from xml.etree import ElementTree as ET
root = ET.fromstring(XML)
print root.tag
I do not know how to go ahead now ...
The easiest way would be to use PyQuery (if you understand jQuery selectors):
from pyquery import PyQuery
query = PyQuery(xml);
host = query("[Name='host'] value").text()
test_id = query("[Name='test-id'] value").text()
Since you have multiple elements with Name='host', you should iterate over Entities:
from pyquery import PyQuery
def process_Entity(entity):
pass #do something
query = PyQuery(xml);
query("Entity").each(process_Entity)
import xml.etree.ElementTree as ET
tree = ET.parse('hai.xml')
root = tree.getroot()
for child in root:
print child.tag, child.attrib
for a in child:
print a.tag
for b in a:
print b.attrib , b[0].text
Using lxml.etree:
import lxml.etree as ET
XML = """ your string here """
tree = ET.fromstring(XML) # you may get errors here because of encoding
# if so, re.sub(r"\bencoding="[^"]+?", '', XML) works
info_you_need = {entity: {field.get("Name"): field.find("Value").text for field in entity.findall("Fields/Field")} for entity in tree.findall("Entity")}
N.B. I'm pretty awful with the lxml module, someone may come up with a much better solution than this :) My output was:
{<Element Entity at 0x2af4e18>: {'user-template-01': '1941896', 'owner': 'spana', 'test-id': '72769', 'status': 'Passed', 'host': 'osdc-57'},
<Element Entity at 0x2af4e40>: {'user-template-01': '1941896', 'owner': 'Aspeg', 'test-id': '72769', 'status': 'Passed', 'host': 'osdc-vw64'}}

Categories