Remove a node from XML based on an attribute value in Python

Remove a node from XML based on an attribute value in Python - python

I am trying to remove a node based on its attribute value:
from xml.etree import ElementTree as ET
groups = ET.fromstring("""<groups>
<group>
<group_components>
<id item="1">14742</id>
<id item="1">121727</id>
<id item="0">541971</id>
</group_components>
</group>
<group>
<group_components>
<id item="1">10186</id>
<id item="1">10553</id>
<id item="1">10644</id>
<id item="0">434639</id>
</group_components>
</group>
</groups>
""")
fnodes = groups.findall('group')
for first in fnodes:
bnode = first.find("group_components")
for child in bnode:
items = child.attrib.get('item')
if items == "1":
bnode.remove(child)
xmlstr = ET.tostring(groups, encoding="utf-8", method="xml")
print(xmlstr.decode("utf-8"))
The above code removes only a single node, but every id node whose item attribute equals "1" should be removed.

# Copy xmlfile.xml to newfile.xml, dropping every line that contains one of
# the markers below. This is a plain text filter, not an XML parser: it only
# works when each unwanted element sits on a line of its own.
to_remove = ['<id item="1">']
# XML is UTF-8 unless the document declares otherwise; adjust the encoding
# if the file's XML declaration names a different one.
with open('xmlfile.xml', encoding='utf-8') as xmlfile, \
        open('newfile.xml', 'w', encoding='utf-8') as newfile:
    for line in xmlfile:
        # Keep the line only if none of the markers occurs in it.
        if not any(remo in line for remo in to_remove):
            newfile.write(line)
Pass in your XML file and you get a new XML file with the <id item="1"> lines removed. There is no need for ElementTree here, I guess. Note that this only works when each such element is on a line of its own.

See below
from xml.etree import ElementTree as ET

xml = """<groups>
<group>
<group_components>
<id item="1">14742</id>
<id item="1">121727</id>
<id item="0">541971</id>
</group_components>
</group>
<group>
<group_components>
<id item="1">10186</id>
<id item="1">10553</id>
<id item="1">10644</id>
<id item="0">434639</id>
</group_components>
</group>
</groups>
"""

root = ET.fromstring(xml)
for grp_comp in root.findall('.//group_components'):
    # Iterate over a snapshot (list(...)) of the children: removing elements
    # from grp_comp while iterating grp_comp itself skips the sibling that
    # follows each removed node, which is why only some nodes disappear.
    for _id in list(grp_comp):
        # .get() returns None when the "item" attribute is absent, so an
        # <id> without it is simply kept instead of raising KeyError.
        if _id.get('item') == "1":
            grp_comp.remove(_id)
ET.dump(root)
output
<groups>
<group>
<group_components>
<id item="0">541971</id>
</group_components>
</group>
<group>
<group_components>
<id item="0">434639</id>
</group_components>
</group>
</groups>

Related

How to Extract the Information from XML Soap Response?

We have a requirement to get the data from a SOAP XML Response.
Below is the associated XML file
<?xml version="1.0" encoding="utf-8"?>
<soap:Envelope xmlns:soap="http://www.w3.org/2003/05/soap-envelope" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:xsd="http://www.w3.org/2001/XMLSchema">
<soap:Body>
<GetResultResponse xmlns="http://www.relatics.com/">
<GetResultResult>
<Report ReportName="RFC" GeneratedOn="2022-12-22" EnvironmentID="XXXX" EnvironmentName="Systematic Assurance – an XXX Solution" EnvironmentURL="https://XXXX.relaticsonline.com/" WorkspaceID="XXXXX" WorkspaceName="P - ADL Program Management - XXX" TargetDevice="Pc" ReportField="" xmlns="">
<Change_module>
<applied_individual_change_request Change_Request="TestKZIreport" RFC_GUID="XXXXX">
<code RFC_Code="VtW-0101" />
<progress RFC_Progress="agreed" />
<applied_individual_project_organisation Organisation="XXXX" />
<applied__individual_discipline Discipline="Highways" />
<specification Specification="Context of Documents">
<code Specification_Code="1.1.1a" />
</specification>
<applied_individual_workpackage Workpackage="Enabling work">
<code Workpackage_Code="WP-01" />
</applied_individual_workpackage>
<physical_object Physical_Object="Train Station">
<code Physical_Object_Code="TFO-0001" />
</physical_object>
<person approver="XXX" />
<applied_individual_change_consequence_qualification Consequence_Value="10 days">
<applied_conceptual_change_consequence_aspect Consequence_Aspect="Schedule" />
</applied_individual_change_consequence_qualification>
<document Document_Name="WI 300 Design.pdf">
<code Document_Code="DOC-0002" />
</document>
<answer_status BR_Status="no" />
<applied_individual_business_rule Business_Rule="Change Review compliance">
<code BR_Code="BR-006" />
</applied_individual_business_rule>
<applied_individual_change_consequence_qualification Consequence_Value="XXX">
<applied_conceptual_change_consequence_aspect Consequence_Aspect="Finance" />
</applied_individual_change_consequence_qualification>
</applied_individual_change_request>
</Change_module>
</Report>
</GetResultResult>
</GetResultResponse>
</soap:Body>
</soap:Envelope>
I need all the tag values after Change_module. I tried some online help on Stack Overflow, but it didn't work.
I have never worked with XML documents before; here is the sample code I
tried from Stack Overflow.
import xml.etree.ElementTree as ET
import pandas as pd
import numpy as np
tree = ET.parse("Relatics_XML.xml")
root = tree.getroot()
print(root.tag)
print(root.attrib)
namespaces = {"soap": "http://www.w3.org/2003/05/soap-envelope/",
"xsi": "http://www.w3.org/2001/XMLSchema-instance",
"xsd": "http://www.w3.org/2001/XMLSchema/",
'a': 'http://www.relatics.com/',}
names = tree.findall('./soap:Body''/a:GetResultResponse''/a:GetResultResult', namespaces)
print(names)
for name in names:
print(name.text)
I tried different methods such as find and findall, and I also tried passing different values to them, but all it prints is an empty result.
I'm not sure how to get the values out of the tags.

Using xml.etree.ElementTree makes life easier.
See its documentation for details.
It can parse tag attributes or inner text.
import xml.etree.ElementTree as ET
xml = """\
<?xml version="1.0" encoding="utf-8"?>
<soap:Envelope xmlns:soap="http://www.w3.org/2003/05/soap-envelope"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xmlns:xsd="http://www.w3.org/2001/XMLSchema">
<soap:Body>
<GetResultResponse xmlns="http://www.relatics.com/">
<GetResultResult>
<Report ReportName="RFC" GeneratedOn="2022-12-22" EnvironmentID="XXXX" EnvironmentName="Systematic Assurance – an XXX Solution" EnvironmentURL="https://XXXX.relaticsonline.com/" WorkspaceID="XXXXX" WorkspaceName="P - ADL Program Management - XXX" TargetDevice="Pc" ReportField=""
xmlns="">
<Change_module>
<applied_individual_change_request Change_Request="TestKZIreport" RFC_GUID="XXXXX">
<code RFC_Code="VtW-0101" />
<progress RFC_Progress="agreed" />
<applied_individual_project_organisation Organisation="XXXX" />
<applied__individual_discipline Discipline="Highways" />
<specification Specification="Context of Documents">
<code Specification_Code="1.1.1a" />
</specification>
<applied_individual_workpackage Workpackage="Enabling work">
<code Workpackage_Code="WP-01" />
</applied_individual_workpackage>
<physical_object Physical_Object="Train Station">
<code Physical_Object_Code="TFO-0001" />
</physical_object>
<person approver="XXX" />
<applied_individual_change_consequence_qualification Consequence_Value="10 days">
<applied_conceptual_change_consequence_aspect Consequence_Aspect="Schedule" />
</applied_individual_change_consequence_qualification>
<document Document_Name="WI 300 Design.pdf">
<code Document_Code="DOC-0002" />
</document>
<answer_status BR_Status="no" />
<applied_individual_business_rule Business_Rule="Change Review compliance">
<code BR_Code="BR-006" />
</applied_individual_business_rule>
<applied_individual_change_consequence_qualification Consequence_Value="XXX">
<applied_conceptual_change_consequence_aspect Consequence_Aspect="Finance" />
</applied_individual_change_consequence_qualification>
</applied_individual_change_request>
</Change_module>
</Report>
</GetResultResult>
</GetResultResponse>
</soap:Body>
</soap:Envelope>
"""
root = ET.fromstring(xml)
# ElementTree's XPath subset selects "elements that have attribute X" with
# the predicate [@X]. Each entry pairs the printed label with the path of
# the first matching element.
lookups = [
    ("RFC_Code", ".//code[@RFC_Code]"),
    ("RFC_Progress", ".//progress[@RFC_Progress]"),
    ("specification", ".//specification[@Specification]"),
    ("Specification_Code", ".//code[@Specification_Code]"),
    ("Workpackage_Code", ".//code[@Workpackage_Code]"),
    ("Document_Code", ".//code[@Document_Code]"),
]
for label, path in lookups:
    # .attrib is the element's attribute dict, e.g. {'RFC_Code': 'VtW-0101'}.
    print(label + ": " + str(root.find(path).attrib))
Result
$ python get-data.py
RFC_Code: {'RFC_Code': 'VtW-0101'}
RFC_Progress: {'RFC_Progress': 'agreed'}
specification: {'Specification': 'Context of Documents'}
Specification_Code: {'Specification_Code': '1.1.1a'}
Workpackage_Code: {'Workpackage_Code': 'WP-01'}
Document_Code: {'Document_Code': 'DOC-0002'}
If you are reading the XML from a file, use this code:
# ET.parse() accepts a file name directly and lets the XML parser honour the
# encoding declared in the document. It returns an ElementTree; getroot()
# yields the root Element, matching what ET.fromstring() returns.
root = ET.parse('data.xml').getroot()

modify node and extract data from xml file in python

I am new to Python and I am looking for advice on the best approach to the following task:
I have an xml file looking like this
<component xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.spiritconsortium.org/XMLSchema/SPIRIT/1685-2009 http://www.spiritconsortium.org/XMLSchema/SPIRIT/1685-2009/index.xsd">
<memoryMaps>
<memoryMap>
<name>name</name>
<description>description</description>
<peripheral>
<name>periph</name>
<description>description</description>
<baseAddress>0x0</baseAddress>
<range>0x8</range>
<width>32</width>
<register>
<name>reg1</name>
<displayName>reg1</displayName>
<addressOffset>0x0</addressOffset>
<size>32</size>
<access>read-write</access>
<reset>
<value>0x00000002</value>
<mask>0xFFFFFFFF</mask>
</reset>
<field>
</field>
</register>
</peripheral>
</memoryMap>
</memoryMaps>
</component>
I want to replace the "reset" node with two separate nodes, "resetValue" and "resetMask", that keep the data currently held in "value" and "mask" respectively, as follows:
........
<access>read-write</access>
<resetValue>0x00000002</resetValue>
<resetMask>0xFFFFFFFF</resetMask>
<field>
.............
I managed to parse my XML file successfully, but I don't know how to start on this first modification.
Thank you for your guidance.

Code that creates two sub-elements under 'register' and removes the unneeded 'reset' element:
import xml.etree.ElementTree as ET

xml = '''<component xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.spiritconsortium.org/XMLSchema/SPIRIT/1685-2009 http://www.spiritconsortium.org/XMLSchema/SPIRIT/1685-2009/index.xsd">
<memoryMaps>
<memoryMap>
<name>name</name>
<description>description</description>
<peripheral>
<name>periph</name>
<description>description</description>
<baseAddress>0x0</baseAddress>
<range>0x8</range>
<width>32</width>
<register>
<name>reg1</name>
<displayName>reg1</displayName>
<addressOffset>0x0</addressOffset>
<size>32</size>
<access>read-write</access>
<reset>
<value>0x00000002</value>
<mask>0xFFFFFFFF</mask>
</reset>
<field>
</field>
</register>
</peripheral>
</memoryMap>
</memoryMaps>
</component>'''

root = ET.fromstring(xml)
# findall() returns a list, so the tree can be modified safely while looping.
# Every <register> is processed, not just the first one.
for register in root.findall('.//register'):
    reset = register.find('reset')
    if reset is None:
        # Nothing to flatten for a register without a <reset> child.
        continue
    # Copy <reset>/<value> and <reset>/<mask> into new direct children of
    # <register>. SubElement appends at the end of <register>, so the new
    # elements come after any existing children such as <field>.
    for source_tag, target_tag in (('value', 'resetValue'), ('mask', 'resetMask')):
        source = reset.find(source_tag)
        if source is not None:
            ET.SubElement(register, target_tag).text = source.text
    register.remove(reset)
ET.dump(root)
output
<?xml version="1.0" encoding="UTF-8"?>
<component xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.spiritconsortium.org/XMLSchema/SPIRIT/1685-2009 http://www.spiritconsortium.org/XMLSchema/SPIRIT/1685-2009/index.xsd">
<memoryMaps>
<memoryMap>
<name>name</name>
<description>description</description>
<peripheral>
<name>periph</name>
<description>description</description>
<baseAddress>0x0</baseAddress>
<range>0x8</range>
<width>32</width>
<register>
<name>reg1</name>
<displayName>reg1</displayName>
<addressOffset>0x0</addressOffset>
<size>32</size>
<access>read-write</access>
<field />
<resetValue>0x00000002</resetValue>
<resetMask>0xFFFFFFFF</resetMask>
</register>
</peripheral>
</memoryMap>
</memoryMaps>
</component>

How do you properly fetch from this nested XML?

I have the following XML:
<?xml version="1.0" encoding="UTF-8"?>
<data>
<columns>
<Leftover index="5">Leftover</Leftover>
<NODE5 index="6"></NODE5>
<NODE6 index="7"></NODE6>
<NODE8 index="9"></NODE8>
<Nomenk__Nr_ index="2">Nomenk.
Nr.</Nomenk__Nr_>
<Year index="8">2020</Year>
<Name index="1">Name</Name>
<Value_code index="3">Value code</Value_code>
</columns>
<records>
<record index="1">
<Leftover>Leftover</Leftover>
<NODE5>Test1</NODE5>
<NODE6>Test2</NODE6>
<NODE8>Test3</NODE8>
<Nomenk__Nr_></Nomenk__Nr_>
<Name></Name>
<Value_code></Value_code>
</record>
... (it repeats itself with different values and the index value increments)
My code is:
import lxml
import lxml.etree as et
xml = open('C:\outputfile.xml', 'rb')
xml_content = xml.read()
tree = et.fromstring(xml_content)
for bad in tree.xpath("//records[@index=\'*\']/NODE5"):
    bad.getparent().remove(bad) # here I grab the parent of the element to call the remove directly on it
result = (et.tostring(tree, pretty_print=True, xml_declaration=True))
f = open( 'outputxml.xml', 'w' )
f.write( str(result) )
f.close()
What I need to do is to remove the NODE5, NODE6, NODE8. I tried using a wildcard and then specifying one of the nodes (see line 6) but that seems to not have worked... I'm also getting a syntax error right after the loop on the first character but the code executes.
My problem is also that the encoding by lxml is set to ASCII afterwards when the file is "exported".
UPDATE
I am getting this error on line 8:
return = ...
^
SyntaxError: invalid syntax
I took some code from https://stackoverflow.com/a/7981894/1987598

What I need to do is to remove the NODE5, NODE6, NODE8.
See below:
import xml.etree.ElementTree as ET

xml = '''<?xml version="1.0" encoding="UTF-8"?>
<data>
<columns>
<Leftover index="5">Leftover</Leftover>
<NODE5 index="6" />
<NODE6 index="7" />
<NODE8 index="9" />
<Nomenk__Nr_ index="2">Nomenk.
Nr.</Nomenk__Nr_>
<Year index="8">2020</Year>
<Name index="1">Name</Name>
<Value_code index="3">Value code</Value_code>
</columns>
<records>
<record index="1">
<Leftover>Leftover</Leftover>
<NODE5>Test1</NODE5>
<NODE6>Test2</NODE6>
<NODE8>Test3</NODE8>
<Nomenk__Nr_ />
<Name />
<Value_code />
</record>
<record index="21">
<Leftover>Leftover</Leftover>
<NODE5>Test11</NODE5>
<NODE6>Test21</NODE6>
<NODE8>Test39</NODE8>
<Nomenk__Nr_ />
<Name />
<Value_code />
</record>
</records>
</data>'''

root = ET.fromstring(xml)
tags_to_remove = {'NODE5', 'NODE6', 'NODE8'}
# The unwanted nodes are direct children of <columns> and of every <record>,
# so both kinds of parent are cleaned with the same loop.
parents = root.findall('./columns') + root.findall('./records/record')
for parent in parents:
    # Collect the matches first: removing children while iterating over the
    # parent itself would skip the sibling after each removed node.
    for node in [child for child in parent if child.tag in tags_to_remove]:
        parent.remove(node)
ET.dump(root)
output
<data>
<columns>
<Leftover index="5">Leftover</Leftover>
<Nomenk__Nr_ index="2">Nomenk.
Nr.</Nomenk__Nr_>
<Year index="8">2020</Year>
<Name index="1">Name</Name>
<Value_code index="3">Value code</Value_code>
</columns>
<records>
<record index="1">
<Leftover>Leftover</Leftover>
<Nomenk__Nr_ />
<Name />
<Value_code />
</record>
<record index="21">
<Leftover>Leftover</Leftover>
<Nomenk__Nr_ />
<Name />
<Value_code />
</record>
</records>
</data>

How can I preserve whitespaces with python 2.7 lxml?

I have a huge xml file (thousands of lines) and I need to change some attribute parameters.
Xml looks like this:
<person id="name" name="pers_name">
<group id="Common">
<emotion id="smile">
<texture texture="smile" x="-131" y="-17" />
<effect name="name1" x="51" y="438" />
<effect name="name2" x="61" y="419" />
<effect name="name3" x="55" y="312" />
</emotion>
</group>
</person>
After I did it and wrote it with tree.write(path, encoding='utf-8', xml_declaration=True) I lost whitespaces before closing tag.
How can I preserve it?
<person id="name" name="pers_name">
<group id="Common">
<emotion id="smile">
<texture texture="smile" x="-131" y="-17"/>
<effect name="name1" x="51" y="438"/>
<effect name="name2" x="61" y="419"/>
<effect name="name3" x="55" y="312"/>
</emotion>
</group>
</person>
Code
from lxml import etree
# Offsets
x_offset = -10
y_offset = -20
tree = etree.parse(path)
XML = tree.getroot()
for effect in XML.iter('effect'):
texture_offset_x = int(effect.get('texture_offset_x')) + x_offset
texture_offset_y = int(effect.get('texture_offset_y')) + y_offset
effect.set('texture_offset_x', str(texture_offset_x))
effect.set('texture_offset_y', str(texture_offset_y))
tree.write(path, encoding='utf-8', xml_declaration=True)

How to read the following XML and get values for "host","status","owner","user-template-01" and "test-id"?

XML = """<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<Entities TotalResults="101" PageSize="100" PageNumber="1">
<Entity Type="run">
<Fields>
<Field Name="host">
<Value>osdc-vw64</Value>
</Field>
<Field Name="status">
<Value>Passed</Value>
</Field>
<Field Name="owner">
<Value>Aspeg</Value>
</Field>
<Field Name="user-template-01">
<Value>1941896</Value>
</Field>
<Field Name="test-id">
<Value>72769</Value>
</Field>
</Fields>
</Entity>
<Entity Type="run">
<Fields>
<Field Name="host">
<Value>osdc-57</Value>
</Field>
<Field Name="status">
<Value>Passed</Value>
</Field>
<Field Name="owner">
<Value>spana</Value>
</Field>
<Field Name="user-template-01">
<Value>1941896</Value>
</Field>
<Field Name="test-id">
<Value>72769</Value>
</Field>
</Fields>
</Entity>
</Entities>"""
I have used :
from xml.etree import ElementTree as ET
root = ET.fromstring(XML)
print root.tag
I do not know how to go ahead now ...

The easiest way would be to use PyQuery (if you understand jQuery selectors):
from pyquery import PyQuery
query = PyQuery(xml);
host = query("[Name='host'] value").text()
test_id = query("[Name='test-id'] value").text()
Since you have multiple elements with Name='host', you should iterate over Entities:
from pyquery import PyQuery
def process_Entity(entity):
pass #do something
query = PyQuery(xml);
query("Entity").each(process_Entity)

import xml.etree.ElementTree as ET
tree = ET.parse('hai.xml')
root = tree.getroot()
for child in root:
print child.tag, child.attrib
for a in child:
print a.tag
for b in a:
print b.attrib , b[0].text

Using lxml.etree:
import lxml.etree as ET
XML = """ your string here """
tree = ET.fromstring(XML) # you may get errors here because of encoding
# if so, re.sub(r"\bencoding="[^"]+?", '', XML) works
info_you_need = {entity: {field.get("Name"): field.find("Value").text for field in entity.findall("Fields/Field")} for entity in tree.findall("Entity")}
N.B. I'm pretty awful with the lxml module, someone may come up with a much better solution than this :) My output was:
{<Element Entity at 0x2af4e18>: {'user-template-01': '1941896', 'owner': 'spana', 'test-id': '72769', 'status': 'Passed', 'host': 'osdc-57'},
<Element Entity at 0x2af4e40>: {'user-template-01': '1941896', 'owner': 'Aspeg', 'test-id': '72769', 'status': 'Passed', 'host': 'osdc-vw64'}}

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

Remove node from XML node based an attribute value in Python - python

Related

How to Extract the Information from XML Soap Response?

modify node and extract data from xml file in python

How do you properly fetch from this nested XML?

How can I preserve whitespaces with python 2.7 lxml?

How to read the following XML and get values for "host","status","owner","user-template-01" and "test-id"?

Categories

Resources