creating xml tree from a textfile with Python - python

I need to avoid creating double branches in an xml tree when parsing a text file. Let's say the textfile is as follows (the order of lines is random):
branch1:branch11:message11
branch1:branch12:message12
branch2:branch21:message21
branch2:branch22:message22
So the resulting xml tree should have a root with two branches. Both of those branches have two subbranches. The Python code I use to parse this textfile is as follows:
# Read every line of the input text file into a list.
import string
fh = open ('xmlbasic.txt', 'r')
allLines = fh.readlines()
fh.close()
import xml.etree.ElementTree as ET
root = ET.Element('root')
# Each line is split on ':'; the first two parts are used as element tags
# and the third as the text of the inner element.
for line in allLines:
tempv = line.split(':')
# SubElement always appends a new child, so a tag that was already seen on
# an earlier line is added to the tree again.
branch1 = ET.SubElement(root, tempv[0])
branch2 = ET.SubElement(branch1, tempv[1])
branch2.text = tempv[2]
# Wrap the root element in an ElementTree and write it to disk.
tree = ET.ElementTree(root)
tree.write('xmlbasictree.xml')
The problem with this code is that it creates a new branch in the XML tree for every line of the text file.
Any suggestions how to avoid creating another branch in xml tree if a branch with this name exists already?

import xml.etree.ElementTree as ET


def build_tree(lines):
    """Build an ElementTree from ``head:subhead:text`` lines without duplicates.

    Args:
        lines: Iterable of strings (for example an open text file). Each
            non-blank line holds a first-level tag, a second-level tag and a
            text value separated by ``:``; the text may itself contain ``:``.

    Returns:
        An ``ElementTree`` whose ``root`` element has one child per distinct
        first-level tag, each with one child per distinct second-level tag.
        If a pair occurs more than once, the last text value wins.

    Raises:
        ValueError: If a non-blank line has fewer than three fields.
    """
    root = ET.Element('root')
    for line in lines:
        if not line.strip():
            continue  # ignore blank lines such as a trailing newline
        # Drop the line terminator so it does not end up in the element text.
        head, subhead, tail = line.rstrip("\r\n").split(":", 2)
        head_branch = root.find(head)
        # Compare with None: an Element without children is falsy, so a plain
        # truth test would treat an existing leaf element as missing.
        if head_branch is None:
            head_branch = ET.SubElement(root, head)
        subhead_branch = head_branch.find(subhead)
        if subhead_branch is None:
            subhead_branch = ET.SubElement(head_branch, subhead)
        subhead_branch.text = tail
    return ET.ElementTree(root)


if __name__ == "__main__":
    # Iterate the file object directly: read() returns one string, and
    # looping over a string yields single characters rather than lines.
    with open("xmlbasic.txt") as lines_file:
        tree = build_tree(lines_file)
    ET.dump(tree)
The logic is simple -- you already stated it in your question! You merely need to check whether a branch already exists in the tree before creating it.
Note that this is likely inefficient, since find() scans the existing child elements again for each line. This is because ElementTree is not designed for uniqueness.
If you require speed (which you may not, especially for smallish trees!), a more efficient way would be to use a defaultdict to store the tree structure before converting it to an ElementTree.
import collections
import xml.etree.ElementTree as ET


def build_tree_from_lines(lines):
    """Build an ElementTree from ``head:subhead:text`` lines via a nested dict.

    The lines are first collected into ``{head: {subhead: text}}`` so that
    duplicates collapse through dictionary keys, then converted to elements.

    Args:
        lines: Iterable of strings (for example an open text file). Each
            non-blank line holds two tags and a text value separated by ``:``.

    Returns:
        An ``ElementTree`` rooted at a ``root`` element. If a head/subhead
        pair occurs more than once, the last text value wins.

    Raises:
        ValueError: If a non-blank line has fewer than three fields.
    """
    root_dict = collections.defaultdict(dict)
    for line in lines:
        if not line.strip():
            continue  # ignore blank lines
        # Strip the line terminator so it is not stored as part of the text.
        head, subhead, tail = line.rstrip("\r\n").split(":", 2)
        root_dict[head][subhead] = tail

    root = ET.Element('root')
    for head, branch in root_dict.items():
        head_element = ET.SubElement(root, head)
        for subhead, tail in branch.items():
            ET.SubElement(head_element, subhead).text = tail
    return ET.ElementTree(root)


if __name__ == "__main__":
    # Iterate the file object directly: read() returns one string, and
    # looping over a string yields single characters rather than lines.
    with open("xmlbasic.txt") as lines_file:
        tree = build_tree_from_lines(lines_file)
    ET.dump(tree)

Something along these lines? Keep the first-level branches that should be reused in a dict.
# Maps each first-level tag to the element already created for it, so that
# element is reused instead of being appended to root a second time.
b1map = {}
for line in allLines:
    # Remove the line terminator so it is not stored in the element text.
    tempv = line.rstrip('\r\n').split(':')
    branch1 = b1map.get(tempv[0])
    if branch1 is None:
        branch1 = b1map[tempv[0]] = ET.SubElement(root, tempv[0])
    # Second-level elements are not cached: one is appended for every line.
    branch2 = ET.SubElement(branch1, tempv[1])
    branch2.text = tempv[2]

Related

Group branches in an XML tree with Python on a common field

I have a list of order details in a CSV, and want to join all items from the lines together on one order.
Example data is:
Order|Line|Item|Price
123456789|1|IK123456|199.99
654987321|1|MASGE12385|29.95
654987321|2|KLEAN458792|9.99
654987321|3|LP12489|1959.95
I want everything to be listed in an XML with the root as the Order Number, Child as the Line Number and Sub-Children as Item and Price.
I want the output to look like:
<Order number = "123456789">
<Line number = "1">
<Item>IK123456</Item>
<Price>199.99</Price>
</Line>
</Order>
<Order number = "654987321">
<Line number = "1">
<Item>MASGE12385</Item>
<Price>29.95</Price>
</Line>
<Line number = "2">
<Item>KLEAN458792</Item>
<Price>9.99</Price>
</Line>
<Line number = "3">
<Item>LP12489</Item>
<Price>1959.95</Price>
</Line>
</Order>
Here is my code:
import csv
import xml.etree.ElementTree as ET
# Path of the pipe-delimited input file.
file = 'C:/github.txt'
with open (file, 'r') as f:
reader = csv.reader(f, delimiter = '|')
# Consume the header row so iteration continues with the first data row.
header = next(reader)
# NOTE(review): a csv.reader object is an iterator and cannot be indexed,
# so reader[0] raises TypeError.
order_num = reader[0]
root = ET.Element("Order") #BUILD A ROOT FOR THE XML TREE
root.set('number', order_num) #ADD ATTRIBUTE
for row in reader: #ITERATE THROUGH EACH ROW AND POPULATE DATA IN BRANCHES OF XML TREE
# NOTE(review): the fields below index `reader` rather than the current `row`.
line = ET.SubElement(root, 'line', number= reader[1])
# NOTE(review): 'item code' contains a space, which is not allowed in an XML tag name.
item = ET.SubElement(line, 'item code')
item.text = reader[2]
price = ET.SubElement(line, 'price')
price.text = reader[3]
# Wrap the root element and write it out with an XML declaration.
tree = ET.ElementTree(root)
tree.write('C:/github.xml', encoding = 'utf-8', xml_declaration = True)
(NOTE: I moved something and got an error, but not sure what happened)
During loop, consider keeping a tracker on Number to conditionally decide to create an element and keep related underlying items together. Additionally, consider csv.DictReader to iterate csv rows as a dictionary which takes first row headers as keys. Finally, use the built-in minidom to pretty print output. Below will incorporate all XML items under the single <Orders> root:
import csv
import xml.etree.ElementTree as ET
import xml.dom.minidom as mn
# Pipe-delimited input file. A new <Order> element is started whenever the
# Order value differs from the previous row, so rows belonging to the same
# order must be adjacent in the file.
file = 'C:/github.txt'
curr_order = None

# newline='' lets the csv module do its own line-ending handling.
with open(file, 'r', newline='') as f:
    reader = csv.DictReader(f, delimiter='|')

    # BUILD A ROOT FOR THE XML TREE
    root = ET.Element("Orders")

    # ITERATE THROUGH EACH ROW AS DICTIONARY
    for d in reader:
        # CONDITIONALLY BUILD ORDER ELEMENT
        if curr_order != str(d['Order']):
            curr_order = str(d['Order'])
            # The number attribute is set once, when the order is created.
            orderElem = ET.SubElement(root, "Order", number=curr_order)

        # CREATE DESCENDANTS OF ORDER
        line = ET.SubElement(orderElem, 'line', number=str(d['Line']))
        ET.SubElement(line, 'item_code').text = str(d['Item'])
        ET.SubElement(line, 'price').text = str(d['Price'])

# PRETTY PRINT OUTPUT
dom = mn.parseString(ET.tostring(root, encoding='utf-8'))

# toprettyxml returns bytes when an encoding is given, hence binary mode.
with open('C:/github.xml', 'wb') as f:
    f.write(dom.toprettyxml(indent=" ", encoding='utf-8'))
Online Demo

trying to extract values from the same attribute name from full XML files but only gets 1st line value

I am new to python and XML.
attaching my code
from xml.etree import ElementTree as ET
import glob
def extract_points(filename):
# Parse the XML file `filename` and return (pointss, height, width).
tree = ET.parse(filename)
root = tree.getroot()
pointss = list()
# Visit every <points> element that is a direct child of a <shapes> element.
for pnts in root.findall('.//shapes/points'):
# find() returns only the first matching child, so a single 'row' value is read here.
pt1 = float(pnts.find('row').text)
# NOTE(review): this looks for a child tagged 'row1'; if the file only has
# repeated 'row' children, find() returns None here — confirm against the XML.
pt2 = float(pnts.find('row1').text)
coors = [pt1,pt2]
pointss.append(coors)
# Image dimensions are taken from the first imageHeight / imageWidth elements found.
height = int(root.find('.//imageHeight').text)
width = int(root.find('.//imageWidth').text)
return pointss,height,width
# Collect the path of every .xml file in the folder.
txtfiles = []
for file in glob.glob("C:/Users/MSI/Desktop/xmlfiles/*.xml"):
txtfiles.append(file)
# Run extract_points on each collected file.
for f in txtfiles:
pointss,height,width = extract_points(f)
part of my XML file is:
PYTH
When I run the code I only get the first row value instead of both rows under the points tag.
I want both values to be part of the result, i.e. [[1019.2222222222223, 304.974358974359]], but I only get [1019.2222222222223]. Please help.
Based on your xml here is working code
from xml.etree import ElementTree as ET
def extract_points(filename):
    """Return ``(points, height, width)`` read from the XML file *filename*.

    ``points`` contains one list per ``<points>`` element found anywhere in
    the document; each inner list holds the text of every child of that
    element converted to ``float``.  ``height`` and ``width`` are the integer
    values of the first ``imageHeight`` and ``imageWidth`` elements found.
    """
    tree = ET.parse(filename)
    root = tree.getroot()
    points = list()
    for points_ele in root.findall('.//points'):
        # Start a new coordinate list for this <points> element ...
        points.append([])
        # ... and add every child value, however many rows it has.
        for row in points_ele:
            points[-1].append(float(row.text))
    height = int(root.find('.//imageHeight').text)
    width = int(root.find('.//imageWidth').text)
    return points, height, width
# Example call: prints the (points, height, width) tuple for data.xml.
print(extract_points('data.xml'))

How to Exit Python Code when XML File is empty (XML to CSV conversion)

If the XML file is empty, i.e. the root element has no child records, how can this be handled in Python? When there are XML records, the Python code below works fine, but whenever the XML file is empty, the conversion process fails. I am looking for a clean exit instead of failing the process.
XML:
<?xml version = '1.0' encoding = 'UTF-8'?>
<ns2:exportEmpData xmlns:ns2="http://webservice.example.com/"/>
Python Code:
import xml
import csv
import xml.etree.ElementTree as ET
# Parse the employee XML file and take its root element.
tree = ET.parse('C:/emp/emplist.xml')
root = tree.getroot()
# open a file for writing
# NOTE(review): binary mode 'wb' is the Python 2 csv convention; on Python 3
# csv.writer needs a text-mode file — confirm the target version.
Emp_data = open('C:/emp/emplist.csv', 'wb')
# create the csv writer object
csvwriter = csv.writer(Emp_data)
emp_head = []
# count tracks whether the header row has been written yet.
count = 0
# findall returns an empty list when root has no direct 'emplist' children,
# in which case nothing below is written.
for member in root.findall('emplist'):
emp_nodes = []
if count == 0:
# The header row is built from the tag names of the first record's fields.
empId = member.find('empId').tag
emp_head.append(empId)
fullName = member.find('fullName').tag
emp_head.append(fullName)
currentAddress = member.find('currentAddress').tag
emp_head.append(currentAddress)
csvwriter.writerow(emp_head)
count = count + 1
# Data row: element text for empId and fullName, the 'city' attribute for currentAddress.
empId = member.find('empId').text
emp_nodes.append(empId)
fullName = member.find('fullName').text
emp_nodes.append(fullName)
currentAddress = member.find('currentAddress').attrib.get('city')
emp_nodes.append(currentAddress)
csvwriter.writerow(emp_nodes)
Emp_data.close()

Create a dataframe from an xml file

I have a real (and maybe pretty stupid) problem converting an XML file into a pandas DataFrame. I'm new to Python and need some help. I tried code from another thread and modified it, but it does not work.
I want to iterate through this file:
<objects>
<object id="123" name="some_string">
<object>
<id>123</id>
<site id="456" name="somename" query="some_query_as_string"/>
<create-date>some_date</create-date>
<update-date>some_date</update-date>
<update-user id="567" name="User:xyz" query="some_query_as_string"/>
<delete-date/>
<delete-user/>
<deleted>false</deleted>
<system-object>false</system-object>
<to-string>some_string_notifications</to-string>
</object>
<workflow>
<workflow-type id="12345" name="WorkflowType_some_workflow" query="some_query_as_string"/>
<validated>true</validated>
<name>somestring</name>
<exported>false</exported>
</workflow>
Here is my code:
import xml.etree.ElementTree as ET
import pandas as pd
# Read the XML file as raw bytes.
path = "C:/Users/User/Desktop/test.xml"
with open(path, 'rb') as fp:
content = fp.read()
# Parse the bytes with an explicit UTF-8 parser; fromstring returns the root Element.
parser = ET.XMLParser(encoding="utf-8")
tree = ET.fromstring(content, parser=parser)
def xml2df(tree):
# Build a DataFrame from the children of the root: each record maps the tag
# of a grandchild element to its text.
# NOTE(review): ET.XML() parses XML text, while the module-level `tree`
# above is already an Element returned by ET.fromstring — confirm what
# callers are expected to pass in.
root = ET.XML(tree)
all_records = []
for i, child in enumerate(root):
record ={}
for subchild in child:
record[subchild.tag] = subchild.text
all_records.append(record)
return pd.DataFrame(all_records)
Where is the problem? Please help :O
You are passing the file location string to ET.fromstring(), which is not the actual contents of the file. You need to read the contents of the file first, then pass that to ET.fromstring().
path = "C:/Users/User/Desktop/test.xml"

# Read the file contents first; fromstring expects XML data, not a file path.
with open(path, 'rb') as fp:
    content = fp.read()

parser = ET.XMLParser(encoding="utf-8")
# fromstring returns the root Element of the parsed document.
tree = ET.fromstring(content, parser=parser)

Xml write twice the same thing

I'm trying to solve this problem.
When I write to the XML file, the same element is written twice.
It's the code:
def writeIntoXml(fileName, tagElement, textElement):
# Load the XML file, add a new element named `tagElement` with text
# `textElement` under the root, and write the file back as UTF-8.
tree = ET.ElementTree(file = fileName)
root = tree.getroot()
# SubElement creates the element and already appends it to root.
newElement = ET.SubElement(root, tagElement)
newElement.text =textElement;
newElement.tail ="\n"
# NOTE(review): this appends the same element to root a second time.
root.append(newElement)
tree.write(fileName, encoding='utf-8')
If I have this XML file with these tags and I write a new tag (e.g. <Question-3>Example3</Question-3>), I get a problem.
XmlFile before being written:
<Questions>
<Question-1>Example1</Question-1>
<Question-2>Example2</Question-2>
</Questions>
XmlFile after being written:
<Questions>
<Question-1>Example1</Question-1>
<Question-2>Example2</Question-2>
<Question-3>Example3</Question-3>
<Question-3>Example3</Question-3>
</Questions>
Sorry for grammatical errors
Note that ET.SubElement() appends the element automatically. You are adding the element twice, first in SubElement(), next in append().
You should use either just
# SubElement creates the element and attaches it to root in one step,
# so no separate append() call is needed.
newElement = ET.SubElement(root, tagElement)
newElement.text = textElement
newElement.tail = "\n"
or
# Element creates a detached node, so it has to be attached to root
# explicitly with append().
newElement = ET.Element(tagElement)
newElement.text = textElement
newElement.tail = "\n"
root.append(newElement)

Categories