How to parse a .txt file into .xml? - python

This is my txt file:
In File Name: C:\Users\naqushab\desktop\files\File 1.m1
Out File Name: C:\Users\naqushab\desktop\files\Output\File 1.m2
In File Size: Low: 22636 High: 0
Total Process time: 1.859000
Out File Size: Low: 77619 High: 0
In File Name: C:\Users\naqushab\desktop\files\File 2.m1
Out File Name: C:\Users\naqushab\desktop\files\Output\File 2.m2
In File Size: Low: 20673 High: 0
Total Process time: 3.094000
Out File Size: Low: 94485 High: 0
In File Name: C:\Users\naqushab\desktop\files\File 3.m1
Out File Name: C:\Users\naqushab\desktop\files\Output\File 3.m2
In File Size: Low: 66859 High: 0
Total Process time: 3.516000
Out File Size: Low: 217268 High: 0
I am trying to parse this to an XML format like this:
<?xml version='1.0' encoding='utf-8'?>
<root>
<filedata>
<InFileName>File 1.m1</InFileName>
<OutFileName>File 1.m2</OutFileName>
<InFileSize>22636</InFileSize>
<OutFileSize>77619</OutFileSize>
<ProcessTime>1.859000</ProcessTime>
</filedata>
<filedata>
<InFileName>File 2.m1</InFileName>
<OutFileName>File 2.m2</OutFileName>
<InFileSize>20673</InFileSize>
<OutFileSize>94485</OutFileSize>
<ProcessTime>3.094000</ProcessTime>
</filedata>
<filedata>
<InFileName>File 3.m1</InFileName>
<OutFileName>File 3.m2</OutFileName>
<InFileSize>66859</InFileSize>
<OutFileSize>217268</OutFileSize>
<ProcessTime>3.516000</ProcessTime>
</filedata>
</root>
Here is the code (I am using Python 2) in which I am trying to achieve that:
import re
import xml.etree.ElementTree as ET

# NOTE: under re.VERBOSE, unescaped whitespace inside the pattern is ignored,
# so literal titles such as 'In File Name:' could never match.  The pattern
# below therefore avoids VERBOSE mode and groups the alternation explicitly.
rex = re.compile(r'(?P<title>In File Name|Out File Name|In File Size: *Low'
                 r'|Total Process time|Out File Size: *Low): *(?P<value>.*)')

root = ET.Element('root')
root.text = '\n'  # newline before the celldata element

with open('Performance.txt') as f:
    celldata = ET.SubElement(root, 'filedata')
    celldata.text = '\n'  # newline before the collected element
    celldata.tail = '\n\n'  # empty line after the celldata element
    for line in f:
        # Empty line starts new celldata element (hack style, uggly)
        if line.isspace():
            celldata = ET.SubElement(root, 'filedata')
            celldata.text = '\n'
            celldata.tail = '\n\n'
        # If the line contains the wanted data, process it.
        m = rex.search(line)
        if m:
            # Fix some problems with the title as it will be used as the
            # tag name: '&', ' ' and ':' are not legal in an XML tag.
            title = m.group('title')
            title = title.replace('&', '')
            title = title.replace(':', '')
            title = title.replace(' ', '')
            e = ET.SubElement(celldata, title.lower())
            e.text = m.group('value').strip()
            e.tail = '\n'

# Display for debugging
ET.dump(root)
# Include the root element to the tree and write the tree to the file.
tree = ET.ElementTree(root)
tree.write('Performance.xml', encoding='utf-8', xml_declaration=True)
But I am getting empty values, is it possible to parse this txt to XML?

A correction with your regex: It should be
m = re.search('(?P<title>(In File Name)|(Out File Name)|(In File Size: *Low)|(Total Process time)|(Out File Size: *Low)):(?P<value>.*)',line)
and not as what you've given. Because in your regex, In File Name|Out File Name means it will check for In File Nam followed by e, or O followed by ut File Name, and so on.
Suggestion,
You can do it without using regex.
xml.dom.minidom can be used for prettifying your xml string.
I've added the comments inline for better understanding!
Node.toprettyxml([indent=""[, newl=""[, encoding=""]]])
Return a pretty-printed version of the document. indent specifies the indentation string and defaults to a tabulator; newl specifies the string emitted at the end of each line and defaults to a newline.
Edit
import itertools as it
[line[0] for line in it.groupby(lines)]
You can use groupby from the itertools package to collapse consecutive duplicate entries in the list lines.
So,
import itertools as it
import xml.dom.minidom as minidom
import xml.etree.ElementTree as ET

root = ET.Element('root')
with open('file1.txt') as f:
    lines = f.read().splitlines()

# First record element; subsequent records are started by blank input lines.
celldata = ET.SubElement(root, 'filedata')

# groupby collapses runs of identical consecutive lines (e.g. several blank
# lines in a row) into one representative, so one blank gap == one new record.
for line, _group in it.groupby(lines):
    if not line:
        # A break between records: start the next <filedata> element.
        celldata = ET.SubElement(root, 'filedata')
    else:
        # The tag name is the text before the first ':', spaces removed.
        parts = line.split(":")
        el = ET.SubElement(celldata, parts[0].replace(" ", ""))
        value = ' '.join(parts[1:]).strip()
        if 'File Name' in line:
            # Keep only the file name portion of the Windows path.
            value = line.split("\\")[-1].strip()
        elif 'File Size' in line:
            # BUG FIX (Py3 compat): filter() is lazy on Python 3, so wrap it
            # in list() before calling .index().
            splist = list(filter(None, line.split(" ")))
            value = splist[splist.index('Low:') + 1]
            #splist[splist.index('High:')+1]
        el.text = value

# Prettify the xml string.  toprettyxml(encoding=...) returns bytes.
formatedXML = minidom.parseString(
    ET.tostring(
        root)).toprettyxml(indent=" ", encoding='utf-8').strip()

# Display for debugging (function-call form works on Python 2 and 3).
print(formatedXML)

# BUG FIX: open in binary mode -- formatedXML is a bytes object on Python 3.
with open("Performance.xml", "wb+") as f:
    f.write(formatedXML)
Output:
Performance.xml
<?xml version="1.0" encoding="utf-8"?>
<root>
<filedata>
<InFileName>File 1.m1</InFileName>
<OutFileName>File 1.m2</OutFileName>
<InFileSize>22636</InFileSize>
<TotalProcesstime>1.859000</TotalProcesstime>
<OutFileSize>77619</OutFileSize>
</filedata>
<filedata>
<InFileName>File 2.m1</InFileName>
<OutFileName>File 2.m2</OutFileName>
<InFileSize>20673</InFileSize>
<TotalProcesstime>3.094000</TotalProcesstime>
<OutFileSize>94485</OutFileSize>
</filedata>
<filedata>
<InFileName>File 3.m1</InFileName>
<OutFileName>File 3.m2</OutFileName>
<InFileSize>66859</InFileSize>
<TotalProcesstime>3.516000</TotalProcesstime>
<OutFileSize>217268</OutFileSize>
</filedata>
</root>
Hope it helps!

From the docs (emphasis is mine):
re.VERBOSE
This flag allows you to write regular expressions that
look nicer. Whitespace within the pattern is ignored, except when in a
character class or preceded by an unescaped backslash, and, when a
line contains a '#' neither in a character class or preceded by an
unescaped backslash, all characters from the leftmost such '#' through
the end of the line are ignored.
escape spaces in the regex or use \s class

Related

Need help figuring out why my csv output is blank?

I wrote this code to create a .csv report from an .xml file, but when I open the .csv that's generated it's blank. Feel free to rip my code apart, by the way, I'm super new to this and want to learn!
There are multiple "Subjectkeys" in the xml, but only some have an "AuditRecord". I only want to pull ones with an audit record, and then for those, I want to pull their info from "SubjectData", "FormData" and "AuditRecord"
import csv
import xml.etree.cElementTree as ET

# Parse the response document and emit one csv row per <AuditRecord>.
tree = ET.parse("response.xml")
root = tree.getroot()

# NOTE(review): handle is closed manually at the end; `with` would be safer.
xml_data_to_csv =open("query.csv", 'w')

# Header-row accumulators (note the exact capitalisation of each name).
AuditRecord_head = []
SubjectData_head = []
FormData_head = []
csvwriter=csv.writer(xml_data_to_csv)
count=0
for member in root.findall("AuditRecord"):
    # Per-record value accumulators.
    AuditRecord = []
    Subjectdata = []
    FormData = []
    if count == 0:
        # Build the header row from the tag names of the first record.
        Subject = member.find("SubjectKey").tag
        # NOTE(review): 'Subjectdata_head' is undefined -- the list created
        # above is 'SubjectData_head'; this line raises NameError at runtime,
        # which is why the csv output ends up blank.
        Subjectdata_head.append(Subject)
        Form = member.find("p1Name").tag
        FormData_head.append(Form)
        Action = member.find("Action").tag
        AuditRecord_head.append(Action)
        # NOTE(review): 'Auditrecord_head' is another capitalisation typo for
        # 'AuditRecord_head'; also only the Action heading would be written,
        # not Subject/Form.
        csvwriter.writerow(Auditrecord_head)
    count = count + 1
    Subject = member.find('SubjectKey').text
    Subjectdata.append(Subject)
    Form = member.find('p1Name').text
    FormData.append(Form)
    Action = member.find("Action").text
    AuditRecord.append(Action)
    # NOTE(review): only the subject column is written; Form and Action are
    # collected but never output.
    csvwriter.writerow(Subjectdata)
xml_data_to_csv.close()
I expect the output to be a table with column headings: Subject, Form, Action.
Here is sample .xml:
</ClinicalData>
<ClinicalData StudyOID="SMK-869-002" MetaDataVersionOID="2.0">
<SubjectData SubjectKey="865-015">
</AuditRecord>
</FormData>
<FormData p1:Name="Medical History" p1:Started="Y" FormOID="mh" FormRepeatKey="0"/>
<FormData p1:Name="Medical History" p1:Started="Y" FormOID="mh" FormRepeatKey="1">
<p1:QueryAction InitialComment="Please enter start date for condition" UserType="User" UserOID="bailey#protocolfirst.com" Action="query" DateTimeStamp="2019-07-12T14:08:43.893Z"/>
</AuditRecord>
First of all your xml file has a lot of errors, to me it has to look like:
<?xml version="1.0"?>
<root xmlns:p1="http://some-url.com">
<ClinicalData StudyOID="SMK-869-002" MetaDataVersionOID="2.0"></ClinicalData>
<SubjectData SubjectKey="865-015"></SubjectData>
<AuditRecord>
<FormData p1:Name="Medical History" p1:Started="Y" FormOID="mh" FormRepeatKey="0"/>
<FormData p1:Name="Medical History" p1:Started="Y" FormOID="mh" FormRepeatKey="1"/>
<p1:QueryAction InitialComment="Please enter start date for condition" UserType="User" UserOID="bailey#protocolfirst.com" Action="query" DateTimeStamp="2019-07-12T14:08:43.893Z"/>
</AuditRecord>
</root>
ElementTree always expects only a single root node, and a well-formed document.
I do not understand very well what your trying to do, but I hope this could help you:
# cElementTree was removed in Python 3.9; plain ElementTree uses the C
# accelerator automatically since 3.3.
import xml.etree.ElementTree as ET

tree = ET.parse("response.xml")
root = tree.getroot()

# Namespace URI bound to the p1: prefix in the source document.
NS = '{http://some-url.com}'

# Use a context manager so the csv file is closed even on error.
with open("query.csv", 'w') as xml_data_to_csv:
    count = 0
    for member in root.findall("AuditRecord"):
        AuditRecord = []
        Subjectdata = []
        FormData = []
        if count == 0:
            Subjectdata.append(root.find('./SubjectData').attrib['SubjectKey'])
            for formData in root.findall('./AuditRecord/FormData'):
                FormData.append(formData.attrib[NS + 'Name'])
            AuditRecord.append(
                root.find('./AuditRecord/' + NS + 'QueryAction').attrib['Action'])
            # BUG FIX: join all collected fields (the original hard-coded
            # exactly two FormData entries) and terminate the row with '\n'.
            xml_data_to_csv.write(
                ",".join(Subjectdata + FormData + AuditRecord) + "\n")
        count = count + 1
This will produce a csv file with the following content:
865-015,Medical History,Medical History,query

Python tree.iterparse export source XML of selected element including all descendants

Python 3.4, parsing GB++ size XML Wikipedia dump files using etree.iterparse. I want to test within the current matched <page> element for its <ns> value; depending on the latter value I then want to export the source XML of the whole <page> object and all its contents, including any elements nested within it, i.e. the XML of a whole article.
I can iterate the <page> objects and find the ones I want, but then all available functions seem to want to read text/attribute values, whereas I simply want a utf8 string copy of the source file's XML code for the complete in scope <page> object. Is this possible?
A cut-down version of the XML looks like this:
<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.10/" xml:lang="en">
<page>
<title>Some Article</title>
<ns>0</ns>
<revision>
<timestamp>2017-07-27T00:59:41Z</timestamp>
<text xml:space="preserve">some text</text>
</revision>
</page>
<page>
<title>User:Wonychifans</title>
<ns>2</ns>
<revision>
<text xml:space="preserve">blah blah</text>
</revision>
</page>
</mediawiki>
The python code getting me to the <ns> value test is here:
``from lxml import etree
# store namespace string for all elements (only one used in Wikipedia XML docs)
NAMESPACE = '{http://www.mediawiki.org/xml/export-0.10/}'
ns = {'wiki' : 'http://www.mediawiki.org/xml/export-0.10/'}
context = etree.iterparse('src.xml', events=('end',))
for event, elem in context:
# at end of parsing each
if elem.tag == (NAMESPACE+'page') and event == 'end':
tagNs = elem.find('wiki:ns',ns)
if tagNs is not None:
nsValue = tagNs.text
if nsValue == '2':
# export the current <page>'s XML code
In this case I'd want to extract the XML code of only the second <page> element, i.e. a string holding:
<page>
<title>User:Wonychifans</title>
<ns>2</ns>
<revision>
<text xml:space="preserve">blah blah</text>
</revision>
</page>
edit: minor typo and better mark-up
You can do this.
>>> from lxml import etree
>>> mediawiki = etree.iterparse('mediawiki.xml')
>>> page_content = {}
>>> for ev, el in mediawiki:
... if el.tag=='page':
... if page_content['ns']=='2':
... print (page_content)
... page_content = {}
... else:
... page_content[el.tag.replace('{http://www.mediawiki.org/xml/export-0.10/}', '')] = \
... el.text.strip() if el.text else None
...
>>> page_content
{'mediawiki': '', 'revision': '', 'timestamp': '2017-07-27T00:59:41Z', 'title': 'User:Wonychifans', 'page': '', 'text': 'blah blah', 'ns': '2'}
Because the structure of the output xml is quite simple there should be no difficulty in constructing it from the dictionary.
Edit: Although this approach requires two passes through the xml file it could be faster and it does recover the required xml.
First, look for the starting lines of the page elements.
>>> from lxml import etree
>>> mediawiki = etree.iterparse('mediawiki.xml', events=("start", "end"))
>>> for ev, el in mediawiki:
... tag = el.tag[1+el.tag.rfind('}'):]
... if ev=='start' and tag=='page':
... keep=False
... if ev=='start' and tag=='ns' and el.text=='2':
... keep=True
... if ev=='end' and tag=='page' and keep:
... print (el.sourceline)
...
10
The go through the xml again to find the complete page entries using the starting points.
>>> with open('mediawiki.xml') as mediawiki:
... for _ in range(9):
... r = next(mediawiki)
... for line in mediawiki:
... print (line.strip())
... if '</page>' in line:
... break
...
<page>
<title>User:Wonychifans</title>
<ns>2</ns>
<revision>
<text xml:space="preserve">blah blah</text>
</revision>
</page>
I've marked Bill Bell's answer as accepted as it was instrumental in me getting to my final solution, the core of which is below. The outer loop lets me loop though over 50 source XML files.
As some sources are large, the code checks in-loop for the copied source data exceeding 1GB. If so, a write of data to file occurs and the buffer string variable is purged. Otherwise all extracted data is written at the end of reading the source file(s).
Further polish would be to monitor the size of the output file and switch output sources once a given size were exceeded. In this case, it was easier to only scan some of the whole source set per run of the script.
I've removed some logging & print statements for brevity:
<!-- language: lang-python -->
import sys

dataSourceStr = '/Users/x/WP-data/'
outputDataStr = '/Users/x/WP-data/ns-data/'

# One-off XML header written before the first extracted page; the footer
# closes the combined document at the very end.
with open("header.txt", "r") as headfile:
    headStr = headfile.read()
footStr = '</mediawiki>'

matchCount = 0
nameSpaceValue = 4
startNum = 41  # starting file number
lastNum = 53  # ending file number
endNum = lastNum + 1
outputDataFile = outputDataStr + 'ns' + str(nameSpaceValue) + '.xml'

# Accumulation buffer, flushed to disk whenever it exceeds 1 GB.
# BUG FIX: the original reset strPage to '' at the top of every source file,
# which discarded the header and any pages accumulated but not yet flushed.
strPage = headStr
nsMarker = '<ns>' + str(nameSpaceValue) + '</ns>'

for fileNum in range(startNum, endNum):
    with open(dataSourceStr + str(fileNum) + '.xml') as mediawiki:
        blnKeep = False
        strItem = ''
        loopMatchCount = 0  # per-file match count (kept for logging)
        # Skip the fixed 44-line header of each dump file.
        for _ in range(44):
            next(mediawiki)
        for line in mediawiki:
            if nsMarker in line:
                blnKeep = True
                matchCount = matchCount + 1
                loopMatchCount = loopMatchCount + 1
            strItem = strItem + line
            if '</page>' in line:
                if blnKeep:
                    # The whole <page>...</page> span had the wanted <ns>.
                    strPage = strPage + strItem
                    blnKeep = False
                    # Flush the buffer once it exceeds 1 GB.
                    if sys.getsizeof(strPage) > 1073741824:
                        with open(outputDataFile, "a") as out:
                            out.write(strPage)
                        strPage = ''
                strItem = ''
    # BUG FIX: removed the no-op `mediawiki.close` (missing call parentheses,
    # and redundant inside the `with` block anyway).

# Write whatever is still buffered, then close the document with the footer.
with open(outputDataFile, "a") as out:
    out.write(strPage)
    out.write(footStr)
I'm sure this could be more elegant but I hope this helps any fellow non-experts arriving here and trying to do this sort of thing.

Define an XML schema from a txt file With Python script and store it in a single XML file

I am trying to write an xml schema using python from a .txt file. I tried the following code but it didn't read the values in lines of the text.
The data is like:
# File format is Team:Player:Salary:Position
New York Yankees :"Acevedo Juan" : 900000: Pitcher
New York Yankees :"Anderson Jason": 300000: Pitcher
............
and the code:
import re
import xml.etree.ElementTree as ET

root = ET.Element('root')
root.text = '\n' # newline before the celldata element

# NOTE(review): the handle is never closed; a `with` block would be safer.
f = open("C:/baseball.txt")
lines = f.readlines()
for l in lines:
    # Expected record format: Team:Player:Salary:Position
    elems = l.split(":")
    if len(elems) == 4:
        elems = map(lambda x: x.strip(), elems)
        playerdata = ET.SubElement(root, "playerdata")
        playerdata.text = '\n'
        playerdata.tail = '\n\n'
        # NOTE(review): the four subelements are created, but their .text is
        # never assigned from `elems` -- this is why the output contains only
        # empty <team/>, <player/>, <salary/> and <position/> tags.
        team = ET.SubElement(playerdata, "team")
        player = ET.SubElement(playerdata, "player")
        salary = ET.SubElement(playerdata, "salary")
        position = ET.SubElement(playerdata, "position")

# Display for debugging, then write the document out.
ET.dump(root)
tree = ET.ElementTree(root)
tree.write("test1.xml", encoding='utf-8', xml_declaration=True)
The output I got looks like this:
<root>
<playerdata>
<team /><player /><salary /><position /></playerdata>
<playerdata>
<team /><player /><salary /><position /></playerdata>
<playerdata>
<team /><player /><salary /><position /></playerdata>
.
.
.
<playerdata>
<team /><player /><salary /><position /></playerdata>
</root>
A few changes, plus the fix. The changes are:
Using a context manager to ensure the input file gets closed
correctly.
Using the fact that iterating a file yields each line in
turn (rather than f.readlines()).
Combining the splitting and stripping of the parts of the line into one statement.
The fix is to set the text value for the subelements just after they are created.
import xml.etree.ElementTree as ET

# Build <root><playerdata><team/>... from colon-separated player records.
root = ET.Element('root')
root.text = '\n' # newline before the celldata element

with open("C:/baseball.txt") as f:
    for raw in f:
        # Split on ':' and strip each part in one pass.
        fields = [part.strip() for part in raw.split(":")]
        # Guard clause: skip anything that is not a four-field record.
        if len(fields) != 4:
            continue
        playerdata = ET.SubElement(root, "playerdata")
        playerdata.text = '\n'
        playerdata.tail = '\n\n'
        # Create each subelement and set its text from the matching field.
        for tag, value in zip(("team", "player", "salary", "position"),
                              fields):
            child = ET.SubElement(playerdata, tag)
            child.text = value

ET.dump(root)
tree = ET.ElementTree(root)
tree.write("test1.xml", encoding='utf-8', xml_declaration=True)

Compare 2 files in python and print non-matching text

I have two files:
Resp.txt:
vrf XXX
address-family ipv4 unicast
import route-target
123:45
212:43
!
export route-policy ABCDE
export route-target
9:43
!
maximum prefix 12 34
spanning tree enable
bandwidth 10
!
!
and sample.txt
vrf
address-family ipv4 unicast
import route-target
export route-target
maximum prefix
I want to match resp.txt and sample.txt such that if contents of sample are not present in resp, I get those lines of text. The output should be like:
spanning tree enable
bandwidth 10
I am using :
# NOTE(review): neither handle is closed; more importantly `abc` is a file
# object, so the inner loop exhausts it during the FIRST outer iteration --
# every later `x` from sample.txt compares against nothing.
t2=open('sample.txt','r')
abc=open('resp.txt','r')
for x in t2:
    for line in abc:
        # NOTE(review): this tests sample-line-contained-in-resp-line and
        # prints something for every pairing, instead of collecting the
        # resp.txt lines that match nothing in sample.txt.
        if x.strip() in line.strip():
            print 'yes'
        else:
            print line
But it's matching every line in both the text files and hence, not showing the correct result.
So the simplest solution to get all the strings not in sample.txt is to use set difference:
# Collect the stripped lines of each file into a set, then print the lines
# that appear in Resp.txt but not in Sample.txt.
with open('Resp.txt', 'r') as f:
    file_1 = {line.strip() for line in f}

with open('Sample.txt', 'r') as f:
    file_2 = {line.strip() for line in f}

print(file_1 - file_2)
Which returns:
{'export route-policy ABCDE', 'vrf XXX', 'spanning tree enable', '!', '212:43', 'bandwidth 10', 'maximum prefix 12 34', '9:43', '123:45'}
However, this doesn't include certain rules applied to Resp.txt, for example:
If line is "maximum prefix" ignore the numbers.
These rules can be applied while reading Resp.txt:
import re

file_1 = set()
file_2 = set()

# Read Resp.txt, normalising lines so they can be compared against the
# template lines in Sample.txt.
with open('Resp.txt', 'r') as f:
    for line in f:
        line = line.strip()
        if line == "!":
            # Separator lines carry no configuration.
            continue
        elif re.match(r'\d+:\d+', line):  # route-target values like 123:45
            continue
        elif line.startswith("vrf"):
            # Compare 'vrf XXX' against the bare 'vrf' template line.
            line = "vrf"
        elif line.startswith("maximum prefix"):
            # Ignore the numbers after 'maximum prefix'.
            line = "maximum prefix"
        file_1.add(line)

with open('Sample.txt', 'r') as f:
    for line in f:
        file_2.add(line.strip())

# BUG FIX: the original read `print(file_1) - file_2)` -- an unbalanced
# parenthesis that is a syntax error; the set difference belongs inside.
print(file_1 - file_2)
Which returns:
{'export route-policy ABCDE', 'bandwidth 10', 'spanning tree enable'}
Which is correct because sample.txt does not contain route-policy.
These rules could be made more robust, but they should be illustrative enough.
Keep in mind set will only find unique differences, and not all (say you have multiple 'spanning tree enable' lines and would like to know how many times these are seen. In that case, you could do something more in line with your original code:
import re

# Same rules as the set version, but lists preserve duplicates and order.
file_1 = []
file_2 = []

with open('Resp.txt', 'r') as f:
    for raw in f:
        entry = raw.strip()
        # Separator lines and route-target values carry no configuration.
        if entry == "!":
            continue
        if re.match(r'\d+:\d+', entry):
            continue
        # Normalise parameterised lines down to their template form.
        if entry.startswith("vrf"):
            entry = "vrf"
        elif entry.startswith("maximum prefix"):
            entry = "maximum prefix"
        file_1.append(entry)

with open('Sample.txt', 'r') as f:
    file_2 = [raw.strip() for raw in f]

# Keep every Resp.txt entry that has no counterpart in Sample.txt.
diff = [entry for entry in file_1 if entry not in file_2]
print(diff)
Result:
['export route-policy ABCDE', 'spanning tree enable', 'bandwidth 10']
While this method is slower (although you probably won't notice), it can find duplicate lines and maintains the order of the lines found.

Adding new XML node and Pretty printing XML in python

I can add the XML node using the ElementTree, but this returns the output in one single line instead of a tree structure look alike when I open the xml file in text format. I also tried using the minidom.toprettyxml but I do not know how to add the output to original XML. Since I would like the script to be reproducible in other environments, I prefer not using external libraries such as lxml. Can someone please help how I can pretty print the output? - python 2.7
The Sample XML. This is how it looks both in text format and Explorer.
<?xml version="1.0" encoding="utf-8"?>
<default_locators >
<locator_ref>
<name>cherry</name>
<display_name>cherrycherry</display_name>
<workspace_properties>
<factory_progid>Workspace</factory_progid>
<path>InstallDir</path>
</workspace_properties>
</locator_ref>
</default_locators>
Expected Output in both text format and Explorer.
<?xml version="1.0" encoding="utf-8"?>
<default_locators >
<locator_ref>
<name>cherry</name>
<display_name>cherrycherry</display_name>
<workspace_properties>
<factory_progid>Workspace</factory_progid>
<path>InstallDir</path>
</workspace_properties>
</locator_ref>
<locator_ref>
<name>berry</name>
<display_name>berryberry</display_name>
<workspace_properties>
<factory_progid>Workspace</factory_progid>
<path>C:\temp\temp</path>
</workspace_properties>
</locator_ref>
</default_locators>
My script
#coding: cp932
import xml.etree.ElementTree as ET

# Load the existing document and append one more <locator_ref> record.
tree = ET.parse(r"C:\DefaultLocators.xml")
root = tree.getroot()

new_locator = ET.SubElement(root, "locator_ref")
# Simple leaf elements directly under <locator_ref>.
for tag, value in (("name", " berry"),
                   ("display_name", "berryberry")):
    ET.SubElement(new_locator, tag).text = value
# Nested <workspace_properties> block.
props = ET.SubElement(new_locator, "workspace_properties")
ET.SubElement(props, "factory_progid").text = "Workspace"
ET.SubElement(props, "path").text = r"c:\temp\temp"

# Write the modified tree back over the original file.
tree.write(r"C:\DefaultLocators.xml", encoding='utf-8')
Returned output. After running my script, new nodes are added to my sample.xml file, but it returns output in one single line, with all newlines and indents removed from the original sample.xml file. At least thats how it looks when I open the sample.xml file in text format. However, When I open the sample.xml file in Explorer, it looks fine. I still see the newlines and indents as they were before. How can I keep the original tree structure in text format even after running the script?
<default_locators>
<locator_ref>
<name>cherry</name>
<display_name>cherrycherry</display_name>
<workspace_properties>
<factory_progid>Workspace</factory_progid>
<path>InstallDir</path>
</workspace_properties>
</locator_ref>
<locator_ref><name> berry</name><display_name>berryberry</display_name><workspace_properties><factory_progid>Workspace</factory_progid><path>c:\temp\temp</path></workspace_properties></locator_ref></default_locators>
when dealing with element, you can do like this: element.tail = '\n'
then,it will be written in single line.
write your xml in elementTree as:
import xml.etree.ElementTree as ET

# Replacement for ElementTree's private serializer: identical to the stock
# (Python 2.7-era) implementation except that every start tag is preceded by
# a newline, so each element begins on its own line.
# NOTE(review): this relies on private ElementTree internals; the private
# serializer signature changed in Python 3, so this is Python-2-only as is.
def serialize_xml(write, elem, encoding, qnames, namespaces):
    tag = elem.tag
    text = elem.text
    if tag is ET.Comment:
        # NOTE(review): _encode is unqualified here -- it is an ET internal
        # and would need to be referenced as ET._encode to resolve.
        write("<!--%s-->" % _encode(text, encoding))
    elif tag is ET.ProcessingInstruction:
        write("<?%s?>" % _encode(text, encoding))
    else:
        tag = qnames[tag]
        if tag is None:
            # Elements with a suppressed tag: emit only text and children.
            if text:
                # NOTE(review): unqualified _escape_cdata (compare the
                # qualified ET._escape_cdata used further down).
                write(_escape_cdata(text, encoding))
            for e in elem:
                serialize_xml(write, e, encoding, qnames, None)
        else:
            write("\n<" + tag) ## '\n' added by namit
            items = elem.items()
            if items or namespaces:
                if namespaces:
                    for v, k in sorted(namespaces.items(),
                                       key=lambda x: x[1]): # sort on prefix
                        if k:
                            k = ":" + k
                        write(" xmlns%s=\"%s\"" % (
                            k.encode(encoding),
                            _escape_attrib(v, encoding)
                            ))
                for k, v in sorted(items): # lexical order
                    # NOTE(review): QName is unqualified -- needs ET.QName.
                    if isinstance(k, QName):
                        k = k.text
                    if isinstance(v, QName):
                        v = qnames[v.text]
                    else:
                        v = _escape_attrib(v, encoding)
                    write(" %s=\"%s\"" % (qnames[k], v))
            if text or len(elem):
                write(">")
                if text:
                    write(ET._escape_cdata(text, encoding))
                for e in elem:
                    serialize_xml(write, e, encoding, qnames, None)
                write("</" + tag + ">")
            else:
                write(" />")
    if elem.tail:
        write(ET._escape_cdata(elem.tail, encoding))

# Install the patched serializer, then build and write the new record.
ET._serialize_xml=serialize_xml

tree = ET.parse(r"samplexml.xml")
root = tree.getroot()
locator_ref = ET.SubElement(root, "locator_ref")
name = ET.SubElement(locator_ref, "name")
name.text = " berry"
display_name = ET.SubElement(locator_ref, "display_name")
display_name.text = "berryberry"
workspace_properties = ET.SubElement(locator_ref, "workspace_properties")
factory_progid = ET.SubElement(workspace_properties,"factory_progid")
factory_progid.text = "WorkspaceFactory"
path = ET.SubElement(workspace_properties, "path")
# NOTE(review): path.text is never assigned, so <path> is written empty.
# Only the new locator_ref subtree is written, not the whole document.
ins_out=open("samplexml_1.xml",'wb',1000)
ET.ElementTree(locator_ref).write(ins_out,encoding="ASCII")
ins_out.close()
this will write complete file in single line; without adding white space in xml tail.
I think you must try lxml library. It's the best way to parse XML in Python.
It has magic argument *pretty_print* for such things.
Here's an example:
import lxml.etree as etree

# Build a small demo tree: ten <column_N> children holding N*N.
root = etree.Element("root")
for rn in range(10):
    etree.SubElement(root, "column_%s" % str(rn)).text = str(rn*rn)

pretty_data = etree.tostring(root, pretty_print=True, encoding = 'utf-8')
# BUG FIX: the original printed the undefined name `final_data`.
print(pretty_data)
Result:http://pastebin.com/y0rkQ78G

Categories