I'm making an automation that takes an existing XML file, changes one of the values and overwrites the file.
My main problem is that it's important for me to keep the formatting of the original file, and I can't manage to do that: the new data has no line breaks and it's all just one long line.
<StationConfig StationId="8706" SportType="null" StationType="null" UseMetricSystem="US" LocalTempStorageDrive="C:\" LocalStorageDrive="C:\">
<ClubManagementEnable ClubManagementStaticHours="">false</ClubManagementEnable>
</StationConfig>
My code is:
# Parse with remove_blank_text=True so lxml discards the whitespace-only
# text nodes of the source document. Without this, pretty_print=True has
# no effect on re-serialization because the original (single-line)
# whitespace is kept verbatim -- this is why the output collapsed to one
# long line.
parser = etree.XMLParser(remove_blank_text=True)
# Raw strings for Windows paths so backslashes are literal and sequences
# like \S or \d cannot be misread as escapes.
read = etree.parse(r"C:\StationConfig.xml", parser=parser).getroot()
read.set("StationId", "8706")
tree = etree.ElementTree(read)
tree.write(r"C:\devtree\Station.xml", pretty_print=True)
How can I add a \n after each element?
Thanks!
As far as I understand, the below is what you are looking for.
import xml.etree.ElementTree as ET
# Sample document used by the demo below.
# NOTE(review): in this non-raw triple-quoted string the sequence \" is an
# escaped double quote, so the backslash of C:\ is consumed and the parsed
# attribute values come out as "C:" (see the out.xml shown further down) --
# confirm that is intended; a raw string r'''...''' would keep the backslash.
xml = '''<?xml version="1.0" encoding="UTF-8"?>
<StationConfig StationId="8706" SportType="null" StationType="null" UseMetricSystem="US" LocalTempStorageDrive="C:\" LocalStorageDrive="C:\">
<ClubManagementEnable ClubManagementStaticHours="">false</ClubManagementEnable>
</StationConfig>'''
def _pretty_print(current, parent=None, index=-1, depth=0):
for i, node in enumerate(current):
_pretty_print(node, current, i, depth + 1)
if parent is not None:
if index == 0:
parent.text = '\n' + ('\t' * depth)
else:
parent[index - 1].tail = '\n' + ('\t' * depth)
if index == len(parent) - 1:
current.tail = '\n' + ('\t' * (depth - 1))
root = ET.fromstring(xml)
# change something in the xml
root.attrib['StationId'] = '123'
# save it back to disk
# Re-indent the whole tree in place before serializing so the output file
# keeps one element per line.
_pretty_print(root)
tree = ET.ElementTree(root)
tree.write("out.xml")
out.xml below
<StationConfig StationId="123" SportType="null" StationType="null" UseMetricSystem="US" LocalTempStorageDrive="C:" LocalStorageDrive="C:">
<ClubManagementEnable ClubManagementStaticHours="">false</ClubManagementEnable>
</StationConfig>
Related
I wrote this code to create a .csv report from an .xml file, but when I open the .csv that's generated it's blank. Feel free to rip my code apart, by the way, I'm super new to this and want to learn!
There are multiple "Subjectkeys" in the xml, but only some have an "AuditRecord". I only want to pull ones with an audit record, and then for those, I want to pull their info from "SubjectData", "FormData" and "AuditRecord"
import csv
# cElementTree was removed in Python 3.9; plain ElementTree is the same
# C-accelerated implementation on Python 3.
import xml.etree.ElementTree as ET

# Read the study export and emit one CSV row per AuditRecord with the
# subject key, form name and audit action.
tree = ET.parse("response.xml")
root = tree.getroot()

# NOTE(review): findall("AuditRecord") matches only direct children of the
# root; if AuditRecord sits deeper in the document, ".//AuditRecord" is
# needed -- confirm against the real response.xml.
xml_data_to_csv = open("query.csv", 'w')
csvwriter = csv.writer(xml_data_to_csv)
wrote_header = False
for member in root.findall("AuditRecord"):
    if not wrote_header:
        # Header row with all three column names in one row.
        # (The original used mismatched casing -- Subjectdata_head vs the
        # defined SubjectData_head, Auditrecord_head vs AuditRecord_head --
        # which raised NameError, and wrote only the Action column.)
        csvwriter.writerow([
            member.find("SubjectKey").tag,
            member.find("p1Name").tag,
            member.find("Action").tag,
        ])
        wrote_header = True
    # Data row: the original wrote only the Subjectdata list; write all
    # three fields so the table has Subject, Form and Action columns.
    csvwriter.writerow([
        member.find('SubjectKey').text,
        member.find('p1Name').text,
        member.find("Action").text,
    ])
xml_data_to_csv.close()
I expect the output to be a table with column headings: Subject, Form, Action.
Here is sample .xml:
</ClinicalData>
<ClinicalData StudyOID="SMK-869-002" MetaDataVersionOID="2.0">
<SubjectData SubjectKey="865-015">
</AuditRecord>
</FormData>
<FormData p1:Name="Medical History" p1:Started="Y" FormOID="mh" FormRepeatKey="0"/>
<FormData p1:Name="Medical History" p1:Started="Y" FormOID="mh" FormRepeatKey="1">
<p1:QueryAction InitialComment="Please enter start date for condition" UserType="User" UserOID="bailey#protocolfirst.com" Action="query" DateTimeStamp="2019-07-12T14:08:43.893Z"/>
</AuditRecord>
First of all, your XML file has a lot of errors; to be parseable it would have to look something like this:
<?xml version="1.0"?>
<root xmlns:p1="http://some-url.com">
<ClinicalData StudyOID="SMK-869-002" MetaDataVersionOID="2.0"></ClinicalData>
<SubjectData SubjectKey="865-015"></SubjectData>
<AuditRecord>
<FormData p1:Name="Medical History" p1:Started="Y" FormOID="mh" FormRepeatKey="0"/>
<FormData p1:Name="Medical History" p1:Started="Y" FormOID="mh" FormRepeatKey="1"/>
<p1:QueryAction InitialComment="Please enter start date for condition" UserType="User" UserOID="bailey#protocolfirst.com" Action="query" DateTimeStamp="2019-07-12T14:08:43.893Z"/>
</AuditRecord>
</root>
ElementTree always expects only a single root node, and a well-formed document.
I do not understand very well what you're trying to do, but I hope this helps:
import xml.etree.cElementTree as ET

# Namespace prefix used by the p1-qualified attributes in the sample file.
NS = '{http://some-url.com}'

tree = ET.parse("response.xml")
root = tree.getroot()

out = open("query.csv", 'w')
first = True
for record in root.findall("AuditRecord"):
    # Only the first AuditRecord produces output, matching the original
    # count == 0 guard.
    if first:
        subject = root.find('./SubjectData').attrib['SubjectKey']
        forms = [fd.attrib[NS + 'Name']
                 for fd in root.findall('./AuditRecord/FormData')]
        action = root.find('./AuditRecord/' + NS + 'QueryAction').attrib['Action']
        # One comma-separated line: subject, both form names, action.
        out.write(subject + "," + forms[0] + "," + forms[1] + "," + action)
    first = False
out.close()
This will produce a csv file with the following content:
865-015,Medical History,Medical History,query
Python 3.4, parsing GB++ size XML Wikipedia dump files using etree.iterparse. I want to test within the current matched <page> element for its <ns> value, depending on the latter value I then want export the source XML of the whole <page> object and all its contents including any elements nested within it, i.e. the XML of a whole article.
I can iterate the <page> objects and find the ones I want, but then all available functions seem to want to read text/attribute values, whereas I simply want a utf8 string copy of the source file's XML code for the complete in scope <page> object. Is this possible?
A cut-down version of the XML looks like this:
<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.10/" xml:lang="en">
<page>
<title>Some Article</title>
<ns>0</ns>
<revision>
<timestamp>2017-07-27T00:59:41Z</timestamp>
<text xml:space="preserve">some text</text>
</revision>
</page>
<page>
<title>User:Wonychifans</title>
<ns>2</ns>
<revision>
<text xml:space="preserve">blah blah</text>
</revision>
</page>
</mediawiki>
The python code getting me to the <ns> value test is here:
from lxml import etree

# store namespace string for all elements (only one used in Wikipedia XML docs)
NAMESPACE = '{http://www.mediawiki.org/xml/export-0.10/}'
ns = {'wiki': 'http://www.mediawiki.org/xml/export-0.10/'}

# Stream the dump; 'end' events fire once each element is fully parsed.
context = etree.iterparse('src.xml', events=('end',))
for event, elem in context:
    # at end of parsing each <page>
    if elem.tag == (NAMESPACE + 'page') and event == 'end':
        tagNs = elem.find('wiki:ns', ns)
        if tagNs is not None:
            nsValue = tagNs.text
            if nsValue == '2':
                # Export the current <page>'s XML code.
                # (Fix: the original snippet ended with a bare comment
                # inside this `if`, which is a syntax error -- a statement
                # is required. tostring serializes the whole subtree.)
                page_xml = etree.tostring(elem, encoding='unicode')
In this case I'd want to extract the XML code of only the second <page> element, i.e. a string holding:
<page>
<title>User:Wonychifans</title>
<ns>2</ns>
<revision>
<text xml:space="preserve">blah blah</text>
</revision>
</page>
edit: minor typo and better mark-up
You can do this.
>>> from lxml import etree
>>> mediawiki = etree.iterparse('mediawiki.xml')
>>> page_content = {}
>>> for ev, el in mediawiki:
... if el.tag=='page':
... if page_content['ns']=='2':
... print (page_content)
... page_content = {}
... else:
... page_content[el.tag.replace('{http://www.mediawiki.org/xml/export-0.10/}', '')] = \
... el.text.strip() if el.text else None
...
>>> page_content
{'mediawiki': '', 'revision': '', 'timestamp': '2017-07-27T00:59:41Z', 'title': 'User:Wonychifans', 'page': '', 'text': 'blah blah', 'ns': '2'}
Because the structure of the output xml is quite simple there should be no difficulty in constructing it from the dictionary.
Edit: Although this approach requires two passes through the xml file it could be faster and it does recover the required xml.
First, look for the starting lines of the page elements.
>>> from lxml import etree
>>> mediawiki = etree.iterparse('mediawiki.xml', events=("start", "end"))
>>> for ev, el in mediawiki:
... tag = el.tag[1+el.tag.rfind('}'):]
... if ev=='start' and tag=='page':
... keep=False
... if ev=='start' and tag=='ns' and el.text=='2':
... keep=True
... if ev=='end' and tag=='page' and keep:
... print (el.sourceline)
...
10
Then go through the XML again to find the complete page entries, using those starting points.
>>> with open('mediawiki.xml') as mediawiki:
... for _ in range(9):
... r = next(mediawiki)
... for line in mediawiki:
... print (line.strip())
... if '</page>' in line:
... break
...
<page>
<title>User:Wonychifans</title>
<ns>2</ns>
<revision>
<text xml:space="preserve">blah blah</text>
</revision>
</page>
I've marked Bill Bell's answer as accepted as it was instrumental in me getting to my final solution, the core of which is below. The outer loop lets me loop though over 50 source XML files.
As some sources are large, the code checks in-loop for the copied source data exceeding 1GB. If so, a write of data to file occurs and the buffer string variable is purged. Otherwise all extracted data is written at the end of reading the source file(s).
Further polish would be to monitor the size of the output file and switch output sources once a given size were exceeded. In this case, it was easier to only scan some of the whole source set per run of the script.
I've removed some logging & print statements for brevity:
<!-- language: lang-python -->
import sys

dataSourceStr = '/Users/x/WP-data/'
outputDataStr = '/Users/x/WP-data/ns-data/'

# Header/footer wrapped around the extracted <page> elements so the output
# is itself a well-formed <mediawiki> document.
with open("header.txt", "r") as headfile:
    headStr = headfile.read()
footStr = '</mediawiki>'

matchCount = 0
nameSpaceValue = 4
startNum = 41  # starting file number
lastNum = 53   # ending file number
endNum = lastNum + 1
outputDataFile = outputDataStr + 'ns' + str(nameSpaceValue) + '.xml'

# Fix: the original assigned headStr to strPage and then reset strPage to ''
# inside the loop, so the header was never actually written. Emit it once
# up front instead.
with open(outputDataFile, "a") as outFile:
    outFile.write(headStr)

for fileNum in range(startNum, endNum):
    with open(dataSourceStr + str(fileNum) + '.xml') as mediawiki:
        blnKeep = False
        strPage = ''
        strItem = ''
        loopMatchCount = 0
        # Skip the per-file prolog lines before the first <page>.
        # NOTE(review): the skip count (44) is specific to these source
        # files -- confirm for other dumps.
        for _ in range(44):
            next(mediawiki)
        for line in mediawiki:
            if '<ns>' + str(nameSpaceValue) + '</ns>' in line:
                blnKeep = True
                matchCount = matchCount + 1
                loopMatchCount = loopMatchCount + 1
            strItem = strItem + line
            if '</page>' in line:
                if blnKeep:
                    strPage = strPage + strItem
                    blnKeep = False
                    # Flush when the buffer exceeds 1 GB to bound memory
                    # use on large dumps.
                    if sys.getsizeof(strPage) > 1073741824:
                        with open(outputDataFile, "a") as outFile:
                            outFile.write(strPage)
                        strPage = ''
                strItem = ''
        # The with-statement closes the source file. (The original's bare
        # `mediawiki.close` was a no-op: it referenced the method without
        # calling it.)
    # Append whatever this source file contributed.
    with open(outputDataFile, "a") as outFile:
        outFile.write(strPage)

with open(outputDataFile, "a") as outFile:
    outFile.write(footStr)
I'm sure this could be more elegant but I hope this helps any fellow non-experts arriving here and trying to do this sort of thing.
I'm trying to get each instance of an XML tag but I can only seem to return one or none.
#!/usr/software/bin/python
# NOTE(review): Python 2 code -- urllib.urlopen and the print statement do
# not exist in Python 3 (urllib.request.urlopen / print() there).
# import libraries
import urllib
from xml.dom.minidom import parseString
# variables
startdate = "2014-01-01"
enddate = "2014-05-01"
rest_client = "test"
rest_host = "restprd.test.com"
rest_port = "80"
rest_base_url = "asup-rest-interface/ASUP_DATA"
rest_date = "/start_date/%s/end_date/%s/limit/5000" % (startdate,enddate)
rest_api = "http://" + rest_host + ":" + rest_port + "/" + rest_base_url + "/" + "client_id" + "/" + rest_client
# Fetch the ASUP records for one system serial number and parse the body.
response = urllib.urlopen(rest_api + rest_date + '/sys_serial_no/700000667725')
data = response.read()
response.close()
dom = parseString(data)
# BUG (the subject of this question): getElementsByTagName returns a
# NodeList, which has no .toxml() -- this line raises AttributeError.
# Each *element* in the list has .toxml(); loop over the list instead.
xmlVer = dom.getElementsByTagName('sys_version').toxml()
xmlDate = dom.getElementsByTagName('asup_gen_date').toxml()
# Strip the wrapping tags to recover just the text content.
xmlVerTag=xmlVer.replace('<sys_version>','').replace('</sys_version>','')
xmlDateTag=xmlDate.replace('<asup_gen_date>','').replace('</asup_gen_date>','').replace('T',' ')[0:-6]
print xmlDateTag , xmlVerTag
The above code generates the following error:
Traceback (most recent call last):
File "./test.py", line 23, in <module>
xmlVer = dom.getElementsByTagName('sys_version').toxml()
AttributeError: 'NodeList' object has no attribute 'toxml'
If I change the .toxml() to [0].toxml() I can get the first element, but I need to get all the elements. Any ideas?
Also, if I try something like this I get no output at all:
# NOTE(review): this variant prints nothing because, per the sample
# document shown below, <results> is a child *element* of the root, not an
# attribute of it -- hasAttribute("results") is therefore False and the
# branch never runs. sys_version is also nested several levels deeper.
response = urllib.urlopen(rest_api + rest_date + '/sys_serial_no/700000667725')
DOMTree = xml.dom.minidom.parse(response)
collection = DOMTree.documentElement
if collection.hasAttribute("results"):
    print collection.getAttribute("sys_version")
The original data looks like this.
There are repeating sections of XML like this:
<xml><status request_id="58f39198-2c76-4e87-8e00-f7dd7e69519f1416354337206" response_time="00:00:00:833"></status><results start="1" limit="1000" total_results_count="1" results_count="1"><br/><system><tests start="1" limit="50" total_results_count="18" results_count="18"><test> <biz_key>C|BF02F1A3-3C4E-11DC-8AAE-0015171BBD90|8594169899|700000667725</biz_key><test_id>2014071922090465</test_id><test_subject>HA Group Notification (WEEKLY_LOG) INFO</test_subject><test_type>DOT-REGULAR</test_type><asup_gen_date>2014-07-20T00:21:40-04:00</asup_gen_date><test_received_date>Sat Jul 19 22:09:19 PDT 2014</test_received_date><test_gen_zone>EDT</test_gen_zone><test_is_minimal>false</test_is_minimal><sys_version>9.2.2X22</sys_version><sys_operating_mode>Cluster-Mode</sys_operating_mode><hostname>rerfdsgt</hostname><sys_domain>test.com</sys_domain><cluster_name>bbrtp</cluster_name> ... etc
<xml>
<results>
<system>
-<sys_version>
<asup>
-<asup_gen_date>
I simply want to extract the sys_version and asup_gen_date
9.2.2X22 2014-07-20 00:21:40
9.2.2X21 2014-06-31 12:51:40
8.5.2X1 2014-07-20 04:33:22
You need to loop over the results of getElementsByTagName():
# Iterate the NodeList: each iteration handles one <sys_version> element,
# so every occurrence is printed (not just the first).
for version in dom.getElementsByTagName('sys_version'):
    version = version.toxml()
    # Strip the wrapping tags to leave only the text content.
    version = version.replace('<sys_version>','').replace('</sys_version>','')
    print version
Also, instead of replacing opening and closing tags, you probably want yo use getText():
def getText(nodelist):
    """Concatenate the character data of every text node in *nodelist*.

    Non-text nodes (elements, comments, ...) are skipped; returns '' when
    the list contains no text nodes.
    """
    return ''.join(node.data for node in nodelist
                   if node.nodeType == node.TEXT_NODE)
# Print the text content of every <sys_version> element via getText.
for version in dom.getElementsByTagName('sys_version'):
    print getText(version.childNodes)
Another point is that it would be much more easy and pleasant to parse xml with xml.etree.ElementTree, example:
import xml.etree.ElementTree as ET
# NOTE(review): findall('sys_version') matches only direct children of the
# root; for the nested sample shown above ('.//sys_version') would be
# needed -- confirm against the real document.
tree = ET.parse(response)
root = tree.getroot()
for version in root.findall('sys_version'):
    print version.text
ucsc DAS server, which get DNA sequences by coordinate.
URL: http://genome.ucsc.edu/cgi-bin/das/hg19/dna?segment=chr20:30037432,30038060
sample file:
<DASDNA>
<SEQUENCE id="chr20" start="30037832" stop="30038060" version="1.00">
<DNA length="229">
gtggcacccaaagatgctggaatctttatggcaaatgccgttacagatgc
tccaagaaggaaagagtctatgtttactgcataaataataaaatgtgctg
cgtgaagcccaagtaccagccaaaagaaaggtggtggccattttaactgc
tttgaagcctgaagccatgaaaatgcagatgaagctcccagtggattccc
acactctatcaataaacacctctggctga
</DNA>
</SEQUENCE>
</DASDNA>
what I want is this part:
gtggcacccaaagatgctggaatctttatggcaaatgccgttacagatgc
tccaagaaggaaagagtctatgtttactgcataaataataaaatgtgctg
cgtgaagcccaagtaccagccaaaagaaaggtggtggccattttaactgc
tttgaagcctgaagccatgaaaatgcagatgaagctcccagtggattccc
acactctatcaataaacacctctggctga
I want to get the sequence part from thousands of URLs of this kind; how should I do it?
I tried writing the data to a file and parsing the file, and it worked OK, but is there any way to parse the XML-like string directly? I tried some examples from other posts, but they didn't work.
Here, I added my solution. Thanks to the 2 answers below.
Solution 1:
def getSequence2(chromosome, start, end):
    """Fetch a DNA sequence from the UCSC DAS server (hg19 assembly).

    chromosome: e.g. 'chr20'; start/end: coordinate bounds (ints or
    int-like). Returns the sequence with newlines stripped, or a fallback
    message when the server returns no sequence for the coordinates.
    """
    base = 'http://genome.ucsc.edu/cgi-bin/das/hg19/dna?segment='
    url = base + chromosome + ':' + str(start) + ',' + str(end)
    doc = etree.parse(url, parser=etree.XMLParser())
    # Fix: the original tested `doc != ''`, which is always true for an
    # ElementTree object, so the fallback branch was unreachable and a
    # missing sequence raised IndexError on [0]. Test the XPath result.
    hits = doc.xpath('SEQUENCE/DNA/text()')
    if hits:
        return hits[0].replace('\n', '')
    return 'THE SEQUENCE DOES NOT EXIST FOR GIVEN COORDINATES'
Solution 2:
def getSequence1(chromosome, start, end):
    # Fetch the DAS XML over HTTP (Python 2: urllib2).
    base = 'http://genome.ucsc.edu/cgi-bin/das/hg19/dna?segment='
    url = base + chromosome + ':' + str(start) + ',' + str(end)
    xml = urllib2.urlopen(url).read()
    if xml != '':
        # Round-trip through a temp file so minidom's parse() can read it.
        # NOTE(review): minidom.parseString(xml) would avoid the temp file
        # entirely.
        w = open('temp.xml', 'w')
        w.write(xml)
        w.close()
        dom = parse('temp.xml')
        data = dom.getElementsByTagName('DNA')
        # Text of the first <DNA> element, with the wrapped newlines removed.
        sequence = data[0].firstChild.nodeValue.replace('\n','')
    else:
        sequence = 'THE SEQUENCE DOES NOT EXIST FOR GIVEN COORDINATES'
    return sequence
Of course they will need to import some necessary libraries.
>>> from lxml import etree
>>> doc = etree.parse("http://genome.ucsc.edu/cgi-bin/das/hg19/dna?segment=chr20:30037432,30038060",parser=etree.XMLParser())
>>> doc.xpath('SEQUENCE/DNA/text()')
['\natagtggcacatgtctgttgtcctagctcctcggggaaactcaggtggga\ngagtcccttgaactgggaggaggaggtttgcagtgagccagaatcattcc\nactgtactccagcctaggtgacagagcaagactcatctcaaaaaaaaaaa\naaaaaaaaaaaaaagacaatccgcacacataaaggctttattcagctgat\ngtaccaaggtcactctctcagtcaaaggtgggaagcaaaaaaacagagta\naaggaaaaacagtgatagatgaaaagagtcaaaggcaagggaaacaaggg\naccttctatctcatctgtttccattcttttacagacctttcaaatccgga\ngcctacttgttaggactgatactgtctcccttctttctgctttgtgtcag\ngtggcacccaaagatgctggaatctttatggcaaatgccgttacagatgc\ntccaagaaggaaagagtctatgtttactgcataaataataaaatgtgctg\ncgtgaagcccaagtaccagccaaaagaaaggtggtggccattttaactgc\ntttgaagcctgaagccatgaaaatgcagatgaagctcccagtggattccc\nacactctatcaataaacacctctggctga\n']
Use a Python XML parsing library like lxml, load the XML file with that parser, and then use a selector (e.g. using XPath) to grab the node/element that you need.
I can add the XML node using the ElementTree, but this returns the output in one single line instead of a tree structure look alike when I open the xml file in text format. I also tried using the minidom.toprettyxml but I do not know how to add the output to original XML. Since I would like the script to be reproducible in other environments, I prefer not using external libraries such as lxml. Can someone please help how I can pretty print the output? - python 2.7
The Sample XML. This is how it looks both in text format and Explorer.
<?xml version="1.0" encoding="utf-8"?>
<default_locators >
<locator_ref>
<name>cherry</name>
<display_name>cherrycherry</display_name>
<workspace_properties>
<factory_progid>Workspace</factory_progid>
<path>InstallDir</path>
</workspace_properties>
</locator_ref>
</default_locators>
Expected Output in both text format and Explorer.
<?xml version="1.0" encoding="utf-8"?>
<default_locators >
<locator_ref>
<name>cherry</name>
<display_name>cherrycherry</display_name>
<workspace_properties>
<factory_progid>Workspace</factory_progid>
<path>InstallDir</path>
</workspace_properties>
</locator_ref>
<locator_ref>
<name>berry</name>
<display_name>berryberry</display_name>
<workspace_properties>
<factory_progid>Workspace</factory_progid>
<path>C:\temp\temp</path>
</workspace_properties>
</locator_ref>
</default_locators>
My script
#coding: cp932
import xml.etree.ElementTree as ET

# Load the existing locator file, append a new <locator_ref> entry for
# "berry", and write the document back to the same path.
tree = ET.parse(r"C:\DefaultLocators.xml")
root = tree.getroot()

entry = ET.SubElement(root, "locator_ref")
for tag, value in (("name", " berry"), ("display_name", "berryberry")):
    ET.SubElement(entry, tag).text = value

props = ET.SubElement(entry, "workspace_properties")
ET.SubElement(props, "factory_progid").text = "Workspace"
ET.SubElement(props, "path").text = r"c:\temp\temp"

tree.write(r"C:\DefaultLocators.xml", encoding='utf-8')
Returned output. After running my script, new nodes are added to my sample.xml file, but it returns the output in one single line, with all newlines and indents removed from the original sample.xml file. At least that's how it looks when I open the sample.xml file in text format. However, when I open the sample.xml file in Explorer, it looks fine: I still see the newlines and indents as they were before. How can I keep the original tree structure in text format even after running the script?
<default_locators>
<locator_ref>
<name>cherry</name>
<display_name>cherrycherry</display_name>
<workspace_properties>
<factory_progid>Workspace</factory_progid>
<path>InstallDir</path>
</workspace_properties>
</locator_ref>
<locator_ref><name> berry</name><display_name>berryberry</display_name><workspace_properties><factory_progid>Workspace</factory_progid><path>c:\temp\temp</path></workspace_properties></locator_ref></default_locators>
When dealing with an element, you can do this: element.tail = '\n'
Then it will be written in a single line.
write your xml in elementTree as:
import xml.etree.ElementTree as ET

def serialize_xml(write, elem, encoding, qnames, namespaces):
    """Replacement for ElementTree's private _serialize_xml.

    Identical to the stock serializer except that it emits '\n' before
    every start tag, putting each element on its own line.

    NOTE(review): this patches a *private* CPython internal whose
    signature changed across Python versions -- confirm it matches the
    target interpreter.
    NOTE(review): _encode, _escape_cdata, _escape_attrib and QName are
    used unqualified here but are only defined inside the ET module
    (elsewhere the code writes ET._escape_cdata); as written these lines
    raise NameError when reached.
    """
    tag = elem.tag
    text = elem.text
    if tag is ET.Comment:
        write("<!--%s-->" % _encode(text, encoding))
    elif tag is ET.ProcessingInstruction:
        write("<?%s?>" % _encode(text, encoding))
    else:
        tag = qnames[tag]
        if tag is None:
            # Tag-less element: serialize text and children only.
            if text:
                write(_escape_cdata(text, encoding))
            for e in elem:
                serialize_xml(write, e, encoding, qnames, None)
        else:
            write("\n<" + tag) ## '\n' added by namit
            items = elem.items()
            if items or namespaces:
                if namespaces:
                    for v, k in sorted(namespaces.items(),
                                       key=lambda x: x[1]): # sort on prefix
                        if k:
                            k = ":" + k
                        write(" xmlns%s=\"%s\"" % (
                            k.encode(encoding),
                            _escape_attrib(v, encoding)
                            ))
                for k, v in sorted(items): # lexical order
                    if isinstance(k, QName):
                        k = k.text
                    if isinstance(v, QName):
                        v = qnames[v.text]
                    else:
                        v = _escape_attrib(v, encoding)
                    write(" %s=\"%s\"" % (qnames[k], v))
            if text or len(elem):
                write(">")
                if text:
                    write(ET._escape_cdata(text, encoding))
                for e in elem:
                    serialize_xml(write, e, encoding, qnames, None)
                write("</" + tag + ">")
            else:
                # Empty element short form.
                write(" />")
    if elem.tail:
        write(ET._escape_cdata(elem.tail, encoding))

# Install the patched serializer into ElementTree.
ET._serialize_xml=serialize_xml
tree = ET.parse(r"samplexml.xml")
root = tree.getroot()
# Build the new <locator_ref> entry to append to the document root.
locator_ref = ET.SubElement(root, "locator_ref")
name = ET.SubElement(locator_ref, "name")
name.text = " berry"
display_name = ET.SubElement(locator_ref, "display_name")
display_name.text = "berryberry"
workspace_properties = ET.SubElement(locator_ref, "workspace_properties")
factory_progid = ET.SubElement(workspace_properties,"factory_progid")
factory_progid.text = "WorkspaceFactory"
path = ET.SubElement(workspace_properties, "path")
# NOTE(review): path.text is never assigned here, unlike the earlier
# script -- confirm whether <path> is meant to be empty.
# Serialize only the new subtree (not the whole document) through the
# patched serializer, to a buffered binary file.
ins_out=open("samplexml_1.xml",'wb',1000)
ET.ElementTree(locator_ref).write(ins_out,encoding="ASCII")
ins_out.close()
This will write the complete file in a single line, without adding whitespace to the XML tail.
I think you must try lxml library. It's the best way to parse XML in Python.
It has magic argument *pretty_print* for such things.
Here's an example:
import lxml.etree as etree

# Build a small demo tree: ten <column_N> children holding N*N.
root = etree.Element("root")
for rn in range(10):
    etree.SubElement(root, "column_%s" % str(rn)).text = str(rn * rn)

# pretty_print=True makes tostring() return an indented serialization.
pretty_data = etree.tostring(root, pretty_print=True, encoding='utf-8')
# Fix: the original printed the undefined name `final_data` (NameError);
# the variable holding the result is `pretty_data`.
print(pretty_data)
Result: http://pastebin.com/y0rkQ78G