Generating Xml using python - python

Kindly have a look at below code i am using this to generate a xml using python .
from lxml import etree
# Some dummy text
conn_id = 5
conn_name = "Airtelll"
conn_desc = "Largets TRelecome"
ip = "192.168.1.23"
# Building the XML tree
# Note how attributes and text are added, using the Element methods
# and not by concatenating strings as in your question
root = etree.Element("ispinfo")
child = etree.SubElement(root, 'connection',
number = str(conn_id),
name = conn_name,
desc = conn_desc)
subchild_ip = etree.SubElement(child, 'ip_address')
subchild_ip.text = ip
# and pretty-printing it
print etree.tostring(root, pretty_print=True)
This will produce:
<ispinfo>
<connection desc="Largets TRelecome" number="5" name="Airtelll">
<ip_address>192.168.1.23</ip_address>
</connection>
</ispinfo>
But i want it to be like :
<ispinfo>
<connection desc="Largets TRelecome" number='1' name="Airtelll">
<ip_address>192.168.1.23</ip_address>
</connection>
</ispinfo>
Mean number attribute should be come in a single quote .Any idea ....How can i achieve this

There is no flag in lxml to do this, so you have to resort to manual manipulation.
import re
re.sub(r'number="([0-9]+)"',r"number='\1'", etree.tostring(root, pretty_print=True))
However, why do you want to do this? As there is no difference other than cosmetics.

Related

How to add namespace prefix at root XML using Python LXML?

I would like to have the following NS prefix <qsp: and </qsp:
<qsp:QSPart xmlns:qsp="urn:qvalent:quicksuper:gateway">
<qsp:MemberRegistrationRequest/>
</qsp:QSPart>
How do I do that in LMXL python?
from lxml import etree
nsmap = {'qsp': 'urn:qvalent:quicksuper:gateway'}
nsprefix = nsmap['qsp']
QSPart = etree.Element('QSPart', nsmap=nsmap)
MemberRegistrationRequest = etree.SubElement(QSPart, etree.QName(nsprefix, 'MemberRegistrationRequest'))
print(etree.tostring(QSPart, pretty_print=True, encoding=str))
Result:
<QSPart xmlns:qsp="urn:qvalent:quicksuper:gateway">
<qsp:MemberRegistrationRequest/>
</QSPart>
According to the documentation, you need to fully qualify the element name in your call to etree.Element:
from lxml import etree
nsmap = {'qsp': 'urn:qvalent:quicksuper:gateway'}
nsprefix = nsmap['qsp']
QSPart = etree.Element(f'{{{nsmap["qsp"]}}}QSPart')
MemberRegistrationRequest = etree.SubElement(QSPart, etree.QName(nsprefix, 'MemberRegistrationRequest'))
print(etree.tostring(QSPart, pretty_print=True, encoding=str))
This outputs:
<ns0:QSPart xmlns:ns0="urn:qvalent:quicksuper:gateway">
<ns0:MemberRegistrationRequest/>
</ns0:QSPart>
Since you know your expected output, I wouldn't bother with all that (though I understand many people frown on this approach...) - just use from and to string:
frag_text = """<qsp:QSPart xmlns:qsp="urn:qvalent:quicksuper:gateway">
<qsp:MemberRegistrationRequest/>
</qsp:QSPart>"""
fragment = etree.fromstring(frag_text)
print(etree.tostring(fragment).decode())
Output should be your expected output.

Using "info.get" for a child element in Python / lxml

I'm trying to get the attribute of a child element in Python, using lxml.
This is the structure of the xml:
<GroupInformation groupId="crid://thing.com/654321" ordered="true">
<GroupType value="show" xsi:type="ProgramGroupTypeType"/>
<BasicDescription>
<Title type="main" xml:lang="EN">A programme</Title>
<RelatedMaterial>
<HowRelated href="urn:eventis:metadata:cs:HowRelatedCS:2010:boxCover">
<Name>Box cover</Name>
</HowRelated>
<MediaLocator>
<mpeg7:MediaUri>file://ftp.something.com/Images/123456.jpg</mpeg7:MediaUri>
</MediaLocator>
</RelatedMaterial>
</BasicDescription>
The code I've got is below. The bit I want to return is the 'value' attribute ("Show" in the example) under 'grouptype' (third line from the bottom):
file_name = input('Enter the file name, including .xml extension: ')
print('Parsing ' + file_name)
from lxml import etree
parser = etree.XMLParser()
tree = etree.parse(file_name, parser)
root = tree.getroot()
nsmap = {'xmlns': 'urn:tva:metadata:2010','mpeg7':'urn:tva:mpeg7:2008'}
with open(file_name+'.log', 'w', encoding='utf-8') as f:
for info in root.xpath('//xmlns:GroupInformation', namespaces=nsmap):
crid = info.get('groupId'))
grouptype = info.find('.//xmlns:GroupType', namespaces=nsmap)
gtype = grouptype.get('value')
titlex = info.find('.//xmlns:BasicDescription/xmlns:Title', namespaces=nsmap)
title = titlex.text if titlex != None else 'Missing'
Can anyone explain to me how to implement it? I had a quick look at the xsi namespace, but was unable to get it to work (and didn't know if it was the right thing to do).
Is this what you are looking for?
grouptype.attrib['value']
PS: why the parenthesis around assignment values? Those look unnecessary.

How should I parse this xml string in python?

My XML string is -
xmlData = """<SMSResponse xmlns="http://example.com" xmlns:i="http://www.w3.org/2001/XMLSchema-instance">
<Cancelled>false</Cancelled>
<MessageID>00000000-0000-0000-0000-000000000000</MessageID>
<Queued>false</Queued>
<SMSError>NoError</SMSError>
<SMSIncomingMessages i:nil="true"/>
<Sent>false</Sent>
<SentDateTime>0001-01-01T00:00:00</SentDateTime>
</SMSResponse>"""
I am trying to parse and get the values of tags - Cancelled, MessageId, SMSError, etc. I am using python's Elementtree library. So far, I have tried things like -
root = ET.fromstring(xmlData)
print root.find('Sent') // gives None
for child in root:
print chil.find('MessageId') // also gives None
Although, I am able to print the tags with -
for child in root:
print child.tag
//child.tag for the tag Cancelled is - {http://example.com}Cancelled
and their respective values with -
for child in root:
print child.text
How do I get something like -
print child.Queued // will print false
Like in PHP we can access them with the root -
$xml = simplexml_load_string($data);
$status = $xml->SMSError;
Your document has a namespace on it, you need to include the namespace when searching:
root = ET.fromstring(xmlData)
print root.find('{http://example.com}Sent',)
print root.find('{http://example.com}MessageID')
output:
<Element '{http://example.com}Sent' at 0x1043e0690>
<Element '{http://example.com}MessageID' at 0x1043e0350>
The find() and findall() methods also take a namespace map; you can search for a arbitrary prefix, and the prefix will be looked up in that map, to save typing:
nsmap = {'n': 'http://example.com'}
print root.find('n:Sent', namespaces=nsmap)
print root.find('n:MessageID', namespaces=nsmap)
If you're set on Python standard XML libraries, you could use something like this:
root = ET.fromstring(xmlData)
namespace = 'http://example.com'
def query(tree, nodename):
return tree.find('{{{ex}}}{nodename}'.format(ex=namespace, nodename=nodename))
queued = query(root, 'Queued')
print queued.text
You can create a dictionary and directly get values out of it...
tree = ET.fromstring(xmlData)
root = {}
for child in tree:
root[child.tag.split("}")[1]] = child.text
print root["Queued"]
With lxml.etree:
In [8]: import lxml.etree as et
In [9]: doc=et.fromstring(xmlData)
In [10]: ns={'n':'http://example.com'}
In [11]: doc.xpath('n:Queued/text()',namespaces=ns)
Out[11]: ['false']
With elementtree you can do:
import xml.etree.ElementTree as ET
root=ET.fromstring(xmlData)
ns={'n':'http://example.com'}
root.find('n:Queued',namespaces=ns).text
Out[13]: 'false'

Generating XML dynamically in Python

Hey friends I am generating XML data using Python libraries as follow
def multiwan_info_save(request):
data = {}
init = "init"
try:
form = Addmultiwanform(request.POST)
except:
pass
if form.is_valid():
from_sv = form.save(commit=False)
obj_get = False
try:
obj_get = MultiWAN.objects.get(isp_name=from_sv.isp_name)
except:
obj_get = False
nameservr = request.POST.getlist('nameserver_mw')
for nm in nameservr:
nameserver1, is_new = NameServer.objects.get_or_create(name=nm)
from_sv.nameserver = nameserver1
from_sv.save()
# main(init)
top = Element('ispinfo')
# comment = Comment('Generated for PyMOTW')
#top.append(comment)
all_connection = MultiWAN.objects.all()
for conn in all_connection:
child = SubElement(top, 'connection number ='+str(conn.id)+'name='+conn.isp_name+'desc='+conn.description )
subchild_ip = SubElement(child,'ip_address')
subchild_subnt = SubElement(child,'subnet')
subchild_gtwy = SubElement(child,'gateway')
subchild_nm1 = SubElement(child,'probe_server1')
subchild_nm2 = SubElement(child,'probe_server2')
subchild_interface = SubElement(child,'interface')
subchild_weight = SubElement(child,'weight')
subchild_ip.text = str(conn.ip_address)
subchild_subnt.text = str(conn.subnet)
subchild_gtwy.text = str(conn.gateway)
subchild_nm1.text = str(conn.nameserver.name)
# subchild_nm2.text = conn.
subchild_weight.text = str(conn.weight)
subchild_interface.text = str(conn.interface)
print "trying to print _____________________________"
print tostring(top)
print "let seeeeeeeeeeeeeeeeee +++++++++++++++++++++++++"
But I am getting output like follow
<ispinfo><connection number =5name=Airtelllldesc=Largets TRelecome ><ip_address>192.168.1.23</ip_address><subnet>192.168.1.23</subnet><gateway>192.168.1.23</gateway><probe_server1>192.168.99.1</probe_server1><probe_server2 /><interface>eth0</interface><weight>160</weight></connection number =5name=Airtelllldesc=Largets TRelecome ><connection number =6name=Uninordesc=Uninor><ip_address>192.166.55.23</ip_address><subnet>192.166.55.23</subnet><gateway>192.168.1.23</gateway><probe_server1>192.168.99.1</probe_server1><probe_server2 /><interface>eth0</interface><weight>160</weight></connection number =6name=Uninordesc=Uninor><connection number =7name=Airteldesc=Largets TRelecome ><ip_address>192.168.1.23</ip_address><subnet>192.168.1.23</subnet><gateway>192.168.1.23</gateway><probe_server1>192.168.99.1</probe_server1><probe_server2 /><interface>eth0</interface><weight>160</weight></connection number =7name=Airteldesc=Largets TRelecome ></ispinfo>
I just want to know that how can I write this XML in proper XML format ?
Thanks in advance
UPDATED to include simulation of both creating and printing of the XML tree
The Basic Issue
Your code is generating invalid connection tags like this:
<connection number =5name=Airtelllldesc=Largets TRelecome ></connection number =5name=Airteldesc=Largets TRelecome >
when they should look like this (I am omitting the sub-elements in between. Your code is generating these correctly):
<connection number="5" name="Airtellll" desc="Largets TRelecome" ></connection>
If you had valid XML, this code would print it neatly:
from lxml import etree
xml = '''<ispinfo><connection number="5" name="Airtellll" desc="Largets TRelecome" ><ip_address>192.168.1.23</ip_address><subnet>192.168.1.23</subnet><gateway>192.168.1.23</gateway><probe_server1>192.168.99.1</probe_server1><probe_server2 /><interface>eth0</interface><weight>160</weight></connection></ispinfo>'''
xml = etree.XML(xml)
print etree.tostring(xml, pretty_print = True)
Generating Valid XML
A small simulation follows:
from lxml import etree
# Some dummy text
conn_id = 5
conn_name = "Airtelll"
conn_desc = "Largets TRelecome"
ip = "192.168.1.23"
# Building the XML tree
# Note how attributes and text are added, using the Element methods
# and not by concatenating strings as in your question
root = etree.Element("ispinfo")
child = etree.SubElement(root, 'connection',
number = str(conn_id),
name = conn_name,
desc = conn_desc)
subchild_ip = etree.SubElement(child, 'ip_address')
subchild_ip.text = ip
# and pretty-printing it
print etree.tostring(root, pretty_print=True)
This will produce:
<ispinfo>
<connection desc="Largets TRelecome" number="5" name="Airtelll">
<ip_address>192.168.1.23</ip_address>
</connection>
</ispinfo>
A single line is proper, in the sense that a XML parser will understand it.
For pretty-printing to sys.stdout, use the dump method of Element.
For pretty-printing to a stream, use the write method of ElementTree.

Xpath select attribute of current node?

I use python with lxml to process the xml. After I query/filter to get to a nodes I want but I have some problem. How to get its attribute's value by xpath ? Here is my input example.
>print(etree.tostring(node, pretty_print=True ))
<rdf:li xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" rdf:resource="urn:miriam:obo.chebi:CHEBI%3A37671"/>
The value I want is in resource=... . Currently I just use the lxml to get the value. I wonder if it is possible to do in pure xpath ? thanks
EDIT: Forgot to said, this is not a root nodes so I can't use // here. I have like 2000-3000 others in xml file. My first attempt was playing around with ".#attrib" and "self::*#" but those does not seems to work.
EDIT2: I will try my best to explain (well, this is my first time to deal with xml problem using xpath. and english is not one of my favorite field....). Here is my input snippet http://pastebin.com/kZmVdbQQ (full one from here http://www.comp-sys-bio.org/yeastnet/ using version 4).
In my code, I try to get speciesTypes node with resource link chebi (<rdf:li rdf:resource="urn:miriam:obo.chebi:...."/>). and then I tried to get value from rdf:resource attribute in rdf:li. The thing is, I am pretty sure it would be easy to get attribute in child node if I start from parent node like speciesTypes, but I wonder how to do if I start from rdf:li. From my understanding, the "//" in xpath will looking for node from everywhere not just only in the current node.
below is my code
import lxml.etree as etree
tree = etree.parse("yeast_4.02.xml")
root = tree.getroot()
ns = {"sbml": "http://www.sbml.org/sbml/level2/version4",
"rdf":"http://www.w3.org/1999/02/22-rdf-syntax-ns#",
"body":"http://www.w3.org/1999/xhtml",
"re": "http://exslt.org/regular-expressions"
}
#good enough for now
maybemeta = root.xpath("//sbml:speciesType[descendant::rdf:li[starts-with(#rdf:resource, 'urn:miriam:obo.chebi') and not(starts-with(#rdf:resource, 'urn:miriam:uniprot'))]]", namespaces = ns)
def extract_name_and_chebi(node):
name = node.attrib['name']
chebies = node.xpath("./sbml:annotation//rdf:li[starts-with(#rdf:resource, 'urn:miriam:obo.chebi') and not(starts-with(#rdf:resource, 'urn:miriam:uniprot'))]", namespaces=ns) #get all rdf:li node with chebi resource
assert len(chebies) == 1
#my current solution to get rdf:resource value from rdf:li node
rdfNS = "{" + ns.get('rdf') + "}"
chebi = chebies[0].attrib[rdfNS + 'resource']
#do protein later
return (name, chebi)
metaWithChebi = map(extract_name_and_chebi, maybemeta)
fo = open("metabolites.txt", "w")
for name, chebi in metaWithChebi:
fo.write("{0}\t{1}\n".format(name, chebi))
Prefix the attribute name with # in the XPath query:
>>> from lxml import etree
>>> xml = """\
... <?xml version="1.0" encoding="utf8"?>
... <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
... <rdf:li rdf:resource="urn:miriam:obo.chebi:CHEBI%3A37671"/>
... </rdf:RDF>
... """
>>> tree = etree.fromstring(xml)
>>> ns = {'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'}
>>> tree.xpath('//rdf:li/#rdf:resource', namespaces=ns)
['urn:miriam:obo.chebi:CHEBI%3A37671']
EDIT
Here's a revised version of the script in the question:
import lxml.etree as etree
ns = {
'sbml': 'http://www.sbml.org/sbml/level2/version4',
'rdf':'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
'body':'http://www.w3.org/1999/xhtml',
're': 'http://exslt.org/regular-expressions',
}
def extract_name_and_chebi(node):
chebies = node.xpath("""
.//rdf:li[
starts-with(#rdf:resource, 'urn:miriam:obo.chebi')
]/#rdf:resource
""", namespaces=ns)
return node.attrib['name'], chebies[0]
with open('yeast_4.02.xml') as xml:
tree = etree.parse(xml)
maybemeta = tree.xpath("""
//sbml:speciesType[descendant::rdf:li[
starts-with(#rdf:resource, 'urn:miriam:obo.chebi')]]
""", namespaces = ns)
with open('metabolites.txt', 'w') as output:
for node in maybemeta:
output.write('%s\t%s\n' % extract_name_and_chebi(node))
To select off the current node its attribute named rdf:resource, use this XPath expression:
#rdf:resource
In order for this to "work correctly" you must register the association of the prefix "rdf:" to the corresponding namespace.
If you don't know how to register the rdf namespace, it is still possible to select the attribute -- with this XPath expression:
#*[name()='rdf:resource']
Well, I got it. The xpath expression I need here is "./#rdf:resource" not ".#rdf:resource". But why ? I thought "./" indicate the child of current node.

Categories