How to replace the text inside an XML element? - python

Given the following xml:
<!-- file.xml -->
<video>
<original_spoken_locale>en-US</original_spoken_locale>
<another_tag>somevalue</another_tag>
</video>
What would be the best way to replace the value inside of the <original_spoken_locale> tag? If I did know the value, I could use something like:
# Naive approach: plain-text substitution. It only works when the current
# value ('en-US') is known in advance.
with open('file.xml', 'r') as file:
    contents = file.read()
new_contents = contents.replace('en-US', 'new-value')
with open('file.xml', 'w') as file:
    file.write(new_contents)
However, in this case, I don't know what the value will be.

This is fairly easy with ElementTree. Just replace the value of the text attribute of your element:
>>> from xml.etree.ElementTree import parse, tostring
>>> doc = parse('file.xml')
>>> elem = doc.findall('original_spoken_locale')[0]
>>> elem.text = 'new-value'
>>> print tostring(doc.getroot())
<video>
<original_spoken_locale>new-value</original_spoken_locale>
<another_tag>somevalue</another_tag>
</video>
This is safer, too, since en-US could also appear in other places in your document.

Related

Parsing Autosar xml using beautiful soup python 3

I am trying to parse AUTOSAR specific arxml (similar to xml file) using Python but I am unable to read the contents of the file. I want to get the DEFINITION-REF values of definitions inside multiple ECUC-CONTAINER-VALUE tags eg:
/AUTOSAR/ecucdef/BswM/BswMConfig/BswMArbitration/BswMLogicalExpression/BswMArgumentRef
I tried multiple ways but I am unable to print out the contents.
from bs4 import BeautifulSoup as Soup


def parseArxml():
    """Print every ECUC-CONTAINER-VALUE element found in input.arxml.

    NOTE: the "html.parser" backend and the upper-case tag name are kept
    exactly as posted; that combination is what the question is about.
    """
    # Read the whole file; the context manager closes the handle afterwards.
    with open('input.arxml') as arxml_file:
        handler = arxml_file.read()
    soup = Soup(handler, "html.parser")
    for ecuc_container in soup.findAll('ECUC-CONTAINER-VALUE'):
        print(ecuc_container)


if __name__ == "__main__":
    parseArxml()
Here is a part of the arxml file:
<?xml version="1.0" encoding="UTF-8"?>
<AUTOSAR xmlns="http://autosar.org/schema/r4.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://autosar.org/schema/r4.0 autosar_4-2-1.xsd">
<ECUC-CONTAINER-VALUE UUID="c112c504-e546-41c3-abf9-0aaf06b18284">
<SHORT-NAME>BswMLogicalExpression_2</SHORT-NAME>
<DEFINITION-REF DEST="ECUC-PARAM-CONF-CONTAINER-DEF">/AUTOSAR/ecucdef/BswM/BswMConfig/BswMArbitration/BswMLogicalExpression</DEFINITION-REF>
<REFERENCE-VALUES>
<ECUC-REFERENCE-VALUE>
<DEFINITION-REF DEST="ECUC-CHOICE-REFERENCE-DEF">/AUTOSAR/ecucdef/BswM/BswMConfig/BswMArbitration/BswMLogicalExpression/BswMArgumentRef</DEFINITION-REF>
<VALUE-REF DEST="ECUC-CONTAINER-VALUE">/ARRoot/BswM_0/BswMConfig_0/BswMArbitration_0/BswMModeCondition_2</VALUE-REF>
</ECUC-REFERENCE-VALUE>
</REFERENCE-VALUES>
</ECUC-CONTAINER-VALUE>
<ECUC-CONTAINER-VALUE UUID="c112c504-e546-41c3-abf9-0aaf06b18284">
<SHORT-NAME>BswMLogicalExpression_3</SHORT-NAME>
<DEFINITION-REF DEST="ECUC-PARAM-CONF-CONTAINER-DEF">/AUTOSAR/ecucdef/BswM/BswMConfig/BswMArbitration/BswMLogicalExpression</DEFINITION-REF>
<REFERENCE-VALUES>
<ECUC-REFERENCE-VALUE>
<DEFINITION-REF DEST="ECUC-CHOICE-REFERENCE-DEF">/AUTOSAR/ecucdef/BswM/BswMConfig/BswMArbitration/BswMLogicalExpression/BswMArgumentRef</DEFINITION-REF>
<VALUE-REF DEST="ECUC-CONTAINER-VALUE">/ARRoot/BswM_2/BswMConfig_2/BswMArbitration_2/BswMModeCondition_3</VALUE-REF>
</ECUC-REFERENCE-VALUE>
</REFERENCE-VALUES>
</ECUC-CONTAINER-VALUE>
</AUTOSAR>
You'll see with print(soup) that tag names were converted to lower-case by the parser. So use lowercase when searching for tag names:
for ecuc_container in soup.findAll('ECUC-CONTAINER-VALUE'.lower()):
or simply:
for ecuc_container in soup.findAll('ecuc-container-value'):
Or even better: explicitly parse the document as XML, so that the case of tags is not modified:
soup = Soup(handler,'xml')
Here's how you can get a list of the text inside <DEFINITION-REF DEST="ECUC-CHOICE-REFERENCE-DEF"> elements:
def parseArxml():
    """Print the text of every DEFINITION-REF whose DEST is ECUC-CHOICE-REFERENCE-DEF.

    The document is read from input.arxml and parsed with the 'xml' parser,
    so the upper-case tag name is used for the lookup.
    """
    with open('input.arxml') as arxml_file:
        handler = arxml_file.read()
    soup = Soup(handler, 'xml')
    dest = [
        d.text
        for d in soup.findAll('DEFINITION-REF')
        # .get() skips an element without a DEST attribute instead of
        # raising KeyError.
        if d.get('DEST') == 'ECUC-CHOICE-REFERENCE-DEF'
    ]
    print(dest)
Output:
['/AUTOSAR/ecucdef/BswM/BswMConfig/BswMArbitration/BswMLogicalExpression/BswMArgumentRef',
'/AUTOSAR/ecucdef/BswM/BswMConfig/BswMArbitration/BswMLogicalExpression/BswMArgumentRef']
Or if you want to get all definition-ref tags regardless of attribute, use
# With the 'xml' parser the case of tag names is not modified, so the
# upper-case name has to be used here.
dest = [d.text for d in soup.findAll('DEFINITION-REF')]
Seems your parser and BeautifulSoup version is converting tags to lowercase.
You should do this:
from bs4 import BeautifulSoup as Soup


def parseArxml():
    """Print the text of each definition-ref inside every ecuc-container-value."""
    with open('input.arxml') as arxml_file:
        handler = arxml_file.read()
    # Lower-case names are used because the tags come back lower-cased from
    # this parser.
    soup = Soup(handler, "html.parser")
    for ecuc_container in soup.find_all('ecuc-container-value'):
        for def_ref in ecuc_container.find_all('definition-ref'):
            print(def_ref.get_text())


if __name__ == "__main__":
    parseArxml()
OUTPUT:
/AUTOSAR/ecucdef/BswM/BswMConfig/BswMArbitration/BswMLogicalExpression
/AUTOSAR/ecucdef/BswM/BswMConfig/BswMArbitration/BswMLogicalExpression/BswMArgumentRef
/AUTOSAR/ecucdef/BswM/BswMConfig/BswMArbitration/BswMLogicalExpression
/AUTOSAR/ecucdef/BswM/BswMConfig/BswMArbitration/BswMLogicalExpression/BswMArgumentRef

How to get the xml element as a string with namespace using ElementTree in python?

I need to get the elements from xml as a string. I am trying with below xml format.
<xml>
<prot:data xmlns:prot="prot">
<product-id-template>
<prot:ProductId>PRODUCT_ID</prot:ProductId>
</product-id-template>
<product-name-template>
<prot:ProductName>PRODUCT_NAME</prot:ProductName>
</product-name-template>
<dealer-template>
<xsi:Dealer xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">DEALER</xsi:Dealer>
</dealer-template>
</prot:data>
</xml>
And I tried with below code:
from xml.etree import ElementTree as ET


def get_template(xpath, namespaces):
    """Return the first element of cdata.xml that matches *xpath*.

    Args:
        xpath: ElementTree path expression, evaluated from the root element.
        namespaces: mapping of prefix to namespace URI used in *xpath*.

    Returns:
        The first matching element, or None when nothing matches.
    """
    tree = ET.parse('cdata.xml')
    # find() yields the first match or None, which is what the original
    # "return inside the loop" amounted to.
    return tree.getroot().find(xpath, namespaces=namespaces)


namespace = {"prot": "prot"}
aa = get_template(".//prot:ProductId", namespace)
print(ET.tostring(aa).decode())
Actual output:
<ns0:ProductId xmlns:ns0="prot">PRODUCT_ID</ns0:ProductId>
Expected output:
<prot:ProductId>PRODUCT_ID</prot:ProductId>
The xmlns declaration must be kept wherever it is present in the original document, and must not appear wherever it is absent. For example, the element inside product-id-template has no xmlns declaration, so it should be retrieved without one, while the element inside dealer-template declares xmlns:xsi, so it should be retrieved with it.
How to achieve this?
You can remove xmlns with regex.
import re
# ...
with_ns = ET.tostring(aa).decode()
# Raw string: "\w" inside a normal string literal is an invalid escape
# sequence. The pattern drops every xmlns / xmlns:prefix declaration.
no_ns = re.sub(r' xmlns(:\w+)?="[^"]+"', '', with_ns)
print(no_ns)
UPDATE: You can do a very wild thing. Although I can't recommend it, because I'm not a Python expert.
I just checked the source code and found that I can do this hack:
def my_serialize_xml(write, elem, qnames, namespaces,
                     short_empty_elements, **kwargs):
    """Serialize *elem* like ElementTree does, but without xmlns attributes.

    The received *namespaces* mapping is deliberately ignored and None is
    passed on instead.

    NOTE: this relies on the private names ET._serialize_xml and
    ET._serialize, which are not part of the documented API.
    """
    ET._serialize_xml(write, elem, qnames,
                      None, short_empty_elements, **kwargs)


# Route the "xml" output method through the wrapper above.
ET._serialize["xml"] = my_serialize_xml
I just defined my_serialize_xml, which calls ElementTree._serialize_xml with namespaces=None. And then, in dictionary ElementTree._serialize, I changed value for key "xml" to my_serialize_xml. So when you call ElementTree.tostring, it will use my_serialize_xml.
If you want to try it, just place the code(above) after from xml.etree import ElementTree as ET (but before using the ET).

xml file parsing in python

xml file :
<global>
<rtmp>
<fcsapp>
<password>
<key>hello123</key>
<key>check123</key>
</password>
</fcsapp>
</rtmp>
</global>
python code : To obtain all the key tag values.
hello123
check123
using xml.etree.ElementTree
for streams in xmlRoot.iter('global'):
    xpath = "/rtmp/fcsapp/password"
    tag = "key"
    for child in streams.findall(xpath):
        # find() returns only the first matching child, so a single <key>
        # is collected per <password>; this is the behaviour being asked about.
        resultlist.append(child.find(tag).text)
print(resultlist)
The output obtained is [hello123], but I want it to display both ([hello123, check123])
How do I obtain this?
Using lxml and cssselect I would do it like this:
>>> from lxml.html import fromstring
>>> doc = fromstring(open("foo.xml", "r").read())
>>> doc.cssselect("password key")
[<Element key at 0x7f77a6786cb0>, <Element key at 0x7f77a6786d70>]
>>> [e.text for e in doc.cssselect("password key")]
['hello123 \n ', 'check123 \n ']
With lxml and xpath You can do it in the following way:
from lxml import etree

# Sample document holding two <key> values under <password>.
xml = """
<global>
<rtmp>
<fcsapp>
<password>
<key>hello123</key>
<key>check123</key>
</password>
</fcsapp>
</rtmp>
</global>
"""
tree = etree.fromstring(xml)
# text() selects the text of every <key> directly under any <password>.
result = tree.xpath('//password/key/text()')
print(result)  # ['hello123', 'check123']
try beautifulsoup package "https://pypi.python.org/pypi/BeautifulSoup"
using xml.etree.ElementTree
tag = "key"
for streams in xmlRoot.iter('global'):
    # iter(tag) visits every <key> below this element, not just the first.
    resultlist.extend(child.text for child in streams.iter(tag))
print(resultlist)
You have to iterate over the "key" tags in the for loop to obtain the desired result. The code above solves the problem.

lxml classic: Get text content except for that of nested tags?

This must be an absolute classic, but I can't find the answer here. I'm parsing the following tag with lxml cssselect:
<li><span class="num">3</span> Detroit</li>
I want to get the content of the <li> tag without the content of the <span> tag.
Currently I have:
stop_list = doc.cssselect('ol#stations li a')
start = stop_list[0].text_content().strip()
But that gives me 3 Detroit. How can I just get Detroit?
For your example, I think going with XPath is cleaner and easier than CSS:
>>> xml = '<li><span class="num">3</span> Detroit</li>'
>>> root = etree.fromstring(xml)
>>> print( root.xpath('/li/a/text()'))
[' Detroit']
>>> xml = '<li>I <span>FooBar!</span> love <span class="num">3</span> Detroit</li>'
>>> root = etree.fromstring(xml)
>>> print( root.xpath('/li/a/text()'))
['I ', ' love ', ' Detroit']
>>> ' '.join([x.strip() for x in root.xpath('/li/a/text()')])
'I love Detroit'
itertext method of an element returns an iterator of node's text data. For your <a> tag, ' Detroit' would be the 2nd value returned by the iterator. If structure of your document always conforms to a known specification, you could skip specific text elements to get what you need.
from lxml import html

# The sample carries the <a> element that the 'li a' selector looks for.
doc = html.fromstring("""<li><a><span class="num">3</span> Detroit</a></li>""")
stop_nodes = doc.cssselect('li a')
stop_names = []
for stop in stop_nodes:
    node_text = stop.itertext()
    next(node_text)  # Skip '3'
    # The second text chunk is the station name; drop its leading blank.
    stop_names.append(next(node_text).lstrip())
You can combine css selector with the xpath text() function mentioned in Zachary's answer like this (If you're more comfortable with using CSS selectors than xpath):
# xpath('text()') returns a list of text nodes, so join them before stripping.
stop_names = [''.join(a.xpath('text()')).lstrip() for a in doc.cssselect('li a')]

XML writing tools for Python

I'm currently trying ElementTree and it looks fine, it escapes HTML entities and so on and so forth. Am I missing something truly wonderful I haven't heard of?
This is similar to what I'm actually doing:
import xml.etree.ElementTree as ET

# Build <html><head><script/></head><body><h1/></body></html> element by element.
root = ET.Element('html')
head = ET.SubElement(root, 'head')
script = ET.SubElement(head, 'script')
script.set('type', 'text/javascript')
# The mis-encoded character is restored as the same '&aacute;' text that the
# other examples in this document use.
script.text = "var a = 'I love &aacute; letters'"
body = ET.SubElement(root, 'body')
h1 = ET.SubElement(body, 'h1')
h1.text = "And I like the fact that 3 > 1"
tree = ET.ElementTree(root)
tree.write('foo.xhtml')
# more foo.xhtml
<html><head><script type="text/javascript">var a = 'I love &aacute;
letters'</script></head><body><h1>And I like the fact that 3 > 1</h1>
</body></html>
Another way is using the E Factory builder from lxml (available in Elementtree too)
>>> from lxml import etree
>>> from lxml.builder import E
>>> def CLASS(*args): # class is a reserved word in Python
... return {"class":' '.join(args)}
>>> html = page = (
... E.html( # create an Element called "html"
... E.head(
... E.title("This is a sample document")
... ),
... E.body(
... E.h1("Hello!", CLASS("title")),
... E.p("This is a paragraph with ", E.b("bold"), " text in it!"),
... E.p("This is another paragraph, with a", "\n ",
... E.a("link", href="http://www.python.org"), "."),
... E.p("Here are some reserved characters: <spam&egg>."),
... etree.XML("<p>And finally an embedded XHTML fragment.</p>"),
... )
... )
... )
>>> print(etree.tostring(page, pretty_print=True))
<html>
<head>
<title>This is a sample document</title>
</head>
<body>
<h1 class="title">Hello!</h1>
<p>This is a paragraph with <b>bold</b> text in it!</p>
<p>This is another paragraph, with a
link.</p>
<p>Here are some reserved characters: &lt;spam&amp;egg&gt;.</p>
<p>And finally an embedded XHTML fragment.</p>
</body>
</html>
There's always SimpleXMLWriter, part of the ElementTree toolkit. The interface is dead simple.
Here's an example:
from elementtree.SimpleXMLWriter import XMLWriter
import sys
# The writer is bound to standard output.
w = XMLWriter(sys.stdout)
# Keep the value returned for the root element; it is handed to close() below.
html = w.start("html")
w.start("head")
w.element("title", "my document")
w.element("meta", name="generator", value="my application 1.0")
# NOTE(review): end() is called without a tag name; presumably it closes the
# element opened last ("head"). Confirm against the SimpleXMLWriter docs.
w.end()
w.start("body")
w.element("h1", "this is a heading")
w.element("p", "this is a paragraph")
# A paragraph with mixed content: text interleaved with inline elements.
w.start("p")
w.data("this is ")
w.element("b", "bold")
w.data(" and ")
w.element("i", "italic")
w.data(".")
w.end("p")
# NOTE(review): "body" is never ended explicitly; presumably close(html)
# closes everything still open up to the root. Confirm.
w.close(html)
I assume that you're actually creating an XML DOM tree, because you want to validate that what goes into this file is valid XML, since otherwise you'd just write a static string to a file. If validating your output is indeed your goal, then I'd suggest
from xml.dom.minidom import parseString

# parseString() raises if the markup is not well-formed, so building the
# document this way doubles as validation.
# '&aacute;' is not an entity XML defines on its own, so a bare reference is
# rejected by the parser; the ampersand is escaped to keep the text literal.
doc = parseString("""<html>
<head>
<script type="text/javascript">
var a = 'I love &amp;aacute; letters'
</script>
</head>
<body>
<h1>And I like the fact that 3 > 1</h1>
</body>
</html>""")
# An explicit encoding keeps the write independent of the platform default.
with open("foo.xhtml", "w", encoding="utf-8") as f:
    f.write(doc.toxml())
This lets you just write the XML you want to output, validate that it's correct (since parseString will raise an exception if it's invalid) and have your code look much nicer.
Presumably you're not just writing the same static XML every time and want some substitution. In this case I'd have lines like
var a = '%(message)s'
and then use the % operator to do the substitution, like
</html>""" % {"message": "I love &aacute; letters"})
https://github.com/galvez/xmlwitch:
import xmlwitch

xml = xmlwitch.Builder(version='1.0', encoding='utf-8')
# Each "with" block opens an element; the calls inside it create children.
with xml.feed(xmlns='http://www.w3.org/2005/Atom'):
    xml.title('Example Feed')
    xml.updated('2003-12-13T18:30:02Z')
    with xml.author:
        xml.name('John Doe')
    xml.id('urn:uuid:60a76c80-d399-11d9-b93C-0003939e0af6')
    with xml.entry:
        xml.title('Atom-Powered Robots Run Amok')
        xml.id('urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a')
        xml.updated('2003-12-13T18:30:02Z')
        xml.summary('Some text.')
print(xml)
don't you actually want something like:
html(head(script(type='text/javascript', content='var a = ...')),
     body(h1('And I like the fact that 3 < 1'), p('just some paragraph')))
I think I saw something like that somewhere. This would be wonderful.
EDIT: Actually, I went and wrote a library today to do just that: magictree
You can use it like this:
import xml.etree.ElementTree as ET

from magictree import html, head, script, body, h1, p

root = html(
    head(
        # Double-quoted so the JavaScript string keeps its closing quote.
        script("var a = 'I love &aacute; letters'",
               type='text/javascript')),
    body(
        h1('And I like the fact that 3 > 1')))
# root is a plain Element object, like those created with ET.Element...
# so you can write it out using ElementTree :)
tree = ET.ElementTree(root)
tree.write('foo.xhtml')
The magic in magictree lies in how the importing works: The Element factories are created when needed. Have a look at the source, it is based on an answer to another StackOverflow question.
I ended up using saxutils.escape(str) to generate valid XML strings and then validating it with Eli's approach to be sure I didn't miss any tag
from xml.sax import saxutils
from xml.dom.minidom import parseString
from xml.parsers.expat import ExpatError

# escape() leaves double quotes alone by default; values placed inside
# double-quoted attributes need them replaced as well.
_ATTR_ENTITIES = {'"': '&quot;'}

# The parentheses keep the template and its "%" arguments in one expression.
xml = ('''<?xml version="1.0" encoding="%s"?>\n
<contents title="%s" crawl_date="%s" in_text_date="%s"
url="%s">\n<main_post>%s</main_post>\n</contents>''' %
       (self.encoding,
        saxutils.escape(title, _ATTR_ENTITIES),
        saxutils.escape(time, _ATTR_ENTITIES),
        saxutils.escape(date, _ATTR_ENTITIES),
        saxutils.escape(url, _ATTR_ENTITIES),
        saxutils.escape(contents)))
try:
    # Parsing the generated text is the well-formedness check.
    minidoc = parseString(xml)
except ExpatError:
    print("Invalid xml")
For anyone encountering this now, there's actually a way to do this hidden away in Python's standard library: xml.sax.saxutils.XMLGenerator. Here's an example of it in action:
>>> from xml.sax.saxutils import XMLGenerator
>>> import StringIO
>>> w = XMLGenerator(out, 'utf-8')
>>> w.startDocument()
>>> w.startElement("test", {'bar': 'baz'})
>>> w.characters("Foo")
>>> w.endElement("test")
>>> w.endDocument()
>>> print out.getvalue()
<?xml version="1.0" encoding="utf-8"?>
<test bar="baz">Foo</test>
Try http://uche.ogbuji.net/tech/4suite/amara. It is quite complete and has a straight forward set of access tools. Normal Unicode support, etc.
#
#Output the XML entry
#
def genFileOLD(out, label, term, idval):
    """Write one Atom ``entry`` element to *out* through a MarkupWriter.

    Args:
        out: output target handed to MarkupWriter.
        label: text used for the title, the summary, the category label and
            the XHTML heading; replaced by user input when ``prompt`` is truthy.
        term: category term; replaced by user input when ``prompt`` is truthy.
        idval: accepted but not used by this function.

    Relies on names defined outside this block: MarkupWriter, entryTime,
    dtime, afn and prompt. The code targets Python 2 (unicode, raw_input).
    """
    # NOTE(review): filename is never used below; the call is kept in case
    # entryTime() has side effects. Confirm and remove if it does not.
    filename = entryTime() + ".html"
    writer = MarkupWriter(out, indent=u"yes")
    writer.startDocument()
    # Test element and attribute writing
    ans = u'http://www.w3.org/2005/Atom'    # namespace of the Atom elements
    xns = u'http://www.w3.org/1999/xhtml'   # namespace of the embedded XHTML
    writer.startElement(u'entry',
                        ans,
                        extraNss={u'x': u'http://www.w3.org/1999/xhtml',
                                  u'dc': u'http://purl.org/dc/elements/1.1'})
    writer.simpleElement(u'title', ans, content=unicode(label))
    # Named entry_id so the builtin id() is not shadowed.
    entry_id = unicode("http://www.dpawson.co.uk/nodesets/" + afn.split(".")[0])
    writer.simpleElement(u'id', ans, content=entry_id)
    writer.simpleElement(u'updated', ans, content=unicode(dtime()))

    writer.startElement(u'author', ans)
    writer.simpleElement(u'name', ans, content=u'Dave ')
    writer.simpleElement(u'uri', ans,
                         content=u'http://www.dpawson.co.uk/nodesets/' + afn + ".xml")
    writer.endElement(u'author')

    writer.startElement(u'category', ans)
    # When prompting is on, the user's answers replace the passed-in values.
    if prompt:
        label = unicode(raw_input("Enter label "))
    writer.attribute(u'label', unicode(label))
    if prompt:
        term = unicode(raw_input("Enter term to use "))
    writer.attribute(u'term', unicode(term))
    writer.endElement(u'category')

    writer.simpleElement(u'rights', ans, content=u'\u00A9 Dave 2005-2008')

    writer.startElement(u'link', ans)
    writer.attribute(u'href',
                     unicode("http://www.dpawson.co.uk/nodesets/entries/" + afn + ".html"))
    writer.attribute(u'rel', unicode("alternate"))
    writer.endElement(u'link')

    writer.startElement(u'published', ans)
    dt = dtime()
    dtu = unicode(dt)
    writer.text(dtu)
    writer.endElement(u'published')

    writer.simpleElement(u'summary', ans, content=unicode(label))

    # The entry content is an XHTML <div> holding a single <h3> heading.
    writer.startElement(u'content', ans)
    writer.attribute(u'type', unicode("xhtml"))
    writer.startElement(u'div', xns)
    writer.simpleElement(u'h3', xns, content=unicode(label))
    writer.endElement(u'div')
    writer.endElement(u'content')
    writer.endElement(u'entry')

Categories