XML writing tools for Python - python

I'm currently trying ElementTree and it looks fine, it escapes HTML entities and so on and so forth. Am I missing something truly wonderful I haven't heard of?
This is similar to what I'm actually doing:
import xml.etree.ElementTree as ET
root = ET.Element('html')
head = ET.SubElement(root,'head')
script = ET.SubElement(head,'script')
script.set('type','text/javascript')
script.text = "var a = 'I love รก letters'"
body = ET.SubElement(root,'body')
h1 = ET.SubElement(body,'h1')
h1.text = "And I like the fact that 3 > 1"
tree = ET.ElementTree(root)
tree.write('foo.xhtml')
# more foo.xhtml
<html><head><script type="text/javascript">var a = 'I love &aacute;
letters'</script></head><body><h1>And I like the fact that 3 > 1</h1>
</body></html>

Another way is using the E Factory builder from lxml (available in Elementtree too)
>>> from lxml import etree
>>> from lxml.builder import E
>>> def CLASS(*args): # class is a reserved word in Python
... return {"class":' '.join(args)}
>>> html = page = (
... E.html( # create an Element called "html"
... E.head(
... E.title("This is a sample document")
... ),
... E.body(
... E.h1("Hello!", CLASS("title")),
... E.p("This is a paragraph with ", E.b("bold"), " text in it!"),
... E.p("This is another paragraph, with a", "\n ",
... E.a("link", href="http://www.python.org"), "."),
... E.p("Here are some reserved characters: <spam&egg>."),
... etree.XML("<p>And finally an embedded XHTML fragment.</p>"),
... )
... )
... )
>>> print(etree.tostring(page, pretty_print=True))
<html>
<head>
<title>This is a sample document</title>
</head>
<body>
<h1 class="title">Hello!</h1>
<p>This is a paragraph with <b>bold</b> text in it!</p>
<p>This is another paragraph, with a
link.</p>
<p>Here are some reservered characters: <spam&egg>.</p>
<p>And finally an embedded XHTML fragment.</p>
</body>
</html>

There's always SimpleXMLWriter, part of the ElementTree toolkit. The interface is dead simple.
Here's an example:
from elementtree.SimpleXMLWriter import XMLWriter
import sys
w = XMLWriter(sys.stdout)
html = w.start("html")
w.start("head")
w.element("title", "my document")
w.element("meta", name="generator", value="my application 1.0")
w.end()
w.start("body")
w.element("h1", "this is a heading")
w.element("p", "this is a paragraph")
w.start("p")
w.data("this is ")
w.element("b", "bold")
w.data(" and ")
w.element("i", "italic")
w.data(".")
w.end("p")
w.close(html)

I assume that you're actually creating an XML DOM tree, because you want to validate that what goes into this file is valid XML, since otherwise you'd just write a static string to a file. If validating your output is indeed your goal, then I'd suggest
from xml.dom.minidom import parseString
doc = parseString("""<html>
<head>
<script type="text/javascript">
var a = 'I love &aacute; letters'
</script>
</head>
<body>
<h1>And I like the fact that 3 > 1</h1>
</body>
</html>""")
with open("foo.xhtml", "w") as f:
f.write( doc.toxml() )
This lets you just write the XML you want to output, validate that it's correct (since parseString will raise an exception if it's invalid) and have your code look much nicer.
Presumably you're not just writing the same static XML every time and want some substitution. In this case I'd have lines like
var a = '%(message)s'
and then use the % operator to do the substitution, like
</html>""" % {"message": "I love &aacute; letters"})

https://github.com/galvez/xmlwitch:
import xmlwitch
xml = xmlwitch.Builder(version='1.0', encoding='utf-8')
with xml.feed(xmlns='http://www.w3.org/2005/Atom'):
xml.title('Example Feed')
xml.updated('2003-12-13T18:30:02Z')
with xml.author:
xml.name('John Doe')
xml.id('urn:uuid:60a76c80-d399-11d9-b93C-0003939e0af6')
with xml.entry:
xml.title('Atom-Powered Robots Run Amok')
xml.id('urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a')
xml.updated('2003-12-13T18:30:02Z')
xml.summary('Some text.')
print(xml)

don't you actually want something like:
html(head(script(type='text/javascript', content='var a = ...')),
body(h1('And I like the fact that 3 < 1'), p('just some paragraph'))
I think I saw something like that somewhere. This would be wonderful.
EDIT: Actually, I went and wrote a library today to do just that: magictree
You can use it like this:
from magictree import html, head, script, body, h1, p
root = html(
head(
script('''var a = 'I love &aacute; letters''',
type='text/javascript')),
body(
h1('And I like the fact that 3 > 1')))
# root is a plain Element object, like those created with ET.Element...
# so you can write it out using ElementTree :)
tree = ET.ElementTree(root)
tree.write('foo.xhtml')
The magic in magictree lies in how the importing works: The Element factories are created when needed. Have a look at the source, it is based on an answer to another StackOverflow question.

I ended up using saxutils.escape(str) to generate valid XML strings and then validating it with Eli's approach to be sure I didn't miss any tag
from xml.sax import saxutils
from xml.dom.minidom import parseString
from xml.parsers.expat import ExpatError
xml = '''<?xml version="1.0" encoding="%s"?>\n
<contents title="%s" crawl_date="%s" in_text_date="%s"
url="%s">\n<main_post>%s</main_post>\n</contents>''' %
(self.encoding, saxutils.escape(title), saxutils.escape(time),
saxutils.escape(date), saxutils.escape(url), saxutils.escape(contents))
try:
minidoc = parseString(xml)
catch ExpatError:
print "Invalid xml"

For anyone encountering this now, there's actually a way to do this hidden away in Python's standard library in xml.sax.utils.XMLGenerator. Here's an example of it in action:
>>> from xml.sax.saxutils import XMLGenerator
>>> import StringIO
>>> w = XMLGenerator(out, 'utf-8')
>>> w.startDocument()
>>> w.startElement("test", {'bar': 'baz'})
>>> w.characters("Foo")
>>> w.endElement("test")
>>> w.endDocument()
>>> print out.getvalue()
<?xml version="1.0" encoding="utf-8"?>
<test bar="baz">Foo</test>

Try http://uche.ogbuji.net/tech/4suite/amara. It is quite complete and has a straight forward set of access tools. Normal Unicode support, etc.
#
#Output the XML entry
#
def genFileOLD(out,label,term,idval):
filename=entryTime() + ".html"
writer=MarkupWriter(out, indent=u"yes")
writer.startDocument()
#Test element and attribute writing
ans=namespace=u'http://www.w3.org/2005/Atom'
xns=namespace=u'http://www.w3.org/1999/xhtml'
writer.startElement(u'entry',
ans,
extraNss={u'x':u'http://www.w3.org/1999/xhtml' ,
u'dc':u'http://purl.org/dc/elements/1.1'})
#u'a':u'http://www.w3.org/2005/Atom',
#writer.attribute(u'xml:lang',unicode("en-UK"))
writer.simpleElement(u'title',ans,content=unicode(label))
#writer.simpleElement(u'a:subtitle',ans,content=u' ')
id=unicode("http://www.dpawson.co.uk/nodesets/"+afn.split(".")[0])
writer.simpleElement(u'id',ans,content=id)
writer.simpleElement(u'updated',ans,content=unicode(dtime()))
writer.startElement(u'author',ans)
writer.simpleElement(u'name',ans,content=u'Dave ')
writer.simpleElement(u'uri',ans,
content=u'http://www.dpawson.co.uk/nodesets/'+afn+".xml")
writer.endElement(u'author')
writer.startElement(u'category', ans)
if (prompt):
label=unicode(raw_input("Enter label "))
writer.attribute(u'label',unicode(label))
if (prompt):
term = unicode(raw_input("Enter term to use "))
writer.attribute(u'term', unicode(term))
writer.endElement(u'category')
writer.simpleElement(u'rights',ans,content=u'\u00A9 Dave 2005-2008')
writer.startElement(u'link',ans)
writer.attribute(u'href',
unicode("http://www.dpawson.co.uk/nodesets/entries/"+afn+".html"))
writer.attribute(u'rel',unicode("alternate"))
writer.endElement(u'link')
writer.startElement(u'published', ans)
dt=dtime()
dtu=unicode(dt)
writer.text(dtu)
writer.endElement(u'published')
writer.simpleElement(u'summary',ans,content=unicode(label))
writer.startElement(u'content',ans)
writer.attribute(u'type',unicode("xhtml"))
writer.startElement(u'div',xns)
writer.simpleElement(u'h3',xns,content=unicode(label))
writer.endElement(u'div')
writer.endElement(u'content')
writer.endElement(u'entry')

Related

python parsing xml with ElementTree doesn't give interested result

I have an xml file like this
<?xml version="1.0"?>
<sample>
<text>My name is <b>Wrufesh</b>. What is yours?</text>
</sample>
I have a python code like this
import xml.etree.ElementTree as ET
tree = ET.parse('sample.xml')
root = tree.getroot()
for child in root:
print child.text()
I only get
'My name is' as an output.
I want to get
'My name is <b>Wrufesh</b>. What is yours?' as an output.
What can I do?
You can get your desired output using using ElementTree.tostringlist():
>>> import xml.etree.ElementTree as ET
>>> root = ET.parse('sample.xml').getroot()
>>> l = ET.tostringlist(root.find('text'))
>>> l
['<text', '>', 'My name is ', '<b', '>', 'Wrufesh', '</b>', '. What is yours?', '</text>', '\n']
>>> ''.join(l[2:-2])
'My name is <b>Wrufesh</b>. What is yours?'
I wonder though how practical this is going to be for generic use.
I don't think treating tag in xml as a string is right. You can access the text part of xml like this:
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import xml.etree.ElementTree as ET
tree = ET.parse('sample.xml')
root = tree.getroot()
text = root[0]
for i in text.itertext():
print i
# As you can see, `<b>` and `</b>` is a pair of tags but not strings.
print text._children
I would suggest pre-processing the xml file to wrap elements under <text> element in CDATA. You should be able to read the values without a problem afterwards.
<text><![CDATA[<My name is <b>Wrufesh</b>. What is yours?]]></text>

extract tag inside text element of an xml tag

Suppose I have an XML document of the following form
<root>
<foos>
<foo>the quick <bar>brown </bar>fox</foo>
</foos>
<!-- Lots more <foo></foo> -->
</root>
How do I extract the full text string the quick fox as well as the string brown?
import xml.etree.ElementTree as ET
doc = ET.parse(xmldocument).getroot()
foos = doc.find('foos')
for foo in foos:
print foo.text # This will print 'the quick '
Not sure how to solve this problem.
You can also try something like this, which iterates in all nested tags automatically:
foos = doc.find('foos')
for foo in foos:
for text in foo.itertext():
print text.strip(),
print
from scrapy.selector import XmlXPathSelector
xml = \
"""
<root>
<foos>
<foo>the quick <bar>brown </bar>fox</foo>
</foos>
</root>
"""
hxs =XmlXPathSelector(text=xml)
foos = hxs.select('//foos')
for one in foos:
text = one.select('./foo//text()').extract()
text = ''.join(text)
print text

How to replace the text inside an XML element?

Given the following xml:
<!-- file.xml -->
<video>
<original_spoken_locale>en-US</original_spoken_locale>
<another_tag>somevalue</another_tag>
</video>
What would be the best way to replace the value inside of the <original_spoken_locale> tag? If I did know the value, I could use something like:
with open('file.xml', 'r') as file:
contents = file.read()
new_contents = contents.replace('en-US, 'new-value')
with open('file.xml', 'w') as file:
file.write(new_contents)
However, in this case, I don't know what the value will be.
This is fairly easy with ElementTree. Just replace the value of the text attribute of your element:
>>> from xml.etree.ElementTree import parse, tostring
>>> doc = parse('file.xml')
>>> elem = doc.findall('original_spoken_locale')[0]
>>> elem.text = 'new-value'
>>> print tostring(doc.getroot())
<video>
<original_spoken_locale>new-value</original_spoken_locale>
<another_tag>somevalue</another_tag>
</video>
This is safer, too, since you can have en-US in another places of your document.

Get all text inside a tag in lxml

I'd like to write a code snippet that would grab all of the text inside the <content> tag, in lxml, in all three instances below, including the code tags. I've tried tostring(getchildren()) but that would miss the text in between the tags. I didn't have very much luck searching the API for a relevant function. Could you help me out?
<!--1-->
<content>
<div>Text inside tag</div>
</content>
#should return "<div>Text inside tag</div>
<!--2-->
<content>
Text with no tag
</content>
#should return "Text with no tag"
<!--3-->
<content>
Text outside tag <div>Text inside tag</div>
</content>
#should return "Text outside tag <div>Text inside tag</div>"
Just use the node.itertext() method, as in:
''.join(node.itertext())
Does text_content() do what you need?
Try:
def stringify_children(node):
from lxml.etree import tostring
from itertools import chain
parts = ([node.text] +
list(chain(*([c.text, tostring(c), c.tail] for c in node.getchildren()))) +
[node.tail])
# filter removes possible Nones in texts and tails
return ''.join(filter(None, parts))
Example:
from lxml import etree
node = etree.fromstring("""<content>
Text outside tag <div>Text <em>inside</em> tag</div>
</content>""")
stringify_children(node)
Produces: '\nText outside tag <div>Text <em>inside</em> tag</div>\n'
A version of albertov 's stringify-content that solves the bugs reported by hoju:
def stringify_children(node):
from lxml.etree import tostring
from itertools import chain
return ''.join(
chunk for chunk in chain(
(node.text,),
chain(*((tostring(child, with_tail=False), child.tail) for child in node.getchildren())),
(node.tail,)) if chunk)
The following snippet which uses python generators works perfectly and is very efficient.
''.join(node.itertext()).strip()
Defining stringify_children this way may be less complicated:
from lxml import etree
def stringify_children(node):
s = node.text
if s is None:
s = ''
for child in node:
s += etree.tostring(child, encoding='unicode')
return s
or in one line
return (node.text if node.text is not None else '') + ''.join((etree.tostring(child, encoding='unicode') for child in node))
Rationale is the same as in this answer: leave the serialization of child nodes to lxml. The tail part of node in this case isn't interesting since it is "behind" the end tag. Note that the encoding argument may be changed according to one's needs.
Another possible solution is to serialize the node itself and afterwards, strip the start and end tag away:
def stringify_children(node):
s = etree.tostring(node, encoding='unicode', with_tail=False)
return s[s.index(node.tag) + 1 + len(node.tag): s.rindex(node.tag) - 2]
which is somewhat horrible. This code is correct only if node has no attributes, and I don't think anyone would want to use it even then.
One of the simplest code snippets, that actually worked for me and as per documentation at http://lxml.de/tutorial.html#using-xpath-to-find-text is
etree.tostring(html, method="text")
where etree is a node/tag whose complete text, you are trying to read. Behold that it doesn't get rid of script and style tags though.
import urllib2
from lxml import etree
url = 'some_url'
getting url
test = urllib2.urlopen(url)
page = test.read()
getting all html code within including table tag
tree = etree.HTML(page)
xpath selector
table = tree.xpath("xpath_here")
res = etree.tostring(table)
res is the html code of table
this was doing job for me.
so you can extract the tags content with xpath_text() and tags including their content using tostring()
div = tree.xpath("//div")
div_res = etree.tostring(div)
text = tree.xpath_text("//content")
or text = tree.xpath("//content/text()")
div_3 = tree.xpath("//content")
div_3_res = etree.tostring(div_3).strip('<content>').rstrip('</')
this last line with strip method using is not nice, but it just works
Just a quick enhancement as the answer has been given. If you want to clean the inside text:
clean_string = ' '.join([n.strip() for n in node.itertext()]).strip()
In response to #Richard's comment above, if you patch stringify_children to read:
parts = ([node.text] +
-- list(chain(*([c.text, tostring(c), c.tail] for c in node.getchildren()))) +
++ list(chain(*([tostring(c)] for c in node.getchildren()))) +
[node.tail])
it seems to avoid the duplication he refers to.
I know that this is an old question, but this is a common problem and I have a solution that seems simpler than the ones suggested so far:
def stringify_children(node):
"""Given a LXML tag, return contents as a string
>>> html = "<p><strong>Sample sentence</strong> with tags.</p>"
>>> node = lxml.html.fragment_fromstring(html)
>>> extract_html_content(node)
"<strong>Sample sentence</strong> with tags."
"""
if node is None or (len(node) == 0 and not getattr(node, 'text', None)):
return ""
node.attrib.clear()
opening_tag = len(node.tag) + 2
closing_tag = -(len(node.tag) + 3)
return lxml.html.tostring(node)[opening_tag:closing_tag]
Unlike some of the other answers to this question this solution preserves all of tags contained within it and attacks the problem from a different angle than the other working solutions.
Here is a working solution. We can get content with a parent tag and then cut the parent tag from output.
import re
from lxml import etree
def _tostr_with_tags(parent_element, html_entities=False):
RE_CUT = r'^<([\w-]+)>(.*)</([\w-]+)>$'
content_with_parent = etree.tostring(parent_element)
def _replace_html_entities(s):
RE_ENTITY = r'&#(\d+);'
def repl(m):
return unichr(int(m.group(1)))
replaced = re.sub(RE_ENTITY, repl, s, flags=re.MULTILINE|re.UNICODE)
return replaced
if not html_entities:
content_with_parent = _replace_html_entities(content_with_parent)
content_with_parent = content_with_parent.strip() # remove 'white' characters on margins
start_tag, content_without_parent, end_tag = re.findall(RE_CUT, content_with_parent, flags=re.UNICODE|re.MULTILINE|re.DOTALL)[0]
if start_tag != end_tag:
raise Exception('Start tag does not match to end tag while getting content with tags.')
return content_without_parent
parent_element must have Element type.
Please note, that if you want text content (not html entities in text) please leave html_entities parameter as False.
lxml have a method for that:
node.text_content()
If this is an a tag, you can try:
node.values()
import re
from lxml import etree
node = etree.fromstring("""
<content>Text before inner tag
<div>Text
<em>inside</em>
tag
</div>
Text after inner tag
</content>""")
print re.search("\A<[^<>]*>(.*)</[^<>]*>\Z", etree.tostring(node), re.DOTALL).group(1)

How can I determine if it's ok to change this word... python script

The goal is to read through html files and change all instances of MyWord to Myword; except, must NOT change the word if it is found inside or as part of a path, file name or script:
href="..."
src="..."
url(...)
class="..."
id="..."
script inline or linked (file name) --> <script ...></script>
styles inline or linked (file name) --> <link ...> <style></style>
Now the question of all questions: how do you determine if the instance of the word is in a position where it's ok to change it? (or, how do you determine if the word is inside of one of the above listed locations and should not be changed?)
Here is my code, it can be changed to read line by line, etc. but I just can not think of how to define and enforce a rule to match above...
Here it is:
#!/usr/bin/python
import os
import time
from stat import *
def fileExtension(s):
i = s.rfind('.')
if i == -1:
return ''
tmp = '|' + s[i+1:] + '|'
return tmp
def changeFiles():
# get all files in current directory with desired extension
files = [f for f in os.listdir('.') if extStr.find(fileExtension(f)) != -1]
for f in files:
if os.path.isdir(f):
continue
st = os.stat(f)
atime = st[ST_ATIME] # org access time
mtime = st[ST_MTIME] # org modification time
fw = open(f, 'r+')
tmp = fw.read().replace(oldStr, newStr)
fw.seek(0)
fw.write(tmp)
fw.close()
# put file timestamp back to org timestamp
os.utime(f,(atime,mtime))
# if we want to check subdirectories
if checkSubDirs :
dirs = [d for d in os.listdir('.') if os.path.isdir(d)]
for d in dirs :
os.chdir(d)
changeFiles()
os.chdir('..')
# ==============================================================================
# ==================================== MAIN ====================================
oldStr = 'MyWord'
newStr = 'Myword'
extStr = '|html|htm|'
checkSubDirs = True
changeFiles()
Anybody know how? Have any suggestions? ANY help is appreciated, beating my brain for 2 days now and just can not think of anything.
lxml helps with this kind of task.
html = """
<html>
<body>
<h1>MyWord</h1>
MyWord
<img src="images/MyWord.png"/>
<div class="MyWord">
<p>MyWord!</p>
MyWord
</div>
MyWord
</body><!-- MyWord -->
</html>
"""
import lxml.etree as etree
tree = etree.fromstring(html)
for elem in tree.iter():
if elem.text:
elem.text = re.sub(r'MyWord', 'Myword', elem.text)
if elem.tail:
elem.tail = re.sub(r'MyWord', 'Myword', elem.tail)
print etree.tostring(tree)
The above prints this:
<html>
<body>
<h1>Myword</h1>
Myword
<img src="images/MyWord.png"/>
<div class="MyWord">
<p>Myword!</p>
Myword
</div>
Myword
</body><!-- Myword -->
</html>
Note: You'll need to make the above code a little more complex if you also need special processing for the contents of script tags, such as the following
<script>
var title = "MyWord"; // this should change to "Myword"
var hoverImage = "images/MyWord-hover.png"; // this should not change
</script>
Use regex here is an example that you can start with, hope this will help :
import re
html = """
<href="MyWord" />
MyWord
"""
re.sub(r'(?<!href=")MyWord', 'myword', html)
output: \n\n <href="MyWord" />\n myword\n\n
ref : http://docs.python.org/library/re.html

Categories