I've discovered that cElementTree is about 30 times faster than xml.dom.minidom and I'm rewriting my XML encoding/decoding code. However, I need to output XML that contains CDATA sections and there doesn't seem to be a way to do that with ElementTree.
Can it be done?
After a bit of work, I found the answer myself. Looking at the ElementTree.py source code, I found there was special handling of XML comments and processing instructions. What they do is create a factory function for the special element type that uses a special (non-string) tag value to differentiate it from regular elements.
def Comment(text=None):
    """Build a comment node (factory quoted from ElementTree.py).

    The factory function object itself is used as the element's tag, so the
    writer can recognise the node with an identity test (``tag is Comment``).
    """
    element = Element(Comment)
    element.text = text
    return element
Then in the _write function of ElementTree that actually outputs the XML, there's a special case handling for comments:
if tag is Comment:
file.write("<!-- %s -->" % _escape_cdata(node.text, encoding))
In order to support CDATA sections, I create a factory function called CDATA, extended the ElementTree class and changed the _write function to handle the CDATA elements.
This still doesn't help if you want to parse an XML with CDATA sections and then output it again with the CDATA sections, but it at least allows you to create XMLs with CDATA sections programmatically, which is what I needed to do.
The implementation seems to work with both ElementTree and cElementTree.
import elementtree.ElementTree as etree
#~ import cElementTree as etree
def CDATA(text=None):
    """Create a marker element that a custom writer emits as a CDATA section.

    The tag is this factory function itself (the same trick the library uses
    for comments), so the writer can detect it with ``node.tag is CDATA``.
    """
    element = etree.Element(CDATA)
    element.text = text
    return element
class ElementTreeCDATA(etree.ElementTree):
    """ElementTree variant whose writer emits CDATA() nodes as CDATA sections."""

    def _write(self, file, node, encoding, namespaces):
        """Write *node* to *file*, special-casing nodes created by CDATA()."""
        if node.tag is CDATA:
            # A node created without text would otherwise fail on None.encode().
            text = node.text or ""
            # "]]>" inside the data would terminate the section early, so it
            # is split across two adjacent CDATA sections.
            text = text.replace("]]>", "]]]]><![CDATA[>")
            file.write("\n<![CDATA[%s]]>\n" % text.encode(encoding))
            # NOTE(review): the base-class writer is bypassed on this branch,
            # so node.tail is not written -- confirm whether callers need it.
        else:
            etree.ElementTree._write(self, file, node, encoding, namespaces)
if __name__ == "__main__":
import sys
text = """
<?xml version='1.0' encoding='utf-8'?>
<text>
This is just some sample text.
</text>
"""
e = etree.Element("data")
cdata = CDATA(text)
e.append(cdata)
et = ElementTreeCDATA(e)
et.write(sys.stdout, "utf-8")
lxml has support for CDATA and API like ElementTree.
Here is a variant of gooli's solution that works for python 3.2:
import xml.etree.ElementTree as etree


def CDATA(text=None):
    """Create a pseudo-element that the patched serializer writes as CDATA."""
    element = etree.Element('![CDATA[')
    element.text = text
    return element


# Save the library serializer only once.  If this code ran twice, a second
# save would store our own wrapper, which would then call itself forever.
if not hasattr(etree, '_original_serialize_xml'):
    etree._original_serialize_xml = etree._serialize_xml


def _serialize_xml(write, elem, qnames, namespaces, *args, **kwargs):
    """Serialize *elem*, writing CDATA() pseudo-elements as CDATA sections.

    Extra positional/keyword arguments are forwarded untouched so the hook
    keeps working with serializer signatures that take more parameters.
    """
    if elem.tag == '![CDATA[':
        # Empty text must not be written as "None"; "]]>" in the data is
        # split across two sections so it cannot close the section early.
        text = (elem.text or "").replace("]]>", "]]]]><![CDATA[>")
        write("\n<%s%s]]>\n" % (elem.tag, text))
        # The original serializer is skipped here, so emit the tail ourselves.
        if elem.tail:
            write(etree._escape_cdata(elem.tail))
        return
    return etree._original_serialize_xml(
        write, elem, qnames, namespaces, *args, **kwargs)


etree._serialize_xml = etree._serialize['xml'] = _serialize_xml


if __name__ == "__main__":
    import sys
    text = """
<?xml version='1.0' encoding='utf-8'?>
<text>
This is just some sample text.
</text>
"""
    e = etree.Element("data")
    cdata = CDATA(text)
    e.append(cdata)
    et = etree.ElementTree(e)
    et.write(sys.stdout.buffer.raw, "utf-8")
Solution:
import xml.etree.ElementTree as ElementTree


def CDATA(text=None):
    """Create a pseudo-element that the patched serializer writes as CDATA."""
    element = ElementTree.Element('![CDATA[')
    element.text = text
    return element


# Save the library serializer only once.  If this code ran twice, a second
# save would store our own wrapper, which would then call itself forever.
if not hasattr(ElementTree, '_original_serialize_xml'):
    ElementTree._original_serialize_xml = ElementTree._serialize_xml


def _serialize_xml(write, elem, qnames, namespaces, short_empty_elements, **kwargs):
    """Serialize *elem*, writing CDATA() pseudo-elements as CDATA sections."""
    if elem.tag == '![CDATA[':
        # Empty text must not be written as "None"; "]]>" in the data is
        # split across two sections so it cannot close the section early.
        text = (elem.text or "").replace("]]>", "]]]]><![CDATA[>")
        write("\n<{}{}]]>\n".format(elem.tag, text))
        if elem.tail:
            # _escape_cdata is a helper of the ElementTree module, not a
            # global of this script.
            write(ElementTree._escape_cdata(elem.tail))
    else:
        return ElementTree._original_serialize_xml(
            write, elem, qnames, namespaces, short_empty_elements, **kwargs)


ElementTree._serialize_xml = ElementTree._serialize['xml'] = _serialize_xml


if __name__ == "__main__":
    import sys
    text = """
<?xml version='1.0' encoding='utf-8'?>
<text>
This is just some sample text.
</text>
"""
    e = ElementTree.Element("data")
    cdata = CDATA(text)
    # The element built above is the only tree in this demo, so append to it.
    e.append(cdata)
    ElementTree.ElementTree(e).write(sys.stdout, encoding="unicode")
Background:
I don't know whether previous versions of proposed code worked very well and whether ElementTree module has been updated but I have faced problems with using this trick:
etree._original_serialize_xml = etree._serialize_xml
def _serialize_xml(write, elem, qnames, namespaces):
if elem.tag == '![CDATA[':
write("\n<%s%s]]>\n" % (
elem.tag, elem.text))
return
return etree._original_serialize_xml(
write, elem, qnames, namespaces)
etree._serialize_xml = etree._serialize['xml'] = _serialize_xml
The problem with this approach is that after passing this exception, serializer is again treating it as normal tag afterwards. I was getting something like:
<textContent>
<![CDATA[this was the code I wanted to put inside of CDATA]]>
<![CDATA[>this was the code I wanted to put inside of CDATA</![CDATA[>
</textContent>
And of course we know that will cause only plenty of errors.
Why that was happening though?
The answer is in this little guy:
return etree._original_serialize_xml(write, elem, qnames, namespaces)
We don't want to examine code once again through original serialise function if we have trapped our CDATA and successfully passed it through.
Therefore in the "if" block we have to return original serialize function only when CDATA was not there. We were missing "else" before returning original function.
Moreover, in my version of the ElementTree module, the serialize function requires a "short_empty_elements" argument. So the most recent version I would recommend looks like this (also handling "tail"):
from xml.etree import ElementTree
from xml import etree
#in order to test it you have to create testing.xml file in the folder with the script
xmlParsedWithET = ElementTree.parse("testing.xml")
root = xmlParsedWithET.getroot()
def CDATA(text=None):
    """Create a pseudo-element whose tag marks it for output as a CDATA section."""
    element = ElementTree.Element('![CDATA[')
    element.text = text
    return element
# Save the library serializer only once.  If this code ran twice, a second
# save would store our own wrapper, which would then call itself forever.
if not hasattr(ElementTree, '_original_serialize_xml'):
    ElementTree._original_serialize_xml = ElementTree._serialize_xml


def _serialize_xml(write, elem, qnames, namespaces, short_empty_elements, **kwargs):
    """Serialize *elem*, writing '![CDATA[' pseudo-elements as CDATA sections."""
    if elem.tag == '![CDATA[':
        # Empty text must not be written as "None"; "]]>" in the data is
        # split across two sections so it cannot close the section early.
        text = (elem.text or "").replace("]]>", "]]]]><![CDATA[>")
        write("\n<{}{}]]>\n".format(elem.tag, text))
        if elem.tail:
            # _escape_cdata is a helper of the ElementTree module, not a
            # global of this script.
            write(ElementTree._escape_cdata(elem.tail))
    else:
        return ElementTree._original_serialize_xml(
            write, elem, qnames, namespaces, short_empty_elements, **kwargs)


ElementTree._serialize_xml = ElementTree._serialize['xml'] = _serialize_xml
text = """
<?xml version='1.0' encoding='utf-8'?>
<text>
This is just some sample text.
</text>
"""
cdata = CDATA(text)
root.append(cdata)
# tests
print(root)
# Element.getchildren() was removed in Python 3.9; index the element instead.
# The node appended above is the last child of root.
print(root[-1])
print(root[-1].text + "\n\nyay!")
The output I got was:
<Element 'Database' at 0x10062e228>
<Element '![CDATA[' at 0x1021cc9a8>
<?xml version='1.0' encoding='utf-8'?>
<text>
This is just some sample text.
</text>
yay!
I wish you the same result!
It's not possible AFAIK... which is a pity. Basically, ElementTree modules assume that the reader is 100% XML compliant, so it shouldn't matter if they output a section as CDATA or some other format that generates the equivalent text.
See this thread on the Python mailing list for more info. Basically, they recommend some kind of DOM-based XML library instead.
Actually this code has a bug, since you don't catch ]]> appearing in the data you are inserting as CDATA
as per Is there a way to escape a CDATA end token in xml?
you should break it into two CDATA's in that case, splitting the ]]> between the two.
basically data = data.replace("]]>", "]]]]><![CDATA[>")
(not necessarily correct, please verify)
This ended up working for me in Python 2.7. Similar to Amaury's answer.
import xml.etree.ElementTree as ET

# Save the library serializer only once.  If this code ran twice, a second
# save would store our own wrapper, which would then call itself forever.
if not hasattr(ET, '_original_serialize_xml'):
    ET._original_serialize_xml = ET._serialize_xml


def _serialize_xml(write, elem, encoding, qnames, namespaces):
    """Python 2.7 serializer hook: write '![CDATA[' elements as CDATA sections."""
    if elem.tag == '![CDATA[':
        write("<%s%s]]>" % (elem.tag, elem.text or ""))
        # tail is None when nothing follows the node; formatting it with %s
        # would put the literal text "None" into the document.  Tail text
        # lies outside the section, so it is escaped like normal text.
        if elem.tail:
            write(ET._escape_cdata(elem.tail, encoding))
        return
    return ET._original_serialize_xml(
        write, elem, encoding, qnames, namespaces)


ET._serialize_xml = ET._serialize['xml'] = _serialize_xml
You can override ElementTree _escape_cdata function:
import xml.etree.ElementTree as ET


def _escape_cdata(text, encoding=None):
    """Escape character data but leave angle brackets untouched.

    Replacement for ElementTree's private helper so that text such as
    ``<![CDATA[...]]>`` is written verbatim.  *encoding* is accepted for
    library versions that pass it and ignored otherwise.

    Caution: this applies to ALL element text, so any other text containing
    "<" will produce malformed XML; "&" is still escaped, including inside a
    hand-written CDATA section.
    """
    try:
        if "&" in text:
            text = text.replace("&", "&amp;")
        # "<" and ">" are deliberately NOT turned into "&lt;" / "&gt;" here.
        return text
    except TypeError:
        raise TypeError(
            "cannot serialize %r (type %s)" % (text, type(text).__name__)
        )


ET._escape_cdata = _escape_cdata
Note that you may not need to pass the extra encoding parameter, depending on your library/Python version.
Now you can write CDATA into obj.text like:
root = ET.Element('root')
body = ET.SubElement(root, 'body')
body.text = '<![CDATA[perform extra angle brackets escape for this text]]>'
print(ET.tostring(root))
and get clear CDATA node:
<root>
<body>
<![CDATA[perform extra angle brackets escape for this text]]>
</body>
</root>
I've discovered a hack to get CDATA to work using comments:
node.append(etree.Comment(' --><![CDATA[' + data.replace(']]>', ']]]]><![CDATA[>') + ']]><!-- '))
For Python 3 and ElementTree you can use the following recipe:
import xml.etree.ElementTree as ET

# Save the library serializer only once.  If this code ran twice, a second
# save would store our own wrapper, which would then call itself forever.
if not hasattr(ET, '_original_serialize_xml'):
    ET._original_serialize_xml = ET._serialize_xml


def serialize_xml_with_CDATA(write, elem, qnames, namespaces, short_empty_elements, **kwargs):
    """Serialize *elem*, writing elements tagged 'CDATA' as CDATA sections."""
    if elem.tag == 'CDATA':
        # Empty text must not be written as "None"; "]]>" in the data is
        # split across two sections so it cannot close the section early.
        text = (elem.text or "").replace("]]>", "]]]]><![CDATA[>")
        write("<![CDATA[{}]]>".format(text))
        # The original serializer is skipped here, so emit the tail ourselves.
        if elem.tail:
            write(ET._escape_cdata(elem.tail))
        return
    return ET._original_serialize_xml(write, elem, qnames, namespaces, short_empty_elements, **kwargs)


ET._serialize_xml = ET._serialize['xml'] = serialize_xml_with_CDATA


def CDATA(text):
    """Create an element tagged 'CDATA' that the hook above writes as CDATA."""
    element = ET.Element("CDATA")
    element.text = text
    return element


my_xml = ET.Element("my_name")
my_xml.append(CDATA("<p>some text</p>"))
tree = ET.ElementTree(my_xml)
if you need xml as str, you can use
# tostring() expects an Element, so pass the root of the ElementTree wrapper.
ET.tostring(tree.getroot())
or next hack (which almost same as code inside tostring())
from io import BytesIO

# Serialize into an in-memory binary buffer, then decode it to str.
fake_file = BytesIO()
tree.write(fake_file, encoding="utf-8", xml_declaration=True)
result_xml_text = str(fake_file.getvalue(), encoding="utf-8")
and get result
<?xml version='1.0' encoding='utf-8'?>
<my_name>
<![CDATA[<p>some text</p>]]>
</my_name>
The DOM has (at least in Level 2) an interface
CDATASection, and an operation Document::createCDATASection. They are
extension interfaces, supported only if an implementation supports the
"xml" feature.
from xml.dom import minidom

my_xmldoc = minidom.parse(xmlfile)
# createCDATASection() only builds the node; keep the result and attach it
# to the tree (here: under the document element) or it is simply discarded.
cdata_node = my_xmldoc.createCDATASection(data)
my_xmldoc.documentElement.appendChild(cdata_node)
Now you have a CDATA node; add it wherever you want.
The accepted solution cannot work with Python 2.7. However, there is another package called lxml which (though slightly slower) shared a largely identical syntax with the xml.etree.ElementTree. lxml is able to both write and parse CDATA. Documentation here
Here's my version which is based on both gooli's and amaury's answers above. It works for both ElementTree 1.2.6 and 1.3.0, which use very different methods of doing this.
Note that gooli's does not work with 1.3.0, which seems to be the current standard in Python 2.7.x.
Also note that this version does not use the CDATA() method gooli used either.
import xml.etree.cElementTree as ET
class ElementTreeCDATA(ET.ElementTree):
"""Subclass of ElementTree which handles CDATA blocks reasonably"""
def _write(self, file, node, encoding, namespaces):
"""This method is for ElementTree <= 1.2.6"""
if node.tag == '![CDATA[':
text = node.text.encode(encoding)
file.write("\n<![CDATA[%s]]>\n" % text)
else:
ET.ElementTree._write(self, file, node, encoding, namespaces)
def _serialize_xml(write, elem, qnames, namespaces):
"""This method is for ElementTree >= 1.3.0"""
if elem.tag == '![CDATA[':
write("\n<![CDATA[%s]]>\n" % elem.text)
else:
ET._serialize_xml(write, elem, qnames, namespaces)
I got here looking for a way to "parse an XML with CDATA sections and then output it again with the CDATA sections".
I was able to do this (maybe lxml has been updated since this post?) with the following: (it is a little rough - sorry ;-). Someone else may have a better way to find the CDATA sections programatically but I was too lazy.
parser = etree.XMLParser(encoding='utf-8') # my original xml was utf-8 and that was a lot of the problem
tree = etree.parse(ppath, parser)
for cdat in tree.findall('./ProjectXMPMetadata'): # the tag where my CDATA lives
cdat.text = etree.CDATA(cdat.text)
# other stuff here
tree.write(opath, encoding="UTF-8",)
Simple way of making .xml file with CDATA sections
The main idea is that we convert the element tree to a string and call unescape on it. Once we have the string we use standard Python to write a string to a file.
Based on:
How to write unescaped string to a XML element with ElementTree?
Code that generates the XML file
import xml.etree.ElementTree as ET
from xml.sax.saxutils import unescape
# defining the tree structure
element1 = ET.Element('test1')
element1.text = '<![CDATA[Wired & Forbidden]]>'
# & and <> are in a weird format
string1 = ET.tostring(element1).decode()
print(string1)
# now they are not weird anymore
# more formally, we unescape '&', '<', and '>' in a string of data
# from https://docs.python.org/3.8/library/xml.sax.utils.html#xml.sax.saxutils.unescape
string1 = unescape(string1)
print(string1)
element2 = ET.Element('test2')
element2.text = '<![CDATA[Wired & Forbidden]]>'
string2 = unescape(ET.tostring(element2).decode())
print(string2)
# make the xml file and open in append mode
with open('foo.xml', 'a') as f:
f.write(string1 + '\n')
f.write(string2)
Output foo.xml
<test1><![CDATA[Wired & Forbidden]]></test1>
<test2><![CDATA[Wired & Forbidden]]></test2>
Related
After extracting XML from CDATA I can't find tags in the extracted XML. If I convert to string and then back to an ElementTree I can find the tags I'm looking for (un-comment the lines marked "UNCOMMENT ME"). Looking for a better / more correct way.
import xml.etree.ElementTree as ElementTree
XML = '''<?xml version="1.0" encoding="UTF-8"?>
<Catalog>
<Data><![CDATA[
<Book>
<Author>George Orwell</Author>
<Title>1984</Title>
</Book>
]]></Data>
</Catalog>
'''
def get_cdata_xml(xml_str: str) -> ElementTree.Element:
    """Parse *xml_str* and return its first ``Data`` element.

    The CDATA payload is available as the returned element's ``.text``; it is
    plain text, not parsed child elements.  Returns ``None`` when the document
    contains no ``Data`` element.
    """
    xml_root = ElementTree.fromstring(xml_str)
    return xml_root.find('.//Data')
if __name__ == '__main__':
cdata_xml = get_cdata_xml(XML)
#xml_str = cdata_xml.text # UNCOMMENT ME
#cdata_xml = ElementTree.fromstring(xml_str) #UNCOMMENT ME
# type(cdata_xml) = xml.etree.ElementTree.Element
author = cdata_xml.find('.//Author')
print(author.text)
A CDATA block is just a string; it's not XML content. You would need to parse the CDATA content with another call to `ElementTree.fromstring`:
cdata_xml = get_cdata_xml(XML)
book = ElementTree.fromstring(cdata_xml.text)
author = book.find(".//Author")
print(author.text)
I have an xml file I need to open and make some changes to, one of those changes is to remove the namespace and prefix and then save to another file.
Here is the xml:
<?xml version='1.0' encoding='UTF-8'?>
<package xmlns="http://apple.com/itunes/importer">
<provider>some data</provider>
<language>en-GB</language>
</package>
I can make the other changes I need, but can't find out how to remove the namespace and prefix. This is the result XML I need:
<?xml version='1.0' encoding='UTF-8'?>
<package>
<provider>some data</provider>
<language>en-GB</language>
</package>
And here is my script which will open and parse the xml and save it:
from lxml import etree

metadata = '/Users/user1/Desktop/Python/metadata.xml'

parser = etree.XMLParser(remove_blank_text=True)
# etree.parse() opens the path itself; the former bare open(metadata) call
# only leaked an unused file handle.
tree = etree.parse(metadata, parser)
root = tree.getroot()
tree.write('/Users/user1/Desktop/Python/done.xml', pretty_print=True, xml_declaration=True, encoding='UTF-8')
So how would I add code in my script which will remove the namespace and prefix?
We can get the desired output document in two steps:
Remove namespace URIs from element names
Remove unused namespace declarations from the XML tree
Example code
from lxml import etree
input_xml = """
<package xmlns="http://apple.com/itunes/importer">
<provider>some data</provider>
<language>en-GB</language>
<!-- some comment -->
<?xml-some-processing-instruction ?>
</package>
"""
root = etree.fromstring(input_xml)
# Iterate through all XML elements (iter() replaces the deprecated
# getiterator()).
for elem in root.iter():
    # Skip comments and processing instructions,
    # because they do not have names
    if isinstance(elem, (etree._Comment, etree._ProcessingInstruction)):
        continue
    # Remove a namespace URI in the element's name
    elem.tag = etree.QName(elem).localname

# Remove unused namespace declarations
etree.cleanup_namespaces(root)
print(etree.tostring(root).decode())
Output XML
<package>
<provider>some data</provider>
<language>en-GB</language>
<!-- some comment -->
<?xml-some-processing-instruction ?>
</package>
Details explaining the code
As described in the documentation, we use lxml.etree.QName.localname to get local names of elements, that is names without namespace URIs. Then we replace the fully qualified names of the elements by their local names.
Some XML elements, such as comments and processing instructions do not have names. So, we have to skip these elements while replacing element names, otherwise a ValueError will be raised.
Finally, we use lxml.etree.cleanup_namespaces() to remove unused namespace declarations from the XML tree.
Note on namespaced XML attributes
If the XML input contains attributes with explicitly specified namespace prefixes, the example code will not remove those prefixes. To accomplish the deletion of namespace prefixes in attributes, add the following for-loop after the line elem.tag = etree.QName(elem).localname, as suggested here
# Iterate over a snapshot of the names: the loop body deletes and re-adds
# keys, which must not happen to a mapping while it is being iterated.
for attr_name in list(elem.attrib):
    local_attr_name = etree.QName(attr_name).localname
    if attr_name != local_attr_name:
        attr_value = elem.attrib[attr_name]
        del elem.attrib[attr_name]
        elem.attrib[local_attr_name] = attr_value
To learn more about namespaced XML attributes see this answer.
Replace tag as Uku Loskit suggests. In addition to that, use lxml.objectify.deannotate.
from lxml import etree, objectify
metadata = '/Users/user1/Desktop/Python/metadata.xml'
parser = etree.XMLParser(remove_blank_text=True)
tree = etree.parse(metadata, parser)
root = tree.getroot()
####
for elem in root.getiterator():
if not hasattr(elem.tag, 'find'): continue # guard for Comment tags
i = elem.tag.find('}')
if i >= 0:
elem.tag = elem.tag[i+1:]
objectify.deannotate(root, cleanup_namespaces=True)
####
tree.write('/Users/user1/Desktop/Python/done.xml',
pretty_print=True, xml_declaration=True, encoding='UTF-8')
Note: Some tags like Comment return a function when accessing tag attribute. added a guard for that.
import xml.etree.ElementTree as ET
def remove_namespace(doc, namespace):
    """Remove namespace in the passed document in place.

    Args:
        doc: Root element of the tree to modify.
        namespace: Namespace URI whose ``{uri}`` prefix is stripped from tags.
    """
    ns = u'{%s}' % namespace
    nsl = len(ns)
    # Element.getiterator() was removed in Python 3.9; iter() replaces it.
    for elem in doc.iter():
        # Comment-like nodes carry a non-string tag, so guard the string call.
        if hasattr(elem.tag, 'startswith') and elem.tag.startswith(ns):
            elem.tag = elem.tag[nsl:]
metadata = '/Users/user1/Desktop/Python/metadata.xml'
tree = ET.parse(metadata)
root = tree.getroot()
remove_namespace(root, u'http://apple.com/itunes/importer')
# The standard-library ElementTree.write() has no pretty_print argument (it
# is an lxml extension and raises TypeError here); indent the tree in place
# instead where the helper exists (Python 3.9+).
if hasattr(ET, 'indent'):
    ET.indent(tree)
tree.write('/Users/user1/Desktop/Python/done.xml',
           xml_declaration=True, encoding='UTF-8')
Used a snippet of code from here
This method could be easily extended to delete any namespace attributes by searching for tags that begin with "xmlns"
You could also use XSLT to strip the namespaces...
XSLT 1.0 (test.xsl)
<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
  <xsl:output indent="yes"/>
  <xsl:strip-space elements="*"/>

  <!-- Copy every other node (text, comments, processing instructions) as is. -->
  <xsl:template match="node()">
    <xsl:copy>
      <xsl:apply-templates select="@*|node()"/>
    </xsl:copy>
  </xsl:template>

  <!-- Re-create each element under its local name, in no namespace. -->
  <xsl:template match="*" priority="1">
    <xsl:element name="{local-name()}" namespace="">
      <xsl:apply-templates select="@*|node()"/>
    </xsl:element>
  </xsl:template>

  <!-- Re-create each attribute under its local name, in no namespace. -->
  <xsl:template match="@*">
    <xsl:attribute name="{local-name()}" namespace="">
      <xsl:value-of select="."/>
    </xsl:attribute>
  </xsl:template>
</xsl:stylesheet>
Python
from lxml import etree
tree = etree.parse("metadata.xml")
xslt = etree.parse("test.xsl")
new_tree = tree.xslt(xslt)
print(etree.tostring(new_tree, pretty_print=True, xml_declaration=True,
encoding="UTF-8").decode("UTF-8"))
Output
<?xml version='1.0' encoding='UTF-8'?>
<package>
<provider>some data</provider>
<language>en-GB</language>
</package>
you can try with lxml:
# Remove namespace prefixes
for elem in root.iter():
    # Comment-like nodes carry a non-string tag and have no name to rewrite.
    if not isinstance(elem.tag, str):
        continue
    # The computed local name must be assigned back, otherwise nothing changes.
    elem.tag = str(elem.xpath('local-name()'))
Define and call the following function, right after you parse the XML string:
from lxml import etree
def clean_xml_namespaces(root):
    """Strip namespaces from every element tag under *root*, in place.

    Afterwards unused namespace declarations are removed from the tree.
    """
    for element in root.iter():
        # Comments and processing instructions have no name; QName() raises
        # ValueError for them, so skip every node whose tag is not a string.
        if not isinstance(element.tag, str):
            continue
        element.tag = etree.QName(element).localname
    etree.cleanup_namespaces(root)
💡 Note - comment elements in the XML are ignored, as they should be
Usage:
xml_content = b'''<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<dependencies>
<dependency>
<groupId>org.easytesting</groupId>
<artifactId>fest-assert</artifactId>
<version>1.4</version>
</dependency>
<!-- this dependency is critical -->
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId>
<version>3.4</version>
</dependency>
</dependencies>
</project>
'''
root = etree.fromstring(xml_content)
clean_xml_namespaces(root)
elements = root.findall(".//dependency")
print(len(elements))
# outputs "2", as expected
So I realize this is an older answer with a highly up-voted and accepted answer, but if you are reading LARGE-FILES and find yourself in the same predicament I did; I hope this helps you out.
The issue with this approach is, in fact, the iteration. Regardless of how fast the parser is, doing anything say... a few 100k times is gonna eat your execution time. With that said, it came down to really thinking about the problem for me and understanding how namespaces work (or are "intended to work", because they are honestly not needed). Now if your xml truly uses namespaces, meaning you see tags that look like this: <xs:table>, then you'll need to tweak the approach here for your use-case. I'll include the full way of handling, as well.
DISCLAIMER : I cannot, with a good conscience, tell you to use regular expressions when parsing html/xml, go look at SergiyKolesnikov's answer as it WORKS, but I had an edge case so with that said... let's dive into some regex!
Problem: namespace stripping takes forever... and most of the time the namespaces only live inside of the very opening tag, or our "root". So in thinking about how python reads information in, and where our only problem-child is that root node, why not use that to our advantage.
Please NOTE: the file i'm using as my example comes as a raw, horrid, remarkably senseless structure of lulz with the promise of data in there somewhere.
my_file is the path to the file im using for our example, I cannot share it with you for professional reasons; and it has been cut down way in size just to get through this answer.
import os, sys, subprocess, re, io, json
from lxml import etree
# Your file would be '_biggest_file' if playing along at home
my_file = _biggest_file
meta_stuff = dict(
exists = os.path.exists(_biggest_file),
sizeof = os.path.getsize(_biggest_file),
extension_is_a_real_thing = any(re.findall("\.(html|xml)$", my_file, re.I)),
system_thinks_its_a = subprocess.check_output(
["file", "-i", _biggest_file]
).decode().split(":")[-1:][0].strip()
)
print(json.dumps(meta_stuff, indent = 2))
So for starters, decently sized, and system thinks at best it's html; the file extension is neither xml or html either...
{
"exists": true,
"sizeof": 24442371,
"extension_is_a_real_thing": false,
"system_thinks_its_a": "text/html; charset=us-ascii"
}
Approach:
In order to parse an xml file... it should at the very least be xml, so we'll need to check and add a declarations tag if one doesn't exist
If I have namespaces.. thats bad because I can't use xpaths, which is what I want to do
If my file is huge, I should only operate on the smallest imaginable parts that I need to clean before I'm ready to parse it.
Function
def speed_read(file_path):
    """Parse an XML file quickly by dropping the attributes of its first tag.

    Instead of visiting every element to strip namespaces, the text is edited
    before parsing: a leading XML declaration is replaced by a UTF-8 one, and
    the first element's start tag is rewritten to contain only the tag name.

    This removes *every* attribute of that tag, not just ``xmlns``
    declarations, and only helps when the namespaces are declared on that
    first tag and no prefixed names are used further down the document.

    Args:
        file_path: Path of a UTF-8 encoded XML file.

    Returns:
        The tree returned by ``etree.parse``.
    """
    xml_declaration = '<?xml version="1.0" encoding="utf-8"?>'

    with open(file_path, "r", encoding="utf-8") as f:
        # Strip surrounding whitespace so the checks below anchor at the start.
        raw = f.read().strip()

    # Drop an existing declaration (only the one at the very start); our own
    # is prepended below.
    if raw.startswith('<?xml'):
        raw = re.sub(r'\A<\?xml.*?\?>', '', raw, count=1, flags=re.S).strip()

    # First start tag whose name does not begin with "!" or "?", i.e. skip
    # comments, DOCTYPE and processing instructions.  Group 1 is the tag
    # name, group 2 the "/" of a self-closing tag (kept so it stays valid).
    first_tag = re.search(r'<([^\s!?/<>][^\s/>]*)[^>]*?(/?)>', raw)
    if first_tag is not None:
        bare_tag = '<{}{}>'.format(first_tag.group(1), first_tag.group(2))
        # Splice by position rather than re.sub(): the tag text holds regex
        # metacharacters (".", "?" in URIs) and only this one occurrence
        # needs replacing.
        raw = raw[:first_tag.start()] + bare_tag + raw[first_tag.end():]

    raw = "{}{}".format(xml_declaration, raw)
    return etree.parse(io.BytesIO(raw.encode("utf-8")))
# a good answer from above:
def safe_read(file_path):
    """Parse *file_path* and strip namespaces by visiting every element."""
    root = etree.parse(file_path)
    for elem in root.iter():
        # Comments and processing instructions have no name; QName() raises
        # ValueError for them, so skip every node whose tag is not a string.
        if not isinstance(elem.tag, str):
            continue
        elem.tag = etree.QName(elem).localname
    # Remove unused namespace declarations
    etree.cleanup_namespaces(root)
    return root
Benchmarking - Yes I know there's better ways to do this.
import time

import pandas as pd

# Time five runs of each reader; perf_counter() is the clock meant for
# measuring short durations.
safe_times = []
for _ in range(5):
    s = time.perf_counter()
    safe_read(_biggest_file)
    safe_times.append(time.perf_counter() - s)

fast_times = []
for _ in range(5):
    s = time.perf_counter()
    speed_read(_biggest_file)
    fast_times.append(time.perf_counter() - s)

pd.DataFrame({"safe": safe_times, "fast": fast_times})
Results
safe
fast
2.36
0.61
2.15
0.58
2.47
0.49
2.94
0.60
2.83
0.53
The accepted solution removes namespaces in node names and not in attributes, i.e. <b:spam c:name="cheese"/> will be transformed to <spam c:name="cheese"/>.
An updated version which will give you <spam name="cheese"/>
def remove_namespaces(root):
    """Strip namespaces from element tags and attribute names, in place."""
    # deannotate() lives in lxml.objectify; import it here so the function
    # does not depend on a name the surrounding snippet never imports.
    from lxml import objectify

    for elem in root.iter():
        # Comments and processing instructions do not have names.
        if isinstance(elem, (etree._Comment, etree._ProcessingInstruction)):
            continue
        localname = etree.QName(elem).localname
        if elem.tag != localname:
            elem.tag = localname
        # Iterate over a snapshot: the body deletes and re-adds attributes.
        for attr_name in list(elem.attrib):
            local_attr_name = etree.QName(attr_name).localname
            if attr_name != local_attr_name:
                attr_value = elem.attrib[attr_name]
                del elem.attrib[attr_name]
                elem.attrib[local_attr_name] = attr_value
    objectify.deannotate(root, cleanup_namespaces=True)
Here are two other ways of removing namespaces. The first uses the lxml.etree.QName helper while the second uses regexes. Both functions allow an optional list of namespaces to match against. If no namespace list is supplied then all namespaces are removed. Attribute keys are also cleaned.
from lxml import etree
import re
def remove_namespaces_qname(doc, namespaces=None):
    """Strip namespaces from tags and attribute names using ``etree.QName``.

    Args:
        doc: Root element (or tree) to modify in place.
        namespaces: Optional collection of namespace URIs to remove; when
            omitted, every namespace is removed.

    Returns:
        The same *doc* object.
    """
    for el in doc.iter():
        # Comment-like nodes carry a non-string tag and have no qualified name.
        if not hasattr(el.tag, 'startswith'):
            continue
        # clean tag
        q = etree.QName(el.tag)
        if namespaces is None or q.namespace in namespaces:
            el.tag = q.localname
        # clean attributes (snapshot first: the body mutates the mapping)
        for a, v in list(el.items()):
            q = etree.QName(a)
            if q.namespace is None:
                continue  # already unqualified
            if namespaces is None or q.namespace in namespaces:
                del el.attrib[a]
                el.attrib[q.localname] = v
    return doc
def remove_namespace_re(doc, namespaces=None):
    """Strip ``{uri}`` prefixes from tags and attribute names using a regex.

    Args:
        doc: Root element (or tree) to modify in place.
        namespaces: Optional collection of namespace URIs to remove; when
            omitted, every namespace is removed.

    Returns:
        The same *doc* object.
    """
    wanted = None
    if namespaces is not None:
        wanted = {u'{%s}' % n for n in namespaces}
    # Group 1 is the "{uri}" part, group 2 the local name.
    qualified = re.compile(r'(\{[^}]+\})(.+)')

    def _local_name(name):
        """Return the stripped name, or None when *name* must stay as is."""
        m = qualified.match(name)
        if m is not None and (wanted is None or m.group(1) in wanted):
            return m.group(2)
        return None

    for el in doc.iter():
        # clean tag (comment-like nodes carry a non-string tag: skip those)
        if hasattr(el.tag, 'startswith'):
            local = _local_name(el.tag)
            if local is not None:
                el.tag = local
        # clean attributes (snapshot first: the body mutates the mapping)
        for a, v in list(el.items()):
            local = _local_name(a)
            if local is not None:
                del el.attrib[a]
                el.attrib[local] = v
    return doc
all you need to do is:
objectify.deannotate(root, cleanup_namespaces=True)
after you have get the root, by using root = tree.getroot()
I need to get the elements from xml as a string. I am trying with below xml format.
<xml>
<prot:data xmlns:prot="prot">
<product-id-template>
<prot:ProductId>PRODUCT_ID</prot:ProductId>
</product-id-template>
<product-name-template>
<prot:ProductName>PRODUCT_NAME</prot:ProductName>
</product-name-template>
<dealer-template>
<xsi:Dealer xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">DEALER</xsi:Dealer>
</dealer-template>
</prot:data>
</xml>
And I tried with below code:
from xml.etree import ElementTree as ET
def get_template(xpath, namespaces, source='cdata.xml'):
    """Return the first element matching *xpath*, or None when nothing matches.

    Args:
        xpath: ElementTree path expression, evaluated from the root element.
        namespaces: Mapping of prefix to namespace URI used in *xpath*.
        source: File name or file object to parse; defaults to 'cdata.xml'.
    """
    tree = ET.parse(source)
    # find() returns the first match directly; no need to loop over findall().
    return tree.getroot().find(xpath, namespaces=namespaces)
namespace = {"prot" : "prot"}
aa = get_template(".//prot:ProductId", namespace)
print(ET.tostring(aa).decode())
Actual output:
<ns0:ProductId xmlns:ns0="prot">PRODUCT_ID</ns0:ProductId>
Expected output:
<prot:ProductId>PRODUCT_ID</prot:ProductId>
I should not remove the xmlns from the document where it presents in the document. And It has to be removed where it not presents. Example product-id-template is not containing the xmlns so it needs to be retrieved without xmlns. And dealer-template contains the xmlns so it needs to be retrieved with xmlns.
How to achieve this?
You can remove xmlns with regex.
import re
# ...
with_ns = ET.tostring(aa).decode()
# Raw string: "\w" in a normal string literal is an invalid escape sequence.
no_ns = re.sub(r' xmlns(:\w+)?="[^"]+"', '', with_ns)
print(no_ns)
UPDATE: You can do a very wild thing. Although I can't recommend it, because I'm not a Python expert.
I just checked the source code and found that I can do this hack:
def my_serialize_xml(write, elem, qnames, namespaces,
short_empty_elements, **kwargs):
ET._serialize_xml(write, elem, qnames,
None, short_empty_elements, **kwargs)
ET._serialize["xml"] = my_serialize_xml
I just defined my_serialize_xml, which calls ElementTree._serialize_xml with namespaces=None. And then, in dictionary ElementTree._serialize, I changed value for key "xml" to my_serialize_xml. So when you call ElementTree.tostring, it will use my_serialize_xml.
If you want to try it, just place the code(above) after from xml.etree import ElementTree as ET (but before using the ET).
Im trying to take two elements from one file (file1.xml), and write them onto the end of another file (file2.xml). I am able to get them to print out, but am stuck trying to write them onto file2.xml! Help !
filename = "file1.xml"
appendtoxml = "file2.xml"
output_file = appendtoxml.replace('.xml', '') + "_editedbyed.xml"
parser = etree.XMLParser(remove_blank_text=True)
tree = etree.parse(filename, parser)
etree.tostring(tree)
root = tree.getroot()
a = root.findall(".//Device")
b = root.findall(".//Speaker")
for r in a:
print etree.tostring(r)
for e in b:
print etree.tostring(e)
NewSub = etree.SubElement (root, "Audio(just writes audio..")
print NewSub
I want the results of a, b to be added onto the end of outputfile.xml in the root.
Parse both the input file and the file you wish to append to.
Use root.append(elt) to append Element, elt, to root.
Then use tree.write to write the new tree to a file (e.g. appendtoxml):
Note: The links above point to documentation for xml.etree from the standard
library. Since lxml's API tries to be compatible with the standard library's
xml.etree, the standard library documentation applies to lxml as well (at
least for these methods). See http://lxml.de/api.html for information on where
the APIs differ.
import lxml.etree as ET

filename = "file1.xml"
appendtoxml = "file2.xml"
output_file = appendtoxml.replace('.xml', '') + "_editedbyed.xml"

# The same parser configuration is used for both documents.
parser = ET.XMLParser(remove_blank_text=True)

# Source document: the elements to copy come from here.
tree = ET.parse(filename, parser)
root = tree.getroot()

# Target document: the elements are appended to its root.
out_tree = ET.parse(appendtoxml, parser)
out_root = out_tree.getroot()

# Handle one path at a time, so all <Device> elements are appended before
# any <Speaker> element.
for path in (".//Device", ".//Speaker"):
    matches = root.findall(path)
    for elt in matches:
        out_root.append(elt)

out_tree.write(output_file, pretty_print=True)
If file1.xml contains
<?xml version="1.0"?>
<root>
<Speaker>boozhoo</Speaker>
<Device>waaboo</Device>
<Speaker>anin</Speaker>
<Device>gigiwishimowin</Device>
</root>
and file2.xml contains
<?xml version="1.0"?>
<root>
<Speaker>jubal</Speaker>
<Device>crane</Device>
</root>
then file2_editedbyed.xml will contain
<root>
<Speaker>jubal</Speaker>
<Device>crane</Device>
<Device>waaboo</Device>
<Device>gigiwishimowin</Device>
<Speaker>boozhoo</Speaker>
<Speaker>anin</Speaker>
</root>
I have an xml file I need to open and make some changes to, one of those changes is to remove the namespace and prefix and then save to another file.
Here is the xml:
<?xml version='1.0' encoding='UTF-8'?>
<package xmlns="http://apple.com/itunes/importer">
<provider>some data</provider>
<language>en-GB</language>
</package>
I can make the other changes I need, but can't find out how to remove the namespace and prefix. This is the resulting XML I need:
<?xml version='1.0' encoding='UTF-8'?>
<package>
<provider>some data</provider>
<language>en-GB</language>
</package>
And here is my script which will open and parse the xml and save it:
# Path of the XML document to process.
metadata = '/Users/user1/Desktop/Python/metadata.xml'
from lxml import etree
parser = etree.XMLParser(remove_blank_text=True)
# NOTE(review): the file object returned here is neither used nor closed;
# etree.parse() below is given the path itself.
open(metadata)
tree = etree.parse(metadata, parser)
root = tree.getroot()
# Write the tree (unmodified at this point) to a second file, with an XML
# declaration and UTF-8 encoding.
tree.write('/Users/user1/Desktop/Python/done.xml', pretty_print = True, xml_declaration = True, encoding = 'UTF-8')
So how would I add code in my script which will remove the namespace and prefix?
We can get the desired output document in two steps:
Remove namespace URIs from element names
Remove unused namespace declarations from the XML tree
Example code
from lxml import etree

input_xml = """
<package xmlns="http://apple.com/itunes/importer">
<provider>some data</provider>
<language>en-GB</language>
<!-- some comment -->
<?xml-some-processing-instruction ?>
</package>
"""

root = etree.fromstring(input_xml)

# Iterate through all XML elements.
# iter() is the supported spelling; getiterator() is deprecated in lxml.
for elem in root.iter():
    # Skip comments and processing instructions,
    # because they do not have names
    if isinstance(elem, (etree._Comment, etree._ProcessingInstruction)):
        continue
    # Remove a namespace URI in the element's name
    elem.tag = etree.QName(elem).localname

# Remove unused namespace declarations
etree.cleanup_namespaces(root)

print(etree.tostring(root).decode())
Output XML
<package>
<provider>some data</provider>
<language>en-GB</language>
<!-- some comment -->
<?xml-some-processing-instruction ?>
</package>
Details explaining the code
As described in the documentation, we use lxml.etree.QName.localname to get local names of elements, that is names without namespace URIs. Then we replace the fully qualified names of the elements by their local names.
Some XML elements, such as comments and processing instructions do not have names. So, we have to skip these elements while replacing element names, otherwise a ValueError will be raised.
Finally, we use lxml.etree.cleanup_namespaces() to remove unused namespace declarations from the XML tree.
Note on namespaced XML attributes
If the XML input contains attributes with explicitly specified namespace prefixes, the example code will not remove those prefixes. To accomplish the deletion of namespace prefixes in attributes, add the following for-loop after the line elem.tag = etree.QName(elem).localname, as suggested here
for attr_name in elem.attrib:
    local_attr_name = etree.QName(attr_name).localname
    if attr_name == local_attr_name:
        # The attribute name carries no namespace; nothing to rename.
        continue
    # Remove the qualified attribute and store its value again under the
    # bare local name.
    elem.attrib[local_attr_name] = elem.attrib.pop(attr_name)
To learn more about namespaced XML attributes see this answer.
Replace tag as Uku Loskit suggests. In addition to that, use lxml.objectify.deannotate.
from lxml import etree, objectify

metadata = '/Users/user1/Desktop/Python/metadata.xml'
parser = etree.XMLParser(remove_blank_text=True)
tree = etree.parse(metadata, parser)
root = tree.getroot()

####
for elem in root.getiterator():
    tag = elem.tag
    # Comment nodes expose a function as their tag, which has no .find();
    # only string tags are rewritten.
    if hasattr(tag, 'find'):
        # Everything up to and including the first '}' is the
        # "{namespace-uri}" prefix of the tag.
        _, brace, local_name = tag.partition('}')
        if brace:
            elem.tag = local_name
objectify.deannotate(root, cleanup_namespaces=True)
####

tree.write('/Users/user1/Desktop/Python/done.xml',
           pretty_print=True, xml_declaration=True, encoding='UTF-8')
Note: Some nodes, such as comments, return a function rather than a string when their tag attribute is accessed, so I added a guard for that.
import xml.etree.ElementTree as ET
def remove_namespace(doc, namespace):
    """Remove namespace in the passed document in place.

    Args:
        doc: Element (or ElementTree) whose nodes are visited via ``iter()``.
        namespace: Namespace URI to strip, without the surrounding braces.

    Returns:
        None. Matching ``tag`` values are rewritten on the nodes themselves;
        tags in any other namespace are left untouched.
    """
    ns = u'{%s}' % namespace
    nsl = len(ns)
    # Element.getiterator() was removed from xml.etree in Python 3.9;
    # iter() is the supported equivalent.
    for elem in doc.iter():
        tag = elem.tag
        # Comments and processing instructions carry a factory function as
        # their tag, not a string, so they have nothing to strip.
        if isinstance(tag, str) and tag.startswith(ns):
            elem.tag = tag[nsl:]
metadata = '/Users/user1/Desktop/Python/metadata.xml'
tree = ET.parse(metadata)
root = tree.getroot()
remove_namespace(root, u'http://apple.com/itunes/importer')
# xml.etree's write() has no pretty_print keyword (that is an lxml option and
# raises TypeError here). ET.indent(), available from Python 3.9, is the
# standard-library way to get indented output.
if hasattr(ET, 'indent'):
    ET.indent(tree)
tree.write('/Users/user1/Desktop/Python/done.xml',
           xml_declaration=True, encoding='UTF-8')
Used a snippet of code from here
This method could be easily extended to delete any namespace attributes by searching for tags that begin with "xmlns"
You could also use XSLT to strip the namespaces...
XSLT 1.0 (test.xsl)
<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
  <xsl:output indent="yes"/>
  <xsl:strip-space elements="*"/>

  <!-- Default rule: copy the node and keep processing its attributes and children. -->
  <xsl:template match="node()">
    <xsl:copy>
      <xsl:apply-templates select="@*|node()"/>
    </xsl:copy>
  </xsl:template>

  <!-- Elements: rebuild each one under its local name, in no namespace. -->
  <xsl:template match="*" priority="1">
    <xsl:element name="{local-name()}" namespace="">
      <xsl:apply-templates select="@*|node()"/>
    </xsl:element>
  </xsl:template>

  <!-- Attributes: rebuild each one under its local name, in no namespace. -->
  <xsl:template match="@*">
    <xsl:attribute name="{local-name()}" namespace="">
      <xsl:value-of select="."/>
    </xsl:attribute>
  </xsl:template>
</xsl:stylesheet>
Python
from lxml import etree

tree = etree.parse("metadata.xml")
xslt = etree.parse("test.xsl")

# Compile the stylesheet, then apply it to the document.
transform = etree.XSLT(xslt)
new_tree = transform(tree)

# Serialize to UTF-8 bytes with a declaration, then decode for printing.
serialized = etree.tostring(new_tree, pretty_print=True, xml_declaration=True,
                            encoding="UTF-8")
print(serialized.decode("UTF-8"))
Output
<?xml version='1.0' encoding='UTF-8'?>
<package>
<provider>some data</provider>
<language>en-GB</language>
</package>
you can try with lxml:
# Remove namespace prefixes
for elem in root.iter():
    # Comments and processing instructions have a non-string tag; there is
    # no element name to rewrite on them.
    if not isinstance(elem.tag, str):
        continue
    # XPath local-name() gives the element name without its namespace.
    namespace_removed = elem.xpath('local-name()')
    # Assign it back: computing the name alone leaves the tree unchanged.
    elem.tag = namespace_removed
Define and call the following function, right after you parse the XML string:
from lxml import etree
def clean_xml_namespaces(root):
    """Strip the namespace URI from every element tag under *root*, in place.

    Args:
        root: lxml element (or tree) to clean; it is modified directly.

    Returns:
        None.
    """
    for element in root.iter():
        # Only real elements have a string tag. Comments and processing
        # instructions do not, and etree.QName() cannot take them, so
        # they are left as they are.
        if not isinstance(element.tag, str):
            continue
        element.tag = etree.QName(element).localname
    # Drop namespace declarations that no tag refers to any more.
    etree.cleanup_namespaces(root)
💡 Note - comment elements in the XML are ignored, as they should be
Usage:
# Sample document whose root declares a default namespace and an "xsi"
# prefix, and which contains a comment between two <dependency> elements.
xml_content = b'''<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<dependencies>
<dependency>
<groupId>org.easytesting</groupId>
<artifactId>fest-assert</artifactId>
<version>1.4</version>
</dependency>
<!-- this dependency is critical -->
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId>
<version>3.4</version>
</dependency>
</dependencies>
</project>
'''
root = etree.fromstring(xml_content)
clean_xml_namespaces(root)
# With the namespaces stripped, a plain un-prefixed path is used to look up
# the <dependency> elements.
elements = root.findall(".//dependency")
print(len(elements))
# outputs "2", as expected
So I realize this is an older answer with a highly up-voted and accepted answer, but if you are reading LARGE-FILES and find yourself in the same predicament I did; I hope this helps you out.
The issue with this approach is, in fact, the iteration. Regardless of how fast the parser is, doing anything say... a few 100k times is gonna eat your execution time. With that said, it came down to really thinking about the problem for me and understanding how namespaces work (or are "intended to work", because they are honestly not needed). Now if your xml truly uses namespaces, meaning you see tags that look like this: <xs:table>, then you'll need to tweak the approach here for your use-case. I'll include the full way of handling, as well.
DISCLAIMER : I cannot, with a good conscience, tell you to use regular expressions when parsing html/xml, go look at SergiyKolesnikov's answer as it WORKS, but I had an edge case so with that said... let's dive into some regex!
Problem: namespace stripping takes forever... and most of the time the namespaces only live inside of the very opening tag, or our "root". So in thinking about how python reads information in, and where our only problem-child is that root node, why not use that to our advantage.
Please NOTE: the file i'm using as my example comes as a raw, horrid, remarkably senseless structure of lulz with the promise of data in there somewhere.
my_file is the path to the file im using for our example, I cannot share it with you for professional reasons; and it has been cut down way in size just to get through this answer.
import os, sys, subprocess, re, io, json
from lxml import etree
# Your file would be '_biggest_file' if playing along at home
my_file = _biggest_file

meta_stuff = dict(
    exists = os.path.exists(my_file),
    sizeof = os.path.getsize(my_file),
    # Raw string so that "\." reaches the regex engine as written.
    extension_is_a_real_thing = bool(re.search(r"\.(html|xml)$", my_file, re.I)),
    # Keep whatever follows the last ":" in the output of `file -i`.
    system_thinks_its_a = subprocess.check_output(
        ["file", "-i", my_file]
    ).decode().split(":")[-1].strip(),
)
print(json.dumps(meta_stuff, indent = 2))
So for starters, decently sized, and system thinks at best it's html; the file extension is neither xml or html either...
{
"exists": true,
"sizeof": 24442371,
"extension_is_a_real_thing": false,
"system_thinks_its_a": "text/html; charset=us-ascii"
}
Approach:
In order to parse an xml file... it should at the very least be xml, so we'll need to check and add a declarations tag if one doesn't exist
If I have namespaces.. thats bad because I can't use xpaths, which is what I want to do
If my file is huge, I should only operate on the smallest imaginable parts that I need to clean before I'm ready to parse it.
Function
def speed_read(file_path):
    """Parse an XML file after stripping the attributes of its first tag.

    The file is read as UTF-8 text and a leading XML declaration, if any, is
    dropped. The first tag that starts a line (``<name attr="...">``) is then
    replaced by a bare ``<name>``, a fresh UTF-8 declaration is prepended and
    the result is handed to ``etree.parse``. Only that one tag is edited, so
    the clean-up cost does not grow with the number of elements.

    NOTE(review): every attribute of the first tag is dropped, not only the
    ``xmlns`` declarations, and no other tag is touched. A document whose
    descendants use a prefix declared on the first tag will presumably fail
    to parse afterwards - confirm against your data.

    Args:
        file_path: Path of the XML file to read.

    Returns:
        The tree returned by ``etree.parse``.

    Raises:
        ValueError: If the file contains no tag from which a name can be read.
    """
    # We're gonna be low-brow and add our own declaration using this string.
    _xml_dec = '<?xml version="1.0" encoding="utf-8"?>'
    # An XML declaration at the very start of the text, plus any whitespace
    # after it. Anchored so that nothing else in the document is removed.
    _declaration = re.compile(r'\A<\?xml\s.*?\?>\s*', re.S)
    # The first "<...>" that starts a line. For our purposes that is the node
    # carrying the namespace URIs, e.g.
    #   <actual_name xmlns:xsi="idontactuallydoanything.com">
    _first_node = re.compile(r'^<.*?>', re.M)
    # The tag's name: whatever follows "<" up to whitespace, "/" or ">".
    _first_tagname = re.compile(r'<([^\s/>]+)')

    with open(file_path, "r", encoding="utf-8") as f:
        # Strip leading and trailing whitespace so the anchored patterns
        # above line up with the actual content.
        _raw = f.read().strip()

    # We add our own declaration below, so remove an existing one first.
    _raw = _declaration.sub('', _raw, count=1)

    # Grab the first node and read its name.
    node_match = _first_node.search(_raw)
    name_match = None
    if node_match is not None:
        name_match = _first_tagname.match(node_match.group())
    if name_match is None:
        raise ValueError("no XML tag found in {!r}".format(file_path))
    first_tag = name_match.group(1)

    # Keep a self-closing tag self-closing so the document stays well formed.
    closing = '/>' if node_match.group().endswith('/>') else '>'
    bare_tag = '<{}{}'.format(first_tag, closing)

    # Splice by position instead of re.sub(): the tag text is full of regex
    # metacharacters (".", "?", ...) and must be replaced exactly once.
    _raw = "".join((
        _xml_dec,
        _raw[:node_match.start()],
        bare_tag,
        _raw[node_match.end():],
    ))
    return etree.parse(io.BytesIO(_raw.encode("utf-8")))
# a good answer from above:
def safe_read(file_path):
    """Parse *file_path* and strip the namespace URI from every element tag.

    Args:
        file_path: Path of the XML file to parse.

    Returns:
        The parsed tree, with element tags reduced to their local names and
        unused namespace declarations removed.
    """
    root = etree.parse(file_path)
    for elem in root.iter():
        # Comments and processing instructions have no string tag and cannot
        # be passed to etree.QName(); leave them untouched.
        if not isinstance(elem.tag, str):
            continue
        elem.tag = etree.QName(elem).localname
    # Remove unused namespace declarations
    etree.cleanup_namespaces(root)
    return root
Benchmarking - Yes I know there's better ways to do this.
import time

import pandas as pd


def _time_runs(reader, path, runs=5):
    """Return the seconds taken by each of *runs* calls to ``reader(path)``."""
    timings = []
    for _ in range(runs):
        start = time.perf_counter()
        reader(path)
        timings.append(time.perf_counter() - start)
    return timings


safe_times = _time_runs(safe_read, _biggest_file)
fast_times = _time_runs(speed_read, _biggest_file)
pd.DataFrame({"safe":safe_times, "fast":fast_times})
Results
safe
fast
2.36
0.61
2.15
0.58
2.47
0.49
2.94
0.60
2.83
0.53
The accepted solution removes namespaces in node names and not in attributes, i.e. <b:spam c:name="cheese"/> will be transformed to <spam c:name="cheese"/>.
An updated version which will give you <spam name="cheese"/>
def remove_namespaces(root):
    """Strip namespaces from element tags and attribute names under *root*.

    Comments and processing instructions are skipped. The tree is modified
    in place and ``deannotate(root, cleanup_namespaces=True)`` is called at
    the end.

    Args:
        root: lxml element (or tree) to clean.

    Returns:
        None.
    """
    # deannotate() lives in lxml.objectify; import it here so the function
    # does not depend on the caller having imported it under that bare name.
    from lxml.objectify import deannotate

    for elem in root.iter():
        if isinstance(elem, (etree._Comment, etree._ProcessingInstruction)):
            continue

        localname = etree.QName(elem).localname
        if elem.tag != localname:
            elem.tag = localname

        # Iterate over a snapshot of the names, because the loop body deletes
        # and re-adds attributes.
        for attr_name in list(elem.attrib):
            local_attr_name = etree.QName(attr_name).localname
            if attr_name != local_attr_name:
                attr_value = elem.attrib[attr_name]
                del elem.attrib[attr_name]
                elem.attrib[local_attr_name] = attr_value

    deannotate(root, cleanup_namespaces=True)
Here are two other ways of removing namespaces. The first uses the lxml.etree.QName helper while the second uses regexes. Both functions allow an optional list of namespaces to match against. If no namespace list is supplied then all namespaces are removed. Attribute keys are also cleaned.
from lxml import etree
import re
def remove_namespaces_qname(doc, namespaces=None):
    """Strip namespaces from tags and attribute keys using ``etree.QName``.

    Args:
        doc: lxml element (or tree) to clean; it is modified in place.
        namespaces: Optional collection of namespace URIs. When given, only
            names in one of these namespaces are stripped; when ``None``,
            every name is reduced to its local name.

    Returns:
        The same *doc* object that was passed in.
    """
    def _should_strip(qname):
        # No filter means "strip everything".
        return namespaces is None or qname.namespace in namespaces

    for el in doc.iter():
        # Comments and processing instructions have a non-string tag that
        # etree.QName() cannot take; they have nothing to clean.
        if not isinstance(el.tag, str):
            continue
        # clean tag
        q = etree.QName(el.tag)
        if _should_strip(q):
            el.tag = q.localname
        # clean attributes (iterate a snapshot: the body edits el.attrib)
        for a, v in list(el.items()):
            q = etree.QName(a)
            if _should_strip(q):
                del el.attrib[a]
                el.attrib[q.localname] = v
    return doc
def remove_namespace_re(doc, namespaces=None):
    """Strip ``{uri}`` prefixes from tags and attribute keys using a regex.

    Args:
        doc: Element (or tree) to clean; it is modified in place. Only
            ``iter()``, ``tag``, ``items()`` and ``attrib`` are used.
        namespaces: Optional iterable of namespace URIs. When given, only
            names in one of these namespaces are stripped; when ``None``,
            every ``{uri}`` prefix is removed.

    Returns:
        The same *doc* object that was passed in.
    """
    # "{uri}local" -> group 1 "{uri}", group 2 "local"
    qualified = re.compile(r'(\{.+\})(.+)')
    wanted = None
    if namespaces is not None:
        wanted = {u'{%s}' % n for n in namespaces}

    def _local_name(name):
        """Return the stripped name, or None when *name* must be kept."""
        m = qualified.match(name)
        if m is None:
            return None
        if wanted is not None and m.group(1) not in wanted:
            return None
        return m.group(2)

    # iter() rather than getiterator(), which no longer exists in xml.etree.
    for el in doc.iter():
        # Comments and processing instructions have a non-string tag that
        # cannot be fed to the regex; they have nothing to clean.
        if not isinstance(el.tag, str):
            continue
        # clean tag
        local = _local_name(el.tag)
        if local is not None:
            el.tag = local
        # clean attributes (iterate a snapshot: the body edits el.attrib)
        for a, v in list(el.items()):
            local = _local_name(a)
            if local is not None:
                del el.attrib[a]
                el.attrib[local] = v
    return doc
all you need to do is:
objectify.deannotate(root, cleanup_namespaces=True)
after you have obtained the root with root = tree.getroot()