Python XML validation with SAX, pyexpat, ElementTree

I'm trying to check the validity of XML files (against DTDs, entities, processing instructions, namespaces) in Python 3.4.
Looking at the Python docs, the default underlying parser for the three Python XML modules pyexpat, ElementTree and SAX is expat. The pyexpat page (https://docs.python.org/3.4/library/pyexpat.html?highlight=pyexpat#module-xml.parsers.expat) says that a non-validating version of the expat parser is used:
"The xml.parsers.expat module is a Python interface to the Expat non-validating XML parser." Yet at the same time, the SAX documentation in Python shows all these handler functions for enabling DTD validation etc. How the heck do you make them work?
However, according to the post Parsing XML Entity with python xml.sax, SAX can validate, apparently with expat as the parser.
I have reused the code from that post but can't get it to work; I get an error saying expat does not support validation:
"File "/usr/lib/python3.4/xml/sax/expatreader.py", line 149, in setFeature
"expat does not support validation")
xml.sax._exceptions.SAXNotSupportedException: expat does not support validation".
That post used Python 2.5, so maybe SAX has changed since then...
This is the code:
import xml.sax
from xml.sax import handler, make_parser, parse
import os
import collections


class SaxParser():
    # initializer with directory path as argument
    def __init__(self, dir_path):
        self.dir_path = dir_path

    def test_each_file(self, file_path):
        # ensure full file name is shown
        rev = file_path[::-1]  # reverse string file_path to access position of "/"
        file = file_path[-rev.index("/"):]
        try:
            f = open(file_path, 'r', encoding="ISO-8859-1")  # same as "latin-1" encoding
            # see this for enabling validation:
            # https://stackoverflow.com/questions/6349513/parsing-xml-entity-with-python-xml-sax
            parser = make_parser()  # default parser is expat
            parser.setContentHandler(handler.ContentHandler())
            parser.setFeature(handler.feature_namespaces, True)
            parser.setFeature(handler.feature_validation, True)
            parser.setFeature(handler.feature_external_ges, True)
            parser.parse(f)
            f.close()
            return (file, "OK")
        except xml.sax.SAXParseException as PE:
            column = PE.getColumnNumber()
            line = PE.getLineNumber()
            msg = PE.getMessage()
            value = msg + " " + str(line) + " " + str(column)
            return (file, value)
        except ValueError:
            return (file, "ValueError. DTD uri not found.")  # that can happen

    def test_directory_sax(self, dir_path):
        tuples = []
        for ind, file in enumerate(os.listdir(dir_path), 1):
            if file.endswith('.xml'):
                tuples.append(self.test_each_file(dir_path + file))
        # convert into dict and sort it by key (file number)
        dict_of_errors = dict(tuples)
        dict_of_errors = collections.OrderedDict(sorted(dict_of_errors.items()))
        return dict_of_errors
# ========================================================================
# INVOKE TESTS FOR SINGLE SPECIFIED DIRECTORY THAT CONTAINS TEST FILES
# ========================================================================
path = "/path/to/xml/dir/"  # placeholder: directory where the xml files are - not a file path!
single_sax = SaxParser(path)
print('============================================================')
print('TEST FOR SAX parser FOR DIRECTORY ' + path)
print('============================================================\n')
print(single_sax.test_directory_sax(path))
and a test XML file (it should produce a validation error):
<!DOCTYPE root [
<!ATTLIST root
id2 ID "x23"
>
]>
<!-- an ID attribute must have a declared default
of #IMPLIED or #REQUIRED
-->
<root/>
How do I check validity with any one of the three XML modules?
A simple example would do.
Thanks.

If you look into the source file, you'll see that setting xml.sax.handler.feature_validation does nothing but raise this exception:
def setFeature(self, name, state):
    # ...
    elif name == feature_validation:
        if state:
            raise SAXNotSupportedException(
                "expat does not support validation")
    # ...
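In practice, the closest the stock expat backend gets you is well-formedness checking plus external entity resolution. A minimal sketch (no DTD validation happens here; only well-formedness errors are reported):

import xml.sax
from xml.sax import handler, make_parser

parser = make_parser()  # expat underneath
parser.setContentHandler(handler.ContentHandler())
parser.setFeature(handler.feature_external_ges, True)  # fetch external entities/DTD
try:
    parser.parse("file.xml")
    print("well-formed")
except xml.sax.SAXParseException as e:
    print(e.getMessage(), e.getLineNumber(), e.getColumnNumber())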
I would suggest using lxml to do this. An example would be like this:
from lxml import etree
from io import StringIO  # Python 2: from cStringIO import StringIO
f = StringIO('<!ATTLIST root id2 ID "x23">')
dtd = etree.DTD(f)
root = etree.XML('<root/>')
print(dtd.validate(root))
print(dtd.error_log.filter_from_errors()[0])
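If your files declare their own DOCTYPE (as the test file above does), lxml can also validate during parsing instead of requiring a separately constructed DTD object. A minimal sketch, assuming the DOCTYPE's system identifier is resolvable:

from lxml import etree

parser = etree.XMLParser(dtd_validation=True)  # validate against the DTD named in the DOCTYPE
try:
    etree.parse("file.xml", parser)
    print("valid")
except etree.XMLSyntaxError as e:
    print(e)  # both well-formedness and validity errors end up here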

Related

NotXMLError: Failed to parse the XML data

I'm trying to use the Entrez module from Biopython to retrieve full-text articles from PubMed Central. This is my code:
import urllib3
import json
import requests
from Bio import Entrez
from Bio.Entrez import efetch, Parser

print(Parser.__file__)

pmcid = 'PMC2837563'

def print_text(pmcid):
    handle = efetch(db='pmc', id=pmcid, retmode='xml', rettype=None)
    # print(handle.read())
    record = Entrez.read(handle)
    print(record)

print_text(pmcid)
handle.read() works, which means the data is being fetched properly. But I'm not able to run Entrez.read(handle) to convert the fetched data into a Python object. It gives me the error below:
NotXMLError: Failed to parse the XML data (syntax error: line 1036, column 69). Please make sure that the input data are in XML format.
Could someone tell me what to do about this? The syntax seems correct as per the Biopython documentation.
The reason is that the latest available Biopython version (1.79) does not recognise the DTD with uri http://www.niso.org/schemas/ali/1.0/. The GitHub version has the corrected Parser, but it is not yet available from pip.
Compare:
current 1.79
def startNamespaceDeclHandler(self, prefix, uri):
    """Handle start of an XML namespace declaration."""
    if prefix == "xsi":
        # This is an xml schema
        self.schema_namespace = uri
        self.parser.StartElementHandler = self.schemaHandler
    else:
        # Note that the DTD for MathML specifies a default attribute
        # that declares the namespace for each MathML element. This means
        # that MathML element in the XML has an invisible MathML namespace
        # declaration that triggers a call to startNamespaceDeclHandler
        # and endNamespaceDeclHandler. Therefore we need to count how often
        # startNamespaceDeclHandler and endNamespaceDeclHandler were called
        # to find out their first and last invocation for each namespace.
        if prefix == "mml":
            assert uri == "http://www.w3.org/1998/Math/MathML"
        elif prefix == "xlink":
            assert uri == "http://www.w3.org/1999/xlink"
        else:
            raise ValueError("Unknown prefix '%s' with uri '%s'" % (prefix, uri))
        self.namespace_level[prefix] += 1
        self.namespace_prefix[uri] = prefix
GitHub
def startNamespaceDeclHandler(self, prefix, uri):
    """Handle start of an XML namespace declaration."""
    if prefix == "xsi":
        # This is an xml schema
        self.schema_namespace = uri
        self.parser.StartElementHandler = self.schemaHandler
    else:
        # Note that the DTD for MathML specifies a default attribute
        # that declares the namespace for each MathML element. This means
        # that MathML element in the XML has an invisible MathML namespace
        # declaration that triggers a call to startNamespaceDeclHandler
        # and endNamespaceDeclHandler. Therefore we need to count how often
        # startNamespaceDeclHandler and endNamespaceDeclHandler were called
        # to find out their first and last invocation for each namespace.
        if prefix == "mml":
            assert uri == "http://www.w3.org/1998/Math/MathML"
        elif prefix == "xlink":
            assert uri == "http://www.w3.org/1999/xlink"
        elif prefix == "ali":
            assert uri == "http://www.niso.org/schemas/ali/1.0/"
        else:
            raise ValueError(f"Unknown prefix '{prefix}' with uri '{uri}'")
        self.namespace_level[prefix] += 1
        self.namespace_prefix[uri] = prefix
So you can either replace or edit the Parser.py file, or use third-party libraries to convert your handle into a built-in Python object.
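As a minimal sketch of that last route (the paragraph extraction is only illustrative, and it assumes the article XML uses no DTD-defined entities): skip Entrez.read() entirely and hand the fetched XML to the standard library's ElementTree, which does not check the DTD:

import xml.etree.ElementTree as ET
from Bio import Entrez
from Bio.Entrez import efetch

Entrez.email = "you@example.com"  # placeholder; NCBI asks you to identify yourself
handle = efetch(db='pmc', id='PMC2837563', retmode='xml', rettype=None)
root = ET.fromstring(handle.read())
# e.g. pull the visible text of every paragraph in the article
paragraphs = [''.join(p.itertext()) for p in root.iter('p')]
print('\n\n'.join(paragraphs[:3]))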
If you just want to download the full text of the article, you could try downloading the PDF through metapub and then extracting the text via textract.
import metapub
from urllib.request import urlretrieve
import textract

pmcid = 'PMC2837563'
any_path = 'article.pdf'      # placeholder: where to store the PDF
another_path = 'article.txt'  # placeholder: where to store the extracted text

fetch = metapub.PubMedFetcher()
article_metadata = fetch.article_by_pmcid(pmcid)

# Get just an abstract
abstract = article_metadata.abstract

# Download full article text
pmid = article_metadata.pmid
url = metapub.FindIt(pmid).url
urlretrieve(url, any_path)
with open(another_path, "wb") as textfile:  # textract.process() returns bytes
    textfile.write(textract.process(
        any_path,
        extension='pdf',
        method='pdftotext',
        encoding="utf_8",
    ))

Loading and dumping multiple yaml files with ruamel.yaml (python)

Using Python 2 (atm) and ruamel.yaml 0.13.14 (RedHat EPEL).
I'm currently writing some code to load YAML definitions, but they are split up over multiple files. The user-editable part contains e.g.:
users:
  xxxx1:
    timestamp: '2018-10-22 11:38:28.541810'
    << : *userdefaults
  xxxx2:
    << : *userdefaults
    timestamp: '2018-10-22 11:38:28.541810'
the defaults are stored in another file, which is not editable:
userdefaults: &userdefaults
# Default values for user settings
fileCountQuota: 1000
diskSizeQuota: "300g"
I can process these together by loading both files and concatenating the strings, then running them through merged_data = list(yaml.load_all("{}\n{}".format(defaults_data, user_data), Loader=yaml.RoundTripLoader)), which correctly resolves everything. (When not using RoundTripLoader I get errors that the references cannot be resolved, which is normal.)
Now I want to do some updates via Python code (e.g. update the timestamp), and for that I need to write back just the user part. And that's where things get hairy: so far I haven't found a way to write just that YAML document, not both.
First of all, unless there are multiple documents in your defaults file, you
don't have to use load_all, as you are not concatenating two documents into a
multiple-document stream. If you had, by using a format string with a document-end
marker ("{}\n...\n{}") or with a directives-end marker ("{}\n---\n{}"),
your aliases would not carry over from one document to another, as per the
YAML specification:
It is an error for an alias node to use an anchor that does not
previously occur in the document.
The anchor has to be in the document, not just in the stream (which can consist of multiple
documents).
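You can see that rule in action in a few lines (a minimal sketch using the same old API as your code; the failing variant raises a ComposerError for the undefined alias):

from ruamel import yaml

# one document, one stream: the alias resolves
ok = "defaults: &d {a: 1}\nuser:\n  << : *d\n"
print(list(yaml.load_all(ok, Loader=yaml.RoundTripLoader)))

# two documents: the anchor from document 1 is not visible in document 2
bad = "defaults: &d {a: 1}\n---\nuser:\n  << : *d\n"
try:
    list(yaml.load_all(bad, Loader=yaml.RoundTripLoader))
except yaml.YAMLError as e:
    print(e)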
I tried some hocus pocus, pre-populating the already represented dictionary
of anchored nodes:
import sys
import datetime
from ruamel import yaml


def load():
    with open('defaults.yaml') as fp:
        defaults_data = fp.read()
    with open('user.yaml') as fp:
        user_data = fp.read()
    merged_data = yaml.load("{}\n{}".format(defaults_data, user_data),
                            Loader=yaml.RoundTripLoader)
    return merged_data


class MyRTDGen(object):
    class MyRTD(yaml.RoundTripDumper):
        def __init__(self, *args, **kw):
            pps = kw.pop('pre_populate', None)
            yaml.RoundTripDumper.__init__(self, *args, **kw)
            if pps is not None:
                for pp in pps:
                    try:
                        anchor = pp.yaml_anchor()
                    except AttributeError:
                        anchor = None
                    node = yaml.nodes.MappingNode(
                        u'tag:yaml.org,2002:map', [], flow_style=None, anchor=anchor)
                    self.represented_objects[id(pp)] = node

    def __init__(self, pre_populate=None):
        assert isinstance(pre_populate, list)
        self._pre_populate = pre_populate

    def __call__(self, *args, **kw):
        kw1 = kw.copy()
        kw1['pre_populate'] = self._pre_populate
        myrtd = self.MyRTD(*args, **kw1)
        return myrtd


def update(md, file_name):
    ud = md.pop('userdefaults')
    MyRTD = MyRTDGen([ud])
    yaml.dump(md, sys.stdout, Dumper=MyRTD)
    with open(file_name, 'w') as fp:
        yaml.dump(md, fp, Dumper=MyRTD)


md = load()
md['users']['xxxx2']['timestamp'] = str(datetime.datetime.utcnow())
update(md, 'user.yaml')
Since the PyYAML-based API requires a class instead of an object, you need to
use a class generator that actually adds the data elements to pre-populate on
the fly from within yaml.dump().
But this doesn't work, as a node only gets written out with an anchor once it is
determined that the anchor is used (i.e. there is a second reference). So actually the
first merge key gets written out as an anchor. And although I am quite familiar
with the code base, I could not get this to work properly in a reasonable amount of time.
So instead, I would just rely on the fact that there is only one key that matches
the first key of user.yaml at the root level of the dump of the combined updated
file, and strip anything before that.
import sys
import datetime
from ruamel import yaml

with open('defaults.yaml') as fp:
    defaults_data = fp.read()
with open('user.yaml') as fp:
    user_data = fp.read()
merged_data = yaml.load("{}\n{}".format(defaults_data, user_data),
                        Loader=yaml.RoundTripLoader)

# find the key
for line in user_data.splitlines():
    line = line.split('# ')[0].rstrip()  # end of line comment, not checking for strings
    if line and line[-1] == ':' and line[0] != ' ':
        split_key = line
        break

merged_data['users']['xxxx2']['timestamp'] = str(datetime.datetime.utcnow())
buf = yaml.compat.StringIO()
yaml.dump(merged_data, buf, Dumper=yaml.RoundTripDumper)
document = split_key + buf.getvalue().split('\n' + split_key)[1]
sys.stdout.write(document)
which gives:
users:
  xxxx1:
    <<: *userdefaults
    timestamp: '2018-10-22 11:38:28.541810'
  xxxx2:
    <<: *userdefaults
    timestamp: '2018-10-23 09:59:13.829978'
I had to make a virtualenv to make sure I could run the above with ruamel.yaml==0.13.14.
That version is from the time I was still young (I won't claim to have been innocent).
There have been over 85 releases of the library since then.
I can understand that you might not be able to run anything but
Python 2 at the moment and cannot compile/use a newer version. But what
you really should do is install virtualenv (can be done using EPEL, but also without
further "polluting" your system installation), make a virtualenv for the
code you are developing, and install the latest version of ruamel.yaml (and
your other libraries) in there. You can also do that if you need
to distribute your software to other systems; just install virtualenv there as well.
I have all my utilities under /opt/util, managed with virtualenvutils, a
wrapper around virtualenv.
For writing the user part, you will have to manually split the multi-document output of yaml.dump() and write the appropriate part back to the users YAML file.
import datetime
import StringIO
import ruamel.yaml

yaml = ruamel.yaml.YAML(typ='rt')
data = None
with open('defaults.yaml', 'r') as defaults:
    with open('users.yaml', 'r') as users:
        raw = "{}\n{}".format(''.join(defaults.readlines()), ''.join(users.readlines()))
        data = list(yaml.load_all(raw))

data[0]['users']['xxxx1']['timestamp'] = datetime.datetime.now().isoformat()

with open('users.yaml', 'w') as outfile:
    sio = StringIO.StringIO()
    yaml.dump(data[0], sio)
    out = sio.getvalue()
    outfile.write(out.split('\n\n')[1])  # write the second part here as this is the contents of users.yaml

Building a generic XML parser in Python?

I am a newbie with one week of experience writing Python scripts.
I am trying to write a generic parser (a library for all my future jobs) which parses any input XML without any prior knowledge of its tags:
Parse the input XML.
Get the values from the XML and set the values based on the tags.
Use these values in the rest of the job.
I am using the "xml.etree.ElementTree" library and I am able to parse the XML in the way shown below.
#!/usr/bin/python
import os
import xml.etree.ElementTree as etree
import logging
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)
logger.info('start reading XML property file')
filename = "mood_ib_history_parameters_DEV.xml"
logger.info('getting the current location')
__currentlocation__ = os.getcwd()
__fullpath__ = os.path.join(__currentlocation__,filename)
logger.info('start parsing the XML property file')
tree = etree.parse(__fullpath__)
root = tree.getroot()
hive_db = root.find("hive_db").text
EDGE_HIVE_CONN = root.find("EDGE_HIVE_CONN").text
target_dir = root.find("target_dir").text
to_email_alias = root.find("to_email_alias").text
to_email_cc = root.find("to_email_cc").text
from_email_alias = root.find("from_email_alias").text
dburl = root.find("dburl").text
SQOOP_EDGE_CONN = root.find("SQOOP_EDGE_CONN").text
user_name = root.find("user_name").text
password = root.find("password").text
IB_log_table = root.find("IB_log_table").text
SR_DG_master_table = root.find("SR_DG_master_table").text
SR_DG_table = root.find("SR_DG_table").text
logger.info('Hive DB %s', hive_db)
logger.info('Edge Hive Connection %s', EDGE_HIVE_CONN)
logger.info('Target Directory %s', target_dir)
logger.info('To Email address %s', to_email_alias)
logger.info('CC Email address %s', to_email_cc)
logger.info('From Email address %s', from_email_alias)
logger.info('DB URL %s',dburl)
logger.info('Sqoop Edge node connection %s',SQOOP_EDGE_CONN)
logger.info('Log table name %s',IB_log_table)
logger.info('Master table name %s',SR_DG_master_table)
logger.info('Data governance table name %s',SR_DG_table)
Now the question is: if I want to parse an XML without any knowledge of the tags and elements and use the values, how do I do it? I have gone through multiple tutorials, but all of them help me parse the XML by using tags, like below:
SQOOP_EDGE_CONN = root.find("SQOOP_EDGE_CONN").text
Can anybody point me to the right tutorial, library, or code snippet to parse the XML dynamically?
I think the official documentation is pretty clear and contains some examples: https://docs.python.org/3/library/xml.etree.elementtree.html
The main part you need to implement is a loop over the child nodes (potentially recursively):
for child in root:
    # child.tag contains the tag name, child.attrib contains the attributes
    print(child.tag, child.attrib)
Well, parsing is as easy as that: etree.parse(path).
Once you've got the root via tree.getroot(), you can just iterate over the tree using Python's "in":
for child_node in tree.getroot():
    print(child_node.text)
Then, to see which tags these child_nodes have, you apply the same trick.
This lets you go over all tags in the XML without having to know the tag names at all.
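If you want the whole document as plain Python data rather than Element objects, recurse. A minimal sketch (to_dict is my own helper name; it assumes tags are unique per level and ignores attributes):

import xml.etree.ElementTree as etree

def to_dict(elem):
    # leaf element: return its text; otherwise recurse into the children
    children = list(elem)
    if not children:
        return elem.text.strip() if elem.text else None
    return {child.tag: to_dict(child) for child in children}

tree = etree.parse("mood_ib_history_parameters_DEV.xml")
config = to_dict(tree.getroot())
print(config["hive_db"])  # the parser itself never needed to know this tag name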

lxml, xi:include, and original file

I'm using lxml to parse a file that contains xi:include elements, and I'm resolving the includes using xinclude().
Given an element, is there any way to identify the file and source line that the element originally appeared in?
For example:
from lxml import etree
doc = etree.parse('file.xml')
doc.xinclude()
xpath_expression = ...
elt = doc.xpath(xpath_expression)
# Print file name and source line of `elt` location
The xinclude expansion will add an xml:base attribute to the top-level expanded element,
and elt.base and elt.sourceline are updated for the child nodes as well, so:
print(elt.base, elt.sourceline)
will give you what you want.
If elt is not part of the xinclude expansion, then elt.base will point to the base
document ( 'file.xml' ) and elt.sourceline will be the line number in that file.
(Note that sourceline usually seems to point to the line where the element tag
ends, not to the line where it begins, if the element spans multiple lines, just as
validation error messages usually point to the closing tag where the error occurs.)
You can find the initially xincluded elements and check this with:
xels = doc.xpath('//*[@xml:base]')
for x in xels:
    print(x.tag, x.base, x.sourceline)
    for c in x.getchildren():
        print(c.tag, c.base, c.sourceline)
Sadly, current versions of lxml no longer include this ability. However, I've developed a workaround using a simple custom loader. Here's a test script which demonstrates the bug in the approach above along with the workaround. Note that this approach only updates the xml:base attribute of the root tag of the included document.
The output of the program (using Python 3.9.1, lxml 4.6.3):
Included file was source.xml; xinclude reports it as document.xml
Included file was source.xml; workaround reports it as source.xml
Here's the sample program.
# Includes
# ========
from pathlib import Path
from textwrap import dedent

from lxml import etree as ElementTree
from lxml import ElementInclude

# Setup
# =====
# Create a sample document, taken from the `Python stdlib
# <https://docs.python.org/3/library/xml.etree.elementtree.html#id3>`_...
Path("document.xml").write_text(
    dedent(
        """\
        <?xml version="1.0"?>
        <document xmlns:xi="http://www.w3.org/2001/XInclude">
            <xi:include href="source.xml" parse="xml" />
        </document>
        """
    )
)
# ...and the associated include file.
Path("source.xml").write_text("<para>This is a paragraph.</para>")

# Failing xinclude case
# =====================
# Load and xinclude this.
tree = ElementTree.parse("document.xml")
tree.xinclude()
# Show that the ``base`` attribute refers to the top-level
# ``document.xml``, instead of the xincluded ``source.xml``.
root = tree.getroot()
print(f"Included file was source.xml; xinclude reports it as {root[0].base}")

# Workaround
# ==========
# As a workaround, define a loader which sets the ``xml:base`` of an
# xincluded element. While lxml evidently used to do this, a change
# eliminated this ability per some `discussion
# <https://mail.gnome.org/archives/xml/2014-April/msg00015.html>`_,
# which included a rejected patch fixing this problem. `Current source
# <https://github.com/GNOME/libxml2/blob/master/xinclude.c#L1689>`_
# lacks this patch.
def my_loader(href, parse, encoding=None, parser=None):
    ret = ElementInclude._lxml_default_loader(href, parse, encoding, parser)
    ret.attrib["{http://www.w3.org/XML/1998/namespace}base"] = href
    return ret

new_tree = ElementTree.parse("document.xml")
ElementInclude.include(new_tree, loader=my_loader)
new_root = new_tree.getroot()
print(f"Included file was source.xml; workaround reports it as {new_root[0].base}")

DTD Validation With Python? [duplicate]

I need to validate an XML string (not a file) against a DTD description file.
How can that be done in Python?
Another good option is lxml's validation, which I find quite pleasant to use.
A simple example, taken from the lxml site:
from io import StringIO  # Python 2: from StringIO import StringIO
from lxml import etree
dtd = etree.DTD(StringIO("""<!ELEMENT foo EMPTY>"""))
root = etree.XML("<foo/>")
print(dtd.validate(root))
# True
root = etree.XML("<foo>bar</foo>")
print(dtd.validate(root))
# False
print(dtd.error_log.filter_from_errors())
# <string>:1:0:ERROR:VALID:DTD_NOT_EMPTY: Element foo was declared EMPTY this one has content
From the examples directory in the libxml2 Python bindings:
#!/usr/bin/python -u
import libxml2
import sys

# Memory debug specific
libxml2.debugMemory(1)

dtd = """<!ELEMENT foo EMPTY>"""
instance = """<?xml version="1.0"?>
<foo></foo>"""

# parseDTD loads the external DTD from the file 'test.dtd'
# (the dtd string above is what that file is expected to contain)
dtd = libxml2.parseDTD(None, 'test.dtd')
ctxt = libxml2.newValidCtxt()
doc = libxml2.parseDoc(instance)
ret = doc.validateDtd(ctxt, dtd)
if ret != 1:
    print "error doing DTD validation"
    sys.exit(1)

doc.freeDoc()
dtd.freeDtd()
del dtd
del ctxt
