NotXMLError: Failed to parse the XML data - python

I'm trying to use the Entrez module from Biopython to retrieve full-text articles from PubMed Central. This is my code:
import urllib3
import json
import requests
from Bio import Entrez
from Bio.Entrez import efetch, Parser

print(Parser.__file__)

pmcid = 'PMC2837563'

def print_text(pmcid):
    handle = efetch(db='pmc', id=pmcid, retmode='xml', rettype=None)
    #print(handle.read())
    record = Entrez.read(handle)
    print(record)

print_text(pmcid)
handle.read() works, which means the data is being fetched properly. But I'm not able to do Entrez.read(handle) to convert the fetched data into a Python object. It gives me the error below:
NotXMLError: Failed to parse the XML data (syntax error: line 1036, column 69). Please make sure that the input data are in XML format.
Could someone tell me what to do about this? The syntax seems to be correct as per the Biopython documentation.

The reason is that the latest released Biopython version (1.79) does not recognise the DTD with URI http://www.niso.org/schemas/ali/1.0/. The GitHub version has the corrected Parser, but it is not yet available from pip.
Compare:
Current 1.79:
def startNamespaceDeclHandler(self, prefix, uri):
    """Handle start of an XML namespace declaration."""
    if prefix == "xsi":
        # This is an xml schema
        self.schema_namespace = uri
        self.parser.StartElementHandler = self.schemaHandler
    else:
        # Note that the DTD for MathML specifies a default attribute
        # that declares the namespace for each MathML element. This means
        # that MathML element in the XML has an invisible MathML namespace
        # declaration that triggers a call to startNamespaceDeclHandler
        # and endNamespaceDeclHandler. Therefore we need to count how often
        # startNamespaceDeclHandler and endNamespaceDeclHandler were called
        # to find out their first and last invocation for each namespace.
        if prefix == "mml":
            assert uri == "http://www.w3.org/1998/Math/MathML"
        elif prefix == "xlink":
            assert uri == "http://www.w3.org/1999/xlink"
        else:
            raise ValueError("Unknown prefix '%s' with uri '%s'" % (prefix, uri))
        self.namespace_level[prefix] += 1
        self.namespace_prefix[uri] = prefix
GitHub:
def startNamespaceDeclHandler(self, prefix, uri):
    """Handle start of an XML namespace declaration."""
    if prefix == "xsi":
        # This is an xml schema
        self.schema_namespace = uri
        self.parser.StartElementHandler = self.schemaHandler
    else:
        # Note that the DTD for MathML specifies a default attribute
        # that declares the namespace for each MathML element. This means
        # that MathML element in the XML has an invisible MathML namespace
        # declaration that triggers a call to startNamespaceDeclHandler
        # and endNamespaceDeclHandler. Therefore we need to count how often
        # startNamespaceDeclHandler and endNamespaceDeclHandler were called
        # to find out their first and last invocation for each namespace.
        if prefix == "mml":
            assert uri == "http://www.w3.org/1998/Math/MathML"
        elif prefix == "xlink":
            assert uri == "http://www.w3.org/1999/xlink"
        elif prefix == "ali":
            assert uri == "http://www.niso.org/schemas/ali/1.0/"
        else:
            raise ValueError(f"Unknown prefix '{prefix}' with uri '{uri}'")
        self.namespace_level[prefix] += 1
        self.namespace_prefix[uri] = prefix
So you can either replace or edit the Parser.py file, or use a different library to convert your handle into a built-in Python object.
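For instance, here is a minimal sketch of the second option, reusing the efetch call from the question: skip Entrez.read() entirely and hand the raw XML to the standard library's ElementTree, whose non-validating expat parser does not check namespace prefixes against Biopython's whitelist.

import xml.etree.ElementTree as ET
from Bio.Entrez import efetch

handle = efetch(db='pmc', id='PMC2837563', retmode='xml', rettype=None)
tree = ET.parse(handle)  # efetch handles are file-like, so ElementTree can read them directly
root = tree.getroot()

# e.g. dump the text of every <p> element in the article
for p in root.iter('p'):
    print(''.join(p.itertext()))

You lose the dictionary-like structure that Entrez.read() would give you, but for pulling full text out of a PMC article a plain tree walk like this is usually enough.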
If you just want to download the full text of the article, you could try downloading the PDF through metapub and then extracting the text via textract.
import metapub
from urllib.request import urlretrieve
import textract

pmcid = 'PMC2837563'

fetch = metapub.PubMedFetcher()
article_metadata = fetch.article_by_pmcid(pmcid)

# Get just the abstract
abstract = article_metadata.abstract

# Download the full article text
pmid = article_metadata.pmid
url = metapub.FindIt(pmid).url
urlretrieve(url, any_path)  # any_path: where to save the PDF (your choice)
with open(another_path, "wb") as textfile:  # textract.process returns bytes
    textfile.write(textract.process(
        any_path,
        extension='pdf',
        method='pdftotext',
        encoding="utf_8",
    ))

Related

Building a generic XML parser in Python?

I am a newbie with one week of experience writing Python scripts.
I am trying to write a generic parser (a library for all my future jobs) which parses any input XML without any prior knowledge of its tags. It should:
Parse the input XML.
Get the values from the XML and set the values based on the tags.
Use these values in the rest of the job.
I am using the xml.etree.ElementTree library and I am able to parse the XML in the way shown below.
#!/usr/bin/python
import os
import xml.etree.ElementTree as etree
import logging
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)
logger.info('start reading XML property file')
filename = "mood_ib_history_parameters_DEV.xml"
logger.info('getting the current location')
__currentlocation__ = os.getcwd()
__fullpath__ = os.path.join(__currentlocation__,filename)
logger.info('start parsing the XML property file')
tree = etree.parse(__fullpath__)
root = tree.getroot()
hive_db = root.find("hive_db").text
EDGE_HIVE_CONN = root.find("EDGE_HIVE_CONN").text
target_dir = root.find("target_dir").text
to_email_alias = root.find("to_email_alias").text
to_email_cc = root.find("to_email_cc").text
from_email_alias = root.find("from_email_alias").text
dburl = root.find("dburl").text
SQOOP_EDGE_CONN = root.find("SQOOP_EDGE_CONN").text
user_name = root.find("user_name").text
password = root.find("password").text
IB_log_table = root.find("IB_log_table").text
SR_DG_master_table = root.find("SR_DG_master_table").text
SR_DG_table = root.find("SR_DG_table").text
logger.info('Hive DB %s', hive_db)
logger.info('Edge Hive Connection %s', EDGE_HIVE_CONN)
logger.info('Target Directory %s', target_dir)
logger.info('To Email address %s', to_email_alias)
logger.info('CC Email address %s', to_email_cc)
logger.info('From Email address %s', from_email_alias)
logger.info('DB URL %s', dburl)
logger.info('Sqoop Edge node connection %s', SQOOP_EDGE_CONN)
logger.info('Log table name %s', IB_log_table)
logger.info('Master table name %s', SR_DG_master_table)
logger.info('Data governance table name %s', SR_DG_table)
Now the question is: if I want to parse an XML without any knowledge of its tags and elements and use the values, how do I do it? I have gone through multiple tutorials, but all of them help me parse the XML by using the tags, like below:
SQOOP_EDGE_CONN = root.find("SQOOP_EDGE_CONN").text
Can anybody point me to the right tutorial, library, or code snippet to parse the XML dynamically?
I think the official documentation is pretty clear and contains some examples: https://docs.python.org/3/library/xml.etree.elementtree.html
The main part you need to implement is a loop over the child nodes (potentially recursively), as sketched after the snippet below:
for child in root:
    # child.tag contains the tag name, child.attrib contains the attributes
    print(child.tag, child.attrib)
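A minimal recursive sketch of that idea, fully tag-agnostic (the property file from the question is used only as a stand-in input):

import xml.etree.ElementTree as etree

def walk(element, depth=0):
    # visit every element without knowing its tag in advance
    text = (element.text or '').strip()
    print('  ' * depth + element.tag, element.attrib, text)
    for child in element:
        walk(child, depth + 1)

tree = etree.parse("mood_ib_history_parameters_DEV.xml")
walk(tree.getroot())

From there you could collect (tag, text) pairs into a dict instead of printing them, which gives you the generic lookup the question asks for, as long as tag names don't repeat.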
Well, parsing is as easy as etree.parse(path).
Once you've got the root using tree.getroot(), you can just iterate over the tree with Python's for ... in:
for child_node in tree.getroot():
    print(child_node.text)
Then, to see what tags those child nodes have, you do the same trick.
This lets you go over all tags in the XML without having to know the tag names at all.

Python XML validation with SAX, pyexpat, ElementTree

I'm trying to check validity of XML files (against DTDs, entities, Processing instructions, namespaces) in Python 3.4.
Looking at the Python docs, the default underlying parser for the three Python XML modules pyexpat, ElementTree and SAX is expat. The pyexpat page (https://docs.python.org/3.4/library/pyexpat.html?highlight=pyexpat#module-xml.parsers.expat) says that the non-validating version of the expat parser is used:
"The xml.parsers.expat module is a Python interface to the Expat non-validating XML parser." Yet at the same time, when you look into the SAX documentation in Python, you see all these handler functions for enabling DTD validation etc. How the heck do you make them work?
However, according to the post Parsing XML Entity with python xml.sax, SAX can validate - obviously with expat as the parser.
I have reused the code from that post but can't get it to work; I get an error saying expat does not support validation:
"File "/usr/lib/python3.4/xml/sax/expatreader.py", line 149, in setFeature
"expat does not support validation")
xml.sax._exceptions.SAXNotSupportedException: expat does not support validation".
In the post Python 2.5 was used, so maybe SAX has changed since then...
This is the code:
import xml.sax
from xml.sax import handler, make_parser, parse
import os
import collections

class SaxParser():
    # initializer with directory path as argument
    def __init__(self, dir_path):
        self.dir_path = dir_path

    def test_each_file(self, file_path):
        # ensure full file name is shown
        rev = file_path[::-1]  # reverse string file_path to access position of "/"
        file = file_path[-rev.index("/"):]
        try:
            f = open(file_path, 'r', encoding="ISO-8859-1")  # same as "latin-1" encoding
            # see this for enabling validation:
            # https://stackoverflow.com/questions/6349513/parsing-xml-entity-with-python-xml-sax
            parser = make_parser()  # default parser is expat
            parser.setContentHandler(handler.ContentHandler())
            parser.setFeature(handler.feature_namespaces, True)
            parser.setFeature(handler.feature_validation, True)
            parser.setFeature(handler.feature_external_ges, True)
            parser.parse(f)
            f.close()
            return (file, "OK")
        except xml.sax.SAXParseException as PE:
            column = PE.getColumnNumber()
            line = PE.getLineNumber()
            msg = PE.getMessage()
            value = msg + " " + str(line) + " " + str(column)
            return (file, value)
        except ValueError:
            return (file, "ValueError. DTD uri not found.")  # that can happen

    def test_directory_sax(self, dir_path):
        tuples = []
        for ind, file in enumerate(os.listdir(dir_path), 1):
            if file.endswith('.xml'):
                tuples.append(self.test_each_file(dir_path + file))
        # convert into dict and sort it by key (file number)
        dict_of_errors = dict(tuples)
        dict_of_errors = collections.OrderedDict(sorted(dict_of_errors.items()))
        return dict_of_errors

# ========================================================================
# INVOKE TESTS FOR SINGLE SPECIFIED DIRECTORY THAT CONTAINS TEST FILES
# ========================================================================
path = ""  # path to the directory where the xml files are - not the filepath!
single_sax = SaxParser(path)
print('============================================================')
print('TEST FOR SAX parser FOR DIRECTORY ' + path)
print('============================================================\n')
print(single_sax.test_directory_sax(path))
and the test xml file (it should produce a validation error):
<!DOCTYPE root [
<!ATTLIST root
id2 ID "x23"
>
]>
<!-- an ID attribute must have a declared default
of #IMPLIED or #REQUIRED
-->
<root/>
How do I check validity with any of the three XML modules? A simple example would do.
Thanks.
If you look into the source file, you'll see that xml.sax.handler.feature_validation is not really doing anything but raising this exception:
def setFeature(self, name, state):
    # ...
    elif name == feature_validation:
        if state:
            raise SAXNotSupportedException(
                "expat does not support validation")
    # ...
I would suggest using lxml to do this. An example would be like this:
from lxml import etree
from io import StringIO  # Python 2: from cStringIO import StringIO

f = StringIO('<!ATTLIST root id2 ID "x23">')
dtd = etree.DTD(f)
root = etree.XML('<root/>')
print(dtd.validate(root))
print(dtd.error_log.filter_from_errors()[0])
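Alternatively - and this is just a sketch, assuming the DTD lives inside the document as in your test file - lxml can validate against the internal subset while parsing, if you construct the parser with dtd_validation=True:

from lxml import etree

xml = b'''<!DOCTYPE root [
<!ATTLIST root id2 ID "x23">
]>
<root/>'''

parser = etree.XMLParser(dtd_validation=True)
try:
    etree.fromstring(xml, parser)
    print("valid")
except etree.XMLSyntaxError as e:
    print("invalid:", e)  # the defaulted ID attribute should be reported here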

Python 3.4 - XML Parse - IndexError: List Index Out of Range - How do I find range of XML?

Okay guys, I'm new to parsing XML and Python and am trying to get this to work. If someone could help me with this, it would be greatly appreciated. If you can help me (educate me) on how to figure it out for myself, that would be even better!
I am having trouble trying to figure out the range to reference for an XML document, as I can't find any documentation on it. Here is my code, and I'll include the entire traceback after it.
#import library to do http requests:
import urllib.request
#import easy to use xml parser called minidom:
from xml.dom.minidom import parseString
#all these imports are standard on most modern python implementations
#download the file:
file = urllib.request.urlopen('http://www.wizards.com/dndinsider/compendium/CompendiumSearch.asmx/KeywordSearch?Keywords=healing%20%word&nameOnly=True&tab=')
#convert to string:
data = file.read()
#close file because we dont need it anymore:
file.close()
#parse the xml you downloaded
dom = parseString(data)
#retrieve the first xml tag (<tag>data</tag>) that the parser finds with name tagName:
xmlTag = dom.getElementsByTagName('Data.Results.Power.ID')[0].toxml()
#strip off the tag (<tag>data</tag> ---> data):
xmlData=xmlTag.replace('<id>','').replace('</id>','')
#print out the xml tag and data in this format: <tag>data</tag>
print(xmlTag)
#just print the data
print(xmlData)
Traceback
/usr/bin/python3.4 /home/mint/PycharmProjects/DnD_Project/Power_Name.py
Traceback (most recent call last):
File "/home/mint/PycharmProjects/DnD_Project/Power_Name.py", line 14, in <module>
xmlTag = dom.getElementsByTagName('id')[0].toxml()
IndexError: list index out of range
Process finished with exit code 1
First check how many matching tags the parser actually found:
print(len(dom.getElementsByTagName('id')))
EDIT:
ids = dom.getElementsByTagName('id')
if len(ids) > 0:
    xmlTag = ids[0].toxml()
    # rest of code
EDIT: I added an example because I saw in another comment that you don't know how to use it.
BTW: I added some comments in the code about the file/connection.
import urllib.request
from xml.dom.minidom import parseString

# create connection to data/file on server
connection = urllib.request.urlopen('http://www.wizards.com/dndinsider/compendium/CompendiumSearch.asmx/KeywordSearch?Keywords=healing%20%word&nameOnly=True&tab=')

# read from server as string (not "convert" to string):
data = connection.read()

# close connection because we don't need it anymore:
connection.close()

dom = parseString(data)

# get tags from dom
ids = dom.getElementsByTagName('Data.Results.Power.ID')

# check if there is any data
if len(ids) > 0:
    xmlTag = ids[0].toxml()
    xmlData = xmlTag.replace('<id>', '').replace('</id>', '')
    print(xmlTag)
    print(xmlData)
else:
    print("Sorry, there was no data")
or you can use a for loop if there are more tags:
dom = parseString(data)

# get tags from dom
ids = dom.getElementsByTagName('Data.Results.Power.ID')

# get all tags - one by one
for one_tag in ids:
    xmlTag = one_tag.toxml()
    xmlData = xmlTag.replace('<id>', '').replace('</id>', '')
    print(xmlTag)
    print(xmlData)
BTW:
getElementsByTagName() expects a tag name like ID - not a path like Data.Results.Power.ID
the tag name is ID, so you have to replace <ID>, not <id>
for this tag you can even use one_tag.firstChild.nodeValue in place of xmlTag.replace
dom = parseString(data)

# get tags from dom
ids = dom.getElementsByTagName('ID')  # tag name

# get all tags - one by one
for one_tag in ids:
    xmlTag = one_tag.toxml()
    #xmlData = xmlTag.replace('<ID>', '').replace('</ID>', '')
    xmlData = one_tag.firstChild.nodeValue
    print(xmlTag)
    print(xmlData)
I haven't used the built-in xml library in a while, but it's covered in Mark Pilgrim's great Dive Into Python book.
I see as I'm typing this that your question has already been answered, but since you mention being new to Python, I think you will find the text useful for XML parsing and as an excellent introduction to the language.
If you would like to try another approach to parsing xml and html, I highly recommend lxml.

lxml attributes require full namespace

The code below reads a table from an Excel 2003 XML workbook using lxml (Python 3.3). The code works fine; however, in order to access the Type attribute of the Data element via the get() method I need to use the key '{urn:schemas-microsoft-com:office:spreadsheet}Type' - why is this, given that I've specified this namespace with the ss prefix?
All I can think of is that this namespace appears twice in the document, once with a namespace prefix and once without, i.e.:
<Workbook xmlns="urn:schemas-microsoft-com:office:spreadsheet"
xmlns:o="urn:schemas-microsoft-com:office:office"
xmlns:x="urn:schemas-microsoft-com:office:excel"
xmlns:ss="urn:schemas-microsoft-com:office:spreadsheet"
xmlns:html="http://www.w3.org/TR/REC-html40">
And in the file the element and attribute are declared as below - the Type attribute with the ss: prefix and the Cell and Data elements with no prefix. However, the declaration says both belong to the same schema, 'urn:schemas-microsoft-com:office:spreadsheet', so surely the parser should treat them equivalently?
<Cell><Data ss:Type="String">QB11128020</Data></Cell>
My code:
with open(filename, 'r') as f:
    doc = etree.parse(f)

namespaces = {'o': 'urn:schemas-microsoft-com:office:office',
              'x': 'urn:schemas-microsoft-com:office:excel',
              'ss': 'urn:schemas-microsoft-com:office:spreadsheet'}

ws = doc.xpath('/ss:Workbook/ss:Worksheet', namespaces=namespaces)
if len(ws) > 0:
    tables = ws[0].xpath('./ss:Table', namespaces=namespaces)
    if len(tables) > 0:
        rows = tables[0].xpath('./ss:Row', namespaces=namespaces)
        for row in rows:
            cells = row.xpath('./ss:Cell/ss:Data', namespaces=namespaces)
            for cell in cells:
                print(cell.text)
                print(cell.keys())
                print(cell.get('{urn:schemas-microsoft-com:office:spreadsheet}Type'))
According to The lxml.etree Tutorial -- Namespaces:
"The ElementTree API avoids namespace prefixes wherever possible and deploys the real namespaces (the URI) instead."
Note also that, per the XML Namespaces rules, a default namespace declaration does not apply to attribute names: Cell and Data pick up the default namespace as elements, but Type is only in the spreadsheet namespace because it carries an explicit ss: prefix. That is why lxml expects the full '{uri}local' form as the lookup key.
BTW, the following:
cell.get('{urn:schemas-microsoft-com:office:spreadsheet}Type')
can be written as:
cell.get('{%(ss)s}Type' % namespaces)
or:
cell.get('{{{0[ss]}}}Type'.format(namespaces))
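If spelling the URI out bothers you, a small helper sketch (get_ss is a made-up name, and SS mirrors the 'ss' entry from the namespaces dict above) can build the '{uri}local' key - Clark notation - with etree.QName:

from lxml import etree

SS = 'urn:schemas-microsoft-com:office:spreadsheet'

def get_ss(element, name):
    # expands to the same '{urn:...spreadsheet}Type' key used above
    return element.get(str(etree.QName(SS, name)))

# usage inside the loop from the question:
# print(get_ss(cell, 'Type'))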

Extracting data from a URL result with special formatting

I have a URL:
http://somewhere.com/relatedqueries?limit=2&query=seedterm
where modifying the inputs, limit and query, will generate the wanted data. Limit is the maximum number of terms returned and query is the seed term.
The URL provides text result formatted in this way:
oo.visualization.Query.setResponse({version:'0.5',reqId:'0',status:'ok',sig:'1303596067112929220',table:{cols:[{id:'score',label:'Score',type:'number',pattern:'#,##0.###'},{id:'query',label:'Query',type:'string',pattern:''}],rows:[{c:[{v:0.9894380670262618,f:'0.99'},{v:'newterm1'}]},{c:[{v:0.9894380670262618,f:'0.99'},{v:'newterm2'}]}],p:{'totalResultsCount':'7727'}}});
I'd like to write a Python script that takes two arguments (the limit number and the query seed), goes and fetches the data online, parses the result, and returns a list with the new terms, ['newterm1', 'newterm2'] in this case.
I'd love some help, especially with the URL fetching since I have never done this before.
It sounds like you can break this problem up into several subproblems.
Subproblems
There are a handful of problems that need to be solved before composing the completed script:
Forming the request URL: Creating a configured request URL from a template
Retrieving data: Actually making the request
Unwrapping JSONP: The returned data appears to be JSON wrapped in a JavaScript function call
Traversing the object graph: Navigating through the result to find the desired bits of information
Forming the request URL
This is just simple string formatting.
url_template = 'http://somewhere.com/relatedqueries?limit={limit}&query={seedterm}'
url = url_template.format(limit=2, seedterm='seedterm')
Python 2 Note
You will need to use the string formatting operator (%) here.
url_template = 'http://somewhere.com/relatedqueries?limit=%(limit)d&query=%(seedterm)s'
url = url_template % dict(limit=2, seedterm='seedterm')
Retrieving data
You can use the built-in urllib.request module for this.
import urllib.request
data = urllib.request.urlopen(url) # url from previous section
This returns a file-like object called data. You can also use a with-statement here:
with urllib.request.urlopen(url) as data:
    # do processing here
    ...
Python 2 Note
Import urllib2 instead of urllib.request.
Unwrapping JSONP
The result you pasted looks like JSONP. Given that the wrapping function that is called (oo.visualization.Query.setResponse) doesn't change, we can simply strip this method call out.
result = data.read().decode('utf-8')  # read() returns bytes; decode to compare with str
prefix = 'oo.visualization.Query.setResponse('
suffix = ');'
if result.startswith(prefix) and result.endswith(suffix):
    result = result[len(prefix):-len(suffix)]
Parsing JSON
The resulting result string is just JSON data. Parse it with the built-in json module.
import json
result_object = json.loads(result)
Traversing the object graph
Now you have a result_object that represents the JSON response. The object itself will be a dict with keys like version, reqId, and so on. Based on your question, here is what you would need to do to create your list.
# Get the rows in the table, then get the second column's value for
# each row (index 0 is the score, index 1 is the query term)
terms = [row['c'][1]['v'] for row in result_object['table']['rows']]
Putting it all together
#!/usr/bin/env python3
"""A script for retrieving and parsing results from requests to
somewhere.com.

This script works as either a standalone script or as a library. To use
it as a standalone script, run it as `python3 scriptname.py`. To use it
as a library, use the `retrieve_terms` function."""
import urllib.request
import json
import sys

E_OPERATION_ERROR = 1
E_INVALID_PARAMS = 2

def parse_result(result):
    """Parse a JSONP result string and return a list of terms"""
    prefix = 'oo.visualization.Query.setResponse('
    suffix = ');'
    # Strip JSONP function wrapper
    if result.startswith(prefix) and result.endswith(suffix):
        result = result[len(prefix):-len(suffix)]
    # Deserialize JSON to Python objects
    result_object = json.loads(result)
    # Get the rows in the table, then get the second column's value
    # for each row
    return [row['c'][1]['v'] for row in result_object['table']['rows']]

def retrieve_terms(limit, seedterm):
    """Retrieves and parses data and returns a list of terms"""
    url_template = 'http://somewhere.com/relatedqueries?limit={limit}&query={seedterm}'
    url = url_template.format(limit=limit, seedterm=seedterm)
    try:
        with urllib.request.urlopen(url) as data:
            result = data.read().decode('utf-8')
    except OSError:
        print('Could not request data from server', file=sys.stderr)
        exit(E_OPERATION_ERROR)
    return parse_result(result)

def main(limit, seedterm):
    """Retrieves and parses data and prints each term to standard output"""
    terms = retrieve_terms(limit, seedterm)
    for term in terms:
        print(term)

if __name__ == '__main__':
    try:
        limit = int(sys.argv[1])
        seedterm = sys.argv[2]
    except (IndexError, ValueError):
        error_message = '''{} limit seedterm
limit must be an integer'''.format(sys.argv[0])
        print(error_message, file=sys.stderr)
        exit(E_INVALID_PARAMS)
    exit(main(limit, seedterm))
Python 2.7 version
#!/usr/bin/env python2.7
"""A script for retrieving and parsing results from requests to
somewhere.com.

This script works as either a standalone script or as a library. To use
it as a standalone script, run it as `python2.7 scriptname.py`. To use it
as a library, use the `retrieve_terms` function."""
import urllib2
import json
import sys

E_OPERATION_ERROR = 1
E_INVALID_PARAMS = 2

def parse_result(result):
    """Parse a JSONP result string and return a list of terms"""
    prefix = 'oo.visualization.Query.setResponse('
    suffix = ');'
    # Strip JSONP function wrapper
    if result.startswith(prefix) and result.endswith(suffix):
        result = result[len(prefix):-len(suffix)]
    # Deserialize JSON to Python objects
    result_object = json.loads(result)
    # Get the rows in the table, then get the second column's value
    # for each row
    return [row['c'][1]['v'] for row in result_object['table']['rows']]

def retrieve_terms(limit, seedterm):
    """Retrieves and parses data and returns a list of terms"""
    url_template = 'http://somewhere.com/relatedqueries?limit=%(limit)d&query=%(seedterm)s'
    url = url_template % dict(limit=limit, seedterm=seedterm)
    try:
        # urllib2 responses are not context managers; close explicitly
        data = urllib2.urlopen(url)
        result = data.read()
        data.close()
    except urllib2.URLError:
        sys.stderr.write('%s\n' % 'Could not request data from server')
        exit(E_OPERATION_ERROR)
    return parse_result(result)

def main(limit, seedterm):
    """Retrieves and parses data and prints each term to standard output"""
    terms = retrieve_terms(limit, seedterm)
    for term in terms:
        print term

if __name__ == '__main__':
    try:
        limit = int(sys.argv[1])
        seedterm = sys.argv[2]
    except (IndexError, ValueError):
        error_message = '''%s limit seedterm
limit must be an integer''' % sys.argv[0]
        sys.stderr.write('%s\n' % error_message)
        exit(E_INVALID_PARAMS)
    exit(main(limit, seedterm))
I didn't understand your problem well, because from your code it seems to me that you are using the Visualization API (it's the first time I've heard of it, by the way).
But if you are just searching for a way to fetch data from a web page, you could use urllib2 - that is just for getting the data - and if you want to parse the retrieved data you will have to use a more appropriate library like BeautifulSoup.
If you are dealing with another web service (RSS, Atom, RPC) rather than web pages, you can find a bunch of Python libraries that deal with each service perfectly.
import urllib2
from BeautifulSoup import BeautifulSoup

result = urllib2.urlopen('http://somewhere.com/relatedqueries?limit=%s&query=%s' % (2, 'seedterm'))
htmltext = result.read()
result.close()

soup = BeautifulSoup(htmltext, convertEntities="html")
# you can parse your data now, check the BeautifulSoup API
