lxml attributes require full namespace - python

The code below reads a table from an Excel 2003 XML workbook using lxml (Python 3.3). The code works fine; however, in order to access the Type attribute of the Data element via the get() method I need to use the key '{urn:schemas-microsoft-com:office:spreadsheet}Type'. Why is this, given that I've specified this namespace with the ss prefix?
All I can think of is that this namespace appears twice in the document, once with a namespace prefix and once without, i.e.
<Workbook xmlns="urn:schemas-microsoft-com:office:spreadsheet"
xmlns:o="urn:schemas-microsoft-com:office:office"
xmlns:x="urn:schemas-microsoft-com:office:excel"
xmlns:ss="urn:schemas-microsoft-com:office:spreadsheet"
xmlns:html="http://www.w3.org/TR/REC-html40">
And in the file the element and attribute are declared as below: the Type attribute with the ss: prefix, and the Cell and Data elements with no prefix. However, the declarations say both belong to the same schema, 'urn:schemas-microsoft-com:office:spreadsheet', so surely the parser should treat them equivalently?
<Cell><Data ss:Type="String">QB11128020</Data></Cell>
My code:
with open(filename, 'r') as f:
    doc = etree.parse(f)

namespaces = {'o': 'urn:schemas-microsoft-com:office:office',
              'x': 'urn:schemas-microsoft-com:office:excel',
              'ss': 'urn:schemas-microsoft-com:office:spreadsheet'}

ws = doc.xpath('/ss:Workbook/ss:Worksheet', namespaces=namespaces)
if len(ws) > 0:
    tables = ws[0].xpath('./ss:Table', namespaces=namespaces)
    if len(tables) > 0:
        rows = tables[0].xpath('./ss:Row', namespaces=namespaces)
        for row in rows:
            cells = row.xpath('./ss:Cell/ss:Data', namespaces=namespaces)
            for cell in cells:
                print(cell.text)
                print(cell.keys())
                print(cell.get('{urn:schemas-microsoft-com:office:spreadsheet}Type'))

According to The lxml.etree Tutorial -- Namespaces:
The ElementTree API avoids namespace prefixes wherever possible and
deploys the real namespace (the URI) instead:
BTW, the following
cell.get('{urn:schemas-microsoft-com:office:spreadsheet}Type')
can be written as:
cell.get('{%(ss)s}Type' % namespaces)
or:
cell.get('{{{0[ss]}}}Type'.format(namespaces))
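
The underlying point is that prefixes are only document-local aliases: once the file is parsed, lxml identifies every element and attribute by its Clark-notation key, {uri}localname, so the default namespace and the ss prefix both collapse to the same URI. If you'd rather not hand-assemble those keys, lxml's QName helper can build them; a minimal sketch, reusing the namespaces dict and a cell element from the code above:

from lxml import etree

# QName renders to Clark notation: '{urn:schemas-microsoft-com:office:spreadsheet}Type'
type_key = etree.QName(namespaces['ss'], 'Type')
print(cell.get(type_key.text))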

NotXMLError: Failed to parse the XML data

I'm trying to use the Entrez module from Biopython to retrieve full-text articles from PubMed Central. This is my code:
import urllib3
import json
import requests
from Bio import Entrez
from Bio.Entrez import efetch, Parser

print(Parser.__file__)

pmcid = 'PMC2837563'

def print_text(pmcid):
    handle = efetch(db='pmc', id=pmcid, retmode='xml', rettype=None)
    #print(handle.read())
    record = Entrez.read(handle)
    print(record)

print_text(pmcid)
handle.read() works, which means the data is being fetched properly. But I'm not able to do Entrez.read(handle) to convert the fetched data into a Python object. It gives me the error below:
NotXMLError: Failed to parse the XML data (syntax error: line 1036, column 69). Please make sure that the input data are in XML format.
Could someone tell me what to do about this? This seems to be the correct syntax as per the Biopython documentation.
The reason is that the latest available Biopython version (1.79) does not recognise the DTD with uri http://www.niso.org/schemas/ali/1.0/. The GitHub version has the corrected Parser, but it is not available from pip yet.
Compare:

Current 1.79:

def startNamespaceDeclHandler(self, prefix, uri):
    """Handle start of an XML namespace declaration."""
    if prefix == "xsi":
        # This is an xml schema
        self.schema_namespace = uri
        self.parser.StartElementHandler = self.schemaHandler
    else:
        # Note that the DTD for MathML specifies a default attribute
        # that declares the namespace for each MathML element. This means
        # that MathML element in the XML has an invisible MathML namespace
        # declaration that triggers a call to startNamespaceDeclHandler
        # and endNamespaceDeclHandler. Therefore we need to count how often
        # startNamespaceDeclHandler and endNamespaceDeclHandler were called
        # to find out their first and last invocation for each namespace.
        if prefix == "mml":
            assert uri == "http://www.w3.org/1998/Math/MathML"
        elif prefix == "xlink":
            assert uri == "http://www.w3.org/1999/xlink"
        else:
            raise ValueError("Unknown prefix '%s' with uri '%s'" % (prefix, uri))
        self.namespace_level[prefix] += 1
        self.namespace_prefix[uri] = prefix
GitHub:

def startNamespaceDeclHandler(self, prefix, uri):
    """Handle start of an XML namespace declaration."""
    if prefix == "xsi":
        # This is an xml schema
        self.schema_namespace = uri
        self.parser.StartElementHandler = self.schemaHandler
    else:
        # Note that the DTD for MathML specifies a default attribute
        # that declares the namespace for each MathML element. This means
        # that MathML element in the XML has an invisible MathML namespace
        # declaration that triggers a call to startNamespaceDeclHandler
        # and endNamespaceDeclHandler. Therefore we need to count how often
        # startNamespaceDeclHandler and endNamespaceDeclHandler were called
        # to find out their first and last invocation for each namespace.
        if prefix == "mml":
            assert uri == "http://www.w3.org/1998/Math/MathML"
        elif prefix == "xlink":
            assert uri == "http://www.w3.org/1999/xlink"
        elif prefix == "ali":
            assert uri == "http://www.niso.org/schemas/ali/1.0/"
        else:
            raise ValueError(f"Unknown prefix '{prefix}' with uri '{uri}'")
        self.namespace_level[prefix] += 1
        self.namespace_prefix[uri] = prefix
So you can either replace or edit the Parser.py file, or use a third-party library to convert your handle to a built-in Python object.
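
If the raw XML tree is enough for your purposes, a minimal sketch (sidestepping Entrez.read entirely) is to hand the same handle to the standard library parser, which does a plain XML parse and is not tripped up by the unrecognised DTD prefix:

from Bio import Entrez
from Bio.Entrez import efetch
import xml.etree.ElementTree as ET

Entrez.email = "you@example.com"  # placeholder; NCBI asks for a contact address

handle = efetch(db='pmc', id='PMC2837563', retmode='xml')
tree = ET.parse(handle)  # plain XML parse, no Biopython DTD validation
print(tree.getroot().tag)  # e.g. 'pmc-articleset' for PMC responses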
If you just want to download the full text of the article, you could try downloading the PDF through metapub and then extracting the text via textract.
import metapub
from urllib.request import urlretrieve
import textract

pmcid = 'PMC2837563'

fetch = metapub.PubMedFetcher()
article_metadata = fetch.article_by_pmcid(pmcid)

# Get just the abstract
abstract = article_metadata.abstract

# Download the full article text
pmid = article_metadata.pmid
url = metapub.FindIt(pmid).url
urlretrieve(url, any_path)  # any_path: where to save the downloaded PDF
with open(another_path, "wb") as textfile:  # textract returns bytes, so write in binary mode
    textfile.write(textract.process(
        any_path,
        extension='pdf',
        method='pdftotext',
        encoding="utf_8",
    ))
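
Note that metapub's FindIt can come back with url set to None when no freely retrievable PDF exists for an article, so it is worth checking that before calling urlretrieve.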

Getting parsing error when attempting to remove empty xml tags with lxml

I'm in the process of creating an XML file from a CSV source, and have had some very useful input from users in this process. It appears that my last requirement (of which I've only just become aware) is to drop any empty XML tags (that is, those with no content) before calling an API.
I'm attempting to use the etree remove method to drop the empty tags, but I'm getting an error that etree.fromstring can only parse strings. Here are my sample data and code.
ACTION|INV_ACCT_CLASS|EXT_INV_ID|WAREHOUSE_ID|NAME|CNTRY_CD|PHONE|ADDR_STR1|ADDR_STR2|CITY|ST|ZIP|ADD_KEY_NUM
add|2|AAA_00005|1001213|Company 1|US|9995555555|1313 Mockingbird Lane||New York|NY|10001|44433322
add|2|BBB_00008|1004312|Company 2|US|43255511110|Some other address||Stamford|CT|44112|11122233
import csv
from datetime import date

import lxml.etree
from lxml.builder import E

with open("filename.csv") as csvfile:
    results = E.paiInv(*(
        E.invrec(
            E.action(row['ACTION']),
            E.investor(
                E.inv_account_class(row['INV_ACCT_CLASS']),
                E.ext_inv_id(row['EXT_INV_ID']),
                E.warehouse_id(row['WAREHOUSE_ID']),
                E.name(row['NAME']),
                E.cntry_cd(row['CNTRY_CD']),
                E.phone(row['PHONE']),
                E.addr_str1(row['ADDR_STR1']),
                E.addr_str2(row['ADDR_STR2']),
                E.city(row['CITY']),
                E.st(row['ST']),
                E.zip(row['ZIP']),
                E.add_key_num(row['ADD_KEY_NUM'])
            )
        ) for row in csv.DictReader(csvfile, delimiter='|')
    ))

req = '<request_id>Investor' + str(date.today()) + '</request_id>'
doc = lxml.etree.ElementTree(results)
ins = lxml.etree.fromstring(req)
ins.tail = "\n"
dest = doc.xpath('/paiInv')[0]
dest.insert(0, ins)
This gives me exactly what I need, except that if any columns in the CSV are empty, I'll get empty XML tags, as would be expected. Since we're required to drop any empty tags, I've tried this code:
root = lxml.etree.fromstring(results)
for element in root.xpath(".//*[not(node())]"):
    element.getparent().remove(element)
I receive an error saying "can only parse strings". I'm trying to understand why it would find anything other than strings, and how I should tweak the code so that it will correctly drop the empty tags. Thanks!
The "can only parse strings" error comes from the fromstring call: results is already an lxml Element built by the E-maker, not a string, so there is nothing to parse. Here's one way to handle it: drop the fromstring call and change your for loop at the end to:

for element in results.xpath('//*'):
    if element.text is None:
        element.getparent().remove(element)

The output should now have removed both <addr_str2> nodes, which are empty.
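
One caveat: container elements such as <investor> also have text equal to None when they hold only children, so the stricter not(node()) test from the question is safer, since it only matches truly empty leaves. A minimal self-contained sketch (the tiny tree here is hypothetical demo data standing in for the CSV-built document):

import lxml.etree
from lxml.builder import E

# demo tree with one empty leaf
results = E.paiInv(E.invrec(E.action("add"), E.investor(E.addr_str2())))

# not(node()) matches elements with no child nodes and no text at all;
# a single pass removes empty leaves, so repeat it if containers that
# become empty should be dropped as well
for element in results.xpath(".//*[not(node())]"):
    element.getparent().remove(element)

print(lxml.etree.tostring(results, pretty_print=True).decode())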

Building a generic XML parser in Python?

I am a newbie with one week of experience writing Python scripts. I am trying to write a generic parser (a library for all my future jobs) which parses any input XML without any prior knowledge of the tags:
Parse the input XML.
Get the values from the XML and set the values based on the tags.
Use these values in the rest of the job.
I am using the "xml.etree.ElementTree" library and i am able to parse the XML in the below mentioned way.
#!/usr/bin/python
import os
import xml.etree.ElementTree as etree
import logging
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)
logger.info('start reading XML property file')
filename = "mood_ib_history_parameters_DEV.xml"
logger.info('getting the current location')
__currentlocation__ = os.getcwd()
__fullpath__ = os.path.join(__currentlocation__,filename)
logger.info('start parsing the XML property file')
tree = etree.parse(__fullpath__)
root = tree.getroot()
hive_db = root.find("hive_db").text
EDGE_HIVE_CONN = root.find("EDGE_HIVE_CONN").text
target_dir = root.find("target_dir").text
to_email_alias = root.find("to_email_alias").text
to_email_cc = root.find("to_email_cc").text
from_email_alias = root.find("from_email_alias").text
dburl = root.find("dburl").text
SQOOP_EDGE_CONN = root.find("SQOOP_EDGE_CONN").text
user_name = root.find("user_name").text
password = root.find("password").text
IB_log_table = root.find("IB_log_table").text
SR_DG_master_table = root.find("SR_DG_master_table").text
SR_DG_table = root.find("SR_DG_table").text
logger.info('Hive DB %s', hive_db)
logger.info('Edge Hive Connection %s', EDGE_HIVE_CONN)
logger.info('Target Directory %s', target_dir)
logger.info('To Email address %s', to_email_alias)
logger.info('CC Email address %s', to_email_cc)
logger.info('From Email address %s', from_email_alias)
logger.info('DB URL %s',dburl)
logger.info('Sqoop Edge node connection %s',SQOOP_EDGE_CONN)
logger.info('Log table name %s',IB_log_table)
logger.info('Master table name %s',SR_DG_master_table)
logger.info('Data governance table name %s',SR_DG_table)
Now the question is: if I want to parse an XML without any knowledge of the tags and elements and use the values, how do I do it? I have gone through multiple tutorials, but all of them help me parse the XML by using the tags, like below:
SQOOP_EDGE_CONN = root.find("SQOOP_EDGE_CONN").text
Can anybody point me to the right tutorial, library, or code snippet to parse the XML dynamically?
I think the official documentation is pretty clear and contains some examples: https://docs.python.org/3/library/xml.etree.elementtree.html
The main part you need to implement is a loop over the child nodes (potentially recursively), as sketched below:
for child in root:
    # child.tag contains the tag name, child.attrib contains the attributes
    print(child.tag, child.attrib)
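
For the recursive case, here is a minimal sketch (the filename is the one from the question) of a walker that prints every tag, attribute dict, and text value without knowing any tag names in advance:

import xml.etree.ElementTree as ET

def walk(element, depth=0):
    # print tag, attributes, and stripped text for every node, indented by depth
    print("  " * depth, element.tag, element.attrib, (element.text or "").strip())
    for child in element:
        walk(child, depth + 1)

root = ET.parse("mood_ib_history_parameters_DEV.xml").getroot()
walk(root)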
Well, parsing is as easy as that: etree.parse(path).
Once you've got the root in hand using tree.getroot(), you can just iterate over the tree using Python's "in":

for child_node in tree.getroot():
    print(child_node.text)

Then, to see which tags these child_nodes have, you do the same trick.
This lets you go over all tags in the XML without having to know the tag names at all.
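
If you want the whole tree rather than just the top level, ElementTree's iter() does the recursion for you; a minimal sketch, assuming tree was parsed as in the question:

# iter() yields the root and every descendant element in document order
for node in tree.getroot().iter():
    print(node.tag, node.attrib)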

Python: XML parsing using xml.dom.minidom - iterating over collection.getElementsByTagName

I have some XML I'm trying to parse using Python 2.7. The XML is in the format below. In the code (also included below), if I try to print using collection.getAttribute('ProjectId') I get the id number, but once I assign collection to tags and then run it through a loop, I don't get output (and no error message). Any clues?
XML:
<SummaryReport ProjectId="37f8d135-1f1d-4e57-9b7d-b084770c6bf5" EntityId="016fbc07-69f0-407e-b5b5-0b0b6bba4307" Status="Failed">
  <TotalCount>0</TotalCount>
  <SuccessfulCount>0</SuccessfulCount>
  <FailedCount>0</FailedCount>
  <StartUtcTime>2015-09-09T16:43:11.810715Z</StartUtcTime>
  <EndUtcTime>2015-09-09T16:43:44.5418427Z</EndUtcTime>
  <IsIncremental>false</IsIncremental>
  <OnDemand>true</OnDemand>
  <TrackingId>c0972936-c8b6-4cdb-b089-d08c6f9702aa</TrackingId>
  <Message>An error occurred during content building: Index was out of range. Must be non-negative and less than the size of the collection.
Parameter name: index</Message>
  <LogEntries>
    <LogEntry>
      <Level>Info</Level>
      <LogCodeType>System</LogCodeType>
      <LogCode>PhaseSucceedInfo</LogCode>
      <Name>Phase</Name>
      <Message>'Load Metadata' succeeded in 00:00:00.1905878 seconds.</Message>
      <Anchor>Info_7333babe-fc51-4b45-9167-bf263e7babcb</Anchor>
    </LogEntry>
    <LogEntry>
      <Level>Info</Level>
      <LogCodeType>System</LogCodeType>
      <LogCode>PublishRequest</LogCode>
      <Name>PublishTocAndArticleInit</Name>
      <Message>'Load Metadata' succeeded in 00:00:01.1905878 seconds.</Message>
      <Anchor>Info_51c10e71-d99a-49f9-b4aa-d83dc273426a</Anchor>
    </LogEntry>
  </LogEntries>
</SummaryReport>
Code:
#!/usr/bin/python
from xml.dom.minidom import parse
import xml.dom.minidom

# Open XML document using minidom parser
DOMTree = xml.dom.minidom.parse("file.xml")
collection = DOMTree.documentElement

# Get all the tags under SummaryReport
tags = collection.getElementsByTagName("SummaryReport")

# Print tag info
for tag in tags:
    print '*******Tag Info************'
    print 'Project Id: %s' % tag.getAttribute('ProjectId')
You get no output because collection.getElementsByTagName("SummaryReport") returns nothing:

>>> tags = collection.getElementsByTagName("SummaryReport")
>>> print(tags)
[]

That makes sense, since collection already references the SummaryReport element, and it has no descendant element with the same name.
UPDATE:

A simple for loop works fine to iterate through the Level elements and print their values, for example:

>>> tags = collection.getElementsByTagName("Level")
>>> for tag in tags:
...     print(tag.firstChild.nodeValue)
...
Info
Info
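
The same pattern extends to the nested LogEntry elements; a minimal sketch, reusing collection from the question's code:

entries = collection.getElementsByTagName("LogEntry")
for entry in entries:
    # each LogEntry in the sample document holds exactly one Level and one Message
    level = entry.getElementsByTagName("Level")[0].firstChild.nodeValue
    message = entry.getElementsByTagName("Message")[0].firstChild.nodeValue
    print(level + ': ' + message)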

lxml, xi:include, and original file

I'm using lxml to parse a file that contains xi:include elements, and I'm resolving the includes using xinclude().
Given an element, is there any way to identify the file and source line that the element originally appeared in?
For example:
from lxml import etree
doc = etree.parse('file.xml')
doc.xinclude()
xpath_expression = ...
elt = doc.xpath(xpath_expression)
# Print file name and source line of `elt` location
The xinclude expansion will add an xml:base attribute to the top-level expanded element, and elt.base and elt.sourceline are also updated for the child nodes, so:

print elt.base, elt.sourceline

will give you what you want. If elt is not part of the xinclude expansion, then elt.base will point to the base document ('file.xml') and elt.sourceline will be the line number in that file. (Note that sourceline usually seems to point to the line where the element tag ends, rather than where it begins, if the element spans multiple lines, just as validation error messages usually point to the closing tag where the error occurs.)
You can find the initial xincluded elements and check this with:

xels = doc.xpath('//*[@xml:base]')
for x in xels:
    print x.tag, x.base, x.sourceline
    for c in x.getchildren():
        print c.tag, c.base, c.sourceline
Sadly, current versions of lxml no longer include this ability. However, I've developed a workaround using a simple custom loader. Here's a test script which demonstrates the bug in the approach above along with the workaround. Note that this approach only updates the xml:base attribute of the root tag of the included document.
The output of the program (using Python 3.9.1, lxml 4.6.3):
Included file was source.xml; xinclude reports it as document.xml
Included file was source.xml; workaround reports it as source.xml
Here's the sample program.
# Includes
# ========
from pathlib import Path
from textwrap import dedent

from lxml import etree as ElementTree
from lxml import ElementInclude

# Setup
# =====
# Create a sample document, taken from the `Python stdlib
# <https://docs.python.org/3/library/xml.etree.elementtree.html#id3>`_...
Path("document.xml").write_text(
    dedent(
        """\
        <?xml version="1.0"?>
        <document xmlns:xi="http://www.w3.org/2001/XInclude">
          <xi:include href="source.xml" parse="xml" />
        </document>
        """
    )
)

# ...and the associated include file.
Path("source.xml").write_text("<para>This is a paragraph.</para>")

# Failing xinclude case
# =====================
# Load and xinclude this.
tree = ElementTree.parse("document.xml")
tree.xinclude()

# Show that the ``base`` attribute refers to the top-level
# ``document.xml``, instead of the xincluded ``source.xml``.
root = tree.getroot()
print(f"Included file was source.xml; xinclude reports it as {root[0].base}")

# Workaround
# ==========
# As a workaround, define a loader which sets the ``xml:base`` of an
# xincluded element. While lxml evidently used to do this, a change
# eliminated this ability per some `discussion
# <https://mail.gnome.org/archives/xml/2014-April/msg00015.html>`_,
# which included a rejected patch fixing this problem. `Current source
# <https://github.com/GNOME/libxml2/blob/master/xinclude.c#L1689>`_
# lacks this patch.
def my_loader(href, parse, encoding=None, parser=None):
    ret = ElementInclude._lxml_default_loader(href, parse, encoding, parser)
    ret.attrib["{http://www.w3.org/XML/1998/namespace}base"] = href
    return ret

new_tree = ElementTree.parse("document.xml")
ElementInclude.include(new_tree, loader=my_loader)
new_root = new_tree.getroot()
print(f"Included file was source.xml; workaround reports it as {new_root[0].base}")
