XML incremental parsing with Python

I'm trying to parse a huge XML file with the code below, but whenever I run it from the terminal, it just runs without any errors and does nothing. I need it to parse the file incrementally and delete each parent element after checking whether its SubmissionTime is older than a specific number of days.
For example, the XML structure is like this:
<Feed>
  <Reviews>
    <Review>
      <SubmissionTime>2015-06-16T19:00:00.000-05:00</SubmissionTime>
    </Review>
  </Reviews>
</Feed>
from lxml import etree
import sys
import iso8601
import re
from datetime import datetime
from dateutil.relativedelta import relativedelta

def remove_per_age(file):
    full_data = ""
    # iterparse yields elements incrementally, so the whole file is never in memory
    for event, elem in etree.iterparse(file, events=("end",)):
        if elem.tag == 'SubmissionTime':
            element_datetime = iso8601.parse_date(elem.text)
            element_date = element_datetime.date()
            if element_date < datetime.now(element_datetime.tzinfo).date() - relativedelta(days=180):
                elem.getparent().remove(elem)
            else:
                full_data += etree.tostring(elem).decode()
        else:
            elem.clear()
    with open("output.xml", 'w') as f:
        f.write(full_data)

def strip_tag_name(tag):
    # strip a namespace prefix of the form {uri} from a tag name
    pattern = re.compile(r'\{.+\}')
    clean_tag = pattern.sub(r'', tag)
    return clean_tag

if __name__ == "__main__":
    remove_per_age(sys.argv[1])
# Reviews/Review/SubmissionTime

The way to handle a huge XML file incrementally is to use SAX.
You will need to extend xml.sax.ContentHandler and add your logic there.
See https://www.tutorialspoint.com/parsing-xml-with-sax-apis-in-python for an example
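For illustration, here is a minimal sketch of that approach, adapted to the <SubmissionTime> structure from the question (the class name, collected list, and file path are placeholders; your date comparison would go in endElement):

import xml.sax

class ReviewHandler(xml.sax.ContentHandler):
    # Collects the text of each <SubmissionTime> element.
    def __init__(self):
        super().__init__()
        self.in_submission_time = False
        self.current_text = ""
        self.submission_times = []

    def startElement(self, name, attrs):
        if name == "SubmissionTime":
            self.in_submission_time = True
            self.current_text = ""

    def characters(self, content):
        # characters() may be called several times per text node
        if self.in_submission_time:
            self.current_text += content

    def endElement(self, name):
        if name == "SubmissionTime":
            self.in_submission_time = False
            # the age check from the question's code would go here
            self.submission_times.append(self.current_text)

handler = ReviewHandler()
xml.sax.parse("feed.xml", handler)  # "feed.xml" is a placeholder path
print(handler.submission_times)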

Related

how to get the text in one of the divs? (html)

I am writing my bot, which so far just has to get the text from a div on one page and put it in a variable, but this does not work and the variable always remains empty. How can I extract it?
import telebot
import requests
import lxml.html

bot = telebot.TeleBot('')  # token elided

@bot.message_handler(content_types=['text'])
def get_text_messages(message):
    api = requests.get("https://slovardalja.net/word.php?wordid=21880")
    tree = lxml.html.document_fromstring(api.text)
    text_original = tree.xpath('/html/body/table/tbody/tr[2]/td/table/tbody/tr/td[2]/index/div[2]/p[1]/strong/text()')
    print(text_original)
    bot.send_message(message.chat.id, str(text_original))

bot.polling(none_stop=True, interval=0)
https://slovardalja.net/word.php?wordid=21880
I think this code should get the word "ОЛЕКВАС". I copied the path to it and added /text(), but it doesn't work.
I have no Cyrillic on my system, but with a shorter XPath and the use of text_content() it prints something in the shell; hopefully it helps:
import requests
import lxml.html

api = requests.get("https://slovardalja.net/word.php?wordid=21880")
tree = lxml.html.document_fromstring(api.text)
text_original = tree.xpath('//div[@align="justify"]/p/strong')
print(text_original[0].text_content())
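Note that text_content() returns the concatenated text of the element and all of its descendants, while a trailing /text() step only returns the direct text nodes of the matched element, which is often empty when the text sits in a nested tag.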

NotXMLError: Failed to parse the XML data

I'm trying to use the Entrez module from Biopython to retrieve full-text articles from PubMed Central. This is my code:
import urllib3
import json
import requests
from Bio import Entrez
from Bio.Entrez import efetch, Parser

print(Parser.__file__)

pmcid = 'PMC2837563'

def print_text(pmcid):
    handle = efetch(db='pmc', id=pmcid, retmode='xml', rettype=None)
    #print(handle.read())
    record = Entrez.read(handle)
    print(record)

print_text(pmcid)
handle.read() works, which means the data is being fetched properly. But I'm not able to do Entrez.read(handle) to convert the fetched data into a Python object. It gives me the error below:
NotXMLError: Failed to parse the XML data (syntax error: line 1036, column 69). Please make sure that the input data are in XML format.
Could someone tell me what to do about this? The syntax seems correct as per the Biopython documentation.
The reason is that the latest released Biopython version (1.79) does not recognise the namespace with URI http://www.niso.org/schemas/ali/1.0/. The GitHub version has the corrected Parser, but it is not available from pip yet.
Compare:
current 1.79:

def startNamespaceDeclHandler(self, prefix, uri):
    """Handle start of an XML namespace declaration."""
    if prefix == "xsi":
        # This is an xml schema
        self.schema_namespace = uri
        self.parser.StartElementHandler = self.schemaHandler
    else:
        # Note that the DTD for MathML specifies a default attribute
        # that declares the namespace for each MathML element. This means
        # that MathML element in the XML has an invisible MathML namespace
        # declaration that triggers a call to startNamespaceDeclHandler
        # and endNamespaceDeclHandler. Therefore we need to count how often
        # startNamespaceDeclHandler and endNamespaceDeclHandler were called
        # to find out their first and last invocation for each namespace.
        if prefix == "mml":
            assert uri == "http://www.w3.org/1998/Math/MathML"
        elif prefix == "xlink":
            assert uri == "http://www.w3.org/1999/xlink"
        else:
            raise ValueError("Unknown prefix '%s' with uri '%s'" % (prefix, uri))
        self.namespace_level[prefix] += 1
        self.namespace_prefix[uri] = prefix
GitHub:

def startNamespaceDeclHandler(self, prefix, uri):
    """Handle start of an XML namespace declaration."""
    if prefix == "xsi":
        # This is an xml schema
        self.schema_namespace = uri
        self.parser.StartElementHandler = self.schemaHandler
    else:
        # Note that the DTD for MathML specifies a default attribute
        # that declares the namespace for each MathML element. This means
        # that MathML element in the XML has an invisible MathML namespace
        # declaration that triggers a call to startNamespaceDeclHandler
        # and endNamespaceDeclHandler. Therefore we need to count how often
        # startNamespaceDeclHandler and endNamespaceDeclHandler were called
        # to find out their first and last invocation for each namespace.
        if prefix == "mml":
            assert uri == "http://www.w3.org/1998/Math/MathML"
        elif prefix == "xlink":
            assert uri == "http://www.w3.org/1999/xlink"
        elif prefix == "ali":
            assert uri == "http://www.niso.org/schemas/ali/1.0/"
        else:
            raise ValueError(f"Unknown prefix '{prefix}' with uri '{uri}'")
        self.namespace_level[prefix] += 1
        self.namespace_prefix[uri] = prefix
So you can either replace or edit the Parser.py file, or use third-party libraries to convert your handle's contents into a built-in Python object.
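For instance, a minimal sketch with xmltodict (one such third-party library; this assumes it is installed and simply bypasses Entrez.read()):

import xmltodict
from Bio.Entrez import efetch

# parse the raw XML into nested dicts instead of Entrez record objects
handle = efetch(db='pmc', id='PMC2837563', retmode='xml', rettype=None)
record = xmltodict.parse(handle.read())
print(record.keys())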
If you just want the full text of the article, you could try downloading the PDF through metapub and then extracting the text via textract.
import metapub
from urllib.request import urlretrieve
import textract

pmcid = 'PMC2837563'

fetch = metapub.PubMedFetcher()
article_metadata = fetch.article_by_pmcid(pmcid)

# Get just the abstract
abstract = article_metadata.abstract

# Download the full article text
pmid = article_metadata.pmid
url = metapub.FindIt(pmid).url
urlretrieve(url, any_path)  # any_path: a local path for the PDF
with open(another_path, "wb") as textfile:  # textract.process returns bytes
    textfile.write(textract.process(
        any_path,
        extension='pdf',
        method='pdftotext',
        encoding="utf_8",
    ))

Parsing multiple XML using urllib, lxml and multiprocessing

I'm trying to speed up a script that scrapes XML obtained by making requests to an API with urllib. I have to make ~2.3 million requests, so it takes ~8 hours without multiprocessing.
Without applying multiprocessing:
from urllib import request as rq
from lxml import etree

def download_data(id):
    data = []
    xml = etree.iterparse(rq.urlretrieve(url + id + ".xml")[0], events=('start', 'end'))
    for event, id_data in xml:
        if event == "start":
            try:
                data.append(id_data.get('value'))
            except:
                pass
    return data

with open("/path/to/file", "rt") as ids_file:
    ids = ids_file.read().splitlines()

data_dict = {id: download_data(id) for id in ids}
I've tried the following code:
from urllib import request as rq
from lxml import etree
from multiprocessing import Pool, cpu_count

def download_data(id):
    data = []
    xml = etree.iterparse(rq.urlretrieve(url + id + ".xml")[0], events=('start', 'end'))
    for event, id_data in xml:
        if event == "start":
            try:
                data.append(id_data.get('value'))
            except:
                pass
    return (id, data)

with open("/path/to/file", "rt") as ids_file:
    ids = ids_file.read().splitlines()

with Pool(processes=cpu_count()*2) as pool:
    dt = pool.map(download_data, ids)

data_dict = dict(dt)
I get the following error:
An attempt has been made to start a new process before the
current process has finished its bootstrapping phase.
This probably means that you are not using fork to start your
child processes and you have forgotten to use the proper idiom
in the main module:
if __name__ == '__main__':
freeze_support()
The "freeze_support()" line can be omitted if the program
is not going to be frozen to produce an executable.
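If I understand the message, the pool creation at module level has to move under a main guard, something like this sketch (same download_data as above):

from multiprocessing import Pool, cpu_count

# guard the entry point so spawned child processes can import this
# module without re-running the pool creation
if __name__ == '__main__':
    with open("/path/to/file", "rt") as ids_file:
        ids = ids_file.read().splitlines()
    with Pool(processes=cpu_count()*2) as pool:
        dt = pool.map(download_data, ids)
    data_dict = dict(dt)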
Any suggestions?
Thank you in advance!

Parsing Javascript In Python

I usually use Beautiful Soup to parse the HTML that I need, but I came across some JavaScript that I would like to get from here:
<script>
function Model(){
this.players = [{".....data......:""}];...etc
I tried to load it like...
import json
scrape_url = "https://swishanalytics.com/optimus/nba/daily-fantasy-projections?date=2016-12-15"
result = json.loads(scrape_url)
But I get "No Json Can Be Decoded". Not sure how to go about this.
You can extract JSON from arbitrary text with the jsonfinder library:
from jsonfinder import jsonfinder
import requests

scrape_url = "https://swishanalytics.com/optimus/nba/daily-fantasy-projections?date=2016-12-15"
content = requests.get(scrape_url).text

for _, __, obj in jsonfinder(content, json_only=True):
    if (obj and
            isinstance(obj, list) and
            isinstance(obj[0], dict) and
            {'player_id', 'event_id', 'name'}.issubset(obj[0])):
        break
else:
    raise ValueError('data not found')

# Now you can use obj
print(len(obj))
print(obj[0])
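The else branch of the for loop runs only if the loop finishes without hitting break, so the ValueError is raised only when no matching JSON array is found anywhere in the page.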

Python XML validation with SAX, pyexpat, ElementTree

I'm trying to check the validity of XML files (against DTDs, entities, processing instructions, namespaces) in Python 3.4.
Looking at the Python docs, the default underlying parser for three Python XML modules (pyexpat, ElementTree and SAX) is expat. The pyexpat page (https://docs.python.org/3.4/library/pyexpat.html?highlight=pyexpat#module-xml.parsers.expat) says that a non-validating version of the expat parser is used:
"The xml.parsers.expat module is a Python interface to the Expat non-validating XML parser." Yet when you look into the Python SAX documentation, you see all these handler functions for enabling DTD validation, etc. How the heck do you make them work?
However, according to the post Parsing XML Entity with python xml.sax, SAX can validate, apparently with expat as the parser.
I have reused the code from that post but can't get it to work; I get an error saying expat does not support validation:
"File "/usr/lib/python3.4/xml/sax/expatreader.py", line 149, in setFeature
"expat does not support validation")
xml.sax._exceptions.SAXNotSupportedException: expat does not support validation".
In the post Python 2.5 was used, so maybe SAX has changed since then...
This is the code:
import xml.sax
from xml.sax import handler, make_parser
import os
import collections

class SaxParser():
    # initializer with directory path as argument
    def __init__(self, dir_path):
        self.dir_path = dir_path

    def test_each_file(self, file_path):
        # ensure full file name is shown
        rev = file_path[::-1]  # reverse file_path to find the position of the last "/"
        file = file_path[-rev.index("/"):]
        try:
            f = open(file_path, 'r', encoding="ISO-8859-1")  # same as "latin-1" encoding
            # see this for enabling validation:
            # https://stackoverflow.com/questions/6349513/parsing-xml-entity-with-python-xml-sax
            parser = make_parser()  # default parser is expat
            parser.setContentHandler(handler.ContentHandler())
            parser.setFeature(handler.feature_namespaces, True)
            parser.setFeature(handler.feature_validation, True)
            parser.setFeature(handler.feature_external_ges, True)
            parser.parse(f)
            f.close()
            return (file, "OK")
        except xml.sax.SAXParseException as PE:
            column = PE.getColumnNumber()
            line = PE.getLineNumber()
            msg = PE.getMessage()
            value = msg + " " + str(line) + " " + str(column)
            return (file, value)
        except ValueError:
            return (file, "ValueError. DTD uri not found.")  # that can happen

    def test_directory_sax(self, dir_path):
        tuples = []
        for ind, file in enumerate(os.listdir(dir_path), 1):
            if file.endswith('.xml'):
                tuples.append(self.test_each_file(dir_path + file))
        # convert into dict and sort it by key (file number)
        dict_of_errors = dict(tuples)
        dict_of_errors = collections.OrderedDict(sorted(dict_of_errors.items()))
        return dict_of_errors

# ========================================================================
# INVOKE TESTS FOR SINGLE SPECIFIED DIRECTORY THAT CONTAINS TEST FILES
# ========================================================================
path = ""  # path to the directory where the xml files are - not the filepath!
single_sax = SaxParser(path)
print('============================================================')
print('TEST FOR SAX parser FOR DIRECTORY ' + path)
print('============================================================\n')
print(single_sax.test_directory_sax(path))
and test xml file (should produce validation error):
<!DOCTYPE root [
  <!ATTLIST root
    id2 ID "x23"
  >
]>
<!-- an ID attribute must have a declared default
     of #IMPLIED or #REQUIRED
-->
<root/>
How do I check validity? For either one of three XML modules?
A simple example would do.
Thanks.
If you look into the source file, you'll see that xml.sax.handler.feature_validation does not really do anything except raise this exception:
def setFeature(self, name, state):
    # ...
    elif name == feature_validation:
        if state:
            raise SAXNotSupportedException(
                "expat does not support validation")
    # ...
I would suggest using lxml to do this. An example would be like this:
from lxml import etree
from io import StringIO

f = StringIO('<!ATTLIST root id2 ID "x23">')
dtd = etree.DTD(f)
root = etree.XML('<root/>')
print(dtd.validate(root))
print(dtd.error_log.filter_from_errors()[0])
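Alternatively, since your test file carries its DTD inline in the DOCTYPE, you can let lxml validate while parsing; a short sketch (the file name is a placeholder):

from lxml import etree

# dtd_validation=True makes the parser validate against the DOCTYPE's DTD
parser = etree.XMLParser(dtd_validation=True)
try:
    tree = etree.parse("test.xml", parser)  # placeholder file name
    print("OK")
except etree.XMLSyntaxError as e:
    print("validation failed:", e)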
