Parsing large XML file using 'xmltodict' module results in OverflowError - python

I have a fairly large XML file, about 3GB in size, that I want to parse in streaming mode using the 'xmltodict' utility. The code I have iterates through each item, forms a dictionary item, and appends it to a list in memory, eventually to be dumped as JSON to a file.
I have the following working perfectly on a small xml data set:
import xmltodict, json
import io

output = []

def handle(path, item):
    # do stuff
    return

doc_file = open("affiliate_partner_feeds.xml", "r")
doc = doc_file.read()
xmltodict.parse(doc, item_depth=2, item_callback=handle)
f = open('jbtest.json', 'w')
json.dump(output, f)
On a large file, I get the following:
Traceback (most recent call last):
  File "jbparser.py", line 125, in <module>
    xmltodict.parse(doc, item_depth=2, item_callback=handle)
  File "/usr/lib/python2.7/site-packages/xmltodict.py", line 248, in parse
    parser.Parse(xml_input, True)
OverflowError: size does not fit in an int
The exact location of the exception inside xmltodict.py is:
def parse(xml_input, encoding=None, expat=expat, process_namespaces=False,
          namespace_separator=':', **kwargs):
    handler = _DictSAXHandler(namespace_separator=namespace_separator,
                              **kwargs)
    if isinstance(xml_input, _unicode):
        if not encoding:
            encoding = 'utf-8'
        xml_input = xml_input.encode(encoding)
    if not process_namespaces:
        namespace_separator = None
    parser = expat.ParserCreate(
        encoding,
        namespace_separator
    )
    try:
        parser.ordered_attributes = True
    except AttributeError:
        # Jython's expat does not support ordered_attributes
        pass
    parser.StartElementHandler = handler.startElement
    parser.EndElementHandler = handler.endElement
    parser.CharacterDataHandler = handler.characters
    parser.buffer_text = True
    try:
        parser.ParseFile(xml_input)
    except (TypeError, AttributeError):
        parser.Parse(xml_input, True)  # <-- the line raising the OverflowError
    return handler.item
Any way to get around this? AFAIK, the xmlparser object is not exposed for me to play around with and change 'int' to 'long'. More importantly, what is really going on here?
Would really appreciate any leads on this. Thanks!

Try using a stream instead of reading the whole file into memory and then parsing it as a whole: with a string that large, parser.Parse(xml_input, True) fails because the string's length no longer fits in the C int that expat expects. If you pass the open file object to xmltodict.parse(), it takes the parser.ParseFile() branch instead and consumes the input in chunks.
Here is an example:
>>> def handle_artist(_, artist):
...     print artist['name']
...     return True
>>>
>>> xmltodict.parse(GzipFile('discogs_artists.xml.gz'),
...     item_depth=2, item_callback=handle_artist)
A Perfect Circle
Fantômas
King Crimson
Chris Potter
...
STDIN: xmltodict.py can also be used as a command-line utility that writes marshal-serialized items to stdout, which a downstream script can unserialize one at a time with marshal.load(sys.stdin):
import sys, marshal

while True:
    _, article = marshal.load(sys.stdin)
    print article['title']
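Applied to the code in the question, a minimal sketch of the streaming fix (assuming handle returns True so parsing continues; untested on a real 3GB feed):
import xmltodict, json

output = []

def handle(path, item):
    # do stuff with each depth-2 item
    output.append(item)
    return True  # a falsy return stops the parse

# Pass the open file object, not its contents: xmltodict then uses
# expat's ParseFile(), which reads in chunks, instead of Parse(),
# which needs the whole document as one string.
with open("affiliate_partner_feeds.xml", "rb") as doc_file:
    xmltodict.parse(doc_file, item_depth=2, item_callback=handle)

with open('jbtest.json', 'w') as f:
    json.dump(output, f)
Note that appending every item to output still builds the whole result in memory; for a 3GB input, consider writing items out incrementally instead.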

Related

Where am I going wrong with patching a function with mock_open?

I have a function that calls a sub-function to open a file. I am trying to test the parent function, but I want to patch the sub-function and have it return the data I pass in (as if it had been read from a file).
tests.py
# Read in the sample data
__SAMPLE_LOG = os.path.join(settings.BASE_DIR, "apps/tests/log_viewer/sample_logs/sample_manager_log.log")
sample_data = []
for line in reversed_lines(open(__SAMPLE_LOG)):
    sample_data.append(line)
sample_data = ('').join(sample_data)

class ReadLog(TestCase):
    @patch('apps.log_viewer.utils.reversed_lines', new_callable = mock_open, read_data = sample_data)
    def test_returnsDictionaryContainingListOfDictionaries(self, mock_file):
        activity = read_log()
        # Make sure the sample data was read ==> this fails.
        self.assertEqual(open(settings.ACTIVITY_LOG_FILE).read(), sample_data)
utils.py
def read_log():
    # This is the line I am trying to patch
    for line in reversed_lines(open(settings.ACTIVITY_LOG_FILE)):
        # process data
        ...

# see: https://stackoverflow.com/questions/260273/most-efficient-way-to-search-the-last-x-lines-of-a-file-in-python/260433#260433
def reversed_lines(file):
    "Generate the lines of file in reverse order."
    part = ''
    for block in reversed_blocks(file):
        for c in reversed(block):
            if c == '\n' and part:
                yield part[::-1]
                part = ''
            part += c
    if part: yield part[::-1]

def reversed_blocks(file, blocksize=4096):
    "Generate blocks of file's contents in reverse order."
    file.seek(0, os.SEEK_END)
    here = file.tell()
    while 0 < here:
        delta = min(blocksize, here)
        here -= delta
        file.seek(here, os.SEEK_SET)
        yield file.read(delta)
The error
I am trying to patch reversed_lines() in utils.py within the read_log() method, but read_log() is still reading from the actual log, indicating that I am not patching reversed_lines() correctly.
When I change
@patch('apps.log_viewer.utils.reversed_lines', new_callable = mock_open, read_data = sample_data)
to
@patch('builtins.open', new_callable = mock_open, read_data = sample_data)
I get
======================================================================
ERROR: test_returnsDictionaryContainingListOfDictionaries (tests.log_viewer.test_utils.ReadLog)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/usr/local/Cellar/python/3.7.4/Frameworks/Python.framework/Versions/3.7/lib/python3.7/unittest/mock.py", line 1209, in patched
    return func(*args, **keywargs)
  File "/webapp/apps/tests/log_viewer/test_utils.py", line 32, in test_returnsDictionaryContainingListOfDictionaries
    activity = read_log()
  File "/webapp/apps/log_viewer/utils.py", line 64, in read_log
    for line in reversed_lines(open(settings.ACTIVITY_LOG_FILE)):
  File "/webapp/apps/log_viewer/utils.py", line 173, in reversed_lines
    for block in reversed_blocks(file):
  File "/webapp/apps/log_viewer/utils.py", line 164, in reversed_blocks
    while 0 < here:
TypeError: '<' not supported between instances of 'int' and 'MagicMock'
Where am I going wrong?
Following the example from the docs at https://docs.python.org/3.3/library/unittest.mock.html#mock-open, I think you want
@patch('builtins.open', mock_open(read_data = sample_data), create=True)
However, reading through the source of mock_open (https://github.com/python/cpython/blob/3.7/Lib/unittest/mock.py#L2350), it appears that the tell method for file handles is not implemented by the mock. The only supported methods are read, readline, readlines, write, and iterating over the contents. You'll need to set up the mock for the tell method manually. This is not a general implementation, but it will work in your specific case:
class ReadLog(TestCase):
    # Using new_callable (rather than passing mock_open(...) as `new`)
    # keeps patch() injecting the mock into the test as mock_file:
    @patch('builtins.open', new_callable=mock_open, read_data=sample_data)
    def test_returnsDictionaryContainingListOfDictionaries(self, mock_file):
        mock_file.return_value.tell.return_value = len(sample_data)
        ...
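As an illustration, a minimal self-contained sketch of the same idea, with a hypothetical file_size() helper standing in for read_log():
from unittest.mock import patch, mock_open

sample_data = "alpha\nbeta\n"

def file_size(path):
    # stands in for code under test that calls seek()/tell()
    f = open(path)
    f.seek(0, 2)  # io.SEEK_END
    return f.tell()

with patch('builtins.open', new_callable=mock_open, read_data=sample_data) as m:
    # mock_open only implements read/readline/readlines/write,
    # so tell() has to be stubbed by hand:
    m.return_value.tell.return_value = len(sample_data)
    assert file_size('ignored.txt') == len(sample_data)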

How can I retain ASCII hex code points when writing an ElementTree in Python?

I've loaded an XML file (Rhythmbox's database file) into Python 3 via the ElementTree parser. After modifying the tree and writing it to disk (ElementTree.write()) using the ascii encoding, all of the characters that were hex character references are converted to decimal character references. For example, here is a diff containing the copyright symbol:
< <copyright>&#xA9; WNYC</copyright>
---
> <copyright>&#169; WNYC</copyright>
Is there any way to tell Python/ElementTree not to do this? I'd like all the character references to stay in hex.
I found a solution. First I created a new codec error handler, then monkey-patched ElementTree._get_writer() to use it. It looks like this:
from xml.etree import ElementTree
import io
import contextlib
import codecs

def lower_first(s):
    return s[:1].lower() + s[1:] if s else ''

def html_replace(exc):
    if isinstance(exc, (UnicodeEncodeError, UnicodeTranslateError)):
        s = []
        for c in exc.object[exc.start:exc.end]:
            s.append('&#%s;' % lower_first(hex(ord(c))[1:].upper()))
        return ''.join(s), exc.end
    else:
        raise TypeError("can't handle %s" % type(exc).__name__)

codecs.register_error('html_replace', html_replace)

# monkey patch this python function to prevent it from using xmlcharrefreplace
@contextlib.contextmanager
def _get_writer(file_or_filename, encoding):
    # returns text write method and release all resources after using
    try:
        write = file_or_filename.write
    except AttributeError:
        # file_or_filename is a file name
        if encoding == "unicode":
            file = open(file_or_filename, "w")
        else:
            file = open(file_or_filename, "w", encoding=encoding,
                        errors="html_replace")
        with file:
            yield file.write
    else:
        # file_or_filename is a file-like object
        # encoding determines if it is a text or binary writer
        if encoding == "unicode":
            # use a text writer as is
            yield write
        else:
            # wrap a binary writer with TextIOWrapper
            with contextlib.ExitStack() as stack:
                if isinstance(file_or_filename, io.BufferedIOBase):
                    file = file_or_filename
                elif isinstance(file_or_filename, io.RawIOBase):
                    file = io.BufferedWriter(file_or_filename)
                    # Keep the original file open when the BufferedWriter is
                    # destroyed
                    stack.callback(file.detach)
                else:
                    # This is to handle passed objects that aren't in the
                    # IOBase hierarchy, but just have a write method
                    file = io.BufferedIOBase()
                    file.writable = lambda: True
                    file.write = write
                    try:
                        # TextIOWrapper uses these methods to determine
                        # if BOM (for UTF-16, etc) should be added
                        file.seekable = file_or_filename.seekable
                        file.tell = file_or_filename.tell
                    except AttributeError:
                        pass
                file = io.TextIOWrapper(file,
                                        encoding=encoding,
                                        errors='html_replace',
                                        newline="\n")
                # Keep the original file open when the TextIOWrapper is
                # destroyed
                stack.callback(file.detach)
                yield file.write

ElementTree._get_writer = _get_writer
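A quick sanity check of the error handler on its own (assuming the codecs.register_error() call above has already run):
>>> '© WNYC'.encode('ascii', errors='html_replace')
b'&#xA9; WNYC'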

How to update line with modified data in Jython?

I have a CSV file which contains hundreds of thousands of rows; below are some sample lines:
1,Ni,23,28-02-2015 12:22:33.2212-02
2,Fi,21,28-02-2015 12:22:34.3212-02
3,Us,33,30-03-2015 12:23:35-01
4,Uk,34,31-03-2015 12:24:36.332211-02
The last column of the CSV data is in the wrong datetime format, so I need to convert it to the default datetime format ("YYYY-MM-DD hh:mm:ss[.nnn]").
I have tried the following script to get the lines and write them to the flow file:
import json
import java.io
from org.apache.commons.io import IOUtils
from java.nio.charset import StandardCharsets
from org.apache.nifi.processor.io import StreamCallback

class PyStreamCallback(StreamCallback):
    def __init__(self):
        pass
    def process(self, inputStream, outputStream):
        text = IOUtils.readLines(inputStream, StandardCharsets.UTF_8)
        for line in text[1:]:
            outputStream.write(line + "\n")

flowFile = session.get()
if (flowFile != None):
    flowFile = session.write(flowFile, PyStreamCallback())
    flowFile = session.putAttribute(flowFile, "filename", flowFile.getAttribute('filename'))
    session.transfer(flowFile, REL_SUCCESS)
but I am not able to find a way to convert it to the output below:
1,Ni,23,28-02-2015 12:22:33.221
2,Fi,21,29-02-2015 12:22:34.321
3,Us,33,30-03-2015 12:23:35
4,Uk,34,31-03-2015 12:24:36.332
I have checked for solutions with my friend (Google) and still was not able to find one.
Can anyone guide me in converting this input data into my required output?
In this transformation the unnecessary data is located at the end of each line, so it's really easy to handle the transform task with a regular expression.
^(.*:\d\d)((\.\d{1,3})(\d*))?(-\d\d)?
Check the regular expression and explanation here:
https://regex101.com/r/sAB4SA/2
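A quick standalone check of that expression in plain Python against the sample lines:
import re

p = re.compile(r'^(.*:\d\d)((\.\d{1,3})(\d*))?(-\d\d)?')
for line in ['1,Ni,23,28-02-2015 12:22:33.2212-02',
             '3,Us,33,30-03-2015 12:23:35-01']:
    m = p.search(line)
    print(m.group(1) + (m.group(3) or ''))
# 1,Ni,23,28-02-2015 12:22:33.221
# 3,Us,33,30-03-2015 12:23:35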
Since you have a large file, it's better not to load it into memory. The following call loads the whole file into memory:
IOUtils.readLines(inputStream, StandardCharsets.UTF_8)
It's better to iterate line by line.
So this code is for the NiFi ExecuteScript processor with the Python (Jython) language:
import sys
import re
import traceback
from org.apache.commons.io import IOUtils
from org.apache.nifi.processor.io import StreamCallback
from org.python.core.util import StringUtil
from java.lang import Class
from java.io import BufferedReader
from java.io import InputStreamReader
from java.io import OutputStreamWriter

class TransformCallback(StreamCallback):
    def __init__(self):
        pass
    def process(self, inputStream, outputStream):
        try:
            writer = OutputStreamWriter(outputStream, "UTF-8")
            reader = BufferedReader(InputStreamReader(inputStream, "UTF-8"))
            line = reader.readLine()
            p = re.compile(r'^(.*:\d\d)((\.\d{1,3})(\d*))?(-\d\d)?')
            while line != None:
                # print line
                match = p.search(line)
                writer.write(match.group(1) + (match.group(3) if match.group(3) != None else ''))
                writer.write('\n')
                line = reader.readLine()
            writer.flush()
            writer.close()
            reader.close()
        except:
            traceback.print_exc(file=sys.stdout)
            raise

flowFile = session.get()
if flowFile != None:
    flowFile = session.write(flowFile, TransformCallback())
    # Finish by transferring the FlowFile to an output relationship
    session.transfer(flowFile, REL_SUCCESS)
And since the question is about NiFi, here are alternatives that seem easier.
The same code as above, but in Groovy, for the NiFi ExecuteScript processor:
def ff = session.get()
if(!ff)return
ff = session.write(ff, {rawIn, rawOut->
    // ## transform streams into reader and writer
    rawIn.withReader("UTF-8"){reader->
        rawOut.withWriter("UTF-8"){writer->
            reader.eachLine{line, lineNum->
                if(lineNum > 1) { // # skip the first line
                    // ## let's use a regular expression to transform each line
                    writer << line.replaceAll( /^(.*:\d\d)((\.\d{1,3})(\d*))?(-\d\d)?/ , '$1$3' ) << '\n'
                }
            }
        }
    }
} as StreamCallback)
session.transfer(ff, REL_SUCCESS)
ReplaceText processor
And if a regular expression is OK, the easiest way in NiFi is the ReplaceText processor, which can do a regular-expression replace line by line.
In this case you don't need to write any code; just build the regular expression and configure the processor correctly, for example as sketched below.
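A plausible configuration (property names from recent NiFi releases; verify against your version):
Search Value:          ^(.*:\d\d)((\.\d{1,3})(\d*))?(-\d\d)?$
Replacement Value:     $1$3
Replacement Strategy:  Regex Replace
Evaluation Mode:       Line-by-Line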
Just using pure Jython. It is an example that can be adapted to the OP's needs.
Define a datetime parser for this CSV file:
from datetime import datetime

def parse_datetime(dtstr):
    mydatestr = '-'.join(dtstr.split('-')[:-1])
    try:
        return datetime.strptime(mydatestr, '%d-%m-%Y %H:%M:%S.%f').strftime('%d-%m-%Y %H:%M:%S.%f')[:-3]
    except ValueError:
        return datetime.strptime(mydatestr, '%d-%m-%Y %H:%M:%S').strftime('%d-%m-%Y %H:%M:%S')
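For instance, an interactive check of the parser:
>>> parse_datetime('28-02-2015 12:22:33.2212-02')
'28-02-2015 12:22:33.221'
>>> parse_datetime('30-03-2015 12:23:35-01')
'30-03-2015 12:23:35'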
My test.csv includes data like this (2015 didn't have a 29 Feb, so I had to change the OP's example):
1,Ni,23,27-02-2015 12:22:33.2212-02
2,Fi,21,28-02-2015 12:22:34.3212-02
3,Us,33,30-03-2015 12:23:35-01
4,Uk,34,31-03-2015 12:24:36.332211-02
Now the solution:
with open('test.csv') as fi:
    for line in fi:
        line_split = line.split(',')
        out_line = ', '.join(word if i < 3 else parse_datetime(word) for i, word in enumerate(line_split))
        #print(out_line)
        #you can write this out_line to a file here.
Printing out_line looks like this:
1, Ni, 23, 27-02-2015 12:22:33.221
2, Fi, 21, 28-02-2015 12:22:34.321
3, Us, 33, 30-03-2015 12:23:35
4, Uk, 34, 31-03-2015 12:24:36.332
You can get them with a regex:
(\d\d-\d\d-\d\d\d\d\ \d\d:\d\d:)(\d+(?:\.\d+)*)(-\d\d)$
Then just replace group #2 with a rounded version of itself.
See the regex example at regexr.com.
You could even do it "nicer" by capturing every single value with its own group, putting them into a datetime.datetime object, and printing it from there, but IMHO that would be overkill for maintainability and lose you too much performance.
Code (untested, as I had no way to test it):
import re
...
pattern = r'^(.{25})(\d+(?:\.\d+)*)(-\d\d)$'  # fixed 25-character offset used for simplicity
....
for line in text[1:]:
    match = re.search(pattern, line)
    # match.group(2) is a string, so convert it before rounding
    seconds = format(round(float(match.group(2)), 3), 'g')
    line = match.group(1) + seconds + match.group(3)
    outputStream.write(line + "\n")

How to load a .json file with python nltk

I'm trying to load a .json file from the output of an application so I can feed it into different machine learning algorithms to classify the text. The problem is I can't seem to figure out why NLTK is not loading my .json file; even if I try it with their own .json file, it doesn't seem to work. From what I gather based on the book, I should only need to import 'nltk' and use the function 'load' from 'nltk.data'. Can somebody help me see what I am doing wrong?
Below is the code I used to try loading the file with NLTK:
import nltk
nltk.data.load('corpora/twitter_samples/negative_tweets.json')
After trying that out, I got this error:
C:\Python34\python.exe "C:/Users/JarvinLi/PycharmProjects/ThesisTrial1/Trial Loading.py"
Traceback (most recent call last):
  File "C:/Users/JarvinLi/PycharmProjects/ThesisTrial1/Trial Loading.py", line 7, in <module>
    nltk.data.load('corpora/twitter_samples/negative_tweets.json')
  File "C:\Python34\lib\site-packages\nltk\data.py", line 810, in load
    resource_val = json.load(opened_resource)
  File "C:\Python34\lib\json\__init__.py", line 268, in load
    parse_constant=parse_constant, object_pairs_hook=object_pairs_hook, **kw)
  File "C:\Python34\lib\json\__init__.py", line 312, in loads
    s.__class__.__name__))
TypeError: the JSON object must be str, not 'bytes'

Process finished with exit code 1
EDIT #1: I'm using Python 3.4.1 and NLTK 3.
EDIT #2: Below is another try, now using json.load():
import json
json.load('corpora/twitter_samples/negative_tweets.json')
But I encountered a similar error:
C:\Python34\python.exe "C:/Users/JarvinLi/PycharmProjects/ThesisTrial1/Trial Loading.py"
Traceback (most recent call last):
  File "C:/Users/JarvinLi/PycharmProjects/ThesisTrial1/Trial Loading.py", line 5, in <module>
    json.load('corpora/twitter_samples/quotefileNeg.json')
  File "C:\Python34\lib\json\__init__.py", line 265, in load
    return loads(fp.read(),
AttributeError: 'str' object has no attribute 'read'

Process finished with exit code 1
If you want to access a new corpus with a specific format, you can extend the NLTK CorpusReader class as follows:
import json
import os

from nltk.corpus.reader.api import CorpusReader
from nltk.corpus.reader.util import StreamBackedCorpusView, concat, ZipFilePathPointer

class StoryCorpusReader(CorpusReader):
    corpus_view = StreamBackedCorpusView

    # StoryTokenizer is defined below
    def __init__(self, word_tokenizer=StoryTokenizer(), encoding="utf8"):
        CorpusReader.__init__(
            self, <folder_path>, <file_name>, encoding
        )
        for path in self.abspaths(self._fileids):
            if isinstance(path, ZipFilePathPointer):
                pass
            elif os.path.getsize(path) == 0:
                raise ValueError(f"File {path} is empty")
        self._word_tokenizer = word_tokenizer

    def docs(self, fileids=None):
        return concat(
            [
                self.corpus_view(path, self._read_stories, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

    def titles(self):
        titles = self.docs()
        standards_list = []
        for jsono in titles:
            text = jsono["title"]
            if isinstance(text, bytes):
                text = text.decode(self.encoding)
            standards_list.append(text)
        return standards_list

    def _read_stories(self, stream):
        # read the corpus in blocks of up to ten newline-delimited JSON objects
        stories = []
        for i in range(10):
            line = stream.readline()
            if not line:
                return stories
            story = json.loads(line)
            stories.append(story)
        return stories
along with a specific tokenizer:
import re
import string
import typing

from nltk.tokenize.api import TokenizerI
from nltk.tokenize.casual import _replace_html_entities

REGEXPS = (
    # HTML tags:
    r"""<[^<>]+>""",
    # email addresses
    r"""[\w.+-]+@[\w-]+\.(?:[\w-]\.?)+[\w-]""",
)

class StoryTokenizer(TokenizerI):
    _WORD_RE = None

    def tokenize(self, text: str) -> typing.List[str]:
        # Fix HTML character entities:
        safe_text = _replace_html_entities(text)
        # Tokenize
        words = self.WORD_RE.findall(safe_text)
        # Remove punctuation
        words = [
            word
            for word in words
            if re.match(f"[{re.escape(string.punctuation)}——–’‘“”×]", word.casefold())
            == None
        ]
        return words

    @property
    def WORD_RE(self) -> "re.Pattern":
        # Compiles the regex for this and all future instantiations of StoryTokenizer.
        if not type(self)._WORD_RE:
            type(self)._WORD_RE = re.compile(
                f"({'|'.join(REGEXPS)})",
                re.VERBOSE | re.I | re.UNICODE,
            )
        return type(self)._WORD_RE
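With the <folder_path> and <file_name> placeholders filled in, usage might look like this sketch (hypothetical; it assumes the corpus file contains one JSON object per line, each with a "title" key):
reader = StoryCorpusReader()
for title in reader.titles()[:10]:
    print(title)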

Understanding how to quote XML string in order to serialize using Python's ElementTree

My specs:
Python 3.4.3
Windows 7
IDE is Jupyter Notebooks
What I have referenced:
how-to-properly-escape-single-and-double-quotes
python-escaping-strings-for-use-in-xml
escaping-characters-in-a-xml-file-with-python
Here are the data and script, respectively, below (I have tried variations on parsing Column 'E' using both SAX and ElementTree):
Data
A,B,C,D,E,F,G,H,I,J
"3","8","1","<Request TransactionID="3" RequestType="FOO"><InstitutionISO /><CallID>23</CallID><MemberID>12</MemberID><MemberPassword /><RequestData><AccountNumber>2</AccountNumber><AccountSuffix>85</AccountSuffix><AccountType>S</AccountType><MPIAcctType>Checking</MPIAcctType><TransactionCount>10</TransactionCount></RequestData></Request>","<Response TransactionID="2" RequestType="HoldInquiry"><ShareList>0000',0001,0070,</ShareList></Response>","1967-12-25 22:18:13.471000","2005-12-25 22:18:13.768000","2","70","0"
Script
#!/usr/bin/python
# -*- coding: utf-8 -*-
import os.path
import sys
import csv
from io import StringIO
import xml.etree.cElementTree as ElementTree
from xml.etree.ElementTree import XMLParser
import xml
import xml.sax
from xml.sax import ContentHandler

class MyHandler(xml.sax.handler.ContentHandler):
    def __init__(self):
        self._charBuffer = []
        self._result = []

    def _getCharacterData(self):
        data = ''.join(self._charBuffer).strip()
        self._charBuffer = []
        return data.strip()  # remove strip() if whitespace is important

    def parse(self, f):
        xml.sax.parse(f, self)
        return self._result

    def characters(self, data):
        self._charBuffer.append(data)

    def startElement(self, name, attrs):
        if name == 'Response':
            self._result.append({})

    def endElement(self, name):
        if not name == 'Response': self._result[-1][name] = self._getCharacterData()

def read_data(path):
    with open(path, 'rU', encoding='utf-8') as data:
        reader = csv.DictReader(data, delimiter =',', quotechar="'", skipinitialspace=True)
        for row in reader:
            yield row

if __name__ == "__main__":
    empty = ''
    Response = 'sample.csv'
    for idx, row in enumerate(read_data(Response)):
        if idx > 10: break
        data = row['E']
        print(data)  # The before
        data = data[1:-1]
        data = ""'{}'"".format(data)
        print(data)  # Sanity check
        # data = '<Response TransactionID="2" RequestType="HoldInquiry"><ShareList>0000',0001,0070,</ShareList></Response>'
        try:
            root = ElementTree.XML(data)
            # print(root)
        except StopIteration:
            raise
            pass
        # xmlstring = StringIO(data)
        # print(xmlstring)
        # Handler = MyHandler().parse(xmlstring)
Specifically, due to the quoting in the CSV file (which is beyond my control), I have had to resort to slicing the string (line 51) and then formatting it (line 52).
However, the printout from the above attempt is as follows:
"<Response TransactionID="2" RequestType="HoldInquiry"><ShareList>0000'
<Response TransactionID="2" RequestType="HoldInquiry"><ShareList>0000
File "<string>", line unknown
ParseError: no element found: line 1, column 69
Interestingly, if I assign the variable "data" directly (as in line 54), I receive this:
File "<ipython-input-80-7357c9272b92>", line 56
data = '<Response TransactionID="2" RequestType="HoldInquiry"><ShareList>0000',0001,0070,</ShareList></Response>'
^
SyntaxError: invalid token
I would appreciate feedback on the most Pythonic way to address this. Ideally, is there a method that can leverage ElementTree? Thank you in advance for your feedback and guidance.
It seems that you have badly formatted (well, badly quoted) CSV data.
If the CSV file is beyond your control, I suggest not using the csv reader to read it; instead, if you can rely on each field being properly quoted, split the fields yourself:
with open(Response, 'rU', encoding='utf-8') as data:
    separated = data.read().split('","')
    try:
        x = ElementTree.XML(separated[3])
        print(x)
        xml.etree.ElementTree.dump(x)
        y = ElementTree.XML(separated[4])
        xml.etree.ElementTree.dump(y)
    except Exception as e:
        print(e)
outputs
<Element 'Request' at 0xb6d973b0>
<Request RequestType="FOO" TransactionID="3"><InstitutionISO /><CallID>23</CallID><MemberID>12</MemberID><MemberPassword /><RequestData><AccountNumber>2</AccountNumber><AccountSuffix>85</AccountSuffix><AccountType>S</AccountType><MPIAcctType>Checking</MPIAcctType><TransactionCount>10</TransactionCount></RequestData></Request>
<Response RequestType="HoldInquiry" TransactionID="2"><ShareList>0000',0001,0070,</ShareList></Response>
