Empty SVG file error even though the file is not empty - python

Working with Pycairo, I create an "SVGSurface" (which creates an SVG file for me) and I draw on it using a "context". After I finish, I need to use the SVG file, but it seems that the file has not been closed, so I get an error saying that the document is empty.
ps = cairo.SVGSurface("header.svg", width, height)
cr = cairo.Context(ps)
drawRectangle (cr,
papersize.convert_length(int(lg[0]), "px","pt"),
papersize.convert_length(int(lg[2]), "px","pt"),
papersize.convert_length(int(lg[1])-int(lg[0])-1, "px","pt"),
papersize.convert_length(int(lg[3])-int(lg[2])-1, "px","pt"),
0, 0, 0.5
)
cr.show_page()
head = st.fromfile("header.svg")
It gives me this error :
File "/usr/local/lib/python2.7/dist-packages/svgutils/transform.py", line 249, in fromfile
svg_file = etree.parse(fid)
File "src/lxml/lxml.etree.pyx", line 3427, in lxml.etree.parse (src/lxml/lxml.etree.c:81117)
File "src/lxml/parser.pxi", line 1832, in lxml.etree._parseDocument (src/lxml/lxml.etree.c:118116)
File "src/lxml/parser.pxi", line 1852, in lxml.etree._parseFilelikeDocument (src/lxml/lxml.etree.c:118399)
File "src/lxml/parser.pxi", line 1747, in lxml.etree._parseDocFromFilelike (src/lxml/lxml.etree.c:117187)
File "src/lxml/parser.pxi", line 1162, in lxml.etree._BaseParser._parseDocFromFilelike (src/lxml/lxml.etree.c:111914)
File "src/lxml/parser.pxi", line 595, in lxml.etree._ParserContext._handleParseResultDoc (src/lxml/lxml.etree.c:105109)
File "src/lxml/parser.pxi", line 706, in lxml.etree._handleParseResult (src/lxml/lxml.etree.c:106817)
File "src/lxml/parser.pxi", line 635, in lxml.etree._raiseParseError (src/lxml/lxml.etree.c:105671)
lxml.etree.XMLSyntaxError: Document is empty, line 1, column 1 (line 1)
I tried to close the file with os but it didn't work

Related

"EOFError: Ran out of input" when packaging a Python script with PyInstaller

I'm developing an application for Windows operating systems written in Python 3.8 and which makes use of the nnunet library (https://pypi.org/project/nnunet/) which uses multiprocessing. I have tested the script and it works correctly.
Now I'm trying to package everything with pyinstaller v5.7.0. The creation of the .exe is successful but when I run it I get the following error:
Traceback (most recent call last):
File "main.py", line 344, in <module>
File "nnunet\inference\predict.py", line 694, in predict_from_folder
File "nnunet\inference\predict.py", line 496, in predict_cases_fastest
File "nnunet\inference\predict.py", line 123, in preprocess_multithreaded
File "multiprocess\process.py", line 121, in start
File "multiprocess\context.py", line 224, in _Popen
File "multiprocess\context.py", line 327, in _Popen
File "multiprocess\popen_spawn_win32.py", line 93, in __init__
File "multiprocess\reduction.py", line 70, in dump
File "dill\_dill.py", line 394, in dump
File "pickle.py", line 487, in dump
File "dill\_dill.py", line 388, in save
File "pickle.py", line 603, in save
File "pickle.py", line 717, in save_reduce
File "dill\_dill.py", line 388, in save
File "pickle.py", line 560, in save
File "dill\_dill.py", line 1186, in save_module_dict
File "pickle.py", line 971, in save_dict
Traceback (most recent call last):
File "main.py", line 341, in <module>
File "pickle.py", line 997, in _batch_setitems
File "D:\MyProject\venv\Lib\site-packages\PyInstaller\hooks\rthooks\pyi_rth_multiprocessing.py", line 49, in _freeze_support
File "dill\_dill.py", line 388, in save
spawn.spawn_main(**kwds)
File "pickle.py", line 560, in save
File "pickle.py", line 901, in save_tuple
File "dill\_dill.py", line 388, in save
File "multiprocessing\spawn.py", line 116, in spawn_main
File "pickle.py", line 560, in save
File "multiprocessing\spawn.py", line 126, in _main
File "dill\_dill.py", line 1427, in save_instancemethod0
EOFError: Ran out of input
[588] Failed to ex File "pickle.py", line 692, in save_reduce
ecute script 'main' d File "dill\_dill.py", line 388, in save
ue to unhandled File "pickle.py", line 560, in save
exception!
File "pickle.py", line 886, in save_tuple
File "dill\_dill.py", line 388, in save
File "pickle.py", line 603, in save
File "pickle.py", line 717, in save_reduce
File "dill\_dill.py", line 388, in save
File "pickle.py", line 560, in save
File "dill\_dill.py", line 1186, in save_module_dict
File "pickle.py", line 971, in save_dict
File "pickle.py", line 997, in _batch_setitems
File "dill\_dill.py", line 388, in save
File "pickle.py", line 603, in save
File "pickle.py", line 687, in save_reduce
File "dill\_dill.py", line 388, in save
File "pickle.py", line 560, in save
File "dill\_dill.py", line 1698, in save_type
File "dill\_dill.py", line 1070, in _save_with_postproc
File "pickle.py", line 692, in save_reduce
File "dill\_dill.py", line 388, in save
File "pickle.py", line 560, in save
File "pickle.py", line 901, in save_tuple
File "dill\_dill.py", line 388, in save
File "pickle.py", line 560, in save
File "pickle.py", line 886, in save_tuple
File "dill\_dill.py", line 388, in save
File "pickle.py", line 560, in save
File "dill\_dill.py", line 1698, in save_type
File "dill\_dill.py", line 1084, in _save_with_postproc
File "pickle.py", line 997, in _batch_setitems
File "dill\_dill.py", line 388, in save
File "pickle.py", line 603, in save
File "pickle.py", line 717, in save_reduce
File "dill\_dill.py", line 388, in save
File "pickle.py", line 560, in save
File "dill\_dill.py", line 1186, in save_module_dict
File "pickle.py", line 971, in save_dict
File "pickle.py", line 997, in _batch_setitems
File "dill\_dill.py", line 388, in save
File "pickle.py", line 603, in save
File "pickle.py", line 717, in save_reduce
File "dill\_dill.py", line 388, in save
File "pickle.py", line 560, in save
File "dill\_dill.py", line 1186, in save_module_dict
File "pickle.py", line 971, in save_dict
File "pickle.py", line 997, in _batch_setitems
File "dill\_dill.py", line 388, in save
File "pickle.py", line 578, in save
File "PyInstaller\loader\pyimod01_archive.py", line 76, in __getattr__
AssertionError
[4392] Failed to execute script 'main' due to unhandled exception!
Below is the code of my python script:
#==============================
# main.py
#==============================
from multiprocessing import freeze_support
from nnunet.inference.predict import predict_from_folder
if __name__ == "__main__":
freeze_support()
...
predict_from_folder(...)
...
Below is the code of the nnunet library that triggers the error:
#==============================
# nnunet\inference\predict.py
#==============================
def preprocess_multithreaded(trainer, list_of_lists, output_files, num_processes=2, segs_from_prev_stage=None):
    """Run preprocessing in worker processes and yield results as they arrive.

    This is a generator. It starts at most ``num_processes`` worker
    processes, each running ``preprocess_save_to_queue`` on a strided slice
    (``[i::num_processes]``) of the inputs, and then yields every item read
    from the shared queue except the string ``"end"``. It stops once ``"end"``
    has been received ``num_processes`` times.

    Args:
        trainer: Must be an ``nnUNetTrainer`` (asserted). Its ``num_classes``,
            ``preprocess_patient`` and ``plans['transpose_forward']`` are read.
        list_of_lists: Inputs to distribute over the workers; its length also
            caps the number of workers.
        output_files: Sliced with the same stride as ``list_of_lists``
            (presumably one entry per case -- confirm against the caller).
        num_processes: Upper bound on the number of worker processes.
        segs_from_prev_stage: Sliced with the same stride as
            ``list_of_lists``. ``None`` is replaced by a list of ``None`` with
            one entry per element of ``list_of_lists``.

    Yields:
        Each non-``"end"`` item taken from the queue, in arrival order.
    """
    if segs_from_prev_stage is None:
        segs_from_prev_stage = [None] * len(list_of_lists)
    # Never start more workers than there are inputs to hand out.
    num_processes = min(len(list_of_lists), num_processes)
    # Integers 1 .. trainer.num_classes - 1 (0 is left out).
    classes = list(range(1, trainer.num_classes))
    # NOTE(review): this check runs only after trainer.num_classes was read above.
    assert isinstance(trainer, nnUNetTrainer)
    # Presumably a queue bounded to a single pending item -- confirm which
    # Queue class this module imports.
    q = Queue(1)
    processes = []
    for i in range(num_processes):
        # Worker i receives every num_processes-th element, starting at index i.
        pr = Process(
            target=preprocess_save_to_queue,
            args=(
                trainer.preprocess_patient,
                q,
                list_of_lists[i::num_processes],
                output_files[i::num_processes],
                segs_from_prev_stage[i::num_processes],
                classes,
                trainer.plans['transpose_forward']
            )
        )
        pr.start() ## <------------ The error is generated here!!!!!!!!!!!!!
        processes.append(pr)
    try:
        # Counts the "end" markers seen so far; presumably each worker puts
        # exactly one on the queue when it is done -- confirm in
        # preprocess_save_to_queue.
        end_ctr = 0
        while end_ctr != num_processes:
            item = q.get()
            if item == "end":
                end_ctr += 1
                continue
            else:
                yield item
    finally:
        # Runs on normal exhaustion, on an exception, and when the generator is
        # closed early: stop any worker that is still alive, then reap them all.
        for p in processes:
            if p.is_alive():
                p.terminate()
            p.join()
        q.close()
def predict_cases_fastest(...):
...
pool = Pool(num_threads_nifti_save)
...
preprocessing = preprocess_multithreaded(
trainer,
list_of_lists,
cleaned_output_files,
num_threads_preprocessing,
segs_from_prev_stage
)
...
pool.starmap_async(...)
...
pool.close()
pool.join()
def predict_from_folder(...):
...
return predict_cases_fastest(...)
if __name__ == "__main__":
...
Edit 03-02-2023
I have created a public project with which it is possible to reproduce the reported problem: https://gitlab.com/carlopoletto/nnunet_pyinstaller_problem
In the ./scripts folder there are some scripts to install everything and run the tests:
./scripts/install: dependency installation
./scripts/dist: creating the executable with pyinstaller
./scripts/run_py: running the python script (NB: this script automatically deletes the ./temp folder and recreates it by copying the contents of ./data)
./scripts/run_exe: running the executable created with ./scripts/dist (NB: this script automatically deletes the ./temp folder and recreates it by copying the contents of ./data)
The problem appears to be internal to the nnunet library. I don't know if this problem can be solved by properly configuring pyinstaller.

lxml.etree.XMLSyntaxError while trying to read_excel using pandas

I've a spreadsheet (~50 mb) with multiple sheets, and I'm trying to read it using pandas.
import pandas as pd
df = pd.read_excel('compiled_output.xlsx', sheet_name='Sheet1')
I'm not sure why it's throwing lxml.etree.XMLSyntaxError; I've done this many times before. I also tried passing engine=openpyxl, downgraded to pandas==1.2.4 but I get the same error:
df = pd.read_excel('compiled_output.xlsx', sheet_name='Sheet1')
File "/usr/local/lib/python3.9/site-packages/pandas/util/_decorators.py", line 299, in wrapper
return func(*args, **kwargs)
File "/usr/local/lib/python3.9/site-packages/pandas/io/excel/_base.py", line 336, in read_excel
io = ExcelFile(io, storage_options=storage_options, engine=engine)
File "/usr/local/lib/python3.9/site-packages/pandas/io/excel/_base.py", line 1131, in __init__
self._reader = self._engines[engine](self._io, storage_options=storage_options)
File "/usr/local/lib/python3.9/site-packages/pandas/io/excel/_openpyxl.py", line 475, in __init__
super().__init__(filepath_or_buffer, storage_options=storage_options)
File "/usr/local/lib/python3.9/site-packages/pandas/io/excel/_base.py", line 391, in __init__
self.book = self.load_workbook(self.handles.handle)
File "/usr/local/lib/python3.9/site-packages/pandas/io/excel/_openpyxl.py", line 486, in load_workbook
return load_workbook(
File "/usr/local/lib/python3.9/site-packages/openpyxl/reader/excel.py", line 317, in load_workbook
reader.read()
File "/usr/local/lib/python3.9/site-packages/openpyxl/reader/excel.py", line 282, in read
self.read_worksheets()
File "/usr/local/lib/python3.9/site-packages/openpyxl/reader/excel.py", line 216, in read_worksheets
rels = get_dependents(self.archive, rels_path)
File "/usr/local/lib/python3.9/site-packages/openpyxl/packaging/relationship.py", line 131, in get_dependents
node = fromstring(src)
File "src/lxml/etree.pyx", line 3237, in lxml.etree.fromstring
File "src/lxml/parser.pxi", line 1896, in lxml.etree._parseMemoryDocument
File "src/lxml/parser.pxi", line 1784, in lxml.etree._parseDoc
File "src/lxml/parser.pxi", line 1141, in lxml.etree._BaseParser._parseDoc
File "src/lxml/parser.pxi", line 615, in lxml.etree._ParserContext._handleParseResultDoc
File "src/lxml/parser.pxi", line 725, in lxml.etree._handleParseResult
File "src/lxml/parser.pxi", line 654, in lxml.etree._raiseParseError
File "<string>", line 2
lxml.etree.XMLSyntaxError: internal error: Huge input lookup, line 2, column 12753697

Can etree.XMLParser in recover mode still throw a parse error?

I have a utility method that parses XML using a parser created as etree.XMLParser(recover=True). I would like to test failure scenarios in a unit test. Except for empty input throwing an lxml.etree.XMLSyntaxError, I can't seem to break the parser.
My question is: is it possible to construct a StringIO or BytesIO input for this parser such that the parser throws a parse error?
Here are some examples (tested with Python 3.5 and lxml 4.3.3):
from io import BytesIO
from lxml import etree
def parse(xml):
    """Parse ``xml`` with a recovering lxml parser and print the serialized result.

    ``xml`` is wrapped in a ``BytesIO``, so it is expected to be ``bytes``.
    The parser is created with ``recover=True``. Nothing is returned; the
    output of ``etree.tostring`` is printed.
    """
    # recover=True: the parser mode whose failure behaviour is being tested.
    parser = etree.XMLParser(recover=True)
    elem = etree.parse(BytesIO(xml), parser)
    print(etree.tostring(elem))
parse(b'<broken<') # prints b'<broken/>'
parse(b'</lf|\jf>') # prints None
parse('<?xml encoding="ascii"?><foo>æøå</foo>'.encode('utf-8')) # prints b'<foo/>'
parse(b'') # Throws lxml.etree.XMLSyntaxError
If I slap a NULL character at the beginning of any of the bad inputs you show that don't raise an error, I do get an error. For instance:
parse(b'\0<broken<')
produces:
Traceback (most recent call last):
File "test.py", line 13, in <module>
parse(b'\0<broken<') # prints b'<broken/>'
File "test.py", line 9, in parse
elem = etree.parse(BytesIO(xml), parser)
File "src/lxml/etree.pyx", line 3435, in lxml.etree.parse
File "src/lxml/parser.pxi", line 1857, in lxml.etree._parseDocument
File "src/lxml/parser.pxi", line 1877, in lxml.etree._parseMemoryDocument
File "src/lxml/parser.pxi", line 1765, in lxml.etree._parseDoc
File "src/lxml/parser.pxi", line 1127, in lxml.etree._BaseParser._parseDoc
File "src/lxml/parser.pxi", line 601, in lxml.etree._ParserContext._handleParseResultDoc
File "src/lxml/parser.pxi", line 711, in lxml.etree._handleParseResult
File "src/lxml/parser.pxi", line 640, in lxml.etree._raiseParseError
File "<string>", line 1
lxml.etree.XMLSyntaxError: Document is empty, line 1, column 1
Isn't it because you are using recover=True?
recover - try hard to parse through broken XML
I changed recover=False and I get:
Traceback (most recent call last):
File "./foo.py", line 11, in <module>
parse(b'<broken<') # prints b'<broken/>'
File "./foo.py", line 7, in parse
elem = etree.parse(BytesIO(xml), parser)
File "src/lxml/etree.pyx", line 3435, in lxml.etree.parse
File "src/lxml/parser.pxi", line 1857, in lxml.etree._parseDocument
File "src/lxml/parser.pxi", line 1877, in lxml.etree._parseMemoryDocument
File "src/lxml/parser.pxi", line 1765, in lxml.etree._parseDoc
File "src/lxml/parser.pxi", line 1127, in lxml.etree._BaseParser._parseDoc
File "src/lxml/parser.pxi", line 601, in lxml.etree._ParserContext._handleParseResultDoc
File "src/lxml/parser.pxi", line 711, in lxml.etree._handleParseResult
File "src/lxml/parser.pxi", line 640, in lxml.etree._raiseParseError
File "<string>", line 1
lxml.etree.XMLSyntaxError: error parsing attribute name, line 1, column 8
Am I missing something?

How to download all transcripts from seeking alpha

Is there some way to automatically download all the transcripts from the SA website?
http://seekingalpha.com/earnings/earnings-call-transcripts
I tried using the http://newspaper.readthedocs.io/en/latest/ python code but I get the following error:
earnings_call_transcripts_2 = newspaper.build('http://seekingalpha.com/earnings/earnings-call-transcripts', memoize_articles=False)
Traceback (most recent call last):
File "/Users/name/anaconda/lib/python3.5/site-packages/newspaper/parsers.py", line 67, in fromstring
cls.doc = lxml.html.fromstring(html)
File "/Users/name/anaconda/lib/python3.5/site-packages/lxml/html/__init__.py", line 867, in fromstring
doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
File "/Users/name/anaconda/lib/python3.5/site-packages/lxml/html/__init__.py", line 752, in document_fromstring
value = etree.fromstring(html, parser, **kw)
File "src/lxml/lxml.etree.pyx", line 3213, in lxml.etree.fromstring (src/lxml/lxml.etree.c:77697)
File "src/lxml/parser.pxi", line 1819, in lxml.etree._parseMemoryDocument (src/lxml/lxml.etree.c:116494)
File "src/lxml/parser.pxi", line 1700, in lxml.etree._parseDoc (src/lxml/lxml.etree.c:115040)
File "src/lxml/parser.pxi", line 1040, in lxml.etree._BaseParser._parseUnicodeDoc (src/lxml/lxml.etree.c:109165)
File "src/lxml/parser.pxi", line 573, in lxml.etree._ParserContext._handleParseResultDoc (src/lxml/lxml.etree.c:103404)
File "src/lxml/parser.pxi", line 683, in lxml.etree._handleParseResult (src/lxml/lxml.etree.c:105058)
File "src/lxml/parser.pxi", line 622, in lxml.etree._raiseParseError (src/lxml/lxml.etree.c:104143)
File "<string>", line None
lxml.etree.XMLSyntaxError: line 295: b"htmlParseEntityRef: expecting ';'"
[Source parse ERR] http://seekingalpha.com/earnings/earnings-call-transcripts

Why is the slash at the end of lxml.html.parse() important?

I am using lxml to scrape html. This code works.
lxml.html.parse( "http://google.com/" )
This code does not.
lxml.html.parse( "http://google.com" )
Why does the slash at the end of the URL matter? Thank you.
To be clear, here is the error log that python is giving me from the latter code.
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/home/davidfaux/epd-7.2-2-rh5-x86/lib/python2.7/site-packages/lxml/html/__init__.py", line 692, in parse
return etree.parse(filename_or_url, parser, base_url=base_url, **kw)
File "lxml.etree.pyx", line 2953, in lxml.etree.parse (src/lxml/lxml.etree.c:56204)
File "parser.pxi", line 1533, in lxml.etree._parseDocument (src/lxml/lxml.etree.c:82287)
File "parser.pxi", line 1562, in lxml.etree._parseDocumentFromURL (src/lxml/lxml.etree.c:82580)
File "parser.pxi", line 1462, in lxml.etree._parseDocFromFile (src/lxml/lxml.etree.c:81619)
File "parser.pxi", line 1002, in lxml.etree._BaseParser._parseDocFromFile (src/lxml/lxml.etree.c:78528)
File "parser.pxi", line 569, in lxml.etree._ParserContext._handleParseResultDoc (src/lxml/lxml.etree.c:74472)
File "parser.pxi", line 650, in lxml.etree._handleParseResult (src/lxml/lxml.etree.c:75363)
File "parser.pxi", line 588, in lxml.etree._raiseParseError (src/lxml/lxml.etree.c:74665)
IOError: Error reading file 'http://google.com': failed to load HTTP resource
Because without the slash, Google isn't sending you a page, it's sending you a redirect. In fact, it's redirecting you to the URL with the slash! The body of the redirect is probably empty.

Categories