Xref error while parsing PDF using PyPDF2 and pdfrw? - python

I am trying to extract the text from the following PDF.
First, I used the PyPDF2 library. I used the following code:
from PyPDF2 import PdfFileReader
pdf = PdfFileReader('March_5_8000/1.pdf',strict=False)
information = pdf.getDocumentInfo()
This throws the following error:
/opt/conda/envs/fastai/lib/python3.6/site-packages/PyPDF2/pdf.py in getObject(self, indirectReference)
1597 if debug: print((" Uncompressed Object", indirectReference.idnum, indirectReference.generation, ":", start))
1598 self.stream.seek(start, 0)
-> 1599 idnum, generation = self.readObjectHeader(self.stream)
1600 if idnum != indirectReference.idnum and self.xrefIndex:
1601 # Xref table probably had bad indexes due to not being zero-indexed
/opt/conda/envs/fastai/lib/python3.6/site-packages/PyPDF2/pdf.py in readObjectHeader(self, stream)
1665 warnings.warn("Superfluous whitespace found in object header %s %s" % \
1666 (idnum, generation), utils.PdfReadWarning)
-> 1667 return int(idnum), int(generation)
1668
1669 def cacheGetIndirectObject(self, generation, idnum):
ValueError: invalid literal for int() with base 10: b'obj'
I also tried using the pdfrw library, but that returned the same error:
Code:
from pdfrw import PdfReader
x = PdfReader('March_5_8000/980.pdf')
x.keys()
ERROR:
PdfParseError Traceback (most recent call last)
<ipython-input-6-6d3575671b6c> in <module>
1 from pdfrw import PdfReader
----> 2 x = PdfReader('March_5_8000/980.pdf')
3 x.keys()
/opt/conda/envs/fastai/lib/python3.6/site-packages/pdfrw/pdfreader.py in __init__(self, fname, fdata, decompress, decrypt, password, disable_gc, verbose)
617 while 1:
618 source.obj_offsets = {}
--> 619 trailer, is_stream = self.parsexref(source)
620 prev = trailer.Prev
621 if prev is None:
/opt/conda/envs/fastai/lib/python3.6/site-packages/pdfrw/pdfreader.py in parsexref(self, source)
463 return self.readdict(source), False
464 else:
--> 465 source.exception('Expected "xref" keyword or xref stream object')
466
467 def readpages(self, node):
/opt/conda/envs/fastai/lib/python3.6/site-packages/pdfrw/tokens.py in exception(self, *arg)
227
228 def exception(self, *arg):
--> 229 raise PdfParseError(self.msg(*arg))
PdfParseError: Expected "xref" keyword or xref stream object (line=2238, col=39, token='obj')
How can I solve this?

Related

Error while saving Optuna study to Google Drive from Colab

I can save a random file to my drive colab as:
with open ("gdrive/My Drive/chapter_classification/output/hello.txt",'w')as f:
f.write('hello')
works fine but when I use the Official documentation approach of Optuna using the code:
direction = 'minimize'
name = 'opt1'
study = optuna.create_study(sampler=optuna.samplers.TPESampler(),direction=direction,study_name=name, storage=f"gdrive/My Drive/chapter_classification/output/sqlite:///{name}.db",load_if_exists=True)
study.optimize(tune, n_trials=1000)
throws an error as:
ArgumentError Traceback (most recent call last)
<ipython-input-177-f32da2c0f69a> in <module>()
2 direction = 'minimize'
3 name = 'opt1'
----> 4 study = optuna.create_study(sampler=optuna.samplers.TPESampler(),direction=direction,study_name=name, storage="gdrive/My Drive/chapter_classification/output/sqlite:///opt1.db",load_if_exists=True)
5 study.optimize(tune, n_trials=1000)
6 frames
/usr/local/lib/python3.7/dist-packages/optuna/study/study.py in create_study(storage, sampler, pruner, study_name, direction, load_if_exists, directions)
1134 ]
1135
-> 1136 storage = storages.get_storage(storage)
1137 try:
1138 study_id = storage.create_new_study(study_name)
/usr/local/lib/python3.7/dist-packages/optuna/storages/__init__.py in get_storage(storage)
29 return RedisStorage(storage)
30 else:
---> 31 return _CachedStorage(RDBStorage(storage))
32 elif isinstance(storage, RDBStorage):
33 return _CachedStorage(storage)
/usr/local/lib/python3.7/dist-packages/optuna/storages/_rdb/storage.py in __init__(self, url, engine_kwargs, skip_compatibility_check, heartbeat_interval, grace_period, failed_trial_callback)
173
174 try:
--> 175 self.engine = create_engine(self.url, **self.engine_kwargs)
176 except ImportError as e:
177 raise ImportError(
<string> in create_engine(url, **kwargs)
/usr/local/lib/python3.7/dist-packages/sqlalchemy/util/deprecations.py in warned(fn, *args, **kwargs)
307 stacklevel=3,
308 )
--> 309 return fn(*args, **kwargs)
310
311 doc = fn.__doc__ is not None and fn.__doc__ or ""
/usr/local/lib/python3.7/dist-packages/sqlalchemy/engine/create.py in create_engine(url, **kwargs)
528
529 # create url.URL object
--> 530 u = _url.make_url(url)
531
532 u, plugins, kwargs = u._instantiate_plugins(kwargs)
/usr/local/lib/python3.7/dist-packages/sqlalchemy/engine/url.py in make_url(name_or_url)
713
714 if isinstance(name_or_url, util.string_types):
--> 715 return _parse_rfc1738_args(name_or_url)
716 else:
717 return name_or_url
/usr/local/lib/python3.7/dist-packages/sqlalchemy/engine/url.py in _parse_rfc1738_args(name)
775 else:
776 raise exc.ArgumentError(
--> 777 "Could not parse rfc1738 URL from string '%s'" % name
778 )
779
ArgumentError: Could not parse rfc1738 URL from string 'gdrive/My Drive/chapter_classification/output/sqlite:///opt1.db'
So according to the official Documentation of create_study
When a database URL is passed, Optuna internally uses SQLAlchemy to handle the database. Please refer to SQLAlchemy’s document for further details. If you want to specify non-default options to SQLAlchemy Engine, you can instantiate RDBStorage with your desired options and pass it to the storage argument instead of a URL.
And when you visit the documentation of SQLAlchemy, you find that a database URL must begin with the dialect prefix (e.g. sqlite:///), followed by the path to the database file — the prefix cannot appear in the middle of the path.
So all you have to do is to change
storage=f"gdrive/My Drive/chapter_classification/output/sqlite:///{name}.db"
to a properly formed URL, with the sqlite:/// prefix first:
storage = f"sqlite:///gdrive/My Drive/chapter_classification/output/{name}.db"

Wahoo TICKR X .fit file reading/parsing and analysis in Python

Not sure if I can post a question like this here so please redirect me if I'm in the wrong place.
I've bought a Wahoo TICKR X to monitor my heart rate during exercise. Also I would like to get more familiar with Python, so I decided I would like to do the analysis of my heart rate myself in Python instead of in the Wahoo app. I thought this would also give more freedom in the choice of visualization, testing etc.
I've recorded my heart rate for 5 minutes or so and exported the .fit file. However I can't even find a suitable library to read the .fit file. Can anyone recommend a library that works with .fit file from wahoo?
I'm using ubuntu, anaconda, python 3.7
import pyfits
# Load the FITS file into the program
hdulist = pyfits.open('/home/bradmin/Downloads/2020-03-26.fit')
# Load table data as tbdata
tbdata = hdulist[1].data
OSError Traceback (most recent call last)
<ipython-input-3-a970e2cd9dee> in <module>
2
3 # Load the FITS file into the program
----> 4 hdulist = pyfits.open('/home/bradmin/Downloads/2020-03-26.fit')
5
6 # Load table data as tbdata
~/anaconda3/lib/python3.7/site-packages/pyfits/hdu/hdulist.py in fitsopen(name, mode, memmap, save_backup, **kwargs)
122 raise ValueError('Empty filename: %s' % repr(name))
123
--> 124 return HDUList.fromfile(name, mode, memmap, save_backup, **kwargs)
125
126
~/anaconda3/lib/python3.7/site-packages/pyfits/hdu/hdulist.py in fromfile(cls, fileobj, mode, memmap, save_backup, **kwargs)
264
265 return cls._readfrom(fileobj=fileobj, mode=mode, memmap=memmap,
--> 266 save_backup=save_backup, **kwargs)
267
268 @classmethod
~/anaconda3/lib/python3.7/site-packages/pyfits/hdu/hdulist.py in _readfrom(cls, fileobj, data, mode, memmap, save_backup, **kwargs)
853 # raise and exception
854 if mode in ('readonly', 'denywrite') and len(hdulist) == 0:
--> 855 raise IOError('Empty or corrupt FITS file')
856
857 # initialize/reset attributes to be used in "update/append" mode
OSError: Empty or corrupt FITS file
link to the file: https://wetransfer.com/downloads/6d054a5d52899aefcb1bcd22bda92ba120200326161849/b9831a
EDIT
I've tried this now but i get an error:
import fitdecode
src_file = "/home/bradmin/Downloads/2020-03-26.fit"
with fitdecode.FitReader(src_file) as fit:
for frame in fit:
# The yielded frame object is of one of the following types:
# * fitdecode.FitHeader
# * fitdecode.FitDefinitionMessage
# * fitdecode.FitDataMessage
# * fitdecode.FitCRC
if isinstance(frame, fitdecode.FitDataMessage):
# Here, frame is a FitDataMessage object.
# A FitDataMessage object contains decoded values that
# are directly usable in your script logic.
print(frame.name)
file_id
developer_data_id
developer_data_id
developer_data_id
developer_data_id
developer_data_id
developer_data_id
developer_data_id
developer_data_id
developer_data_id
developer_data_id
developer_data_id
developer_data_id
field_description
field_description
field_description
field_description
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
<ipython-input-7-e8d95d3087dc> in <module>
2
3 with fitdecode.FitReader(src_file) as fit:
----> 4 for frame in fit:
5 # The yielded frame object is of one of the following types:
6 # * fitdecode.FitHeader
~/anaconda3/lib/python3.7/site-packages/fitdecode/reader.py in __iter__(self)
191
192 def __iter__(self):
--> 193 yield from self._read_next()
194
195 @property
~/anaconda3/lib/python3.7/site-packages/fitdecode/reader.py in _read_next(self)
298 assert self._header
299
--> 300 record = self._read_record()
301 if not record:
302 break
~/anaconda3/lib/python3.7/site-packages/fitdecode/reader.py in _read_record(self)
443 self._add_dev_data_id(message)
444 elif message.mesg_type.mesg_num == profile.MESG_NUM_FIELD_DESCRIPTION:
--> 445 self._add_dev_field_description(message)
446
447 return message
~/anaconda3/lib/python3.7/site-packages/fitdecode/reader.py in _add_dev_field_description(self, message)
780 base_type_id = message.get_field('fit_base_type_id').raw_value
781 field_name = message.get_field('field_name').raw_value
--> 782 units = message.get_field('units').raw_value
783
784 try:
~/anaconda3/lib/python3.7/site-packages/fitdecode/records.py in get_field(self, field_name_or_num, idx)
188 raise KeyError(
189 f'field "{field_name_or_num}" (idx #{idx}) not found in ' +
--> 190 f'message "{self.name}"')
191
192 def get_fields(self, field_name_or_num):
KeyError: 'field "units" (idx #0) not found in message "field_description"'
The format seems to be this FIT format. pyfits is for an entirely different format, it seems.
The article above refers to a gpsbabel tool, which you could use to convert the FIT file to something more interoperable and usable, e.g. GPX (an XML-based format that's easy to parse).
Or, of course, if you want a pure-Python solution, you can port the FIT format reading bits from gpsbabel to Python, or use the fitdecode library.

How can I load sklearn data in Jupyter Python 3?

Hey, I have a very short question. I need to load data for my machine learning course, but it does not work for me and I have no idea why. I'm using Jupyter with Python 3.
My Code:
from sklearn.datasets import fetch_covtype
forest = fetch_covtype()
For my friend it works fine with the same conditions. I already tried to update sklearn with pip install -U scikit-learn, but it did not solve the problem. I hope somebody can help me.
It creates the following error:
UnboundLocalError Traceback (most recent call last)
/opt/conda/lib/python3.7/site-packages/sklearn/datasets/covtype.py in fetch_covtype(data_home, download_if_missing, random_state, shuffle, return_X_y)
126 try:
--> 127 X, y
128 except NameError:
UnboundLocalError: local variable 'X' referenced before assignment
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last)
<ipython-input-9-fb303a92b6ca> in <module>
----> 1 forest =fetch_covtype()
/opt/conda/lib/python3.7/site-packages/sklearn/datasets/covtype.py in fetch_covtype(data_home, download_if_missing, random_state, shuffle, return_X_y)
127 X, y
128 except NameError:
--> 129 X, y = _refresh_cache([samples_path, targets_path], 9)
130 # TODO: Revert to the following two lines in v0.23
131 # X = joblib.load(samples_path)
/opt/conda/lib/python3.7/site-packages/sklearn/datasets/base.py in _refresh_cache(files, compress)
928 msg = "sklearn.externals.joblib is deprecated in 0.21"
929 with warnings.catch_warnings(record=True) as warns:
--> 930 data = tuple([joblib.load(f) for f in files])
931
932 refresh_needed = any([str(x.message).startswith(msg) for x in warns])
/opt/conda/lib/python3.7/site-packages/sklearn/datasets/base.py in <listcomp>(.0)
928 msg = "sklearn.externals.joblib is deprecated in 0.21"
929 with warnings.catch_warnings(record=True) as warns:
--> 930 data = tuple([joblib.load(f) for f in files])
931
932 refresh_needed = any([str(x.message).startswith(msg) for x in warns])
/opt/conda/lib/python3.7/site-packages/joblib/numpy_pickle.py in load(filename, mmap_mode)
603 return load_compatibility(fobj)
604
--> 605 obj = _unpickle(fobj, filename, mmap_mode)
606
607 return obj
/opt/conda/lib/python3.7/site-packages/joblib/numpy_pickle.py in _unpickle(fobj, filename, mmap_mode)
527 obj = None
528 try:
--> 529 obj = unpickler.load()
530 if unpickler.compat_mode:
531 warnings.warn("The file '%s' has been generated with a "
/opt/conda/lib/python3.7/pickle.py in load(self)
1083 raise EOFError
1084 assert isinstance(key, bytes_types)
-> 1085 dispatch[key[0]](self)
1086 except _Stop as stopinst:
1087 return stopinst.value
/opt/conda/lib/python3.7/site-packages/joblib/numpy_pickle.py in load_build(self)
353 if isinstance(array_wrapper, NDArrayWrapper):
354 self.compat_mode = True
--> 355 self.stack.append(array_wrapper.read(self))
356
357 # Be careful to register our new method.
/opt/conda/lib/python3.7/site-packages/joblib/numpy_pickle.py in read(self, unpickler)
196 array = self.read_mmap(unpickler)
197 else:
--> 198 array = self.read_array(unpickler)
199
200 # Manage array subclass case
/opt/conda/lib/python3.7/site-packages/joblib/numpy_pickle.py in read_array(self, unpickler)
147 read_size = int(read_count * self.dtype.itemsize)
148 data = _read_bytes(unpickler.file_handle,
--> 149 read_size, "array data")
150 array[i:i + read_count] = \
151 unpickler.np.frombuffer(data, dtype=self.dtype,
/opt/conda/lib/python3.7/site-packages/joblib/numpy_pickle_utils.py in _read_bytes(fp, size, error_template)
241 if len(data) != size:
242 msg = "EOF: reading %s, expected %d bytes got %d"
--> 243 raise ValueError(msg % (error_template, size, len(data)))
244 else:
245 return data
ValueError: EOF: reading array data, expected 262144 bytes got 209661

zipfile extractall raising "BadZipFile: Bad CRC-32 for file" error

This is the file I am trying to open
https://drive.google.com/file/d/1K2kDBTNXS2ikx9xKmi2Fy0Wsc5u_Lls0/view
It is described here
https://github.com/armancohan/long-summarization
After I added the file to my google drive, this is the code I am trying to use to open it.
from google.colab import drive
drive.mount('/content/gdrive')
import zipfile
zip_ref = zipfile.ZipFile('/content/gdrive/My Drive/arxiv-release.zip', 'r')
zip_ref.extractall('arxiv-release')
zip_ref.close()
This is the error that is raised
---------------------------------------------------------------------------
BadZipFile Traceback (most recent call last)
<ipython-input-9-9965160388a1> in <module>()
1
----> 2 zip_ref.extractall('arxiv-release')
3 zip_ref.close()
5 frames
/usr/lib/python3.6/zipfile.py in extractall(self, path, members, pwd)
1522
1523 for zipinfo in members:
-> 1524 self._extract_member(zipinfo, path, pwd)
1525
1526 @classmethod
/usr/lib/python3.6/zipfile.py in _extract_member(self, member, targetpath, pwd)
1577 with self.open(member, pwd=pwd) as source, \
1578 open(targetpath, "wb") as target:
-> 1579 shutil.copyfileobj(source, target)
1580
1581 return targetpath
/usr/lib/python3.6/shutil.py in copyfileobj(fsrc, fdst, length)
77 """copy data from file-like object fsrc to file-like object fdst"""
78 while 1:
---> 79 buf = fsrc.read(length)
80 if not buf:
81 break
/usr/lib/python3.6/zipfile.py in read(self, n)
870 self._offset = 0
871 while n > 0 and not self._eof:
--> 872 data = self._read1(n)
873 if n < len(data):
874 self._readbuffer = data
/usr/lib/python3.6/zipfile.py in _read1(self, n)
960 if self._left <= 0:
961 self._eof = True
--> 962 self._update_crc(data)
963 return data
964
/usr/lib/python3.6/zipfile.py in _update_crc(self, newdata)
888 # Check the CRC if we're at the end of the file
889 if self._eof and self._running_crc != self._expected_crc:
--> 890 raise BadZipFile("Bad CRC-32 for file %r" % self.name)
891
892 def read1(self, n):
BadZipFile: Bad CRC-32 for file 'arxiv-release/train.txt'

How to resolve unsupported pickle protocol: 4 in python 3.5?

My code:
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import pickle
with open('word2vec_model', 'rb') as handle:
model = pickle.load(handle)
The error that happens:
ValueError
Traceback (most recent call last)
<ipython-input-2-aa1ad84b16ed> in <module>()
18 #if you do NOT have RAM >= 12GB, use the code below.
19 with open('word2vec_model', 'rb') as handle:
---> 20 model = pickle.load(handle)
C:\Users\home pc\Anaconda2\lib\pickle.pyc in load(file)
1382
1383 def load(file):
-> 1384 return Unpickler(file).load()
1385
1386 def loads(str):
C:\Users\home pc\Anaconda2\lib\pickle.pyc in load(self)
862 while 1:
863 key = read(1)
--> 864 dispatch[key](self)
865 except _Stop, stopinst:
866 return stopinst.value
C:\Users\home pc\Anaconda2\lib\pickle.pyc in load_proto(self)
890 proto = ord(self.read(1))
891 if not 0 <= proto <= 2:
--> 892 raise ValueError, "unsupported pickle protocol: %d" % proto
893 dispatch[PROTO] = load_proto
894
ValueError: unsupported pickle protocol: 4

Categories