Python lzma unable to load joblib - python

I have a scikit learn pipeline that I serialize using:
with lzma.open('outputs/baseModel_LR.joblib',"wb") as f:
dill.dump(pipeline, f)
When I try to open the file and load the pipeline using:
with lzma.open('outputs/baseModel_LR.joblib',"rb") as f:
model = dill.load(f)
it gives error:
---------------------------------------------------------------------------
EOFError Traceback (most recent call last)
somePath/notebooks/test.ipynb Cell 5 in <cell line: 1>()
1 with lzma.open('outputs/baseModel_LR.joblib',"rb") as f:
----> 2 model = dill.load(f)
3 model
File /anaconda/envs/azureml_py38/lib/python3.8/site-packages/dill/_dill.py:373, in load(file, ignore, **kwds)
367 def load(file, ignore=None, **kwds):
368 """
369 Unpickle an object from a file.
370
371 See :func:`loads` for keyword arguments.
372 """
--> 373 return Unpickler(file, ignore=ignore, **kwds).load()
File /anaconda/envs/azureml_py38/lib/python3.8/site-packages/dill/_dill.py:646, in Unpickler.load(self)
645 def load(self): #NOTE: if settings change, need to update attributes
--> 646 obj = StockUnpickler.load(self)
647 if type(obj).__module__ == getattr(_main_module, '__name__', '__main__'):
648 if not self._ignore:
649 # point obj class to main
File /anaconda/envs/azureml_py38/lib/python3.8/lzma.py:200, in LZMAFile.read(self, size)
194 """Read up to size uncompressed bytes from the file.
...
100 "end-of-stream marker was reached")
101 else:
102 rawblock = b""
**EOFError: Compressed file ended before the end-of-stream marker was reached**
Has anyone faced this problem and solved it? I use lzma because otherwise the joblib size is 27GB and with lzma its just 20MB

Related

TypeError while pickling a file using joblib

I am working on extended isolation forest for anomaly detection. Gihub link for the alogirthm is present here. I am not able to pickle the model after training it. How can I resolve this error.
iso_forest_model = iso.iForest(X , ntrees=100, sample_size=256, ExtensionLevel=1)
import joblib
joblib.dump(iso_forest_model, os.path.join(models_path,'extended_isolation_forest.pkl'), compress=9)
Error
--------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-36-6b85f8a16cb7> in <module>()
6
7 import joblib
----> 8 joblib.dump(iso_forest_model, os.path.join(models_path,'extended_isolation_forest.pkl'), compress=9)
4 frames
/usr/local/lib/python3.7/dist-packages/joblib/numpy_pickle.py in dump(value, filename, compress, protocol, cache_size)
475 with _write_fileobject(filename, compress=(compress_method,
476 compress_level)) as f:
--> 477 NumpyPickler(f, protocol=protocol).dump(value)
478 elif is_filename:
479 with open(filename, 'wb') as f:
/usr/lib/python3.7/pickle.py in dump(self, obj)
435 if self.proto >= 4:
436 self.framer.start_framing()
--> 437 self.save(obj)
438 self.write(STOP)
439 self.framer.end_framing()
/usr/local/lib/python3.7/dist-packages/joblib/numpy_pickle.py in save(self, obj)
280 return
281
--> 282 return Pickler.save(self, obj)
283
284
/usr/lib/python3.7/pickle.py in save(self, obj, save_persistent_id)
522 reduce = getattr(obj, "__reduce_ex__", None)
523 if reduce is not None:
--> 524 rv = reduce(self.proto)
525 else:
526 reduce = getattr(obj, "__reduce__", None)
/usr/local/lib/python3.7/dist-packages/eif.cpython-37m-x86_64-linux-gnu.so in eif.iForest.__reduce_cython__()
TypeError: no default __reduce__ due to non-trivial __cinit__

Errno 2 No such file or directory:

I am using jupyter notebook (python 3.8 both from anaconda3) and following this post, cells 84 and 85 are resulting in the traceback and followed the advice of
FileNotFoundError Traceback (most recent call last)
<ipython-input-15-9cdebd0bb247> in <module>
2
3
----> 4 create_wordcloud(tw_list["text"].values)
<ipython-input-14-524a73dcd1e0> in create_wordcloud(text)
2
3 def create_wordcloud(text):
----> 4 mask = np.array(Image.open("cloud.png"))
5 stopwords = set(STOPWORDS)
6 wc = WordCloud(background_color="white",
~/opt/anaconda3/lib/python3.8/site-packages/PIL/Image.py in open(fp, mode, formats)
2889
2890 if filename:
-> 2891 fp = builtins.open(filename, "rb")
2892 exclusive_fp = True
2893
FileNotFoundError: [Errno 2] No such file or directory: 'cloud.png'
following this i found advice (the link evades me but its somewhere on this site to change from PIL import image to import PIL.image in cell 2 and add
from IPython.display import Image
Image(filename='cloud.png')
still resulting in a similar, but longer traceback
---------------------------------------------------------------------------
FileNotFoundError Traceback (most recent call last)
<ipython-input-16-8c5d56ae9874> in <module>
1 #Creating wordcloud for all tweets
2 from IPython.display import Image
----> 3 Image(filename='cloud.png')
4
5 create_wordcloud(tw_list["text"].values)
~/opt/anaconda3/lib/python3.8/site-packages/IPython/core/display.py in
__init__(self, data, url, filename, format, embed, width, height, retina,
unconfined, metadata)
1222 self.retina = retina
1223 self.unconfined = unconfined
-> 1224 super(Image, self).__init__(data=data, url=url, filename=filename,
1225 metadata=metadata)
1226
~/opt/anaconda3/lib/python3.8/site-packages/IPython/core/display.py in
__init__(self, data, url, filename, metadata)
628 self.metadata = {}
629
--> 630 self.reload()
631 self._check_data()
632
~/opt/anaconda3/lib/python3.8/site-packages/IPython/core/display.py in
reload(self)
1254 """Reload the raw data from file or URL."""
1255 if self.embed:
-> 1256 super(Image,self).reload()
1257 if self.retina:
1258 self._retina_shape()
~/opt/anaconda3/lib/python3.8/site-packages/IPython/core/display.py in
reload(self)
653 """Reload the raw data from file or URL."""
654 if self.filename is not None:
--> 655 with open(self.filename, self._read_flags) as f:
656 self.data = f.read()
657 elif self.url is not None:
FileNotFoundError: [Errno 2] No such file or directory: 'cloud.png'
which evidently is not the right solution, I am a little out of my depth here and grateful for any help
That means the file does not exist in the directory it is called. You must download their 'cloud.png' and put it in the same file as the jupyter notebook file.
https://github.com/ChilesheChanda/TwitterSentimentAnalysis/blob/master/cloud.png

How to fix unpickling key error when loading word2vec (gensim)?

I am trying to load a pre-trained word2vec model in pkl format taken from here
The line of code I use to load it:
model = gensim.models.KeyedVectors.load('enwiki_20180420_500d.pkl')
However, i keep getting the following error (full traceback):
UnpicklingError Traceback (most recent call last)
<ipython-input-15-ebd5780b6636> in <module>
55
56 #Load pretrained word2vec
---> 57 model = gensim.models.KeyedVectors.load('enwiki_20180420_500d.pkl',mmap='r')
58
~/anaconda3/lib/python3.7/site-packages/gensim/models/keyedvectors.py in load(cls, fname_or_handle, **kwargs)
1551 #classmethod
1552 def load(cls, fname_or_handle, **kwargs):
-> 1553 model = super(WordEmbeddingsKeyedVectors, cls).load(fname_or_handle, **kwargs)
1554 if isinstance(model, FastTextKeyedVectors):
1555 if not hasattr(model, 'compatible_hash'):
~/anaconda3/lib/python3.7/site-packages/gensim/models/keyedvectors.py in load(cls, fname_or_handle, **kwargs)
226 #classmethod
227 def load(cls, fname_or_handle, **kwargs):
--> 228 return super(BaseKeyedVectors, cls).load(fname_or_handle, **kwargs)
229
230 def similarity(self, entity1, entity2):
~/anaconda3/lib/python3.7/site-packages/gensim/utils.py in load(cls, fname, mmap)
433 compress, subname = SaveLoad._adapt_by_suffix(fname)
434
--> 435 obj = unpickle(fname)
436 obj._load_specials(fname, mmap, compress, subname)
437 logger.info("loaded %s", fname)
~/anaconda3/lib/python3.7/site-packages/gensim/utils.py in unpickle(fname)
1396 # Because of loading from S3 load can't be used (missing readline in smart_open)
1397 if sys.version_info > (3, 0):
-> 1398 return _pickle.load(f, encoding='latin1')
1399 else:
1400 return _pickle.loads(f.read())
UnpicklingError: invalid load key, ':'.
I tried loading it with load_word2vec_format, but no luck. Any ideas what might be wrong with it?
Per your link https://wikipedia2vec.github.io/wikipedia2vec/pretrained/ these are to be loaded using that library's Wikipedia2Vec.load() method.
Gensim's .load() methods should only be used with files saved directly from Gensim model objects.
The Wikipedia2Vec project does say that their .txt file formats would load with .load_word2vec_format(), so you could also try that - but with one of their .txt format files.
Their full model .pkl files are only going to work with their class's own loading function.

Wahoo TICKR X .fit file reading/parsing and analysis in Python

Not sure if I can post a question like this here so please redirect me if I'm in the wrong place.
I've bought a Wahoo TICKR X to monitor my heart rate during exercise. Also I would like to get more familiar with python so i decided I would like do do the analyses of my heart rate myself in python instead of in the wahoo app. I thought this would also give more freedom in the choice of visualization, testing etc.
I've recorded my heart rate for 5 minutes or so and exported the .fit file. However I can't even find a suitable library to read the .fit file. Can anyone recommend a library that works with .fit file from wahoo?
I'm using ubuntu, anaconda, python 3.7
import pyfits
# Load the FITS file into the program
hdulist = pyfits.open('/home/bradmin/Downloads/2020-03-26.fit')
# Load table data as tbdata
tbdata = hdulist[1].data
OSError Traceback (most recent call last)
<ipython-input-3-a970e2cd9dee> in <module>
2
3 # Load the FITS file into the program
----> 4 hdulist = pyfits.open('/home/bradmin/Downloads/2020-03-26.fit')
5
6 # Load table data as tbdata
~/anaconda3/lib/python3.7/site-packages/pyfits/hdu/hdulist.py in fitsopen(name, mode, memmap, save_backup, **kwargs)
122 raise ValueError('Empty filename: %s' % repr(name))
123
--> 124 return HDUList.fromfile(name, mode, memmap, save_backup, **kwargs)
125
126
~/anaconda3/lib/python3.7/site-packages/pyfits/hdu/hdulist.py in fromfile(cls, fileobj, mode, memmap, save_backup, **kwargs)
264
265 return cls._readfrom(fileobj=fileobj, mode=mode, memmap=memmap,
--> 266 save_backup=save_backup, **kwargs)
267
268 #classmethod
~/anaconda3/lib/python3.7/site-packages/pyfits/hdu/hdulist.py in _readfrom(cls, fileobj, data, mode, memmap, save_backup, **kwargs)
853 # raise and exception
854 if mode in ('readonly', 'denywrite') and len(hdulist) == 0:
--> 855 raise IOError('Empty or corrupt FITS file')
856
857 # initialize/reset attributes to be used in "update/append" mode
OSError: Empty or corrupt FITS file
link to the file: https://wetransfer.com/downloads/6d054a5d52899aefcb1bcd22bda92ba120200326161849/b9831a
EDIT
I've tried this now but i get an error:
import fitdecode
src_file = "/home/bradmin/Downloads/2020-03-26.fit"
with fitdecode.FitReader(src_file) as fit:
for frame in fit:
# The yielded frame object is of one of the following types:
# * fitdecode.FitHeader
# * fitdecode.FitDefinitionMessage
# * fitdecode.FitDataMessage
# * fitdecode.FitCRC
if isinstance(frame, fitdecode.FitDataMessage):
# Here, frame is a FitDataMessage object.
# A FitDataMessage object contains decoded values that
# are directly usable in your script logic.
print(frame.name)
file_id
developer_data_id
developer_data_id
developer_data_id
developer_data_id
developer_data_id
developer_data_id
developer_data_id
developer_data_id
developer_data_id
developer_data_id
developer_data_id
developer_data_id
field_description
field_description
field_description
field_description
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
<ipython-input-7-e8d95d3087dc> in <module>
2
3 with fitdecode.FitReader(src_file) as fit:
----> 4 for frame in fit:
5 # The yielded frame object is of one of the following types:
6 # * fitdecode.FitHeader
~/anaconda3/lib/python3.7/site-packages/fitdecode/reader.py in __iter__(self)
191
192 def __iter__(self):
--> 193 yield from self._read_next()
194
195 #property
~/anaconda3/lib/python3.7/site-packages/fitdecode/reader.py in _read_next(self)
298 assert self._header
299
--> 300 record = self._read_record()
301 if not record:
302 break
~/anaconda3/lib/python3.7/site-packages/fitdecode/reader.py in _read_record(self)
443 self._add_dev_data_id(message)
444 elif message.mesg_type.mesg_num == profile.MESG_NUM_FIELD_DESCRIPTION:
--> 445 self._add_dev_field_description(message)
446
447 return message
~/anaconda3/lib/python3.7/site-packages/fitdecode/reader.py in _add_dev_field_description(self, message)
780 base_type_id = message.get_field('fit_base_type_id').raw_value
781 field_name = message.get_field('field_name').raw_value
--> 782 units = message.get_field('units').raw_value
783
784 try:
~/anaconda3/lib/python3.7/site-packages/fitdecode/records.py in get_field(self, field_name_or_num, idx)
188 raise KeyError(
189 f'field "{field_name_or_num}" (idx #{idx}) not found in ' +
--> 190 f'message "{self.name}"')
191
192 def get_fields(self, field_name_or_num):
KeyError: 'field "units" (idx #0) not found in message "field_description"'
The format seems to be this FIT format. pyfits is for an entirely different format, it seems.
The article above refers to a gpsbabel tool, which you could use to convert the FIT file to something more interoperable and usable, e.g. GPX (an XML-based format that's easy to parse).
Or, of course, if you want a pure-Python solution, you can port the FIT format reading bits from gpsbabel to Python use the fitdecode library.

pydicom 'Dataset' object has no attribute 'TransferSyntaxUID'

I'm using pydicom 1.0.0a1, downloaded from here, When I run the following code:
ds=pydicom.read_file('./DR/abnormal/abc.dcm',force=True)
ds.pixel_array
this error occurs:
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-6-d4e81d303439> in <module>()
7 ds=pydicom.read_file('./DR/abnormal/abc.dcm',force=True)
8
----> 9 ds.pixel_array
10
/Applications/anaconda/lib/python2.7/site-packages/pydicom-1.0.0a1-py2.7.egg/pydicom/dataset.pyc in __getattr__(self, name)
501 if tag is None: # `name` isn't a DICOM element keyword
502 # Try the base class attribute getter (fix for issue 332)
--> 503 return super(Dataset, self).__getattribute__(name)
504 tag = Tag(tag)
505 if tag not in self: # DICOM DataElement not in the Dataset
/Applications/anaconda/lib/python2.7/site-packages/pydicom-1.0.0a1-py2.7.egg/pydicom/dataset.pyc in pixel_array(self)
1064 The Pixel Data (7FE0,0010) as a NumPy ndarray.
1065 """
-> 1066 return self._get_pixel_array()
1067
1068 # Format strings spec'd according to python string formatting options
/Applications/anaconda/lib/python2.7/site-packages/pydicom-1.0.0a1-py2.7.egg/pydicom/dataset.pyc in _get_pixel_array(self)
1042 elif self._pixel_id != id(self.PixelData):
1043 already_have = False
-> 1044 if not already_have and not self._is_uncompressed_transfer_syntax():
1045 try:
1046 # print("Pixel Data is compressed")
/Applications/anaconda/lib/python2.7/site-packages/pydicom-1.0.0a1-py2.7.egg/pydicom/dataset.pyc in _is_uncompressed_transfer_syntax(self)
662 """Return True if the TransferSyntaxUID is a compressed syntax."""
663 # FIXME uses file_meta here, should really only be thus for FileDataset
--> 664 return self.file_meta.TransferSyntaxUID in NotCompressedPixelTransferSyntaxes
665
666 def __ne__(self, other):
/Applications/anaconda/lib/python2.7/site-packages/pydicom-1.0.0a1-py2.7.egg/pydicom/dataset.pyc in __getattr__(self, name)
505 if tag not in self: # DICOM DataElement not in the Dataset
506 # Try the base class attribute getter (fix for issue 332)
--> 507 return super(Dataset, self).__getattribute__(name)
508 else:
509 return self[tag].value
AttributeError: 'Dataset' object has no attribute 'TransferSyntaxUID'
I read the google group post , and I changed the filereader.py file to the posted file, and I got this error:
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/Applications/anaconda/lib/python2.7/site-packages/pydicom-1.0.0a1-py2.7.egg/pydicom/__init__.py", line 41, in read_file
from pydicom.dicomio import read_file
File "/Applications/anaconda/lib/python2.7/site-packages/pydicom-1.0.0a1-py2.7.egg/pydicom/dicomio.py", line 3, in <module>
from pydicom.filereader import read_file, read_dicomdir
File "/Applications/anaconda/lib/python2.7/site-packages/pydicom-1.0.0a1-py2.7.egg/pydicom/filereader.py", line 35, in <module>
from pydicom.datadict import dictionaryVR
ImportError: cannot import name dictionaryVR
Does anybody know how to solve this problem?
You should set the TransferSyntaxUID after reading the file before trying to get the pixel_array.
import pydicom.uid
ds=pydicom.read_file('./DR/abnormal/abc.dcm',force=True)
ds.file_meta.TransferSyntaxUID = pydicom.uid.ImplicitVRLittleEndian # or whatever is the correct transfer syntax for the file
ds.pixel_array
The correction from the post you referenced was done before some changes in the code to harmonize some naming, so the error is thrown because the current master uses dictionary_VR rather than dictionaryVR. Setting the transfer syntax in user code as above avoids that problem.

Categories