How to fix unpickling key error when loading word2vec (gensim)? - python

I am trying to load a pre-trained word2vec model in .pkl format, taken from here: https://wikipedia2vec.github.io/wikipedia2vec/pretrained/
This is the line of code I use to load it:
model = gensim.models.KeyedVectors.load('enwiki_20180420_500d.pkl')
However, I keep getting the following error (full traceback):
UnpicklingError Traceback (most recent call last)
<ipython-input-15-ebd5780b6636> in <module>
55
56 #Load pretrained word2vec
---> 57 model = gensim.models.KeyedVectors.load('enwiki_20180420_500d.pkl',mmap='r')
58
~/anaconda3/lib/python3.7/site-packages/gensim/models/keyedvectors.py in load(cls, fname_or_handle, **kwargs)
1551 @classmethod
1552 def load(cls, fname_or_handle, **kwargs):
-> 1553 model = super(WordEmbeddingsKeyedVectors, cls).load(fname_or_handle, **kwargs)
1554 if isinstance(model, FastTextKeyedVectors):
1555 if not hasattr(model, 'compatible_hash'):
~/anaconda3/lib/python3.7/site-packages/gensim/models/keyedvectors.py in load(cls, fname_or_handle, **kwargs)
226 @classmethod
227 def load(cls, fname_or_handle, **kwargs):
--> 228 return super(BaseKeyedVectors, cls).load(fname_or_handle, **kwargs)
229
230 def similarity(self, entity1, entity2):
~/anaconda3/lib/python3.7/site-packages/gensim/utils.py in load(cls, fname, mmap)
433 compress, subname = SaveLoad._adapt_by_suffix(fname)
434
--> 435 obj = unpickle(fname)
436 obj._load_specials(fname, mmap, compress, subname)
437 logger.info("loaded %s", fname)
~/anaconda3/lib/python3.7/site-packages/gensim/utils.py in unpickle(fname)
1396 # Because of loading from S3 load can't be used (missing readline in smart_open)
1397 if sys.version_info > (3, 0):
-> 1398 return _pickle.load(f, encoding='latin1')
1399 else:
1400 return _pickle.loads(f.read())
UnpicklingError: invalid load key, ':'.
I tried loading it with load_word2vec_format, but no luck. Any ideas what might be wrong with it?

Per your link https://wikipedia2vec.github.io/wikipedia2vec/pretrained/, these files are meant to be loaded with that library's own Wikipedia2Vec.load() method.
Gensim's .load() methods should only be used with files saved directly from Gensim model objects.
The Wikipedia2Vec project does say that their .txt file formats would load with .load_word2vec_format(), so you could also try that - but with one of their .txt format files.
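For instance, a minimal sketch of that route (assuming one of their .txt downloads, decompressed first; the filename here is hypothetical):
import gensim

# load_word2vec_format expects the plain word2vec text layout that
# the Wikipedia2Vec .txt downloads use
kv = gensim.models.KeyedVectors.load_word2vec_format('enwiki_20180420_500d.txt', binary=False)
print(kv.most_similar('king', topn=3))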
Their full model .pkl files are only going to work with their class's own loading function.
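A minimal sketch of that path as well, assuming the wikipedia2vec package is installed (pip install wikipedia2vec):
from wikipedia2vec import Wikipedia2Vec

wiki2vec = Wikipedia2Vec.load('enwiki_20180420_500d.pkl')
print(wiki2vec.get_word_vector('king'))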


Python lzma unable to load joblib

I have a scikit-learn pipeline that I serialize using:
import lzma
import dill

with lzma.open('outputs/baseModel_LR.joblib', "wb") as f:
    dill.dump(pipeline, f)
When I try to open the file and load the pipeline using:
with lzma.open('outputs/baseModel_LR.joblib', "rb") as f:
    model = dill.load(f)
it gives this error:
---------------------------------------------------------------------------
EOFError Traceback (most recent call last)
somePath/notebooks/test.ipynb Cell 5 in <cell line: 1>()
1 with lzma.open('outputs/baseModel_LR.joblib',"rb") as f:
----> 2 model = dill.load(f)
3 model
File /anaconda/envs/azureml_py38/lib/python3.8/site-packages/dill/_dill.py:373, in load(file, ignore, **kwds)
367 def load(file, ignore=None, **kwds):
368 """
369 Unpickle an object from a file.
370
371 See :func:`loads` for keyword arguments.
372 """
--> 373 return Unpickler(file, ignore=ignore, **kwds).load()
File /anaconda/envs/azureml_py38/lib/python3.8/site-packages/dill/_dill.py:646, in Unpickler.load(self)
645 def load(self): #NOTE: if settings change, need to update attributes
--> 646 obj = StockUnpickler.load(self)
647 if type(obj).__module__ == getattr(_main_module, '__name__', '__main__'):
648 if not self._ignore:
649 # point obj class to main
File /anaconda/envs/azureml_py38/lib/python3.8/lzma.py:200, in LZMAFile.read(self, size)
194 """Read up to size uncompressed bytes from the file.
...
100 "end-of-stream marker was reached")
101 else:
102 rawblock = b""
EOFError: Compressed file ended before the end-of-stream marker was reached
Has anyone faced this problem and solved it? I use lzma because otherwise the joblib file is 27 GB, and with lzma it's just 20 MB.
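One way to narrow this down (a diagnostic sketch, not from the original thread, reusing the path above): decompress the stream without dill involved. If lzma alone hits the same truncated-stream error, the archive itself was cut short when it was written (e.g. the process died before the with block closed the file), and no unpickler will recover it.
import lzma

total = 0
with lzma.open('outputs/baseModel_LR.joblib', 'rb') as f:
    while True:
        chunk = f.read(1 << 20)  # read 1 MiB at a time
        if not chunk:
            break
        total += len(chunk)
print('decompressed', total, 'bytes without error')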

Extracting Data from a DICOMDIR file using Pydicom

I'm unable to read in a DICOM file as I usually would; it fails with the error:
AttributeError: 'DicomDir' object has no attribute 'DirectoryRecordSequence'
I've tried:
pydicom.fileset.FileSet
using specific tags with dcmread
pydicom.filereader.read_dicomdir
pydicom.filereader.read_partial
using force=True in dcmread
pydicom.filereader.read_file_meta_info is about the only thing that hasn't returned an error; it yields:
(0002, 0000) File Meta Information Group Length UL: 172
(0002, 0001) File Meta Information Version OB: b'\x00\x01'
(0002, 0002) Media Storage SOP Class UID UI: Media Storage Directory Storage
(0002, 0003) Media Storage SOP Instance UID UI: 2.25.330614241706723499239981063503184149269
(0002, 0010) Transfer Syntax UID UI: Explicit VR Little Endian
(0002, 0012) Implementation Class UID UI: 1.3.6.1.4.1.30071.8
(0002, 0013) Implementation Version Name SH: 'fo-dicom 4.0.7'
Moreover, the image is supposed to be a regular DICOM file, not a DICOMDIR. I can open the file in ImageJ and view its header information there, so I know the data is recoverable.
Is there a way for me to read this file in Python, or alternatively to force it to ignore looking for DirectoryRecordSequence?
Edit:
Code and stacktrace from using FileSet:
from pydicom.fileset import FileSet
fs = FileSet("unprocessed.dcm")
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-4-2b6ba2e435fe> in <module>
1 from pydicom.fileset import FileSet
----> 2 fs = FileSet("unprocessed.dcm")
c:\****\appdata\local\programs\python\python38-32\lib\site-packages\pydicom\fileset.py in __init__(self, ds)
998 # Check the DICOMDIR dataset and create the record tree
999 if ds:
-> 1000 self.load(ds)
1001 else:
1002 # New File-set
c:\****\appdata\local\programs\python\python38-32\lib\site-packages\pydicom\fileset.py in load(self, ds_or_path, include_orphans, raise_orphans)
1641 ds = ds_or_path
1642 else:
-> 1643 ds = dcmread(ds_or_path)
1644
1645 sop_class = ds.file_meta.get("MediaStorageSOPClassUID", None)
c:\****\appdata\local\programs\python\python38-32\lib\site-packages\pydicom\filereader.py in dcmread(fp, defer_size, stop_before_pixels, force, specific_tags)
1027 stop_when = _at_pixel_data
1028 try:
-> 1029 dataset = read_partial(
1030 fp,
1031 stop_when,
c:\****\appdata\local\programs\python\python38-32\lib\site-packages\pydicom\filereader.py in read_partial(fileobj, stop_when, defer_size, force, specific_tags)
879 DeprecationWarning
880 )
--> 881 ds = DicomDir(
882 fileobj,
883 dataset,
c:\****\appdata\local\programs\python\python38-32\lib\site-packages\pydicom\dicomdir.py in __init__(self, filename_or_obj, dataset, preamble, file_meta, is_implicit_VR, is_little_endian)
94
95 self.patient_records: List[Dataset] = []
---> 96 self.parse_records()
97
98 def parse_records(self) -> None:
c:\****\appdata\local\programs\python\python38-32\lib\site-packages\pydicom\dicomdir.py in parse_records(self)
125
126 # Build the mapping from file offsets to records
--> 127 records = self.DirectoryRecordSequence
128 if not records:
129 return
c:\****\appdata\local\programs\python\python38-32\lib\site-packages\pydicom\dataset.py in __getattr__(self, name)
834 return {}
835 # Try the base class attribute getter (fix for issue 332)
--> 836 return object.__getattribute__(self, name)
837
838 @property
AttributeError: 'DicomDir' object has no attribute 'DirectoryRecordSequence'
pydicom reads the dataset correctly, but because it identifies as Media Storage Directory it gets processed by the deprecated DicomDir class, even when passed directly to the FileSet class. Because the dataset isn't a valid Media Storage Directory instance this fails, producing the exception seen.
You should be able to fix this by changing the file meta information's (0002,0002) Media Storage SOP Class UID during read:
from pydicom import dcmread
from pydicom import config

def fix_sop_class(elem, **kwargs):
    if elem.tag == 0x00020002:
        # DigitalXRayImageStorageForProcessing
        elem = elem._replace(value=b"1.2.840.10008.5.1.4.1.1.1.1.1")
    return elem

config.data_element_callback = fix_sop_class
ds = dcmread('path/to/file')
By changing the SOP Class UID during the read, the DicomDir processing is skipped and a normal dataset is returned.
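One caveat worth adding (my note, not part of the original answer): config.data_element_callback is module-level state, so it applies to every subsequent read. Clearing it afterwards keeps the UID rewrite from leaking into unrelated files:
ds = dcmread('unprocessed.dcm')  # the filename from the question
config.data_element_callback = None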

Wahoo TICKR X .fit file reading/parsing and analysis in Python

Not sure if I can post a question like this here, so please redirect me if I'm in the wrong place.
I've bought a Wahoo TICKR X to monitor my heart rate during exercise. I would also like to get more familiar with Python, so I decided to do the analysis of my heart rate myself in Python instead of in the Wahoo app. I thought this would also give me more freedom in the choice of visualization, testing, etc.
I've recorded my heart rate for 5 minutes or so and exported the .fit file. However, I can't even find a suitable library to read the .fit file. Can anyone recommend a library that works with .fit files from Wahoo?
I'm using Ubuntu, Anaconda, Python 3.7.
import pyfits
# Load the FITS file into the program
hdulist = pyfits.open('/home/bradmin/Downloads/2020-03-26.fit')
# Load table data as tbdata
tbdata = hdulist[1].data
OSError Traceback (most recent call last)
<ipython-input-3-a970e2cd9dee> in <module>
2
3 # Load the FITS file into the program
----> 4 hdulist = pyfits.open('/home/bradmin/Downloads/2020-03-26.fit')
5
6 # Load table data as tbdata
~/anaconda3/lib/python3.7/site-packages/pyfits/hdu/hdulist.py in fitsopen(name, mode, memmap, save_backup, **kwargs)
122 raise ValueError('Empty filename: %s' % repr(name))
123
--> 124 return HDUList.fromfile(name, mode, memmap, save_backup, **kwargs)
125
126
~/anaconda3/lib/python3.7/site-packages/pyfits/hdu/hdulist.py in fromfile(cls, fileobj, mode, memmap, save_backup, **kwargs)
264
265 return cls._readfrom(fileobj=fileobj, mode=mode, memmap=memmap,
--> 266 save_backup=save_backup, **kwargs)
267
268 @classmethod
~/anaconda3/lib/python3.7/site-packages/pyfits/hdu/hdulist.py in _readfrom(cls, fileobj, data, mode, memmap, save_backup, **kwargs)
853 # raise and exception
854 if mode in ('readonly', 'denywrite') and len(hdulist) == 0:
--> 855 raise IOError('Empty or corrupt FITS file')
856
857 # initialize/reset attributes to be used in "update/append" mode
OSError: Empty or corrupt FITS file
link to the file: https://wetransfer.com/downloads/6d054a5d52899aefcb1bcd22bda92ba120200326161849/b9831a
EDIT
I've tried this now but I get an error:
import fitdecode

src_file = "/home/bradmin/Downloads/2020-03-26.fit"

with fitdecode.FitReader(src_file) as fit:
    for frame in fit:
        # The yielded frame object is of one of the following types:
        # * fitdecode.FitHeader
        # * fitdecode.FitDefinitionMessage
        # * fitdecode.FitDataMessage
        # * fitdecode.FitCRC
        if isinstance(frame, fitdecode.FitDataMessage):
            # Here, frame is a FitDataMessage object.
            # A FitDataMessage object contains decoded values that
            # are directly usable in your script logic.
            print(frame.name)
file_id
developer_data_id
developer_data_id
developer_data_id
developer_data_id
developer_data_id
developer_data_id
developer_data_id
developer_data_id
developer_data_id
developer_data_id
developer_data_id
developer_data_id
field_description
field_description
field_description
field_description
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
<ipython-input-7-e8d95d3087dc> in <module>
2
3 with fitdecode.FitReader(src_file) as fit:
----> 4 for frame in fit:
5 # The yielded frame object is of one of the following types:
6 # * fitdecode.FitHeader
~/anaconda3/lib/python3.7/site-packages/fitdecode/reader.py in __iter__(self)
191
192 def __iter__(self):
--> 193 yield from self._read_next()
194
195 @property
~/anaconda3/lib/python3.7/site-packages/fitdecode/reader.py in _read_next(self)
298 assert self._header
299
--> 300 record = self._read_record()
301 if not record:
302 break
~/anaconda3/lib/python3.7/site-packages/fitdecode/reader.py in _read_record(self)
443 self._add_dev_data_id(message)
444 elif message.mesg_type.mesg_num == profile.MESG_NUM_FIELD_DESCRIPTION:
--> 445 self._add_dev_field_description(message)
446
447 return message
~/anaconda3/lib/python3.7/site-packages/fitdecode/reader.py in _add_dev_field_description(self, message)
780 base_type_id = message.get_field('fit_base_type_id').raw_value
781 field_name = message.get_field('field_name').raw_value
--> 782 units = message.get_field('units').raw_value
783
784 try:
~/anaconda3/lib/python3.7/site-packages/fitdecode/records.py in get_field(self, field_name_or_num, idx)
188 raise KeyError(
189 f'field "{field_name_or_num}" (idx #{idx}) not found in ' +
--> 190 f'message "{self.name}"')
191
192 def get_fields(self, field_name_or_num):
KeyError: 'field "units" (idx #0) not found in message "field_description"'
The format seems to be the FIT (Flexible and Interoperable Data Transfer) format used by fitness devices; pyfits reads FITS astronomy files, an entirely different format.
The article above refers to the gpsbabel tool, which you could use to convert the FIT file to something more interoperable, e.g. GPX (an XML-based format that's easy to parse).
Or, of course, if you want a pure-Python solution, you can port the FIT-reading bits from gpsbabel to Python, or use the fitdecode library.
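For instance, a sketch of pulling heart-rate samples out of the file with fitdecode (assuming the recording contains the standard record messages defined by the FIT profile):
import fitdecode

heart_rates = []
with fitdecode.FitReader('/home/bradmin/Downloads/2020-03-26.fit') as fit:
    for frame in fit:
        # 'record' data messages carry the per-sample sensor readings
        if isinstance(frame, fitdecode.FitDataMessage) and frame.name == 'record':
            if frame.has_field('heart_rate'):
                heart_rates.append(frame.get_value('heart_rate'))

print(heart_rates[:10])
If the KeyError shown above persists, it comes from the file's developer field descriptions omitting a units entry; checking whether a newer fitdecode release handles that case is a reasonable first step.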

I'm Trying to Get Keras to Load vgg16.h5 Weights Locally Instead of Downloading

I've tried modifying this code several different ways, from pointing the last line to my local vgg16.h5 file on disk, to importing load_weights from Keras and trying to get it to grab the weights that way instead.
This code is from lesson 1 of the fast.ai course. I've asked on their forum but got no response.
The files running this are in this link.
https://github.com/fastai/courses/tree/master/deeplearning1/nbs
lesson1.ipynb calls on the file vgg16.py to download the weights.
The code below starts at line 117 in the vgg16.py file.
def create(self):
    """
    Creates the VGG16 network architecture and loads the pretrained weights.

    Args: None
    Returns: None
    """
    model = self.model = Sequential()
    model.add(Lambda(vgg_preprocess, input_shape=(3,224,224), output_shape=(3,224,224)))

    self.ConvBlock(2, 64)
    self.ConvBlock(2, 128)
    self.ConvBlock(3, 256)
    self.ConvBlock(3, 512)
    self.ConvBlock(3, 512)

    model.add(Flatten())
    self.FCBlock()
    self.FCBlock()
    model.add(Dense(1000, activation='softmax'))

    fname = 'vgg16.h5'
    model.load_weights(get_file(fname, self.FILE_PATH+fname, cache_subdir='models'))
The code above is the out-of-the-box version that downloads the weights.
When I change that last line and get rid of everything in the parentheses except for fname, like this...
fname = 'vgg16.h5'
model.load_weights(fname)
I get the error below.
---------------------------------------------------------------------------
InternalError Traceback (most recent call last)
<ipython-input-7-2b6861506a11> in <module>()
----> 1 vgg = Vgg16()
2 # Grab a few images at a time for training and validation.
3 # NB: They must be in subdirectories named based on their category
4 batches = vgg.get_batches(path+'train', batch_size=batch_size)
5 val_batches = vgg.get_batches(path+'valid', batch_size=batch_size*2)
/home/eagle/fastai/courses-master/deeplearning1/nbs/vgg16.pyc in __init__(self)
45 def __init__(self):
46 self.FILE_PATH = 'http://files.fast.ai/models/'
---> 47 self.create()
48 self.get_classes()
49
/home/eagle/fastai/courses-master/deeplearning1/nbs/vgg16.pyc in create(self)
137
138 fname = 'vgg16.h5'
--> 139 model.load_weights(fname)
140
141
/home/eagle/anaconda3/envs/les1/lib/python2.7/site-packages/Keras-1.2.2-py2.7.egg/keras/engine/topology.pyc in load_weights(self, filepath, by_name)
2706 self.load_weights_from_hdf5_group_by_name(f)
2707 else:
-> 2708 self.load_weights_from_hdf5_group(f)
2709
2710 if hasattr(f, 'close'):
/home/eagle/anaconda3/envs/les1/lib/python2.7/site-packages/Keras-1.2.2-py2.7.egg/keras/engine/topology.pyc in load_weights_from_hdf5_group(self, f)
2792 weight_values[0] = w
2793 weight_value_tuples += zip(symbolic_weights, weight_values)
-> 2794 K.batch_set_value(weight_value_tuples)
2795
2796 def load_weights_from_hdf5_group_by_name(self, f):
/home/eagle/anaconda3/envs/les1/lib/python2.7/site-packages/Keras-1.2.2-py2.7.egg/keras/backend/tensorflow_backend.pyc in batch_set_value(tuples)
1879 assign_ops.append(assign_op)
1880 feed_dict[assign_placeholder] = value
-> 1881 get_session().run(assign_ops, feed_dict=feed_dict)
1882
1883
/home/eagle/anaconda3/envs/les1/lib/python2.7/site-packages/Keras-1.2.2-py2.7.egg/keras/backend/tensorflow_backend.pyc in get_session()
120 config = tf.ConfigProto(intra_op_parallelism_threads=nb_thread,
121 allow_soft_placement=True)
--> 122 _SESSION = tf.Session(config=config)
123 session = _SESSION
124 if not _MANUAL_VAR_INIT:
/home/eagle/anaconda3/envs/les1/lib/python2.7/site-packages/tensorflow/python/client/session.pyc in __init__(self, target, graph, config)
1191
1192 """
-> 1193 super(Session, self).__init__(target, graph, config=config)
1194 # NOTE(mrry): Create these on first `__enter__` to avoid a reference cycle.
1195 self._default_graph_context_manager = None
/home/eagle/anaconda3/envs/les1/lib/python2.7/site-packages/tensorflow/python/client/session.pyc in __init__(self, target, graph, config)
552 try:
553 with errors.raise_exception_on_not_ok_status() as status:
--> 554 self._session = tf_session.TF_NewDeprecatedSession(opts, status)
555 finally:
556 tf_session.TF_DeleteSessionOptions(opts)
/home/eagle/anaconda3/envs/les1/lib/python2.7/contextlib.pyc in __exit__(self, type, value, traceback)
22 if type is None:
23 try:
---> 24 self.gen.next()
25 except StopIteration:
26 return
/home/eagle/anaconda3/envs/les1/lib/python2.7/site-packages/tensorflow/python/framework/errors_impl.pyc in raise_exception_on_not_ok_status()
464 None, None,
465 compat.as_text(pywrap_tensorflow.TF_Message(status)),
--> 466 pywrap_tensorflow.TF_GetCode(status))
467 finally:
468 pywrap_tensorflow.TF_DeleteStatus(status)
InternalError: Failed to create session.
I found the folder where Keras stores (or would store) this weights file and dropped mine in there with the following Terminal command:
mv /home/mine/fastai/courses-master/deeplearning1/nbs/vgg16.h5 ~/.keras/models/vgg16.h5
The first path is the path to my fully downloaded weights .h5 file. The second path is where I put said weights, and the path Keras looks at to find them.
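That works because of how the original line's get_file() call behaves (a sketch of the mechanism, reusing the names from the vgg16.py snippet above): it checks ~/.keras/<cache_subdir>/ first and only downloads on a cache miss, so the unmodified code now resolves to the local copy.
from keras.utils.data_utils import get_file  # Keras 1.x location

# With vgg16.h5 already sitting in ~/.keras/models/, this returns the
# cached path without touching the network.
path = get_file('vgg16.h5', 'http://files.fast.ai/models/vgg16.h5',
                cache_subdir='models')
print(path)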
One of the possible ways to load the weights locally is as follows:
vgg = vgg16.VGG16(weights=<path_to_weights_file>)
This works fine, and there is no need to modify the vgg16.py file at all.
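Spelled out (a sketch assuming a Keras 2.x keras.applications API, where weights may be a file path, rather than the fast.ai vgg16.py wrapper; the path is hypothetical):
from keras.applications import vgg16

model = vgg16.VGG16(weights='/home/mine/models/vgg16.h5')
model.summary()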
Copying the .h5 file to .keras/models/ and modifying the vgg16.py at line 30 to
WEIGHTS_PATH_NO_TOP = ('.keras/models/vgg16_weights_tf_dim_ordering_tf_kernels_notop.h5') seems to work fine
The weights cache path on different systems:
Linux: ~/.keras/models/
Windows: the .keras/models/ folder under your Python user settings directory
Anaconda on Windows: D:\Anaconda3\Lib\site-packages\tensorflow\contrib\keras\api\keras\applications\vgg16
Putting the downloaded .h5 file into one of these local folders seems to work.

Gensim: how to load pretrained doc2vec model?

I'm trying to read my pretrained doc2vec model:
from gensim.models import Doc2Vec
model = Doc2Vec.load('/path/to/pretrained/model')
However, an error appears during the reading process. Could anyone suggest how to deal with this? Here is the error:
AttributeError Traceback (most recent call last)
<ipython-input-9-819b254ac835> in <module>()
----> 1 model = Doc2Vec.load('/path/to/pretrained/model')
/opt/jupyter-notebook/.local/lib/python2.7/site-packages/gensim/models/word2vec.pyc in load(cls, *args, **kwargs)
1682 @classmethod
1683 def load(cls, *args, **kwargs):
-> 1684 model = super(Word2Vec, cls).load(*args, **kwargs)
1685 # update older models
1686 if hasattr(model, 'table'):
/opt/jupyter-notebook/.local/lib/python2.7/site-packages/gensim/utils.pyc in load(cls, fname, mmap)
246 compress, subname = SaveLoad._adapt_by_suffix(fname)
247
--> 248 obj = unpickle(fname)
249 obj._load_specials(fname, mmap, compress, subname)
250 return obj
/opt/jupyter-notebook/.local/lib/python2.7/site-packages/gensim/utils.pyc in unpickle(fname)
909 with smart_open(fname) as f:
910 # Because of loading from S3 load can't be used (missing readline in smart_open)
--> 911 return _pickle.loads(f.read())
912
913
AttributeError: 'module' object has no attribute 'defaultdict'
As noted in the comments to the question, this was likely related to an issue in gensim that was fixed in the 0.13.4 release.
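So the usual fix is simply to upgrade gensim and retry the load (a sketch; the model path is the placeholder from the question):
# pip install --upgrade "gensim>=0.13.4"
from gensim.models import Doc2Vec

model = Doc2Vec.load('/path/to/pretrained/model')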
