Gensim: how to load pretrained doc2vec model? - python

I'm trying to read my pretrained doc2vec model:
from gensim.models import Doc2Vec
model = Doc2Vec.load('/path/to/pretrained/model')
However, an error is raised while reading it. Could anyone suggest how to deal with this? Here is the error:
AttributeErrorTraceback (most recent call last)
<ipython-input-9-819b254ac835> in <module>()
----> 1 model = Doc2Vec.load('/path/to/pretrained/model')
/opt/jupyter-notebook/.local/lib/python2.7/site-packages/gensim/models/word2vec.pyc in load(cls, *args, **kwargs)
1682 @classmethod
1683 def load(cls, *args, **kwargs):
-> 1684 model = super(Word2Vec, cls).load(*args, **kwargs)
1685 # update older models
1686 if hasattr(model, 'table'):
/opt/jupyter-notebook/.local/lib/python2.7/site-packages/gensim/utils.pyc in load(cls, fname, mmap)
246 compress, subname = SaveLoad._adapt_by_suffix(fname)
247
--> 248 obj = unpickle(fname)
249 obj._load_specials(fname, mmap, compress, subname)
250 return obj
/opt/jupyter-notebook/.local/lib/python2.7/site-packages/gensim/utils.pyc in unpickle(fname)
909 with smart_open(fname) as f:
910 # Because of loading from S3 load can't be used (missing readline in smart_open)
--> 911 return _pickle.loads(f.read())
912
913
AttributeError: 'module' object has no attribute 'defaultdict'

As noted in the comments to the question, this was likely related to an issue in gensim that was fixed in the 0.13.4 release.
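If that is the cause, upgrading gensim and re-loading should be enough. A minimal sketch, assuming the saved model file itself is intact (the path is the placeholder from the question):
# first upgrade in a shell or notebook cell: pip install --upgrade gensim
from gensim.models import Doc2Vec
model = Doc2Vec.load('/path/to/pretrained/model')  # should unpickle cleanly on gensim >= 0.13.4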

Related

How to solve "RuntimeError: 'len' is not supported in symbolic tracing by default" for vision transformers?

I am trying to create a feature extractor using torchvision.models.feature_extraction.create_feature_extractor.
The model I am trying to use is from vit_pytorch (link: https://github.com/lucidrains/vit-pytorch). The problem I face is that when I create a model from this library:
from vit_pytorch import ViT
from torchvision.models.feature_extraction import create_feature_extractor
model = ViT(image_size=28,
            patch_size=7,
            num_classes=10,
            dim=16,
            depth=6,
            heads=16,
            mlp_dim=256,
            dropout=0.1,
            emb_dropout=0.1,
            channels=1)
random_layer_name = 'transformer.layers.1.1.fn.net.4'
feature_extractor = create_feature_extractor(model,
                                             return_nodes=random_layer_name)
and then call create_feature_extractor() on it, I always get this error:
RuntimeError Traceback (most recent call last)
Cell In[17], line 2
1 # torch.fx.wrap('len')
----> 2 feature_extractor = create_feature_extractor(model,
3 return_nodes=['transformer.layers.1.1.fn.net.4'])
File ~/Mokslas/AI/venv/lib/python3.10/site-packages/torchvision/models/feature_extraction.py:485, in create_feature_extractor(model, return_nodes, train_return_nodes, eval_return_nodes, tracer_kwargs, suppress_diff_warning)
483 # Instantiate our NodePathTracer and use that to trace the model
484 tracer = NodePathTracer(**tracer_kwargs)
--> 485 graph = tracer.trace(model)
487 name = model.__class__.__name__ if isinstance(model, nn.Module) else model.__name__
488 graph_module = fx.GraphModule(tracer.root, graph, name)
File ~/Mokslas/AI/venv/lib/python3.10/site-packages/torch/fx/_symbolic_trace.py:756, in Tracer.trace(self, root, concrete_args)
749 for module in self._autowrap_search:
750 _autowrap_check(
751 patcher, module.__dict__, self._autowrap_function_ids
752 )
753 self.create_node(
754 "output",
755 "output",
--> 756 (self.create_arg(fn(*args)),),
757 {},
758 type_expr=fn.__annotations__.get("return", None),
759 )
761 self.submodule_paths = None
762 finally:
File ~/Mokslas/AI/venv/lib/python3.10/site-packages/vit_pytorch/vit.py:115, in ViT.forward(self, img)
114 def forward(self, img):
--> 115 x = self.to_patch_embedding(img)
116 b, n, _ = x.shape
118 cls_tokens = repeat(self.cls_token, '1 1 d -> b 1 d', b = b)
File ~/Mokslas/AI/venv/lib/python3.10/site-packages/torch/fx/_symbolic_trace.py:734, in Tracer.trace.<locals>.module_call_wrapper(mod, *args, **kwargs)
727 return _orig_module_call(mod, *args, **kwargs)
729 _autowrap_check(
730 patcher,
731 getattr(getattr(mod, "forward", mod), "__globals__", {}),
732 self._autowrap_function_ids,
733 )
--> 734 return self.call_module(mod, forward, args, kwargs)
File ~/Mokslas/AI/venv/lib/python3.10/site-packages/torchvision/models/feature_extraction.py:83, in NodePathTracer.call_module(self, m, forward, args, kwargs)
...
--> 396 raise RuntimeError("'len' is not supported in symbolic tracing by default. If you want "
397 "this call to be recorded, please call torch.fx.wrap('len') at "
398 "module scope")
RuntimeError: 'len' is not supported in symbolic tracing by default. If you want this call to be recorded, please call torch.fx.wrap('len') at module scope
It doesn't matter which model I choose from that library, or which layer or layers I choose to output, I always get the same error.
I have tried adding torch.fx.wrap('len'), but the same problem persisted. I know I could work around it with the hook methods, but is there a way to solve this so that I can still use the create_feature_extractor() functionality?
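For reference, here is a minimal sketch of the hook-based workaround mentioned above, not a fix for create_feature_extractor itself. It assumes a recent PyTorch where nn.Module.get_submodule is available and reuses the layer name and model configuration from the question:
import torch
from vit_pytorch import ViT

model = ViT(image_size=28, patch_size=7, num_classes=10, dim=16, depth=6,
            heads=16, mlp_dim=256, dropout=0.1, emb_dropout=0.1, channels=1)

features = {}

def save_output(name):
    # the hook stores the layer's output under `name` on every forward pass
    def hook(module, inputs, output):
        features[name] = output.detach()
    return hook

layer_name = 'transformer.layers.1.1.fn.net.4'
# get_submodule resolves the dotted path to the actual submodule
model.get_submodule(layer_name).register_forward_hook(save_output(layer_name))

with torch.no_grad():
    _ = model(torch.randn(1, 1, 28, 28))  # channels=1, image_size=28
print(features[layer_name].shape)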

Raytune is throwing error: "module 'pickle' has no attribute 'PickleBuffer'" when attempting hyperparameter search

I am more or less following this example to integrate the ray tune hyperparameter library with the huggingface transformers library using my own dataset.
Here is my script:
import ray
from ray import tune
from ray.tune import CLIReporter
from ray.tune.examples.pbt_transformers.utils import download_data, \
    build_compute_metrics_fn
from ray.tune.schedulers import PopulationBasedTraining
from transformers import glue_tasks_num_labels, AutoConfig, \
    AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments, \
    ElectraForSequenceClassification

def get_model():
    # tokenizer = AutoTokenizer.from_pretrained(model_name, additional_special_tokens=['[CHARACTER]'])
    model = ElectraForSequenceClassification.from_pretrained('google/electra-small-discriminator', num_labels=2)
    model.resize_token_embeddings(len(tokenizer))
    return model
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }
training_args = TrainingArguments(
    "electra_hp_tune",
    report_to="wandb",
    learning_rate=2e-5,              # config
    do_train=True,
    do_eval=True,
    evaluation_strategy="epoch",
    load_best_model_at_end=True,
    num_train_epochs=2,              # config
    per_device_train_batch_size=16,  # config
    per_device_eval_batch_size=16,   # config
    warmup_steps=0,
    weight_decay=0.1,                # config
    logging_dir="./logs",
)

trainer = Trainer(
    model_init=get_model,
    args=training_args,
    train_dataset=chunked_encoded_dataset['train'],
    eval_dataset=chunked_encoded_dataset['validation'],
    compute_metrics=compute_metrics
)
tune_config = {
    "per_device_train_batch_size": 32,
    "per_device_eval_batch_size": 32,
    "num_train_epochs": tune.choice([2, 3, 4, 5])
}

scheduler = PopulationBasedTraining(
    time_attr="training_iteration",
    metric="eval_acc",
    mode="max",
    perturbation_interval=1,
    hyperparam_mutations={
        "weight_decay": tune.uniform(0.0, 0.3),
        "learning_rate": tune.uniform(1e-5, 2.5e-5),
        "per_device_train_batch_size": [16, 32, 64],
    })

reporter = CLIReporter(
    parameter_columns={
        "weight_decay": "w_decay",
        "learning_rate": "lr",
        "per_device_train_batch_size": "train_bs/gpu",
        "num_train_epochs": "num_epochs"
    },
    metric_columns=[
        "eval_f1", "eval_loss", "epoch", "training_iteration"
    ])
from ray.tune.integration.wandb import WandbLogger

trainer.hyperparameter_search(
    hp_space=lambda _: tune_config,
    backend="ray",
    n_trials=10,
    scheduler=scheduler,
    keep_checkpoints_num=1,
    checkpoint_score_attr="training_iteration",
    progress_reporter=reporter,
    name="tune_transformer_gr")
The last function call (to trainer.hyperparameter_search) is when the error is raised. The error message is:
AttributeError: module 'pickle' has no attribute 'PickleBuffer'
And here is the full stack trace:
AttributeError                            Traceback (most recent call last)
<ipython-input> in <module>()
      8     checkpoint_score_attr="training_iteration",
      9     progress_reporter=reporter,
---> 10     name="tune_transformer_gr")

14 frames
/usr/local/lib/python3.7/dist-packages/transformers/trainer.py in hyperparameter_search(self, hp_space, compute_objective, n_trials, direction, backend, hp_name, **kwargs)
   1666
   1667         run_hp_search = run_hp_search_optuna if backend == HPSearchBackend.OPTUNA else run_hp_search_ray
-> 1668         best_run = run_hp_search(self, n_trials, direction, **kwargs)
   1669
   1670         self.hp_search_backend = None

/usr/local/lib/python3.7/dist-packages/transformers/integrations.py in run_hp_search_ray(trainer, n_trials, direction, **kwargs)
    231
    232     analysis = ray.tune.run(
--> 233         ray.tune.with_parameters(_objective, local_trainer=trainer),
    234         config=trainer.hp_space(None),
    235         num_samples=n_trials,

/usr/local/lib/python3.7/dist-packages/ray/tune/utils/trainable.py in with_parameters(trainable, **kwargs)
    294     prefix = f"{str(trainable)}_"
    295     for k, v in kwargs.items():
--> 296         parameter_registry.put(prefix + k, v)
    297
    298     trainable_name = getattr(trainable, "__name__", "tune_with_parameters")

/usr/local/lib/python3.7/dist-packages/ray/tune/registry.py in put(self, k, v)
    160         self.to_flush[k] = v
    161         if ray.is_initialized():
--> 162             self.flush()
    163
    164     def get(self, k):

/usr/local/lib/python3.7/dist-packages/ray/tune/registry.py in flush(self)
    169     def flush(self):
    170         for k, v in self.to_flush.items():
--> 171             self.references[k] = ray.put(v)
    172         self.to_flush.clear()
    173

/usr/local/lib/python3.7/dist-packages/ray/_private/client_mode_hook.py in wrapper(*args, **kwargs)
     45         if client_mode_should_convert():
     46             return getattr(ray, func.__name__)(*args, **kwargs)
---> 47         return func(*args, **kwargs)
     48
     49     return wrapper

/usr/local/lib/python3.7/dist-packages/ray/worker.py in put(value)
   1512     with profiling.profile("ray.put"):
   1513         try:
-> 1514             object_ref = worker.put_object(value)
   1515         except ObjectStoreFullError:
   1516             logger.info(

/usr/local/lib/python3.7/dist-packages/ray/worker.py in put_object(self, value, object_ref)
    259                 "inserting with an ObjectRef")
    260
--> 261         serialized_value = self.get_serialization_context().serialize(value)
    262         # This must be the first place that we construct this python
    263         # ObjectRef because an entry with 0 local references is created when

/usr/local/lib/python3.7/dist-packages/ray/serialization.py in serialize(self, value)
    322             return RawSerializedObject(value)
    323         else:
--> 324             return self._serialize_to_msgpack(value)

/usr/local/lib/python3.7/dist-packages/ray/serialization.py in _serialize_to_msgpack(self, value)
    302             metadata = ray_constants.OBJECT_METADATA_TYPE_PYTHON
    303             pickle5_serialized_object =
--> 304                 self._serialize_to_pickle5(metadata, python_objects)
    305         else:
    306             pickle5_serialized_object = None

/usr/local/lib/python3.7/dist-packages/ray/serialization.py in _serialize_to_pickle5(self, metadata, value)
    262         except Exception as e:
    263             self.get_and_clear_contained_object_refs()
--> 264             raise e
    265         finally:
    266             self.set_out_of_band_serialization()

/usr/local/lib/python3.7/dist-packages/ray/serialization.py in _serialize_to_pickle5(self, metadata, value)
    259             self.set_in_band_serialization()
    260             inband = pickle.dumps(
--> 261                 value, protocol=5, buffer_callback=writer.buffer_callback)
    262         except Exception as e:
    263             self.get_and_clear_contained_object_refs()

/usr/local/lib/python3.7/dist-packages/ray/cloudpickle/cloudpickle_fast.py in dumps(obj, protocol, buffer_callback)
     71             file, protocol=protocol, buffer_callback=buffer_callback
     72         )
---> 73         cp.dump(obj)
     74         return file.getvalue()
     75

/usr/local/lib/python3.7/dist-packages/ray/cloudpickle/cloudpickle_fast.py in dump(self, obj)
    578     def dump(self, obj):
    579         try:
--> 580             return Pickler.dump(self, obj)
    581         except RuntimeError as e:
    582             if "recursion" in e.args[0]:

/usr/local/lib/python3.7/dist-packages/pyarrow/io.pxi in pyarrow.lib.Buffer.__reduce_ex__()

AttributeError: module 'pickle' has no attribute 'PickleBuffer'
My environment set-up:
Am using Google Colab
Platform: Linux-5.4.109+-x86_64-with-Ubuntu-18.04-bionic
Python version: 3.7.10
Transformers version: 4.6.1
ray version: 1.3.0
What I have tried:
Updating pickle
Installed and imported pickle5 as pickle
Made sure that I did not have a python file with the name of 'pickle' in my immediate directory
Where is this bug coming from and how can I resolve it?
I had the same error when trying to use pickle.dump(); for me it worked to downgrade pickle5 from version 0.0.11 to 0.0.10.
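If you want to try the same thing, the downgrade is a one-liner (version numbers taken from the answer above):
pip install pickle5==0.0.10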
I also encountered this error on Google Colab while trying a Ray Tune hyperparameter search with huggingface transformers.
This helped me:
!pip install pickle5
Then
import pickle5 as pickle
After the first run you will get the pickle warning asking you to restart the notebook, along with the same error. After the second "Restart and run all", the Ray Tune hyperparameter search begins.
Not a "real" solution but at least a workaround. For me this issue was occurring on Python 3.7. Switching to Python 3.8 solved the issue.

XGBoostError when loading model from json file

I'm trying to load a trained XGBoost model which has been saved in a json file. I'm using the following code:
import xgboost as xgb

params = {'objective': 'multi:softmax',
          'eval_metric': 'mlogloss',
          'num_class': 10,
          'early_stopping_rounds': 10}

xgb = xgb.XGBClassifier(**params)
xgb.load_model("xgb_default.json")
However, I'm getting an error; I will include it here together with the traceback:
XGBoostError Traceback (most recent call last)
<ipython-input-4-8a9abeb40a78> in <module>
10
11 xgb = xgb.XGBClassifier(**params)
---> 12 xgb.load_model("xgb_default.json")
~\anaconda3\lib\site-packages\xgboost\sklearn.py in load_model(self, fname)
412 if not hasattr(self, '_Booster'):
413 self._Booster = Booster({'n_jobs': self.n_jobs})
--> 414 self._Booster.load_model(fname)
415 meta = self._Booster.attr('scikit_learn')
416 if meta is None:
~\anaconda3\lib\site-packages\xgboost\core.py in load_model(self, fname)
1601 # assume file name, cannot use os.path.exist to check, file can be
1602 # from URL.
-> 1603 _check_call(_LIB.XGBoosterLoadModel(
1604 self.handle, c_str(os_fspath(fname))))
1605 elif isinstance(fname, bytearray):
~\anaconda3\lib\site-packages\xgboost\core.py in _check_call(ret)
186 """
187 if ret != 0:
--> 188 raise XGBoostError(py_str(_LIB.XGBGetLastError()))
189
190
XGBoostError: [11:07:00] C:\Users\Administrator\workspace\xgboost-win64_release_1.2.0\include\xgboost/json.h:65: Invalid cast, from Null to Array
Does anyone know what the issue is here? Thank you in advance!
Support for JSON was introduced in XGBoost 1.3. The path in the error message (xgboost-win64_release_1.2.0) shows the model is being loaded with XGBoost 1.2.0, so upgrading xgboost should resolve the error.
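A minimal sketch of the upgrade-and-reload route, assuming the JSON file was written by XGBoost 1.3 or newer and that you have run pip install --upgrade xgboost first:
import xgboost as xgb  # needs xgboost >= 1.3 to read this JSON model format

clf = xgb.XGBClassifier()
clf.load_model("xgb_default.json")  # restores the booster saved in the JSON file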

How to fix unpickling key error when loading word2vec (gensim)?

I am trying to load a pre-trained word2vec model in .pkl format, taken from here.
The line of code I use to load it:
model = gensim.models.KeyedVectors.load('enwiki_20180420_500d.pkl')
However, I keep getting the following error (full traceback):
UnpicklingError Traceback (most recent call last)
<ipython-input-15-ebd5780b6636> in <module>
55
56 #Load pretrained word2vec
---> 57 model = gensim.models.KeyedVectors.load('enwiki_20180420_500d.pkl',mmap='r')
58
~/anaconda3/lib/python3.7/site-packages/gensim/models/keyedvectors.py in load(cls, fname_or_handle, **kwargs)
1551 @classmethod
1552 def load(cls, fname_or_handle, **kwargs):
-> 1553 model = super(WordEmbeddingsKeyedVectors, cls).load(fname_or_handle, **kwargs)
1554 if isinstance(model, FastTextKeyedVectors):
1555 if not hasattr(model, 'compatible_hash'):
~/anaconda3/lib/python3.7/site-packages/gensim/models/keyedvectors.py in load(cls, fname_or_handle, **kwargs)
226 @classmethod
227 def load(cls, fname_or_handle, **kwargs):
--> 228 return super(BaseKeyedVectors, cls).load(fname_or_handle, **kwargs)
229
230 def similarity(self, entity1, entity2):
~/anaconda3/lib/python3.7/site-packages/gensim/utils.py in load(cls, fname, mmap)
433 compress, subname = SaveLoad._adapt_by_suffix(fname)
434
--> 435 obj = unpickle(fname)
436 obj._load_specials(fname, mmap, compress, subname)
437 logger.info("loaded %s", fname)
~/anaconda3/lib/python3.7/site-packages/gensim/utils.py in unpickle(fname)
1396 # Because of loading from S3 load can't be used (missing readline in smart_open)
1397 if sys.version_info > (3, 0):
-> 1398 return _pickle.load(f, encoding='latin1')
1399 else:
1400 return _pickle.loads(f.read())
UnpicklingError: invalid load key, ':'.
I tried loading it with load_word2vec_format, but no luck. Any ideas what might be wrong with it?
Per your link https://wikipedia2vec.github.io/wikipedia2vec/pretrained/ these are to be loaded using that library's Wikipedia2Vec.load() method.
Gensim's .load() methods should only be used with files saved directly from Gensim model objects.
The Wikipedia2Vec project does say that their .txt file formats would load with .load_word2vec_format(), so you could also try that - but with one of their .txt format files.
Their full model .pkl files are only going to work with their class's own loading function.
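To make that concrete, here is a minimal sketch of both routes; it assumes the wikipedia2vec package is installed, and the .txt filename is illustrative (use whichever text-format file you actually downloaded):
# Option 1: load the .pkl with the library that produced it
from wikipedia2vec import Wikipedia2Vec
wiki2vec = Wikipedia2Vec.load('enwiki_20180420_500d.pkl')

# Option 2: load the plain-text word2vec format with gensim
from gensim.models import KeyedVectors
kv = KeyedVectors.load_word2vec_format('enwiki_20180420_500d.txt')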

TF-Lite-Converter with Tensorflow-Extended Pipeline (Chicago Taxi Pipeline Example)

Goal: TFX -> TF Lite Converter -> Deploy models on mobile/IoT devices
I am currently learning the Tensorflow Extended with its Chicago Taxi Pipeline Example.
The pipeline is done running (although through a lot of hardships) and the Pusher Component has emitted a Tensorflow SavedModel file (.pb).
However, a new problem is encountered here:
Using Tensorflow nightly/1.13.1 (tried both) and Python 2.7.6, I can generate, save and load a SavedModel (a model for MNIST digit data, used to test the utility) with some simple Python code such as saved_model.simple_save and saved_model.loader.load, but I keep running into errors when applying the same code to the models the TFX Pusher emits, as follows.
(Maybe I did something wrong with the TFX Pipeline?)
The code I used:
import tensorflow as tf

with tf.Session(graph=tf.Graph()) as sess:
    tf.compat.v1.saved_model.loader.load(sess, ["serve"], "/home/tigerpaws/taxi/serving_model/taxi_simple/1553187887")  # "/home/tigerpaws/saved_model_example/model"
    graph = tf.get_default_graph()
Error:
KeyError Traceback (most recent call last)
<ipython-input-11-a6978b82c3d2> in <module>()
1 with tf.Session(graph=tf.Graph()) as sess:
----> 2 tf.compat.v1.saved_model.loader.load(sess, ["serve"], "/home/tigerpaws/taxi/serving_model/taxi_simple/1553187887")#"/home/tigerpaws/saved_model_example/model")
3 graph=tf.get_default_graph()
/home/tigerpaws/anaconda/lib/python2.7/site-packages/tensorflow/python/util/deprecation.pyc in new_func(*args, **kwargs)
322 'in a future version' if date is None else ('after %s' % date),
323 instructions)
--> 324 return func(*args, **kwargs)
325 return tf_decorator.make_decorator(
326 func, new_func, 'deprecated',
/home/tigerpaws/anaconda/lib/python2.7/site-packages/tensorflow/python/saved_model/loader_impl.pyc in load(sess, tags, export_dir, import_scope, **saver_kwargs)
267 """
268 loader = SavedModelLoader(export_dir)
--> 269 return loader.load(sess, tags, import_scope, **saver_kwargs)
270
271
/home/tigerpaws/anaconda/lib/python2.7/site-packages/tensorflow/python/saved_model/loader_impl.pyc in load(self, sess, tags, import_scope, **saver_kwargs)
418 with sess.graph.as_default():
419 saver, _ = self.load_graph(sess.graph, tags, import_scope,
--> 420 **saver_kwargs)
421 self.restore_variables(sess, saver, import_scope)
422 self.run_init_ops(sess, tags, import_scope)
/home/tigerpaws/anaconda/lib/python2.7/site-packages/tensorflow/python/saved_model/loader_impl.pyc in load_graph(self, graph, tags, import_scope, **saver_kwargs)
348 with graph.as_default():
349 return tf_saver._import_meta_graph_with_return_elements( # pylint: disable=protected-access
--> 350 meta_graph_def, import_scope=import_scope, **saver_kwargs)
351
352 def restore_variables(self, sess, saver, import_scope=None):
/home/tigerpaws/anaconda/lib/python2.7/site-packages/tensorflow/python/training/saver.pyc in _import_meta_graph_with_return_elements(meta_graph_or_file, clear_devices, import_scope, return_elements, **kwargs)
1455 import_scope=import_scope,
1456 return_elements=return_elements,
-> 1457 **kwargs))
1458
1459 saver = _create_saver_from_imported_meta_graph(
/home/tigerpaws/anaconda/lib/python2.7/site-packages/tensorflow/python/framework/meta_graph.pyc in import_scoped_meta_graph_with_return_elements(meta_graph_or_file, clear_devices, graph, import_scope, input_map, unbound_inputs_col_name, restore_collections_predicate, return_elements)
804 input_map=input_map,
805 producer_op_list=producer_op_list,
--> 806 return_elements=return_elements)
807
808 # Restores all the other collections.
/home/tigerpaws/anaconda/lib/python2.7/site-packages/tensorflow/python/util/deprecation.pyc in new_func(*args, **kwargs)
505 'in a future version' if date is None else ('after %s' % date),
506 instructions)
--> 507 return func(*args, **kwargs)
508
509 doc = _add_deprecated_arg_notice_to_docstring(
/home/tigerpaws/anaconda/lib/python2.7/site-packages/tensorflow/python/framework/importer.pyc in import_graph_def(graph_def, input_map, return_elements, name, op_dict, producer_op_list)
397 if producer_op_list is not None:
398 # TODO(skyewm): make a copy of graph_def so we're not mutating the argument?
--> 399 _RemoveDefaultAttrs(op_dict, producer_op_list, graph_def)
400
401 graph = ops.get_default_graph()
/home/tigerpaws/anaconda/lib/python2.7/site-packages/tensorflow/python/framework/importer.pyc in _RemoveDefaultAttrs(op_dict, producer_op_list, graph_def)
157 # Remove any default attr values that aren't in op_def.
158 if node.op in producer_op_dict:
--> 159 op_def = op_dict[node.op]
160 producer_op_def = producer_op_dict[node.op]
161 # We make a copy of node.attr to iterate through since we may modify
KeyError: u'BucketizeWithInputBoundaries'
I made another attempt as well: converting the SavedModel into a GraphDef (frozen graph) so I could give the converter another try.
The conversion requires output_node_names, which I don't know;
nor could I find where the model is saved in the code (so that I might spot the output node names there).
Any ideas on the problem or alternative ways? Thanks in advance.
Edit: can somebody help create tags? I have not reached 1500 reputation, but this question is really about tfx / tensorflow-extended
Sorry for the confusion; the problem is actually caused while reading the SavedModel file.
The SavedModel contains an op, BucketizeWithInputBoundaries, which is not defined in op_dict.
This is still on Google's TODO list and is commented on in two of their scripts.
Here and Here. (GitHub links):
# TODO(jyzhao): BucketizeWithInputBoundaries error without this.
Importing the module referenced there solves the problem:
from tensorflow.contrib.boosted_trees.python.ops import quantile_ops # pylint: disable=unused-import
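For completeness, a minimal sketch of the working load after that import, assuming TF 1.x where tf.contrib is still available (the export path is the one from the question):
import tensorflow as tf
# importing quantile_ops registers the BucketizeWithInputBoundaries op before the graph is loaded
from tensorflow.contrib.boosted_trees.python.ops import quantile_ops  # pylint: disable=unused-import

with tf.Session(graph=tf.Graph()) as sess:
    tf.compat.v1.saved_model.loader.load(
        sess, ["serve"], "/home/tigerpaws/taxi/serving_model/taxi_simple/1553187887")
    graph = tf.get_default_graph()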
