I keep getting this error with my new training data. The example data works, but when I use my own data it raises a KeyError. The only difference between my data and the example data is that mine has more classes.
Full Error:
Traceback (most recent call last):
File "object_detection/create_tf_record.py", line 185, in <module>
tf.app.run()
File "C:\Users\edupt\AppData\Local\Programs\Python\Python36\lib\site-packages\tensorflow\python\platform\app.py", line 125, in run
_sys.exit(main(argv))
File "object_detection/create_tf_record.py", line 180, in main
image_dir, train_examples)
File "object_detection/create_tf_record.py", line 152, in create_tf_record
tf_example = dict_to_tf_example(data, label_map_dict, image_dir)
File "object_detection/create_tf_record.py", line 97, in dict_to_tf_example
classes.append(label_map_dict[class_name])
KeyError: '300424' <---------- THAT IS THE NAME OF ONE OF THE CLASSES
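For context, the KeyError means the class name '300424' from the annotations is not a key in the label map that create_tf_record.py loads. A quick way to check which names the label map actually contains (a sketch, assuming the label_map_util helper from the TF Object Detection API and a hypothetical label map path):
from object_detection.utils import label_map_util

# hypothetical path to the label map used by create_tf_record.py
label_map_dict = label_map_util.get_label_map_dict('data/label_map.pbtxt')
print(label_map_dict)  # every class name used in the annotations must be a key here,
                       # i.e. the .pbtxt needs an entry like: item { id: 1  name: '300424' }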
Does torch.save work on Hugging Face models (I am using ViT)? I assumed it did.
My error:
File "/home/miranda9/miniconda3/envs/metalearning_gpu/lib/python3.9/site-packages/torch/serialization.py", line 379, in save
_save(obj, opened_zipfile, pickle_module, pickle_protocol)
File "/home/miranda9/miniconda3/envs/metalearning_gpu/lib/python3.9/site-packages/torch/serialization.py", line 499, in _save
zip_file.write_record(name, storage.data_ptr(), num_bytes)
OSError: [Errno 116] Stale file handle
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/shared/rsaas/miranda9/diversity-for-predictive-success-of-meta-learning/div_src/diversity_src/experiment_mains/main_dist_maml_l2l.py", line 1815, in <module>
main()
File "/shared/rsaas/miranda9/diversity-for-predictive-success-of-meta-learning/div_src/diversity_src/experiment_mains/main_dist_maml_l2l.py", line 1748, in main
train(args=args)
File "/shared/rsaas/miranda9/diversity-for-predictive-success-of-meta-learning/div_src/diversity_src/experiment_mains/main_dist_maml_l2l.py", line 1795, in train
meta_train_iterations_ala_l2l(args, args.agent, args.opt, args.scheduler)
File "/home/miranda9/ultimate-utils/ultimate-utils-proj-src/uutils/torch_uu/training/meta_training.py", line 213, in meta_train_iterations_ala_l2l
log_train_val_stats(args, args.it, step_name, train_loss, train_acc, training=True)
File "/home/miranda9/ultimate-utils/ultimate-utils-proj-src/uutils/logging_uu/wandb_logging/supervised_learning.py", line 55, in log_train_val_stats
_log_train_val_stats(args=args,
File "/home/miranda9/ultimate-utils/ultimate-utils-proj-src/uutils/logging_uu/wandb_logging/supervised_learning.py", line 113, in _log_train_val_stats
save_for_supervised_learning(args, ckpt_filename='ckpt.pt')
File "/home/miranda9/ultimate-utils/ultimate-utils-proj-src/uutils/torch_uu/checkpointing_uu/supervised_learning.py", line 54, in save_for_supervised_learning
torch.save({'training_mode': args.training_mode,
File "/home/miranda9/miniconda3/envs/metalearning_gpu/lib/python3.9/site-packages/torch/serialization.py", line 380, in save
return
File "/home/miranda9/miniconda3/envs/metalearning_gpu/lib/python3.9/site-packages/torch/serialization.py", line 259, in __exit__
self.file_like.write_end_of_file()
RuntimeError: [enforce fail at inline_container.cc:298] . unexpected pos 2736460544 vs 2736460432
my code:
# - ckpt
args_pickable: Namespace = uutils.make_args_pickable(args)
# note not saving any objects, to make sure checkpoint is loadable later with no problems
torch.save({'training_mode': args.training_mode,
            'it': args.it,
            'epoch_num': args.epoch_num,
            # 'args': args_pickable,  # some versions of this might not have args!
            # decided only to save the dict version to avoid this ckpt not working, make it args when loading
            'args_dict': vars(args_pickable),  # some versions of this might not have args!
            'model_state_dict': get_model_from_ddp(args.model).state_dict(),
            'model_str': str(args.model),  # added later, to make it easier to check what optimizer was used
            'model_hps': args.model_hps,
            'model_option': args.model_option,
            'opt_state_dict': args.opt.state_dict(),
            'opt_str': str(args.opt),
            'opt_hps': args.opt_hps,
            'opt_option': args.opt_option,
            'scheduler_str': str(args.scheduler),
            'scheduler_state_dict': try_to_get_scheduler_state_dict(args.scheduler),
            'scheduler_hps': args.scheduler_hps,
            'scheduler_option': args.scheduler_option,
            },
           pickle_module=pickle,
           f=args.log_root / ckpt_filename)
If this is not the right way to checkpoint Hugging Face (HF) models, what is?
Cross-posted to the HF discussion forum: https://discuss.huggingface.co/t/torch-save-with-hugging-face-models-fails/25034
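If torch.save of the state_dict turns out not to be the preferred route for HF models, here is a minimal sketch of the Transformers-native save_pretrained / from_pretrained round trip (the model ID and checkpoint directory are hypothetical choices, not taken from the code above):
from transformers import ViTModel

model = ViTModel.from_pretrained('google/vit-base-patch16-224-in21k')  # hypothetical ViT checkpoint
model.save_pretrained('hf_ckpt')                # writes config.json plus the weights into ./hf_ckpt
restored = ViTModel.from_pretrained('hf_ckpt')  # reload the same model later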
I am trying to load an XGBClassifier in my Streamlit app from a pickle file.
When I load it and try to predict on the new input values, it throws the error:
XGBoostError: [11:25:40] c:\users\administrator\workspace\xgboost-win64_release_1.6.0\src\data\array_interface.h:462: Unicode-3 is not supported.
The entire traceback is:
2022-07-02 11:25:40.046 Uncaught app exception
Traceback (most recent call last):
File "C:\Users\\Anaconda3\lib\site-packages\streamlit\scriptrunner\script_runner.py", line 554, in _run_script
exec(code, module.__dict__)
File "temp.py", line 250, in <module>
st.write(clf.predict(feat_list))
File "C:\Users\\Anaconda3\lib\site-packages\xgboost\sklearn.py", line 1434, in predict
class_probs = super().predict(
File "C:\Users\\Anaconda3\lib\site-packages\xgboost\sklearn.py", line 1049, in predict
predts = self.get_booster().inplace_predict(
File "C:\Users\\Anaconda3\lib\site-packages\xgboost\core.py", line 2102, in inplace_predict
_check_call(
File "C:\Users\\Anaconda3\lib\site-packages\xgboost\core.py", line 203, in _check_call
raise XGBoostError(py_str(_LIB.XGBGetLastError()))
xgboost.core.XGBoostError: [11:25:40] c:\users\administrator\workspace\xgboost-win64_release_1.6.0\src\data\array_interface.h:462: Unicode-3
is not supported.
I load the model this way:
clf = pickle.load(open('xgb.pkl', "rb"))
Or
clf = xgboost.XGBClassifier(tree_method ="hist", enable_categorical=True)
clf.load_model("model.json")
And I predict using:
clf.predict(feat_list)
I had a similar problem that came with the same XGBoostError. In my case the cause was the dtype of the ndarray, which needed to be object.
Assuming that your feat_list is a numpy.ndarray and that you create it like this:
feat_list = np.array(features)
adding dtype=object:
feat_list = np.array(features, dtype=object)
should do the trick.
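To see what the dtype change does, here is a quick check with hypothetical all-string values, which is the case that produces a Unicode dtype like the one in the error:
import numpy as np

features = ['5', '12', '7']                      # hypothetical values read in as strings
print(np.array(features).dtype)                  # <U2    -> a Unicode dtype, which XGBoost rejects
print(np.array(features, dtype=object).dtype)    # object -> what the fix above produces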
I am trying to use the Hugging Face Transformers API to load a locally downloaded M-BERT model, but it throws an exception.
I cloned this repo: https://huggingface.co/bert-base-multilingual-cased
and load the model with:
bert = TFBertModel.from_pretrained("input/bert-base-multilingual-cased")
But I am getting this error:
Traceback (most recent call last):
File "/usr/local/lib/python3.7/dist-packages/transformers/modeling_tf_utils.py", line 1277, in from_pretrained
missing_keys, unexpected_keys = load_tf_weights(model, resolved_archive_file, load_weight_prefix)
File "/usr/local/lib/python3.7/dist-packages/transformers/modeling_tf_utils.py", line 467, in load_tf_weights
with h5py.File(resolved_archive_file, "r") as f:
File "/usr/local/lib/python3.7/dist-packages/h5py/_hl/files.py", line 408, in __init__
swmr=swmr)
File "/usr/local/lib/python3.7/dist-packages/h5py/_hl/files.py", line 173, in make_fid
fid = h5f.open(name, flags, fapl=fapl)
File "h5py/_objects.pyx", line 54, in h5py._objects.with_phil.wrapper
File "h5py/_objects.pyx", line 55, in h5py._objects.with_phil.wrapper
File "h5py/h5f.pyx", line 88, in h5py.h5f.open
OSError: Unable to open file (file signature not found)
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "train.py", line 81, in <module>
__main__()
File "train.py", line 59, in __main__
model = create_model(num_classes)
File "/content/drive/My Drive/msc-project/code/model.py", line 26, in create_model
bert = TFBertModel.from_pretrained("input/bert-base-multilingual-cased")
File "/usr/local/lib/python3.7/dist-packages/transformers/modeling_tf_utils.py", line 1280, in from_pretrained
"Unable to load weights from h5 file. "
OSError: Unable to load weights from h5 file. If you tried to load a TF 2.0 model from a PyTorch checkpoint, please set from_pt=True.
Where am I going wrong?
Need help!
Thanks in advance.
As was already pointed out in the comments, your from_pretrained param should be either the ID of a model hosted on huggingface.co or a local path:
A path to a directory containing model weights saved using
save_pretrained(), e.g., ./my_model_directory/.
See documentation
Looking at your stack trace, it seems your code is run inside
/content/drive/My Drive/msc-project/code/model.py, so unless your model is in
/content/drive/My Drive/msc-project/code/input/bert-base-multilingual-cased/ it won't load.
I would also set the path to match the documentation example, i.e.:
bert = TFBertModel.from_pretrained("./input/bert-base-multilingual-cased/")
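A small sketch for verifying the path before loading, using the absolute directory implied by the stack trace (the exact location is an assumption):
import os
from transformers import TFBertModel

# hypothetical absolute path, based on where model.py lives in the stack trace
model_dir = "/content/drive/My Drive/msc-project/code/input/bert-base-multilingual-cased"
print(os.path.isdir(model_dir))   # should print True before calling from_pretrained

bert = TFBertModel.from_pretrained(model_dir)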
I got this error when trying to use Derrick Schultz's repository for StyleGAN2 training in Google Colab: https://github.com/dvschultz/ai/blob/master/StyleGAN2_Colab_Train.ipynb
The command was:
!python run_generator.py generate-images --network=/content/drive/My\ Drive/stylegan2-colab-test/stylegan2/results/00002-stylegan2-birdaus-1gpu-config-f/submit_config.pkl --seeds=3875451-3876000 --truncation-psi=0.7
Everything prior to this was done as in the tutorial, but with my own dataset. Then I got:
Local submit - run_dir: results/00006-generate-images
dnnlib: Running run_generator.generate_images() on localhost...
Loading networks from "/content/drive/My Drive/stylegan2-colab-test/stylegan2/results/00002-stylegan2-birdaus-1gpu-config-f/submit_config.pkl"...
Traceback (most recent call last):
File "run_generator.py", line 490, in <module>
main()
File "run_generator.py", line 485, in main
dnnlib.submit_run(sc, func_name_map[subcmd], **kwargs)
File "/content/drive/My Drive/stylegan2-colab-test/stylegan2/dnnlib/submission/submit.py", line 343, in submit_run
return farm.submit(submit_config, host_run_dir)
File "/content/drive/My Drive/stylegan2-colab-test/stylegan2/dnnlib/submission/internal/local.py", line 22, in submit
return run_wrapper(submit_config)
File "/content/drive/My Drive/stylegan2-colab-test/stylegan2/dnnlib/submission/submit.py", line 280, in run_wrapper
run_func_obj(**submit_config.run_func_kwargs)
File "/content/drive/My Drive/stylegan2-colab-test/stylegan2/run_generator.py", line 120, in generate_images
_G, _D, Gs = pretrained_networks.load_networks(network_pkl)
File "/content/drive/My Drive/stylegan2-colab-test/stylegan2/pretrained_networks.py", line 76, in load_networks
G, D, Gs = pickle.load(stream, encoding='latin1')
ValueError: too many values to unpack (expected 3)
This code was supposed to work as is, but it doesn't. Unfortunately I'm not a coder myself. What needs to be changed here?
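One detail the traceback points at: pretrained_networks.load_networks expects a pickle that unpacks into three networks (G, D, Gs), while submit_config.pkl stores the run configuration rather than the networks. A sketch of pointing --network at a training snapshot from the same results folder instead (the snapshot filename is hypothetical):
!python run_generator.py generate-images --network=/content/drive/My\ Drive/stylegan2-colab-test/stylegan2/results/00002-stylegan2-birdaus-1gpu-config-f/network-snapshot-000120.pkl --seeds=3875451-3876000 --truncation-psi=0.7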
I am following the page below:
https://github.com/tensorflow/models/tree/master/inception
I got to the point where I have to run:
bazel-bin/inception/imagenet_train --num_gpus=1 --batch_size=32 --train_dir=/tmp/imagenet_train --data_dir=/tmp/imagenet_data
However, I got the error below:
Traceback (most recent call last):
File "/home/demo/anaconda3/envs/tensorflow/models/inception/bazel-bin/inception/imagenet_train.runfiles/inception/inception/imagenet_train.py", line 41, in <module>
tf.app.run()
File "/home/demo/anaconda3/envs/tensorflow/lib/python2.7/site-packages/tensorflow/python/platform/app.py", line 48, in run
_sys.exit(main(_sys.argv[:1] + flags_passthrough))
File "/home/demo/anaconda3/envs/tensorflow/models/inception/bazel-bin/inception/imagenet_train.runfiles/inception/inception/imagenet_train.py", line 35, in main
tf.gfile.DeleteRecursively(FLAGS.train_dir)
File "/home/demo/anaconda3/envs/tensorflow/lib/python2.7/site-packages/tensorflow/python/lib/io/file_io.py", line 420, in delete_recursively
pywrap_tensorflow.DeleteRecursively(compat.as_bytes(dirname), status)
File "/home/demo/anaconda3/envs/tensorflow/lib/python2.7/contextlib.py", line 24, in __exit__
self.gen.next()
File "/home/demo/anaconda3/envs/tensorflow/lib/python2.7/site-packages/tensorflow/python/framework/errors_impl.py", line 466, in raise_exception_on_not_ok_status
pywrap_tensorflow.TF_GetCode(status))
tensorflow.python.framework.errors_impl.FailedPreconditionError: /tmp/imagenet_train
My DATA_DIR is /tmp/imagenet_data from the previous step, bazel-bin/inception/download_and_preprocess_imagenet "${DATA_DIR}".
But what should my train_dir be? The docs don't mention it, and it looks like just an empty folder is incorrect.
For me, it works if I set --train_dir=/tmp. Also, you have the processed dataset in the same directory; --train_dir and --data_dir should not coincide with each other.
The location where the ImageNet data is placed: DATA_DIR=$HOME/imagenet-data
Can you tell me if you are still running into problems after changing the directory?
--train_dir is the path to an empty directory where the model checkpoints and events files are stored as the model is trained.
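For example, a variant of the command above with the two directories kept separate (the checkpoint directory name is hypothetical; it just needs to be an empty directory distinct from --data_dir):
mkdir -p $HOME/imagenet_train
bazel-bin/inception/imagenet_train --num_gpus=1 --batch_size=32 --train_dir=$HOME/imagenet_train --data_dir=/tmp/imagenet_data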