Loss key needs to be present Pytorch - python

I am following this repo:
https://github.com/NVIDIA/NeMo/tree/main/examples/nlp/entity_linking
Here is a small tutorial:
https://colab.research.google.com/github/NVIDIA/NeMo/blob/v1.0.2/tutorials/nlp/Entity_Linking_Medical.ipynb
Before starting this tutorial change branch to r1.10.0
When I train this model on entire UMLS dataset given the commands it gives the following error:
In automatic_optimization, when `training_step` returns a dict, the 'loss' key needs to be present
I checked the training steps method and it is fine:
def training_step(self, batch, batch_idx):
"""
Lightning calls this inside the training loop with the data from the training dataloader
passed in as `batch`.
"""
input_ids, token_type_ids, attention_mask, concept_ids = batch
logits = self.forward(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)
train_loss = self.loss(logits=logits, labels=concept_ids)
# No hard examples found in batch,
# shouldn't use this batch to update model weights
if train_loss == 0:
train_loss = None
lr = None
else:
lr = self._optimizer.param_groups[0]["lr"]
self.log("train_loss", train_loss)
self.log("lr", lr, prog_bar=True)
return {"loss": train_loss, "lr": lr}
Here is a full stacktrace:
[NeMo I 2022-07-29 18:29:27 multi_similarity_loss:91] Encountered zero loss in multisimloss, loss = 0.0. No hard examples found in the batch
Error executing job with overrides: ['project_dir=.']
Traceback (most recent call last):
File "self_alignment_pretraining.py", line 38, in main
trainer.fit(model)
File "/home/umair/miniconda3/envs/aemap/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 769, in fit
self._fit_impl, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path
File "/home/umair/miniconda3/envs/aemap/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 719, in _call_and_handle_interrupt
return self.strategy.launcher.launch(trainer_fn, *args, trainer=self, **kwargs)
File "/home/umair/miniconda3/envs/aemap/lib/python3.7/site-packages/pytorch_lightning/strategies/launchers/subprocess_script.py", line 93, in launch
return function(*args, **kwargs)
File "/home/umair/miniconda3/envs/aemap/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 809, in _fit_impl
results = self._run(model, ckpt_path=self.ckpt_path)
File "/home/umair/miniconda3/envs/aemap/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 1234, in _run
results = self._run_stage()
File "/home/umair/miniconda3/envs/aemap/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 1321, in _run_stage
return self._run_train()
File "/home/umair/miniconda3/envs/aemap/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 1351, in _run_train
self.fit_loop.run()
File "/home/umair/miniconda3/envs/aemap/lib/python3.7/site-packages/pytorch_lightning/loops/base.py", line 204, in run
self.advance(*args, **kwargs)
File "/home/umair/miniconda3/envs/aemap/lib/python3.7/site-packages/pytorch_lightning/loops/fit_loop.py", line 268, in advance
self._outputs = self.epoch_loop.run(self._data_fetcher)
File "/home/umair/miniconda3/envs/aemap/lib/python3.7/site-packages/pytorch_lightning/loops/base.py", line 204, in run
self.advance(*args, **kwargs)
File "/home/umair/miniconda3/envs/aemap/lib/python3.7/site-packages/pytorch_lightning/loops/epoch/training_epoch_loop.py", line 208, in advance
batch_output = self.batch_loop.run(batch, batch_idx)
File "/home/umair/miniconda3/envs/aemap/lib/python3.7/site-packages/pytorch_lightning/loops/base.py", line 204, in run
self.advance(*args, **kwargs)
File "/home/umair/miniconda3/envs/aemap/lib/python3.7/site-packages/pytorch_lightning/loops/batch/training_batch_loop.py", line 88, in advance
outputs = self.optimizer_loop.run(split_batch, optimizers, batch_idx)
File "/home/umair/miniconda3/envs/aemap/lib/python3.7/site-packages/pytorch_lightning/loops/base.py", line 204, in run
self.advance(*args, **kwargs)
File "/home/umair/miniconda3/envs/aemap/lib/python3.7/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 207, in advance
self.optimizer_idx,
File "/home/umair/miniconda3/envs/aemap/lib/python3.7/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 256, in _run_optimization
self._optimizer_step(optimizer, opt_idx, batch_idx, closure)
File "/home/umair/miniconda3/envs/aemap/lib/python3.7/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 378, in _optimizer_step
using_lbfgs=is_lbfgs,
File "/home/umair/miniconda3/envs/aemap/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 1593, in _call_lightning_module_hook
output = fn(*args, **kwargs)
File "/home/umair/miniconda3/envs/aemap/lib/python3.7/site-packages/pytorch_lightning/core/lightning.py", line 1644, in optimizer_step
optimizer.step(closure=optimizer_closure)
File "/home/umair/miniconda3/envs/aemap/lib/python3.7/site-packages/pytorch_lightning/core/optimizer.py", line 168, in step
step_output = self._strategy.optimizer_step(self._optimizer, self._optimizer_idx, closure, **kwargs)
File "/home/umair/miniconda3/envs/aemap/lib/python3.7/site-packages/pytorch_lightning/strategies/ddp.py", line 278, in optimizer_step
optimizer_output = super().optimizer_step(optimizer, opt_idx, closure, model, **kwargs)
File "/home/umair/miniconda3/envs/aemap/lib/python3.7/site-packages/pytorch_lightning/strategies/strategy.py", line 193, in optimizer_step
return self.precision_plugin.optimizer_step(model, optimizer, opt_idx, closure, **kwargs)
File "/home/umair/miniconda3/envs/aemap/lib/python3.7/site-packages/pytorch_lightning/plugins/precision/native_amp.py", line 85, in optimizer_step
closure_result = closure()
File "/home/umair/miniconda3/envs/aemap/lib/python3.7/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 148, in __call__
self._result = self.closure(*args, **kwargs)
File "/home/umair/miniconda3/envs/aemap/lib/python3.7/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 134, in closure
step_output = self._step_fn()
File "/home/umair/miniconda3/envs/aemap/lib/python3.7/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 437, in _training_step
training_step_output, self.trainer.accumulate_grad_batches
File "/home/umair/miniconda3/envs/aemap/lib/python3.7/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 75, in from_training_step_output
"In automatic_optimization, when `training_step` returns a dict, the 'loss' key needs to be present"
pytorch_lightning.utilities.exceptions.MisconfigurationException: In automatic_optimization, when `training_step` returns a dict, the 'loss' key needs to be present
Set the environment variable HYDRA_FULL_ERROR=1 for a complete stack trace.

You get this error message about "loss key needs to be present" because in some training steps you return the dict {"loss": None}. This happens in your code here
if train_loss == 0:
train_loss = None
lr = None
where you set train_loss = None. Lightning does not like that, because it wants loss to be a tensor with a graph attached.
If you wish to skip the optimization step completely, just return None from the training_step method, like this:
if train_loss == 0:
return None

Related

pytorch_lightning.utilities.exceptions.MisconfigurationException: The closure hasn't been executed

I got an error message when I used pytorch-lightning Trainer module:
File "run.py", line 105, in <module>
runner.fit(experiment)
File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 769, in fit
self._fit_impl, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path
File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 721, in _call_and_handle_interrupt
return trainer_fn(*args, **kwargs)
File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 809, in _fit_impl
results = self._run(model, ckpt_path=self.ckpt_path)
File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 1234, in _run
results = self._run_stage()
File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 1321, in _run_stage
return self._run_train()
File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 1351, in _run_train
self.fit_loop.run()
File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/loops/base.py", line 204, in run
self.advance(*args, **kwargs)
File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/loops/fit_loop.py", line 269, in advance
self._outputs = self.epoch_loop.run(self._data_fetcher)
File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/loops/base.py", line 204, in run
self.advance(*args, **kwargs)
File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/loops/epoch/training_epoch_loop.py", line 208, in advance
batch_output = self.batch_loop.run(batch, batch_idx)
File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/loops/base.py", line 204, in run
self.advance(*args, **kwargs)
File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/loops/batch/training_batch_loop.py", line 88, in advance
outputs = self.optimizer_loop.run(split_batch, optimizers, batch_idx)
File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/loops/base.py", line 204, in run
self.advance(*args, **kwargs)
File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 207, in advance
self.optimizer_idx,
File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 258, in _run_optimization
result = closure.consume_result()
File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/loops/optimization/closure.py", line 52, in consume_result
"The closure hasn't been executed."
pytorch_lightning.utilities.exceptions.MisconfigurationException: The closure hasn't been executed. HINT: did you call `optimizer_closure()` in your `optimizer_step` hook? It could also happen because the `optimizer.step(optimizer_closure)` call did not execute it internally.
I read the Pytorch Lightning 1.6.1 documentation. It seems that under the hood, the Lightning Trainer handles the training loop details for me. I don't know how to handle the "optimizer.step()" by myself...
The following is my code of calling Trainer:
runner = Trainer(checkpoint_callback=checkpoint_callback,
resume_from_checkpoint=model_path,
logger=tt_logger,
log_every_n_steps=100,
weights_summary='full',
# early_stop_callback = False,
**config['trainer_params'])
And I defined the following function in my model:
def forward(self, input: Tensor, **kwargs) -> Tensor
def training_step(self, batch, batch_idx, optimizer_idx)
def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, def optimizer_closure, on_tpu, using_native_amp, using_lbfgs)
on_load_checkpoint(self, checkpoint)
def training_epoch_end(self, outputs)
def configure_optimizers(self)
def train_dataloader(self)
def data_transforms(self)
This is my first time asking a question here. If there is something wrong, please forgive me. Thank you all for your help!

Keras fit predictions in Callback

I'm trying to get the predictions inside the function on_epoch_end of keras' Callback.
At the moment, to get the predictions, I execute self.model.predict with batch_size of 2, but at the 3rd epochs I get this error:
RuntimeError: Dst tensor is not initialized in Tensorflow
Reading on the web, I notice that this error appears when the GPU goes out of memory. In my case, reading the stack trace, this error is triggered by self.model.predict inside on_epoch_end, it says:
File "mlp_keras.py", line 20, in on_epoch_end predictions =
self.model.predict(self.dataset)
This is the full stack trace:
Traceback (most recent call last):
File "mlp_keras.py", line 150, in <module>
callbacks=[KendallTauHistory(training_dataset, training_dataset_labels, groups_id_count)])
File "/usr/home/studenti/sp171412/word_ordering/mlp/env/lib/python2.7/site-packages/tensorflow_core/python/keras/engine/training.py", line 819, in fit
use_multiprocessing=use_multiprocessing)
File "/usr/home/studenti/sp171412/word_ordering/mlp/env/lib/python2.7/site-packages/tensorflow_core/python/keras/engine/training_v2.py", line 397, in fit
prefix='val_')
File "/usr/lib64/python2.7/contextlib.py", line 24, in __exit__
self.gen.next()
File "/usr/home/studenti/sp171412/word_ordering/mlp/env/lib/python2.7/site-packages/tensorflow_core/python/keras/engine/training_v2.py", line 771, in on_epoch
self.callbacks.on_epoch_end(epoch, epoch_logs)
File "/usr/home/studenti/sp171412/word_ordering/mlp/env/lib/python2.7/site-packages/tensorflow_core/python/keras/callbacks.py", line 302, in on_epoch_end
callback.on_epoch_end(epoch, logs)
File "mlp_keras.py", line 20, in on_epoch_end
predictions = self.model.predict(self.dataset)
File "/usr/home/studenti/sp171412/word_ordering/mlp/env/lib/python2.7/site-packages/tensorflow_core/python/keras/engine/training.py", line 1013, in predict
use_multiprocessing=use_multiprocessing)
File "/usr/home/studenti/sp171412/word_ordering/mlp/env/lib/python2.7/site-packages/tensorflow_core/python/keras/engine/training_v2.py", line 498, in predict
workers=workers, use_multiprocessing=use_multiprocessing, **kwargs)
File "/usr/home/studenti/sp171412/word_ordering/mlp/env/lib/python2.7/site-packages/tensorflow_core/python/keras/engine/training_v2.py", line 426, in _model_iteration
use_multiprocessing=use_multiprocessing)
File "/usr/home/studenti/sp171412/word_ordering/mlp/env/lib/python2.7/site-packages/tensorflow_core/python/keras/engine/training_v2.py", line 706, in _process_inputs
use_multiprocessing=use_multiprocessing)
File "/usr/home/studenti/sp171412/word_ordering/mlp/env/lib/python2.7/site-packages/tensorflow_core/python/keras/engine/data_adapter.py", line 357, in __init__
dataset = self.slice_inputs(indices_dataset, inputs)
File "/usr/home/studenti/sp171412/word_ordering/mlp/env/lib/python2.7/site-packages/tensorflow_core/python/keras/engine/data_adapter.py", line 383, in slice_inputs
dataset_ops.DatasetV2.from_tensors(inputs).repeat()
File "/usr/home/studenti/sp171412/word_ordering/mlp/env/lib/python2.7/site-packages/tensorflow_core/python/data/ops/dataset_ops.py", line 566, in from_tensors
return TensorDataset(tensors)
File "/usr/home/studenti/sp171412/word_ordering/mlp/env/lib/python2.7/site-packages/tensorflow_core/python/data/ops/dataset_ops.py", line 2765, in __init__
element = structure.normalize_element(element)
File "/usr/home/studenti/sp171412/word_ordering/mlp/env/lib/python2.7/site-packages/tensorflow_core/python/data/util/structure.py", line 113, in normalize_element
ops.convert_to_tensor(t, name="component_%d" % i))
File "/usr/home/studenti/sp171412/word_ordering/mlp/env/lib/python2.7/site-packages/tensorflow_core/python/framework/ops.py", line 1314, in convert_to_tensor
ret = conversion_func(value, dtype=dtype, name=name, as_ref=as_ref)
File "/usr/home/studenti/sp171412/word_ordering/mlp/env/lib/python2.7/site-packages/tensorflow_core/python/framework/tensor_conversion_registry.py", line 52, in _default_conversion_function
return constant_op.constant(value, dtype, name=name)
File "/usr/home/studenti/sp171412/word_ordering/mlp/env/lib/python2.7/site-packages/tensorflow_core/python/framework/constant_op.py", line 258, in constant
allow_broadcast=True)
File "/usr/home/studenti/sp171412/word_ordering/mlp/env/lib/python2.7/site-packages/tensorflow_core/python/framework/constant_op.py", line 266, in _constant_impl
t = convert_to_eager_tensor(value, ctx, dtype)
File "/usr/home/studenti/sp171412/word_ordering/mlp/env/lib/python2.7/site-packages/tensorflow_core/python/framework/constant_op.py", line 96, in convert_to_eager_tensor
return ops.EagerTensor(value, ctx.device_name, dtype)
RuntimeError: Dst tensor is not initialized.
My question is: is there a way to get the predictions without performing predict inside on_epoch_end? Thanks in advance.
Alright, after seeing your last comment, what you could do:
epochs = 100
for epoch in range(epochs):
model.fit(x_train, y_train)
y_predict = model.predict(x_test)

Loading TensorFlow1 checkpoint in TensorFlow2 - "Function Call Stack: keras_scratch_graph"

I want to load a checkpoint from TensorFlow1 that consists of .index, .meta, .data-00000-of-00001 files into tensorflow2.0.0 and convert it to a keras model so to be able to use it natively in eager mode without need for tf.Session. Here is the code I ran:
import tensorflow as tf
import numpy as np
from tensorflow.python.keras.backend import set_session
from tensorflow.python.training.saver import _import_meta_graph_with_return_elements
def save_ckpt(ckpt_path='test'):
'''save TensorFlow-1 Checkpoint '''
with tf.Graph().as_default() as g:
in_op = tf.constant(np.random.rand(1,2,2,2),name='input',dtype=tf.float32)
out_op = tf.keras.layers.Conv2D(3,(3,3),padding='same',name='MY_LAYER')(in_op)
saver = tf.compat.v1.train.Saver()
with tf.compat.v1.Session() as sess:
sess.run(tf.compat.v1.variables_initializer(tf.compat.v1.global_variables()))
saver.save(sess,ckpt_path)
def load_ckpt():
'''KerasModel from meta & ckpt'''
in_op = tf.keras.Input([2,2,2])
_m = tf.keras.models.Model(inputs=in_op,outputs=in_op)
with _m.input.graph.as_default() as g:
saver, out_op = _import_meta_graph_with_return_elements('test.meta',
input_map={'input':_m.output},
return_elements=[
# 'input:0',
'MY_LAYER/Conv2D:0'
])
with tf.compat.v1.Session() as sess:
saver.restore(sess,'test')
set_session(sess)
out_mdl = tf.keras.models.Model(inputs=_m.input, outputs=out_op[0])
return out_mdl
# main
save_ckpt() # save name based checkpoint
meta_model = load_ckpt() # restore in keras model
oo = meta_model(np.random.rand(1,2,2,2)) # run the model
print(oo)
but I get this error:
Traceback (most recent call last):
File "question2.py", line 38, in <module>
meta_model = load_ckpt() # restore in keras model
File "question2.py", line 32, in load_ckpt
out_mdl = tf.keras.models.Model(inputs=_m.input, outputs=out_op[0])
File "/home/dionyssos/tf2/lib/python3.6/site-packages/tensorflow_core/python/keras/engine/training.py", line 146, in __init__
super(Model, self).__init__(*args, **kwargs)
File "/home/dionyssos/tf2/lib/python3.6/site-packages/tensorflow_core/python/keras/engine/network.py", line 167, in __init__
self._init_graph_network(*args, **kwargs)
File "/home/dionyssos/tf2/lib/python3.6/site-packages/tensorflow_core/python/training/tracking/base.py", line 457, in _method_wrapper
result = method(self, *args, **kwargs)
File "/home/dionyssos/tf2/lib/python3.6/site-packages/tensorflow_core/python/keras/engine/network.py", line 270, in _init_graph_network
base_layer_utils.create_keras_history(self._nested_outputs)
File "/home/dionyssos/tf2/lib/python3.6/site-packages/tensorflow_core/python/keras/engine/base_layer_utils.py", line 184, in create_keras_history
_, created_layers = _create_keras_history_helper(tensors, set(), [])
File "/home/dionyssos/tf2/lib/python3.6/site-packages/tensorflow_core/python/keras/engine/base_layer_utils.py", line 229, in _create_keras_history_helper
constants[i] = backend.function([], op_input)([])
File "/home/dionyssos/tf2/lib/python3.6/site-packages/tensorflow_core/python/keras/backend.py", line 3740, in __call__
outputs = self._graph_fn(*converted_inputs)
File "/home/dionyssos/tf2/lib/python3.6/site-packages/tensorflow_core/python/eager/function.py", line 1081, in __call__
return self._call_impl(args, kwargs)
File "/home/dionyssos/tf2/lib/python3.6/site-packages/tensorflow_core/python/eager/function.py", line 1121, in _call_impl
return self._call_flat(args, self.captured_inputs, cancellation_manager)
File "/home/dionyssos/tf2/lib/python3.6/site-packages/tensorflow_core/python/eager/function.py", line 1224, in _call_flat
ctx, args, cancellation_manager=cancellation_manager)
File "/home/dionyssos/tf2/lib/python3.6/site-packages/tensorflow_core/python/eager/function.py", line 511, in call
ctx=ctx)
File "/home/dionyssos/tf2/lib/python3.6/site-packages/tensorflow_core/python/eager/execute.py", line 67, in quick_execute
six.raise_from(core._status_to_exception(e.code, message), None)
File "<string>", line 3, in raise_from
tensorflow.python.framework.errors_impl.FailedPreconditionError: Error while reading resource variable MY_LAYER/kernel from Container: localhost. This could mean that the variable was uninitialized. Not found: Container localhost does not exist. (Could not find resource: localhost/MY_LAYER/kernel)
[[node MY_LAYER/Conv2D/ReadVariableOp (defined at /home/dionyssos/tf2/lib/python3.6/site-packages/tensorflow_core/python/framework/ops.py:1751) ]] [Op:__inference_keras_scratch_graph_72]
Function call stack:
keras_scratch_graph
What I tried so far
Replacing MY_LAYER/Conv2D:0 with input:0 in _import_meta_graph_with_return_elements() makes the code run with no problem.

ValueError: The features to the model returned by input_fn must have static shape, using TPU Estimator API

I am trying to train a model using the TPU Estimator API on Cloud TPU. The error logs, and the code for reading my input data are attached below. I tried using the python debugger to determine where the bug is encountered. The control doesn't go out of the traing_input_fn function before the error is encountered. So, I believe my data pipeline is the source of the problem. Can someone please help me out with this problem? I will be happy to provide any more information, if necessary. Thanks
INFO:tensorflow:Error recorded from training_loop: The features to the model returned by input_fn must have static shape. Tensor: Tensor("Inf[25/1805]
dequeue:0", shape=(16, ?, 50, 1024), dtype=float32, device=/device:TPU_REPLICATED_CORE:0)
INFO:tensorflow:training_loop marked as finished
WARNING:tensorflow:Reraising captured error
Traceback (most recent call last):
File "estimator_task.py", line 303, in <module>
main(**arguments)
File "estimator_task.py", line 261, in main
estimator.train(input_fn=train_input_fn, max_steps=train_steps, hooks=hooks)
File "/home/abi/anaconda3/envs/myenv_3_5/lib/python3.5/site-packages/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py", line 2457, in train
rendezvous.raise_errors()
File "/home/abi/anaconda3/envs/myenv_3_5/lib/python3.5/site-packages/tensorflow/contrib/tpu/python/tpu/error_handling.py", line 128, in raise_error$
six.reraise(typ, value, traceback)
File "/home/abi/.local/lib/python3.5/site-packages/six.py", line 693, in reraise
raise value
File "/home/abi/anaconda3/envs/myenv_3_5/lib/python3.5/site-packages/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py", line 2452, in train
saving_listeners=saving_listeners)
File "/home/abi/anaconda3/envs/myenv_3_5/lib/python3.5/site-packages/tensorflow_estimator/python/estimator/estimator.py", line 358, in train
loss = self._train_model(input_fn, hooks, saving_listeners)
File "/home/abi/anaconda3/envs/myenv_3_5/lib/python3.5/site-packages/tensorflow_estimator/python/estimator/estimator.py", line 1124, in _train_mode$
return self._train_model_default(input_fn, hooks, saving_listeners)
File "/home/abi/anaconda3/envs/myenv_3_5/lib/python3.5/site-packages/tensorflow_estimator/python/estimator/estimator.py", line 1154, in _train_mode$
_default
features, labels, model_fn_lib.ModeKeys.TRAIN, self.config)
File "/home/abi/anaconda3/envs/myenv_3_5/lib/python3.5/site-packages/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py", line 2251, in _call_model$
fn
config)
File "/home/abi/anaconda3/envs/myenv_3_5/lib/python3.5/site-packages/tensorflow_estimator/python/estimator/estimator.py", line 1112, in _call_model$
fn
model_fn_results = self._model_fn(features=features, **kwargs)
File "/home/abi/anaconda3/envs/myenv_3_5/lib/python3.5/site-packages/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py", line 2558, in _model_fn
_train_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn))
File "/home/abi/anaconda3/envs/myenv_3_5/lib/python3.5/site-packages/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py", line 2893, in _train_on_t$
u_system
device_assignment=ctx.device_assignment)
File "/home/abi/anaconda3/envs/myenv_3_5/lib/python3.5/site-packages/tensorflow/contrib/tpu/python/tpu/tpu.py", line 890, in split_compile_and_shar$
name=name)
File "/home/abi/anaconda3/envs/myenv_3_5/lib/python3.5/site-packages/tensorflow/contrib/tpu/python/tpu/tpu.py", line 689, in split_compile_and_repl$
cate
outputs = computation(*computation_inputs)
File "/home/abi/anaconda3/envs/myenv_3_5/lib/python3.5/site-packages/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py", line 2886, in multi_tpu_t$
ain_steps_on_single_shard
[_INITIAL_LOSS])
File "/home/abi/anaconda3/envs/myenv_3_5/lib/python3.5/site-packages/tensorflow/contrib/tpu/python/tpu/training_loop.py", line 208, in repeat
cond, body_wrapper, inputs=inputs, infeed_queue=infeed_queue, name=name)
File "/home/abi/anaconda3/envs/myenv_3_5/lib/python3.5/site-packages/tensorflow/contrib/tpu/python/tpu/training_loop.py", line 170, in while_loop
condition_wrapper, body_wrapper, inputs, name="", parallel_iterations=1)
File "/home/abi/anaconda3/envs/myenv_3_5/lib/python3.5/site-packages/tensorflow/python/ops/control_flow_ops.py", line 3556, in while_loop
return_same_structure)
File "/home/abi/anaconda3/envs/myenv_3_5/lib/python3.5/site-packages/tensorflow/python/ops/control_flow_ops.py", line 3087, in BuildLoop
pred, body, original_loop_vars, loop_vars, shape_invariants)
File "/home/abi/anaconda3/envs/myenv_3_5/lib/python3.5/site-packages/tensorflow/python/ops/control_flow_ops.py", line 3022, in _BuildLoop
body_result = body(*packed_vars_for_body)
File "/home/abi/anaconda3/envs/myenv_3_5/lib/python3.5/site-packages/tensorflow/contrib/tpu/python/tpu/training_loop.py", line 121, in body_wrapper
outputs = body(*(inputs + dequeue_ops))
File "/home/abi/anaconda3/envs/myenv_3_5/lib/python3.5/site-packages/tensorflow/contrib/tpu/python/tpu/training_loop.py", line 204, in body_wrapper
return [i + 1] + _convert_to_list(body(*args))
File "/home/abi/anaconda3/envs/myenv_3_5/lib/python3.5/site-packages/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py", line 1359, in train_step
self._call_model_fn(features, labels))
File "/home/abi/anaconda3/envs/myenv_3_5/lib/python3.5/site-packages/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py", line 1552, in _call_model_
fn
self._validate_model_features_and_labels(features, labels, is_export_mode)
File "/home/abi/anaconda3/envs/myenv_3_5/lib/python3.5/site-packages/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py", line 1546, in _validate_mo
del_features_and_labels
validate(features, 'features')
File "/home/abi/anaconda3/envs/myenv_3_5/lib/python3.5/site-packages/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py", line 1538, in validate
' Tensor: {}'.format(obj_name, obj))
ValueError: The features to the model returned by input_fn must have static shape. Tensor: Tensor("InfeedQueue/dequeue:0", shape=(16, ?, 50, 1024), dt
ype=float32, device=/device:TPU_REPLICATED_CORE:0)
This is my training data pipeline
def train_input_fn(params):
def decode_example(example_proto, t=50, dim=1024):
features = tf.parse_single_example(
example_proto,
features = {
'X': tf.FixedLenSequenceFeature([], tf.float32, allow_missing=True),
'Y': tf.FixedLenSequenceFeature([], tf.int64, allow_missing=True),
}
)
feat = features['X']
feat = tf.squeeze(feat)
feat.set_shape([t, dim])
labels = features['Y']
labels = tf.cast(labels, dtype=tf.int32)
return feat, labels
train_files = params["train_filenames"]
batch_size = params['batch_size']
dataset = tf.data.TFRecordDataset(train_files, num_parallel_reads=8)
dataset = dataset.apply(
tf.contrib.data.shuffle_and_repeat(buffer_size=100))
dataset = dataset.apply(
tf.contrib.data.map_and_batch(decode_example, batch_size, drop_remainder=False))
dataset = dataset.prefetch(1)
dataset = dataset.apply(tf.contrib.data.batch_and_drop_remainder(batch_size))
return dataset
I had a very similar problem and I solved it by setting the shape dimensions with an explicit value, in your case:
feat.set_shape([50, 1024])
Not very convenient, but it worked for me.

How to reuse slim.arg_scope in tensorFlow?

I'm trying to load inception_resnet_v2_2016_08_30.ckpt file and do testing.
The code works well with single image (entering oneFile() function only once).
If I call oneFile() function twice, the following error occur:
ValueError: Variable InceptionResnetV2/Conv2d_1a_3x3/weights already
exists, disallowed. Did you mean to set reuse=True in VarScope?
Originally defined at:
I found related solution on Sharing Variables
If tf.variable_scope meet the same problem, could call scope.reuse_variables() to resolve this problem.
But I can't find the slim.arg_scope version to reuse the scope.
def oneFile(filepath):
imgPath = filepath
testImage_string = tf.gfile.FastGFile(imgPath, 'rb').read()
testImage = tf.image.decode_jpeg(testImage_string, channels=3)
processed_image = inception_preprocessing.preprocess_image(testImage, image_size, image_size, is_training=False)
processed_images = tf.expand_dims(processed_image, 0)
# Create the model, use the default arg scope to configure the batch norm parameters.
with slim.arg_scope(inception_resnet_v2_arg_scope()):
#logits, end_points = inception_resnet_v2(images, num_classes = dataset.num_classes, is_training = False)
logits, _ = inception_resnet_v2(processed_images, num_classes=16, is_training=False)
probabilities = tf.nn.softmax(logits)
init_fn = slim.assign_from_checkpoint_fn(
checkpoint_file,
slim.get_model_variables(model_name))
with tf.Session() as sess:
init_fn(sess)
np_image, probabilities = sess.run([processed_images, probabilities])
probabilities = probabilities[0, 0:]
sorted_inds = [i[0] for i in sorted(enumerate(-probabilities), key=lambda x: x[1])]
#print(probabilities)
print(probabilities.argmax(axis=0))
#names = imagenet.create_readable_names_for_imagenet_labels()
#for i in range(15):
# index = sorted_inds[i]
# print((probabilities[index], names[index]))
def main():
for image_file in os.listdir(dataset_dir):
try:
image_type = imghdr.what(os.path.join(dataset_dir, image_file))
if not image_type:
continue
except IsADirectoryError:
continue
#image = Image.open(os.path.join(dataset_dir, image_file))
filepath = os.path.join(dataset_dir, image_file)
oneFile(filepath)
inception_resnet_v2_arg_scope
def inception_resnet_v2_arg_scope(weight_decay=0.00004,
batch_norm_decay=0.9997,
batch_norm_epsilon=0.001):
"""Yields the scope with the default parameters for inception_resnet_v2.
Args:
weight_decay: the weight decay for weights variables.
batch_norm_decay: decay for the moving average of batch_norm momentums.
batch_norm_epsilon: small float added to variance to avoid dividing by zero.
Returns:
a arg_scope with the parameters needed for inception_resnet_v2.
"""
# Set weight_decay for weights in conv2d and fully_connected layers.
with slim.arg_scope([slim.conv2d, slim.fully_connected],
weights_regularizer=slim.l2_regularizer(weight_decay),
biases_regularizer=slim.l2_regularizer(weight_decay)):
batch_norm_params = {
'decay': batch_norm_decay,
'epsilon': batch_norm_epsilon,
}
# Set activation_fn and parameters for batch_norm.
with slim.arg_scope([slim.conv2d], activation_fn=tf.nn.relu,
normalizer_fn=slim.batch_norm,
normalizer_params=batch_norm_params) as scope:
return scope
Complete error message:
./data/test/teeth/1/7070.jpg Traceback (most recent call last): File
"testing.py", line 111, in
main() File "testing.py", line 106, in main
cal(processed_images) File "testing.py", line 67, in cal
logits, _ = inception_resnet_v2(processed_images, num_classes=16, is_training=False) File
"/notebooks/transfer_learning_tutorial/inception_resnet_v2.py", line
123, in inception_resnet_v2
scope='Conv2d_1a_3x3') File "/usr/local/lib/python3.5/dist-packages/tensorflow/contrib/framework/python/ops/arg_scope.py",
line 181, in func_with_args
return func(*args, **current_args) File "/usr/local/lib/python3.5/dist-packages/tensorflow/contrib/layers/python/layers/layers.py",
line 918, in convolution
outputs = layer.apply(inputs) File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/layers/base.py",
line 320, in apply
return self.call(inputs, **kwargs) File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/layers/base.py",
line 286, in call
self.build(input_shapes[0]) File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/layers/convolutional.py",
line 138, in build
dtype=self.dtype) File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/variable_scope.py",
line 1049, in get_variable
use_resource=use_resource, custom_getter=custom_getter) File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/variable_scope.py",
line 948, in get_variable
use_resource=use_resource, custom_getter=custom_getter) File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/variable_scope.py",
line 349, in get_variable
validate_shape=validate_shape, use_resource=use_resource) File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/variable_scope.py",
line 1389, in wrapped_custom_getter
*args, **kwargs) File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/layers/base.py",
line 275, in variable_getter
variable_getter=functools.partial(getter, **kwargs)) File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/layers/base.py",
line 228, in _add_variable
trainable=trainable and self.trainable) File "/usr/local/lib/python3.5/dist-packages/tensorflow/contrib/layers/python/layers/layers.py",
line 1334, in layer_variable_getter
return _model_variable_getter(getter, *args, **kwargs) File "/usr/local/lib/python3.5/dist-packages/tensorflow/contrib/layers/python/layers/layers.py",
line 1326, in _model_variable_getter
custom_getter=getter, use_resource=use_resource) File "/usr/local/lib/python3.5/dist-packages/tensorflow/contrib/framework/python/ops/arg_scope.py",
line 181, in func_with_args
return func(*args, **current_args) File "/usr/local/lib/python3.5/dist-packages/tensorflow/contrib/framework/python/ops/variables.py",
line 262, in model_variable
use_resource=use_resource) File "/usr/local/lib/python3.5/dist-packages/tensorflow/contrib/framework/python/ops/arg_scope.py",
line 181, in func_with_args
return func(*args, **current_args) File "/usr/local/lib/python3.5/dist-packages/tensorflow/contrib/framework/python/ops/variables.py",
line 217, in variable
use_resource=use_resource) File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/variable_scope.py",
line 341, in _true_getter
use_resource=use_resource) File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/variable_scope.py",
line 653, in _get_single_variable
name, "".join(traceback.format_list(tb)))) ValueError: Variable InceptionResnetV2/Conv2d_1a_3x3/weights already exists, disallowed.
Did you mean to set reuse=True in VarScope? Originally defined at:
File
"/usr/local/lib/python3.5/dist-packages/tensorflow/contrib/framework/python/ops/variables.py",
line 217, in variable
use_resource=use_resource) File "/usr/local/lib/python3.5/dist-packages/tensorflow/contrib/framework/python/ops/arg_scope.py",
line 181, in func_with_args
return func(*args, **current_args) File "/usr/local/lib/python3.5/dist-packages/tensorflow/contrib/framework/python/ops/variables.py",
line 262, in model_variable
use_resource=use_resource)
It seems like tf.reset_default_graph() before processing each image in your oneFile() function will solve this problem, as I encountered the same issue on a very similar example code. My understanding is that once you feed the image to the neural network (NN), because of the variable scope concept TensorFlow uses, it needs to be told that the variables can be reused before you can apply the NN to another image.
My guess would be that you specified the same scope for multiple variables in the graph. This error occurs when tensorflow finds multiple variables under the same scope which is irrespective of the next image or the next batch. When you create the graph, you should create it thinking about one image or batch only. If everything works well with the first batch or first image, tensorflow will take care of the next iterations including the scoping.
So check all the scopes in your model file. I am pretty sure you used the same name twice.

Categories