I'm trying to load the inception_resnet_v2_2016_08_30.ckpt file and do testing.
The code works well with a single image (entering the oneFile() function only once).
If I call the oneFile() function twice, the following error occurs:
ValueError: Variable InceptionResnetV2/Conv2d_1a_3x3/weights already
exists, disallowed. Did you mean to set reuse=True in VarScope?
Originally defined at:
I found a related solution on Sharing Variables.
If tf.variable_scope runs into the same problem, scope.reuse_variables() can be called to resolve it.
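For reference, that pattern looks roughly like this (just a sketch; build_net stands in for any network-building function and is not part of my code):

with tf.variable_scope("model") as scope:
    out1 = build_net(image1)   # first call creates the variables
    scope.reuse_variables()
    out2 = build_net(image2)   # second call reuses them instead of recreating them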
But I can't find the slim.arg_scope equivalent for reusing the scope. Here is my code:
def oneFile(filepath):
    imgPath = filepath
    testImage_string = tf.gfile.FastGFile(imgPath, 'rb').read()
    testImage = tf.image.decode_jpeg(testImage_string, channels=3)
    processed_image = inception_preprocessing.preprocess_image(testImage, image_size, image_size, is_training=False)
    processed_images = tf.expand_dims(processed_image, 0)

    # Create the model, use the default arg scope to configure the batch norm parameters.
    with slim.arg_scope(inception_resnet_v2_arg_scope()):
        #logits, end_points = inception_resnet_v2(images, num_classes=dataset.num_classes, is_training=False)
        logits, _ = inception_resnet_v2(processed_images, num_classes=16, is_training=False)

    probabilities = tf.nn.softmax(logits)

    init_fn = slim.assign_from_checkpoint_fn(
        checkpoint_file,
        slim.get_model_variables(model_name))

    with tf.Session() as sess:
        init_fn(sess)

        np_image, probabilities = sess.run([processed_images, probabilities])
        probabilities = probabilities[0, 0:]
        sorted_inds = [i[0] for i in sorted(enumerate(-probabilities), key=lambda x: x[1])]
        #print(probabilities)
        print(probabilities.argmax(axis=0))
        #names = imagenet.create_readable_names_for_imagenet_labels()
        #for i in range(15):
        #    index = sorted_inds[i]
        #    print((probabilities[index], names[index]))
def main():
    for image_file in os.listdir(dataset_dir):
        try:
            image_type = imghdr.what(os.path.join(dataset_dir, image_file))
            if not image_type:
                continue
        except IsADirectoryError:
            continue
        #image = Image.open(os.path.join(dataset_dir, image_file))
        filepath = os.path.join(dataset_dir, image_file)
        oneFile(filepath)
Here is inception_resnet_v2_arg_scope:
def inception_resnet_v2_arg_scope(weight_decay=0.00004,
                                  batch_norm_decay=0.9997,
                                  batch_norm_epsilon=0.001):
    """Yields the scope with the default parameters for inception_resnet_v2.

    Args:
      weight_decay: the weight decay for weights variables.
      batch_norm_decay: decay for the moving average of batch_norm momentums.
      batch_norm_epsilon: small float added to variance to avoid dividing by zero.

    Returns:
      an arg_scope with the parameters needed for inception_resnet_v2.
    """
    # Set weight_decay for weights in conv2d and fully_connected layers.
    with slim.arg_scope([slim.conv2d, slim.fully_connected],
                        weights_regularizer=slim.l2_regularizer(weight_decay),
                        biases_regularizer=slim.l2_regularizer(weight_decay)):
        batch_norm_params = {
            'decay': batch_norm_decay,
            'epsilon': batch_norm_epsilon,
        }
        # Set activation_fn and parameters for batch_norm.
        with slim.arg_scope([slim.conv2d], activation_fn=tf.nn.relu,
                            normalizer_fn=slim.batch_norm,
                            normalizer_params=batch_norm_params) as scope:
            return scope
Complete error message:
./data/test/teeth/1/7070.jpg
Traceback (most recent call last):
  File "testing.py", line 111, in <module>
    main()
  File "testing.py", line 106, in main
    cal(processed_images)
  File "testing.py", line 67, in cal
    logits, _ = inception_resnet_v2(processed_images, num_classes=16, is_training=False)
  File "/notebooks/transfer_learning_tutorial/inception_resnet_v2.py", line 123, in inception_resnet_v2
    scope='Conv2d_1a_3x3')
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/contrib/framework/python/ops/arg_scope.py", line 181, in func_with_args
    return func(*args, **current_args)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/contrib/layers/python/layers/layers.py", line 918, in convolution
    outputs = layer.apply(inputs)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/layers/base.py", line 320, in apply
    return self.call(inputs, **kwargs)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/layers/base.py", line 286, in call
    self.build(input_shapes[0])
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/layers/convolutional.py", line 138, in build
    dtype=self.dtype)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/variable_scope.py", line 1049, in get_variable
    use_resource=use_resource, custom_getter=custom_getter)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/variable_scope.py", line 948, in get_variable
    use_resource=use_resource, custom_getter=custom_getter)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/variable_scope.py", line 349, in get_variable
    validate_shape=validate_shape, use_resource=use_resource)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/variable_scope.py", line 1389, in wrapped_custom_getter
    *args, **kwargs)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/layers/base.py", line 275, in variable_getter
    variable_getter=functools.partial(getter, **kwargs))
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/layers/base.py", line 228, in _add_variable
    trainable=trainable and self.trainable)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/contrib/layers/python/layers/layers.py", line 1334, in layer_variable_getter
    return _model_variable_getter(getter, *args, **kwargs)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/contrib/layers/python/layers/layers.py", line 1326, in _model_variable_getter
    custom_getter=getter, use_resource=use_resource)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/contrib/framework/python/ops/arg_scope.py", line 181, in func_with_args
    return func(*args, **current_args)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/contrib/framework/python/ops/variables.py", line 262, in model_variable
    use_resource=use_resource)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/contrib/framework/python/ops/arg_scope.py", line 181, in func_with_args
    return func(*args, **current_args)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/contrib/framework/python/ops/variables.py", line 217, in variable
    use_resource=use_resource)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/variable_scope.py", line 341, in _true_getter
    use_resource=use_resource)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/variable_scope.py", line 653, in _get_single_variable
    name, "".join(traceback.format_list(tb))))
ValueError: Variable InceptionResnetV2/Conv2d_1a_3x3/weights already exists, disallowed. Did you mean to set reuse=True in VarScope? Originally defined at:

  File "/usr/local/lib/python3.5/dist-packages/tensorflow/contrib/framework/python/ops/variables.py", line 217, in variable
    use_resource=use_resource)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/contrib/framework/python/ops/arg_scope.py", line 181, in func_with_args
    return func(*args, **current_args)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/contrib/framework/python/ops/variables.py", line 262, in model_variable
    use_resource=use_resource)
It seems that calling tf.reset_default_graph() before processing each image in your oneFile() function will solve this problem, as I encountered the same issue on a very similar example. My understanding is that once you feed the image to the neural network (NN), because of the variable scope concept TensorFlow uses, it needs to be told that the variables can be reused before you can apply the NN to another image.
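For example, a minimal sketch based on your oneFile() above (everything after the first added line stays as in your function):

def oneFile(filepath):
    tf.reset_default_graph()   # start from a fresh graph for every image
    imgPath = filepath
    testImage_string = tf.gfile.FastGFile(imgPath, 'rb').read()
    # ... rest of the function unchanged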
My guess would be that you specified the same scope for multiple variables in the graph. This error occurs when TensorFlow finds multiple variables under the same scope, irrespective of the next image or the next batch. When you create the graph, you should create it thinking about one image or batch only. If everything works well with the first batch or first image, TensorFlow will take care of the next iterations, including the scoping.
So check all the scopes in your model file. I am pretty sure you used the same name twice.
Related
I am following this repo:
https://github.com/NVIDIA/NeMo/tree/main/examples/nlp/entity_linking
Here is a small tutorial:
https://colab.research.google.com/github/NVIDIA/NeMo/blob/v1.0.2/tutorials/nlp/Entity_Linking_Medical.ipynb
Before starting this tutorial, change the branch to r1.10.0.
When I train this model on the entire UMLS dataset with the given commands, it produces the following error:
In automatic_optimization, when `training_step` returns a dict, the 'loss' key needs to be present
I checked the training_step method and it looks fine:
def training_step(self, batch, batch_idx):
    """
    Lightning calls this inside the training loop with the data from the training dataloader
    passed in as `batch`.
    """
    input_ids, token_type_ids, attention_mask, concept_ids = batch
    logits = self.forward(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)
    train_loss = self.loss(logits=logits, labels=concept_ids)

    # No hard examples found in batch,
    # shouldn't use this batch to update model weights
    if train_loss == 0:
        train_loss = None
        lr = None
    else:
        lr = self._optimizer.param_groups[0]["lr"]
        self.log("train_loss", train_loss)
        self.log("lr", lr, prog_bar=True)

    return {"loss": train_loss, "lr": lr}
Here is a full stacktrace:
[NeMo I 2022-07-29 18:29:27 multi_similarity_loss:91] Encountered zero loss in multisimloss, loss = 0.0. No hard examples found in the batch
Error executing job with overrides: ['project_dir=.']
Traceback (most recent call last):
File "self_alignment_pretraining.py", line 38, in main
trainer.fit(model)
File "/home/umair/miniconda3/envs/aemap/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 769, in fit
self._fit_impl, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path
File "/home/umair/miniconda3/envs/aemap/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 719, in _call_and_handle_interrupt
return self.strategy.launcher.launch(trainer_fn, *args, trainer=self, **kwargs)
File "/home/umair/miniconda3/envs/aemap/lib/python3.7/site-packages/pytorch_lightning/strategies/launchers/subprocess_script.py", line 93, in launch
return function(*args, **kwargs)
File "/home/umair/miniconda3/envs/aemap/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 809, in _fit_impl
results = self._run(model, ckpt_path=self.ckpt_path)
File "/home/umair/miniconda3/envs/aemap/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 1234, in _run
results = self._run_stage()
File "/home/umair/miniconda3/envs/aemap/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 1321, in _run_stage
return self._run_train()
File "/home/umair/miniconda3/envs/aemap/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 1351, in _run_train
self.fit_loop.run()
File "/home/umair/miniconda3/envs/aemap/lib/python3.7/site-packages/pytorch_lightning/loops/base.py", line 204, in run
self.advance(*args, **kwargs)
File "/home/umair/miniconda3/envs/aemap/lib/python3.7/site-packages/pytorch_lightning/loops/fit_loop.py", line 268, in advance
self._outputs = self.epoch_loop.run(self._data_fetcher)
File "/home/umair/miniconda3/envs/aemap/lib/python3.7/site-packages/pytorch_lightning/loops/base.py", line 204, in run
self.advance(*args, **kwargs)
File "/home/umair/miniconda3/envs/aemap/lib/python3.7/site-packages/pytorch_lightning/loops/epoch/training_epoch_loop.py", line 208, in advance
batch_output = self.batch_loop.run(batch, batch_idx)
File "/home/umair/miniconda3/envs/aemap/lib/python3.7/site-packages/pytorch_lightning/loops/base.py", line 204, in run
self.advance(*args, **kwargs)
File "/home/umair/miniconda3/envs/aemap/lib/python3.7/site-packages/pytorch_lightning/loops/batch/training_batch_loop.py", line 88, in advance
outputs = self.optimizer_loop.run(split_batch, optimizers, batch_idx)
File "/home/umair/miniconda3/envs/aemap/lib/python3.7/site-packages/pytorch_lightning/loops/base.py", line 204, in run
self.advance(*args, **kwargs)
File "/home/umair/miniconda3/envs/aemap/lib/python3.7/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 207, in advance
self.optimizer_idx,
File "/home/umair/miniconda3/envs/aemap/lib/python3.7/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 256, in _run_optimization
self._optimizer_step(optimizer, opt_idx, batch_idx, closure)
File "/home/umair/miniconda3/envs/aemap/lib/python3.7/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 378, in _optimizer_step
using_lbfgs=is_lbfgs,
File "/home/umair/miniconda3/envs/aemap/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 1593, in _call_lightning_module_hook
output = fn(*args, **kwargs)
File "/home/umair/miniconda3/envs/aemap/lib/python3.7/site-packages/pytorch_lightning/core/lightning.py", line 1644, in optimizer_step
optimizer.step(closure=optimizer_closure)
File "/home/umair/miniconda3/envs/aemap/lib/python3.7/site-packages/pytorch_lightning/core/optimizer.py", line 168, in step
step_output = self._strategy.optimizer_step(self._optimizer, self._optimizer_idx, closure, **kwargs)
File "/home/umair/miniconda3/envs/aemap/lib/python3.7/site-packages/pytorch_lightning/strategies/ddp.py", line 278, in optimizer_step
optimizer_output = super().optimizer_step(optimizer, opt_idx, closure, model, **kwargs)
File "/home/umair/miniconda3/envs/aemap/lib/python3.7/site-packages/pytorch_lightning/strategies/strategy.py", line 193, in optimizer_step
return self.precision_plugin.optimizer_step(model, optimizer, opt_idx, closure, **kwargs)
File "/home/umair/miniconda3/envs/aemap/lib/python3.7/site-packages/pytorch_lightning/plugins/precision/native_amp.py", line 85, in optimizer_step
closure_result = closure()
File "/home/umair/miniconda3/envs/aemap/lib/python3.7/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 148, in __call__
self._result = self.closure(*args, **kwargs)
File "/home/umair/miniconda3/envs/aemap/lib/python3.7/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 134, in closure
step_output = self._step_fn()
File "/home/umair/miniconda3/envs/aemap/lib/python3.7/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 437, in _training_step
training_step_output, self.trainer.accumulate_grad_batches
File "/home/umair/miniconda3/envs/aemap/lib/python3.7/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 75, in from_training_step_output
"In automatic_optimization, when `training_step` returns a dict, the 'loss' key needs to be present"
pytorch_lightning.utilities.exceptions.MisconfigurationException: In automatic_optimization, when `training_step` returns a dict, the 'loss' key needs to be present
Set the environment variable HYDRA_FULL_ERROR=1 for a complete stack trace.
You get this error message about "the 'loss' key needs to be present" because in some training steps you return the dict {"loss": None}. This happens in your code here:

if train_loss == 0:
    train_loss = None
    lr = None

where you set train_loss = None. Lightning does not like that, because it wants the loss to be a tensor with a computation graph attached.
If you wish to skip the optimization step completely, just return None from the training_step method, like this:
if train_loss == 0:
    return None
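Applied to the training_step from the question, the whole method would then look roughly like this (a sketch that keeps your variable names; only the zero-loss branch changes):

def training_step(self, batch, batch_idx):
    input_ids, token_type_ids, attention_mask, concept_ids = batch
    logits = self.forward(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)
    train_loss = self.loss(logits=logits, labels=concept_ids)

    # No hard examples found in the batch: skip the optimizer step entirely
    if train_loss == 0:
        return None

    lr = self._optimizer.param_groups[0]["lr"]
    self.log("train_loss", train_loss)
    self.log("lr", lr, prog_bar=True)
    return {"loss": train_loss, "lr": lr}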
I want to load a TensorFlow 1 checkpoint (consisting of .index, .meta and .data-00000-of-00001 files) into TensorFlow 2.0.0 and convert it to a Keras model, so that I can use it natively in eager mode without needing tf.Session. Here is the code I ran:
import tensorflow as tf
import numpy as np
from tensorflow.python.keras.backend import set_session
from tensorflow.python.training.saver import _import_meta_graph_with_return_elements


def save_ckpt(ckpt_path='test'):
    '''save TensorFlow-1 Checkpoint'''
    with tf.Graph().as_default() as g:
        in_op = tf.constant(np.random.rand(1, 2, 2, 2), name='input', dtype=tf.float32)
        out_op = tf.keras.layers.Conv2D(3, (3, 3), padding='same', name='MY_LAYER')(in_op)
        saver = tf.compat.v1.train.Saver()
        with tf.compat.v1.Session() as sess:
            sess.run(tf.compat.v1.variables_initializer(tf.compat.v1.global_variables()))
            saver.save(sess, ckpt_path)


def load_ckpt():
    '''KerasModel from meta & ckpt'''
    in_op = tf.keras.Input([2, 2, 2])
    _m = tf.keras.models.Model(inputs=in_op, outputs=in_op)
    with _m.input.graph.as_default() as g:
        saver, out_op = _import_meta_graph_with_return_elements(
            'test.meta',
            input_map={'input': _m.output},
            return_elements=[
                # 'input:0',
                'MY_LAYER/Conv2D:0'
            ])
        with tf.compat.v1.Session() as sess:
            saver.restore(sess, 'test')
            set_session(sess)
            out_mdl = tf.keras.models.Model(inputs=_m.input, outputs=out_op[0])
    return out_mdl


# main
save_ckpt()                                 # save name based checkpoint
meta_model = load_ckpt()                    # restore in keras model
oo = meta_model(np.random.rand(1, 2, 2, 2)) # run the model
print(oo)
but I get this error:
Traceback (most recent call last):
File "question2.py", line 38, in <module>
meta_model = load_ckpt() # restore in keras model
File "question2.py", line 32, in load_ckpt
out_mdl = tf.keras.models.Model(inputs=_m.input, outputs=out_op[0])
File "/home/dionyssos/tf2/lib/python3.6/site-packages/tensorflow_core/python/keras/engine/training.py", line 146, in __init__
super(Model, self).__init__(*args, **kwargs)
File "/home/dionyssos/tf2/lib/python3.6/site-packages/tensorflow_core/python/keras/engine/network.py", line 167, in __init__
self._init_graph_network(*args, **kwargs)
File "/home/dionyssos/tf2/lib/python3.6/site-packages/tensorflow_core/python/training/tracking/base.py", line 457, in _method_wrapper
result = method(self, *args, **kwargs)
File "/home/dionyssos/tf2/lib/python3.6/site-packages/tensorflow_core/python/keras/engine/network.py", line 270, in _init_graph_network
base_layer_utils.create_keras_history(self._nested_outputs)
File "/home/dionyssos/tf2/lib/python3.6/site-packages/tensorflow_core/python/keras/engine/base_layer_utils.py", line 184, in create_keras_history
_, created_layers = _create_keras_history_helper(tensors, set(), [])
File "/home/dionyssos/tf2/lib/python3.6/site-packages/tensorflow_core/python/keras/engine/base_layer_utils.py", line 229, in _create_keras_history_helper
constants[i] = backend.function([], op_input)([])
File "/home/dionyssos/tf2/lib/python3.6/site-packages/tensorflow_core/python/keras/backend.py", line 3740, in __call__
outputs = self._graph_fn(*converted_inputs)
File "/home/dionyssos/tf2/lib/python3.6/site-packages/tensorflow_core/python/eager/function.py", line 1081, in __call__
return self._call_impl(args, kwargs)
File "/home/dionyssos/tf2/lib/python3.6/site-packages/tensorflow_core/python/eager/function.py", line 1121, in _call_impl
return self._call_flat(args, self.captured_inputs, cancellation_manager)
File "/home/dionyssos/tf2/lib/python3.6/site-packages/tensorflow_core/python/eager/function.py", line 1224, in _call_flat
ctx, args, cancellation_manager=cancellation_manager)
File "/home/dionyssos/tf2/lib/python3.6/site-packages/tensorflow_core/python/eager/function.py", line 511, in call
ctx=ctx)
File "/home/dionyssos/tf2/lib/python3.6/site-packages/tensorflow_core/python/eager/execute.py", line 67, in quick_execute
six.raise_from(core._status_to_exception(e.code, message), None)
File "<string>", line 3, in raise_from
tensorflow.python.framework.errors_impl.FailedPreconditionError: Error while reading resource variable MY_LAYER/kernel from Container: localhost. This could mean that the variable was uninitialized. Not found: Container localhost does not exist. (Could not find resource: localhost/MY_LAYER/kernel)
[[node MY_LAYER/Conv2D/ReadVariableOp (defined at /home/dionyssos/tf2/lib/python3.6/site-packages/tensorflow_core/python/framework/ops.py:1751) ]] [Op:__inference_keras_scratch_graph_72]
Function call stack:
keras_scratch_graph
What I tried so far
Replacing MY_LAYER/Conv2D:0 with input:0 in _import_meta_graph_with_return_elements() makes the code run with no problem.
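That is, keeping everything else in load_ckpt() the same and only changing the returned element:

saver, out_op = _import_meta_graph_with_return_elements(
    'test.meta',
    input_map={'input': _m.output},
    return_elements=['input:0'])   # runs fine; 'MY_LAYER/Conv2D:0' triggers the error above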
I am trying to train a model using the TPU Estimator API on a Cloud TPU. The error logs and the code for reading my input data are attached below. I tried using the Python debugger to determine where the bug is encountered: control doesn't leave the train_input_fn function before the error occurs, so I believe my data pipeline is the source of the problem. Can someone please help me out with this problem? I will be happy to provide any more information, if necessary. Thanks
INFO:tensorflow:Error recorded from training_loop: The features to the model returned by input_fn must have static shape. Tensor: Tensor("InfeedQueue/dequeue:0", shape=(16, ?, 50, 1024), dtype=float32, device=/device:TPU_REPLICATED_CORE:0)
INFO:tensorflow:training_loop marked as finished
WARNING:tensorflow:Reraising captured error
Traceback (most recent call last):
File "estimator_task.py", line 303, in <module>
main(**arguments)
File "estimator_task.py", line 261, in main
estimator.train(input_fn=train_input_fn, max_steps=train_steps, hooks=hooks)
File "/home/abi/anaconda3/envs/myenv_3_5/lib/python3.5/site-packages/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py", line 2457, in train
rendezvous.raise_errors()
File "/home/abi/anaconda3/envs/myenv_3_5/lib/python3.5/site-packages/tensorflow/contrib/tpu/python/tpu/error_handling.py", line 128, in raise_error$
six.reraise(typ, value, traceback)
File "/home/abi/.local/lib/python3.5/site-packages/six.py", line 693, in reraise
raise value
File "/home/abi/anaconda3/envs/myenv_3_5/lib/python3.5/site-packages/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py", line 2452, in train
saving_listeners=saving_listeners)
File "/home/abi/anaconda3/envs/myenv_3_5/lib/python3.5/site-packages/tensorflow_estimator/python/estimator/estimator.py", line 358, in train
loss = self._train_model(input_fn, hooks, saving_listeners)
File "/home/abi/anaconda3/envs/myenv_3_5/lib/python3.5/site-packages/tensorflow_estimator/python/estimator/estimator.py", line 1124, in _train_mode$
return self._train_model_default(input_fn, hooks, saving_listeners)
File "/home/abi/anaconda3/envs/myenv_3_5/lib/python3.5/site-packages/tensorflow_estimator/python/estimator/estimator.py", line 1154, in _train_mode$
_default
features, labels, model_fn_lib.ModeKeys.TRAIN, self.config)
File "/home/abi/anaconda3/envs/myenv_3_5/lib/python3.5/site-packages/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py", line 2251, in _call_model$
fn
config)
File "/home/abi/anaconda3/envs/myenv_3_5/lib/python3.5/site-packages/tensorflow_estimator/python/estimator/estimator.py", line 1112, in _call_model$
fn
model_fn_results = self._model_fn(features=features, **kwargs)
File "/home/abi/anaconda3/envs/myenv_3_5/lib/python3.5/site-packages/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py", line 2558, in _model_fn
_train_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn))
File "/home/abi/anaconda3/envs/myenv_3_5/lib/python3.5/site-packages/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py", line 2893, in _train_on_t$
u_system
device_assignment=ctx.device_assignment)
File "/home/abi/anaconda3/envs/myenv_3_5/lib/python3.5/site-packages/tensorflow/contrib/tpu/python/tpu/tpu.py", line 890, in split_compile_and_shar$
name=name)
File "/home/abi/anaconda3/envs/myenv_3_5/lib/python3.5/site-packages/tensorflow/contrib/tpu/python/tpu/tpu.py", line 689, in split_compile_and_repl$
cate
outputs = computation(*computation_inputs)
File "/home/abi/anaconda3/envs/myenv_3_5/lib/python3.5/site-packages/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py", line 2886, in multi_tpu_t$
ain_steps_on_single_shard
[_INITIAL_LOSS])
File "/home/abi/anaconda3/envs/myenv_3_5/lib/python3.5/site-packages/tensorflow/contrib/tpu/python/tpu/training_loop.py", line 208, in repeat
cond, body_wrapper, inputs=inputs, infeed_queue=infeed_queue, name=name)
File "/home/abi/anaconda3/envs/myenv_3_5/lib/python3.5/site-packages/tensorflow/contrib/tpu/python/tpu/training_loop.py", line 170, in while_loop
condition_wrapper, body_wrapper, inputs, name="", parallel_iterations=1)
File "/home/abi/anaconda3/envs/myenv_3_5/lib/python3.5/site-packages/tensorflow/python/ops/control_flow_ops.py", line 3556, in while_loop
return_same_structure)
File "/home/abi/anaconda3/envs/myenv_3_5/lib/python3.5/site-packages/tensorflow/python/ops/control_flow_ops.py", line 3087, in BuildLoop
pred, body, original_loop_vars, loop_vars, shape_invariants)
File "/home/abi/anaconda3/envs/myenv_3_5/lib/python3.5/site-packages/tensorflow/python/ops/control_flow_ops.py", line 3022, in _BuildLoop
body_result = body(*packed_vars_for_body)
File "/home/abi/anaconda3/envs/myenv_3_5/lib/python3.5/site-packages/tensorflow/contrib/tpu/python/tpu/training_loop.py", line 121, in body_wrapper
outputs = body(*(inputs + dequeue_ops))
File "/home/abi/anaconda3/envs/myenv_3_5/lib/python3.5/site-packages/tensorflow/contrib/tpu/python/tpu/training_loop.py", line 204, in body_wrapper
return [i + 1] + _convert_to_list(body(*args))
File "/home/abi/anaconda3/envs/myenv_3_5/lib/python3.5/site-packages/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py", line 1359, in train_step
self._call_model_fn(features, labels))
File "/home/abi/anaconda3/envs/myenv_3_5/lib/python3.5/site-packages/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py", line 1552, in _call_model_
fn
self._validate_model_features_and_labels(features, labels, is_export_mode)
File "/home/abi/anaconda3/envs/myenv_3_5/lib/python3.5/site-packages/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py", line 1546, in _validate_mo
del_features_and_labels
validate(features, 'features')
File "/home/abi/anaconda3/envs/myenv_3_5/lib/python3.5/site-packages/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py", line 1538, in validate
' Tensor: {}'.format(obj_name, obj))
ValueError: The features to the model returned by input_fn must have static shape. Tensor: Tensor("InfeedQueue/dequeue:0", shape=(16, ?, 50, 1024), dt
ype=float32, device=/device:TPU_REPLICATED_CORE:0)
This is my training data pipeline
def train_input_fn(params):
    def decode_example(example_proto, t=50, dim=1024):
        features = tf.parse_single_example(
            example_proto,
            features={
                'X': tf.FixedLenSequenceFeature([], tf.float32, allow_missing=True),
                'Y': tf.FixedLenSequenceFeature([], tf.int64, allow_missing=True),
            }
        )
        feat = features['X']
        feat = tf.squeeze(feat)
        feat.set_shape([t, dim])
        labels = features['Y']
        labels = tf.cast(labels, dtype=tf.int32)
        return feat, labels

    train_files = params["train_filenames"]
    batch_size = params['batch_size']
    dataset = tf.data.TFRecordDataset(train_files, num_parallel_reads=8)
    dataset = dataset.apply(
        tf.contrib.data.shuffle_and_repeat(buffer_size=100))
    dataset = dataset.apply(
        tf.contrib.data.map_and_batch(decode_example, batch_size, drop_remainder=False))
    dataset = dataset.prefetch(1)
    dataset = dataset.apply(tf.contrib.data.batch_and_drop_remainder(batch_size))
    return dataset
I had a very similar problem and I solved it by setting the shape dimensions with an explicit value, in your case:
feat.set_shape([50, 1024])
Not very convenient, but it worked for me.
I'm trying to do transfer learning of an Inception-resnet v2 model pretrained on imagenet, using my own dataset and classes.
My original codebase was a modification of a tf.slim sample which I can't find anymore, and now I'm trying to rewrite the same code using the tf.estimator.* framework.
However, I am running into the problem of loading only some of the weights from the pretrained checkpoint, while initializing the remaining layers with their default initializers.
Researching the problem, I found this GitHub issue and this question, both mentioning the need to use tf.train.init_from_checkpoint in my model_fn. I tried, but given the lack of examples in both, I guess I got something wrong.
This is my minimal example:
import sys
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
import tensorflow as tf
import numpy as np
import inception_resnet_v2

NUM_CLASSES = 900
IMAGE_SIZE = 299


def input_fn(mode, num_classes, batch_size=1):
    # some code that loads images, reshapes them to 299x299x3 and batches them
    return tf.constant(np.zeros([batch_size, 299, 299, 3], np.float32)), tf.one_hot(tf.constant(np.zeros([batch_size], np.int32)), NUM_CLASSES)


def model_fn(images, labels, num_classes, mode):
    with tf.contrib.slim.arg_scope(inception_resnet_v2.inception_resnet_v2_arg_scope()):
        logits, end_points = inception_resnet_v2.inception_resnet_v2(images,
                                                                     num_classes,
                                                                     is_training=(mode == tf.estimator.ModeKeys.TRAIN))

    predictions = {
        'classes': tf.argmax(input=logits, axis=1),
        'probabilities': tf.nn.softmax(logits, name='softmax_tensor')
    }

    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)

    exclude = ['InceptionResnetV2/Logits', 'InceptionResnetV2/AuxLogits']
    variables_to_restore = tf.contrib.slim.get_variables_to_restore(exclude=exclude)
    scopes = {os.path.dirname(v.name) for v in variables_to_restore}
    tf.train.init_from_checkpoint('inception_resnet_v2_2016_08_30.ckpt',
                                  {s + '/': s + '/' for s in scopes})

    tf.losses.softmax_cross_entropy(onehot_labels=labels, logits=logits)
    total_loss = tf.losses.get_total_loss()  # obtain the regularization losses as well

    # Configure the training op
    if mode == tf.estimator.ModeKeys.TRAIN:
        global_step = tf.train.get_or_create_global_step()
        optimizer = tf.train.AdamOptimizer(learning_rate=0.00002)
        train_op = optimizer.minimize(total_loss, global_step)
    else:
        train_op = None

    return tf.estimator.EstimatorSpec(
        mode=mode,
        predictions=predictions,
        loss=total_loss,
        train_op=train_op)


def main(unused_argv):
    # Create the Estimator
    classifier = tf.estimator.Estimator(
        model_fn=lambda features, labels, mode: model_fn(features, labels, NUM_CLASSES, mode),
        model_dir='model/MCVE')

    # Train the model
    classifier.train(
        input_fn=lambda: input_fn(tf.estimator.ModeKeys.TRAIN, NUM_CLASSES, batch_size=1),
        steps=1000)

    # Evaluate the model and print results
    eval_results = classifier.evaluate(
        input_fn=lambda: input_fn(tf.estimator.ModeKeys.EVAL, NUM_CLASSES, batch_size=1))
    print()
    print('Evaluation results:\n %s' % eval_results)


if __name__ == '__main__':
    tf.app.run(main=main, argv=[sys.argv[0]])
where inception_resnet_v2 is the model implementation in TensorFlow's models repository.
If I run this script, I get a bunch of info logs from init_from_checkpoint, but then, at session creation time, it seems to attempt to load the Logits weights from the checkpoint and fails because of incompatible shapes. This is the full traceback:
Traceback (most recent call last):
File "<ipython-input-6-06fadd69ae8f>", line 1, in <module>
runfile('C:/Users/1/Desktop/transfer_learning_tutorial-master/MCVE.py', wdir='C:/Users/1/Desktop/transfer_learning_tutorial-master')
File "C:\Users\1\Anaconda3\envs\tensorflow\lib\site-packages\spyder\utils\site\sitecustomize.py", line 710, in runfile
execfile(filename, namespace)
File "C:\Users\1\Anaconda3\envs\tensorflow\lib\site-packages\spyder\utils\site\sitecustomize.py", line 101, in execfile
exec(compile(f.read(), filename, 'exec'), namespace)
File "C:/Users/1/Desktop/transfer_learning_tutorial-master/MCVE.py", line 77, in <module>
tf.app.run(main=main, argv=[sys.argv[0]])
File "C:\Users\1\Anaconda3\envs\tensorflow\lib\site-packages\tensorflow\python\platform\app.py", line 48, in run
_sys.exit(main(_sys.argv[:1] + flags_passthrough))
File "C:/Users/1/Desktop/transfer_learning_tutorial-master/MCVE.py", line 68, in main
steps=1000)
File "C:\Users\1\Anaconda3\envs\tensorflow\lib\site-packages\tensorflow\python\estimator\estimator.py", line 302, in train
loss = self._train_model(input_fn, hooks, saving_listeners)
File "C:\Users\1\Anaconda3\envs\tensorflow\lib\site-packages\tensorflow\python\estimator\estimator.py", line 780, in _train_model
log_step_count_steps=self._config.log_step_count_steps) as mon_sess:
File "C:\Users\1\Anaconda3\envs\tensorflow\lib\site-packages\tensorflow\python\training\monitored_session.py", line 368, in MonitoredTrainingSession
stop_grace_period_secs=stop_grace_period_secs)
File "C:\Users\1\Anaconda3\envs\tensorflow\lib\site-packages\tensorflow\python\training\monitored_session.py", line 673, in __init__
stop_grace_period_secs=stop_grace_period_secs)
File "C:\Users\1\Anaconda3\envs\tensorflow\lib\site-packages\tensorflow\python\training\monitored_session.py", line 493, in __init__
self._sess = _RecoverableSession(self._coordinated_creator)
File "C:\Users\1\Anaconda3\envs\tensorflow\lib\site-packages\tensorflow\python\training\monitored_session.py", line 851, in __init__
_WrappedSession.__init__(self, self._create_session())
File "C:\Users\1\Anaconda3\envs\tensorflow\lib\site-packages\tensorflow\python\training\monitored_session.py", line 856, in _create_session
return self._sess_creator.create_session()
File "C:\Users\1\Anaconda3\envs\tensorflow\lib\site-packages\tensorflow\python\training\monitored_session.py", line 554, in create_session
self.tf_sess = self._session_creator.create_session()
File "C:\Users\1\Anaconda3\envs\tensorflow\lib\site-packages\tensorflow\python\training\monitored_session.py", line 428, in create_session
init_fn=self._scaffold.init_fn)
File "C:\Users\1\Anaconda3\envs\tensorflow\lib\site-packages\tensorflow\python\training\session_manager.py", line 279, in prepare_session
sess.run(init_op, feed_dict=init_feed_dict)
File "C:\Users\1\Anaconda3\envs\tensorflow\lib\site-packages\tensorflow\python\client\session.py", line 889, in run
run_metadata_ptr)
File "C:\Users\1\Anaconda3\envs\tensorflow\lib\site-packages\tensorflow\python\client\session.py", line 1120, in _run
feed_dict_tensor, options, run_metadata)
File "C:\Users\1\Anaconda3\envs\tensorflow\lib\site-packages\tensorflow\python\client\session.py", line 1317, in _do_run
options, run_metadata)
File "C:\Users\1\Anaconda3\envs\tensorflow\lib\site-packages\tensorflow\python\client\session.py", line 1336, in _do_call
raise type(e)(node_def, op, message)
InvalidArgumentError: Assign requires shapes of both tensors to match. lhs shape= [900] rhs shape= [1001] [[Node: Assign_1145 = Assign[T=DT_FLOAT,
_class=["loc:#InceptionResnetV2/Logits/Logits/biases"], use_locking=true, validate_shape=true,
_device="/job:localhost/replica:0/task:0/device:CPU:0"](InceptionResnetV2/Logits/Logits/biases, checkpoint_initializer_1145)]]
What am I doing wrong when using init_from_checkpoint? How exactly are we supposed to "use" it in our model_fn? And why is the estimator trying to load the Logits' weights from the checkpoint when I'm explicitly telling it not to?
Update:
After the suggestion in the comments, I tried alternative ways to call tf.train.init_from_checkpoint.
Using {v.name: v.name}
If, as suggested in the comment, I replace the call with {v.name:v.name for v in variables_to_restore}, I get this error:
ValueError: Assignment map with scope only name InceptionResnetV2/Conv2d_2a_3x3 should map
to scope only InceptionResnetV2/Conv2d_2a_3x3/weights:0. Should be 'scope/': 'other_scope/'.
Using {v.name: v}
If, instead, I try using the name:variable mapping, I get the following error:
ValueError: Tensor InceptionResnetV2/Conv2d_2a_3x3/weights:0 is not found in
inception_resnet_v2_2016_08_30.ckpt checkpoint
{'InceptionResnetV2/Repeat_2/block8_4/Branch_1/Conv2d_0c_3x1/BatchNorm/moving_mean': [256],
'InceptionResnetV2/Repeat/block35_9/Branch_0/Conv2d_1x1/BatchNorm/beta': [32], ...
The error continues listing what I think are all the variable names in the checkpoint (or could it be the scopes instead?).
Update (2)
After inspecting the latest error above, I see that InceptionResnetV2/Conv2d_2a_3x3/weights is in the list of checkpointed variables. The problem is the :0 at the end!
I'll now verify if this does indeed solve the problem and post an answer if that's the case.
Thanks to @KathyWu's comment, I got on the right track and found the problem.
Indeed, the way I was computing the scopes would include the InceptionResnetV2/ scope, which would trigger the loading of all variables "under" that scope (i.e., all variables in the network). Replacing this with the correct dictionary, however, was not trivial.
Of the possible scope modes init_from_checkpoint accepts, the one I had to use was the 'scope_variable_name': variable one, but without using the actual variable.name attribute.
The variable.name looks like 'some_scope/variable_name:0'. That :0 is not in the checkpointed variable's name, so using scopes = {v.name: v.name for v in variables_to_restore} will raise a "Variable not found" error.
The trick to make it work was stripping the tensor index from the name:
tf.train.init_from_checkpoint('inception_resnet_v2_2016_08_30.ckpt',
                              {v.name.split(':')[0]: v for v in variables_to_restore})
I found out that {s+'/': s+'/' for s in scopes} didn't work, simply because variables_to_restore includes something like "global_step", so scopes ends up containing the global scope, which covers everything. You need to print variables_to_restore, find the "global_step" entry, and put it in "exclude".
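A minimal sketch of that workaround, combined with the mapping from the accepted fix above (the first two exclude entries come from the question's code, with "global_step" added):

exclude = ['InceptionResnetV2/Logits', 'InceptionResnetV2/AuxLogits', 'global_step']
variables_to_restore = tf.contrib.slim.get_variables_to_restore(exclude=exclude)
tf.train.init_from_checkpoint('inception_resnet_v2_2016_08_30.ckpt',
                              {v.name.split(':')[0]: v for v in variables_to_restore})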
I am trying to use a TFRecord file for training a network in TensorFlow. The problem is that it starts running fine, but after some time it becomes really slow; even the GPU utilization drops to 0% for stretches of time.
I have measured the time between iterations, and it is clearly increasing.
I have read somewhere that this might be due to adding operations to the graph inside the training loop, and that this can be solved by using graph.finalize().
My code is like this:
self.inputMR_,self.CT_GT_ = read_and_decode_single_example("data.tfrecords")
self.inputMR, self.CT_GT = tf.train.shuffle_batch([self.inputMR_, self.CT_GT_], batch_size=self.batch_size, num_threads=2,
capacity=500*self.batch_size,min_after_dequeue=2000)
batch_size_tf = tf.shape(self.inputMR)[0] #variable batchsize so we can test here
self.train_phase = tf.placeholder(tf.bool, name='phase_train')
self.G = self.Network(self.inputMR,batch_size_tf)# create the network
self.g_loss=lp_loss(self.G, self.CT_GT, self.l_num, batch_size_tf)
print 'learning rate ',self.learning_rate
self.g_optim = tf.train.GradientDescentOptimizer(self.learning_rate).minimize(self.g_loss)
self.saver = tf.train.Saver()
Then I have a training stage that looks like this:
def train(self, config):
    init = tf.initialize_all_variables()
    with tf.Session() as sess:
        sess.run(init)
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)
        sess.graph.finalize()  # **WHERE SHOULD I PUT THIS?**
        try:
            while not coord.should_stop():
                _, loss_eval = sess.run([self.g_optim, self.g_loss], feed_dict={self.train_phase: True})
                .....
        except:
            e = sys.exc_info()[0]
            print "Exception !!!", e
        finally:
            coord.request_stop()
            coord.join(threads)
        sess.close()
When I add the graph.finalize() call, there is an exception that says: type 'exceptions.RuntimeError'.
Could anyone explain to me what the correct way of using a TFRecord file during training is, and how to use graph.finalize() without interfering with the QueueRunner execution?
The full error is:
File "main.py", line 37, in <module>
tf.app.run()
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/platform/app.py", line 30, in run
sys.exit(main(sys.argv[:1] + flags_passthrough))
File "main.py", line 35, in main
gen_model.train(FLAGS)
File "/home/dongnie/Desktop/gan/TF_record_MR_CT/model.py", line 143, in train
self.global_step.assign(it).eval() # set and update(eval) global_step with index, i
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/variables.py", line 505, in assign
return state_ops.assign(self._variable, value, use_locking=use_locking)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/gen_state_ops.py", line 45, in assign
use_locking=use_locking, name=name)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/op_def_library.py", line 490, in apply_op
preferred_dtype=default_dtype)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/ops.py", line 657, in convert_to_tensor
ret = conversion_func(value, dtype=dtype, name=name, as_ref=as_ref)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/constant_op.py", line 180, in _constant_tensor_conversion_function
return constant(v, dtype=dtype, name=name)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/constant_op.py", line 167, in constant
attrs={"value": tensor_value, "dtype": dtype_value}, name=name).outputs[0]
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/ops.py", line 2337, in create_op
self._check_not_finalized()
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/ops.py", line 2078, in _check_not_finalized
raise RuntimeError("Graph is finalized and cannot be modified.")
RuntimeError: Graph is finalized and cannot be modified.
The problem is that you are modifying the graph between session.run calls. You can pinpoint the place where you are modifying the graph by calling finalize() on the default graph, which triggers an error on any further graph modification. In your case, it seems you are modifying it by calling global_step.assign(it), which creates an additional assign op each time it runs. You should instead create that op once at the beginning, save the result to a variable, and reuse that value.
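A minimal sketch of that idea, assuming the self.global_step and training loop from the question (the placeholder name and num_iterations are made up for illustration):

# Build the assign op once, before finalizing the graph
step_ph = tf.placeholder(tf.int64, shape=[], name='step_ph')
assign_global_step = self.global_step.assign(step_ph)

sess.graph.finalize()

for it in range(num_iterations):
    # Reuse the same op; only the fed value changes, so the graph is not modified
    sess.run(assign_global_step, feed_dict={step_ph: it})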