I'm having a problem fitting a UNet model in TensorFlow. From this line of code:
hist = unet.fit(train,
                validation_data=val,
                steps_per_epoch=STEPS_PER_EPOCH,
                validation_steps=VALIDATION_STEPS,
                epochs=50)
This is the error message I get:
Traceback (most recent call last):
File "C:\Users\Fedor\OneDrive\Рабочий стол\Проект\Херня.py", line 178, in <module>
hist = unet.fit(train,
File "C:\Python\lib\site-packages\keras\utils\traceback_utils.py", line 70, in error_handler
raise e.with_traceback(filtered_tb) from None
File "C:\Python\lib\site-packages\tensorflow\python\eager\execute.py", line 54, in quick_execute
tensors = pywrap_tfe.TFE_Py_Execute(ctx._handle, device_name, op_name,
tensorflow.python.framework.errors_impl.InvalidArgumentError: Graph execution error:
Detected at node 'sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/SparseSoftmaxCrossEntropyWithLogits' defined at (most recent call last):
File "C:\Users\Fedor\OneDrive\Рабочий стол\Проект\Херня.py", line 178, in <module>
hist = unet.fit(train,
File "C:\Python\lib\site-packages\keras\utils\traceback_utils.py", line 65, in error_handler
return fn(*args, **kwargs)
File "C:\Python\lib\site-packages\keras\engine\training.py", line 1564, in fit
tmp_logs = self.train_function(iterator)
File "C:\Python\lib\site-packages\keras\engine\training.py", line 1160, in train_function
return step_function(self, iterator)
File "C:\Python\lib\site-packages\keras\engine\training.py", line 1146, in step_function
outputs = model.distribute_strategy.run(run_step, args=(data,))
File "C:\Python\lib\site-packages\keras\engine\training.py", line 1135, in run_step
outputs = model.train_step(data)
File "C:\Python\lib\site-packages\keras\engine\training.py", line 994, in train_step
loss = self.compute_loss(x, y, y_pred, sample_weight)
File "C:\Python\lib\site-packages\keras\engine\training.py", line 1052, in compute_loss
return self.compiled_loss(
File "C:\Python\lib\site-packages\keras\engine\compile_utils.py", line 265, in __call__
loss_value = loss_obj(y_t, y_p, sample_weight=sw)
File "C:\Python\lib\site-packages\keras\losses.py", line 152, in __call__
losses = call_fn(y_true, y_pred)
File "C:\Python\lib\site-packages\keras\losses.py", line 272, in call
return ag_fn(y_true, y_pred, **self._fn_kwargs)
File "C:\Python\lib\site-packages\keras\losses.py", line 2084, in sparse_categorical_crossentropy
return backend.sparse_categorical_crossentropy(
File "C:\Python\lib\site-packages\keras\backend.py", line 5630, in sparse_categorical_crossentropy
res = tf.nn.sparse_softmax_cross_entropy_with_logits(
Node: 'sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/SparseSoftmaxCrossEntropyWithLogits'
logits and labels must have the same first dimension, got logits shape [16384,59] and labels shape [49152]
[[{{node sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/SparseSoftmaxCrossEntropyWithLogits}}]] [Op:__inference_train_function_23608]
I've looked at questions with the same error, but they didn't solve my problem.
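One thing worth checking, purely as a hypothesis based on the shapes in the error: 49152 is exactly 3 × 16384, which usually means the segmentation masks still have 3 channels (for example RGB) while sparse_categorical_crossentropy expects a single class index per pixel. A minimal sketch of how one might verify and work around this, assuming train and val are tf.data.Dataset objects yielding (image, mask) pairs and that every mask channel carries the same class index:

import tensorflow as tf

print(train.element_spec)  # check whether the mask's last dimension is 3

def to_single_channel(image, mask):
    # Keep a single channel so each pixel has one integer label.
    return image, mask[..., :1]

train = train.map(to_single_channel)
val = val.map(to_single_channel)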
I am following this repo:
https://github.com/NVIDIA/NeMo/tree/main/examples/nlp/entity_linking
Here is a small tutorial:
https://colab.research.google.com/github/NVIDIA/NeMo/blob/v1.0.2/tutorials/nlp/Entity_Linking_Medical.ipynb
Before starting this tutorial, change the branch to r1.10.0.
When I train this model on the entire UMLS dataset using the given commands, it gives the following error:
In automatic_optimization, when `training_step` returns a dict, the 'loss' key needs to be present
I checked the training_step method and it looks fine:
def training_step(self, batch, batch_idx):
    """
    Lightning calls this inside the training loop with the data from the training dataloader
    passed in as `batch`.
    """
    input_ids, token_type_ids, attention_mask, concept_ids = batch
    logits = self.forward(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)
    train_loss = self.loss(logits=logits, labels=concept_ids)

    # No hard examples found in batch,
    # shouldn't use this batch to update model weights
    if train_loss == 0:
        train_loss = None
        lr = None
    else:
        lr = self._optimizer.param_groups[0]["lr"]
        self.log("train_loss", train_loss)
        self.log("lr", lr, prog_bar=True)

    return {"loss": train_loss, "lr": lr}
Here is the full stack trace:
[NeMo I 2022-07-29 18:29:27 multi_similarity_loss:91] Encountered zero loss in multisimloss, loss = 0.0. No hard examples found in the batch
Error executing job with overrides: ['project_dir=.']
Traceback (most recent call last):
File "self_alignment_pretraining.py", line 38, in main
trainer.fit(model)
File "/home/umair/miniconda3/envs/aemap/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 769, in fit
self._fit_impl, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path
File "/home/umair/miniconda3/envs/aemap/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 719, in _call_and_handle_interrupt
return self.strategy.launcher.launch(trainer_fn, *args, trainer=self, **kwargs)
File "/home/umair/miniconda3/envs/aemap/lib/python3.7/site-packages/pytorch_lightning/strategies/launchers/subprocess_script.py", line 93, in launch
return function(*args, **kwargs)
File "/home/umair/miniconda3/envs/aemap/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 809, in _fit_impl
results = self._run(model, ckpt_path=self.ckpt_path)
File "/home/umair/miniconda3/envs/aemap/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 1234, in _run
results = self._run_stage()
File "/home/umair/miniconda3/envs/aemap/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 1321, in _run_stage
return self._run_train()
File "/home/umair/miniconda3/envs/aemap/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 1351, in _run_train
self.fit_loop.run()
File "/home/umair/miniconda3/envs/aemap/lib/python3.7/site-packages/pytorch_lightning/loops/base.py", line 204, in run
self.advance(*args, **kwargs)
File "/home/umair/miniconda3/envs/aemap/lib/python3.7/site-packages/pytorch_lightning/loops/fit_loop.py", line 268, in advance
self._outputs = self.epoch_loop.run(self._data_fetcher)
File "/home/umair/miniconda3/envs/aemap/lib/python3.7/site-packages/pytorch_lightning/loops/base.py", line 204, in run
self.advance(*args, **kwargs)
File "/home/umair/miniconda3/envs/aemap/lib/python3.7/site-packages/pytorch_lightning/loops/epoch/training_epoch_loop.py", line 208, in advance
batch_output = self.batch_loop.run(batch, batch_idx)
File "/home/umair/miniconda3/envs/aemap/lib/python3.7/site-packages/pytorch_lightning/loops/base.py", line 204, in run
self.advance(*args, **kwargs)
File "/home/umair/miniconda3/envs/aemap/lib/python3.7/site-packages/pytorch_lightning/loops/batch/training_batch_loop.py", line 88, in advance
outputs = self.optimizer_loop.run(split_batch, optimizers, batch_idx)
File "/home/umair/miniconda3/envs/aemap/lib/python3.7/site-packages/pytorch_lightning/loops/base.py", line 204, in run
self.advance(*args, **kwargs)
File "/home/umair/miniconda3/envs/aemap/lib/python3.7/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 207, in advance
self.optimizer_idx,
File "/home/umair/miniconda3/envs/aemap/lib/python3.7/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 256, in _run_optimization
self._optimizer_step(optimizer, opt_idx, batch_idx, closure)
File "/home/umair/miniconda3/envs/aemap/lib/python3.7/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 378, in _optimizer_step
using_lbfgs=is_lbfgs,
File "/home/umair/miniconda3/envs/aemap/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 1593, in _call_lightning_module_hook
output = fn(*args, **kwargs)
File "/home/umair/miniconda3/envs/aemap/lib/python3.7/site-packages/pytorch_lightning/core/lightning.py", line 1644, in optimizer_step
optimizer.step(closure=optimizer_closure)
File "/home/umair/miniconda3/envs/aemap/lib/python3.7/site-packages/pytorch_lightning/core/optimizer.py", line 168, in step
step_output = self._strategy.optimizer_step(self._optimizer, self._optimizer_idx, closure, **kwargs)
File "/home/umair/miniconda3/envs/aemap/lib/python3.7/site-packages/pytorch_lightning/strategies/ddp.py", line 278, in optimizer_step
optimizer_output = super().optimizer_step(optimizer, opt_idx, closure, model, **kwargs)
File "/home/umair/miniconda3/envs/aemap/lib/python3.7/site-packages/pytorch_lightning/strategies/strategy.py", line 193, in optimizer_step
return self.precision_plugin.optimizer_step(model, optimizer, opt_idx, closure, **kwargs)
File "/home/umair/miniconda3/envs/aemap/lib/python3.7/site-packages/pytorch_lightning/plugins/precision/native_amp.py", line 85, in optimizer_step
closure_result = closure()
File "/home/umair/miniconda3/envs/aemap/lib/python3.7/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 148, in __call__
self._result = self.closure(*args, **kwargs)
File "/home/umair/miniconda3/envs/aemap/lib/python3.7/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 134, in closure
step_output = self._step_fn()
File "/home/umair/miniconda3/envs/aemap/lib/python3.7/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 437, in _training_step
training_step_output, self.trainer.accumulate_grad_batches
File "/home/umair/miniconda3/envs/aemap/lib/python3.7/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 75, in from_training_step_output
"In automatic_optimization, when `training_step` returns a dict, the 'loss' key needs to be present"
pytorch_lightning.utilities.exceptions.MisconfigurationException: In automatic_optimization, when `training_step` returns a dict, the 'loss' key needs to be present
Set the environment variable HYDRA_FULL_ERROR=1 for a complete stack trace.
You get the error message about "the 'loss' key needs to be present" because in some training steps you return the dict {"loss": None}. This happens in your code here:
if train_loss == 0:
    train_loss = None
    lr = None
where you set train_loss = None. Lightning does not like that, because it wants the loss to be a tensor with a computation graph attached.
If you wish to skip the optimization step completely, just return None from the training_step method, like this:
if train_loss == 0:
    return None
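Putting that together, a minimal sketch of your training_step with this change applied (everything else kept from your code) would look like this:

def training_step(self, batch, batch_idx):
    input_ids, token_type_ids, attention_mask, concept_ids = batch
    logits = self.forward(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)
    train_loss = self.loss(logits=logits, labels=concept_ids)

    # No hard examples found in the batch: skip this optimization step entirely
    # instead of returning {"loss": None}.
    if train_loss == 0:
        return None

    lr = self._optimizer.param_groups[0]["lr"]
    self.log("train_loss", train_loss)
    self.log("lr", lr, prog_bar=True)
    return {"loss": train_loss, "lr": lr}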
I'm trying to get the predictions inside the on_epoch_end function of a Keras Callback.
At the moment, to get the predictions, I call self.model.predict with a batch_size of 2, but at the 3rd epoch I get this error:
RuntimeError: Dst tensor is not initialized in Tensorflow
Reading around on the web, I noticed that this error appears when the GPU runs out of memory. In my case, judging from the stack trace, the error is triggered by self.model.predict inside on_epoch_end; it says:
File "mlp_keras.py", line 20, in on_epoch_end predictions =
self.model.predict(self.dataset)
This is the full stack trace:
Traceback (most recent call last):
File "mlp_keras.py", line 150, in <module>
callbacks=[KendallTauHistory(training_dataset, training_dataset_labels, groups_id_count)])
File "/usr/home/studenti/sp171412/word_ordering/mlp/env/lib/python2.7/site-packages/tensorflow_core/python/keras/engine/training.py", line 819, in fit
use_multiprocessing=use_multiprocessing)
File "/usr/home/studenti/sp171412/word_ordering/mlp/env/lib/python2.7/site-packages/tensorflow_core/python/keras/engine/training_v2.py", line 397, in fit
prefix='val_')
File "/usr/lib64/python2.7/contextlib.py", line 24, in __exit__
self.gen.next()
File "/usr/home/studenti/sp171412/word_ordering/mlp/env/lib/python2.7/site-packages/tensorflow_core/python/keras/engine/training_v2.py", line 771, in on_epoch
self.callbacks.on_epoch_end(epoch, epoch_logs)
File "/usr/home/studenti/sp171412/word_ordering/mlp/env/lib/python2.7/site-packages/tensorflow_core/python/keras/callbacks.py", line 302, in on_epoch_end
callback.on_epoch_end(epoch, logs)
File "mlp_keras.py", line 20, in on_epoch_end
predictions = self.model.predict(self.dataset)
File "/usr/home/studenti/sp171412/word_ordering/mlp/env/lib/python2.7/site-packages/tensorflow_core/python/keras/engine/training.py", line 1013, in predict
use_multiprocessing=use_multiprocessing)
File "/usr/home/studenti/sp171412/word_ordering/mlp/env/lib/python2.7/site-packages/tensorflow_core/python/keras/engine/training_v2.py", line 498, in predict
workers=workers, use_multiprocessing=use_multiprocessing, **kwargs)
File "/usr/home/studenti/sp171412/word_ordering/mlp/env/lib/python2.7/site-packages/tensorflow_core/python/keras/engine/training_v2.py", line 426, in _model_iteration
use_multiprocessing=use_multiprocessing)
File "/usr/home/studenti/sp171412/word_ordering/mlp/env/lib/python2.7/site-packages/tensorflow_core/python/keras/engine/training_v2.py", line 706, in _process_inputs
use_multiprocessing=use_multiprocessing)
File "/usr/home/studenti/sp171412/word_ordering/mlp/env/lib/python2.7/site-packages/tensorflow_core/python/keras/engine/data_adapter.py", line 357, in __init__
dataset = self.slice_inputs(indices_dataset, inputs)
File "/usr/home/studenti/sp171412/word_ordering/mlp/env/lib/python2.7/site-packages/tensorflow_core/python/keras/engine/data_adapter.py", line 383, in slice_inputs
dataset_ops.DatasetV2.from_tensors(inputs).repeat()
File "/usr/home/studenti/sp171412/word_ordering/mlp/env/lib/python2.7/site-packages/tensorflow_core/python/data/ops/dataset_ops.py", line 566, in from_tensors
return TensorDataset(tensors)
File "/usr/home/studenti/sp171412/word_ordering/mlp/env/lib/python2.7/site-packages/tensorflow_core/python/data/ops/dataset_ops.py", line 2765, in __init__
element = structure.normalize_element(element)
File "/usr/home/studenti/sp171412/word_ordering/mlp/env/lib/python2.7/site-packages/tensorflow_core/python/data/util/structure.py", line 113, in normalize_element
ops.convert_to_tensor(t, name="component_%d" % i))
File "/usr/home/studenti/sp171412/word_ordering/mlp/env/lib/python2.7/site-packages/tensorflow_core/python/framework/ops.py", line 1314, in convert_to_tensor
ret = conversion_func(value, dtype=dtype, name=name, as_ref=as_ref)
File "/usr/home/studenti/sp171412/word_ordering/mlp/env/lib/python2.7/site-packages/tensorflow_core/python/framework/tensor_conversion_registry.py", line 52, in _default_conversion_function
return constant_op.constant(value, dtype, name=name)
File "/usr/home/studenti/sp171412/word_ordering/mlp/env/lib/python2.7/site-packages/tensorflow_core/python/framework/constant_op.py", line 258, in constant
allow_broadcast=True)
File "/usr/home/studenti/sp171412/word_ordering/mlp/env/lib/python2.7/site-packages/tensorflow_core/python/framework/constant_op.py", line 266, in _constant_impl
t = convert_to_eager_tensor(value, ctx, dtype)
File "/usr/home/studenti/sp171412/word_ordering/mlp/env/lib/python2.7/site-packages/tensorflow_core/python/framework/constant_op.py", line 96, in convert_to_eager_tensor
return ops.EagerTensor(value, ctx.device_name, dtype)
RuntimeError: Dst tensor is not initialized.
My question is: is there a way to get the predictions without calling predict inside on_epoch_end? Thanks in advance.
Alright, after seeing your last comment, here is what you could do:
epochs = 100
for epoch in range(epochs):
    model.fit(x_train, y_train)
    y_predict = model.predict(x_test)
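To keep your per-epoch evaluation, a rough sketch of that loop (assuming your existing model, x_train/y_train and x_test; passing batch_size to predict keeps its memory usage bounded) could be:

epochs = 100
for epoch in range(epochs):
    model.fit(x_train, y_train, epochs=1)
    # Predict outside of fit(); a small batch_size limits GPU memory use.
    y_predict = model.predict(x_test, batch_size=2)
    # ...compute and store your per-epoch metric from y_predict here...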
Let me start from the beginning. I'm implementing, in TensorFlow 1.14, a partial convolution layer for image inpainting based on the unofficial Keras implementation (I have already tested it and it works on my dataset).
This architecture uses a pretrained (ImageNet) VGG16 to compute some loss terms. Sadly, a VGG16 implemented in TensorFlow didn't work (I've tried this one), unlike the one from keras.applications. Therefore, I used this class to incorporate the keras.applications VGG16 into my TensorFlow 1.14 code.
Everything was working fine, but then I incorporated mixed precision training (documentation) into my code and the VGG16 part gave the following error:
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
ERROR:tensorflow:==================================
Object was never used (type <class 'tensorflow.python.framework.ops.Tensor'>):
<tf.Tensor 'VGG16/model/IsVariableInitialized_3:0' shape=() dtype=bool>
If you want to mark it as used call its "mark_used()" method.
It was originally created here:
File "main.py", line 131, in <module>
psi_gt, psi_out, psi_comp, I_comp, layers = model.build_vgg(data_gt, unet_pconv,
data_mask) File "/workspace/model.py", line 52, in build_vgg
vgg = vgg16.VGG16(image_shape=gt.shape, input_tensor=gt) File "/workspace/vgg.py", line
17, in __init__
self._build_graph(input_tensor) File "/workspace/vgg.py", line 35, in _build_graph
self.vgg16 = tf.keras.applications.VGG16(weights='imagenet', include_top=False,
input_tensor=img) File "/usr/local/lib/python3.6/dist-
packages/tensorflow/python/keras/applications/__init__.py", line 70, in wrapper
return base_fun(*args, **kwargs) File "/usr/local/lib/python3.6/dist-
packages/tensorflow/python/keras/applications/vgg16.py", line 32, in VGG16
return vgg16.VGG16(*args, **kwargs) File "/usr/local/lib/python3.6/dist-
packages/keras_applications/vgg16.py", line 210, in VGG16
model.load_weights(weights_path) File "/usr/local/lib/python3.6/dist-
packages/tensorflow/python/keras/engine/training.py", line 162, in load_weights
return super(Model, self).load_weights(filepath, by_name) File
"/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/network.py", line
1424, in load_weights
saving.load_weights_from_hdf5_group(f, self.layers) File "/usr/local/lib/python3.6/dist-
packages/tensorflow/python/keras/saving/hdf5_format.py", line 759, in
load_weights_from_hdf5_group
K.batch_set_value(weight_value_tuples) File "/usr/local/lib/python3.6/dist-
packages/tensorflow/python/keras/backend.py", line 3071, in batch_set_value
get_session().run(assign_ops, feed_dict=feed_dict) File "/usr/local/lib/python3.6/dist-
packages/tensorflow/python/keras/backend.py", line 462, in get_session
_initialize_variables(session) File "/usr/local/lib/python3.6/dist-
packages/tensorflow/python/keras/backend.py", line 879, in _initialize_variables
[variables_module.is_variable_initialized(v) for v in candidate_vars]) File
"/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/backend.py", line 879, in
<listcomp>
[variables_module.is_variable_initialized(v) for v in candidate_vars]) File
"/usr/local/lib/python3.6/dist-packages/tensorflow/python/util/tf_should_use.py", line 193,
in wrapped
return _add_should_use_warning(fn(*args, **kwargs))
==================================
The same "Object was never used" warning, with an identical traceback, is printed three more times for the tensors 'VGG16/model/IsVariableInitialized_2:0', 'VGG16/model/IsVariableInitialized_1:0' and 'VGG16/model/IsVariableInitialized:0'. After that, the actual traceback follows:
Traceback (most recent call last):
File "main.py", line 131, in <module>
psi_gt, psi_out, psi_comp, I_comp, layers = model.build_vgg(data_gt, unet_pconv, data_mask)
File "/workspace/model.py", line 52, in build_vgg
vgg = vgg16.VGG16(image_shape=gt.shape, input_tensor=gt)
File "/workspace/vgg.py", line 17, in __init__
self._build_graph(input_tensor)
File "/workspace/vgg.py", line 35, in _build_graph
self.vgg16 = tf.keras.applications.VGG16(weights='imagenet', include_top=False, input_tensor=img)
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/applications/__init__.py", line 70, in wrapper
return base_fun(*args, **kwargs)
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/applications/vgg16.py", line 32, in VGG16
return vgg16.VGG16(*args, **kwargs)
File "/usr/local/lib/python3.6/dist-packages/keras_applications/vgg16.py", line 210, in VGG16
model.load_weights(weights_path)
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/training.py", line 162, in load_weights
return super(Model, self).load_weights(filepath, by_name)
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/network.py", line 1424, in load_weights
saving.load_weights_from_hdf5_group(f, self.layers)
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/saving/hdf5_format.py", line 759, in load_weights_from_hdf5_group
K.batch_set_value(weight_value_tuples)
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/backend.py", line 3071, in batch_set_value
get_session().run(assign_ops, feed_dict=feed_dict)
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/backend.py", line 462, in get_session
_initialize_variables(session)
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/backend.py", line 879, in _initialize_variables
[variables_module.is_variable_initialized(v) for v in candidate_vars])
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/backend.py", line 879, in <listcomp>
[variables_module.is_variable_initialized(v) for v in candidate_vars])
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/util/tf_should_use.py", line 193, in wrapped
return _add_should_use_warning(fn(*args, **kwargs))
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/variables.py", line 3083, in is_variable_initialized
return state_ops.is_variable_initialized(variable)
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/state_ops.py", line 133, in is_variable_initialized
return ref.is_initialized(name=name)
AttributeError: 'Tensor' object has no attribute 'is_initialized'
For mixed precision training I'm using an NVIDIA Docker container.
The VGG16 is used here to obtain the feature maps of 3 images:
def build_vgg(gt, y_pred, mask):
    vgg_layer = ['block1_pool', 'block2_pool', 'block3_pool']

    vgg = vgg16.VGG16(image_shape=gt.shape, input_tensor=gt)
    psi_gt = {}
    psi_gt[vgg_layer[0]] = tf.identity(vgg[vgg_layer[0]], name='gt_vgg0')
    psi_gt[vgg_layer[1]] = tf.identity(vgg[vgg_layer[1]], name='gt_vgg1')
    psi_gt[vgg_layer[2]] = tf.identity(vgg[vgg_layer[2]], name='gt_vgg2')

    vgg = vgg16.VGG16(image_shape=y_pred.shape, input_tensor=y_pred)
    psi_out = {}
    psi_out[vgg_layer[0]] = tf.identity(vgg[vgg_layer[0]], name='out_vgg0')
    psi_out[vgg_layer[1]] = tf.identity(vgg[vgg_layer[1]], name='out_vgg1')
    psi_out[vgg_layer[2]] = tf.identity(vgg[vgg_layer[2]], name='out_vgg2')

    I_comp = (mask * gt) + ((1-mask) * y_pred)
    vgg = vgg16.VGG16(image_shape=I_comp.shape, input_tensor=I_comp)
    psi_comp = {}
    psi_comp[vgg_layer[0]] = tf.identity(vgg[vgg_layer[0]], name='comp_vgg0')
    psi_comp[vgg_layer[1]] = tf.identity(vgg[vgg_layer[1]], name='comp_vgg1')
    psi_comp[vgg_layer[2]] = tf.identity(vgg[vgg_layer[2]], name='comp_vgg2')

    return psi_gt, psi_out, psi_comp, I_comp, vgg_layer
The previous function is used in the main script:
import tensorflow as tf
import PConv
import model
import layers
import math
import os
import data
import utils
import numpy as np
import datetime
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
# Mixed precision training variable storage
def float32_variable_storage_getter(getter, name, shape=None, dtype=None,
                                    initializer=None, regularizer=None,
                                    trainable=True, *args, **kwargs):
    storage_dtype = tf.float32 if trainable else dtype
    variable = getter(name, shape, dtype=storage_dtype,
                      initializer=initializer, regularizer=regularizer,
                      trainable=trainable, *args, **kwargs)
    if trainable and dtype != tf.float32:
        variable = tf.cast(variable, dtype)
    return variable
# ==============================================================================
# SETTINGS
# ==============================================================================
path_ =''
batch_size = 16
best_val = math.inf
best_val_epoch = 0
patience = 0
stop = 300
epochs = 2000
steps_train = 25
steps_val = 8
template = '{}, Epoch {}, train_loss: {:.4f} - val_loss: {:.4f}'
path = path_ + 'tmp/'
if not os.path.isdir(path):
    os.mkdir(path)
# ==============================================================================
# DATA
# ==============================================================================
X_train, m_train, y_train = data.get_filenames()
X_val, m_val, y_val = data.get_filenames(train=False)
# ==============================================================================
# DATASET
# ==============================================================================
train_dataset = tf.data.Dataset.from_tensor_slices((X_train, m_train, y_train))#(images, mask, gt))
train_dataset = train_dataset.map(data.load, num_parallel_calls=tf.data.experimental.AUTOTUNE)
train_dataset = train_dataset.batch(batch_size)
train_dataset = train_dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
val_dataset = tf.data.Dataset.from_tensor_slices((X_val, m_val, y_val))#(images, mask, gt))
val_dataset = val_dataset.map(data.load, num_parallel_calls=tf.data.experimental.AUTOTUNE)
val_dataset = val_dataset.batch(batch_size)
val_dataset = val_dataset.prefetch(buffer_size=1)
iterator = tf.data.Iterator.from_structure(train_dataset.output_types,
                                           train_dataset.output_shapes)
data_im, data_mask, data_gt = iterator.get_next()
# create the initialization operations
train_init_op = iterator.make_initializer(train_dataset)
val_init_op = iterator.make_initializer(val_dataset)
# ==============================================================================
# MODEL
# ==============================================================================
data_im = tf.cast(data_im, tf.float16)
data_mask = tf.cast(data_mask, tf.float16)
with tf.variable_scope('fp32_vars', custom_getter=float32_variable_storage_getter):
    unet_pconv = model.pconv_unet(data_im, data_mask)
unet_pconv = tf.cast(unet_pconv, tf.float32)
data_mask = tf.cast(data_mask, tf.float32)
psi_gt, psi_out, psi_comp, I_comp, layers = model.build_vgg(data_gt, unet_pconv, data_mask)
I_comp = tf.cast(I_comp, tf.float32)
# # ==============================================================================
# # LOSS
# # ==============================================================================
loss = utils.get_total_loss(unet_pconv, data_gt, data_mask, psi_gt, psi_out, psi_comp, I_comp, layers)
lr = 0.0002
optimizer = utils.optimize(loss, lr)
saver = tf.train.Saver()
# # ==============================================================================
# # TRAINING
# # ==============================================================================
output_summary = tf.summary.image(name='output', tensor=unet_pconv)
merged = tf.summary.merge_all()
with tf.Session() as sess:
    sess.run(tf.local_variables_initializer())
    sess.run(tf.global_variables_initializer())
    writer = tf.summary.FileWriter('graphs', sess.graph)
    train_loss_, val_loss_ = [], []
    for epoch in range(epochs):
        pred_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        tl, vl = [], []
        # Initialize iterator with training data
        sess.run(train_init_op)
        try:
            for step in range(steps_train):
                _, train_loss, summ = sess.run([optimizer, loss, merged])
                writer.add_summary(summ, epoch)
                tl.append(train_loss)
            mean_train = utils.list_mean(tl)
            train_loss_.append(mean_train)
        except tf.errors.OutOfRangeError:
            pass
        if (epoch+1) % 1 == 0:
            sess.run(val_init_op)
            try:
                for step in range(steps_val):
                    val_loss = sess.run([loss])
                    vl.append(val_loss)
                mean_val = utils.list_mean(vl)
                val_loss_.append(mean_val)
            except tf.errors.OutOfRangeError:
                pass
            print(template.format(pred_time, epoch, mean_train, mean_val))
            # early stopping
            if mean_val < best_val:
                print('Saving on epoch {0}'.format(epoch))
                best_val = mean_val
                patience = 0
                best_val_epoch = epoch
                saver.save(sess, path+'best_model')
            else:
                patience += 1
                if patience == stop:
                    print('Early stopping at epoch: {}'.format(best_val_epoch))
                    break
# # ==============================================================================
# # SAVE CURVES
# # ==============================================================================
np.save(path_+'loss.npy', train_loss_)
np.save(path_+'val_loss.npy', val_loss_)
The optimization is done as follows:
def optimize(loss, learning_rate=1e-4):
    U_vars = [var for var in tf.trainable_variables() if 'UNET' in var.name]
    opt = tf.train.AdamOptimizer(learning_rate=learning_rate)
    opt = tf.train.experimental.enable_mixed_precision_graph_rewrite(opt, loss_scale=128.0)
    train_opt = opt.minimize(loss, var_list=U_vars)
    return train_opt
I've been trying to fix this for a while and I still don't understand why it stops working when I enable mixed precision training. Feel free to ask for more details.
If you can give me a hand, that would be great! Thank you in advance.
I've tried many approaches, and my conclusion is that the pretrained Keras models are not compatible. I changed it to a TensorFlow VGG16 model; it runs slower, but at least it works.