I got the following error message when using the PyTorch Lightning Trainer:
File "run.py", line 105, in <module>
runner.fit(experiment)
File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 769, in fit
self._fit_impl, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path
File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 721, in _call_and_handle_interrupt
return trainer_fn(*args, **kwargs)
File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 809, in _fit_impl
results = self._run(model, ckpt_path=self.ckpt_path)
File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 1234, in _run
results = self._run_stage()
File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 1321, in _run_stage
return self._run_train()
File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 1351, in _run_train
self.fit_loop.run()
File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/loops/base.py", line 204, in run
self.advance(*args, **kwargs)
File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/loops/fit_loop.py", line 269, in advance
self._outputs = self.epoch_loop.run(self._data_fetcher)
File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/loops/base.py", line 204, in run
self.advance(*args, **kwargs)
File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/loops/epoch/training_epoch_loop.py", line 208, in advance
batch_output = self.batch_loop.run(batch, batch_idx)
File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/loops/base.py", line 204, in run
self.advance(*args, **kwargs)
File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/loops/batch/training_batch_loop.py", line 88, in advance
outputs = self.optimizer_loop.run(split_batch, optimizers, batch_idx)
File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/loops/base.py", line 204, in run
self.advance(*args, **kwargs)
File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 207, in advance
self.optimizer_idx,
File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 258, in _run_optimization
result = closure.consume_result()
File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/loops/optimization/closure.py", line 52, in consume_result
"The closure hasn't been executed."
pytorch_lightning.utilities.exceptions.MisconfigurationException: The closure hasn't been executed. HINT: did you call `optimizer_closure()` in your `optimizer_step` hook? It could also happen because the `optimizer.step(optimizer_closure)` call did not execute it internally.
I read the PyTorch Lightning 1.6.1 documentation. It seems that, under the hood, the Lightning Trainer handles the training-loop details for me, so I don't know how to handle the `optimizer.step()` call myself...
Here is my code that calls the Trainer:
runner = Trainer(checkpoint_callback=checkpoint_callback,
                 resume_from_checkpoint=model_path,
                 logger=tt_logger,
                 log_every_n_steps=100,
                 weights_summary='full',
                 # early_stop_callback = False,
                 **config['trainer_params'])
And I defined the following functions in my model:
def forward(self, input: Tensor, **kwargs) -> Tensor
def training_step(self, batch, batch_idx, optimizer_idx)
def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, optimizer_closure, on_tpu, using_native_amp, using_lbfgs)
def on_load_checkpoint(self, checkpoint)
def training_epoch_end(self, outputs)
def configure_optimizers(self)
def train_dataloader(self)
def data_transforms(self)
This is my first time asking a question here. If there is something wrong, please forgive me. Thank you all for your help!
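The error hint points at the cause: when optimizer_step is overridden, Lightning's optimizer_closure argument must actually be executed. A minimal sketch of an override that does this under Lightning 1.6 (this mirrors the default behavior; adapt the body to whatever your hook does in addition):

def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx,
                   optimizer_closure, on_tpu=False, using_native_amp=False,
                   using_lbfgs=False):
    # Lightning wraps training_step + backward inside `optimizer_closure`;
    # it must run exactly once per step, so hand it to optimizer.step().
    optimizer.step(closure=optimizer_closure)

If your override does anything else (e.g. learning-rate warm-up), keep that logic, but make sure the closure is still passed to (and executed by) optimizer.step().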
I am trying to upgrade code from TensorFlow 1.x to TensorFlow 2.x, since I need to use it with Python >= 3.8. The code is from someone else who is no longer updating it; you can find it here if needed: https://github.com/NetManAIOps/donut
I have used the tools provided by TensorFlow to upgrade it (i.e., running tf_upgrade_v2 on each file).
I have been stuck for a while on an error that I cannot track down:
File "test_model.py", line 227, in <module>
get_anomaly_score(test_files, KPI=args.KPI)
File "test_model.py", line 79, in get_anomaly_score
test_score = dm.predictor.get_score(test_values, test_missing)
File "/home/miguel/gitlab_projects/test/donut_haowen/donut/prediction.py", line 138, in get_score
b_r = sess.run(self._get_score(), feed_dict=feed_dict)
File "/home/miguel/gitlab_projects/test/donut_haowen/donut/prediction.py", line 64, in _get_score
self._score = self.model.get_score(
File "/home/miguel/gitlab_projects/test/donut_haowen/donut/model.py", line 197, in get_score
x_r = iterative_masked_reconstruct(
File "/home/miguel/gitlab_projects/test/donut_haowen/donut/reconstruction.py", line 101, in iterative_masked_reconstruct
x_r, _ = tf.while_loop(
File "/home/miguel/anaconda3/envs/test/lib/python3.8/site-packages/tensorflow/python/util/deprecation.py", line 629, in new_func
return func(*args, **kwargs)
File "/home/miguel/anaconda3/envs/test/lib/python3.8/site-packages/tensorflow/python/ops/control_flow_ops.py", line 2516, in while_loop_v2
return while_loop(
File "/home/miguel/anaconda3/envs/test/lib/python3.8/site-packages/tensorflow/python/ops/control_flow_ops.py", line 2716, in while_loop
return while_v2.while_loop(
File "/home/miguel/anaconda3/envs/test/lib/python3.8/site-packages/tensorflow/python/ops/while_v2.py", line 222, in while_loop
body_graph = func_graph_module.func_graph_from_py_func(
File "/home/miguel/anaconda3/envs/test/lib/python3.8/site-packages/tensorflow/python/framework/func_graph.py", line 1283, in func_graph_from_py_func
func_outputs = python_func(*func_args, **func_kwargs)
File "/home/miguel/anaconda3/envs/test/lib/python3.8/site-packages/tensorflow/python/ops/while_v2.py", line 200, in wrapped_body
outputs = body(
File "/home/miguel/gitlab_projects/test/donut_haowen/donut/reconstruction.py", line 103, in <lambda>
body=lambda x_i, i: (masked_reconstruct(reconstruct, x_i, mask), i + 1),
File "/home/miguel/gitlab_projects/test/donut_haowen/donut/reconstruction.py", line 65, in masked_reconstruct
r_x = reconstruct(x)
File "/home/miguel/gitlab_projects/test/src/tfsnippet/tfsnippet/modules/auto_encoders/vae.py", line 469, in reconstruct
model = self.model(z=q_net['z'], n_z=n_z, n_x=n_x)
File "/home/miguel/gitlab_projects/test/src/tfsnippet/tfsnippet/utils/reuse.py", line 179, in wrapper
return method(*args, **kwargs)
File "/home/miguel/gitlab_projects/test/src/tfsnippet/tfsnippet/modules/auto_encoders/vae.py", line 314, in model
x_params = self.h_for_p_x(z)
File "/home/miguel/gitlab_projects/test/src/tfsnippet/tfsnippet/modules/base.py", line 89, in __call__
return self._forward(inputs, **kwargs)
File "/home/miguel/gitlab_projects/test/src/tfsnippet/tfsnippet/modules/container/lambda_.py", line 47, in _forward
return self._factory(inputs, **kwargs)
File "/home/miguel/gitlab_projects/test/donut_haowen/donut/model.py", line 24, in wrap_params_net
h = h_for_dist(inputs)
File "/home/miguel/gitlab_projects/test/src/tfsnippet/tfsnippet/modules/base.py", line 89, in __call__
return self._forward(inputs, **kwargs)
File "/home/miguel/gitlab_projects/test/src/tfsnippet/tfsnippet/modules/container/sequential.py", line 76, in _forward
outputs = c(outputs, **kwargs)
File "/home/miguel/anaconda3/envs/test/lib/python3.8/site-packages/keras/utils/traceback_utils.py", line 70, in error_handler
raise e.with_traceback(filtered_tb) from None
File "/home/miguel/anaconda3/envs/test/lib/python3.8/site-packages/keras/engine/base_layer.py", line 1140, in __call__
if self._saved_model_inputs_spec is None:
AttributeError: 'Dense' object has no attribute '_saved_model_inputs_spec'
This occurs when I call the predictor.py file in the donut package, which in turn calls another package, tfsnippet (which I also upgraded), and that finally calls the keras package.
Since the code I am using is not mine, I am having trouble writing any kind of minimal reproducible example.
I am looking for any idea of where the issue could be and what parts of the code I should check. At the moment I have no idea why or how this error occurs.
Please tell me if I should add anything.
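One thing worth checking first: tf_upgrade_v2 only rewrites API calls, so the upgraded code is still graph- and session-based (the traceback shows tf.while_loop building a FuncGraph). Whether this is sufficient for donut/tfsnippet is an assumption, but a cheap first test is to disable TF2 behavior at the entry point so the TF1-style code runs with TF1 semantics:

import tensorflow.compat.v1 as tf  # TF1-compatible API shipped with TF2

tf.disable_v2_behavior()  # run the upgraded graph/session code with TF1 semantics

If the error disappears with v2 behavior disabled, the problem is likely TF2-native Keras layers being driven from inside a TF1 graph construct.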
I am following this repo:
https://github.com/NVIDIA/NeMo/tree/main/examples/nlp/entity_linking
Here is a small tutorial:
https://colab.research.google.com/github/NVIDIA/NeMo/blob/v1.0.2/tutorials/nlp/Entity_Linking_Medical.ipynb
Before starting this tutorial, switch the branch to r1.10.0.
When I train this model on the entire UMLS dataset with the given commands, it fails with the following error:
In automatic_optimization, when `training_step` returns a dict, the 'loss' key needs to be present
I checked the training_step method and it looks fine to me:
def training_step(self, batch, batch_idx):
    """
    Lightning calls this inside the training loop with the data from the training dataloader
    passed in as `batch`.
    """
    input_ids, token_type_ids, attention_mask, concept_ids = batch
    logits = self.forward(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)
    train_loss = self.loss(logits=logits, labels=concept_ids)

    # No hard examples found in batch,
    # shouldn't use this batch to update model weights
    if train_loss == 0:
        train_loss = None
        lr = None
    else:
        lr = self._optimizer.param_groups[0]["lr"]

    self.log("train_loss", train_loss)
    self.log("lr", lr, prog_bar=True)

    return {"loss": train_loss, "lr": lr}
Here is the full stack trace:
[NeMo I 2022-07-29 18:29:27 multi_similarity_loss:91] Encountered zero loss in multisimloss, loss = 0.0. No hard examples found in the batch
Error executing job with overrides: ['project_dir=.']
Traceback (most recent call last):
File "self_alignment_pretraining.py", line 38, in main
trainer.fit(model)
File "/home/umair/miniconda3/envs/aemap/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 769, in fit
self._fit_impl, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path
File "/home/umair/miniconda3/envs/aemap/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 719, in _call_and_handle_interrupt
return self.strategy.launcher.launch(trainer_fn, *args, trainer=self, **kwargs)
File "/home/umair/miniconda3/envs/aemap/lib/python3.7/site-packages/pytorch_lightning/strategies/launchers/subprocess_script.py", line 93, in launch
return function(*args, **kwargs)
File "/home/umair/miniconda3/envs/aemap/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 809, in _fit_impl
results = self._run(model, ckpt_path=self.ckpt_path)
File "/home/umair/miniconda3/envs/aemap/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 1234, in _run
results = self._run_stage()
File "/home/umair/miniconda3/envs/aemap/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 1321, in _run_stage
return self._run_train()
File "/home/umair/miniconda3/envs/aemap/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 1351, in _run_train
self.fit_loop.run()
File "/home/umair/miniconda3/envs/aemap/lib/python3.7/site-packages/pytorch_lightning/loops/base.py", line 204, in run
self.advance(*args, **kwargs)
File "/home/umair/miniconda3/envs/aemap/lib/python3.7/site-packages/pytorch_lightning/loops/fit_loop.py", line 268, in advance
self._outputs = self.epoch_loop.run(self._data_fetcher)
File "/home/umair/miniconda3/envs/aemap/lib/python3.7/site-packages/pytorch_lightning/loops/base.py", line 204, in run
self.advance(*args, **kwargs)
File "/home/umair/miniconda3/envs/aemap/lib/python3.7/site-packages/pytorch_lightning/loops/epoch/training_epoch_loop.py", line 208, in advance
batch_output = self.batch_loop.run(batch, batch_idx)
File "/home/umair/miniconda3/envs/aemap/lib/python3.7/site-packages/pytorch_lightning/loops/base.py", line 204, in run
self.advance(*args, **kwargs)
File "/home/umair/miniconda3/envs/aemap/lib/python3.7/site-packages/pytorch_lightning/loops/batch/training_batch_loop.py", line 88, in advance
outputs = self.optimizer_loop.run(split_batch, optimizers, batch_idx)
File "/home/umair/miniconda3/envs/aemap/lib/python3.7/site-packages/pytorch_lightning/loops/base.py", line 204, in run
self.advance(*args, **kwargs)
File "/home/umair/miniconda3/envs/aemap/lib/python3.7/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 207, in advance
self.optimizer_idx,
File "/home/umair/miniconda3/envs/aemap/lib/python3.7/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 256, in _run_optimization
self._optimizer_step(optimizer, opt_idx, batch_idx, closure)
File "/home/umair/miniconda3/envs/aemap/lib/python3.7/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 378, in _optimizer_step
using_lbfgs=is_lbfgs,
File "/home/umair/miniconda3/envs/aemap/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 1593, in _call_lightning_module_hook
output = fn(*args, **kwargs)
File "/home/umair/miniconda3/envs/aemap/lib/python3.7/site-packages/pytorch_lightning/core/lightning.py", line 1644, in optimizer_step
optimizer.step(closure=optimizer_closure)
File "/home/umair/miniconda3/envs/aemap/lib/python3.7/site-packages/pytorch_lightning/core/optimizer.py", line 168, in step
step_output = self._strategy.optimizer_step(self._optimizer, self._optimizer_idx, closure, **kwargs)
File "/home/umair/miniconda3/envs/aemap/lib/python3.7/site-packages/pytorch_lightning/strategies/ddp.py", line 278, in optimizer_step
optimizer_output = super().optimizer_step(optimizer, opt_idx, closure, model, **kwargs)
File "/home/umair/miniconda3/envs/aemap/lib/python3.7/site-packages/pytorch_lightning/strategies/strategy.py", line 193, in optimizer_step
return self.precision_plugin.optimizer_step(model, optimizer, opt_idx, closure, **kwargs)
File "/home/umair/miniconda3/envs/aemap/lib/python3.7/site-packages/pytorch_lightning/plugins/precision/native_amp.py", line 85, in optimizer_step
closure_result = closure()
File "/home/umair/miniconda3/envs/aemap/lib/python3.7/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 148, in __call__
self._result = self.closure(*args, **kwargs)
File "/home/umair/miniconda3/envs/aemap/lib/python3.7/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 134, in closure
step_output = self._step_fn()
File "/home/umair/miniconda3/envs/aemap/lib/python3.7/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 437, in _training_step
training_step_output, self.trainer.accumulate_grad_batches
File "/home/umair/miniconda3/envs/aemap/lib/python3.7/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 75, in from_training_step_output
"In automatic_optimization, when `training_step` returns a dict, the 'loss' key needs to be present"
pytorch_lightning.utilities.exceptions.MisconfigurationException: In automatic_optimization, when `training_step` returns a dict, the 'loss' key needs to be present
Set the environment variable HYDRA_FULL_ERROR=1 for a complete stack trace.
You get this error message about the "loss" key needing to be present because in some training steps you return the dict {"loss": None}. This happens in your code here:
if train_loss == 0:
    train_loss = None
    lr = None
where you set train_loss = None. Lightning does not like that, because it expects the loss to be a tensor with a computation graph attached.
If you wish to skip the optimization step completely, just return None from the training_step method, like this:
if train_loss == 0:
    return None
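Putting it together, a sketch of the corrected training_step (everything else unchanged from the code in the question):

def training_step(self, batch, batch_idx):
    input_ids, token_type_ids, attention_mask, concept_ids = batch
    logits = self.forward(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)
    train_loss = self.loss(logits=logits, labels=concept_ids)

    # No hard examples found in this batch: skip the optimizer step
    # entirely rather than returning {"loss": None}.
    if train_loss == 0:
        return None

    lr = self._optimizer.param_groups[0]["lr"]
    self.log("train_loss", train_loss)
    self.log("lr", lr, prog_bar=True)
    return {"loss": train_loss, "lr": lr}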
I want to run a Python program using PyTorch with my own dataset. I ran into this error:
Traceback (most recent call last):
File "C:\Users\hp\Downloads\efficient_densenet_pytorch-master\demoEmotion.py", line 345, in
fire.Fire(demo)
File "C:\Users\hp\Anaconda3\envs\tf-gpu\lib\site-packages\fire\core.py", line 138, in Fire
component_trace = _Fire(component, args, parsed_flag_args, context, name)
File "C:\Users\hp\Anaconda3\envs\tf-gpu\lib\site-packages\fire\core.py", line 468, in _Fire
target=component.name)
File "C:\Users\hp\Anaconda3\envs\tf-gpu\lib\site-packages\fire\core.py", line 672, in _CallAndUpdateTrace
component = fn(*varargs, **kwargs)
File "C:\Users\hp\Downloads\efficient_densenet_pytorch-master\demoEmotion.py", line 323, in demo
n_epochs=n_epochs, batch_size=batch_size, seed=seed)
File "C:\Users\hp\Downloads\efficient_densenet_pytorch-master\demoEmotion.py", line 202, in train
n_epochs=n_epochs,
File "C:\Users\hp\Downloads\efficient_densenet_pytorch-master\demoEmotion.py", line 83, in train_epoch
output = model(input)
File "C:\Users\hp\Anaconda3\envs\tf-gpu\lib\site-packages\torch\nn\modules\module.py", line 555, in call
result = self.forward(*input, **kwargs)
File "C:\Users\hp\Downloads\efficient_densenet_pytorch-master\models\densenet.py", line 151, in forward
features = self.features(x)
File "C:\Users\hp\Anaconda3\envs\tf-gpu\lib\site-packages\torch\nn\modules\module.py", line 555, in call
result = self.forward(*input, **kwargs)
File "C:\Users\hp\Anaconda3\envs\tf-gpu\lib\site-packages\torch\nn\modules\container.py", line 100, in forward
input = module(input)
File "C:\Users\hp\Anaconda3\envs\tf-gpu\lib\site-packages\torch\nn\modules\module.py", line 555, in call
result = self.forward(*input, **kwargs)
File "C:\Users\hp\Anaconda3\envs\tf-gpu\lib\site-packages\torch\nn\modules\container.py", line 100, in forward
input = module(input)
File "C:\Users\hp\Anaconda3\envs\tf-gpu\lib\site-packages\torch\nn\modules\module.py", line 555, in call
result = self.forward(*input, **kwargs)
File "C:\Users\hp\Anaconda3\envs\tf-gpu\lib\site-packages\torch\nn\modules\pooling.py", line 557, in forward
self.padding, self.ceil_mode, self.count_include_pad, self.divisor_override)
RuntimeError: Given input size: (150x1x1). Calculated output size: (150x0x0). Output size is too small
Please guide me on how to solve this problem. Thanks in advance!
Please check the input size of the image. It has to be exactly the same as specified in the model. You can use padding if the image is smaller.
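For instance, the error says a 150x1x1 feature map reaches a pooling layer that would produce a 150x0x0 output, so the images are too small for the depth of the network. Assuming the model expects 224x224 inputs (an assumption; adjust to whatever size your DenseNet was configured for), you could resize the images in the dataset transform:

import torchvision.transforms as T

# Hypothetical preprocessing: enlarge each image to the assumed model
# input size before converting it to a tensor.
transform = T.Compose([
    T.Resize((224, 224)),  # assumed input size; change to match your model
    T.ToTensor(),
])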
I'm trying to get the predictions inside the on_epoch_end function of a Keras Callback.
At the moment, to get the predictions, I call self.model.predict with a batch_size of 2, but at the 3rd epoch I get this error:
RuntimeError: Dst tensor is not initialized in Tensorflow
Reading around the web, I noticed that this error appears when the GPU runs out of memory. In my case, reading the stack trace, the error is triggered by self.model.predict inside on_epoch_end; it says:
File "mlp_keras.py", line 20, in on_epoch_end
predictions = self.model.predict(self.dataset)
This is the full stack trace:
Traceback (most recent call last):
File "mlp_keras.py", line 150, in <module>
callbacks=[KendallTauHistory(training_dataset, training_dataset_labels, groups_id_count)])
File "/usr/home/studenti/sp171412/word_ordering/mlp/env/lib/python2.7/site-packages/tensorflow_core/python/keras/engine/training.py", line 819, in fit
use_multiprocessing=use_multiprocessing)
File "/usr/home/studenti/sp171412/word_ordering/mlp/env/lib/python2.7/site-packages/tensorflow_core/python/keras/engine/training_v2.py", line 397, in fit
prefix='val_')
File "/usr/lib64/python2.7/contextlib.py", line 24, in __exit__
self.gen.next()
File "/usr/home/studenti/sp171412/word_ordering/mlp/env/lib/python2.7/site-packages/tensorflow_core/python/keras/engine/training_v2.py", line 771, in on_epoch
self.callbacks.on_epoch_end(epoch, epoch_logs)
File "/usr/home/studenti/sp171412/word_ordering/mlp/env/lib/python2.7/site-packages/tensorflow_core/python/keras/callbacks.py", line 302, in on_epoch_end
callback.on_epoch_end(epoch, logs)
File "mlp_keras.py", line 20, in on_epoch_end
predictions = self.model.predict(self.dataset)
File "/usr/home/studenti/sp171412/word_ordering/mlp/env/lib/python2.7/site-packages/tensorflow_core/python/keras/engine/training.py", line 1013, in predict
use_multiprocessing=use_multiprocessing)
File "/usr/home/studenti/sp171412/word_ordering/mlp/env/lib/python2.7/site-packages/tensorflow_core/python/keras/engine/training_v2.py", line 498, in predict
workers=workers, use_multiprocessing=use_multiprocessing, **kwargs)
File "/usr/home/studenti/sp171412/word_ordering/mlp/env/lib/python2.7/site-packages/tensorflow_core/python/keras/engine/training_v2.py", line 426, in _model_iteration
use_multiprocessing=use_multiprocessing)
File "/usr/home/studenti/sp171412/word_ordering/mlp/env/lib/python2.7/site-packages/tensorflow_core/python/keras/engine/training_v2.py", line 706, in _process_inputs
use_multiprocessing=use_multiprocessing)
File "/usr/home/studenti/sp171412/word_ordering/mlp/env/lib/python2.7/site-packages/tensorflow_core/python/keras/engine/data_adapter.py", line 357, in __init__
dataset = self.slice_inputs(indices_dataset, inputs)
File "/usr/home/studenti/sp171412/word_ordering/mlp/env/lib/python2.7/site-packages/tensorflow_core/python/keras/engine/data_adapter.py", line 383, in slice_inputs
dataset_ops.DatasetV2.from_tensors(inputs).repeat()
File "/usr/home/studenti/sp171412/word_ordering/mlp/env/lib/python2.7/site-packages/tensorflow_core/python/data/ops/dataset_ops.py", line 566, in from_tensors
return TensorDataset(tensors)
File "/usr/home/studenti/sp171412/word_ordering/mlp/env/lib/python2.7/site-packages/tensorflow_core/python/data/ops/dataset_ops.py", line 2765, in __init__
element = structure.normalize_element(element)
File "/usr/home/studenti/sp171412/word_ordering/mlp/env/lib/python2.7/site-packages/tensorflow_core/python/data/util/structure.py", line 113, in normalize_element
ops.convert_to_tensor(t, name="component_%d" % i))
File "/usr/home/studenti/sp171412/word_ordering/mlp/env/lib/python2.7/site-packages/tensorflow_core/python/framework/ops.py", line 1314, in convert_to_tensor
ret = conversion_func(value, dtype=dtype, name=name, as_ref=as_ref)
File "/usr/home/studenti/sp171412/word_ordering/mlp/env/lib/python2.7/site-packages/tensorflow_core/python/framework/tensor_conversion_registry.py", line 52, in _default_conversion_function
return constant_op.constant(value, dtype, name=name)
File "/usr/home/studenti/sp171412/word_ordering/mlp/env/lib/python2.7/site-packages/tensorflow_core/python/framework/constant_op.py", line 258, in constant
allow_broadcast=True)
File "/usr/home/studenti/sp171412/word_ordering/mlp/env/lib/python2.7/site-packages/tensorflow_core/python/framework/constant_op.py", line 266, in _constant_impl
t = convert_to_eager_tensor(value, ctx, dtype)
File "/usr/home/studenti/sp171412/word_ordering/mlp/env/lib/python2.7/site-packages/tensorflow_core/python/framework/constant_op.py", line 96, in convert_to_eager_tensor
return ops.EagerTensor(value, ctx.device_name, dtype)
RuntimeError: Dst tensor is not initialized.
My question is: is there a way to get the predictions without performing predict inside on_epoch_end? Thanks in advance.
Alright, after seeing your last comment, here is what you could do:
epochs = 100
for epoch in range(epochs):
    model.fit(x_train, y_train)
    y_predict = model.predict(x_test)
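Each fit call here trains for a single epoch (the default), and predict then runs as an ordinary top-level call between epochs, so no prediction happens inside on_epoch_end, where the allocation error was being raised.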
I want to load a TensorFlow 1 checkpoint (consisting of .index, .meta, and .data-00000-of-00001 files) into TensorFlow 2.0.0 and convert it to a Keras model, so as to be able to use it natively in eager mode without needing tf.Session. Here is the code I ran:
import tensorflow as tf
import numpy as np
from tensorflow.python.keras.backend import set_session
from tensorflow.python.training.saver import _import_meta_graph_with_return_elements

def save_ckpt(ckpt_path='test'):
    '''save TensorFlow-1 Checkpoint'''
    with tf.Graph().as_default() as g:
        in_op = tf.constant(np.random.rand(1,2,2,2),name='input',dtype=tf.float32)
        out_op = tf.keras.layers.Conv2D(3,(3,3),padding='same',name='MY_LAYER')(in_op)
        saver = tf.compat.v1.train.Saver()
        with tf.compat.v1.Session() as sess:
            sess.run(tf.compat.v1.variables_initializer(tf.compat.v1.global_variables()))
            saver.save(sess,ckpt_path)

def load_ckpt():
    '''KerasModel from meta & ckpt'''
    in_op = tf.keras.Input([2,2,2])
    _m = tf.keras.models.Model(inputs=in_op,outputs=in_op)
    with _m.input.graph.as_default() as g:
        saver, out_op = _import_meta_graph_with_return_elements('test.meta',
                                                                input_map={'input':_m.output},
                                                                return_elements=[
                                                                    # 'input:0',
                                                                    'MY_LAYER/Conv2D:0'
                                                                ])
        with tf.compat.v1.Session() as sess:
            saver.restore(sess,'test')
            set_session(sess)
        out_mdl = tf.keras.models.Model(inputs=_m.input, outputs=out_op[0])
    return out_mdl

# main
save_ckpt()  # save name based checkpoint
meta_model = load_ckpt()  # restore in keras model
oo = meta_model(np.random.rand(1,2,2,2))  # run the model
print(oo)
but I get this error:
Traceback (most recent call last):
File "question2.py", line 38, in <module>
meta_model = load_ckpt() # restore in keras model
File "question2.py", line 32, in load_ckpt
out_mdl = tf.keras.models.Model(inputs=_m.input, outputs=out_op[0])
File "/home/dionyssos/tf2/lib/python3.6/site-packages/tensorflow_core/python/keras/engine/training.py", line 146, in __init__
super(Model, self).__init__(*args, **kwargs)
File "/home/dionyssos/tf2/lib/python3.6/site-packages/tensorflow_core/python/keras/engine/network.py", line 167, in __init__
self._init_graph_network(*args, **kwargs)
File "/home/dionyssos/tf2/lib/python3.6/site-packages/tensorflow_core/python/training/tracking/base.py", line 457, in _method_wrapper
result = method(self, *args, **kwargs)
File "/home/dionyssos/tf2/lib/python3.6/site-packages/tensorflow_core/python/keras/engine/network.py", line 270, in _init_graph_network
base_layer_utils.create_keras_history(self._nested_outputs)
File "/home/dionyssos/tf2/lib/python3.6/site-packages/tensorflow_core/python/keras/engine/base_layer_utils.py", line 184, in create_keras_history
_, created_layers = _create_keras_history_helper(tensors, set(), [])
File "/home/dionyssos/tf2/lib/python3.6/site-packages/tensorflow_core/python/keras/engine/base_layer_utils.py", line 229, in _create_keras_history_helper
constants[i] = backend.function([], op_input)([])
File "/home/dionyssos/tf2/lib/python3.6/site-packages/tensorflow_core/python/keras/backend.py", line 3740, in __call__
outputs = self._graph_fn(*converted_inputs)
File "/home/dionyssos/tf2/lib/python3.6/site-packages/tensorflow_core/python/eager/function.py", line 1081, in __call__
return self._call_impl(args, kwargs)
File "/home/dionyssos/tf2/lib/python3.6/site-packages/tensorflow_core/python/eager/function.py", line 1121, in _call_impl
return self._call_flat(args, self.captured_inputs, cancellation_manager)
File "/home/dionyssos/tf2/lib/python3.6/site-packages/tensorflow_core/python/eager/function.py", line 1224, in _call_flat
ctx, args, cancellation_manager=cancellation_manager)
File "/home/dionyssos/tf2/lib/python3.6/site-packages/tensorflow_core/python/eager/function.py", line 511, in call
ctx=ctx)
File "/home/dionyssos/tf2/lib/python3.6/site-packages/tensorflow_core/python/eager/execute.py", line 67, in quick_execute
six.raise_from(core._status_to_exception(e.code, message), None)
File "<string>", line 3, in raise_from
tensorflow.python.framework.errors_impl.FailedPreconditionError: Error while reading resource variable MY_LAYER/kernel from Container: localhost. This could mean that the variable was uninitialized. Not found: Container localhost does not exist. (Could not find resource: localhost/MY_LAYER/kernel)
[[node MY_LAYER/Conv2D/ReadVariableOp (defined at /home/dionyssos/tf2/lib/python3.6/site-packages/tensorflow_core/python/framework/ops.py:1751) ]] [Op:__inference_keras_scratch_graph_72]
Function call stack:
keras_scratch_graph
What I tried so far
Replacing MY_LAYER/Conv2D:0 with input:0 in _import_meta_graph_with_return_elements() makes the code run with no problem.
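One reading of the traceback that would be consistent with this: building the second Keras model triggers create_keras_history, which evaluates MY_LAYER/Conv2D through backend.function([], op_input)([]) outside the session in which saver.restore() ran, so the resource variable MY_LAYER/kernel looks uninitialized at that point. input:0, by contrast, is a constant and reads no variables, which would explain why that variant runs fine. This is an interpretation, not a confirmed diagnosis.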