I'm trying to get the model's predictions inside the on_epoch_end method of a Keras Callback.
At the moment I call self.model.predict with a batch_size of 2 to get the predictions, but at the 3rd epoch I get this error:
RuntimeError: Dst tensor is not initialized in Tensorflow
Reading around online, I see that this error appears when the GPU runs out of memory. In my case, the stack trace shows that the error is triggered by self.model.predict inside on_epoch_end; it says:
File "mlp_keras.py", line 20, in on_epoch_end predictions =
self.model.predict(self.dataset)
This is the full stack trace:
Traceback (most recent call last):
File "mlp_keras.py", line 150, in <module>
callbacks=[KendallTauHistory(training_dataset, training_dataset_labels, groups_id_count)])
File "/usr/home/studenti/sp171412/word_ordering/mlp/env/lib/python2.7/site-packages/tensorflow_core/python/keras/engine/training.py", line 819, in fit
use_multiprocessing=use_multiprocessing)
File "/usr/home/studenti/sp171412/word_ordering/mlp/env/lib/python2.7/site-packages/tensorflow_core/python/keras/engine/training_v2.py", line 397, in fit
prefix='val_')
File "/usr/lib64/python2.7/contextlib.py", line 24, in __exit__
self.gen.next()
File "/usr/home/studenti/sp171412/word_ordering/mlp/env/lib/python2.7/site-packages/tensorflow_core/python/keras/engine/training_v2.py", line 771, in on_epoch
self.callbacks.on_epoch_end(epoch, epoch_logs)
File "/usr/home/studenti/sp171412/word_ordering/mlp/env/lib/python2.7/site-packages/tensorflow_core/python/keras/callbacks.py", line 302, in on_epoch_end
callback.on_epoch_end(epoch, logs)
File "mlp_keras.py", line 20, in on_epoch_end
predictions = self.model.predict(self.dataset)
File "/usr/home/studenti/sp171412/word_ordering/mlp/env/lib/python2.7/site-packages/tensorflow_core/python/keras/engine/training.py", line 1013, in predict
use_multiprocessing=use_multiprocessing)
File "/usr/home/studenti/sp171412/word_ordering/mlp/env/lib/python2.7/site-packages/tensorflow_core/python/keras/engine/training_v2.py", line 498, in predict
workers=workers, use_multiprocessing=use_multiprocessing, **kwargs)
File "/usr/home/studenti/sp171412/word_ordering/mlp/env/lib/python2.7/site-packages/tensorflow_core/python/keras/engine/training_v2.py", line 426, in _model_iteration
use_multiprocessing=use_multiprocessing)
File "/usr/home/studenti/sp171412/word_ordering/mlp/env/lib/python2.7/site-packages/tensorflow_core/python/keras/engine/training_v2.py", line 706, in _process_inputs
use_multiprocessing=use_multiprocessing)
File "/usr/home/studenti/sp171412/word_ordering/mlp/env/lib/python2.7/site-packages/tensorflow_core/python/keras/engine/data_adapter.py", line 357, in __init__
dataset = self.slice_inputs(indices_dataset, inputs)
File "/usr/home/studenti/sp171412/word_ordering/mlp/env/lib/python2.7/site-packages/tensorflow_core/python/keras/engine/data_adapter.py", line 383, in slice_inputs
dataset_ops.DatasetV2.from_tensors(inputs).repeat()
File "/usr/home/studenti/sp171412/word_ordering/mlp/env/lib/python2.7/site-packages/tensorflow_core/python/data/ops/dataset_ops.py", line 566, in from_tensors
return TensorDataset(tensors)
File "/usr/home/studenti/sp171412/word_ordering/mlp/env/lib/python2.7/site-packages/tensorflow_core/python/data/ops/dataset_ops.py", line 2765, in __init__
element = structure.normalize_element(element)
File "/usr/home/studenti/sp171412/word_ordering/mlp/env/lib/python2.7/site-packages/tensorflow_core/python/data/util/structure.py", line 113, in normalize_element
ops.convert_to_tensor(t, name="component_%d" % i))
File "/usr/home/studenti/sp171412/word_ordering/mlp/env/lib/python2.7/site-packages/tensorflow_core/python/framework/ops.py", line 1314, in convert_to_tensor
ret = conversion_func(value, dtype=dtype, name=name, as_ref=as_ref)
File "/usr/home/studenti/sp171412/word_ordering/mlp/env/lib/python2.7/site-packages/tensorflow_core/python/framework/tensor_conversion_registry.py", line 52, in _default_conversion_function
return constant_op.constant(value, dtype, name=name)
File "/usr/home/studenti/sp171412/word_ordering/mlp/env/lib/python2.7/site-packages/tensorflow_core/python/framework/constant_op.py", line 258, in constant
allow_broadcast=True)
File "/usr/home/studenti/sp171412/word_ordering/mlp/env/lib/python2.7/site-packages/tensorflow_core/python/framework/constant_op.py", line 266, in _constant_impl
t = convert_to_eager_tensor(value, ctx, dtype)
File "/usr/home/studenti/sp171412/word_ordering/mlp/env/lib/python2.7/site-packages/tensorflow_core/python/framework/constant_op.py", line 96, in convert_to_eager_tensor
return ops.EagerTensor(value, ctx.device_name, dtype)
RuntimeError: Dst tensor is not initialized.
My question is: is there a way to get the predictions without calling predict inside on_epoch_end? Thanks in advance.
Alright, after seeing your last comment, here is what you could do:
epochs = 100
for epoch in range(epochs):
    model.fit(x_train, y_train)
    y_predict = model.predict(x_test)
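A slightly fuller sketch of that loop, assuming model, x_train, y_train, x_test and y_test already exist; compute_epoch_metric is a hypothetical stand-in for whatever per-epoch metric you compute from the predictions, and passing a small batch_size to predict should also help keep GPU memory usage down:

epochs = 100
for epoch in range(epochs):
    # Each fit() call trains for one epoch and keeps the weights from before.
    model.fit(x_train, y_train, epochs=1, verbose=0)

    # Predict in small batches to limit GPU memory usage.
    y_predict = model.predict(x_test, batch_size=2)

    # Compute any custom per-epoch metric here (hypothetical helper).
    score = compute_epoch_metric(y_test, y_predict)
    print("epoch %d: metric=%.4f" % (epoch + 1, score))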
There are a couple of other questions similar to this, but I couldn't find a solution that seems to fit. I am using LightGBM with scikit-optimize's BayesSearchCV.
full_pipeline = skl.Pipeline(steps=[('preprocessor', pre_processor),
                                    ('estimator', lgbm.sklearn.LGBMClassifier())])
scorer = make_scorer(fl.lgb_focal_f1_score)
lgb_tuner = sko.BayesSearchCV(full_pipeline, hyper_space, cv=5, refit=True,
                              n_iter=num_calls, scoring=scorer)
lgb_tuner.fit(balanced_xtrain, balanced_ytrain)
Training runs for a while before erroring out with the following:
Traceback (most recent call last):
File "/var/training.py", line 134, in <module>
lgb_tuner.fit(balanced_xtrain, balanced_ytrain)
File "/usr/local/lib/python3.6/site-packages/skopt/searchcv.py", line 694, in fit
groups=groups, n_points=n_points_adjusted
File "/usr/local/lib/python3.6/site-packages/skopt/searchcv.py", line 579, in _step
self._fit(X, y, groups, params_dict)
File "/usr/local/lib/python3.6/site-packages/skopt/searchcv.py", line 423, in _fit
for parameters in parameter_iterable
File "/usr/local/lib/python3.6/site-packages/joblib/parallel.py", line 1041, in __call__
if self.dispatch_one_batch(iterator):
File "/usr/local/lib/python3.6/site-packages/joblib/parallel.py", line 859, in dispatch_one_batch
self._dispatch(tasks)
File "/usr/local/lib/python3.6/site-packages/joblib/parallel.py", line 777, in _dispatch
job = self._backend.apply_async(batch, callback=cb)
File "/usr/local/lib/python3.6/site-packages/joblib/_parallel_backends.py", line 208, in apply_async
result = ImmediateResult(func)
File "/usr/local/lib/python3.6/site-packages/joblib/_parallel_backends.py", line 572, in __init__
self.results = batch()
File "/usr/local/lib/python3.6/site-packages/joblib/parallel.py", line 263, in __call__
for func, args, kwargs in self.items]
File "/usr/local/lib/python3.6/site-packages/joblib/parallel.py", line 263, in <listcomp>
for func, args, kwargs in self.items]
File "/usr/local/lib/python3.6/site-packages/sklearn/model_selection/_validation.py", line 531, in _fit_and_score
estimator.fit(X_train, y_train, **fit_params)
File "/usr/local/lib/python3.6/site-packages/sklearn/pipeline.py", line 335, in fit
self._final_estimator.fit(Xt, y, **fit_params_last_step)
File "/usr/local/lib/python3.6/site-packages/lightgbm/sklearn.py", line 857, in fit
callbacks=callbacks, init_model=init_model)
File "/usr/local/lib/python3.6/site-packages/lightgbm/sklearn.py", line 617, in fit
callbacks=callbacks, init_model=init_model)
File "/usr/local/lib/python3.6/site-packages/lightgbm/engine.py", line 252, in train
booster.update(fobj=fobj)
File "/usr/local/lib/python3.6/site-packages/lightgbm/basic.py", line 2467, in update
return self.__boost(grad, hess)
File "/usr/local/lib/python3.6/site-packages/lightgbm/basic.py", line 2503, in __boost
ctypes.byref(is_finished)))
File "/usr/local/lib/python3.6/site-packages/lightgbm/basic.py", line 55, in _safe_call
raise LightGBMError(decode_string(_LIB.LGBM_GetLastError()))
lightgbm.basic.LightGBMError: Check failed: (best_split_info.left_count) > (0) at /__w/1/s/python-package/compile/src/treelearner/serial_tree_learner.cpp, line 651 .
Some answers to similar questions have suggested it might be a consequence of using the GPU, but I don't have a GPU available. I don't know what else could be causing it or how to fix it. Can anyone suggest anything?
I think this was due to my hyperparameter limits being wrong, which allowed one hyperparameter to be set to zero when it shouldn't have been, though I'm not sure which one.
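For illustration, here is a minimal sketch of a search space whose lower bounds keep the usual suspects strictly positive, so nothing can be sampled as zero (the parameter names and ranges are examples, not my original hyper_space; the estimator__ prefix matches the pipeline step name above):

from skopt.space import Integer, Real

# Example bounds only: every lower bound is at least 1 (or above 0 for the
# fractional parameters), so no hyperparameter can be drawn as zero.
hyper_space = {
    'estimator__num_leaves': Integer(2, 256),
    'estimator__min_child_samples': Integer(1, 100),
    'estimator__subsample': Real(0.1, 1.0),
    'estimator__colsample_bytree': Real(0.1, 1.0),
}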
I am trying to initialize ResNet50 as the backbone for a model in TF 1.15, and the model runs on a Google TPU v2. My code is this:
backbone_model=tf.keras.applications.ResNet50(include_top=False, weights='imagenet',pooling=None)
I get the following errors:
E1014 09:57:09.458413 140497105635136 tpu.py:425] Operation of type Placeholder (input_1) is not supported on the TPU. Execution will fail if this op is used in the graph.
app.run(main)
File "/home/usman/anaconda3/envs/py37/lib/python3.7/site-packages/absl/app.py", line 300, in run
_run_main(main, args)
File "/home/usman/anaconda3/envs/py37/lib/python3.7/site-packages/absl/app.py", line 251, in _run_main
sys.exit(main(argv))
File "/home/usman/nas/tpu.py", line 232, in main
max_steps=FLAGS.train_steps)
File "/home/usman/anaconda3/envs/py37/lib/python3.7/site-packages/tensorflow_estimator/python/estimator/tpu/tpu_estimator.py", line 3035, in train
rendezvous.raise_errors()
File "/home/usman/anaconda3/envs/py37/lib/python3.7/site-packages/tensorflow_estimator/python/estimator/tpu/error_handling.py", line 136, in raise_errors
six.reraise(typ, value, traceback)
File "/home/usman/anaconda3/envs/py37/lib/python3.7/site-packages/six.py", line 703, in reraise
raise value
File "/home/usman/anaconda3/envs/py37/lib/python3.7/site-packages/tensorflow_estimator/python/estimator/tpu/tpu_estimator.py", line 3030, in train
saving_listeners=saving_listeners)
File "/home/usman/anaconda3/envs/py37/lib/python3.7/site-packages/tensorflow_estimator/python/estimator/estimator.py", line 370, in train
loss = self._train_model(input_fn, hooks, saving_listeners)
File "/home/usman/anaconda3/envs/py37/lib/python3.7/site-packages/tensorflow_estimator/python/estimator/estimator.py", line 1161, in _train_model
return self._train_model_default(input_fn, hooks, saving_listeners)
File "/home/usman/anaconda3/envs/py37/lib/python3.7/site-packages/tensorflow_estimator/python/estimator/estimator.py", line 1191, in _train_model_default
features, labels, ModeKeys.TRAIN, self.config)
File "/home/usman/anaconda3/envs/py37/lib/python3.7/site-packages/tensorflow_estimator/python/estimator/tpu/tpu_estimator.py", line 2857, in _call_model_fn
config)
File "/home/usman/anaconda3/envs/py37/lib/python3.7/site-packages/tensorflow_estimator/python/estimator/estimator.py", line 1149, in _call_model_fn
model_fn_results = self._model_fn(features=features, **kwargs)
File "/home/usman/anaconda3/envs/py37/lib/python3.7/site-packages/tensorflow_estimator/python/estimator/tpu/tpu_estimator.py", line 3159, in _model_fn
_train_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn))
File "/home/usman/anaconda3/envs/py37/lib/python3.7/site-packages/tensorflow_estimator/python/estimator/tpu/tpu_estimator.py", line 3604, in _train_on_tpu_system
device_assignment=ctx.device_assignment)
File "/home/usman/anaconda3/envs/py37/lib/python3.7/site-packages/tensorflow_core/python/tpu/tpu.py", line 1277, in split_compile_and_shard
name=name)
File "/home/usman/anaconda3/envs/py37/lib/python3.7/site-packages/tensorflow_core/python/tpu/tpu.py", line 992, in split_compile_and_replicate
outputs = computation(*computation_inputs)
File "/home/usman/anaconda3/envs/py37/lib/python3.7/site-packages/tensorflow_estimator/python/estimator/tpu/tpu_estimator.py", line 3589, in multi_tpu_train_steps_on_single_shard
inputs=[0, _INITIAL_LOSS])
File "/home/usman/anaconda3/envs/py37/lib/python3.7/site-packages/tensorflow_core/python/tpu/training_loop.py", line 178, in while_loop
condition_wrapper, body_wrapper, inputs, name="", parallel_iterations=1)
File "/home/usman/anaconda3/envs/py37/lib/python3.7/site-packages/tensorflow_core/python/ops/control_flow_ops.py", line 2753, in while_loop
return_same_structure)
File "/home/usman/anaconda3/envs/py37/lib/python3.7/site-packages/tensorflow_core/python/ops/control_flow_ops.py", line 2245, in BuildLoop
pred, body, original_loop_vars, loop_vars, shape_invariants)
File "/home/usman/anaconda3/envs/py37/lib/python3.7/site-packages/tensorflow_core/python/ops/control_flow_ops.py", line 2170, in _BuildLoop
body_result = body(*packed_vars_for_body)
File "/home/usman/anaconda3/envs/py37/lib/python3.7/site-packages/tensorflow_core/python/tpu/training_loop.py", line 121, in body_wrapper
outputs = body(*(inputs + dequeue_ops))
File "/home/usman/anaconda3/envs/py37/lib/python3.7/site-packages/tensorflow_estimator/python/estimator/tpu/tpu_estimator.py", line 3588, in <lambda>
lambda i, loss: [i + 1, single_tpu_train_step(i)],
File "/home/usman/anaconda3/envs/py37/lib/python3.7/site-packages/tensorflow_estimator/python/estimator/tpu/tpu_estimator.py", line 1715, in train_step
self._call_model_fn(features, labels))
File "/home/usman/anaconda3/envs/py37/lib/python3.7/site-packages/tensorflow_estimator/python/estimator/tpu/tpu_estimator.py", line 1994, in _call_model_fn
estimator_spec = self._model_fn(features=features, **kwargs)
File "/home/usman/nas/tpu.py", line 120, in model
pooling=None)
File "/home/usman/anaconda3/envs/py37/lib/python3.7/site-packages/tensorflow_core/python/keras/applications/__init__.py", line 49, in wrapper
return base_fun(*args, **kwargs)
File "/home/usman/anaconda3/envs/py37/lib/python3.7/site-packages/tensorflow_core/python/keras/applications/resnet.py", line 33, in ResNet50
return resnet.ResNet50(*args, **kwargs)
File "/home/usman/anaconda3/envs/py37/lib/python3.7/site-packages/keras_applications/resnet_common.py", line 435, in ResNet50
**kwargs)
File "/home/usman/anaconda3/envs/py37/lib/python3.7/site-packages/keras_applications/resnet_common.py", line 411, in ResNet
model.load_weights(weights_path)
File "/home/usman/anaconda3/envs/py37/lib/python3.7/site-packages/tensorflow_core/python/keras/engine/training.py", line 182, in load_weights
return super(Model, self).load_weights(filepath, by_name)
File "/home/usman/anaconda3/envs/py37/lib/python3.7/site-packages/tensorflow_core/python/keras/engine/network.py", line 1373, in load_weights
saving.load_weights_from_hdf5_group(f, self.layers)
File "/home/usman/anaconda3/envs/py37/lib/python3.7/site-packages/tensorflow_core/python/keras/saving/hdf5_format.py", line 693, in load_weights_from_hdf5_group
K.batch_set_value(weight_value_tuples)
File "/home/usman/anaconda3/envs/py37/lib/python3.7/site-packages/tensorflow_core/python/keras/backend.py", line 3259, in batch_set_value
get_session().run(assign_ops, feed_dict=feed_dict)
File "/home/usman/anaconda3/envs/py37/lib/python3.7/site-packages/tensorflow_core/python/keras/backend.py", line 486, in get_session
_initialize_variables(session)
File "/home/usman/anaconda3/envs/py37/lib/python3.7/site-packages/tensorflow_core/python/keras/backend.py", line 903, in _initialize_variables
[variables_module.is_variable_initialized(v) for v in candidate_vars])
File "/home/usman/anaconda3/envs/py37/lib/python3.7/site-packages/tensorflow_core/python/client/session.py", line 956, in run
run_metadata_ptr)
File "/home/usman/anaconda3/envs/py37/lib/python3.7/site-packages/tensorflow_core/python/client/session.py", line 1165, in _run
self._graph, fetches, feed_dict_tensor, feed_handles=feed_handles)
File "/home/usman/anaconda3/envs/py37/lib/python3.7/site-packages/tensorflow_core/python/client/session.py", line 488, in __init__
self._assert_fetchable(graph, fetch.op)
File "/home/usman/anaconda3/envs/py37/lib/python3.7/site-packages/tensorflow_core/python/client/session.py", line 505, in _assert_fetchable
% op.name)
tensorflow.python.framework.errors_impl.InaccessibleTensorError: Operation 'VarIsInitializedOp' has been marked as not fetchable. Typically this happens when it is defined in another function or code block. Use return values,explicit Python locals or TensorFlow collections to access it.
From what I have found, the closest explanation for this error is that it is not possible to fetch the result of an op created inside a while loop's body, because the body might execute zero or more times depending on the loop condition. To get a value out of the loop, you need to return it from the body function (as one of the loop variables), and its final value after all iterations will be returned from tf.while_loop(). But in my case this happens inside TensorFlow's own code, not in mine, because I am only running a single line of code to initialize the model.
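One thing I might try (untested, just a sketch under the assumption that the HDF5 weight loading is what triggers the initialization inside the loop) is to build the backbone with weights=None inside the model_fn, so construction is pure graph building, and restore the pretrained weights from a TensorFlow checkpoint outside the replicated computation:

import tensorflow as tf

def build_backbone(features):
    # weights=None avoids load_weights() and its session.run() call, so
    # nothing has to be fetched from inside the TPU while_loop body.
    backbone = tf.keras.applications.ResNet50(
        include_top=False, weights=None, pooling=None)
    return backbone(features)

# The ImageNet weights would then have to come from a TF checkpoint restored
# outside the replicated graph (e.g. via tf.train.init_from_checkpoint); the
# exact variable mapping depends on the checkpoint and is not shown here.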
I want to run a Python program using PyTorch with my own dataset. I come across this error:
Traceback (most recent call last):
File "C:\Users\hp\Downloads\efficient_densenet_pytorch-master\demoEmotion.py", line 345, in
fire.Fire(demo)
File "C:\Users\hp\Anaconda3\envs\tf-gpu\lib\site-packages\fire\core.py", line 138, in Fire
component_trace = _Fire(component, args, parsed_flag_args, context, name)
File "C:\Users\hp\Anaconda3\envs\tf-gpu\lib\site-packages\fire\core.py", line 468, in _Fire
target=component.name)
File "C:\Users\hp\Anaconda3\envs\tf-gpu\lib\site-packages\fire\core.py", line 672, in _CallAndUpdateTrace
component = fn(*varargs, **kwargs)
File "C:\Users\hp\Downloads\efficient_densenet_pytorch-master\demoEmotion.py", line 323, in demo
n_epochs=n_epochs, batch_size=batch_size, seed=seed)
File "C:\Users\hp\Downloads\efficient_densenet_pytorch-master\demoEmotion.py", line 202, in train
n_epochs=n_epochs,
File "C:\Users\hp\Downloads\efficient_densenet_pytorch-master\demoEmotion.py", line 83, in train_epoch
output = model(input)
File "C:\Users\hp\Anaconda3\envs\tf-gpu\lib\site-packages\torch\nn\modules\module.py", line 555, in call
result = self.forward(*input, **kwargs)
File "C:\Users\hp\Downloads\efficient_densenet_pytorch-master\models\densenet.py", line 151, in forward
features = self.features(x)
File "C:\Users\hp\Anaconda3\envs\tf-gpu\lib\site-packages\torch\nn\modules\module.py", line 555, in call
result = self.forward(*input, **kwargs)
File "C:\Users\hp\Anaconda3\envs\tf-gpu\lib\site-packages\torch\nn\modules\container.py", line 100, in forward
input = module(input)
File "C:\Users\hp\Anaconda3\envs\tf-gpu\lib\site-packages\torch\nn\modules\module.py", line 555, in call
result = self.forward(*input, **kwargs)
File "C:\Users\hp\Anaconda3\envs\tf-gpu\lib\site-packages\torch\nn\modules\container.py", line 100, in forward
input = module(input)
File "C:\Users\hp\Anaconda3\envs\tf-gpu\lib\site-packages\torch\nn\modules\module.py", line 555, in call
result = self.forward(*input, **kwargs)
File "C:\Users\hp\Anaconda3\envs\tf-gpu\lib\site-packages\torch\nn\modules\pooling.py", line 557, in forward
self.padding, self.ceil_mode, self.count_include_pad, self.divisor_override)
RuntimeError: Given input size: (150x1x1). Calculated output size: (150x0x0). Output size is too small
Please guide me on how to solve this problem. Thanks in advance!
Please check the input size of the image. It has to be exactly the same as specified in the model. You can use padding if the image is smaller.
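For example, a minimal sketch of a transform pipeline that forces a fixed, sufficiently large input (224x224 is an assumed target size; use whatever size your model was configured for):

from torchvision import transforms

# Resize (or pad) every image to the spatial size the network expects, so
# repeated pooling can never shrink a feature map down to 0x0.
preprocess = transforms.Compose([
    transforms.Resize((224, 224)),  # assumed target size
    transforms.ToTensor(),
])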
EDIT2
OK, so far I have tried Python 3.5 with TF 1.10 and Python 2.7 with TF 1.10.
I am still getting this error:
Traceback (most recent call last):
File "object_detection/model_main.py", line 101, in <module>
tf.app.run()
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/platform/app.py", line 125, in run
_sys.exit(main(argv))
File "object_detection/model_main.py", line 97, in main
tf.estimator.train_and_evaluate(estimator, train_spec, eval_specs[0])
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/training.py", line 455, in train_and_evaluate
return executor.run()
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/training.py", line 594, in run
return self.run_local()
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/training.py", line 695, in run_local
saving_listeners=saving_listeners)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/estimator.py", line 354, in train
loss = self._train_model(input_fn, hooks, saving_listeners)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/estimator.py", line 1179, in _train_model
return self._train_model_default(input_fn, hooks, saving_listeners)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/estimator.py", line 1209, in _train_model_default
features, labels, model_fn_lib.ModeKeys.TRAIN, self.config)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/estimator.py", line 1167, in _call_model_fn
model_fn_results = self._model_fn(features=features, **kwargs)
File "/home/nvidia/tensorflow/models/research/object_detection/model_lib.py", line 287, in model_fn
prediction_dict, features[fields.InputDataFields.true_image_shape])
File "/home/nvidia/tensorflow/models/research/object_detection/meta_architectures/ssd_meta_arch.py", line 686, in loss
keypoints, weights)
File "/home/nvidia/tensorflow/models/research/object_detection/meta_architectures/ssd_meta_arch.py", line 859, in _assign_targets
groundtruth_weights_list)
File "/home/nvidia/tensorflow/models/research/object_detection/core/target_assigner.py", line 481, in batch_assign_targets
anchors, gt_boxes, gt_class_targets, unmatched_class_label, gt_weights)
File "/home/nvidia/tensorflow/models/research/object_detection/core/target_assigner.py", line 180, in assign
match = self._matcher.match(match_quality_matrix, **params)
File "/home/nvidia/tensorflow/models/research/object_detection/core/matcher.py", line 239, in match
return Match(self._match(similarity_matrix, **params),
File "/home/nvidia/tensorflow/models/research/object_detection/matchers/argmax_matcher.py", line 190, in _match
_match_when_rows_are_non_empty, _match_when_rows_are_empty)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/util/deprecation.py", line 488, in new_func
return func(*args, **kwargs)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/control_flow_ops.py", line 2074, in cond
orig_res_t, res_t = context_t.BuildCondBranch(true_fn)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/control_flow_ops.py", line 1920, in BuildCondBranch
original_result = fn()
File "/home/nvidia/tensorflow/models/research/object_detection/matchers/argmax_matcher.py", line 153, in _match_when_rows_are_non_empty
-1)
File "/home/nvidia/tensorflow/models/research/object_detection/matchers/argmax_matcher.py", line 203, in _set_values_using_indicator
indicator = tf.cast(1-indicator, x.dtype)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/math_ops.py", line 878, in r_binary_op_wrapper
x = ops.convert_to_tensor(x, dtype=y.dtype.base_dtype, name="x")
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/ops.py", line 1028, in convert_to_tensor
as_ref=False)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/ops.py", line 1124, in internal_convert_to_tensor
ret = conversion_func(value, dtype=dtype, name=name, as_ref=as_ref)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/constant_op.py", line 228, in _constant_tensor_conversion_function
return constant(v, dtype=dtype, name=name)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/constant_op.py", line 207, in constant
value, dtype=dtype, shape=shape, verify_shape=verify_shape))
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/tensor_util.py", line 442, in make_tensor_proto
_AssertCompatible(values, dtype)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/tensor_util.py", line 353, in _AssertCompatible
(dtype.name, repr(mismatch), type(mismatch).__name__))
TypeError: Expected bool, got 1 of type 'int' instead.
Has anybody tried to train on a TX2, or is this just my case and I did something wrong?
ORIGINAL
Trying to train MobileNet SSD on a Jetson TX2 (I know it is not meant for training, but I have no better option).
I followed these guides:
https://towardsdatascience.com/how-to-train-your-own-object-detector-with-tensorflows-object-detector-api-bec72ecfe1d9
https://github.com/tensorflow/models/blob/master/research/object_detection/g3doc/running_locally.md
Training runs fine on my laptop (CPU), but I get the following error on my TX2:
Traceback (most recent call last):
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/op_def_library.py", line 510, in _apply_op_helper
preferred_dtype=default_dtype)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/ops.py", line 1040, in internal_convert_to_tensor
ret = conversion_func(value, dtype=dtype, name=name, as_ref=as_ref)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/ops.py", line 883, in _TensorTensorConversionFunction
(dtype.name, t.dtype.name, str(t)))
ValueError: Tensor conversion requested dtype int32 for Tensor with dtype float32: 'Tensor("Loss/Loss/huber_loss/Sub_1:0", shape=(24, 1917, 4), dtype=float32)'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "object_detection/model_main.py", line 101, in <module>
tf.app.run()
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/platform/app.py", line 126, in run
_sys.exit(main(argv))
File "object_detection/model_main.py", line 97, in main
tf.estimator.train_and_evaluate(estimator, train_spec, eval_specs[0])
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/estimator/training.py", line 425, in train_and_evaluate
executor.run()
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/estimator/training.py", line 504, in run
self.run_local()
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/estimator/training.py", line 636, in run_local
hooks=train_hooks)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/estimator/estimator.py", line 355, in train
loss = self._train_model(input_fn, hooks, saving_listeners)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/estimator/estimator.py", line 824, in _train_model
features, labels, model_fn_lib.ModeKeys.TRAIN, self.config)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/estimator/estimator.py", line 805, in _call_model_fn
model_fn_results = self._model_fn(features=features, **kwargs)
File "/home/nvidia/tensorflow/models/research/object_detection/model_lib.py", line 287, in model_fn
prediction_dict, features[fields.InputDataFields.true_image_shape])
File "/home/nvidia/tensorflow/models/research/object_detection/meta_architectures/ssd_meta_arch.py", line 708, in loss
weights=batch_reg_weights)
File "/home/nvidia/tensorflow/models/research/object_detection/core/losses.py", line 74, in __call__
return self._compute_loss(prediction_tensor, target_tensor, **params)
File "/home/nvidia/tensorflow/models/research/object_detection/core/losses.py", line 157, in _compute_loss
reduction=tf.losses.Reduction.NONE
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/losses/losses_impl.py", line 444, in huber_loss
math_ops.multiply(delta, linear))
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/math_ops.py", line 326, in multiply
return gen_math_ops.mul(x, y, name)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/gen_math_ops.py", line 4689, in mul
"Mul", x=x, y=y, name=name)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/op_def_library.py", line 546, in _apply_op_helper
inferred_from[input_arg.type_attr]))
TypeError: Input 'y' of 'Mul' Op has type float32 that does not match type int32 of argument 'x'.
NOTE:
I used precompiled wheels to install TensorFlow.
There was an error with the protobuf compiler, which I solved by removing the line
reserved 6; (line number 104)
in ssd.proto in the object_detection/protos folder.
I found this solution somewhere but couldn't find the link again.
Here is the script I use to start training:
PIPELINE_CONFIG_PATH=/home/nvidia/testtraining/models/model/ssd_mobilenet_v1_pets.config
MODEL_DIR=/home/nvidia/testtraining/models/model/
NUM_TRAIN_STEPS=50000
NUM_EVAL_STEPS=2000
python3 object_detection/model_main.py \
--pipeline_config_path=${PIPELINE_CONFIG_PATH} \
--model_dir=${MODEL_DIR} \
--num_train_steps=${NUM_TRAIN_STEPS} \
--num_eval_steps=${NUM_EVAL_STEPS} \
--alsologtostderr
Laptop TF version 1.10.0
Jetson TX2 TF version 1.6.0-rc1
I'm new to Ubuntu and TensorFlow, so go easy on me :)
Thanks
EDIT:
It seems like line 546, in _apply_op_helper, is some sort of error-handling line.
I tried to fix this error with the following edit: in /usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/math_ops.py, I added these lines at line 236, just after the def statement:
import tensorflow as tf
y = tf.cast(y, x.dtype)
This produced another error message, which I solved by editing /home/nvidia/tensorflow/models/research/object_detection/matchers/argmax_matcher.py, changing lines 203-204 to:
indicator = tf.cast(1-indicator, x.dtype)
return tf.add(tf.multiply(x, indicator), val * indicator)
But I am still getting this error:
Traceback (most recent call last):
File "object_detection/model_main.py", line 101, in <module>
tf.app.run()
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/platform/app.py", line 126, in run
_sys.exit(main(argv))
File "object_detection/model_main.py", line 97, in main
tf.estimator.train_and_evaluate(estimator, train_spec, eval_specs[0])
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/estimator/training.py", line 425, in train_and_evaluate
executor.run()
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/estimator/training.py", line 504, in run
self.run_local()
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/estimator/training.py", line 636, in run_local
hooks=train_hooks)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/estimator/estimator.py", line 355, in train
loss = self._train_model(input_fn, hooks, saving_listeners)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/estimator/estimator.py", line 824, in _train_model
features, labels, model_fn_lib.ModeKeys.TRAIN, self.config)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/estimator/estimator.py", line 805, in _call_model_fn
model_fn_results = self._model_fn(features=features, **kwargs)
File "/home/nvidia/tensorflow/models/research/object_detection/model_lib.py", line 287, in model_fn
prediction_dict, features[fields.InputDataFields.true_image_shape])
File "/home/nvidia/tensorflow/models/research/object_detection/meta_architectures/ssd_meta_arch.py", line 686, in loss
keypoints, weights)
File "/home/nvidia/tensorflow/models/research/object_detection/meta_architectures/ssd_meta_arch.py", line 859, in _assign_targets
groundtruth_weights_list)
File "/home/nvidia/tensorflow/models/research/object_detection/core/target_assigner.py", line 481, in batch_assign_targets
anchors, gt_boxes, gt_class_targets, unmatched_class_label, gt_weights)
File "/home/nvidia/tensorflow/models/research/object_detection/core/target_assigner.py", line 180, in assign
match = self._matcher.match(match_quality_matrix, **params)
File "/home/nvidia/tensorflow/models/research/object_detection/core/matcher.py", line 239, in match
return Match(self._match(similarity_matrix, **params),
File "/home/nvidia/tensorflow/models/research/object_detection/matchers/argmax_matcher.py", line 190, in _match
_match_when_rows_are_non_empty, _match_when_rows_are_empty)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/util/deprecation.py", line 432, in new_func
return func(*args, **kwargs)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/control_flow_ops.py", line 2047, in cond
orig_res_t, res_t = context_t.BuildCondBranch(true_fn)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/control_flow_ops.py", line 1897, in BuildCondBranch
original_result = fn()
File "/home/nvidia/tensorflow/models/research/object_detection/matchers/argmax_matcher.py", line 153, in _match_when_rows_are_non_empty
-1)
File "/home/nvidia/tensorflow/models/research/object_detection/matchers/argmax_matcher.py", line 203, in _set_values_using_indicator
indicator = tf.cast(1-indicator, x.dtype)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/math_ops.py", line 983, in r_binary_op_wrapper
x = ops.convert_to_tensor(x, dtype=y.dtype.base_dtype, name="x")
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/ops.py", line 950, in convert_to_tensor
as_ref=False)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/ops.py", line 1040, in internal_convert_to_tensor
ret = conversion_func(value, dtype=dtype, name=name, as_ref=as_ref)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/constant_op.py", line 235, in _constant_tensor_conversion_function
return constant(v, dtype=dtype, name=name)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/constant_op.py", line 214, in constant
value, dtype=dtype, shape=shape, verify_shape=verify_shape))
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/tensor_util.py", line 433, in make_tensor_proto
_AssertCompatible(values, dtype)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/tensor_util.py", line 344, in _AssertCompatible
(dtype.name, repr(mismatch), type(mismatch).__name__))
TypeError: Expected bool, got 1 of type 'int' instead.
And this one is out of my league.
I think there are major compatibility issues, so I will just install TF 1.1 instead.
I am open to new ideas, though.
The key to your problem is here (at the bottom of the traceback):
TypeError: Input 'y' of 'Mul' Op has type float32 that does not match type int32 of argument 'x'.
Your y has type float32, but the argument x (the place where you pass this y to) needs to be int32.
Try using tf.cast(y, tf.int32) or something like that.
Sometimes there are changes in TF, or you may be using an older model version, so this can happen from time to time.
So just open
/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/op_def_library.py,
find line 546, and do that cast (using sudo vim, I guess).
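As a generic sketch of the cast pattern (which operand you cast, and to which dtype, depends on what you actually want to keep):

import tensorflow as tf

x = tf.constant([1, 2, 3], dtype=tf.int32)
y = tf.constant([0.5, 1.5, 2.5], dtype=tf.float32)

# Mul requires both operands to have the same dtype, so cast one of them.
product = tf.multiply(tf.cast(x, tf.float32), y)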
I can use a placeholder as the batch_size with tf.train.batch_join() (which is queue-based), so I can dynamically change the batch size in the training loop.
But when I use a placeholder (or a non-trainable variable) as the batch_size for tf.data.Dataset.batch(), I get this error:
ValueError: Cannot capture a placeholder (name:Placeholder, type:Placeholder) by value.
The whole stack trace is very long. I traced the error to v1.4 tensorflow/python/data/ops/dataset_ops.py:108 in make_one_shot_iterator():
@function.Defun(capture_by_value=True)
The full stack trace is attached below. I was trying the official TF ResNet model.
Thanks!!
Traceback (most recent call last):
File "imagenet_main.py", line 281, in <module>
tf.app.run(argv=[sys.argv[0]] + unparsed)
File "/usr/lib/python2.7/site-packages/tensorflow/python/platform/app.py", line 48, in run
_sys.exit(main(_sys.argv[:1] + flags_passthrough))
File "imagenet_main.py", line 270, in main
hooks=[logging_hook])
File "/usr/lib/python2.7/site-packages/tensorflow/python/estimator/estimator.py", line 302, in train
loss = self._train_model(input_fn, hooks, saving_listeners)
File "/usr/lib/python2.7/site-packages/tensorflow/python/estimator/estimator.py", line 708, in _train_model
input_fn, model_fn_lib.ModeKeys.TRAIN)
File "/usr/lib/python2.7/site-packages/tensorflow/python/estimator/estimator.py", line 577, in _get_features_and_labels_from_input_fn
result = self._call_input_fn(input_fn, mode)
File "/usr/lib/python2.7/site-packages/tensorflow/python/estimator/estimator.py", line 663, in _call_input_fn
return input_fn(**kwargs)
File "imagenet_main.py", line 269, in <lambda>
True, FLAGS.data_dir, worker_batch_size, FLAGS.epochs_per_eval),
File "imagenet_main.py", line 157, in input_fn
iterator = dataset.make_one_shot_iterator()
File "/usr/lib/python2.7/site-packages/tensorflow/python/data/ops/dataset_ops.py", line 113, in make_one_shot_iterator
_make_dataset.add_to_graph(ops.get_default_graph())
File "/usr/lib/python2.7/site-packages/tensorflow/python/framework/function.py", line 486, in add_to_graph
self._create_definition_if_needed()
File "/usr/lib/python2.7/site-packages/tensorflow/python/framework/function.py", line 321, in _create_definition_if_needed
self._create_definition_if_needed_impl()
File "/usr/lib/python2.7/site-packages/tensorflow/python/framework/function.py", line 338, in _create_definition_if_needed_impl
outputs = self._func(*inputs)
File "/usr/lib/python2.7/site-packages/tensorflow/python/data/ops/dataset_ops.py", line 111, in _make_dataset
return self._as_variant_tensor() # pylint: disable=protected-access
File "/usr/lib/python2.7/site-packages/tensorflow/python/data/ops/dataset_ops.py", line 1225, in _as_variant_tensor
self._input_dataset._as_variant_tensor(), # pylint: disable=protected-access
File "/usr/lib/python2.7/site-packages/tensorflow/python/data/ops/dataset_ops.py", line 1036, in _as_variant_tensor
self._input_dataset._as_variant_tensor(), # pylint: disable=protected-access
File "/usr/lib/python2.7/site-packages/tensorflow/python/data/ops/dataset_ops.py", line 1147, in _as_variant_tensor
self._input_dataset._as_variant_tensor(), # pylint: disable=protected-access
File "/usr/lib/python2.7/site-packages/tensorflow/python/data/ops/dataset_ops.py", line 1598, in _as_variant_tensor
output_types=nest.flatten(self.output_types))
File "/usr/lib/python2.7/site-packages/tensorflow/python/ops/gen_dataset_ops.py", line 1062, in prefetch_dataset
output_shapes=output_shapes, name=name)
File "/usr/lib/python2.7/site-packages/tensorflow/python/framework/op_def_library.py", line 787, in _apply_op_helper
op_def=op_def)
File "/usr/lib/python2.7/site-packages/tensorflow/python/framework/function.py", line 691, in create_op
inputs[i] = self._add_tensor_and_parents(x)
File "/usr/lib/python2.7/site-packages/tensorflow/python/framework/function.py", line 706, in _add_tensor_and_parents
op = self._add_op_and_parents(tensor.op)
File "/usr/lib/python2.7/site-packages/tensorflow/python/framework/function.py", line 718, in _add_op_and_parents
"by value." % (op.name, op.type))
ValueError: Cannot capture a placeholder (name:Placeholder, type:Placeholder) by value.
It seems like you have to use an initializable iterator, not a one-shot one.
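A minimal sketch of that approach in graph-mode TF 1.x, with a toy range dataset standing in for the real input pipeline:

import tensorflow as tf

# Dataset.batch() expects an int64 scalar, so the placeholder is int64.
batch_size = tf.placeholder(tf.int64, shape=[], name='batch_size')

dataset = tf.data.Dataset.range(1000).batch(batch_size)

# An initializable iterator can depend on the placeholder; a one-shot
# iterator cannot, because it captures its inputs by value.
iterator = dataset.make_initializable_iterator()
next_batch = iterator.get_next()

with tf.Session() as sess:
    sess.run(iterator.initializer, feed_dict={batch_size: 32})
    print(sess.run(next_batch))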