Tensorflow load model and load_weights give different behavior - python

I have one working tensorflow RNN model (called my_model) and I save it using my_model.save("<dir_to_my_model>").
However, when I reload this model with
model = tf.keras.models.load_model("<dir_to_my_model>")
and run a few incremental learning epochs with
# Run the forward pass inside the tape context so it is recorded for a later
# tape.gradient(...) call.
with tf.GradientTape() as tape:
    logits = model(x_test_batch, training=True)
I got an error (very long, please see at the bottom).
However, if I save the weights of this model to a checkpoint using model.save_weights(), create a new model, and load the weights from that checkpoint using model.load_weights(), the code above runs successfully.
My question is: why do these two methods behave differently, and what should I do to make method 1 work?
InvalidArgumentError Traceback (most recent call last)
~/work/tf/lib/python3.7/site-packages/tensorflow_core/python/framework/ops.py in get_attr(self, name)
2325 with c_api_util.tf_buffer() as buf:
-> 2326 c_api.TF_OperationGetAttrValueProto(self._c_op, name, buf)
2327 data = c_api.TF_GetBuffer(buf)
InvalidArgumentError: Operation 'StatefulPartitionedCall' has no attr named '_XlaCompile'.
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last)
~/work/tf/lib/python3.7/site-packages/tensorflow_core/python/ops/gradients_util.py in _MaybeCompile(scope, op, func, grad_fn)
330 try:
--> 331 xla_compile = op.get_attr("_XlaCompile")
332 xla_separate_compiled_gradients = op.get_attr(
~/work/tf/lib/python3.7/site-packages/tensorflow_core/python/framework/ops.py in get_attr(self, name)
2329 # Convert to ValueError for backwards compatibility.
-> 2330 raise ValueError(str(e))
2331 x = attr_value_pb2.AttrValue()
ValueError: Operation 'StatefulPartitionedCall' has no attr named '_XlaCompile'.
During handling of the above exception, another exception occurred:
InvalidArgumentError Traceback (most recent call last)
~/work/tf/lib/python3.7/site-packages/tensorflow_core/python/framework/ops.py in get_attr(self, name)
2325 with c_api_util.tf_buffer() as buf:
-> 2326 c_api.TF_OperationGetAttrValueProto(self._c_op, name, buf)
2327 data = c_api.TF_GetBuffer(buf)
InvalidArgumentError: Operation 'StatefulPartitionedCall' has no attr named '_XlaCompile'.
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last)
~/work/tf/lib/python3.7/site-packages/tensorflow_core/python/ops/gradients_util.py in _MaybeCompile(scope, op, func, grad_fn)
330 try:
--> 331 xla_compile = op.get_attr("_XlaCompile")
332 xla_separate_compiled_gradients = op.get_attr(
~/work/tf/lib/python3.7/site-packages/tensorflow_core/python/framework/ops.py in get_attr(self, name)
2329 # Convert to ValueError for backwards compatibility.
-> 2330 raise ValueError(str(e))
2331 x = attr_value_pb2.AttrValue()
ValueError: Operation 'StatefulPartitionedCall' has no attr named '_XlaCompile'.
During handling of the above exception, another exception occurred:
LookupError Traceback (most recent call last)
~/work/tf/lib/python3.7/site-packages/tensorflow_core/python/ops/gradients_util.py in _GradientsHelper(ys, xs, grad_ys, name, colocate_gradients_with_ops, gate_gradients, aggregation_method, stop_gradients, unconnected_gradients, src_graph)
606 try:
--> 607 grad_fn = ops.get_gradient_function(op)
608 except LookupError:
~/work/tf/lib/python3.7/site-packages/tensorflow_core/python/framework/ops.py in get_gradient_function(op)
2494 op_type = op.type
-> 2495 return _gradient_registry.lookup(op_type)
2496
~/work/tf/lib/python3.7/site-packages/tensorflow_core/python/framework/registry.py in lookup(self, name)
96 raise LookupError(
---> 97 "%s registry has no entry for: %s" % (self._name, name))
LookupError: gradient registry has no entry for: While
During handling of the above exception, another exception occurred:
LookupError Traceback (most recent call last)
<ipython-input-7-55225939f96a> in <module>
10 mode = 'DNN' if 'ANN' in m else 'RNN'
11 c_datasets = continual_dataset(datasets[i], mode=mode)
---> 12 res = learning_rate_test(continual_fine_tune, mname, c_datasets, lr_list=lrs)
13 dfs.append(res)
<ipython-input-5-f4d4cc64dc17> in learning_rate_test(continual_func, model_fname, c_datasets, epochs, lr_list, loss)
14 model.save_weights('Model/tmp')
15 model.load_weights('Model/tmp')
---> 16 test_predictions = continual_func(c_datasets, model, loss_fn, lr, epochs=e, save_model='')
17 test_predictions = (np.array(test_predictions) > 0.5).astype(int)
18 epoch_test_dict[e].append(np.sum(np.equal(y_test_list, test_predictions)) / len(y_test_list) * 100)
<ipython-input-4-9ae956b3e918> in continual_fine_tune(c_datasets, model, loss_fn, lr, epochs, save_model)
12 for e in range(epochs):
13 with tf.GradientTape() as tape:
---> 14 logits = model(x_test_batch, training=True)
15 loss_value = loss_fn(y_test_batch, logits)
16 grads = tape.gradient(loss_value, model.trainable_weights)
~/work/tf/lib/python3.7/site-packages/tensorflow_core/python/keras/engine/base_layer.py in __call__(self, inputs, *args, **kwargs)
820 with base_layer_utils.autocast_context_manager(
821 self._compute_dtype):
--> 822 outputs = self.call(cast_inputs, *args, **kwargs)
823 self._handle_activity_regularization(inputs, outputs)
824 self._set_mask_metadata(inputs, outputs, input_masks)
~/work/tf/lib/python3.7/site-packages/tensorflow_core/python/keras/engine/sequential.py in call(self, inputs, training, mask)
265 if not self.built:
266 self._init_graph_network(self.inputs, self.outputs, name=self.name)
--> 267 return super(Sequential, self).call(inputs, training=training, mask=mask)
268
269 outputs = inputs # handle the corner case where self.layers is empty
~/work/tf/lib/python3.7/site-packages/tensorflow_core/python/keras/engine/network.py in call(self, inputs, training, mask)
715 return self._run_internal_graph(
716 inputs, training=training, mask=mask,
--> 717 convert_kwargs_to_constants=base_layer_utils.call_context().saving)
718
719 def compute_output_shape(self, input_shape):
~/work/tf/lib/python3.7/site-packages/tensorflow_core/python/keras/engine/network.py in _run_internal_graph(self, inputs, training, mask, convert_kwargs_to_constants)
889
890 # Compute outputs.
--> 891 output_tensors = layer(computed_tensors, **kwargs)
892
893 # Update tensor_dict.
~/work/tf/lib/python3.7/site-packages/tensorflow_core/python/keras/engine/base_layer.py in __call__(self, inputs, *args, **kwargs)
820 with base_layer_utils.autocast_context_manager(
821 self._compute_dtype):
--> 822 outputs = self.call(cast_inputs, *args, **kwargs)
823 self._handle_activity_regularization(inputs, outputs)
824 self._set_mask_metadata(inputs, outputs, input_masks)
~/work/tf/lib/python3.7/site-packages/tensorflow_core/python/keras/saving/saved_model/utils.py in return_outputs_and_add_losses(*args, **kwargs)
57 inputs = args[inputs_arg_index]
58 args = args[inputs_arg_index + 1:]
---> 59 outputs, losses = fn(inputs, *args, **kwargs)
60 layer.add_loss(losses, inputs)
61 return outputs
~/work/tf/lib/python3.7/site-packages/tensorflow_core/python/keras/saving/saved_model/utils.py in wrap_with_training_arg(*args, **kwargs)
111 training,
112 lambda: replace_training_and_call(True),
--> 113 lambda: replace_training_and_call(False))
114
115 # Create arg spec for decorated function. If 'training' is not defined in the
~/work/tf/lib/python3.7/site-packages/tensorflow_core/python/keras/utils/tf_utils.py in smart_cond(pred, true_fn, false_fn, name)
57 pred, true_fn=true_fn, false_fn=false_fn, name=name)
58 return smart_module.smart_cond(
---> 59 pred, true_fn=true_fn, false_fn=false_fn, name=name)
60
61
~/work/tf/lib/python3.7/site-packages/tensorflow_core/python/framework/smart_cond.py in smart_cond(pred, true_fn, false_fn, name)
52 if pred_value is not None:
53 if pred_value:
---> 54 return true_fn()
55 else:
56 return false_fn()
~/work/tf/lib/python3.7/site-packages/tensorflow_core/python/keras/saving/saved_model/utils.py in <lambda>()
110 return tf_utils.smart_cond(
111 training,
--> 112 lambda: replace_training_and_call(True),
113 lambda: replace_training_and_call(False))
114
~/work/tf/lib/python3.7/site-packages/tensorflow_core/python/keras/saving/saved_model/utils.py in replace_training_and_call(training)
106 def replace_training_and_call(training):
107 set_training_arg(training, training_arg_index, args, kwargs)
--> 108 return wrapped_call(*args, **kwargs)
109
110 return tf_utils.smart_cond(
~/work/tf/lib/python3.7/site-packages/tensorflow_core/python/eager/def_function.py in __call__(self, *args, **kwds)
566 xla_context.Exit()
567 else:
--> 568 result = self._call(*args, **kwds)
569
570 if tracing_count == self._get_tracing_count():
~/work/tf/lib/python3.7/site-packages/tensorflow_core/python/eager/def_function.py in _call(self, *args, **kwds)
604 # In this case we have not created variables on the first call. So we can
605 # run the first trace but we should fail if variables are created.
--> 606 results = self._stateful_fn(*args, **kwds)
607 if self._created_variables:
608 raise ValueError("Creating variables on a non-first call to a function"
~/work/tf/lib/python3.7/site-packages/tensorflow_core/python/eager/function.py in __call__(self, *args, **kwargs)
2361 with self._lock:
2362 graph_function, args, kwargs = self._maybe_define_function(args, kwargs)
-> 2363 return graph_function._filtered_call(args, kwargs) # pylint: disable=protected-access
2364
2365 #property
~/work/tf/lib/python3.7/site-packages/tensorflow_core/python/eager/function.py in _filtered_call(self, args, kwargs)
1609 if isinstance(t, (ops.Tensor,
1610 resource_variable_ops.BaseResourceVariable))),
-> 1611 self.captured_inputs)
1612
1613 def _call_flat(self, args, captured_inputs, cancellation_manager=None):
~/work/tf/lib/python3.7/site-packages/tensorflow_core/python/eager/function.py in _call_flat(self, args, captured_inputs, cancellation_manager)
1695 possible_gradient_type,
1696 executing_eagerly)
-> 1697 forward_function, args_with_tangents = forward_backward.forward()
1698 if executing_eagerly:
1699 flat_outputs = forward_function.call(
~/work/tf/lib/python3.7/site-packages/tensorflow_core/python/eager/function.py in forward(self)
1421 """Builds or retrieves a forward function for this call."""
1422 forward_function = self._functions.forward(
-> 1423 self._inference_args, self._input_tangents)
1424 return forward_function, self._inference_args + self._input_tangents
1425
~/work/tf/lib/python3.7/site-packages/tensorflow_core/python/eager/function.py in forward(self, inference_args, input_tangents)
1183 (self._forward, self._forward_graph, self._backward,
1184 self._forwardprop_output_indices, self._num_forwardprop_outputs) = (
-> 1185 self._forward_and_backward_functions(inference_args, input_tangents))
1186 return self._forward
1187
~/work/tf/lib/python3.7/site-packages/tensorflow_core/python/eager/function.py in _forward_and_backward_functions(self, inference_args, input_tangents)
1329 outputs = self._func_graph.outputs[:self._num_inference_outputs]
1330 return self._build_functions_for_outputs(
-> 1331 outputs, inference_args, input_tangents)
1332
1333
~/work/tf/lib/python3.7/site-packages/tensorflow_core/python/eager/function.py in _build_functions_for_outputs(self, outputs, inference_args, input_tangents)
888 self._func_graph.inputs,
889 grad_ys=gradients_wrt_outputs,
--> 890 src_graph=self._func_graph)
891
892 captures_from_forward = [
~/work/tf/lib/python3.7/site-packages/tensorflow_core/python/ops/gradients_util.py in _GradientsHelper(ys, xs, grad_ys, name, colocate_gradients_with_ops, gate_gradients, aggregation_method, stop_gradients, unconnected_gradients, src_graph)
667 # functions.
668 in_grads = _MaybeCompile(grad_scope, op, func_call,
--> 669 lambda: grad_fn(op, *out_grads))
670 else:
671 # For function call ops, we add a 'SymbolicGradient'
~/work/tf/lib/python3.7/site-packages/tensorflow_core/python/ops/gradients_util.py in _MaybeCompile(scope, op, func, grad_fn)
334 xla_scope = op.get_attr("_XlaScope").decode()
335 except ValueError:
--> 336 return grad_fn() # Exit early
337
338 if not xla_compile:
~/work/tf/lib/python3.7/site-packages/tensorflow_core/python/ops/gradients_util.py in <lambda>()
667 # functions.
668 in_grads = _MaybeCompile(grad_scope, op, func_call,
--> 669 lambda: grad_fn(op, *out_grads))
670 else:
671 # For function call ops, we add a 'SymbolicGradient'
~/work/tf/lib/python3.7/site-packages/tensorflow_core/python/eager/function.py in _rewrite_forward_and_call_backward(self, op, *doutputs)
705 def _rewrite_forward_and_call_backward(self, op, *doutputs):
706 """Add outputs to the forward call and feed them to the grad function."""
--> 707 forward_function, backwards_function = self.forward_backward(len(doutputs))
708 if not backwards_function.outputs:
709 return backwards_function.structured_outputs
~/work/tf/lib/python3.7/site-packages/tensorflow_core/python/eager/function.py in forward_backward(self, num_doutputs)
614 if forward_backward is not None:
615 return forward_backward
--> 616 forward, backward = self._construct_forward_backward(num_doutputs)
617 self._cached_function_pairs[num_doutputs] = (forward, backward)
618 return forward, backward
~/work/tf/lib/python3.7/site-packages/tensorflow_core/python/eager/function.py in _construct_forward_backward(self, num_doutputs)
662 args=[], kwargs={},
663 signature=signature,
--> 664 func_graph=backwards_graph)
665 backwards_graph_captures = backwards_graph.external_captures
666 captures_from_forward = [
~/work/tf/lib/python3.7/site-packages/tensorflow_core/python/framework/func_graph.py in func_graph_from_py_func(name, python_func, args, kwargs, signature, func_graph, autograph, autograph_options, add_control_dependencies, arg_names, op_return_value, collections, capture_by_value, override_flat_arg_shapes)
976 converted_func)
977
--> 978 func_outputs = python_func(*func_args, **func_kwargs)
979
980 # invariant: `func_outputs` contains only Tensors, CompositeTensors,
~/work/tf/lib/python3.7/site-packages/tensorflow_core/python/eager/function.py in _backprop_function(*grad_ys)
652 self._func_graph.inputs,
653 grad_ys=grad_ys,
--> 654 src_graph=self._func_graph)
655
656 with self._func_graph.as_default():
~/work/tf/lib/python3.7/site-packages/tensorflow_core/python/ops/gradients_util.py in _GradientsHelper(ys, xs, grad_ys, name, colocate_gradients_with_ops, gate_gradients, aggregation_method, stop_gradients, unconnected_gradients, src_graph)
667 # functions.
668 in_grads = _MaybeCompile(grad_scope, op, func_call,
--> 669 lambda: grad_fn(op, *out_grads))
670 else:
671 # For function call ops, we add a 'SymbolicGradient'
~/work/tf/lib/python3.7/site-packages/tensorflow_core/python/ops/gradients_util.py in _MaybeCompile(scope, op, func, grad_fn)
334 xla_scope = op.get_attr("_XlaScope").decode()
335 except ValueError:
--> 336 return grad_fn() # Exit early
337
338 if not xla_compile:
~/work/tf/lib/python3.7/site-packages/tensorflow_core/python/ops/gradients_util.py in <lambda>()
667 # functions.
668 in_grads = _MaybeCompile(grad_scope, op, func_call,
--> 669 lambda: grad_fn(op, *out_grads))
670 else:
671 # For function call ops, we add a 'SymbolicGradient'
~/work/tf/lib/python3.7/site-packages/tensorflow_core/python/eager/function.py in _rewrite_forward_and_call_backward(self, op, *doutputs)
705 def _rewrite_forward_and_call_backward(self, op, *doutputs):
706 """Add outputs to the forward call and feed them to the grad function."""
--> 707 forward_function, backwards_function = self.forward_backward(len(doutputs))
708 if not backwards_function.outputs:
709 return backwards_function.structured_outputs
~/work/tf/lib/python3.7/site-packages/tensorflow_core/python/eager/function.py in forward_backward(self, num_doutputs)
614 if forward_backward is not None:
615 return forward_backward
--> 616 forward, backward = self._construct_forward_backward(num_doutputs)
617 self._cached_function_pairs[num_doutputs] = (forward, backward)
618 return forward, backward
~/work/tf/lib/python3.7/site-packages/tensorflow_core/python/eager/function.py in _construct_forward_backward(self, num_doutputs)
662 args=[], kwargs={},
663 signature=signature,
--> 664 func_graph=backwards_graph)
665 backwards_graph_captures = backwards_graph.external_captures
666 captures_from_forward = [
~/work/tf/lib/python3.7/site-packages/tensorflow_core/python/framework/func_graph.py in func_graph_from_py_func(name, python_func, args, kwargs, signature, func_graph, autograph, autograph_options, add_control_dependencies, arg_names, op_return_value, collections, capture_by_value, override_flat_arg_shapes)
976 converted_func)
977
--> 978 func_outputs = python_func(*func_args, **func_kwargs)
979
980 # invariant: `func_outputs` contains only Tensors, CompositeTensors,
~/work/tf/lib/python3.7/site-packages/tensorflow_core/python/eager/function.py in _backprop_function(*grad_ys)
652 self._func_graph.inputs,
653 grad_ys=grad_ys,
--> 654 src_graph=self._func_graph)
655
656 with self._func_graph.as_default():
~/work/tf/lib/python3.7/site-packages/tensorflow_core/python/ops/gradients_util.py in _GradientsHelper(ys, xs, grad_ys, name, colocate_gradients_with_ops, gate_gradients, aggregation_method, stop_gradients, unconnected_gradients, src_graph)
621 raise LookupError(
622 "No gradient defined for operation '%s' (op type: %s)" %
--> 623 (op.name, op.type))
624 if loop_state:
625 loop_state.EnterGradWhileContext(op, before=False)
LookupError: No gradient defined for operation 'while' (op type: While)

I have run into that in the past. The following is how I save/load keras models.
Note that the architecture and the weights are separate.
# saving model: the architecture (JSON) and the weights (HDF5) go to two
# separate files; the names must match the ones used when loading below
with open('architecture.json', 'w') as f:
    f.write(model.to_json())
model.save_weights('model_weights.h5', overwrite=True)
from tensorflow.keras.models import model_from_json
# loading model: rebuild the architecture from JSON first, then restore the
# weights into the freshly built model
with open('architecture.json') as f:
    model = model_from_json(f.read())
model.load_weights('model_weights.h5')

Related

XGBoostError: ../src/gbm/gbtree.cc:588: Check failed: common::AllVisibleGPUs() >= 1 (0 vs. 1) : No visible GPU is found for XGBoost

When I try to run hyperparameter Bayesian optimization using:
from skopt import BayesSearchCV
I'm getting this error on a JupyterLab AWS platform with a pip environment.
The full error:
---------------------------------------------------------------------------
XGBoostError Traceback (most recent call last)
Input In [49], in <cell line: 1>()
----> 1 opt.fit(X_train, y_train['dr_with_pp_assumption_dpd30'])
File ~/wd/venv/lib/python3.8/site-packages/skopt/searchcv.py:466, in BayesSearchCV.fit(self, X, y, groups, callback, **fit_params)
463 else:
464 self.optimizer_kwargs_ = dict(self.optimizer_kwargs)
--> 466 super().fit(X=X, y=y, groups=groups, **fit_params)
468 # BaseSearchCV never ranked train scores,
469 # but apparently we used to ship this (back-compat)
470 if self.return_train_score:
File ~/wd/venv/lib/python3.8/site-packages/sklearn/model_selection/_search.py:891, in BaseSearchCV.fit(self, X, y, groups, **fit_params)
885 results = self._format_results(
886 all_candidate_params, n_splits, all_out, all_more_results
887 )
889 return results
--> 891 self._run_search(evaluate_candidates)
893 # multimetric is determined here because in the case of a callable
894 # self.scoring the return type is only known after calling
895 first_test_score = all_out[0]["test_scores"]
File ~/wd/venv/lib/python3.8/site-packages/skopt/searchcv.py:512, in BayesSearchCV._run_search(self, evaluate_candidates)
508 while n_iter > 0:
509 # when n_iter < n_points points left for evaluation
510 n_points_adjusted = min(n_iter, n_points)
--> 512 optim_result = self._step(
513 search_space, optimizer,
514 evaluate_candidates, n_points=n_points_adjusted
515 )
516 n_iter -= n_points
518 if eval_callbacks(callbacks, optim_result):
File ~/wd/venv/lib/python3.8/site-packages/skopt/searchcv.py:408, in BayesSearchCV._step(self, search_space, optimizer, evaluate_candidates, n_points)
405 # make lists into dictionaries
406 params_dict = [point_asdict(search_space, p) for p in params]
--> 408 all_results = evaluate_candidates(params_dict)
409 # Feed the point and objective value back into optimizer
410 # Optimizer minimizes objective, hence provide negative score
411 local_results = all_results["mean_test_score"][-len(params):]
File ~/wd/venv/lib/python3.8/site-packages/sklearn/model_selection/_search.py:838, in BaseSearchCV.fit.<locals>.evaluate_candidates(candidate_params, cv, more_results)
830 if self.verbose > 0:
831 print(
832 "Fitting {0} folds for each of {1} candidates,"
833 " totalling {2} fits".format(
834 n_splits, n_candidates, n_candidates * n_splits
835 )
836 )
--> 838 out = parallel(
839 delayed(_fit_and_score)(
840 clone(base_estimator),
841 X,
842 y,
843 train=train,
844 test=test,
845 parameters=parameters,
846 split_progress=(split_idx, n_splits),
847 candidate_progress=(cand_idx, n_candidates),
848 **fit_and_score_kwargs,
849 )
850 for (cand_idx, parameters), (split_idx, (train, test)) in product(
851 enumerate(candidate_params), enumerate(cv.split(X, y, groups))
852 )
853 )
855 if len(out) < 1:
856 raise ValueError(
857 "No fits were performed. "
858 "Was the CV iterator empty? "
859 "Were there no candidates?"
860 )
File ~/wd/venv/lib/python3.8/site-packages/joblib/parallel.py:1085, in Parallel.__call__(self, iterable)
1076 try:
1077 # Only set self._iterating to True if at least a batch
1078 # was dispatched. In particular this covers the edge
(...)
1082 # was very quick and its callback already dispatched all the
1083 # remaining jobs.
1084 self._iterating = False
-> 1085 if self.dispatch_one_batch(iterator):
1086 self._iterating = self._original_iterator is not None
1088 while self.dispatch_one_batch(iterator):
File ~/wd/venv/lib/python3.8/site-packages/joblib/parallel.py:901, in Parallel.dispatch_one_batch(self, iterator)
899 return False
900 else:
--> 901 self._dispatch(tasks)
902 return True
File ~/wd/venv/lib/python3.8/site-packages/joblib/parallel.py:819, in Parallel._dispatch(self, batch)
817 with self._lock:
818 job_idx = len(self._jobs)
--> 819 job = self._backend.apply_async(batch, callback=cb)
820 # A job can complete so quickly than its callback is
821 # called before we get here, causing self._jobs to
822 # grow. To ensure correct results ordering, .insert is
823 # used (rather than .append) in the following line
824 self._jobs.insert(job_idx, job)
File ~/wd/venv/lib/python3.8/site-packages/joblib/_parallel_backends.py:208, in SequentialBackend.apply_async(self, func, callback)
206 def apply_async(self, func, callback=None):
207 """Schedule a func to be run"""
--> 208 result = ImmediateResult(func)
209 if callback:
210 callback(result)
File ~/wd/venv/lib/python3.8/site-packages/joblib/_parallel_backends.py:597, in ImmediateResult.__init__(self, batch)
594 def __init__(self, batch):
595 # Don't delay the application, to avoid keeping the input
596 # arguments in memory
--> 597 self.results = batch()
File ~/wd/venv/lib/python3.8/site-packages/joblib/parallel.py:288, in BatchedCalls.__call__(self)
284 def __call__(self):
285 # Set the default nested backend to self._backend but do not set the
286 # change the default number of processes to -1
287 with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 288 return [func(*args, **kwargs)
289 for func, args, kwargs in self.items]
File ~/wd/venv/lib/python3.8/site-packages/joblib/parallel.py:288, in <listcomp>(.0)
284 def __call__(self):
285 # Set the default nested backend to self._backend but do not set the
286 # change the default number of processes to -1
287 with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 288 return [func(*args, **kwargs)
289 for func, args, kwargs in self.items]
File ~/wd/venv/lib/python3.8/site-packages/sklearn/utils/fixes.py:216, in _FuncWrapper.__call__(self, *args, **kwargs)
214 def __call__(self, *args, **kwargs):
215 with config_context(**self.config):
--> 216 return self.function(*args, **kwargs)
File ~/wd/venv/lib/python3.8/site-packages/sklearn/model_selection/_validation.py:680, in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, return_estimator, split_progress, candidate_progress, error_score)
678 estimator.fit(X_train, **fit_params)
679 else:
--> 680 estimator.fit(X_train, y_train, **fit_params)
682 except Exception:
683 # Note fit time as time until error
684 fit_time = time.time() - start_time
File ~/wd/venv/lib/python3.8/site-packages/xgboost/core.py:506, in _deprecate_positional_args.<locals>.inner_f(*args, **kwargs)
504 for k, arg in zip(sig.parameters, args):
505 kwargs[k] = arg
--> 506 return f(**kwargs)
File ~/wd/venv/lib/python3.8/site-packages/xgboost/sklearn.py:789, in XGBModel.fit(self, X, y, sample_weight, base_margin, eval_set, eval_metric, early_stopping_rounds, verbose, xgb_model, sample_weight_eval_set, base_margin_eval_set, feature_weights, callbacks)
786 obj = None
788 model, feval, params = self._configure_fit(xgb_model, eval_metric, params)
--> 789 self._Booster = train(
790 params,
791 train_dmatrix,
792 self.get_num_boosting_rounds(),
793 evals=evals,
794 early_stopping_rounds=early_stopping_rounds,
795 evals_result=evals_result,
796 obj=obj,
797 feval=feval,
798 verbose_eval=verbose,
799 xgb_model=model,
800 callbacks=callbacks,
801 )
803 self._set_evaluation_result(evals_result)
804 return self
File ~/wd/venv/lib/python3.8/site-packages/xgboost/training.py:188, in train(params, dtrain, num_boost_round, evals, obj, feval, maximize, early_stopping_rounds, evals_result, verbose_eval, xgb_model, callbacks)
115 def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None,
116 maximize=None, early_stopping_rounds=None, evals_result=None,
117 verbose_eval=True, xgb_model=None, callbacks=None):
118 # pylint: disable=too-many-statements,too-many-branches, attribute-defined-outside-init
119 """Train a booster with given parameters.
120
121 Parameters
(...)
186 Booster : a trained booster model
187 """
--> 188 bst = _train_internal(params, dtrain,
189 num_boost_round=num_boost_round,
190 evals=evals,
191 obj=obj, feval=feval,
192 xgb_model=xgb_model, callbacks=callbacks,
193 verbose_eval=verbose_eval,
194 evals_result=evals_result,
195 maximize=maximize,
196 early_stopping_rounds=early_stopping_rounds)
197 return bst
File ~/wd/venv/lib/python3.8/site-packages/xgboost/training.py:81, in _train_internal(params, dtrain, num_boost_round, evals, obj, feval, xgb_model, callbacks, evals_result, maximize, verbose_eval, early_stopping_rounds)
79 if callbacks.before_iteration(bst, i, dtrain, evals):
80 break
---> 81 bst.update(dtrain, i, obj)
82 if callbacks.after_iteration(bst, i, dtrain, evals):
83 break
File ~/wd/venv/lib/python3.8/site-packages/xgboost/core.py:1680, in Booster.update(self, dtrain, iteration, fobj)
1677 self._validate_features(dtrain)
1679 if fobj is None:
-> 1680 _check_call(_LIB.XGBoosterUpdateOneIter(self.handle,
1681 ctypes.c_int(iteration),
1682 dtrain.handle))
1683 else:
1684 pred = self.predict(dtrain, output_margin=True, training=True)
File ~/wd/venv/lib/python3.8/site-packages/xgboost/core.py:218, in _check_call(ret)
207 """Check the return value of C API call
208
209 This function will raise exception when error occurs.
(...)
215 return value from API calls
216 """
217 if ret != 0:
--> 218 raise XGBoostError(py_str(_LIB.XGBGetLastError()))
XGBoostError: [13:46:18] ../src/gbm/gbtree.cc:588: Check failed: common::AllVisibleGPUs() >= 1 (0 vs. 1) : No visible GPU is found for XGBoost.
Stack trace:
[bt] (0) /root/wd/venv/lib/python3.8/site-packages/xgboost/lib/libxgboost.so(+0x179459) [0x7f00362cb459]
[bt] (1) /root/wd/venv/lib/python3.8/site-packages/xgboost/lib/libxgboost.so(+0x179b82) [0x7f00362cbb82]
[bt] (2) /root/wd/venv/lib/python3.8/site-packages/xgboost/lib/libxgboost.so(+0x17a1da) [0x7f00362cc1da]
[bt] (3) /root/wd/venv/lib/python3.8/site-packages/xgboost/lib/libxgboost.so(+0x1b46e5) [0x7f00363066e5]
[bt] (4) /root/wd/venv/lib/python3.8/site-packages/xgboost/lib/libxgboost.so(XGBoosterUpdateOneIter+0x68) [0x7f00361eb4e8]
[bt] (5) /usr/lib/x86_64-linux-gnu/libffi.so.6(ffi_call_unix64+0x4c) [0x7f01f861cdae]
[bt] (6) /usr/lib/x86_64-linux-gnu/libffi.so.6(ffi_call+0x22f) [0x7f01f861c71f]
[bt] (7) /root/wd/venv/lib/python3.8/lib-dynload/_ctypes.cpython-38-x86_64-linux-gnu.so(_ctypes_callproc+0x8ce) [0x7f01f883234e]
[bt] (8) /root/wd/venv/lib/python3.8/lib-dynload/_ctypes.cpython-38-x86_64-linux-gnu.so(+0xe4e4) [0x7f01f882d4e4]
Any suggestions on how to make it work?
I tried to solve this using the answers from here:
XGBoostError: [10:10:03] /workspace/src/tree/updater_gpu_hist.cu:1407: Exception in gpu_hist: NCCL failure
but that didn't work.

When switching from a federated to a centralized dataset - Error may indicate that you're trying to pass a Tensor to a NumPy call

I'm trying to switch from a federated setting to centralized learning. I've created a federated dataset, but I want to create a dataset for centralized learning with the create_tf_dataset_from_all_clients function. When I googled the error, I found that my NumPy and TensorFlow versions might be incompatible with this function. My current versions are:
python == 3.9
tensorflow==2.8.2
numpy==1.21.6
tensorflow-federated==0.24.0
I haven't found any recent posts about TensorFlow 2.8 and the matching NumPy version.
Also, the error might come from a function that I used to create the clientData object:
def parse_image(filename):
    """Load one JPEG file and return ``(image, label_int)``.

    The label string is taken from the second-to-last component of
    ``filename`` (split on ``os.sep``) and converted to an integer index by
    looking it up in the module-level ``labels_tf``. The image is decoded
    with 3 channels, converted to float32, resized to 32x32 and passed
    through Keras ``preprocess_input``.

    Relies on the module-level names ``labels_tf`` and ``base_model``.
    """
    # The parent directory name is used as the class label.
    parts = tf.strings.split(filename, os.sep)
    label_str = parts[-2]
    # Index of the first entry in labels_tf equal to the label string.
    label_int = tf.where(labels_tf == label_str)[0][0]

    image = tf.io.read_file(filename)
    image = tf.io.decode_jpeg(image, channels=3)
    image = tf.image.convert_image_dtype(image, tf.float32)
    image = tf.image.resize(image, [32, 32])
    # NOTE(review): resnet50 preprocessing is applied unconditionally here and
    # a second preprocess_input is applied below when base_model matches, so
    # the image may be preprocessed twice - confirm this is intended.
    image = tf.keras.applications.resnet50.preprocess_input(image)
    if base_model == "VGG16":
        print("-------- preprocessing image for base_model VGG16 --------")
        image = tf.keras.applications.vgg16.preprocess_input(image)
    elif base_model == "ResNet":
        print("-------- preprocessing image for base_model ResNet --------")
        image = tf.keras.applications.resnet.preprocess_input(image)
    return image, label_int
def create_dataset(client_id):
df = train_set
client_id = int(client_id)
file = df.loc[df["client_id"] == client_id]
# print(file)
path = file["path"]
# print(path)
list_ds = tf.data.Dataset.list_files(path)
images_ds = list_ds.map(parse_image)
return images_ds
Error:
TypeError Traceback (most recent call last)
Input In [7], in <cell line: 1>()
----> 1 train_dataset = client_data.create_tf_dataset_from_all_clients()
File ~/master_venv/lib/python3.9/site-packages/tensorflow_federated/python/simulation/datasets/client_data.py:231, in ClientData.create_tf_dataset_from_all_clients(self, seed)
227 nested_dataset = tf.data.Dataset.from_tensor_slices(client_ids)
228 # We apply serializable_dataset_fn here to avoid loading all client datasets
229 # in memory, which is slow. Note that tf.data.Dataset.map implicitly wraps
230 # the input mapping in a tf.function.
--> 231 example_dataset = nested_dataset.flat_map(self.serializable_dataset_fn)
232 return example_dataset
File ~/master_venv/lib/python3.9/site-packages/tensorflow/python/data/ops/dataset_ops.py:2092, in DatasetV2.flat_map(self, map_func, name)
2058 def flat_map(self, map_func, name=None):
2059 """Maps `map_func` across this dataset and flattens the result.
2060
2061 The type signature is:
(...)
2090 Dataset: A `Dataset`.
2091 """
-> 2092 return FlatMapDataset(self, map_func, name=name)
File ~/master_venv/lib/python3.9/site-packages/tensorflow/python/data/ops/dataset_ops.py:5327, in FlatMapDataset.__init__(self, input_dataset, map_func, name)
5325 """See `Dataset.flat_map()` for details."""
5326 self._input_dataset = input_dataset
-> 5327 self._map_func = structured_function.StructuredFunctionWrapper(
5328 map_func, self._transformation_name(), dataset=input_dataset)
5329 if not isinstance(self._map_func.output_structure, DatasetSpec):
5330 raise TypeError(
5331 "The `map_func` argument must return a `Dataset` object. Got "
5332 f"{_get_type(self._map_func.output_structure)!r}.")
File ~/master_venv/lib/python3.9/site-packages/tensorflow/python/data/ops/structured_function.py:271, in StructuredFunctionWrapper.__init__(self, func, transformation_name, dataset, input_classes, input_shapes, input_types, input_structure, add_to_graph, use_legacy_function, defun_kwargs)
264 warnings.warn(
265 "Even though the `tf.config.experimental_run_functions_eagerly` "
266 "option is set, this option does not apply to tf.data functions. "
267 "To force eager execution of tf.data functions, please use "
268 "`tf.data.experimental.enable_debug_mode()`.")
269 fn_factory = trace_tf_function(defun_kwargs)
--> 271 self._function = fn_factory()
272 # There is no graph to add in eager mode.
273 add_to_graph &= not context.executing_eagerly()
File ~/master_venv/lib/python3.9/site-packages/tensorflow/python/eager/function.py:2567, in Function.get_concrete_function(self, *args, **kwargs)
2558 def get_concrete_function(self, *args, **kwargs):
2559 """Returns a `ConcreteFunction` specialized to inputs and execution context.
2560
2561 Args:
(...)
2565 or `tf.Tensor` or `tf.TensorSpec`.
2566 """
-> 2567 graph_function = self._get_concrete_function_garbage_collected(
2568 *args, **kwargs)
2569 graph_function._garbage_collector.release() # pylint: disable=protected-access
2570 return graph_function
File ~/master_venv/lib/python3.9/site-packages/tensorflow/python/eager/function.py:2533, in Function._get_concrete_function_garbage_collected(self, *args, **kwargs)
2531 args, kwargs = None, None
2532 with self._lock:
-> 2533 graph_function, _ = self._maybe_define_function(args, kwargs)
2534 seen_names = set()
2535 captured = object_identity.ObjectIdentitySet(
2536 graph_function.graph.internal_captures)
File ~/master_venv/lib/python3.9/site-packages/tensorflow/python/eager/function.py:2711, in Function._maybe_define_function(self, args, kwargs)
2708 cache_key = self._function_cache.generalize(cache_key)
2709 (args, kwargs) = cache_key._placeholder_value() # pylint: disable=protected-access
-> 2711 graph_function = self._create_graph_function(args, kwargs)
2712 self._function_cache.add(cache_key, cache_key_deletion_observer,
2713 graph_function)
2715 return graph_function, filtered_flat_args
File ~/master_venv/lib/python3.9/site-packages/tensorflow/python/eager/function.py:2627, in Function._create_graph_function(self, args, kwargs)
2622 missing_arg_names = [
2623 "%s_%d" % (arg, i) for i, arg in enumerate(missing_arg_names)
2624 ]
2625 arg_names = base_arg_names + missing_arg_names
2626 graph_function = ConcreteFunction(
-> 2627 func_graph_module.func_graph_from_py_func(
2628 self._name,
2629 self._python_function,
2630 args,
2631 kwargs,
2632 self.input_signature,
2633 autograph=self._autograph,
2634 autograph_options=self._autograph_options,
2635 arg_names=arg_names,
2636 capture_by_value=self._capture_by_value),
2637 self._function_attributes,
2638 spec=self.function_spec,
2639 # Tell the ConcreteFunction to clean up its graph once it goes out of
2640 # scope. This is not the default behavior since it gets used in some
2641 # places (like Keras) where the FuncGraph lives longer than the
2642 # ConcreteFunction.
2643 shared_func_graph=False)
2644 return graph_function
File ~/master_venv/lib/python3.9/site-packages/tensorflow/python/framework/func_graph.py:1141, in func_graph_from_py_func(name, python_func, args, kwargs, signature, func_graph, autograph, autograph_options, add_control_dependencies, arg_names, op_return_value, collections, capture_by_value, acd_record_initial_resource_uses)
1138 else:
1139 _, original_func = tf_decorator.unwrap(python_func)
-> 1141 func_outputs = python_func(*func_args, **func_kwargs)
1143 # invariant: `func_outputs` contains only Tensors, CompositeTensors,
1144 # TensorArrays and `None`s.
1145 func_outputs = nest.map_structure(
1146 convert, func_outputs, expand_composites=True)
File ~/master_venv/lib/python3.9/site-packages/tensorflow/python/data/ops/structured_function.py:248, in StructuredFunctionWrapper.__init__.<locals>.trace_tf_function.<locals>.wrapped_fn(*args)
242 #eager_function.defun_with_attributes(
243 input_signature=structure.get_flat_tensor_specs(
244 self._input_structure),
245 autograph=False,
246 attributes=defun_kwargs)
247 def wrapped_fn(*args): # pylint: disable=missing-docstring
--> 248 ret = wrapper_helper(*args)
249 ret = structure.to_tensor_list(self._output_structure, ret)
250 return [ops.convert_to_tensor(t) for t in ret]
File ~/master_venv/lib/python3.9/site-packages/tensorflow/python/data/ops/structured_function.py:177, in StructuredFunctionWrapper.__init__.<locals>.wrapper_helper(*args)
175 if not _should_unpack(nested_args):
176 nested_args = (nested_args,)
--> 177 ret = autograph.tf_convert(self._func, ag_ctx)(*nested_args)
178 if _should_pack(ret):
179 ret = tuple(ret)
File ~/master_venv/lib/python3.9/site-packages/tensorflow/python/autograph/impl/api.py:692, in convert.<locals>.decorator.<locals>.wrapper(*args, **kwargs)
690 except Exception as e: # pylint:disable=broad-except
691 if hasattr(e, 'ag_error_metadata'):
--> 692 raise e.ag_error_metadata.to_exception(e)
693 else:
694 raise
File ~/master_venv/lib/python3.9/site-packages/tensorflow/python/autograph/impl/api.py:689, in convert.<locals>.decorator.<locals>.wrapper(*args, **kwargs)
687 try:
688 with conversion_ctx:
--> 689 return converted_call(f, args, kwargs, options=options)
690 except Exception as e: # pylint:disable=broad-except
691 if hasattr(e, 'ag_error_metadata'):
File ~/master_venv/lib/python3.9/site-packages/tensorflow/python/autograph/impl/api.py:439, in converted_call(f, args, kwargs, caller_fn_scope, options)
437 try:
438 if kwargs is not None:
--> 439 result = converted_f(*effective_args, **kwargs)
440 else:
441 result = converted_f(*effective_args)
File /var/folders/w2/fcxhc9j52tb9hymgw1b8_dmh0000gn/T/__autograph_generated_filepc7z792y.py:11, in outer_factory.<locals>.inner_factory.<locals>.tf__create_dataset(client_id)
9 retval_ = ag__.UndefinedReturnValue()
10 client_id = ag__.converted_call(ag__.ld(int), (ag__.ld(client_id),), None, fscope)
---> 11 files = ag__.ld(df).loc[ag__.ld(df)['client_id'] == ag__.ld(client_id)]
12 path = ag__.ld(files)['path']
13 list_ds = ag__.converted_call(ag__.ld(tf).data.Dataset.list_files, (ag__.ld(path),), None, fscope)
File ~/master_venv/lib/python3.9/site-packages/pandas/core/ops/common.py:70, in _unpack_zerodim_and_defer.<locals>.new_method(self, other)
66 return NotImplemented
68 other = item_from_zerodim(other)
---> 70 return method(self, other)
File ~/master_venv/lib/python3.9/site-packages/pandas/core/arraylike.py:40, in OpsMixin.__eq__(self, other)
38 #unpack_zerodim_and_defer("__eq__")
39 def __eq__(self, other):
---> 40 return self._cmp_method(other, operator.eq)
File ~/master_venv/lib/python3.9/site-packages/pandas/core/series.py:5625, in Series._cmp_method(self, other, op)
5622 with np.errstate(all="ignore"):
5623 res_values = ops.comparison_op(lvalues, rvalues, op)
-> 5625 return self._construct_result(res_values, name=res_name)
File ~/master_venv/lib/python3.9/site-packages/pandas/core/series.py:3017, in Series._construct_result(self, result, name)
3013 return (res1, res2)
3015 # We do not pass dtype to ensure that the Series constructor
3016 # does inference in the case where `result` has object-dtype.
-> 3017 out = self._constructor(result, index=self.index)
3018 out = out.__finalize__(self)
3020 # Set the result's name after __finalize__ is called because __finalize__
3021 # would set it back to self.name
File ~/master_venv/lib/python3.9/site-packages/pandas/core/series.py:442, in Series.__init__(self, data, index, dtype, name, copy, fastpath)
440 index = default_index(len(data))
441 elif is_list_like(data):
--> 442 com.require_length_match(data, index)
444 # create/copy the manager
445 if isinstance(data, (SingleBlockManager, SingleArrayManager)):
File ~/master_venv/lib/python3.9/site-packages/pandas/core/common.py:556, in require_length_match(data, index)
552 def require_length_match(data, index: Index):
553 """
554 Check the length of data matches the length of the index.
555 """
--> 556 if len(data) != len(index):
557 raise ValueError(
558 "Length of values "
559 f"({len(data)}) "
560 "does not match length of index "
561 f"({len(index)})"
562 )
File ~/master_venv/lib/python3.9/site-packages/tensorflow/python/framework/ops.py:932, in Tensor.__len__(self)
931 def __len__(self):
--> 932 raise TypeError(f"len is not well defined for a symbolic Tensor "
933 f"({self.name}). Please call `x.shape` rather than "
934 f"`len(x)` for shape information.")
TypeError: in user code:
File "/var/folders/w2/fcxhc9j52tb9hymgw1b8_dmh0000gn/T/ipykernel_2264/3413278942.py", line 7, in create_dataset *
files = df.loc[df['client_id']==client_id]
File "/Users/admin/master_venv/lib/python3.9/site-packages/pandas/core/ops/common.py", line 70, in new_method
return method(self, other)
File "/Users/admin/master_venv/lib/python3.9/site-packages/pandas/core/arraylike.py", line 40, in __eq__
return self._cmp_method(other, operator.eq)
File "/Users/admin/master_venv/lib/python3.9/site-packages/pandas/core/series.py", line 5625, in _cmp_method
return self._construct_result(res_values, name=res_name)
File "/Users/admin/master_venv/lib/python3.9/site-packages/pandas/core/series.py", line 3017, in _construct_result
out = self._constructor(result, index=self.index)
File "/Users/admin/master_venv/lib/python3.9/site-packages/pandas/core/series.py", line 442, in __init__
com.require_length_match(data, index)
File "/Users/admin/master_venv/lib/python3.9/site-packages/pandas/core/common.py", line 556, in require_length_match
if len(data) != len(index):
TypeError: len is not well defined for a symbolic Tensor (Equal:0). Please call `x.shape` rather than `len(x)` for shape information.
TFF is generally programmed assuming that 'all' local logic is expressed in pure TensorFlow (or at least, can be hoisted into a platform-independent representation, like TorchScript, GraphDef or XLA); this is crucial for TFF's "write-once, run-everywhere" philosophy, to prevent capturing of arbitrary python code.
It is this assumption that is surfacing here. TFF passes your function directly to TensorFlow libraries which implicitly create a tf.function; you can in fact see this in the stacktrace above:
228 # We apply serializable_dataset_fn here to avoid loading all client datasets
229 # in memory, which is slow. Note that tf.data.Dataset.map implicitly wraps
230 # the input mapping in a tf.function.
--> 231 example_dataset = nested_dataset.flat_map(self.serializable_dataset_fn)
While TF is trying to create this tf.function, or invoke it, it will pass a tensor through the function and attempt to trace the logic. This makes working with Python data structures a little difficult; e.g., we cannot use this tensor to index into a list or dict. However, it looks to me like your code may 'just work' if you construct a tf.lookup.StaticHashTable with your clients-to-files mapping and look up your client ID in this hash table instead of in the pandas dataframe.
In general, you can test whether your code will work or not with this usage of TFF and tf.data by wrapping the function you pass to TFF as a tf.function, invoking it, and making sure it has the behavior you expect.

How can I compute gradient with tensorflow/keras?

I am trying to draw saliency maps with tensorflow 2.6.0. I defined the model with tensorflow.keras.models.Sequential and finished training. However, when I tried to compute the gradient like this:
with GradientTape(persistent=True) as tape:
tape.watch(image)
result = model.predict(image)[:, 4]
gradient = tape.gradient(result, image)
where the image is the tensor and [:, 4] is for choosing the actual label for this image.
image.shape = (1, 48, 48, 1)
result.shape = (1, 10)
Then I got error message:
---------------------------------------------------------------------------
LookupError Traceback (most recent call last)
/opt/conda/lib/python3.7/site-packages/tensorflow/python/ops/gradients_util.py in _GradientsHelper(ys, xs, grad_ys, name, colocate_gradients_with_ops, gate_gradients, aggregation_method, stop_gradients, unconnected_gradients, src_graph)
605 try:
--> 606 grad_fn = ops.get_gradient_function(op)
607 except LookupError:
/opt/conda/lib/python3.7/site-packages/tensorflow/python/framework/ops.py in get_gradient_function(op)
2731 op_type = op.type
-> 2732 return gradient_registry.lookup(op_type)
2733
/opt/conda/lib/python3.7/site-packages/tensorflow/python/framework/registry.py in lookup(self, name)
99 raise LookupError(
--> 100 "%s registry has no entry for: %s" % (self._name, name))
LookupError: gradient registry has no entry for: IteratorGetNext
During handling of the above exception, another exception occurred:
LookupError Traceback (most recent call last)
/tmp/ipykernel_36/2425374110.py in <module>
1 with GradientTape(persistent=True) as tape:
2 tape.watch(image)
----> 3 result = model.predict(image)[:, 4]
4 # result = tf.convert_to_tensor(result)
5 # probs = tf.nn.softmax(result, axis=-1)[:, 4]
/opt/conda/lib/python3.7/site-packages/keras/engine/training.py in predict(self, x, batch_size, verbose, steps, callbacks, max_queue_size, workers, use_multiprocessing)
1749 for step in data_handler.steps():
1750 callbacks.on_predict_batch_begin(step)
-> 1751 tmp_batch_outputs = self.predict_function(iterator)
1752 if data_handler.should_sync:
1753 context.async_wait()
/opt/conda/lib/python3.7/site-packages/tensorflow/python/eager/def_function.py in __call__(self, *args, **kwds)
883
884 with OptionalXlaContext(self._jit_compile):
--> 885 result = self._call(*args, **kwds)
886
887 new_tracing_count = self.experimental_get_tracing_count()
/opt/conda/lib/python3.7/site-packages/tensorflow/python/eager/def_function.py in _call(self, *args, **kwds)
922 # In this case we have not created variables on the first call. So we can
923 # run the first trace but we should fail if variables are created.
--> 924 results = self._stateful_fn(*args, **kwds)
925 if self._created_variables and not ALLOW_DYNAMIC_VARIABLE_CREATION:
926 raise ValueError("Creating variables on a non-first call to a function"
/opt/conda/lib/python3.7/site-packages/tensorflow/python/eager/function.py in __call__(self, *args, **kwargs)
3038 filtered_flat_args) = self._maybe_define_function(args, kwargs)
3039 return graph_function._call_flat(
-> 3040 filtered_flat_args, captured_inputs=graph_function.captured_inputs) # pylint: disable=protected-access
3041
3042 #property
/opt/conda/lib/python3.7/site-packages/tensorflow/python/eager/function.py in _call_flat(self, args, captured_inputs, cancellation_manager)
1967 possible_gradient_type,
1968 executing_eagerly)
-> 1969 forward_function, args_with_tangents = forward_backward.forward()
1970 if executing_eagerly:
1971 flat_outputs = forward_function.call(
/opt/conda/lib/python3.7/site-packages/tensorflow/python/eager/function.py in forward(self)
1493 """Builds or retrieves a forward function for this call."""
1494 forward_function = self._functions.forward(
-> 1495 self._inference_args, self._input_tangents)
1496 return forward_function, self._inference_args + self._input_tangents
1497
/opt/conda/lib/python3.7/site-packages/tensorflow/python/eager/function.py in forward(self, inference_args, input_tangents)
1224 (self._forward, self._forward_graph, self._backward,
1225 self._forwardprop_output_indices, self._num_forwardprop_outputs) = (
-> 1226 self._forward_and_backward_functions(inference_args, input_tangents))
1227 return self._forward
1228
/opt/conda/lib/python3.7/site-packages/tensorflow/python/eager/function.py in _forward_and_backward_functions(self, inference_args, input_tangents)
1449 outputs = list(self._func_graph.outputs)
1450 self._build_functions_for_outputs(
-> 1451 outputs, inference_args, input_tangents)
1452
1453 (forward_function, forward_graph,
/opt/conda/lib/python3.7/site-packages/tensorflow/python/eager/function.py in _build_functions_for_outputs(self, outputs, inference_args, input_tangents)
946 self._func_graph.inputs,
947 grad_ys=gradients_wrt_outputs,
--> 948 src_graph=self._func_graph)
949
950 if input_tangents:
/opt/conda/lib/python3.7/site-packages/tensorflow/python/ops/gradients_util.py in _GradientsHelper(ys, xs, grad_ys, name, colocate_gradients_with_ops, gate_gradients, aggregation_method, stop_gradients, unconnected_gradients, src_graph)
634 raise LookupError(
635 "No gradient defined for operation '%s' (op type: %s)" %
--> 636 (op.name, op.type))
637 if loop_state:
638 loop_state.EnterGradWhileContext(op, before=False)
LookupError: No gradient defined for operation 'IteratorGetNext' (op type: IteratorGetNext)
What should I do to solve this problem? Thanks for answering.
Use model(image)[:, 4] instead of model.predict(image)[:, 4]. The latter converts the output to NumPy arrays (which carry no autograd functionality), so TensorFlow can't compute the gradient. By the way, in a persistent context it is significantly less efficient to call tape.gradient inside the context, because the gradient operations will also be recorded. The only situation where that is useful is when you want higher-order derivatives.
with tf.GradientTape(persistent=True) as tape:
tape.watch(image)
result = model(image)[:, 4]
gradient = tape.gradient(result, image)

Error while using VGP instead of SVGP in Heteroskedastic gpflow example (multilatent GP)

I am trying to understand why I am getting a ValueError when I try to replace SVGP with VGP in the heteroscedastic regression example (https://gpflow.readthedocs.io/en/develop/notebooks/advanced/heteroskedastic.html) in GPflow.
Here are the changes I made:
model = gpf.models.VGP(...)
loss_fn = model.training_loss_closure() instead of loss_fn = model.training_loss_closure(data)
The kernel and likelihood are the same as the example.
data = (X, Y)
model = gpf.models.VGP(
data = data,
kernel=kernel,
likelihood=likelihood,
#inducing_variable=inducing_variable,
num_latent_gps=likelihood.latent_dim,
)
loss_fn = model.training_loss_closure()
gpf.utilities.set_trainable(model.q_mu, False)
gpf.utilities.set_trainable(model.q_sqrt, False)
variational_vars = [(model.q_mu, model.q_sqrt)]
natgrad_opt = gpf.optimizers.NaturalGradient(gamma=0.1)
adam_vars = model.trainable_variables
adam_opt = tf.optimizers.Adam(0.01)
@tf.function
def optimisation_step():
natgrad_opt.minimize(loss_fn, variational_vars)
adam_opt.minimize(loss_fn, adam_vars)
epochs = 100
for epoch in range(0, epochs):
optimisation_step()
The optimization step gives me this error:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
/tmp/ipykernel_406484/3662007586.py in <module>
3
4 for epoch in range(1, epochs + 1):
----> 5 optimisation_step()
6
7
~/miniconda3/envs/tensorflow/lib/python3.9/site-packages/tensorflow/python/eager/def_function.py in __call__(self, *args, **kwds)
887
888 with OptionalXlaContext(self._jit_compile):
--> 889 result = self._call(*args, **kwds)
890
891 new_tracing_count = self.experimental_get_tracing_count()
~/miniconda3/envs/tensorflow/lib/python3.9/site-packages/tensorflow/python/eager/def_function.py in _call(self, *args, **kwds)
931 # This is the first call of __call__, so we have to initialize.
932 initializers = []
--> 933 self._initialize(args, kwds, add_initializers_to=initializers)
934 finally:
935 # At this point we know that the initialization is complete (or less
~/miniconda3/envs/tensorflow/lib/python3.9/site-packages/tensorflow/python/eager/def_function.py in _initialize(self, args, kwds, add_initializers_to)
761 self._graph_deleter = FunctionDeleter(self._lifted_initializer_graph)
762 self._concrete_stateful_fn = (
--> 763 self._stateful_fn._get_concrete_function_internal_garbage_collected( # pylint: disable=protected-access
764 *args, **kwds))
765
~/miniconda3/envs/tensorflow/lib/python3.9/site-packages/tensorflow/python/eager/function.py in _get_concrete_function_internal_garbage_collected(self, *args, **kwargs)
3048 args, kwargs = None, None
3049 with self._lock:
-> 3050 graph_function, _ = self._maybe_define_function(args, kwargs)
3051 return graph_function
3052
~/miniconda3/envs/tensorflow/lib/python3.9/site-packages/tensorflow/python/eager/function.py in _maybe_define_function(self, args, kwargs)
3442
3443 self._function_cache.missed.add(call_context_key)
-> 3444 graph_function = self._create_graph_function(args, kwargs)
3445 self._function_cache.primary[cache_key] = graph_function
3446
~/miniconda3/envs/tensorflow/lib/python3.9/site-packages/tensorflow/python/eager/function.py in _create_graph_function(self, args, kwargs, override_flat_arg_shapes)
3277 arg_names = base_arg_names + missing_arg_names
3278 graph_function = ConcreteFunction(
-> 3279 func_graph_module.func_graph_from_py_func(
3280 self._name,
3281 self._python_function,
~/miniconda3/envs/tensorflow/lib/python3.9/site-packages/tensorflow/python/framework/func_graph.py in func_graph_from_py_func(name, python_func, args, kwargs, signature, func_graph, autograph, autograph_options, add_control_dependencies, arg_names, op_return_value, collections, capture_by_value, override_flat_arg_shapes)
997 _, original_func = tf_decorator.unwrap(python_func)
998
--> 999 func_outputs = python_func(*func_args, **func_kwargs)
1000
1001 # invariant: `func_outputs` contains only Tensors, CompositeTensors,
~/miniconda3/envs/tensorflow/lib/python3.9/site-packages/tensorflow/python/eager/def_function.py in wrapped_fn(*args, **kwds)
670 # the function a weak reference to itself to avoid a reference cycle.
671 with OptionalXlaContext(compile_with_xla):
--> 672 out = weak_wrapped_fn().__wrapped__(*args, **kwds)
673 return out
674
~/miniconda3/envs/tensorflow/lib/python3.9/site-packages/tensorflow/python/framework/func_graph.py in wrapper(*args, **kwargs)
984 except Exception as e: # pylint:disable=broad-except
985 if hasattr(e, "ag_error_metadata"):
--> 986 raise e.ag_error_metadata.to_exception(e)
987 else:
988 raise
ValueError: in user code:
ValueError: Dimensions must be equal, but are 2 and 1001 for '{{node add_2}} = AddV2[T=DT_DOUBLE](diag, mul_1)' with input shapes: [1001,2,2], [1001,1001].
Is this a bug or are the likelihood and model incompatible or am I missing something?
A workaround is to make the inducing variables = the training data and use SVGP but that makes the training incredibly slow...

InternalError: Cannot dlopen all CUDA libraries

I am trying to run the Python code of this Kaggle Jupyter Notebook and encounter the following error:
---------------------------------------------------------------------------
InternalError Traceback (most recent call last)
<ipython-input-40-be0fb0b18f3a> in <module>
1 #Defining Neural Network
----> 2 model = Sequential()
3 #Non-trainable embeddidng layer
4 model.add(Embedding(max_features, output_dim=embed_size, weights=[embedding_matrix], input_length=maxlen, trainable=False))
5 #LSTM
c:\users\kim\appdata\local\programs\python\python38\lib\site-packages\tensorflow\python\training\tracking\base.py in _method_wrapper(self, *args, **kwargs)
528 self._self_setattr_tracking = False # pylint: disable=protected-access
529 try:
--> 530 result = method(self, *args, **kwargs)
531 finally:
532 self._self_setattr_tracking = previous_value # pylint: disable=protected-access
c:\users\kim\appdata\local\programs\python\python38\lib\site-packages\keras\engine\sequential.py in __init__(self, layers, name)
105 """
106 # Skip the init in FunctionalModel since model doesn't have input/output yet
--> 107 super(functional.Functional, self).__init__( # pylint: disable=bad-super-call
108 name=name, autocast=False)
109 base_layer.keras_api_gauge.get_cell('Sequential').set(True)
c:\users\kim\appdata\local\programs\python\python38\lib\site-packages\tensorflow\python\training\tracking\base.py in _method_wrapper(self, *args, **kwargs)
528 self._self_setattr_tracking = False # pylint: disable=protected-access
529 try:
--> 530 result = method(self, *args, **kwargs)
531 finally:
532 self._self_setattr_tracking = previous_value # pylint: disable=protected-access
c:\users\kim\appdata\local\programs\python\python38\lib\site-packages\keras\engine\training.py in __init__(self, *args, **kwargs)
287 self._steps_per_execution = None
288
--> 289 self._init_batch_counters()
290 self._base_model_initialized = True
291
c:\users\kim\appdata\local\programs\python\python38\lib\site-packages\tensorflow\python\training\tracking\base.py in _method_wrapper(self, *args, **kwargs)
528 self._self_setattr_tracking = False # pylint: disable=protected-access
529 try:
--> 530 result = method(self, *args, **kwargs)
531 finally:
532 self._self_setattr_tracking = previous_value # pylint: disable=protected-access
c:\users\kim\appdata\local\programs\python\python38\lib\site-packages\keras\engine\training.py in _init_batch_counters(self)
295 # `evaluate`, and `predict`.
296 agg = tf.VariableAggregation.ONLY_FIRST_REPLICA
--> 297 self._train_counter = tf.Variable(0, dtype='int64', aggregation=agg)
298 self._test_counter = tf.Variable(0, dtype='int64', aggregation=agg)
299 self._predict_counter = tf.Variable(
c:\users\kim\appdata\local\programs\python\python38\lib\site-packages\tensorflow\python\ops\variables.py in __call__(cls, *args, **kwargs)
266 return cls._variable_v1_call(*args, **kwargs)
267 elif cls is Variable:
--> 268 return cls._variable_v2_call(*args, **kwargs)
269 else:
270 return super(VariableMetaclass, cls).__call__(*args, **kwargs)
c:\users\kim\appdata\local\programs\python\python38\lib\site-packages\tensorflow\python\ops\variables.py in _variable_v2_call(cls, initial_value, trainable, validate_shape, caching_device, name, variable_def, dtype, import_scope, constraint, synchronization, aggregation, shape)
248 if aggregation is None:
249 aggregation = VariableAggregation.NONE
--> 250 return previous_getter(
251 initial_value=initial_value,
252 trainable=trainable,
c:\users\kim\appdata\local\programs\python\python38\lib\site-packages\tensorflow\python\ops\variables.py in <lambda>(**kws)
241 shape=None):
242 """Call on Variable class. Useful to force the signature."""
--> 243 previous_getter = lambda **kws: default_variable_creator_v2(None, **kws)
244 for _, getter in ops.get_default_graph()._variable_creator_stack: # pylint: disable=protected-access
245 previous_getter = _make_getter(getter, previous_getter)
c:\users\kim\appdata\local\programs\python\python38\lib\site-packages\tensorflow\python\ops\variable_scope.py in default_variable_creator_v2(next_creator, **kwargs)
2660 shape = kwargs.get("shape", None)
2661
-> 2662 return resource_variable_ops.ResourceVariable(
2663 initial_value=initial_value,
2664 trainable=trainable,
c:\users\kim\appdata\local\programs\python\python38\lib\site-packages\tensorflow\python\ops\variables.py in __call__(cls, *args, **kwargs)
268 return cls._variable_v2_call(*args, **kwargs)
269 else:
--> 270 return super(VariableMetaclass, cls).__call__(*args, **kwargs)
271
272
c:\users\kim\appdata\local\programs\python\python38\lib\site-packages\tensorflow\python\ops\resource_variable_ops.py in __init__(self, initial_value, trainable, collections, validate_shape, caching_device, name, dtype, variable_def, import_scope, constraint, distribute_strategy, synchronization, aggregation, shape)
1600 self._init_from_proto(variable_def, import_scope=import_scope)
1601 else:
-> 1602 self._init_from_args(
1603 initial_value=initial_value,
1604 trainable=trainable,
c:\users\kim\appdata\local\programs\python\python38\lib\site-packages\tensorflow\python\ops\resource_variable_ops.py in _init_from_args(self, initial_value, trainable, collections, caching_device, name, dtype, constraint, synchronization, aggregation, distribute_strategy, shape)
1743 self._update_uid = initial_value.checkpoint_position.restore_uid
1744 initial_value = initial_value.wrapped_value
-> 1745 initial_value = ops.convert_to_tensor(initial_value,
1746 name="initial_value",
1747 dtype=dtype)
c:\users\kim\appdata\local\programs\python\python38\lib\site-packages\tensorflow\python\profiler\trace.py in wrapped(*args, **kwargs)
161 with Trace(trace_name, **trace_kwargs):
162 return func(*args, **kwargs)
--> 163 return func(*args, **kwargs)
164
165 return wrapped
c:\users\kim\appdata\local\programs\python\python38\lib\site-packages\tensorflow\python\framework\ops.py in convert_to_tensor(value, dtype, name, as_ref, preferred_dtype, dtype_hint, ctx, accepted_result_types)
1564
1565 if ret is None:
-> 1566 ret = conversion_func(value, dtype=dtype, name=name, as_ref=as_ref)
1567
1568 if ret is NotImplemented:
c:\users\kim\appdata\local\programs\python\python38\lib\site-packages\tensorflow\python\framework\tensor_conversion_registry.py in _default_conversion_function(***failed resolving arguments***)
50 def _default_conversion_function(value, dtype, name, as_ref):
51 del as_ref # Unused.
---> 52 return constant_op.constant(value, dtype, name=name)
53
54
c:\users\kim\appdata\local\programs\python\python38\lib\site-packages\tensorflow\python\framework\constant_op.py in constant(value, dtype, shape, name)
269 ValueError: if called on a symbolic tensor.
270 """
--> 271 return _constant_impl(value, dtype, shape, name, verify_shape=False,
272 allow_broadcast=True)
273
c:\users\kim\appdata\local\programs\python\python38\lib\site-packages\tensorflow\python\framework\constant_op.py in _constant_impl(value, dtype, shape, name, verify_shape, allow_broadcast)
281 with trace.Trace("tf.constant"):
282 return _constant_eager_impl(ctx, value, dtype, shape, verify_shape)
--> 283 return _constant_eager_impl(ctx, value, dtype, shape, verify_shape)
284
285 g = ops.get_default_graph()
c:\users\kim\appdata\local\programs\python\python38\lib\site-packages\tensorflow\python\framework\constant_op.py in _constant_eager_impl(ctx, value, dtype, shape, verify_shape)
306 def _constant_eager_impl(ctx, value, dtype, shape, verify_shape):
307 """Creates a constant on the current device."""
--> 308 t = convert_to_eager_tensor(value, ctx, dtype)
309 if shape is None:
310 return t
c:\users\kim\appdata\local\programs\python\python38\lib\site-packages\tensorflow\python\framework\constant_op.py in convert_to_eager_tensor(value, ctx, dtype)
103 except AttributeError:
104 dtype = dtypes.as_dtype(dtype).as_datatype_enum
--> 105 ctx.ensure_initialized()
106 return ops.EagerTensor(value, ctx.device_name, dtype)
107
c:\users\kim\appdata\local\programs\python\python38\lib\site-packages\tensorflow\python\eager\context.py in ensure_initialized(self)
534 opts = pywrap_tfe.TFE_NewContextOptions()
535 try:
--> 536 config_str = self.config.SerializeToString()
537 pywrap_tfe.TFE_ContextOptionsSetConfig(opts, config_str)
538 if self._device_policy is not None:
c:\users\kim\appdata\local\programs\python\python38\lib\site-packages\tensorflow\python\eager\context.py in config(self)
962 """Return the ConfigProto with all runtime deltas applied."""
963 # Ensure physical devices have been discovered and config has been imported
--> 964 self._initialize_physical_devices()
965
966 config = config_pb2.ConfigProto()
c:\users\kim\appdata\local\programs\python\python38\lib\site-packages\tensorflow\python\eager\context.py in _initialize_physical_devices(self, reinitialize)
1291 return
1292
-> 1293 devs = pywrap_tfe.TF_ListPhysicalDevices()
1294 self._physical_devices = [
1295 PhysicalDevice(name=d.decode(), device_type=d.decode().split(":")[1])
InternalError: Cannot dlopen all CUDA libraries.
How can I resolve it?
Okay so I tried a few things and after installing tensorflow-gpu it worked. Maybe it can help someone else with this problem as well:
pip install tensorflow-gpu

Categories