I am writing a custom optimizer with Eager Execution in TensorFlow 1.15 but can't figure out how to update the weights.
Taking gradient descent as an example, I have the weights, the gradient and a scalar learning rate but can't figure out how to combine them.
This is an implementation of gradient descent where model is a keras.Model e.g. a multilayer CNN:
# Scalar learning rate intended for the update step below.
lr = tf.constant(0.01)


def minimize(model, inputs, targets):
    """Attempt one gradient-descent step on ``model`` (the failing version from the question)."""
    with tf.GradientTape() as tape:
        # NOTE(review): `input` is not the `inputs` parameter; unless it is
        # redefined elsewhere it resolves to the Python builtin -- presumably a typo.
        logits = model(input)
        loss_value = tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits, labels=targets)
    # One gradient entry per trainable variable (a list, not a single tensor).
    grad = tape.gradient(loss_value, model.trainable_variables)
    # This is the call reported to fail with "Shapes of all inputs must match":
    # the scalar is multiplied by the whole list of differently shaped gradients.
    # NOTE(review): `self` is not defined inside this free function.
    step = tf.multiply(self.lr, grad)
    # trainable_variables is a list and has no assign_sub method.
    model.trainable_variables.assign_sub(step)
but it fails on the tf.multiply saying
tensorflow.python.framework.errors_impl.InvalidArgumentError: Shapes of all inputs must match: values[0].shape = [5,5,1,6] != values[1].shape = [6] [Op:Pack] name: packed
I also know the last line will fail as trainable_variables is a list and doesn't have the method assign_sub.
How can I rewrite the last two lines of my code to do:
model.trainable_variables -= lr * grad
Figured it out. As both are lists we need to iterate through their pairs of gradients and variables for each layer together and update each of these separately.
# Scalar learning rate used by minimize().
lr = tf.constant(0.01)


def minimize(model, inputs, targets):
    """Apply one plain gradient-descent step to ``model`` in place.

    Args:
        model: keras.Model whose trainable variables are updated.
        inputs: batch fed to the model for the forward pass.
        targets: labels passed to the softmax cross-entropy loss.
    """
    with tf.GradientTape() as tape:
        # Feed the `inputs` argument; the original passed the builtin `input`.
        logits = model(inputs)
        loss_value = tf.nn.softmax_cross_entropy_with_logits_v2(
            logits=logits, labels=targets)
    grad = tape.gradient(loss_value, model.trainable_variables)
    # Gradients and variables are parallel lists, so each variable is
    # updated with its own gradient rather than operating on the lists.
    for v, g in zip(model.trainable_variables, grad):
        v.assign_sub(lr * g)
Related
I used the code from tensorflow example and modified the custom training function to use RL epsilon greedy policy for action selection. However, gradient calculations are no longer working. Can someone point out what is missing in my logic for gradient descent?
class CustomModel(keras.Model):
    """Keras model with a hand-written ``train_step`` (as posted in the question)."""

    def train_step(self, data):
        """Run one training step and return the tracked metrics.

        Args:
            data: an ``(x, y)`` pair of inputs and targets.

        Returns:
            dict with the current "loss" and "mae" metric results.
        """
        x, y = data
        with tf.GradientTape() as tape:
            y_pred = self(x, training=True)  # Forward pass
            # Compute our own loss
            # NOTE(review): the loss is built from the argmin indices instead
            # of from y_pred itself; presumably this is why the optimizer
            # reports "No gradients provided" -- confirm against the TF docs.
            metric = tf.math.argmin(y_pred, axis=1)
            loss = keras.losses.mean_squared_error(y, metric)
        # Compute gradients
        trainable_vars = self.trainable_variables
        gradients = tape.gradient(loss, trainable_vars)
        # Update weights
        self.optimizer.apply_gradients(zip(gradients, trainable_vars))
        # Compute our own metrics
        loss_tracker.update_state(loss)
        mae_metric.update_state(y, y_pred)
        return {"loss": loss_tracker.result(), "mae": mae_metric.result()}
Here is the error message from TensorFlow:
grads_and_vars = optimizer_utils.filter_empty_gradients(grads_and_vars)
/home/share/virtualenvs/TxBsk36Y/lib/python3.7/site-packages/tensorflow/python/keras/optimizer_v2/utils.py:79 filter_empty_gradients
([v.name for _, v in grads_and_vars],))
ValueError: No gradients provided for any variable: ['dense/kernel:0', 'dense/bias:0'].
# NOTE(review): the next line is presumably a "@tf.function" decorator whose
# "@" was lost; as written it is only a comment.
#tf.function
def update(model, dataset, weights, optimizer):
    """Load ``weights`` into ``model`` and run the optimizer over ``dataset``.

    Returns the model's trainable variables together with the gradients and
    their global norm as assigned on the last batch.
    """
    trainable_weights = model.trainable_variables
    # Copy each incoming weight into the matching trainable variable.
    tf.nest.map_structure(lambda x, y: x.assign(y),
                          trainable_weights, weights)
    for batch in dataset:
        with tf.GradientTape() as tape:
            outputs = model.forward_pass(batch)
        grads = tape.gradient(outputs.loss, trainable_weights)
        norm = tf.linalg.global_norm(grads)
        # NOTE(review): `client_weights` is not defined in this function;
        # presumably `trainable_weights` was meant -- confirm.
        grads_and_vars = zip(grads, client_weights)
        optimizer.apply_gradients(grads_and_vars)
    # `grads` and `norm` are only assigned inside the loop above.
    return trainable_weights, grads, norm
It returns None and raises an error for grads and norm: the error says they must be declared before the loop. I
want to compute the gradient norm of each client and compare these norms.
I am using an RGB dataset for my x train and the loss is calculated in a dynamic loss function that gets the distances of pairs and compares them against the ideal distance dist_train. Here is the model:
class MyModel(Model):
    """Small network: Dense(3, relu) -> Flatten -> Dense(3, relu) -> Dense(2)."""

    def __init__(self):
        super().__init__()
        # Layers are created in the order they are applied in call().
        self.d1 = Dense(3, activation='relu')
        self.flatten = Flatten()
        self.d2 = Dense(3, activation='relu')
        self.d3 = Dense(2)

    def call(self, x):
        """Apply d1, flatten, d2 and d3 to ``x`` and return the d3 output."""
        flattened = self.flatten(self.d1(x))
        return self.d3(self.d2(flattened))
# Create an instance of the model
model = MyModel()
# Optimizer that applies the gradients computed in train_step.
optimizer = tf.keras.optimizers.Adam()
# Running means of the loss; they are reset at the start of every epoch.
train_loss = tf.keras.metrics.Mean(name='train_loss')
test_loss = tf.keras.metrics.Mean(name='test_loss')
#tf.function
def train_step(rgb):
    """Run one optimization step on ``rgb`` and record the loss in ``train_loss``."""
    with tf.GradientTape() as tape:
        predictions = model(rgb, training=True)
        # The loss comes from tf_function, which wraps the NumPy-based mahal_loss.
        loss = tf_function(predictions)
    gradients = tape.gradient(loss, model.trainable_variables)
    # This is the call that raises "No gradients provided for any variable"
    # in the traceback quoted with the question.
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    train_loss(loss)
Here is the loss function and the tf.function wrapping it:
def mahal_loss(output):
    """Mean squared difference between ``dist_train`` and selected pairwise distances.

    Pairwise Mahalanobis distances between the rows of ``output`` are
    computed with SciPy; for each row ``i`` the entries picked by
    ``indices_train[i]`` are compared against ``dist_train``.
    """
    condensed = sp.spatial.distance.pdist(output, metric='mahalanobis')
    square = sp.spatial.distance.squareform(condensed, force='no', checks=True)
    # Entries equal to zero are masked out.
    masked = np.ma.masked_array(square, mask=square==0)
    selected = [masked[row, indices_train[row]] for row in range(len(masked))]
    return np.mean((dist_train - selected)**2)
#tf.function(input_signature=[tf.TensorSpec(None, tf.float32)])
def tf_function(pred):
    """Evaluate ``mahal_loss`` on ``pred`` via ``tf.numpy_function`` (declared output dtype float32)."""
    return tf.numpy_function(mahal_loss, [pred], tf.float32)
Running the model:
for epoch in range(EPOCHS):
    # Start every epoch with empty metric accumulators.
    train_loss.reset_states()
    test_loss.reset_states()
    # One training step per element of x_train.
    for i in x_train:
        train_step(i)
    # Report the running means; test_loss is never updated in this snippet.
    print(
        f'Epoch {epoch + 1}, '
        f'Loss: {train_loss.result()}, '
        f'Test Loss: {test_loss.result()}, '
    )
I believe the reason I am running into problems lies in the dynamic loss function, as I need to calculate the distance between certain pairs to get the results I expect. This means that inside the loss function I have to calculate the mahalanobis distance of each pair to get the ones I will compare against the correct distances. The error I get is the following:
<ipython-input-23-0e975da5cbc2>:15 train_step *
optimizer.apply_gradients(zip(gradients, model.trainable_variables))
C:\Anaconda3\envs\colour_env\lib\site-packages\keras\optimizer_v2\optimizer_v2.py:622 apply_gradients **
grads_and_vars = optimizer_utils.filter_empty_gradients(grads_and_vars)
C:\Anaconda3\envs\colour_env\lib\site-packages\keras\optimizer_v2\utils.py:72 filter_empty_gradients
raise ValueError("No gradients provided for any variable: %s." %
ValueError: No gradients provided for any variable: ['my_model/dense/kernel:0', 'my_model/dense/bias:0', 'my_model/dense_1/kernel:0', 'my_model/dense_1/bias:0', 'my_model/dense_2/kernel:0', 'my_model/dense_2/bias:0'].
The problem is the use of tf.numpy_function.
Specifically, everything that happens inside the with tf.GradientTape() as tape statement has to be differentiable. Because the conversion between tf.Tensor and numpy array is not differentiable, tf.numpy_function cannot be used for loss computation:
Since the function takes numpy arrays, you cannot take gradients through a numpy_function. If you require something that is differentiable, please consider using tf.py_function.
(Source: here in the official documentation)
So either wrap the loss computation in tf.py_function, as this accepts tf.Tensors, or consider implementing it in TensorFlow. Here is an example for that.
I am using the following tf.function decorated training step:
#tf.function
def train_step(inputs, labels):
    """Compute one loss per model output and apply each loss's gradients in turn.

    Args:
        inputs: batch inputs (not read by the body as written; see note below).
        labels: label tensor; column ``i`` is the target for output ``i``.

    Returns:
        list with one loss per entry of ``loss_functions``.
    """
    # The tape is persistent because gradient() is called once per loss.
    with tf.GradientTape(persistent=True) as tape:
        # NOTE(review): the model is fed X and F, which are not parameters of
        # this function; presumably they are globals or meant to come from
        # `inputs` -- confirm.
        predictions = model([X, F], training=True)
        losses = [l_f(tf.expand_dims(labels[:,i], axis=-1), predictions[i]) for i, l_f in enumerate(loss_functions)]
    gradients = [tape.gradient(l, model.trainable_variables) for l in losses]
    for g in gradients:
        # Replace missing (None) gradients with zeros of the variable's shape.
        grads = [gg if gg is not None else tf.zeros_like(model.trainable_variables[i], dtype=tf.float32) for i, gg in enumerate(g)]
        # The original line lacked the closing parenthesis of apply_gradients().
        optimizer.apply_gradients(zip(grads, model.trainable_variables))
    # Drop the reference to the persistent tape once all gradients are taken.
    del tape
    return losses
def weighted_loss(weights):
    """Return a loss function mixing three mean binary cross-entropy terms.

    ``weights`` is indexed at 0, 1 and 2: the factors for the term over
    labels above 0.5, the term over the remaining labels, and the term over
    all labels.
    """
    #tf.function
    def loss_func(labels, predictions):
        # True where the label is greater than 0.5.
        min_class_filter = tfk.backend.greater(labels, 0.5)
        # Split labels and predictions with the mask and its negation.
        y_min = tf.boolean_mask(labels, min_class_filter)
        y_max = tf.boolean_mask(labels, tf.math.logical_not(min_class_filter))
        y_pred_min = tf.boolean_mask(predictions, min_class_filter)
        y_pred_max = tf.boolean_mask(predictions, tf.math.logical_not(min_class_filter))
        loss_min_class = tfk.backend.mean(tfk.backend.binary_crossentropy(y_min, y_pred_min))
        loss_max_class = tfk.backend.mean(tfk.backend.binary_crossentropy(y_max, y_pred_max))
        loss_all = tfk.backend.mean(tfk.backend.binary_crossentropy(labels, predictions))
        return weights[0]*loss_min_class + weights[1]*loss_max_class + weights[2]*loss_all
    return loss_func


# One loss function per weight triple in target_weights.
loss_functions = [weighted_loss(w) for w in target_weights]
It's a little quirky, but basically, my network has multiple outputs, which means that there are cases where returning a gradient of None for certain weights is correct, so I am replacing those gradients with zero, and I'm calculating the loss at each of these outputs separately and then propagating each of them at each step.
When I run this as written, it takes an extremely long time (10min+) to run a single training step, and I see the following message in the logs:
E tensorflow/core/grappler/optimizers/meta_optimizer.cc:502] function_operator failed: Invalid argument: Input 0 of node model/LSTM_forward_0/zeros_like was passed int32 from model/LSTM_forward_0/StatefulPartitioned Call:9 incompatible with expected variant.
When I remove the #tf.function decorator, it runs in about 10% of the time, and I do not see this log warning. Is this warning a red herring or does it legitimately point to an issue created by adding #tf.function?
Additional Details:
TF 2.0
GPU enabled and available
CUDA 10.1
GPU utilization 0% in both cases but that isn't caused by data-feed maxing the CPU throughput, as when I generate training data outside of the training loop, it's as good as instantaneous from TFRecords with sufficient prefetch and limited augmentation
dtype of inputs, labels, gradients and all model.trainable_variables are all tf.float32
From what I read, tf.function should not include any assignment to the graph vars for it to run smoothly.
In a training step, you are changing the weights of the model, thus violating this.
I'm not sure this is the reason, but you can try to leave tf.function only in the loss function, but not in the training step.
I have figured out how to fix it. The issue was with overwriting None gradients, not with the persistent gradient tape.
#tf.function
def train_step(inputs, labels):
    """Compute one loss per model output and apply each loss's gradients in turn.

    Every loss function receives the full labels and predictions plus its own
    index, and the gradients are applied exactly as the tape returns them.

    Args:
        inputs: batch inputs (not read by the body as written; see note below).
        labels: label tensor handed unchanged to every loss function.

    Returns:
        list with one loss per entry of ``loss_functions``.
    """
    # The tape is persistent because gradient() is called once per loss.
    with tf.GradientTape(persistent=True) as tape:
        # NOTE(review): the model is fed X and F, which are not parameters of
        # this function; presumably they are globals or meant to come from
        # `inputs` -- confirm.
        predictions = model([X, F], training=True)
        losses = [l_f(labels, predictions, i) for i, l_f in enumerate(loss_functions)]
    gradients = [tape.gradient(l, model.trainable_variables) for l in losses]
    for g in gradients:
        # The original line lacked the closing parenthesis of apply_gradients().
        optimizer.apply_gradients(zip(g, model.trainable_variables))
    # Drop the reference to the persistent tape once all gradients are taken.
    del tape
    return losses
def weighted_loss(weights):
    """Build a loss that mixes three mean binary cross-entropy terms.

    The returned function takes all labels, all predictions and an output
    index ``i``; it selects label column ``i`` and prediction ``i`` itself.
    ``weights[0]`` scales the term over labels above 0.5, ``weights[1]`` the
    term over the remaining labels and ``weights[2]`` the term over all labels.
    """
    #tf.function
    def loss_func(labs, preds, i):
        labels = tf.expand_dims(labs[:,i], axis=-1)
        predictions = preds[i]

        def mean_bce(y_true, y_pred):
            # Mean binary cross-entropy of one (labels, predictions) pair.
            return tfk.backend.mean(tfk.backend.binary_crossentropy(y_true, y_pred))

        above_half = tfk.backend.greater(labels, 0.5)
        not_above_half = tf.math.logical_not(above_half)
        loss_min_class = mean_bce(tf.boolean_mask(labels, above_half),
                                  tf.boolean_mask(predictions, above_half))
        loss_max_class = mean_bce(tf.boolean_mask(labels, not_above_half),
                                  tf.boolean_mask(predictions, not_above_half))
        loss_all = mean_bce(labels, predictions)
        return weights[0]*loss_min_class + weights[1]*loss_max_class + weights[2]*loss_all
    return loss_func


# One loss function per weight triple in target_weights.
loss_functions = [weighted_loss(triple) for triple in target_weights]
By passing all outputs and all labels into the loss function (even if I ignore a bunch of them) the tape will return an appropriate gradient (0) for all branches, not just the ones in focus for that particular loss.
I try to implement MAML. Therefore I need a copy of my model (model_copy) to be trained one step,
then I need my meta_model to be trained with the loss of my model_copy.
I would like to do the training of the model_copy in a function.
If I copy my code to the function I don't get proper gradients_meta (they will be all none).
It seems, that the graphs are unconnected - how can I connect the graphs?
Any idea of what I am doing wrong? I watch a lot of variables, but that doesn't seem to make a difference..
Here is the code to reproduce this issue:
import tensorflow as tf
import tensorflow.keras as keras
import tensorflow.keras.backend as keras_backend
def copy_model(model):
    """Return a new Dense(5) -> Dense(1) Sequential model loaded with ``model``'s weight values."""
    clone = keras.Sequential([
        keras.layers.Dense(5, input_shape=(1,)),
        keras.layers.Dense(1),
    ])
    # Only the numeric values are transferred from the source model.
    clone.set_weights(model.get_weights())
    return clone
def compute_loss(model, x, y):
    """Return ``(mse, predictions)`` for ``model`` evaluated on ``x`` against targets ``y``."""
    predictions = model(x)
    # Average the mean-squared-error values into a single value.
    squared_errors = keras.losses.mean_squared_error(y, predictions)
    return keras_backend.mean(squared_errors), predictions
# meta_model to learn in outer gradient tape
# (same Dense(5) -> Dense(1) layout that copy_model builds)
meta_model = keras.Sequential()
meta_model.add(keras.layers.Dense(5, input_shape=(1,)))
meta_model.add(keras.layers.Dense(1))
# optimizer for training
# (used for both the copy's step in do_calc and the meta_model update)
optimizer = keras.optimizers.Adam()
# function to calculate model_copys params
def do_calc(x, y, meta_model):
    """Copy ``meta_model``, train the copy for one optimizer step on ``(x, y)`` and return it."""
    with tf.GradientTape() as gg:
        # The copy receives meta_model's weights through get_weights()/set_weights().
        model_copy = copy_model(meta_model)
        gg.watch(x)
        gg.watch(meta_model.trainable_variables)
        gg.watch(model_copy.trainable_variables)
        loss, _ = compute_loss(model_copy, x, y)
    # Gradients are taken with respect to the copy's variables only.
    gradient = gg.gradient(loss, model_copy.trainable_variables)
    optimizer.apply_gradients(zip(gradient, model_copy.trainable_variables))
    return model_copy
# inputs for training
x = tf.constant(3.0, shape=(1, 1, 1))
y = tf.constant(3.0, shape=(1, 1, 1))
# Outer tape: intended to differentiate the adapted copy's loss with respect
# to meta_model's variables.
with tf.GradientTape() as g:
    g.watch(x)
    g.watch(y)
    model_copy = do_calc(x, y, meta_model)
    g.watch(model_copy.trainable_variables)
    # calculate loss of model_copy
    test_loss, _ = compute_loss(model_copy, x, y)
# build gradients for meta_model update
gradients_meta = g.gradient(test_loss, meta_model.trainable_variables)
# gradients always None !?!!11 elf
optimizer.apply_gradients(zip(gradients_meta, meta_model.trainable_variables))
Thank you in advance for any help.
I found a solution:
I needed to "connect" meta-model and model-copy somehow.
Can anybody explain why this works and how I would achieve that using a "proper" optimizer?
import tensorflow as tf
import tensorflow.keras as keras
import tensorflow.keras.backend as keras_backend
def copy_model(model):
    """Build a fresh two-layer Sequential model (Dense(5), Dense(1)) carrying ``model``'s weight values."""
    duplicate = keras.Sequential([
        keras.layers.Dense(5, input_shape=(1,)),
        keras.layers.Dense(1),
    ])
    # set_weights() receives the values returned by get_weights().
    duplicate.set_weights(model.get_weights())
    return duplicate
def compute_loss(model, x, y):
    """Evaluate ``model`` on ``x`` and return ``(mean squared error vs. y, model output)``."""
    outputs = model(x)
    # Reduce the mean-squared-error values to one mean.
    error = keras.losses.mean_squared_error(y, outputs)
    return keras_backend.mean(error), outputs
# meta_model to learn in outer gradient tape
# (same Dense(5) -> Dense(1) layout that copy_model builds)
meta_model = keras.Sequential()
meta_model.add(keras.layers.Dense(5, input_shape=(1,)))
meta_model.add(keras.layers.Dense(1))
# optimizer for training
# (only applied to meta_model in this version; the copy is adapted by hand in do_calc)
optimizer = keras.optimizers.Adam()
# function to calculate model_copys params
def do_calc(meta_model, x, y, gg, alpha=0.01):
    """Return a copy of ``meta_model`` adapted by one manual gradient-descent step.

    Args:
        meta_model: model whose kernels and biases are the starting point.
        x, y: inputs and targets for the copy's loss.
        gg: gradient tape used to differentiate the copy's loss.
        alpha: step size of the manual update.
    """
    model_copy = copy_model(meta_model)
    loss, _ = compute_loss(model_copy, x, y)
    gradients = gg.gradient(loss, model_copy.trainable_variables)
    # gradients holds two entries per layer: kernel first, then bias.
    for idx, copy_layer in enumerate(model_copy.layers):
        source_layer = meta_model.layers[idx]
        # calculate adapted parameters w/ gradient descent
        # \theta_i' = \theta - \alpha * gradients
        # The result is computed from meta_model's own kernel and bias.
        copy_layer.kernel = tf.subtract(source_layer.kernel,
                                        tf.multiply(alpha, gradients[2 * idx]))
        copy_layer.bias = tf.subtract(source_layer.bias,
                                      tf.multiply(alpha, gradients[2 * idx + 1]))
    return model_copy
# Outer tape g records everything needed to differentiate test_loss with
# respect to meta_model's variables.
with tf.GradientTape() as g:
    # inputs for training
    x = tf.constant(3.0, shape=(1, 1, 1))
    y = tf.constant(3.0, shape=(1, 1, 1))
    adapted_models = []  # not used further in this snippet
    # model_copy = meta_model
    # Inner tape gg is handed to do_calc for the copy's own gradient step.
    with tf.GradientTape() as gg:
        model_copy = do_calc(meta_model, x, y, gg)
    # calculate loss of model_copy
    test_loss, _ = compute_loss(model_copy, x, y)
# build gradients for meta_model update
gradients_meta = g.gradient(test_loss, meta_model.trainable_variables)
# gradients work. Why???
optimizer.apply_gradients(zip(gradients_meta, meta_model.trainable_variables))
Converting the tensors to NumPy and calling set_weights() copies only the updated parameter values; the copied variables are different nodes in the TF2 graph, so the loss of the copied model cannot be used directly to compute gradients for the meta model.