Custom Dynamic Loss function: No gradients provided for any variable - python

I am using an RGB dataset for my x train and the loss is calculated in a dynamic loss function that gets the distances of pairs and compares them against the ideal distance dist_train. Here is the model:
class MyModel(Model):
    """Small dense network ending in a 2-unit linear layer.

    Layer order used by ``call``:
    Dense(3, relu) -> Flatten -> Dense(3, relu) -> Dense(2).
    """

    def __init__(self):
        super().__init__()
        self.d1 = Dense(3, activation='relu')
        self.flatten = Flatten()
        self.d2 = Dense(3, activation='relu')
        self.d3 = Dense(2)

    def call(self, x):
        """Run the forward pass and return the output of the last Dense(2) layer."""
        hidden = self.d1(x)
        flat = self.flatten(hidden)
        return self.d3(self.d2(flat))
# Model, optimizer and running-mean loss trackers shared by the training loop.
model = MyModel()
optimizer = tf.keras.optimizers.Adam()
train_loss = tf.keras.metrics.Mean(name='train_loss')
test_loss = tf.keras.metrics.Mean(name='test_loss')


# NOTE(review): the next line reads "#tf.function" in this copy; it was
# presumably an "@tf.function" decorator before extraction -- confirm.
#tf.function
def train_step(rgb):
    """Run one optimisation step on ``rgb`` and record its loss in ``train_loss``."""
    with tf.GradientTape() as tape:
        predictions = model(rgb, training=True)
        loss = tf_function(predictions)
    variables = model.trainable_variables
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))
    # Feed the loss into the running mean reported after each epoch.
    train_loss(loss)
Here is the loss function and the tf.function wrapping it:
def mahal_loss(output):
    """Mean squared error between selected Mahalanobis distances and ``dist_train``.

    Reads the module-level ``indices_train`` (for row ``i``, the column
    indices of the pairs to keep) and ``dist_train`` (the target distances).

    Args:
        output: 2-D array of model outputs, one observation per row, as
            required by ``scipy.spatial.distance.pdist``.

    Returns:
        The loss as a ``np.float32`` scalar.
    """
    condensed = sp.spatial.distance.pdist(output, metric='mahalanobis')
    mahal = sp.spatial.distance.squareform(condensed, force='no', checks=True)
    # Mask entries that are exactly zero (e.g. the diagonal self-distances).
    mahal = np.ma.masked_array(mahal, mask=mahal == 0)
    new_distance = [mahal[i, indices_train[i]] for i in range(len(mahal))]
    mse = np.mean((dist_train - new_distance) ** 2)
    # The caller wraps this function in tf.numpy_function with
    # Tout=tf.float32; NumPy produces float64 here, which does not match the
    # declared output dtype, so cast explicitly.
    return np.float32(mse)
# NOTE(review): the next line was presumably an
# "@tf.function(input_signature=...)" decorator before extraction -- confirm.
#tf.function(input_signature=[tf.TensorSpec(None, tf.float32)])
def tf_function(pred):
    """Evaluate ``mahal_loss`` on ``pred`` via ``tf.numpy_function``, declared as float32."""
    return tf.numpy_function(mahal_loss, [pred], tf.float32)
Running the model:
# Train for EPOCHS passes over x_train and report the running means per epoch.
for epoch in range(EPOCHS):
    # Clear both trackers so every epoch is reported on its own.
    train_loss.reset_states()
    test_loss.reset_states()

    for i in x_train:
        train_step(i)

    epoch_report = (
        f'Epoch {epoch + 1}, '
        f'Loss: {train_loss.result()}, '
        f'Test Loss: {test_loss.result()}, '
    )
    print(epoch_report)
I believe the reason I am running into problems lies in the dynamic loss function, as I need to calculate the distance between certain pairs to get the results I expect. This means that inside the loss function I have to calculate the mahalanobis distance of each pair to get the ones I will compare against the correct distances. The error I get is the following:
<ipython-input-23-0e975da5cbc2>:15 train_step *
optimizer.apply_gradients(zip(gradients, model.trainable_variables))
C:\Anaconda3\envs\colour_env\lib\site-packages\keras\optimizer_v2\optimizer_v2.py:622 apply_gradients **
grads_and_vars = optimizer_utils.filter_empty_gradients(grads_and_vars)
C:\Anaconda3\envs\colour_env\lib\site-packages\keras\optimizer_v2\utils.py:72 filter_empty_gradients
raise ValueError("No gradients provided for any variable: %s." %
ValueError: No gradients provided for any variable: ['my_model/dense/kernel:0', 'my_model/dense/bias:0', 'my_model/dense_1/kernel:0', 'my_model/dense_1/bias:0', 'my_model/dense_2/kernel:0', 'my_model/dense_2/bias:0'].

The problem is the use of tf.numpy_function.
Specifically, everything that happens inside the with tf.GradientTape() as tape statement has to be differentiable. Because the conversion between tf.Tensor and numpy array is not differentiable, tf.numpy_function cannot be used for loss computation:
Since the function takes numpy arrays, you cannot take gradients through a numpy_function. If you require something that is differentiable, please consider using tf.py_function.
(Source: here in the official documentation)
So either wrap the loss computation in tf.py_function, which accepts tf.Tensors, or consider implementing it in TensorFlow. Here is an example for that.

Related

Model does not train properly when explicitly applying the gradients

I’m trying to constrain the weights of my model by explicitly applying the gradients; however, this is not working and I can’t figure out why.
I’m defining the model with the following function:
def init_model(num_hidden_layers=2, num_neurons_per_layer=64):
    """Build a Sequential network: 2 inputs, LeakyReLU hidden layers, 1 linear output.

    Args:
        num_hidden_layers: number of hidden Dense layers to stack.
        num_neurons_per_layer: width of each hidden Dense layer.

    Returns:
        The assembled ``tf.keras.Sequential`` model (not compiled).
    """
    net = tf.keras.Sequential()
    net.add(tf.keras.Input(shape=(2,)))
    for _ in range(num_hidden_layers):
        hidden = tf.keras.layers.Dense(
            num_neurons_per_layer,
            activation=tf.keras.layers.LeakyReLU(),
            kernel_initializer="glorot_uniform",
        )
        net.add(hidden)
    output = tf.keras.layers.Dense(1, kernel_initializer="glorot_uniform")
    net.add(output)
    return net
When using the fit method, the loss function decreases and the model fits the data:
# Baseline run: let Keras' fit() drive the training with one batch per epoch.
Nepochs = 1500
lr = 0.001


def my_loss(u_true, u_pred):
    """Mean squared error between targets and predictions."""
    squared_error = tf.math.square(u_true - u_pred)
    return tf.math.reduce_mean(squared_error)


model_0 = init_model(num_hidden_layers=2, num_neurons_per_layer=64)
optim_0 = tf.keras.optimizers.Adam(learning_rate=lr)
model_0.compile(loss=my_loss, optimizer=optim_0)
model_0.summary()
history_0 = model_0.fit(
    X_train,
    u_train,
    validation_data=(X_test.numpy(), u_test.numpy()),
    epochs=Nepochs,
    # The whole training set is used as a single batch.
    batch_size=X_train.shape[0],
)
When I explicitly specify and apply the gradient, the loss function stagnates and the output does not fit the data (it is uniform everywhere):
Nepochs = 1500
lr = 0.001


def compute_loss(model, X_data, u_data):
    """Mean squared error of ``model(X_data)`` against ``u_data``.

    NOTE(review): the shapes are not checked before subtracting. The
    accompanying text reports that targets of shape (N,) trained incorrectly
    with this function, while targets of shape (N, 1) worked.
    """
    residual = u_data - model(X_data)
    return tf.math.reduce_mean(tf.math.square(residual))
# NOTE(review): the next line reads "#tf.function" in this copy; it was
# presumably an "@tf.function" decorator before extraction -- confirm.
#tf.function
def training(model, optim, X_train, u_train, X_test=None, u_test=None):
    """Apply one gradient step on ``(X_train, u_train)``.

    Args:
        model: model whose trainable variables are updated in place.
        optim: optimizer used to apply the gradients.
        X_train, u_train: training inputs and targets.
        X_test, u_test: optional validation inputs and targets.

    Returns:
        ``(loss, validation_loss)``. The validation loss is evaluated before
        the weights are updated and is ``None`` when ``X_test`` is ``None``.
    """
    validation_loss = None
    if X_test is not None:
        validation_loss = compute_loss(model, X_test, u_test)

    # Trainable variables are watched automatically and the gradient is taken
    # only once, so neither tape.watch(...) nor persistent=True is needed; a
    # persistent tape would also hold its resources until garbage-collected.
    with tf.GradientTape() as tape:
        loss = compute_loss(model, X_train, u_train)
    grad_theta = tape.gradient(loss, model.trainable_variables)
    optim.apply_gradients(zip(grad_theta, model.trainable_variables))
    return loss, validation_loss
# Explicit-gradient run: same architecture and optimizer settings, stepped by hand.
model_G = init_model(num_hidden_layers=2, num_neurons_per_layer=64)
optim_G = tf.keras.optimizers.Adam(learning_rate=lr)
model_G.summary()
hist = {'val_loss': [], 'loss': []}
for i in range(Nepochs + 1):
    loss, val_loss = training(model_G, optim_G, X_train, u_train, X_test, u_test)
    hist['loss'].append(loss.numpy())
    # training() returns None for the validation loss when no test data is
    # supplied, so check before calling .numpy() on it.
    if val_loss is not None:
        hist['val_loss'].append(val_loss.numpy())
        print('It {:05d}: loss = {:10.8e}, validation loss = {:10.8e} '.format(i, loss, val_loss))
    else:
        print('It {:05d}: loss = {:10.8e}'.format(i, loss))
Why do the two versions provide different results?
Thanks for the help.
Cesare
Finally, I found that expanding the dimension of the targets as follows:
u_train = tf.expand_dims(u_train,axis=-1)
u_test = tf.expand_dims(u_test,axis=-1)
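the model trains properly and the loss functions are correctly evaluated.
u_train and u_test previously had shape (N,), where N is the number of entries; after expanding the dimension, the shape is (N,1). With targets of shape (N,), the subtraction u_data - u_pred broadcasts against the (N,1) predictions to an (N,N) array, so the mean is taken over the wrong differences.
Using fit, the code works with both shapes; when explicitly applying the gradients, it works only with targets of shape (N,1).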

Keras - Adding loss to intermediate layer while ignoring the last layer

I've created the following Keras custom model:
import tensorflow as tf
from tensorflow.keras.layers import Layer
class MyModel(tf.keras.Model):
    """Softmax dense layer followed by an arg-max post-processing layer."""

    def __init__(self, num_classes):
        super().__init__()
        self.dense_layer = tf.keras.layers.Dense(num_classes, activation='softmax')
        self.lambda_layer = tf.keras.layers.Lambda(lambda x: tf.math.argmax(x, axis=-1))

    def call(self, inputs):
        """Return the arg-max (along the last axis) of the softmax output."""
        return self.lambda_layer(self.dense_layer(inputs))

    # A convenient way to get model summary
    # and plot in subclassed api
    def build_graph(self, raw_shape):
        """Wrap ``call`` in a functional ``tf.keras.Model`` with input shape ``raw_shape``."""
        x = tf.keras.layers.Input(shape=raw_shape)
        return tf.keras.Model(inputs=[x], outputs=self.call(x))
The task is multi-class classification.
Model consists of a dense layer with softmax activation and a lambda layer as a post-processing unit that converts the dense output vector to a single value (predicted class).
The train targets are a one-hot encoded matrix like so:
[
[0,0,0,0,1]
[0,0,1,0,0]
[0,0,0,1,0]
[0,0,0,0,1]
]
It would be nice if I could define a categorical_crossentropy loss over the dense layer and ignore the lambda layer while still maintaining the functionality and outputting a single value when I call model.predict(x).
Please note
My workspace environment doesn't allow me to use a custom training loop as suggested by @alonetogether's excellent answer.
You can try using a custom training loop, which is pretty straightforward IMO:
import tensorflow as tf
from tensorflow.keras.layers import Layer
class MyModel(tf.keras.Model):
    """Classifier: a softmax dense layer whose output is reduced to a class index."""

    def __init__(self, num_classes):
        super().__init__()
        self.dense_layer = tf.keras.layers.Dense(num_classes, activation='softmax')
        self.lambda_layer = tf.keras.layers.Lambda(lambda x: tf.math.argmax(x, axis=-1))

    def call(self, inputs):
        """Apply the dense layer, then take the arg-max along the last axis."""
        probabilities = self.dense_layer(inputs)
        return self.lambda_layer(probabilities)

    # A convenient way to get model summary
    # and plot in subclassed api
    def build_graph(self, raw_shape):
        """Return a functional ``tf.keras.Model`` built by calling this model on an Input."""
        x = tf.keras.layers.Input(shape=raw_shape)
        return tf.keras.Model(inputs=[x], outputs=self.call(x))
n_classes = 5
model = MyModel(n_classes)
# Pass num_classes explicitly: to_categorical otherwise infers the one-hot
# width from the largest label drawn, which can be smaller than n_classes
# for a random sample and would then not match the dense layer's output.
labels = tf.keras.utils.to_categorical(
    tf.random.uniform((50, 1), maxval=n_classes, dtype=tf.int32),
    num_classes=n_classes,
)
train_dataset = tf.data.Dataset.from_tensor_slices((tf.random.normal((50, 1)), labels)).batch(2)
optimizer = tf.keras.optimizers.Adam()
loss_fn = tf.keras.losses.CategoricalCrossentropy()
epochs = 2
for epoch in range(epochs):
    print("\nStart of epoch %d" % (epoch,))
    for step, (x_batch_train, y_batch_train) in enumerate(train_dataset):
        with tf.GradientTape() as tape:
            # Call only the first layer (the softmax dense layer) so the loss
            # sees class probabilities instead of the arg-max output that the
            # full model returns.
            probs = model.layers[0](x_batch_train)
            loss_value = loss_fn(y_batch_train, probs)
        grads = tape.gradient(loss_value, model.trainable_weights)
        optimizer.apply_gradients(zip(grads, model.trainable_weights))
And prediction:
print(model.predict(tf.random.normal((1, 1))))
[3]
I think there is a Model.predict_classes function that would replace the need for that lambda layer. But if it doesn't work:
There doesn't seem to be a way to do that without using one of these hacks:
Two inputs (one is the ground truth values Y)
Two outputs
Two models
I'm quite convinced there is no other workaround for this.
So, I believe the "two models" version is the best for your case where you seem to "need" a model with single input, single output and fit.
Then I'd do this:
inputs = tf.keras.layers.Input(input_shape_without_batch_size)
loss_outputs = tf.keras.layers.Dense(num_classes,activation='softmax')(inputs)
final_outputs = tf.keras.layers.Lambda(lambda x: tf.math.argmax(x, axis=-1))(loss_outputs)
training_model = tf.keras.models.Model(inputs, loss_outputs)
final_model = tf.keras.models.Model(inputs, final_outputs)
training_model.compile(.....)
training_model.fit(....)
results = final_model.predict(...)

logistic regression model with L1 regularisations

I am trying to apply L1 regularization on a logistic model
class LogisticRegression(nn.Module):
    """Single linear layer over inputs flattened to 784 features, plus training helpers."""

    def __init__(self):
        super().__init__()
        self.linear = nn.Linear(input_size, num_classes)

    def forward(self, x):
        """Reshape ``x`` to rows of 784 features and return the linear layer's output."""
        flat = x.reshape(-1, 784)
        return self.linear(flat)

    def training_step(self, batch):
        """Return the cross-entropy loss and accuracy for one ``(images, labels)`` batch."""
        images, labels = batch
        scores = self(images)
        return {
            'Training_loss': F.cross_entropy(scores, labels),
            'Training_acc': accuracy(scores, labels),
        }

    def training_epoch_end(self, outputs):
        """Average the per-batch losses and accuracies and return them as Python floats."""
        mean_loss = torch.stack([out['Training_loss'] for out in outputs]).mean()
        mean_acc = torch.stack([out['Training_acc'] for out in outputs]).mean()
        return {'Training_loss': mean_loss.item(), 'Training_acc': mean_acc.item()}

    def epoch_end(self, epoch, result):
        """Print the loss and accuracy stored in ``result`` for ``epoch``."""
        print("Epoch [{}], Training_loss: {:.4f}, Training_acc: {:.4f}".format(epoch, result['Training_loss'], result['Training_acc']))
model = LogisticRegression()
But I think I am doing it wrong: the accuracy did not change.
L1 = 0.2


def evaluate(model_b, trainloader):
    """Run ``training_step`` on every batch and return the epoch-level summary."""
    outputs = [model_b.training_step(batch) for batch in trainloader]
    return model_b.training_epoch_end(outputs)


def fit(epochs, lr, model_b, trainloader, opt_func=torch.optim.SGD):
    """Train ``model_b`` and return one result dict per epoch.

    Args:
        epochs: number of passes over ``trainloader``.
        lr: learning rate handed to ``opt_func``.
        model_b: model providing ``parameters``, ``training_step``,
            ``training_epoch_end`` and ``epoch_end``.
        trainloader: iterable of training batches.
        opt_func: optimizer constructor called as ``opt_func(params, lr)``.

    Returns:
        List with the ``evaluate`` result of every epoch, in order.
    """
    history = []
    optimizer = opt_func(model_b.parameters(), lr)
    for epoch in range(epochs):
        # Training phase
        for batch in trainloader:
            loss = model_b.training_step(batch)['Training_loss']
            # NOTE(review): L1 is a module-level constant, so this term shifts
            # the loss value by a fixed amount and contributes no gradient.
            loss_Lasso = loss + 0.5 * L1  # L1 reg
            loss_Lasso.backward()
            optimizer.step()
            optimizer.zero_grad()
        # Summarise the epoch with evaluate(); no `evaluate_b` helper is
        # defined in this snippet, so calling that name raised NameError.
        result = evaluate(model_b, trainloader)
        model_b.epoch_end(epoch, result)
        history.append(result)
    return history
Can anyone help me with what I am missing and how I can really apply L1 regularization?
Also, is L1 regularization called lasso?
I believe the l1-norm is a type of Lasso regularization, yes, but there are others.
In your snippet L1 is set as a constant, instead you should measure the l1-norm of your model's parameters. Then sum it with your network's loss, as you did. In your example there is a single layer, so you will only need self.linear's parameters. First gather all parameters then measure the total norm with torch.norm. You could also use nn.L1Loss.
params = torch.cat([x.view(-1) for x in model.linear.parameters()])
L1 = lamb*torch.norm(params, p=1)
Where lamb is your lambda regularization parameter and model is initialized from the LogisticRegression class.

Eagerly update a keras model's weights directly using the gradient

I am writing a custom optimizer with Eager Execution in TensorFlow 1.15 but can't figure out how to update the weights.
Taking gradient descent as an example, I have the weights, the gradient and a scalar learning rate but can't figure out how to combine them.
This is an implementation of gradient descent where model is a keras.Model e.g. a multilayer CNN:
lr = tf.constant(0.01)


def minimize(model, inputs, targets):
    """Attempt one gradient-descent step on ``model``.

    The last two statements are the ones the accompanying text reports as
    failing; they are kept as written.
    """
    with tf.GradientTape() as tape:
        # Feed the `inputs` argument; the bare name `input` is the Python builtin.
        logits = model(inputs)
        loss_value = tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits, labels=targets)
    grad = tape.gradient(loss_value, model.trainable_variables)
    # NOTE(review): `grad` is a list with one tensor per variable, so this
    # multiply tries to pack tensors of different shapes (the reported error).
    step = tf.multiply(self.lr, grad)
    # NOTE(review): trainable_variables is a list and has no assign_sub method.
    model.trainable_variables.assign_sub(step)
but it fails on the tf.multiply saying
tensorflow.python.framework.errors_impl.InvalidArgumentError: Shapes of all inputs must match: values[0].shape = [5,5,1,6] != values[1].shape = [6] [Op:Pack] name: packed
I also know the last line will fail as trainable_variables is a list and doesn't have the method assign_sub.
How can I rewrite the last two lines of my code to do:
model.trainable_variables -= lr * grad
Figured it out. As both are lists we need to iterate through their pairs of gradients and variables for each layer together and update each of these separately.
lr = tf.constant(0.01)


def minimize(model, inputs, targets):
    """Apply one plain gradient-descent step to ``model`` in place.

    Args:
        model: model whose trainable variables are updated.
        inputs: batch passed to the model.
        targets: labels passed to the softmax cross-entropy loss.
    """
    with tf.GradientTape() as tape:
        # Feed the `inputs` argument; the bare name `input` is the Python builtin.
        logits = model(inputs)
        loss_value = tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits, labels=targets)
    grad = tape.gradient(loss_value, model.trainable_variables)
    # tape.gradient returns one tensor per variable, so update pair by pair:
    # v <- v - lr * g
    for v, g in zip(model.trainable_variables, grad):
        v.assign_sub(lr * g)

Nested Gradient Tape in function (TF2.0)

I try to implement MAML. Therefore I need a copy of my model (model_copy) to be trained one step,
then I need my meta_model to be trained with the loss of my model_copy.
I would like to do the training of the model_copy in a function.
If I copy my code to the function I don't get proper gradients_meta (they will be all none).
It seems, that the graphs are unconnected - how can I connect the graphs?
Any idea of what I am doing wrong? I watch a lot of variables, but that doesn't seem to make a difference..
Here is the code to reproduce this issue:
import tensorflow as tf
import tensorflow.keras as keras
import tensorflow.keras.backend as keras_backend
def copy_model(model):
    """Return a new Dense(5) -> Dense(1) Sequential model holding ``model``'s weight values."""
    copied_model = keras.Sequential([
        keras.layers.Dense(5, input_shape=(1,)),
        keras.layers.Dense(1),
    ])
    # Weights are transferred by value through get_weights()/set_weights().
    copied_model.set_weights(model.get_weights())
    return copied_model
def compute_loss(model, x, y):
    """Return ``(mse, logits)``: the mean squared error of ``model(x)`` against ``y``, and the predictions."""
    logits = model(x)  # prediction of the model
    # loss between prediction and label/truth, reduced to its mean
    per_element = keras.losses.mean_squared_error(y, logits)
    mse = keras_backend.mean(per_element)
    return mse, logits
# meta_model to learn in outer gradient tape
meta_model = keras.Sequential([
    keras.layers.Dense(5, input_shape=(1,)),
    keras.layers.Dense(1),
])
# optimizer for training
optimizer = keras.optimizers.Adam()


# function to calculate model_copys params
def do_calc(x, y, meta_model):
    """Copy ``meta_model``, take one optimizer step on the copy, and return the copy.

    NOTE(review): the accompanying text reports that gradients of the copy's
    loss with respect to ``meta_model`` come out as None with this version.
    """
    with tf.GradientTape() as gg:
        model_copy = copy_model(meta_model)
        for watched in (x, meta_model.trainable_variables, model_copy.trainable_variables):
            gg.watch(watched)
        loss, _ = compute_loss(model_copy, x, y)
    copy_variables = model_copy.trainable_variables
    gradient = gg.gradient(loss, copy_variables)
    optimizer.apply_gradients(zip(gradient, copy_variables))
    return model_copy
# inputs for training
x = tf.constant(3.0, shape=(1, 1, 1))
y = tf.constant(3.0, shape=(1, 1, 1))

# Outer tape: records the copy's training step and its loss.
with tf.GradientTape() as g:
    g.watch(x)
    g.watch(y)
    model_copy = do_calc(x, y, meta_model)
    g.watch(model_copy.trainable_variables)
    # calculate loss of model_copy
    test_loss, _ = compute_loss(model_copy, x, y)

# build gradients for meta_model update
meta_variables = meta_model.trainable_variables
gradients_meta = g.gradient(test_loss, meta_variables)
# gradients always None !?!!11 elf
optimizer.apply_gradients(zip(gradients_meta, meta_variables))
Thank you in advance for any help.
I found a solution:
I needed to "connect" meta-model and model-copy somehow.
Can anybody explain why this works and how I would achieve that using a "proper" optimizer?
import tensorflow as tf
import tensorflow.keras as keras
import tensorflow.keras.backend as keras_backend
def copy_model(model):
    """Build a Dense(5) -> Dense(1) Sequential model and load ``model``'s weight values into it."""
    copied_model = keras.Sequential([
        keras.layers.Dense(5, input_shape=(1,)),
        keras.layers.Dense(1),
    ])
    source_weights = model.get_weights()
    copied_model.set_weights(source_weights)
    return copied_model
def compute_loss(model, x, y):
    """Evaluate ``model`` on ``x`` and return ``(mse, logits)`` for targets ``y``."""
    logits = model(x)  # prediction of the model
    # mean of the squared error between prediction and label/truth
    squared_error = keras.losses.mean_squared_error(y, logits)
    mse = keras_backend.mean(squared_error)
    return mse, logits
# meta_model to learn in outer gradient tape
meta_model = keras.Sequential([
    keras.layers.Dense(5, input_shape=(1,)),
    keras.layers.Dense(1),
])
# optimizer for training
optimizer = keras.optimizers.Adam()


# function to calculate model_copys params
def do_calc(meta_model, x, y, gg, alpha=0.01):
    """Return a copy of ``meta_model`` whose layers hold one-step-adapted parameters.

    The copy's kernels and biases are replaced by tensors computed from
    ``meta_model``'s own kernels and biases, so the result is expressed in
    terms of the meta model's variables.

    Args:
        meta_model: model supplying the starting parameters.
        x, y: inputs and targets used for the adaptation loss.
        gg: gradient tape used to differentiate that loss.
        alpha: step size of the adaptation update.
    """
    model_copy = copy_model(meta_model)
    loss, _ = compute_loss(model_copy, x, y)
    gradients = gg.gradient(loss, model_copy.trainable_variables)
    for idx, copy_layer in enumerate(model_copy.layers):
        meta_layer = meta_model.layers[idx]
        # gradients holds (kernel, bias) pairs in layer order.
        kernel_grad = gradients[2 * idx]
        bias_grad = gradients[2 * idx + 1]
        # calculate adapted parameters w/ gradient descent
        # \theta_i' = \theta - \alpha * gradients
        copy_layer.kernel = tf.subtract(meta_layer.kernel, tf.multiply(alpha, kernel_grad))
        copy_layer.bias = tf.subtract(meta_layer.bias, tf.multiply(alpha, bias_grad))
    return model_copy
# Outer tape: records the adaptation step and the adapted copy's loss.
with tf.GradientTape() as g:
    # inputs for training
    x = tf.constant(3.0, shape=(1, 1, 1))
    y = tf.constant(3.0, shape=(1, 1, 1))
    adapted_models = []
    # model_copy = meta_model
    # Inner tape: handed to do_calc, which uses it for the adaptation gradients.
    with tf.GradientTape() as gg:
        model_copy = do_calc(meta_model, x, y, gg)
    # calculate loss of model_copy
    test_loss, _ = compute_loss(model_copy, x, y)

# build gradients for meta_model update
meta_variables = meta_model.trainable_variables
gradients_meta = g.gradient(test_loss, meta_variables)
# gradients work. Why???
optimizer.apply_gradients(zip(gradients_meta, meta_variables))
Calling set_weights(model.get_weights()) converts the tensors to NumPy arrays and copies only the parameter values. The copied model's variables are therefore new nodes in the TF2 graph with no recorded connection to the meta model's variables, so the loss of the copied model cannot be used directly to compute gradients with respect to the meta model.

Categories