According to the TensorFlow documentation, it should be possible to configure the loss inside the custom train_step function and to configure only the optimizer in compile().
My class looks like this:
import tensorflow as tf
from tensorflow import keras
# Mean metric named "loss"; it accumulates the values passed to update_state().
loss_tracker = tf.keras.metrics.Mean(name="loss")

# Sparse categorical cross-entropy with reduction disabled (Reduction.NONE),
# so the loss is not reduced to a single scalar by the loss object itself.
loss_fn = keras.losses.SparseCategoricalCrossentropy(
    reduction=tf.keras.losses.Reduction.NONE,
)
class MaskedLanguageModel(tf.keras.Model):
    """Keras model whose ``train_step`` computes the loss itself.

    The loss is produced by the module-level ``loss_fn`` and tracked by the
    module-level ``loss_tracker``; only the optimizer is taken from
    ``compile()``.

    NOTE(review): overriding ``train_step`` is picked up by ``fit()`` only in
    TensorFlow releases that support it (2.2 and later, per the release
    notes) -- confirm the installed version before relying on this.
    """

    def train_step(self, inputs):
        """Run one optimisation step on a single batch.

        Args:
            inputs: Either ``(features, labels)`` or
                ``(features, labels, sample_weight)``.

        Returns:
            A dict mapping ``"loss"`` to the current value of ``loss_tracker``.
        """
        if len(inputs) == 3:
            features, labels, sample_weight = inputs
        else:
            features, labels = inputs
            sample_weight = None

        with tf.GradientTape() as tape:
            predictions = self(features, training=True)
            loss = loss_fn(labels, predictions, sample_weight=sample_weight)

        # Compute gradients
        trainable_vars = self.trainable_variables
        gradients = tape.gradient(loss, trainable_vars)

        # Update weights
        self.optimizer.apply_gradients(zip(gradients, trainable_vars))

        # Compute our own metrics
        loss_tracker.update_state(loss, sample_weight=sample_weight)

        # Return a dict mapping metric names to current value
        return {"loss": loss_tracker.result()}

    @property
    def metrics(self):
        """List of metric objects owned by this model (just ``loss_tracker``)."""
        # We list our `Metric` objects here so that `reset_states()` can be
        # called automatically at the start of each epoch
        # or at the start of `evaluate()`.
        # If you don't implement this property, you have to call
        # `reset_states()` yourself at the time of your choosing.
        return [loss_tracker]
And the model is compiled with the following snippet.
# Build the model from `inputs` / `mlm_output` (defined outside this snippet)
# and compile it with an Adam optimizer only -- no `loss` argument is passed.
mlm_model = MaskedLanguageModel(inputs, mlm_output, name="masked_bert_model")
optimizer = keras.optimizers.Adam(learning_rate=lr)
mlm_model.compile(optimizer=optimizer)
But I get the following error:
'The model cannot be compiled because it has no loss to optimize.'
(My tensorflow version is 2.1.0)
Thank you in advance for any hint. :)
Related
I used the code from tensorflow example and modified the custom training function to use RL epsilon greedy policy for action selection. However, gradient calculations are no longer working. Can someone point out what is missing in my logic for gradient descent?
class CustomModel(keras.Model):
    """Keras model with a custom ``train_step`` that builds its own loss.

    Uses the module-level ``loss_tracker`` and ``mae_metric`` metric objects.
    """

    def train_step(self, data):
        """Run one optimisation step on ``data``, an ``(x, y)`` pair.

        Returns:
            A dict with the current ``"loss"`` and ``"mae"`` metric values.
        """
        x, y = data

        with tf.GradientTape() as tape:
            y_pred = self(x, training=True)  # Forward pass
            # Compute our own loss
            # NOTE(review): `argmin` yields integer indices and is not
            # differentiable, so `loss` is not connected to the trainable
            # variables through the tape. This matches the reported
            # "No gradients provided for any variable" error.
            metric = tf.math.argmin(y_pred, axis=1)
            loss = keras.losses.mean_squared_error(y, metric)

        # Compute gradients
        trainable_vars = self.trainable_variables
        gradients = tape.gradient(loss, trainable_vars)

        # Update weights
        self.optimizer.apply_gradients(zip(gradients, trainable_vars))

        # Compute our own metrics
        loss_tracker.update_state(loss)
        mae_metric.update_state(y, y_pred)
        return {"loss": loss_tracker.result(), "mae": mae_metric.result()}
Here is the error message from TensorFlow:
grads_and_vars = optimizer_utils.filter_empty_gradients(grads_and_vars)
/home/share/virtualenvs/TxBsk36Y/lib/python3.7/site-packages/tensorflow/python/keras/optimizer_v2/utils.py:79 filter_empty_gradients
([v.name for _, v in grads_and_vars],))
ValueError: No gradients provided for any variable: ['dense/kernel:0', 'dense/bias:0'].
I am using the following code to make a custom train step: https://keras.io/guides/customizing_what_happens_in_fit/
I'd like to use this with tf.distribute.MultiWorkerMirroredStrategy. How should the loss be calculated if using a custom loss? As an example, let's say my custom loss is tf.keras.losses.SparseCategoricalCrossentropy. Should the reduction be set to tf.keras.losses.Reduction.NONE (like below)? Or do I need to set the reduction to tf.keras.losses.Reduction.SUM and divide by the global batch size?
My initial thought is to set the reduction to tf.keras.losses.Reduction.NONE. Then when I call model.fit, keras will automatically handle gradient aggregation across the replicas.
class CustomModel(tf.keras.Model):
    """Wrapper model that trains an inner ``model`` with a custom ``train_step``."""

    def __init__(self, model):
        """Store the wrapped ``model`` and create a mean tracker named 'loss'."""
        super().__init__()
        self.model = model
        self.loss_tracker = tf.keras.metrics.Mean(name='loss')

    def train_step(self, data):
        """Run one optimisation step on ``data``, an ``(x, y)`` pair.

        Returns:
            A dict mapping each metric name in ``self.metrics`` to its
            current result.
        """
        x, y = data

        with tf.GradientTape() as tape:
            # Forward pass
            y_pred = self.model(x, training=True)
            # Compute our own loss. Shape (batch_size,)
            loss = tf.keras.losses.SparseCategoricalCrossentropy(
                from_logits=True,
                reduction=tf.keras.losses.Reduction.NONE,
            )(y, y_pred)

        # Compute gradients
        trainable_vars = self.model.trainable_variables
        gradients = tape.gradient(loss, trainable_vars)

        # Update weights
        self.optimizer.apply_gradients(zip(gradients, trainable_vars))

        # Update metrics
        self.loss_tracker.update_state(loss)
        self.compiled_metrics.update_state(y, y_pred)
        metrics = {m.name: m.result() for m in self.metrics}
        return metrics
Following this guide https://keras.io/guides/customizing_what_happens_in_fit/ I have created a custom version of the train_step that will be called when calling model.fit()
@tf.function
def train_step(self, x, y):
    """Run one optimisation step on features ``x`` and targets ``y``.

    Adapted from https://keras.io/guides/customizing_what_happens_in_fit/

    Returns:
        A dict mapping each metric name in ``self.metrics`` to its result.

    NOTE(review): the referenced guide defines ``train_step(self, data)`` with
    a single ``data`` argument and calls ``self(...)`` rather than a global
    ``model`` -- confirm this signature is what ``fit()`` actually invokes.
    """
    with tf.GradientTape() as tape:
        logits = model(x, training=True)
        trainable_vars = model.trainable_variables
        loss = custom_loss(y, logits)

    gradients = tape.gradient(loss, trainable_vars)
    self.optimizer.apply_gradients(zip(gradients, trainable_vars))
    self.compiled_metrics.update_state(y, logits)
    return {m.name: m.result() for m in self.metrics}
when I then fit the model as
# Compile with the optimizer and `custom_loss`, then train on (x, y).
model.compile(optimizer=optimizer, loss=custom_loss)
model.fit(x,y)
the model is trained; however, when I try to print something inside the train_step() function I see no output, which makes me wonder whether .fit() actually calls my customized function. Or is what I'm trying to print being printed somewhere in the backend that I can't see in the console?
You should use tf.print("...") inside train_step during training. I suggest keeping the original TensorFlow fit method and using a callback instead; callbacks give you several hooks that let you run your own code during training, at the end of each epoch, or during validation.
Example of a custom callback that prints the date and time at the beginning and at the end of each epoch:
import tensorflow as tf
from datetime import datetime
class date_callback(tf.keras.callbacks.Callback):
    """Callback that prints the current date/time when an epoch begins and ends."""

    def __init__(self):
        super().__init__()
        # other inits if necessary

    def on_epoch_begin(self, epoch, logs=None):
        """Print the current time at the start of an epoch."""
        tf.print("epoch begin at", datetime.now())

    def on_epoch_end(self, epoch, logs=None):
        """Print the current time at the end of an epoch."""
        tf.print("epoch end at", datetime.now())
...
...
# Run the model training with the date callback attached.
# The `...` stands for the usual fit() arguments, which are elided here.
dc = date_callback()
model.fit(..., callbacks = [dc])
I am using an RGB dataset for my x train and the loss is calculated in a dynamic loss function that gets the distances of pairs and compares them against the ideal distance dist_train. Here is the model:
class MyModel(Model):
    """Small network: Dense(3, relu) -> Flatten -> Dense(3, relu) -> Dense(2)."""

    def __init__(self):
        super().__init__()
        self.d1 = Dense(3, activation='relu')
        self.flatten = Flatten()
        self.d2 = Dense(3, activation='relu')
        self.d3 = Dense(2)

    def call(self, x):
        """Apply the layers in order and return the output of the last Dense(2)."""
        x = self.d1(x)
        x = self.flatten(x)
        x = self.d2(x)
        return self.d3(x)
# Create an instance of the model
model = MyModel()
# Adam optimizer with default settings.
optimizer = tf.keras.optimizers.Adam()
# Running means used to report the train and test losses.
train_loss = tf.keras.metrics.Mean(name='train_loss')
test_loss = tf.keras.metrics.Mean(name='test_loss')
@tf.function
def train_step(rgb):
    """Run one optimisation step of the global ``model`` on the input ``rgb``.

    The loss comes from ``tf_function(predictions)`` and is accumulated into
    the global ``train_loss`` metric.
    """
    with tf.GradientTape() as tape:
        predictions = model(rgb, training=True)
        # NOTE(review): `tf_function` wraps `tf.numpy_function`, which the
        # TensorFlow docs describe as not differentiable. That matches the
        # "No gradients provided for any variable" error raised below.
        loss = tf_function(predictions)

    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    train_loss(loss)
Here is the loss function and the tf.function wrapping it:
def mahal_loss(output):
    """Mean squared error between selected Mahalanobis distances and ``dist_train``.

    Pairwise Mahalanobis distances are computed between the rows of
    ``output``. For row ``i`` the entries listed in the module-level
    ``indices_train[i]`` are selected and compared against the module-level
    ``dist_train``.

    Args:
        output: 2-D array of observations, passed to
            ``scipy.spatial.distance.pdist``.

    Returns:
        The mean of ``(dist_train - selected_distances) ** 2``.
    """
    condensed = sp.spatial.distance.pdist(output, metric='mahalanobis')
    square = sp.spatial.distance.squareform(condensed, force='no', checks=True)
    # Mask entries that are exactly zero (this includes the diagonal).
    masked = np.ma.masked_array(square, mask=square == 0)
    # For every row, keep only the distances to that row's training pairs.
    new_distance = [masked[i, indices_train[i]] for i in range(len(masked))]
    return np.mean((dist_train - new_distance) ** 2)
@tf.function(input_signature=[tf.TensorSpec(None, tf.float32)])
def tf_function(pred):
    """Evaluate ``mahal_loss`` on ``pred`` through ``tf.numpy_function``.

    Args:
        pred: float32 tensor (shape left unspecified by the input signature).

    Returns:
        The result of ``mahal_loss`` as a tensor declared as ``tf.float32``.
    """
    y = tf.numpy_function(mahal_loss, [pred], tf.float32)
    return y
Running the model:
# Train for EPOCHS epochs, printing the accumulated losses after each one.
for epoch in range(EPOCHS):
    # Start every epoch with fresh running means.
    train_loss.reset_states()
    test_loss.reset_states()

    # One optimisation step per element of x_train.
    for i in x_train:
        train_step(i)

    print(
        f'Epoch {epoch + 1}, '
        f'Loss: {train_loss.result()}, '
        f'Test Loss: {test_loss.result()}, '
    )
I believe the reason I am running into problems lies in the dynamic loss function, as I need to calculate the distance between certain pairs to get the results I expect. This means that inside the loss function I have to calculate the mahalanobis distance of each pair to get the ones I will compare against the correct distances. The error I get is the following:
<ipython-input-23-0e975da5cbc2>:15 train_step *
optimizer.apply_gradients(zip(gradients, model.trainable_variables))
C:\Anaconda3\envs\colour_env\lib\site-packages\keras\optimizer_v2\optimizer_v2.py:622 apply_gradients **
grads_and_vars = optimizer_utils.filter_empty_gradients(grads_and_vars)
C:\Anaconda3\envs\colour_env\lib\site-packages\keras\optimizer_v2\utils.py:72 filter_empty_gradients
raise ValueError("No gradients provided for any variable: %s." %
ValueError: No gradients provided for any variable: ['my_model/dense/kernel:0', 'my_model/dense/bias:0', 'my_model/dense_1/kernel:0', 'my_model/dense_1/bias:0', 'my_model/dense_2/kernel:0', 'my_model/dense_2/bias:0'].
The problem is the use of tf.numpy_function.
Specifically, everything that happens inside the with tf.GradientTape() as tape statement has to be differentiable. Because the conversion between tf.Tensor and numpy array is not differentiable, tf.numpy_function cannot be used for loss computation:
Since the function takes numpy arrays, you cannot take gradients through a numpy_function. If you require something that is differentiable, please consider using tf.py_function.
(Source: here in the official documentation)
So either wrap the loss computation in tf.py_function, as this accepts tf.Tensors, or consider implementing it in TensorFlow. Here is an example for that.
I would like to keep track of the gradients over tensorboard.
However, since session run statements are not a thing anymore and the write_grads argument of tf.keras.callbacks.TensorBoard is deprecated, I would like to know how to keep track of gradients during training with Keras or TensorFlow 2.0.
My current approach is to create a new callback class for this purpose, but without success. Maybe someone else knows how to accomplish this kind of advanced stuff.
The code created for testing is shown below, but runs into errors independently of printing a gradient value to console or tensorboard.
import tensorflow as tf
from tensorflow.python.keras import backend as K
# Load MNIST and scale the pixel values by 1/255.
mnist = tf.keras.datasets.mnist
(x_train, y_train), (x_test, y_test) = mnist.load_data()
x_train, x_test = x_train / 255.0, x_test / 255.0

# Flatten -> Dense(128, relu) -> Dropout(0.2) -> Dense(10, softmax).
model = tf.keras.models.Sequential(
    [
        tf.keras.layers.Flatten(input_shape=(28, 28)),
        tf.keras.layers.Dense(128, activation='relu', name='dense128'),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(10, activation='softmax', name='dense10'),
    ]
)
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
class GradientCallback(tf.keras.callbacks.Callback):
    """Callback that reports bias gradients of the dense layers after each epoch."""

    # When True, gradients are printed; otherwise they are written as
    # histogram summaries.
    console = True

    def on_epoch_end(self, epoch, logs=None):
        """Compute gradients of the model's total loss and report them."""
        # Only trainable weights whose name contains both 'dense' and 'bias'.
        weights = [
            w
            for w in self.model.trainable_weights
            if 'dense' in w.name and 'bias' in w.name
        ]
        loss = self.model.total_loss
        optimizer = self.model.optimizer
        gradients = optimizer.get_gradients(loss, weights)
        for t in gradients:
            if self.console:
                print('Tensor: {}'.format(t.name))
                print('{}\n'.format(K.get_value(t)[:10]))
            else:
                tf.summary.histogram(t.name, data=t)
# Make a summary writer under ./metrics the default for tf.summary calls.
file_writer = tf.summary.create_file_writer("./metrics")
file_writer.set_as_default()
# write_grads has been removed
tensorboard_cb = tf.keras.callbacks.TensorBoard(histogram_freq=1, write_grads=True)
gradient_cb = GradientCallback()
# Train for 5 epochs with both callbacks attached.
model.fit(x_train, y_train, epochs=5, callbacks=[gradient_cb, tensorboard_cb])
Printing bias gradients to console (console parameter = True)
leads to: AttributeError: 'Tensor' object has no attribute 'numpy'
Writing to tensorboard (console parameter = False) creates:
TypeError: Using a tf.Tensor as a Python bool is not allowed. Use if t is not None: instead of if t: to test if a tensor is defined, and use TensorFlow ops such as tf.cond to execute subgraphs conditioned on the
value of a tensor.
To compute the gradients of the loss against the weights, use
# Record the forward computation on the tape, then differentiate the result
# with respect to the model's trainable weights.
# NOTE(review): the model is called on its own trainable weights here, which
# looks like a placeholder for the real loss computation -- confirm.
with tf.GradientTape() as tape:
    loss = model(model.trainable_weights)
tape.gradient(loss, model.trainable_weights)
This is (arguably poorly) documented on GradientTape.
We do not need to tape.watch the variable because trainable parameters are watched by default.
As a function, it can be written as
def gradient(model, x):
    """Return the gradient of ``model(x)`` with respect to the input ``x``.

    Args:
        model: Callable applied to the float32 tensor built from ``x``.
        x: Input data, converted with ``tf.convert_to_tensor``.

    Returns:
        The gradient as a NumPy array (via ``.numpy()``).
    """
    x_tensor = tf.convert_to_tensor(x, dtype=tf.float32)
    with tf.GradientTape() as t:
        # The input is a plain tensor, so it is watched explicitly.
        t.watch(x_tensor)
        loss = model(x_tensor)
    return t.gradient(loss, x_tensor).numpy()
Also have a look here: https://github.com/tensorflow/tensorflow/issues/31542#issuecomment-630495970
richardwth wrote a child class of Tensorboard.
I adapted it as follows:
class ExtendedTensorBoard(tf.keras.callbacks.TensorBoard):
    """TensorBoard callback that also logs gradient histograms.

    Gradients are computed on one batch of 100 elements taken from the
    module-level ``val_dataset``.
    """

    def _log_gradients(self, epoch):
        """Write one ``<weight name>_grads`` histogram per trainable weight."""
        writer = self._writers['train']

        with writer.as_default(), tf.GradientTape() as g:
            # here we use test data to calculate the gradients
            features, y_true = list(val_dataset.batch(100).take(1))[0]

            y_pred = self.model(features)  # forward-propagation
            loss = self.model.compiled_loss(y_true=y_true, y_pred=y_pred)  # calculate loss
            gradients = g.gradient(loss, self.model.trainable_weights)  # back-propagation

            # In eager mode, grads does not have name, so we get names from model.trainable_weights
            for weights, grads in zip(self.model.trainable_weights, gradients):
                tf.summary.histogram(
                    weights.name.replace(':', '_') + '_grads', data=grads, step=epoch)

        writer.flush()

    def on_epoch_end(self, epoch, logs=None):
        """Run the parent hook, then log gradients every ``histogram_freq`` epochs."""
        # This function overwrites the on_epoch_end in tf.keras.callbacks.TensorBoard
        # but we do need to run the original on_epoch_end, so here we use the super function.
        super(ExtendedTensorBoard, self).on_epoch_end(epoch, logs=logs)

        if self.histogram_freq and epoch % self.histogram_freq == 0:
            self._log_gradients(epoch)