I am trying to implement a very simple keras model that uses Knowledge Distillation [1] from another model.
Roughly, I need to replace the original loss L(y_true, y_pred) by L(y_true, y_pred)+L(y_teacher_pred, y_pred) where y_teacher_pred is the prediction of another model.
I've tried the following:
def create_student_model_with_distillation(teacher_model):
    inp = tf.keras.layers.Input(shape=(21,))
    model = tf.keras.models.Sequential()
    model.add(inp)
    model.add(...)
    model.add(tf.keras.layers.Dense(units=1))

    teacher_pred = teacher_model(inp)

    def my_loss(y_true, y_pred):
        loss = tf.keras.losses.mean_squared_error(y_true, y_pred)
        loss += tf.keras.losses.mean_squared_error(teacher_pred, y_pred)
        return loss

    model.compile(loss=my_loss, optimizer='adam')
    return model
However, when I try to call fit on my model, I get the following error:
TypeError: An op outside of the function building code is being passed
a "Graph" tensor. It is possible to have Graph tensors
leak out of the function building context by including a
tf.init_scope in your function building code.
How can I solve this issue?
Refs
[1] https://arxiv.org/abs/1503.02531
Actually, this blog post answers your question: keras blog
But in short: you should use the new TF2 API and call the teacher's predict before the tf.GradientTape() block:
def train_step(self, data):
    # Unpack data
    x, y = data

    # Forward pass of teacher
    teacher_predictions = self.teacher(x, training=False)

    with tf.GradientTape() as tape:
        # Forward pass of student
        student_predictions = self.student(x, training=True)

        # Compute losses
        student_loss = self.student_loss_fn(y, student_predictions)
        distillation_loss = self.distillation_loss_fn(
            tf.nn.softmax(teacher_predictions / self.temperature, axis=1),
            tf.nn.softmax(student_predictions / self.temperature, axis=1),
        )
        loss = self.alpha * student_loss + (1 - self.alpha) * distillation_loss
Related
This is my model class; I am trying to implement knowledge distillation here:
class Distiller(Model):
    def __init__(self, student, teacher):
        super(Distiller, self).__init__()
        self.teacher = teacher
        self.student = student

    def compile(
        self,
        optimizer,
        metrics,
        student_loss_fn,
        distillation_loss_fn,
        alpha=0.1,
        temperature=3,
    ):
        """Configure the distiller.

        Args:
            optimizer: Keras optimizer for the student weights
            metrics: Keras metrics for evaluation
            student_loss_fn: Loss function of difference between student
                predictions and ground-truth
            distillation_loss_fn: Loss function of difference between soft
                student predictions and soft teacher predictions
            alpha: weight to student_loss_fn and 1-alpha to distillation_loss_fn
            temperature: Temperature for softening probability distributions.
                Larger temperature gives softer distributions.
        """
        super(Distiller, self).compile(optimizer=optimizer, metrics=metrics)
        self.student_loss_fn = student_loss_fn
        self.distillation_loss_fn = distillation_loss_fn
        self.alpha = alpha
        self.temperature = temperature

    def train_step(self, data):
        # Unpack data
        x, y = data

        # Forward pass of teacher
        teacher_predictions = self.teacher(x, training=False)

        with tf.GradientTape() as tape:
            # Forward pass of student
            student_predictions = self.student(x, training=True)

            # Compute losses
            student_loss = self.student_loss_fn(y, student_predictions)

            # Compute scaled distillation loss from https://arxiv.org/abs/1503.02531
            # The magnitudes of the gradients produced by the soft targets scale
            # as 1/T^2, multiply them by T^2 when using both hard and soft targets.
            distillation_loss = (
                self.distillation_loss_fn(
                    tf.nn.softmax(teacher_predictions / self.temperature, axis=1),
                    tf.nn.softmax(student_predictions / self.temperature, axis=1),
                )
                * self.temperature**2
            )

            loss = self.alpha * student_loss + (1 - self.alpha) * distillation_loss

        # Compute gradients
        trainable_vars = self.student.trainable_variables
        gradients = tape.gradient(loss, trainable_vars)

        # Update weights
        self.optimizer.apply_gradients(zip(gradients, trainable_vars))

        # Update the metrics configured in `compile()`.
        self.compiled_metrics.update_state(y, student_predictions)

        # Return a dict of performance
        results = {m.name: m.result() for m in self.metrics}
        results.update(
            {"student_loss": student_loss, "distillation_loss": distillation_loss}
        )
        return results

    def test_step(self, data):
        # Unpack the data
        x, y = data

        # Compute predictions
        y_prediction = self.student(x, training=False)

        # Calculate the loss
        student_loss = self.student_loss_fn(y, y_prediction)

        # Update the metrics.
        self.compiled_metrics.update_state(y, y_prediction)

        # Return a dict of performance
        results = {m.name: m.result() for m in self.metrics}
        results.update({"student_loss": student_loss})
        return results
# Initialize and compile distiller
distiller = Distiller(student=student_model, teacher=model1)
distiller.compile(
    optimizer='adam',
    metrics=['accuracy'],
    student_loss_fn=CategoricalCrossentropy(from_logits=True),
    distillation_loss_fn=KLDivergence(),
    alpha=0.1,
    temperature=40,
)
This is how I am training:
distiller_best_model = SaveBestModel()

'''
teacher_history = teacher_model.fit(train_generator, validation_data=val_generator,
                                    steps_per_epoch=batch_size, epochs=EPOCHS,
                                    callbacks=[teacher_best_model])
'''

# Distill teacher to student
distiller_history = distiller.fit(train_generator,
                                  validation_data=val_generator,
                                  steps_per_epoch=batch_size,
                                  epochs=EPOCHS,
                                  callbacks=[distiller_best_model])
For evaluation purposes, I am doing this and it is working fine:
distiller.set_weights(distiller_best_model.best_weights)
# Evaluate student on test dataset
distiller_test = distiller.evaluate(test_generator)
But I have to make a classification report, and for that I need to call model.predict(X_test).
However, whenever I run
distiller.predict(X_test)
it throws an error.
Can anyone help me fix this? Otherwise I will not be able to make the classification report. Thanks in advance :)
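The error text isn't shown above, but a likely cause (an assumption on my part, since the Distiller class above never defines call()) is that a subclassed Model has no forward pass for predict() to run. A minimal sketch of a fix is to delegate the forward pass to the student:

    def call(self, inputs, training=False):
        # Added to the Distiller class (sketch): predict()/evaluate() route through call()
        return self.student(inputs, training=training)

With that in place, something like np.argmax(distiller.predict(X_test), axis=1) should give the predicted class indices needed for a classification report (the student outputs logits here, but argmax over logits gives the same labels as argmax over probabilities).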
I am using an RGB dataset for my x_train, and the loss is calculated in a dynamic loss function that gets the distances of pairs and compares them against the ideal distances dist_train. Here is the model:
class MyModel(Model):
    def __init__(self):
        super(MyModel, self).__init__()
        self.d1 = Dense(3, activation='relu')
        self.flatten = Flatten()
        self.d2 = Dense(3, activation='relu')
        self.d3 = Dense(2)

    def call(self, x):
        x = self.d1(x)
        x = self.flatten(x)
        x = self.d2(x)
        return self.d3(x)

# Create an instance of the model
model = MyModel()

optimizer = tf.keras.optimizers.Adam()

train_loss = tf.keras.metrics.Mean(name='train_loss')
test_loss = tf.keras.metrics.Mean(name='test_loss')
@tf.function
def train_step(rgb):
    with tf.GradientTape() as tape:
        predictions = model(rgb, training=True)
        loss = tf_function(predictions)
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    train_loss(loss)
Here is the loss function and the tf.function wrapping it:
def mahal_loss(output):
    mahal = sp.spatial.distance.pdist(output, metric='mahalanobis')
    mahal = sp.spatial.distance.squareform(mahal, force='no', checks=True)
    new_distance = []
    mahal = np.ma.masked_array(mahal, mask=mahal==0)
    for i in range(len(mahal)):
        pw_dist = mahal[i, indices_train[i]]
        new_distance.append(pw_dist)
    mahal_loss = np.mean((dist_train - new_distance)**2)
    return mahal_loss
@tf.function(input_signature=[tf.TensorSpec(None, tf.float32)])
def tf_function(pred):
    y = tf.numpy_function(mahal_loss, [pred], tf.float32)
    return y
Running the model:
for epoch in range(EPOCHS):
    train_loss.reset_states()
    test_loss.reset_states()

    for i in x_train:
        train_step(i)

    print(
        f'Epoch {epoch + 1}, '
        f'Loss: {train_loss.result()}, '
        f'Test Loss: {test_loss.result()}, '
    )
I believe the reason I am running into problems lies in the dynamic loss function, as I need to calculate the distance between certain pairs to get the results I expect. This means that inside the loss function I have to calculate the Mahalanobis distance of each pair to get the ones I will compare against the correct distances. The error I get is the following:
<ipython-input-23-0e975da5cbc2>:15 train_step *
optimizer.apply_gradients(zip(gradients, model.trainable_variables))
C:\Anaconda3\envs\colour_env\lib\site-packages\keras\optimizer_v2\optimizer_v2.py:622 apply_gradients **
grads_and_vars = optimizer_utils.filter_empty_gradients(grads_and_vars)
C:\Anaconda3\envs\colour_env\lib\site-packages\keras\optimizer_v2\utils.py:72 filter_empty_gradients
raise ValueError("No gradients provided for any variable: %s." %
ValueError: No gradients provided for any variable: ['my_model/dense/kernel:0', 'my_model/dense/bias:0', 'my_model/dense_1/kernel:0', 'my_model/dense_1/bias:0', 'my_model/dense_2/kernel:0', 'my_model/dense_2/bias:0'].
The problem is the use of tf.numpy_function.
Specifically, everything that happens inside the with tf.GradientTape() as tape statement has to be differentiable. Because the conversion between tf.Tensor and numpy array is not differentiable, tf.numpy_function cannot be used for loss computation:
Since the function takes numpy arrays, you cannot take gradients through a numpy_function. If you require something that is differentiable, please consider using tf.py_function.
(Source: here in the official documentation)
So either wrap the loss computation in tf.py_function, as this accepts tf.Tensors, or consider implementing it directly in TensorFlow; a rough sketch of the latter follows.
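For illustration only (this is my own sketch, not a confirmed implementation of the questioner's metric): a TensorFlow-only version of the pairwise-distance part of the loss could look like the following. It uses plain Euclidean distances instead of the Mahalanobis metric, and assumes indices_train holds, for each sample i, the index of the sample it should be compared with, and that dist_train is the corresponding vector of target distances:

import tensorflow as tf

def pairwise_distance_loss(pred, indices_train, dist_train):
    # Pairwise squared Euclidean distances via ||a||^2 - 2 a.b + ||b||^2
    sq_norms = tf.reduce_sum(tf.square(pred), axis=1, keepdims=True)          # (N, 1)
    sq_dists = sq_norms - 2.0 * tf.matmul(pred, pred, transpose_b=True) + tf.transpose(sq_norms)
    dists = tf.sqrt(tf.maximum(sq_dists, 1e-12))                              # (N, N)

    # For each sample i, pick its distance to the partner indices_train[i]
    idx = tf.cast(indices_train, tf.int32)
    rows = tf.range(tf.shape(pred)[0], dtype=tf.int32)
    selected = tf.gather_nd(dists, tf.stack([rows, idx], axis=1))             # (N,)

    # Mean squared error against the target distances
    return tf.reduce_mean(tf.square(tf.cast(dist_train, pred.dtype) - selected))

Because every operation here is a TensorFlow op, tape.gradient can propagate through the loss and the optimizer no longer receives empty gradients.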
I am trying to train an image denoiser network with Keras (TensorFlow 2). For the loss function, I want to use something like (a1 * L1_loss + a2 * L2_loss), where a1 and a2 are trainable, meaning that after I give them initial values they get updated during each training iteration. But I have been stuck on this for a while and don't know how I should implement it.
Here is some example code:
model_input = Input(shape=self.input_shape)

l1_weight = tf.Variable(0.5, trainable=True, name='L1_Loss_weight')
l2_weight = tf.Variable(0.5, trainable=True, name='L2_Loss_weight')

model_output = Conv3D(filters=self.filters, kernel_size=self.kernel_size, padding='same')(model_input)

self.model = Model(inputs=model_input, outputs=model_output)

optimizer = tf.keras.optimizers.SGD()
model_loss = mixed_loss(L1_weight=l1_weight, L2_weight=l2_weight)
self.model.compile(optimizer=optimizer, loss=model_loss)
where my loss function is defined as
def mixed_loss(L1_weight, L2_weight):
    def mixed(y_true, y_pred):
        return (L1_weight * mean_absolute_error(y_true, y_pred)
                + L2_weight * mean_squared_error(y_true, y_pred))
    return mixed
And then I use the fit() function to pass a tf.data.Dataset containing the training data to do the training.
Although I can add the two weight parameters this way, they are untrainable and don't change during training. I would really appreciate some hints or examples if anyone has ideas about this problem. Any help is appreciated!
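There is no way to make these weights trainable through compile(loss=...) alone, because nothing computes gradients for variables that only appear inside the loss closure of a compiled model; and a plain weighted sum a1*L1 + a2*L2 with unconstrained trainable a1, a2 would simply be driven towards zero by gradient descent. A sketch of one workable approach (my own illustration, not a standard Keras recipe) is to learn the weights as log-variances, in the spirit of multi-task uncertainty weighting, inside a custom train_step:

import tensorflow as tf
from tensorflow import keras

class WeightedLossModel(keras.Model):
    """Wraps a base model and learns the L1/L2 mixing weights as log-variances."""

    def __init__(self, base_model, **kwargs):
        super().__init__(**kwargs)
        self.base_model = base_model
        self.s1 = tf.Variable(0.0, trainable=True, name='log_var_l1')
        self.s2 = tf.Variable(0.0, trainable=True, name='log_var_l2')

    def call(self, inputs, training=False):
        return self.base_model(inputs, training=training)

    def train_step(self, data):
        x, y = data
        with tf.GradientTape() as tape:
            y_pred = self(x, training=True)
            l1 = tf.reduce_mean(tf.abs(y - y_pred))
            l2 = tf.reduce_mean(tf.square(y - y_pred))
            # exp(-s) acts as the positive loss weight; the +s term keeps it from collapsing to zero
            loss = tf.exp(-self.s1) * l1 + self.s1 + tf.exp(-self.s2) * l2 + self.s2
        grads = tape.gradient(loss, self.trainable_variables)  # includes s1 and s2
        self.optimizer.apply_gradients(zip(grads, self.trainable_variables))
        return {'loss': loss, 'l1': l1, 'l2': l2,
                'w1': tf.exp(-self.s1), 'w2': tf.exp(-self.s2)}

The wrapper is compiled with just an optimizer, e.g. WeightedLossModel(self.model).compile(optimizer='sgd'), and trained with fit() as usual; the logged w1/w2 show how the learned weights evolve.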
I am working on a multiple-output model in Keras. I've implemented two custom metrics, auroc and auprc, that are passed to the compile method of the Keras model:
def auc(y_true, y_pred, curve='PR'):
    score, up_opt = tf.compat.v1.metrics.auc(y_true, y_pred, curve=curve, summation_method="careful_interpolation")
    K.get_session().run(tf.local_variables_initializer())
    with tf.control_dependencies([up_opt]):
        score = tf.identity(score)
    return score

def auprc(y_true, y_pred):
    return auc(y_true, y_pred, curve='PR')

def auroc(y_true, y_pred):
    return auc(y_true, y_pred, curve='ROC')

mlp_model.compile(loss=...,
                  optimizer=...,
                  metrics=[auprc, auroc])
Using this method, I obtain auprc/auroc values for every output, but to optimize my hyperparameters with a Bayesian optimizer I need a single metric (e.g. the average or the sum of auprc over all outputs). I can't figure out how to join my metrics into a single one.
EDIT: here is an example of the desired result.
Currently, for every epoch the following metrics are printed:
out1_auprc: 0.0267 - out2_auprc: 0.0277 - out3_auprc: 0.0294
where out1, out2, out3 are my neural network outputs. I would like to obtain something like:
average_auprc: 0.0279 - out1_auprc: 0.0267 - out2_auprc: 0.0277 - out3_auprc: 0.0294
I am using Keras Tuner for Bayesian Optimization.
Any help is appreciated, thank you.
I worked around the problem by creating a custom callback:
from tensorflow.keras.callbacks import Callback

class MergeMetrics(Callback):

    def __init__(self, **kargs):
        super(MergeMetrics, self).__init__(**kargs)

    def on_epoch_begin(self, epoch, logs={}):
        return

    def on_epoch_end(self, epoch, logs={}):
        logs['merge_metrics'] = 0.5 * logs["y1_mse"] + 0.5 * logs["y2_mse"]
I use this callback to merge two metrics coming from two different outputs. I use a simple problem as an example, but you can easily integrate it into your own problem and extend it to a validation set.
This is the dummy example:
import numpy as np
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model

X = np.random.uniform(0, 1, (1000, 10))
y1 = np.random.uniform(0, 1, 1000)
y2 = np.random.uniform(0, 1, 1000)

inp = Input((10,))
x = Dense(32, activation='relu')(inp)
out1 = Dense(1, name='y1')(x)
out2 = Dense(1, name='y2')(x)

m = Model(inp, [out1, out2])
m.compile('adam', 'mae', metrics='mse')

checkpoint = MergeMetrics()
m.fit(X, [y1, y2], epochs=10, callbacks=[checkpoint])
The printed output:
loss: ..... y1_mse: 0.0863 - y2_mse: 0.0875 - merge_metrics: 0.0869
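Since the original goal was Bayesian optimization with Keras Tuner, the merged key written by the callback can then be used as the tuner objective. A minimal sketch, assuming build_model is your hypermodel-building function and that the callback is extended to also write a val_merge_metrics entry from the val_* keys when validation data is used:

import keras_tuner as kt

tuner = kt.BayesianOptimization(
    build_model,
    objective=kt.Objective('val_merge_metrics', direction='min'),  # use 'max' for AUPRC-style metrics
    max_trials=20,
)
tuner.search(X, [y1, y2],
             validation_split=0.2,
             epochs=10,
             callbacks=[MergeMetrics()])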
I have only one output for my model, but I would like to combine two different loss functions:
def get_model():
    # create the model here
    model = Model(inputs=image, outputs=output)

    alpha = 0.2
    model.compile(loss=[mse, gse],
                  loss_weights=[1 - alpha, alpha],
                  ...)
but it complains that I need to have two outputs because I defined two losses:
ValueError: When passing a list as loss, it should have one entry per model outputs.
The model has 1 outputs, but you passed loss=[<function mse at 0x0000024D7E1FB378>, <function gse at 0x0000024D7E1FB510>]
Can I possibly write my final loss function without having to create another loss function (because that would restrict me from changing the alpha outside the loss function)?
How do I do something like (1-alpha)*mse + alpha*gse?
Update:
Both my loss functions have the same signature as any built-in Keras loss function: they take y_true and y_pred and give back a tensor for the loss (which can be reduced to a scalar using K.mean()). But I believe how these loss functions are defined shouldn't affect the answer, as long as they return valid losses.
def gse(y_true, y_pred):
    # some tensor operation on y_pred and y_true
    return K.mean(K.square(y_pred - y_true), axis=-1)
Specify a custom function for the loss:
model = Model(inputs=image, outputs=output)
alpha = 0.2
model.compile(
loss=lambda y_true, y_pred: (1 - alpha) * mse(y_true, y_pred) + alpha * gse(y_true, y_pred),
...)
Or, if you don't want an ugly lambda, make it into an actual function:
def my_loss(y_true, y_pred):
    return (1 - alpha) * mse(y_true, y_pred) + alpha * gse(y_true, y_pred)

model = Model(inputs=image, outputs=output)
alpha = 0.2
model.compile(loss=my_loss, ...)
EDIT:
If your alpha is not some global constant, you can have a "loss function factory":
def make_my_loss(alpha):
    def my_loss(y_true, y_pred):
        return (1 - alpha) * mse(y_true, y_pred) + alpha * gse(y_true, y_pred)
    return my_loss

model = Model(inputs=image, outputs=output)
alpha = 0.2
my_loss = make_my_loss(alpha)
model.compile(loss=my_loss, ...)
Yes, define your own custom loss function and pass that to the loss argument upon compiling:
def custom_loss(y_true, y_pred):
    return (1 - alpha) * K.mean(K.square(y_true - y_pred)) + alpha * gse
(Not sure what you mean by gse.) It can be helpful to have a look at how the vanilla losses are implemented in Keras: https://github.com/keras-team/keras/blob/master/keras/losses.py
The loss function should be a single function; you are giving your model a list of two functions. Try something like:

def mse(y_true, y_pred):
    return K.mean(K.square(y_pred - y_true), axis=-1)

model.compile(loss=lambda y_true, y_pred: (1 - alpha) * mse(y_true, y_pred) + alpha * gse(y_true, y_pred),
              ...)
While this answer doesn't directly address the original question, I thought of writing it because the same error occurs when trying to load a Keras model that has a custom loss using keras.models.load_model, and it hasn't been properly answered anywhere. Specifically, following the VAE example code in the Keras GitHub repository, this error occurs when loading the VAE model after it has been saved with model.save.
The solution is to save only the weights using vae.save_weights('file.h5') instead of saving the full model. However, you would have to build and compile the model again before loading the weights using vae.load_weights('file.h5').
Following is an example implementation.
class VAE():

    def build_model(self):  # latent_dim and intermediate_dim can be passed as arguments

        def sampling(args):
            """Reparameterization trick by sampling from an isotropic unit Gaussian.

            # Arguments
                args (tensor): mean and log of variance of Q(z|X)

            # Returns
                z (tensor): sampled latent vector
            """
            z_mean, z_log_var = args
            batch = K.shape(z_mean)[0]
            dim = K.int_shape(z_mean)[1]
            # by default, random_normal has mean = 0 and std = 1.0
            epsilon = K.random_normal(shape=(batch, dim))
            return z_mean + K.exp(0.5 * z_log_var) * epsilon

        # original_dim = self.no_features
        # intermediate_dim = 256
        latent_dim = 8

        inputs = Input(shape=(self.no_features,))
        x = Dense(256, activation='relu')(inputs)
        x = Dense(128, activation='relu')(x)
        x = Dense(64, activation='relu')(x)
        z_mean = Dense(latent_dim, name='z_mean')(x)
        z_log_var = Dense(latent_dim, name='z_log_var')(x)

        # use reparameterization trick to push the sampling out as input
        # note that "output_shape" isn't necessary with the TensorFlow backend
        z = Lambda(sampling, output_shape=(latent_dim,), name='z')([z_mean, z_log_var])

        # instantiate encoder model
        encoder = Model(inputs, [z_mean, z_log_var, z], name='encoder')

        # build decoder model
        latent_inputs = Input(shape=(latent_dim,), name='z_sampling')
        x = Dense(32, activation='relu')(latent_inputs)
        x = Dense(48, activation='relu')(x)
        x = Dense(64, activation='relu')(x)
        outputs = Dense(self.no_features, activation='linear')(x)

        # instantiate decoder model
        decoder = Model(latent_inputs, outputs, name='decoder')

        # instantiate VAE model
        outputs = decoder(encoder(inputs)[2])
        VAE = Model(inputs, outputs, name='vae_mlp')

        reconstruction_loss = mse(inputs, outputs)
        reconstruction_loss *= self.no_features
        kl_loss = 1 + z_log_var - K.square(z_mean) - K.exp(z_log_var)
        kl_loss = K.sum(kl_loss, axis=-1)
        kl_loss *= -0.5
        vae_loss = K.mean(reconstruction_loss + kl_loss)

        VAE.add_loss(vae_loss)
        VAE.compile(optimizer='adam')
        return VAE
Now,
vae_cls = VAE()
vae = vae_cls.build_model()
# vae.fit()
vae.save_weights('file.h5')
Load the model and predict (if in a different script, you need to import the VAE class):
vae_cls = VAE()
vae = vae_cls.build_model()
vae.load_weights('file.h5')
# vae.predict()
Finally, the difference [ref]:
Keras model.save saves:
- Model weights
- Model architecture
- Model compilation details (loss function(s) and metrics)
- Model optimizer and regularizer states
Keras model.save_weights saves only the model weights, and model.to_json() saves only the model architecture.
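For illustration (not part of the answer's code), the weights-plus-architecture round trip could look like this; note that if the model contains custom objects, model_from_json needs them passed in via custom_objects, and Lambda layers like the one in the VAE above may not deserialize cleanly at all:

from tensorflow.keras.models import model_from_json

json_config = model.to_json()        # architecture only
model.save_weights('weights.h5')     # weights only

restored = model_from_json(json_config)
restored.load_weights('weights.h5')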
Hope this helps someone experimenting with variational autoencoders.
Combine MAE and RMSE together:
import tensorflow as tf
from tensorflow import keras
def loss_fn_mae_rmse(y_true, y_pred, alpha=0.8):
    mae = keras.losses.MeanAbsoluteError()
    mse = keras.losses.MeanSquaredError()
    return alpha * mae(y_true, y_pred) + (1 - alpha) * tf.sqrt(mse(y_true, y_pred))

model = keras.Model(inputs=..., outputs=...)
opt = keras.optimizers.Adam(learning_rate=1e-4)
model.compile(optimizer=opt, loss=loss_fn_mae_rmse, metrics=['mae'])
If you later want to load this model after it has been trained and saved to disk:
model = keras.models.load_model('path/to/model.h5', custom_objects={'loss_fn_mae_rmse': loss_fn_mae_rmse})