Nested Gradient Tape in function (TF2.0)

I am trying to implement MAML. For this I need a copy of my model (model_copy) to be trained for one step,
and then I need my meta_model to be trained with the loss of model_copy.
I would like to do the training of model_copy in a function.
If I move my code into the function, I don't get proper gradients_meta (they are all None).
It seems that the graphs are unconnected. How can I connect the graphs?
Any idea what I am doing wrong? I watch a lot of variables, but that doesn't seem to make a difference.
Here is the code to reproduce this issue:
import tensorflow as tf
import tensorflow.keras as keras
import tensorflow.keras.backend as keras_backend

def copy_model(model):
    copied_model = keras.Sequential()
    copied_model.add(keras.layers.Dense(5, input_shape=(1,)))
    copied_model.add(keras.layers.Dense(1))
    copied_model.set_weights(model.get_weights())
    return copied_model

def compute_loss(model, x, y):
    logits = model(x)  # prediction of my model
    mse = keras_backend.mean(keras.losses.mean_squared_error(y, logits))  # loss between prediction and label/truth
    return mse, logits

# meta_model to learn in outer gradient tape
meta_model = keras.Sequential()
meta_model.add(keras.layers.Dense(5, input_shape=(1,)))
meta_model.add(keras.layers.Dense(1))

# optimizer for training
optimizer = keras.optimizers.Adam()

# function to calculate model_copy's params
def do_calc(x, y, meta_model):
    with tf.GradientTape() as gg:
        model_copy = copy_model(meta_model)
        gg.watch(x)
        gg.watch(meta_model.trainable_variables)
        gg.watch(model_copy.trainable_variables)
        loss, _ = compute_loss(model_copy, x, y)
    gradient = gg.gradient(loss, model_copy.trainable_variables)
    optimizer.apply_gradients(zip(gradient, model_copy.trainable_variables))
    return model_copy

# inputs for training
x = tf.constant(3.0, shape=(1, 1, 1))
y = tf.constant(3.0, shape=(1, 1, 1))

with tf.GradientTape() as g:
    g.watch(x)
    g.watch(y)
    model_copy = do_calc(x, y, meta_model)
    g.watch(model_copy.trainable_variables)
    # calculate loss of model_copy
    test_loss, _ = compute_loss(model_copy, x, y)

# build gradients for meta_model update
gradients_meta = g.gradient(test_loss, meta_model.trainable_variables)
# gradients are always None!?
optimizer.apply_gradients(zip(gradients_meta, meta_model.trainable_variables))
Thank you in advance for any help.

I found a solution:
I needed to "connect" meta-model and model-copy somehow.
Can anybody explain why this works and how I would achieve that using a "proper" optimizer?
import tensorflow as tf
import tensorflow.keras as keras
import tensorflow.keras.backend as keras_backend

def copy_model(model):
    copied_model = keras.Sequential()
    copied_model.add(keras.layers.Dense(5, input_shape=(1,)))
    copied_model.add(keras.layers.Dense(1))
    copied_model.set_weights(model.get_weights())
    return copied_model

def compute_loss(model, x, y):
    logits = model(x)  # prediction of my model
    mse = keras_backend.mean(keras.losses.mean_squared_error(y, logits))  # loss between prediction and label/truth
    return mse, logits

# meta_model to learn in outer gradient tape
meta_model = keras.Sequential()
meta_model.add(keras.layers.Dense(5, input_shape=(1,)))
meta_model.add(keras.layers.Dense(1))

# optimizer for training
optimizer = keras.optimizers.Adam()

# function to calculate model_copy's params
def do_calc(meta_model, x, y, gg, alpha=0.01):
    model_copy = copy_model(meta_model)
    loss, _ = compute_loss(model_copy, x, y)
    gradients = gg.gradient(loss, model_copy.trainable_variables)
    k = 0
    for layer in range(len(model_copy.layers)):
        # calculate adapted parameters with gradient descent:
        # \theta_i' = \theta - \alpha * gradients
        model_copy.layers[layer].kernel = tf.subtract(meta_model.layers[layer].kernel,
                                                      tf.multiply(alpha, gradients[k]))
        model_copy.layers[layer].bias = tf.subtract(meta_model.layers[layer].bias,
                                                    tf.multiply(alpha, gradients[k + 1]))
        k += 2
    return model_copy

with tf.GradientTape() as g:
    # inputs for training
    x = tf.constant(3.0, shape=(1, 1, 1))
    y = tf.constant(3.0, shape=(1, 1, 1))
    with tf.GradientTape() as gg:
        model_copy = do_calc(meta_model, x, y, gg)
    # calculate loss of model_copy
    test_loss, _ = compute_loss(model_copy, x, y)

# build gradients for meta_model update
gradients_meta = g.gradient(test_loss, meta_model.trainable_variables)
# gradients work. Why???
optimizer.apply_gradients(zip(gradients_meta, meta_model.trainable_variables))

Converting the tensors to numpy and using set_weights() only copies the updated parameter values; the numpy round-trip creates new nodes in the TF2 graph with no gradient history, so the loss of the copied model can no longer be differentiated with respect to the meta-model's variables.
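A minimal sketch of that distinction (my own illustration, not from the answer above): a tensor computed from a watched variable stays connected to the tape, while a numpy round-trip, which is essentially what set_weights() performs, produces a fresh constant with no gradient history.

import tensorflow as tf

v = tf.Variable(2.0)
with tf.GradientTape() as tape:
    connected = v * 3.0                       # tensor, still depends on v
    severed = tf.constant(connected.numpy())  # numpy round-trip: no history
    loss = connected + severed
# Only the connected path contributes: d(3v + const)/dv = 3.0
print(tape.gradient(loss, v))  # tf.Tensor(3.0, shape=(), dtype=float32)

This is also why applying a stateful optimizer's apply_gradients() to the copy breaks the outer gradient: it updates tf.Variables by assignment, which the tape does not differentiate through, whereas the tensor assignments to layer.kernel and layer.bias in the working version keep the adapted parameters expressed as functions of the meta-model's weights.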

Related

Custom Dynamic Loss function: No gradients provided for any variable

I am using an RGB dataset for my x_train, and the loss is calculated in a dynamic loss function that gets the distances of pairs and compares them against the ideal distance dist_train. Here is the model:
import numpy as np
import scipy as sp
import scipy.spatial  # provides sp.spatial.distance
import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.layers import Dense, Flatten

class MyModel(Model):
    def __init__(self):
        super(MyModel, self).__init__()
        self.d1 = Dense(3, activation='relu')
        self.flatten = Flatten()
        self.d2 = Dense(3, activation='relu')
        self.d3 = Dense(2)

    def call(self, x):
        x = self.d1(x)
        x = self.flatten(x)
        x = self.d2(x)
        return self.d3(x)

# Create an instance of the model
model = MyModel()
optimizer = tf.keras.optimizers.Adam()
train_loss = tf.keras.metrics.Mean(name='train_loss')
test_loss = tf.keras.metrics.Mean(name='test_loss')

@tf.function
def train_step(rgb):
    with tf.GradientTape() as tape:
        predictions = model(rgb, training=True)
        loss = tf_function(predictions)
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    train_loss(loss)
Here is the loss function and the tf.function wrapping it:
def mahal_loss(output):
    mahal = sp.spatial.distance.pdist(output, metric='mahalanobis')
    mahal = sp.spatial.distance.squareform(mahal, force='no', checks=True)
    new_distance = []
    mahal = np.ma.masked_array(mahal, mask=mahal == 0)
    for i in range(len(mahal)):
        pw_dist = mahal[i, indices_train[i]]
        new_distance.append(pw_dist)
    mahal_loss = np.mean((dist_train - new_distance)**2)
    return mahal_loss

@tf.function(input_signature=[tf.TensorSpec(None, tf.float32)])
def tf_function(pred):
    y = tf.numpy_function(mahal_loss, [pred], tf.float32)
    return y
Running the model:
for epoch in range(EPOCHS):
    train_loss.reset_states()
    test_loss.reset_states()
    for i in x_train:
        train_step(i)
    print(
        f'Epoch {epoch + 1}, '
        f'Loss: {train_loss.result()}, '
        f'Test Loss: {test_loss.result()}, '
    )
I believe the reason I am running into problems lies in the dynamic loss function, as I need to calculate the distance between certain pairs to get the results I expect. This means that inside the loss function I have to calculate the mahalanobis distance of each pair to get the ones I will compare against the correct distances. The error I get is the following:
<ipython-input-23-0e975da5cbc2>:15 train_step  *
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
C:\Anaconda3\envs\colour_env\lib\site-packages\keras\optimizer_v2\optimizer_v2.py:622 apply_gradients  **
    grads_and_vars = optimizer_utils.filter_empty_gradients(grads_and_vars)
C:\Anaconda3\envs\colour_env\lib\site-packages\keras\optimizer_v2\utils.py:72 filter_empty_gradients
    raise ValueError("No gradients provided for any variable: %s." %
ValueError: No gradients provided for any variable: ['my_model/dense/kernel:0', 'my_model/dense/bias:0', 'my_model/dense_1/kernel:0', 'my_model/dense_1/bias:0', 'my_model/dense_2/kernel:0', 'my_model/dense_2/bias:0'].
The problem is the use of tf.numpy_function.
Specifically, everything that happens inside the with tf.GradientTape() as tape statement has to be differentiable. Because the conversion between tf.Tensor and numpy array is not differentiable, tf.numpy_function cannot be used for loss computation:
    "Since the function takes numpy arrays, you cannot take gradients through a numpy_function. If you require something that is differentiable, please consider using tf.py_function."
(Source: the official documentation)
So either wrap the loss computation in tf.py_function, which accepts tf.Tensors, or consider implementing the loss directly in TensorFlow.
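As a rough sketch of the second option (my own illustration, untested against the original data): the pair selection and distance computation can be expressed with TF ops so the tape can differentiate through them. For brevity this uses Euclidean instead of Mahalanobis distance, and assumes indices_train (int32) and dist_train are precomputed tensors as in the question:

import tensorflow as tf

def tf_pairwise_loss(pred, indices_train, dist_train):
    # Squared pairwise distances via ||a - b||^2 = ||a||^2 + ||b||^2 - 2 a.b
    sq_norms = tf.reduce_sum(tf.square(pred), axis=1, keepdims=True)
    sq_dists = sq_norms - 2.0 * tf.matmul(pred, pred, transpose_b=True) + tf.transpose(sq_norms)
    dists = tf.sqrt(tf.maximum(sq_dists, 1e-12))  # clamp to avoid NaN gradients at 0
    # Select the same pairs the numpy version picked out
    row_idx = tf.range(tf.shape(pred)[0])
    selected = tf.gather_nd(dists, tf.stack([row_idx, indices_train], axis=1))
    return tf.reduce_mean(tf.square(dist_train - selected))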

Create a neural network to classify mnist dataset without using keras. Error: tape is required when a Tensor loss is passed

Here is the code. I have implemented the forward pass, but I receive an error every time I run it and I don't know what the problem is. I first create batches of the features and labels, do the forward pass, and try to use the Keras SGD optimizer.
This is the error I get: tape is required when a Tensor loss is passed.
And this is my code:
import tensorflow as tf
from tensorflow.keras.utils import to_categorical
import numpy as np

def batches(batch_size, features, labels):
    """
    Create batches of features and labels
    :param batch_size: The batch size
    :param features: List of features
    :param labels: List of labels
    :return: Batches of (Features, Labels)
    """
    assert len(features) == len(labels)
    output_batches = []
    sample_size = len(features)
    features = tf.Variable(features, dtype='float32')
    for start_i in range(0, sample_size, batch_size):
        end_i = start_i + batch_size
        batch = (features[start_i:end_i], labels[start_i:end_i])
        output_batches.append(batch)
    return output_batches

def get_logits(features, weights, biases):
    # network's forward pass: multiply inputs with weights and add biases
    return tf.add(tf.matmul(features, weights), biases)

def get_cost(logits, labels):
    # returns the cost of the pass
    return tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=labels))

def vectorize(features):
    # reshapes the features to a vector for input
    return features.reshape(features.shape[0], features.shape[1] * features.shape[2])

(train_x, train_y), (test_x, test_y) = tf.keras.datasets.mnist.load_data()
train_x, test_x = train_x.astype('float32'), test_x.astype('float32')
train_y, test_y = to_categorical(train_y, 10), to_categorical(test_y, 10)
train_x = vectorize(train_x)

n_inputs = 28 * 28
n_classes = 10
weights = tf.Variable(tf.random.normal([n_inputs, n_classes]), dtype='float32', name='weights')
biases = tf.Variable(tf.random.normal([n_classes]), dtype='float32', name='biases')

batch_list = batches(32, train_x, train_y)
for x, y in batch_list:
    logits = get_logits(x, weights, biases)
    cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=y))
    opt = tf.keras.optimizers.SGD(learning_rate=0.001)
    optimizer = opt.minimize(loss=cost)
This is because your loss is a Tensor. In optimizer.minimize() the loss argument can be a Tensor or a callable. If it is a callable, loss should take no arguments and return the value to minimize. If loss is a Tensor, the tape argument must be passed.
So the modified code could look like this:
for x, y in batch_list:
    with tf.GradientTape() as tape:
        logits = get_logits(x, weights, biases)
        cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=y))
    opt = tf.keras.optimizers.SGD(learning_rate=0.001)
    opt.minimize(loss=cost, var_list=[weights, biases], tape=tape)
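Alternatively (a sketch of the documented callable form, not part of the original answer): if you pass a zero-argument callable as the loss, no tape argument is needed, because the optimizer records the computation itself:

opt = tf.keras.optimizers.SGD(learning_rate=0.001)
for x, y in batch_list:
    def cost_fn():
        # recomputed under the optimizer's own tape on each call
        logits = get_logits(x, weights, biases)
        return tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=y))
    opt.minimize(loss=cost_fn, var_list=[weights, biases])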
I made a small change to your code. I'm not sure it fits your situation, but I would normally do it like this:
optimizer = tf.keras.optimizers.SGD(learning_rate=0.001)
for x, y in batch_list:
    with tf.GradientTape() as tape:
        logits = get_logits(x, weights, biases)
        cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=y))
    grads = tape.gradient(cost, [weights, biases])
    optimizer.apply_gradients(zip(grads, [weights, biases]))

Custom Keras loss function with the output's gradient [duplicate]

I am using a TF2 (2.3.0) NN to approximate the function y which solves the ODE y' + 3y = 0.
I have defined a custom loss class and function in which I am trying to differentiate the single output with respect to the single input, so that the equation holds, provided that y_true is zero:
from tensorflow.keras.losses import Loss
import tensorflow as tf

class CustomLossOde(Loss):
    def __init__(self, x, model, name='ode_loss'):
        super().__init__(name=name)
        self.x = x
        self.model = model

    def call(self, y_true, y_pred):
        with tf.GradientTape() as tape:
            tape.watch(self.x)
            y_p = self.model(self.x)
        dy_dx = tape.gradient(y_p, self.x)
        loss = tf.math.reduce_mean(tf.square(dy_dx + 3 * y_pred - y_true))
        return loss
but running the following NN:
import tensorflow as tf
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense
from tensorflow.keras import Input
from custom_loss_ode import CustomLossOde

num_samples = 1024
x_train = 4 * (tf.random.uniform((num_samples, )) - 0.5)
y_train = tf.zeros((num_samples, ))

inputs = Input(shape=(1,))
x = Dense(16, 'tanh')(inputs)
x = Dense(8, 'tanh')(x)
x = Dense(4)(x)
y = Dense(1)(x)
model = Model(inputs=inputs, outputs=y)

loss = CustomLossOde(model.input, model)
model.compile(optimizer=Adam(learning_rate=0.01, beta_1=0.9, beta_2=0.99), loss=loss)
model.run_eagerly = True
model.fit(x_train, y_train, batch_size=16, epochs=30)
For now I am getting 0 loss from the first epoch, which doesn't make any sense.
I have printed both y_true and y_pred from within the function and they seem OK, so I suspect that the problem is in the gradient, which I didn't succeed in printing.
Appreciate any help.
Defining a custom loss with the high-level Keras API is a bit difficult in that case. I would instead write the training loop from scratch, as it allows finer-grained control over what you can do.
I took inspiration from these two guides:
Advanced Automatic Differentiation
Writing a training loop from scratch
Basically, I used the fact that multiple tapes can interact seamlessly. I use one to compute the loss function and the other to calculate the gradients to be propagated by the optimizer.
import tensorflow as tf
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense
from tensorflow.keras import Input

num_samples = 1024
x_train = 4 * (tf.random.uniform((num_samples, )) - 0.5)
y_train = tf.zeros((num_samples, ))

inputs = Input(shape=(1,))
x = Dense(16, 'tanh')(inputs)
x = Dense(8, 'tanh')(x)
x = Dense(4)(x)
y = Dense(1)(x)
model = Model(inputs=inputs, outputs=y)

# using the high-level tf.data API for data handling
x_train = tf.reshape(x_train, (-1, 1))
dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train)).batch(1)

opt = Adam(learning_rate=0.01, beta_1=0.9, beta_2=0.99)
for step, (x, y_true) in enumerate(dataset):
    # we need to convert x to a Variable if we want the tape to be
    # able to compute the gradient with respect to x
    x_variable = tf.Variable(x)
    with tf.GradientTape() as model_tape:
        with tf.GradientTape() as loss_tape:
            loss_tape.watch(x_variable)
            y_pred = model(x_variable)
        dy_dx = loss_tape.gradient(y_pred, x_variable)
        loss = tf.math.reduce_mean(tf.square(dy_dx + 3 * y_pred - y_true))
    grad = model_tape.gradient(loss, model.trainable_variables)
    opt.apply_gradients(zip(grad, model.trainable_variables))
    if step % 20 == 0:
        print(f"Step {step}: loss={loss.numpy()}")

Why can't I learn XOR function with this network and constraints?

Let's say I have the following constraints and the network:
- The architecture is fixed (see this image) (note that there are no biases).
- The activation function for the hidden layer is ReLU.
- There's no activation function for the output layer (it should just return the sum of the inputs it receives).
I tried to implement this in PyTorch with various initialization schemes and different data sets, but I failed (the code is at the bottom).
My questions are:
1. Is there anything wrong with my NN training process?
2. Is this a feasible problem? If yes, how?
3. If this is doable, can we still achieve it by constraining the weights to the set {-1, 0, 1}?
Code:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data_utils
import numpy as np

class Network(nn.Module):
    def __init__(self):
        super(Network, self).__init__()
        self.fc1 = nn.Linear(2, 2, bias=False)
        self.fc2 = nn.Linear(2, 1, bias=False)
        self.rl = nn.ReLU()

    def forward(self, x):
        x = self.fc1(x)
        x = self.rl(x)
        x = self.fc2(x)
        return x

# create an XOR data set to train on
rng = np.random.RandomState(0)
X = rng.randn(200, 2)
y = np.logical_xor(X[:, 0] > 0, X[:, 1] > 0).astype('int32')

# test data set
X_test = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])

train = data_utils.TensorDataset(torch.from_numpy(X).float(),
                                 torch.from_numpy(y).float())
train_loader = data_utils.DataLoader(train, batch_size=50, shuffle=True)
test = torch.from_numpy(X_test).float()

# training the network
num_epoch = 10000
net = Network()
net.fc1.weight.data.clamp_(min=-1, max=1)
net.fc2.weight.data.clamp_(min=-1, max=1)

# define loss and optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(net.parameters())

for epoch in range(num_epoch):
    running_loss = 0  # loss per epoch
    for (X, y) in train_loader:
        # zero the gradients
        optimizer.zero_grad()
        # forward propagate
        out = net(X)
        # calculate loss and update
        loss = criterion(out, y)
        loss.backward()
        optimizer.step()
        running_loss += loss.data
    if epoch % 500 == 0:
        print("Epoch: {0} Loss: {1}".format(epoch, running_loss))
The loss doesn't improve; it gets stuck at some value after a few epochs (I'm not sure how to make this reproducible, as I get different values every time).
net(test) returns a set of predictions that are nowhere close to the XOR output.
You need to use a nonlinear activation function such as sigmoid in your hidden and output layers, because XOR is not linearly separable. Biases are also required.
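A minimal sketch of that suggestion (my own, with hypothetical hyperparameters): the same 2-2-1 architecture, but with biases enabled and sigmoid activations, trained directly on the four XOR points:

import torch
import torch.nn as nn

net = nn.Sequential(
    nn.Linear(2, 2, bias=True), nn.Sigmoid(),
    nn.Linear(2, 1, bias=True), nn.Sigmoid(),
)
X = torch.tensor([[0., 0.], [0., 1.], [1., 0.], [1., 1.]])
y = torch.tensor([[0.], [1.], [1.], [0.]])
opt = torch.optim.Adam(net.parameters(), lr=0.1)
loss_fn = nn.MSELoss()
for _ in range(2000):
    opt.zero_grad()
    loss = loss_fn(net(X), y)
    loss.backward()
    opt.step()
print(net(X).detach())  # should approach [0, 1, 1, 0]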

How to set a constant as a weight in a keras model?

I build my model using tf.keras.layers.Dense. In the first layer of my model I want some weights to be constant zero. In the gradient calculation these weights should then get a gradient of zero (since the last term in the chain rule corresponds to the weight, which is 0 for a constant).
This is my approach so far:
import tensorflow as tf
import tensorflow.contrib.eager as tfe
import numpy as np

tf.enable_eager_execution()

model = tf.keras.Sequential([
    tf.keras.layers.Dense(2, activation=tf.sigmoid, input_shape=(2,)),
    tf.keras.layers.Dense(2, activation=tf.sigmoid)
])

weights = [np.array([[tf.constant(0), 0.25], [0.2, 0.3]]), np.array([0.35, 0.35]),
           np.array([[0.4, 0.5], [0.45, 0.55]]), np.array([0.6, 0.6])]
model.set_weights(weights)

def loss(model, x, y):
    y_ = model(x)
    return tf.losses.mean_squared_error(labels=y, predictions=y_)

def grad(model, inputs, targets):
    with tf.GradientTape() as tape:
        loss_value = loss(model, inputs, targets)
    return loss_value, tape.gradient(loss_value, model.trainable_variables)
But in the gradient calculation the weight tf.constant(0) has a gradient that is not equal to zero.
Do I have an understanding problem?
How can I set a weight (or some weights) of a layer (not all weights of the layer) to a constant value that should not change during training?
My answer is based on the CustomConnected layer from this answer. As I said in a comment, when you multiply a weight w_ij by c_ij=0 via the connections matrix, the gradient of the loss with respect to that weight becomes zero as well (since the last factor in the chain rule corresponds to c_ij=0).
Here is a minimal example in Keras:
# Using CustomConnected from:
# https://stackoverflow.com/questions/50290769/specify-connections-in-nn-in-keras
import tensorflow as tf
import numpy as np
tf.enable_eager_execution()
# Define model
inp = tf.keras.layers.Input(shape=(2,))
c = np.array([[1., 1.], [1., 0.]], dtype=np.float32)
h = CustomConnected(2, c)(inp)
model = tf.keras.models.Model(inp, h)
# Set initial weights and compile
w = [np.random.rand(2, 2) * c]
model.set_weights(w)
model.compile(tf.train.AdamOptimizer(), 'mse')
# Check gradients
x = tf.constant(np.random.rand(10, 2), dtype=tf.float32)
y = np.random.rand(10, 2)
with tf.GradientTape() as tape:
loss_value = tf.losses.mean_squared_error(labels=y, predictions=model(x))
grad = tape.gradient(loss_value, model.trainable_variables)
print('Gradients: ', grad[0])
Note that I set c[1,1]=0 so the gradient corresponding to weight w[1,1] is 0 regardless of the input.
