GradNorm implementation in Tensorflow 2.x / Keras - python

I have been trying to implement GradNorm from the paper GradNorm: Gradient Normalization for Adaptive Loss Balancing in Deep Multitask Networks. I have been having trouble figuring out how to do this, since I have a two-headed model and need to balance the gradients accordingly.
I am using Keras for this work and training my model with the model.fit() function.
So far, I have tried to implement this with a callback:
class GradNorm(tf.keras.callbacks.Callback):
    def __init__(self, model, optimizer, alpha=0.2):
        super(GradNorm, self).__init__()
        self.alpha = alpha
        self.model = model
        self.optimizer = optimizer
        self.l_hat_1 = self.l_hat_2 = self.l_hat_avg = 1.0
        self.inv_rate_1 = self.inv_rate_2 = 1.0
        self.weights_shared = self.model.weights[:20]  # all shared layer weights
        self.w1 = self.w2 = 1.0
        self.l01 = self.l02 = -1.0
        self.task_losses = []

    def on_batch_end(self, batch, logs=None):
        loss_t1 = logs.get('loss1')
        loss_t2 = logs.get('loss2')
        self.task_losses.append((loss_t1, loss_t2))
        # Weight losses
        l1 = tf.multiply(loss_t1, self.w1)
        l2 = tf.multiply(loss_t2, self.w2)
        with tf.GradientTape(persistent=True) as tape:
            G1R = tape.gradient(tf.constant(loss_t1, dtype=tf.float32), self.weights_shared)
            G2R = tape.gradient(tf.constant(loss_t2, dtype=tf.float32), self.weights_shared)
            G1 = tf.norm(G1R, ord=2)
            G2 = tf.norm(G2R, ord=2)
However, this fails even at this point, because G1R comes back as a list of None values.
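For context, my current understanding is that the task losses would have to be computed inside the tape (for example in a custom train_step) rather than rebuilt from the logs, roughly like the sketch below, where model, loss_fn_1, loss_fn_2, the batch tensors and shared_weights are all placeholders for my actual setup:
import tensorflow as tf

def shared_grad_norms(model, shared_weights, loss_fn_1, loss_fn_2,
                      x_batch, y1_batch, y2_batch, w1, w2):
    """Sketch: per-task gradient L2 norms over the shared weights; all arguments are placeholders."""
    with tf.GradientTape(persistent=True) as tape:
        y1_pred, y2_pred = model(x_batch, training=True)  # two-headed model
        loss_t1 = w1 * loss_fn_1(y1_batch, y1_pred)        # weighted task losses
        loss_t2 = w2 * loss_fn_2(y2_batch, y2_pred)
    # Gradients are defined only because the losses were computed inside the tape.
    g1 = tape.gradient(loss_t1, shared_weights)
    g2 = tape.gradient(loss_t2, shared_weights)
    del tape
    # Flatten and concatenate the per-variable gradients before taking the norm.
    G1 = tf.norm(tf.concat([tf.reshape(g, [-1]) for g in g1], axis=0), ord=2)
    G2 = tf.norm(tf.concat([tf.reshape(g, [-1]) for g in g2], axis=0), ord=2)
    return G1, G2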
Would really appreciate any help on this
Thank You

Related

nn.LSTM not working together with functional_call for calculating the gradient

I have the following code:
import torch
from torch.nn.utils.stateless import functional_call
import torch.autograd as autograd
import torch.nn as nn

# This is the model
class Encoder(nn.Module):
    def __init__(self, action_dim, z_dim, skill_length):
        super().__init__()
        print(action_dim)
        self.lin1 = nn.Linear(action_dim, action_dim)
        self.lstm = nn.LSTM(input_size=action_dim, hidden_size=z_dim, batch_first=True)
        self.lin2 = nn.Linear(z_dim, z_dim)

    def forward(self, skill):
        a, b, c = skill.shape
        skill = skill.reshape(-1, skill.shape[-1])
        embed = self.lin1(skill)
        embed = embed.reshape(a, b, c)
        mean, _ = self.lstm(embed)
        mean = mean[:, -1, :]
        mean = self.lin2(mean)
        return mean

# This is the initialization function
def pars(model):
    params = {}
    for name, param in model.named_parameters():
        if len(param.shape) == 1:
            init = torch.nn.init.constant_(param, 0)
        else:
            init = torch.nn.init.orthogonal_(param)
        params[name] = nn.Parameter(init)
    return params

# Initializing the model
model = Encoder(4, 2, 5)
x = torch.rand(3, 5, 4)
params = pars(model)

# Running the model with functional_call and calculating the gradient.
samp = functional_call(model, params, x)
grad_f = autograd.grad(torch.mean(samp), params.values(),
                       retain_graph=True, allow_unused=True)
print(grad_f)
# grad_f has gradients for the linear layer, but None for the LSTM layer.

# Running the model without functional_call and calculating the gradient.
samp = model(x)
grad = autograd.grad(torch.mean(samp), model.parameters(), retain_graph=True)
print(grad)
# grad has gradients for all layers, e.g., linears and lstm.
I know the problem is with the LSTM layer, because when I use a linear layer with nn.Linear, the gradient depends on std as well as the linear layer. Unfortunately, I do not know how to resolve this problem. I'd appreciate any help.
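To see exactly which parameters are affected, a small diagnostic I have been using (reusing samp and params from the functional_call run above) pairs each parameter name with its gradient, so the None entries are easy to spot:
grad_f = autograd.grad(torch.mean(samp), list(params.values()),
                       retain_graph=True, allow_unused=True)
for name, g in zip(params.keys(), grad_f):
    # Prints None for the LSTM weights and a shape for the linear layers.
    print(name, None if g is None else tuple(g.shape))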
*Edit: I have heavily edited the provided code to further simplify the example. This code can be copied and run.
Update Dec 11, 2022
class Encoder(nn.Module):
    def __init__(self, action_dim, z_dim, skill_length):
        super().__init__()
        print(action_dim)
        self.lin1 = nn.Linear(action_dim, action_dim)
        self.lstm = nn.LSTM(input_size=action_dim, hidden_size=z_dim, batch_first=True)
        self.lin2 = nn.Linear(z_dim, z_dim)

    def forward(self, skill):
        a, b, c = skill.shape
        skill = skill.reshape(-1, skill.shape[-1])
        embed = self.lin1(skill)
        embed = embed.reshape(a, b, c)
        mean, _ = self.lstm(embed)
        pdb.set_trace()
        grad1 = autograd.grad(mean.mean(), params.values(),
                              retain_graph=True, allow_unused=True)
        # This gives gradients for the self.lin1 layer, and None for the LSTM
        grad2 = autograd.grad(mean.mean(), self.parameters(),
                              retain_graph=True, allow_unused=True)
        # This gives gradients for the LSTM, but None for the self.lin1 layer
        mean = mean[:, -1, :]
        mean = self.lin2(mean)
        return mean
When I run it the regular way, without functional_call and calling the model directly, then autograd.grad(mean.mean(), self.parameters(), allow_unused=True, retain_graph=True) has gradients for both the self.lin1 and LSTM layers.
I don't know if this information is useful, but I'm putting it out there just in case.

ValueError: No gradients provided for any variable in Tensorflow 2.5

I am performing reinforcement learning and need to train an actor and a critic neural network over a custom environment. I have the following code for my networks and RL agent:
import tensorflow as tf
import tensorflow_probability as tfp
import tensorflow.keras.losses as kls

class critic(tf.keras.Model):
    ## Critic NN
    def __init__(self):
        super().__init__()
        self.d1 = tf.keras.layers.Dense(64, activation='relu')
        self.v = tf.keras.layers.Dense(1, activation=None)

    def call(self, input_data):
        x = self.d1(input_data)
        v = self.v(x)
        return v

class actor(tf.keras.Model):
    ## Actor NN
    def __init__(self):
        super().__init__()
        self.d1 = tf.keras.layers.Dense(64, activation='relu')
        self.a = tf.keras.layers.Dense(4, activation='softmax')

    def call(self, input_data):
        x = self.d1(input_data)
        a = self.a(x)
        return a

class rlalgorithm:
    ## RL Agent that trains the above NNs based on data from environment
    def __init__(self, actions, learning_rate=0.1):
        ## Display name for graphing performance
        self.display_name = "A2C"
        ## Root Mean Square Optimizer for minimizing A2C losses
        self.a_opt = tf.keras.optimizers.RMSprop(learning_rate=learning_rate)
        self.c_opt = tf.keras.optimizers.RMSprop(learning_rate=learning_rate)
        ## Initialize models
        self.actor = actor()
        self.critic = critic()
        ## Define training constants and variables
        learn_rate = tf.constant(learning_rate, dtype=tf.float32)
        self.reward = tf.Variable(initial_value=0, dtype=tf.float32)
        self.state = tf.Variable(initial_value=tf.zeros(shape=(1, 4)), dtype=tf.float32, shape=(1, 4))
        self.next_state = tf.Variable(initial_value=tf.zeros(shape=(1, 4)), dtype=tf.float32, shape=(1, 4))
        self.action = tf.Variable(initial_value=0, dtype=tf.float32)
        ## The graph that produces the advantage
        advantage = self.reward + learn_rate*self.critic(self.next_state) - self.critic(self.state)
        ## Graph that produces losses
        dist = tfp.distributions.Categorical(probs=self.actor(self.state), dtype=tf.float32)
        self.actor_loss = dist.log_prob(self.action)*advantage
        self.critic_loss = advantage**2

    def choose_action(self, state):
        ## Agent chooses action to proceed to next state
        prob = self.actor(tf.convert_to_tensor([state]))
        dist = tfp.distributions.Categorical(probs=prob, dtype=tf.float32)
        action = dist.sample()
        return int(action.numpy()[0])

    def learn(self, s, a, r, s_):
        ## Based on chosen action, learn from result
        ## Assign training variables for this state-action outcome
        self.reward = self.reward.assign(r)
        self.state = self.state.assign(tf.convert_to_tensor([s]))
        self.next_state = self.next_state.assign(tf.convert_to_tensor([s_]))
        self.action = self.action.assign(a)
        ## Generate the loss gradient for actor
        with tf.GradientTape() as tape:
            actor_grad = tape.gradient(self.actor_loss, self.actor.trainable_variables)
        self.a_opt.apply_gradients(zip(actor_grad, self.actor.trainable_variables))
        ## Generate the loss gradient for critic
        with tf.GradientTape() as tape:
            critic_grad = tape.gradient(self.critic_loss, self.critic.trainable_variables)
        self.c_opt.apply_gradients(zip(critic_grad, self.critic.trainable_variables))
        ## Environment uses this, not relevant to learning
        return s_, self.choose_action(s_)
I am getting the following error:
ValueError: No gradients provided for any variable: ['actor/dense/kernel:0', 'actor/dense/bias:0', 'actor/dense_1/kernel:0', 'actor/dense_1/bias:0'].
I have seen this question asked multiple times, but none of the previous solutions seem to work for my case. Unfortunately, I cannot provide the environment that this agent runs on but the error is only contained within the above file.
I have read through the docs and tried a similar implementation using the optimizer minimize function, with the same results.
I suspect the issue is related to how the tensorflow graphs are defined, but am unsure of what exactly the problem is.
Any and all help is appreciated.
I solved this on my own.
What I didn't understand was the proper usage of tf.GradientTape. Within the with block, I need to perform the operations that compute loss, so that the gradients can be found.
Here is the updated learn function, for anybody else's reference:
def learn(self, s, a, r, s_):
    ## Based on chosen action, learn from result
    ## Assign training variables for this state-action outcome
    self.reward = self.reward.assign(r)
    self.state = self.state.assign(tf.convert_to_tensor([s]))
    self.next_state = self.next_state.assign(tf.convert_to_tensor([s_]))
    self.action = self.action.assign(a)
    ## Generate the loss gradient for critic
    with tf.GradientTape() as tape:
        advantage = self.reward + self.learn_rate*self.critic(self.next_state) - self.critic(self.state)
        critic_loss = advantage**2
    critic_grad = tape.gradient(critic_loss, self.critic.trainable_variables)
    self.c_opt.apply_gradients(zip(critic_grad, self.critic.trainable_variables))
    ## Generate the loss gradient for actor
    with tf.GradientTape() as tape:
        dist = tfp.distributions.Categorical(probs=self.actor(self.state), dtype=tf.float32)
        actor_loss = dist.log_prob(self.action)*advantage  # advantage computed above; constant w.r.t. the actor's variables
    actor_grad = tape.gradient(actor_loss, self.actor.trainable_variables)
    self.a_opt.apply_gradients(zip(actor_grad, self.actor.trainable_variables))
    ## Environment uses this, not relevant to learning
    return s_, self.choose_action(s_)

Tensorflow 2.0: flat_map() to flatten Dataset of Dataset returns cardinality -2

I am trying to run the following code (as given in the TensorFlow documentation) to create windows of my data and then flatten the dataset of datasets.
window_size = 5
windows = range_ds.window(window_size, shift=1)
for sub_ds in windows.take(5):
    print(sub_ds)
flat_windows = windows.flat_map(lambda x: x)
The problem is that flat_windows.cardinality().numpy() returns a cardinality of -2, which is creating problems for me during training. I tried looking for ways to set the cardinality of a dataset but couldn't find anything. I also tried other ways of flattening a dataset of datasets, but again no success.
Edit-1: The problem with training is that the shape is unknown (at the Linear and Dense layers) when I am training a subclassed model (given below). The model trains well when I train it eagerly (through tf.config.run_functions_eagerly(True)), but that is slow. Therefore I want the shape of the input data to be known for model training.
Neural Network
class NeuralNetworkModel(tf.keras.Model):
    def __init__(self):
        super(NeuralNetworkModel, self).__init__()
        self.encoder = Encoder()

    def train_step(self, inputs):
        X = inputs[0]
        Y = inputs[1]
        with tf.GradientTape() as tape:
            enc_X = self.encoder(X)
            enc_Y = self.encoder(Y)
            # loss:
            loss = tf.norm(enc_Y - enc_X, axis=[0, 1], ord='fro')
        # Compute gradients
        trainable_vars = self.encoder.trainable_variables
        gradients = tape.gradient(loss, trainable_vars)
        # Update weights
        self.optimizer.apply_gradients(zip(gradients, trainable_vars))
        # Compute our own metrics
        loss_tracker.update_state(loss)
        # Return a dict mapping metric names to current value.
        # Note that it will include the loss (tracked in self.metrics).
        return {"loss": loss_tracker.result()}

    @property
    def metrics(self):
        # We list our `Metric` objects here so that `reset_states()` can be
        # called automatically at the start of each epoch
        # or at the start of `evaluate()`.
        # If you don't implement this property, you have to call
        # `reset_states()` yourself at the time of your choosing.
        return [loss_tracker]

    def test_step(self, inputs):
        X = inputs[0]
        Y = inputs[1]
        Psi_X = self.encoder(X)
        Psi_Y = self.encoder(Y)
        # loss:
        loss = tf.norm(Psi_Y - Psi_X, axis=[0, 1], ord='fro')
        # Compute our own metrics
        loss_tracker.update_state(loss)
        # Return a dict mapping metric names to current value.
        # Note that it will include the loss (tracked in self.metrics).
        return {"loss": loss_tracker.result()}

class Encoder(tf.keras.Model):
    def __init__(self):
        super(Encoder, self).__init__(dtype='float64', name='Encoder')
        self.input_layer = DenseLayer(128)
        self.hidden_layer1 = DenseLayer(128)
        self.hidden_layer2 = DenseLayer(64)
        self.hidden_layer3 = DenseLayer(64)
        self.output_layer = LinearLayer(64)

    def call(self, input_data, training):
        fx = self.input_layer(input_data)
        fx = self.hidden_layer1(fx)
        fx = self.hidden_layer2(fx)
        fx = self.hidden_layer3(fx)
        return self.output_layer(fx)

class LinearLayer(tf.keras.layers.Layer):
    def __init__(self, units):
        super(LinearLayer, self).__init__(dtype='float64')
        self.units = units

    def build(self, input_shape):
        input_dim = input_shape[-1]
        self.w = self.add_weight(shape=(input_dim, self.units),
                                 initializer="random_normal",
                                 trainable=True)
        self.b = self.add_weight(shape=(self.units,),
                                 initializer=tf.zeros_initializer(),
                                 trainable=True)

    def call(self, inputs):
        return tf.matmul(inputs, self.w) + self.b

class DenseLayer(tf.keras.layers.Layer):
    def __init__(self, units):
        super(DenseLayer, self).__init__(dtype='float64')
        self.units = units

    def build(self, input_shape):
        input_dim = input_shape[-1]
        self.w = self.add_weight(shape=(input_dim, self.units),
                                 initializer="random_normal",
                                 trainable=True)
        self.b = self.add_weight(shape=(self.units,),
                                 initializer=tf.zeros_initializer(),
                                 trainable=True)

    def call(self, inputs):
        x = tf.matmul(inputs, self.w) + self.b
        return tf.nn.elu(x)
I was wondering about this as well. Turns out that -2 is tf.data.UNKNOWN_CARDINALITY (https://www.tensorflow.org/api_docs/python/tf/data#UNKNOWN_CARDINALITY), which represents that TF doesn't know how many elements the flat_map returns per item.
I just asked Windowing a TensorFlow dataset without losing cardinality information? to see if anyone knows a way to window datasets without losing cardinality.
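If you do know the number of windows in advance, one thing you can do is re-assert the cardinality explicitly with tf.data.experimental.assert_cardinality. A small sketch (the range_ds setup here is just an assumed example):
import tensorflow as tf

range_ds = tf.data.Dataset.range(20)  # assumed example source
window_size = 5
windows = range_ds.window(window_size, shift=1, drop_remainder=True)
# Batching each window with drop_remainder=True also gives elements a static shape of [window_size].
flat_windows = windows.flat_map(lambda w: w.batch(window_size, drop_remainder=True))
print(flat_windows.cardinality().numpy())  # -2, i.e. tf.data.UNKNOWN_CARDINALITY

n_windows = 20 - window_size + 1  # 16 windows with shift=1 and drop_remainder=True
flat_windows = flat_windows.apply(tf.data.experimental.assert_cardinality(n_windows))
print(flat_windows.cardinality().numpy())  # 16
Note that assert_cardinality only records the count you give it; it raises an error at iteration time if the actual number of elements turns out to be different.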

Applying Gradient to LSTM layers using GradientTape raises "No gradients provided for any variable" error

I'm currently setting up a Deep Deterministic Policy Gradient agent to interact with a crypto trading environment. The code works when I'm using Dense layers for the function approximator, but when I switch to LSTM or GRUs this error pops up when I call the learn method of Agent:
No gradients provided for any variable: ['actor_network_4/lstm/kernel:0', 'actor_network_4/lstm/recurrent_kernel:0', 'actor_network_4/lstm/bias:0', 'actor_network_4/lstm_1/kernel:0', 'actor_network_4/lstm_1/recurrent_kernel:0', 'actor_network_4/lstm_1/bias:0', 'actor_network_4/dense_8/kernel:0', 'actor_network_4/dense_8/bias:0'].
I'm using GradientTape to record the gradients and optimizer.apply_gradients to update the actor and critic networks. Please find the code snippets below:
# RNN version
class CriticNetwork(keras.Model):
    def __init__(self, n_actions, name='critic', chkpt_dir='ddpg'):
        super(CriticNetwork, self).__init__()
        self.n_actions = n_actions
        self.model_name = name
        self.checkpoint_dir = chkpt_dir
        self.checkpoint_file = os.path.join(self.checkpoint_dir, self.model_name+'_ddpg.h5')
        self.lstm1 = LSTM(128, return_sequences=True, unroll=False)
        self.lstm2 = LSTM(128)
        self.q = Dense(1, activation=None)

    def call(self, state, action):
        action_value = tf.concat([state, action], axis=1)
        action_value = np.reshape(action_value, (32, 1, 44))
        action_value = self.lstm1(action_value)
        action_value = self.lstm2(action_value)
        q = self.q(action_value)
        return q

class ActorNetwork(keras.Model):
    def __init__(self, n_actions=1, name='actor', chkpt_dir='ddpg'):
        super(ActorNetwork, self).__init__()
        self.n_actions = n_actions
        self.model_name = name
        self.checkpoint_dir = chkpt_dir
        self.checkpoint_file = os.path.join(self.checkpoint_dir,
                                            self.model_name+'_ddpg.h5')
        self.lstm1 = LSTM(128, return_sequences=True, unroll=False)
        self.lstm2 = LSTM(128)
        self.mu = Dense(self.n_actions, activation='tanh')

    def call(self, state):
        state = np.reshape(state, (32, 1, 43))
        prob = self.lstm1(state)
        prob = self.lstm2(prob)
        mu = self.mu(prob)
        return mu

class Agent:
    def __init__(self, alpha=0.001, beta=0.002, input_dims=[33], env=None,
                 gamma=0.99, n_actions=1, max_size=3000000, tau=0.005,
                 fc1=128, fc2=128, fc3=64, batch_size=32):
        self.gamma = gamma
        self.tau = tau
        self.memory = ReplayBuffer(max_size, input_dims, n_actions)
        self.batch_size = batch_size
        self.n_actions = n_actions
        self.max_action = 1
        self.min_action = -1
        self.actor = ActorNetwork(n_actions=n_actions, name='actor')
        self.critic = CriticNetwork(n_actions=n_actions, name='critic')
        self.target_actor = ActorNetwork(n_actions=n_actions, name='target_actor')
        self.target_critic = CriticNetwork(n_actions=n_actions, name='target_critic')
        self.actor.compile(optimizer=Adam(learning_rate=alpha))
        self.critic.compile(optimizer=Adam(learning_rate=beta))
        self.target_actor.compile(optimizer=Adam(learning_rate=alpha))
        self.target_critic.compile(optimizer=Adam(learning_rate=beta))
        self.update_network_parameters(tau=1)

    def update_network_parameters(self, tau=None):
        if tau is None:
            tau = self.tau
        weights = []
        targets = self.target_actor.weights
        for i, weight in enumerate(self.actor.weights):
            weights.append(weight * tau + targets[i]*(1-tau))
        self.target_actor.set_weights(weights)
        weights = []
        targets = self.target_critic.weights
        for i, weight in enumerate(self.critic.weights):
            weights.append(weight * tau + targets[i]*(1-tau))
        self.target_critic.set_weights(weights)

    def remember(self, state, action, reward, new_state, done):
        self.memory.store_transition(state, action, reward, new_state, done)

    def save_models(self):
        print('... saving models ...')
        self.actor.save_weights(self.actor.checkpoint_file)
        self.target_actor.save_weights(self.target_actor.checkpoint_file)
        self.critic.save_weights(self.critic.checkpoint_file)
        self.target_critic.save_weights(self.target_critic.checkpoint_file)

    def load_models(self):
        print('... loading models ...')
        self.actor.load_weights(self.actor.checkpoint_file)
        self.target_actor.load_weights(self.target_actor.checkpoint_file)
        self.critic.load_weights(self.critic.checkpoint_file)
        self.target_critic.load_weights(self.target_critic.checkpoint_file)

    def choose_action(self, observation, evaluate=False):
        state = tf.convert_to_tensor([observation], dtype=tf.float32)
        actions = self.actor(state)
        if not evaluate:
            actions += tf.random.normal(shape=[self.n_actions],
                                        mean=0.0, stddev=0.05)
        actions = tf.clip_by_value(actions, self.min_action, self.max_action)
        return actions

    def learn(self):
        if self.memory.mem_cntr < self.batch_size:
            return
        state, action, reward, new_state, done = \
            self.memory.sample_buffer(self.batch_size)
        states = tf.convert_to_tensor(state, dtype=tf.float32)
        states_ = tf.convert_to_tensor(new_state, dtype=tf.float32)
        rewards = tf.convert_to_tensor(reward, dtype=tf.float32)
        actions = tf.convert_to_tensor(action, dtype=tf.float32)
        with tf.GradientTape() as tape:
            target_actions = self.target_actor(states_)
            critic_value_ = tf.squeeze(self.target_critic(
                states_, target_actions), 1)
            critic_value = tf.squeeze(self.critic(states, actions), 1)
            target = reward + self.gamma*critic_value_*(1-done)
            critic_loss = keras.losses.MSE(target, critic_value)
        critic_network_gradient = tape.gradient(critic_loss, self.critic.trainable_variables)
        self.critic.optimizer.apply_gradients(zip(critic_network_gradient, self.critic.trainable_variables))
        with tf.GradientTape() as tape:
            new_policy_actions = self.actor(states)
            actor_loss = -self.critic(states, new_policy_actions)
            actor_loss = tf.math.reduce_mean(actor_loss)
        actor_network_gradient = tape.gradient(actor_loss, self.actor.trainable_variables)
        self.actor.optimizer.apply_gradients(zip(actor_network_gradient, self.actor.trainable_variables))
        self.update_network_parameters()
Any advice on how to record and apply the gradients for LSTM layers would be appreciated.
Actually, the problem is that you are using Numpy operations to define the computation logic (hence, the gradients cannot flow from that point onward); however, this should be done entirely using TF Ops or Keras layers. Specifically, in call method of CriticNetwork as well as ActorNetwork, instead of using np.reshape you should either use tf.expand_dims (if you only want to add a new axis of size one to the tensor), or tf.reshape, or tf.keras.layers.Reshape layer (for more involved reshaping). For example, using tf.expand_dims:
class CriticNetwork(keras.Model):
    # ...
    def call(self, state, action):
        action_value = tf.concat([state, action], axis=1)
        action_value = tf.expand_dims(action_value, axis=1)
or using tf.reshape:
class CriticNetwork(keras.Model):
    # ...
    def call(self, state, action):
        action_value = tf.concat([state, action], axis=1)
        action_value = tf.reshape(action_value, (-1, 1, 44))  # Use `-1` for the first axis so that any batch size would be supported
or using Reshape layer:
class CriticNetwork(keras.Model):
    def __init__(self, n_actions, name='critic', chkpt_dir='ddpg'):
        # ...
        self.reshape = tf.keras.layers.Reshape((1, 44))

    def call(self, state, action):
        action_value = tf.concat([state, action], axis=1)
        action_value = self.reshape(action_value)
And you need to do the same thing for the ActorNetwork.
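For completeness, a sketch of the corresponding change in ActorNetwork (using the tf.expand_dims variant; the tf.reshape or Reshape-layer versions would work the same way):
class ActorNetwork(keras.Model):
    # ...
    def call(self, state):
        # A TF op instead of np.reshape, so gradients can flow through.
        state = tf.expand_dims(state, axis=1)
        prob = self.lstm1(state)
        prob = self.lstm2(prob)
        mu = self.mu(prob)
        return mu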
Side note: I am not sure if this is just a demo code or not, but note that using RNN layers on a sequence of length one (i.e. having only one timestep) may not prove to be very beneficial.

multi-class weighted loss function in pytorch

I am training a UNet-based model for a multi-class segmentation task in the PyTorch framework. I am optimizing the model with the following loss function:
class MulticlassJaccardLoss(_Loss):
    """Implementation of Jaccard loss for multiclass (semantic) image segmentation task
    """
    __name__ = 'mc_jaccard_loss'

    def __init__(self, classes: List[int] = None, from_logits=True, weight=None, reduction='elementwise_mean'):
        super(MulticlassJaccardLoss, self).__init__(reduction=reduction)
        self.classes = classes
        self.from_logits = from_logits
        self.weight = weight

    def forward(self, y_pred: Tensor, y_true: Tensor) -> Tensor:
        """
        :param y_pred: NxCxHxW
        :param y_true: NxHxW
        :return: scalar
        """
        if self.from_logits:
            y_pred = y_pred.softmax(dim=1)
        n_classes = y_pred.size(1)
        smooth = 1e-3
        if self.classes is None:
            classes = range(n_classes)
        else:
            classes = self.classes
            n_classes = len(classes)
        loss = torch.zeros(n_classes, dtype=torch.float, device=y_pred.device)
        if self.weight is None:
            weights = [1] * n_classes
        else:
            weights = self.weight
        for class_index, weight in zip(classes, weights):
            jaccard_target = (y_true == class_index).float()
            jaccard_output = y_pred[:, class_index, ...]
            num_preds = jaccard_target.long().sum()
            if num_preds == 0:
                loss[class_index-1] = 0  # custom
            else:
                iou = soft_jaccard_score(jaccard_output, jaccard_target, from_logits=False, smooth=smooth)
                loss[class_index-1] = (1.0 - iou) * weight  # custom
        if self.reduction == 'elementwise_mean':
            return loss.mean()
        if self.reduction == 'sum':
            return loss.sum()
        return loss
I am calculating the loss for only two classes (classes 1 and 2, not the background).
MulticlassJaccardLoss(weight=[0.5,10], classes=[1,2], from_logits=False)
When I train the model, it trains for the first few iterations and then I get the following error:
element 0 of tensors does not require grad and does not have a grad_fn
What is the mistake in the code?
Thanks!
Try setting:
torch.zeros(..., requires_grad=True)
I believe requires_grad=False is the default for torch.zeros, so this may help here.
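Another option worth trying, sketched below with the same soft_jaccard_score and variables as in your forward method: collect the per-class terms in a Python list and stack them, so the resulting tensor keeps a grad_fn whenever at least one class contributes a term (rather than writing into a pre-allocated tensor):
per_class = []
for class_index, weight in zip(classes, weights):
    jaccard_target = (y_true == class_index).float()
    jaccard_output = y_pred[:, class_index, ...]
    if jaccard_target.long().sum() == 0:
        # Contributes nothing, but stays connected to the graph.
        per_class.append(jaccard_output.sum() * 0.0)
    else:
        iou = soft_jaccard_score(jaccard_output, jaccard_target,
                                 from_logits=False, smooth=smooth)
        per_class.append((1.0 - iou) * weight)
loss = torch.stack(per_class)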
