I am training a UNet-based model for a multi-class segmentation task in PyTorch. I am optimizing the model with the following loss function:
class MulticlassJaccardLoss(_Loss):
    """Implementation of Jaccard loss for multiclass (semantic) image segmentation task
    """
    __name__ = 'mc_jaccard_loss'

    def __init__(self, classes: List[int] = None, from_logits=True, weight=None, reduction='elementwise_mean'):
        super(MulticlassJaccardLoss, self).__init__(reduction=reduction)
        self.classes = classes
        self.from_logits = from_logits
        self.weight = weight

    def forward(self, y_pred: Tensor, y_true: Tensor) -> Tensor:
        """
        :param y_pred: NxCxHxW
        :param y_true: NxHxW
        :return: scalar
        """
        if self.from_logits:
            y_pred = y_pred.softmax(dim=1)

        n_classes = y_pred.size(1)
        smooth = 1e-3

        if self.classes is None:
            classes = range(n_classes)
        else:
            classes = self.classes
            n_classes = len(classes)

        loss = torch.zeros(n_classes, dtype=torch.float, device=y_pred.device)

        if self.weight is None:
            weights = [1] * n_classes
        else:
            weights = self.weight

        for class_index, weight in zip(classes, weights):
            jaccard_target = (y_true == class_index).float()
            jaccard_output = y_pred[:, class_index, ...]

            num_preds = jaccard_target.long().sum()

            if num_preds == 0:
                loss[class_index - 1] = 0  # custom
            else:
                iou = soft_jaccard_score(jaccard_output, jaccard_target, from_logits=False, smooth=smooth)
                loss[class_index - 1] = (1.0 - iou) * weight  # custom

        if self.reduction == 'elementwise_mean':
            return loss.mean()

        if self.reduction == 'sum':
            return loss.sum()

        return loss
I am calculating the loss for only two classes (classes 1 and 2, not the background):
MulticlassJaccardLoss(weight=[0.5,10], classes=[1,2], from_logits=False)
When I train the model, it runs for the first few iterations and then I get the following error:
element 0 of tensors does not require grad and does not have a grad_fn
What is the mistake in the code?
Thanks!
Try setting:
torch.zeros(..., requires_grad=True)
I believe requires_grad=False is the default for torch.zeros, so the loss tensor you build there does not require grad on its own. If a batch comes along in which none of the selected classes appears in y_true, nothing from the computation graph is ever assigned into loss, and loss.mean().backward() then fails with exactly that error, which would also explain why it only shows up after a few iterations.
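A minimal way to reproduce that failure mode outside the training loop (toy shapes, and a hypothetical batch in which only the background is present):

import torch

# Toy reproduction: no selected class occurs in y_true, so the per-class loop
# assigns nothing from the computation graph and `loss` keeps requires_grad=False.
y_pred = torch.rand(1, 3, 4, 4, requires_grad=True).softmax(dim=1)
y_true = torch.zeros(1, 4, 4, dtype=torch.long)   # only background (class 0) present

loss = torch.zeros(2, dtype=torch.float)          # requires_grad defaults to False
for class_index in [1, 2]:
    if (y_true == class_index).long().sum() == 0:
        loss[class_index - 1] = 0                 # plain Python zero, no grad_fn

try:
    loss.mean().backward()
except RuntimeError as e:
    print(e)  # element 0 of tensors does not require grad and does not have a grad_fn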
I have been trying to implement GradNorm from the paper "GradNorm: Gradient Normalization for Adaptive Loss Balancing in Deep Multitask Networks". I have been having some trouble figuring out how to do this, since I have a two-head model and need to balance the gradients accordingly.
I am using Keras for this work and training my model with the model.fit() function.
So far, I have tried to implement it with a callback, like so:
class GradNorm(tf.keras.callbacks.Callback):
    def __init__(self, model, optimizer, alpha=0.2):
        super(GradNorm, self).__init__()
        self.alpha = alpha
        self.model = model
        self.optimizer = optimizer
        self.l_hat_1 = self.l_hat_2 = self.l_hat_avg = 1.0
        self.inv_rate_1 = self.inv_rate_2 = 1.0
        self.weights_shared = self.model.weights[:20]  # all shared layer weights
        self.w1 = self.w2 = 1.0
        self.l01 = self.l02 = -1.0
        self.task_losses = []

    def on_batch_end(self, batch, logs=None):
        loss_t1 = logs.get('loss1')
        loss_t2 = logs.get('loss2')
        self.task_losses.append((loss_t1, loss_t2))

        # Weight losses
        l1 = tf.multiply(loss_t1, self.w1)
        l2 = tf.multiply(loss_t2, self.w2)

        with tf.GradientTape(persistent=True) as tape:
            G1R = tape.gradient(tf.constant(loss_t1, dtype=tf.float32), self.weights_shared)
            G2R = tape.gradient(tf.constant(loss_t2, dtype=tf.float32), self.weights_shared)
            G1 = tf.norm(G1R, ord=2)
            G2 = tf.norm(G2R, ord=2)
However, this already fails at this point, because G1R comes back as a list of None values. As far as I can tell, that is because the tape never sees how the losses were computed: they are rebuilt from the scalar log values with tf.constant, so they are not connected to the shared weights at all.
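For comparison, the pattern that does give me gradients outside of model.fit() looks like the sketch below (a toy two-head model purely for illustration, not my actual network):

import tensorflow as tf

# Toy two-head model; layer sizes and data are made up for the example.
inputs = tf.keras.Input(shape=(8,))
shared = tf.keras.layers.Dense(16, activation="relu", name="shared")(inputs)
head1 = tf.keras.layers.Dense(1, name="head1")(shared)
head2 = tf.keras.layers.Dense(1, name="head2")(shared)
model = tf.keras.Model(inputs, [head1, head2])

x = tf.random.normal((4, 8))
y1 = tf.random.normal((4, 1))
y2 = tf.random.normal((4, 1))
weights_shared = model.get_layer("shared").trainable_weights

with tf.GradientTape(persistent=True) as tape:
    p1, p2 = model(x, training=True)              # forward pass recorded by the tape
    loss_t1 = tf.reduce_mean(tf.square(y1 - p1))
    loss_t2 = tf.reduce_mean(tf.square(y2 - p2))

G1R = tape.gradient(loss_t1, weights_shared)      # a list of tensors, not None
G2R = tape.gradient(loss_t2, weights_shared)
G1 = tf.linalg.global_norm(G1R)                   # norm over a list of gradients
G2 = tf.linalg.global_norm(G2R)
del tape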
Would really appreciate any help on this
Thank You
I have the following code:
import torch
from torch.nn.utils.stateless import functional_call
import torch.autograd as autograd
import torch.nn as nn

# This is the model
class Encoder(nn.Module):
    def __init__(self, action_dim, z_dim, skill_length):
        super().__init__()
        print(action_dim)

        self.lin1 = nn.Linear(action_dim, action_dim)
        self.lstm = nn.LSTM(input_size=action_dim, hidden_size=z_dim, batch_first=True)
        self.lin2 = nn.Linear(z_dim, z_dim)

    def forward(self, skill):
        a, b, c = skill.shape
        skill = skill.reshape(-1, skill.shape[-1])
        embed = self.lin1(skill)
        embed = embed.reshape(a, b, c)

        mean, _ = self.lstm(embed)
        mean = mean[:, -1, :]
        mean = self.lin2(mean)
        return mean

# This is the initialization function
def pars(model):
    params = {}
    for name, param in model.named_parameters():
        if len(param.shape) == 1:
            init = torch.nn.init.constant_(param, 0)
        else:
            init = torch.nn.init.orthogonal_(param)
        params[name] = nn.Parameter(init)
    return params

# Initializing the model
model = Encoder(4, 2, 5)
x = torch.rand(3, 5, 4)
params = pars(model)

# Running the model with functional_call and calculating the gradient.
samp = functional_call(model, params, x)
grad_f = autograd.grad(torch.mean(samp), params.values(),
                       retain_graph=True, allow_unused=True)
print(grad_f)
# grad_f has gradients for the linear layers, but None for the LSTM layer.

# Running the model without functional_call and calculating the gradient.
samp = model(x)
grad = autograd.grad(torch.mean(samp), model.parameters(), retain_graph=True)
print(grad)
# grad has gradients for all layers, i.e., the linear layers and the LSTM.
I know the problem is with the LSTM layer, because if I replace it with another nn.Linear layer the gradients come through for every parameter. Unfortunately, I do not know how to resolve this problem. I'd appreciate any help.
Edit: I have heavily edited the code above to simplify the example. It can be copied and run as-is.
Update Dec 11, 2022
import pdb

class Encoder(nn.Module):
    def __init__(self, action_dim, z_dim, skill_length):
        super().__init__()
        print(action_dim)

        self.lin1 = nn.Linear(action_dim, action_dim)
        self.lstm = nn.LSTM(input_size=action_dim, hidden_size=z_dim, batch_first=True)
        self.lin2 = nn.Linear(z_dim, z_dim)

    def forward(self, skill):
        a, b, c = skill.shape
        skill = skill.reshape(-1, skill.shape[-1])
        embed = self.lin1(skill)
        embed = embed.reshape(a, b, c)

        mean, _ = self.lstm(embed)
        pdb.set_trace()
        grad1 = autograd.grad(mean.mean(), params.values(),
                              retain_graph=True, allow_unused=True)
        # This gives gradients for the self.lin1 layer, and None for the LSTM
        grad2 = autograd.grad(mean.mean(), self.parameters(),
                              retain_graph=True, allow_unused=True)
        # This gives gradients for the LSTM, but None for the self.lin1 layer
        mean = mean[:, -1, :]
        mean = self.lin2(mean)
        return mean
When I run it the regular way, without functional_call and calling the model directly, autograd.grad(mean.mean(), self.parameters(), allow_unused=True, retain_graph=True) returns gradients for both the self.lin1 layer and the LSTM.
I don't know if this information is useful, but I am putting it out there just in case.
Thanks for your time! After I run the code, the gradients are always zero and the loss is not updating. (I guess it is because the weights are initialized to all zeros, but I don't know how to fix it.) The code is a basic neural network:
class Model(torch.nn.Module):  # class
    def __init__(self):
        super(Model, self).__init__()
        self.linear1 = torch.nn.Linear(8, 6)
        self.linear2 = torch.nn.Linear(6, 4)
        self.linear3 = torch.nn.Linear(4, 1)
        self.sigmoid = torch.nn.Sigmoid()

    def forward(self, x):
        x = self.sigmoid(self.linear1(x))
        x = self.sigmoid(self.linear2(x))
        x = self.sigmoid(self.linear3(x))
        x = F.softmax(x, dim=1)
        return x

model = Model()  # model
criterion = torch.nn.BCELoss(size_average=False)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

for epoch in range(1000):  # training
    y_pred = model(X_train.float())
    loss = criterion(y_pred, y_train.float())
    print(epoch, loss.item())
    print([x.grad for x in optimizer.param_groups[0]['params']])
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
And the gradients I get are all zeros.
I think you forgot to apply backpropagation before printing. Adding loss.backward() just before your print statements will do the trick (it computes the gradients and stores them in x.grad). Note that, by default, your weights are not initialized to 0 here; the default initialization for linear layers is documented here.
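A minimal sketch of that ordering, reusing the Model, criterion, and optimizer defined above and substituting dummy tensors for X_train / y_train (assumed shapes: 8 input features, one binary target per sample):

import torch

X_train = torch.rand(16, 8)                     # dummy stand-in for the real data
y_train = torch.randint(0, 2, (16, 1)).float()

for epoch in range(3):
    y_pred = model(X_train)
    loss = criterion(y_pred, y_train)
    optimizer.zero_grad()
    loss.backward()                             # gradients are populated here
    print(epoch, loss.item())
    print([p.grad for p in optimizer.param_groups[0]['params']])  # no longer None
    optimizer.step()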
I am trying to run the following code (as given in the TensorFlow documentation) to create windows of my data and then flatten the dataset of datasets:
window_size = 5
windows = range_ds.window(window_size, shift=1)

for sub_ds in windows.take(5):
    print(sub_ds)

flat_windows = windows.flat_map(lambda x: x)
The problem is that flat_windows.cardinality().numpy() returns -2 as the cardinality, which creates problems for me during training. I tried looking for ways to set the cardinality of a dataset but couldn't find anything. I also tried other ways of flattening a dataset of datasets, but again without success.
Edit 1: The problem during training is that the shape is unknown (at the Linear and Dense layers) when I train the subclassed model given below. The model trains fine when I run it eagerly (via tf.config.run_functions_eagerly(True)), but that is slow. Therefore I want the input shape to be known for model training.
Neural Network
class NeuralNetworkModel(tf.keras.Model):
    def __init__(self):
        super(NeuralNetworkModel, self).__init__()
        self.encoder = Encoder()

    def train_step(self, inputs):
        X = inputs[0]
        Y = inputs[1]

        with tf.GradientTape() as tape:
            enc_X = self.encoder(X)
            enc_Y = self.encoder(Y)

            # loss:
            loss = tf.norm(enc_Y - enc_X, axis=[0, 1], ord='fro')

        # Compute gradients
        trainable_vars = self.encoder.trainable_variables
        gradients = tape.gradient(loss, trainable_vars)

        # Update weights
        self.optimizer.apply_gradients(zip(gradients, trainable_vars))

        # Compute our own metrics
        loss_tracker.update_state(loss)

        # Return a dict mapping metric names to current value.
        # Note that it will include the loss (tracked in self.metrics).
        return {"loss": loss_tracker.result()}

    @property
    def metrics(self):
        # We list our `Metric` objects here so that `reset_states()` can be
        # called automatically at the start of each epoch
        # or at the start of `evaluate()`.
        # If you don't implement this property, you have to call
        # `reset_states()` yourself at the time of your choosing.
        return [loss_tracker]

    def test_step(self, inputs):
        X = inputs[0]
        Y = inputs[1]

        Psi_X = self.encoder(X)
        Psi_Y = self.encoder(Y)

        # loss:
        loss = tf.norm(Psi_Y - Psi_X, axis=[0, 1], ord='fro')

        # Compute our own metrics
        loss_tracker.update_state(loss)

        # Return a dict mapping metric names to current value.
        # Note that it will include the loss (tracked in self.metrics).
        return {"loss": loss_tracker.result()}


class Encoder(tf.keras.Model):
    def __init__(self):
        super(Encoder, self).__init__(dtype='float64', name='Encoder')

        self.input_layer = DenseLayer(128)
        self.hidden_layer1 = DenseLayer(128)
        self.hidden_layer2 = DenseLayer(64)
        self.hidden_layer3 = DenseLayer(64)
        self.output_layer = LinearLayer(64)

    def call(self, input_data, training):
        fx = self.input_layer(input_data)
        fx = self.hidden_layer1(fx)
        fx = self.hidden_layer2(fx)
        fx = self.hidden_layer3(fx)
        return self.output_layer(fx)


class LinearLayer(tf.keras.layers.Layer):
    def __init__(self, units):
        super(LinearLayer, self).__init__(dtype='float64')
        self.units = units

    def build(self, input_shape):
        input_dim = input_shape[-1]
        self.w = self.add_weight(shape=(input_dim, self.units),
                                 initializer="random_normal",
                                 trainable=True)
        self.b = self.add_weight(shape=(self.units,),
                                 initializer=tf.zeros_initializer(),
                                 trainable=True)

    def call(self, inputs):
        return tf.matmul(inputs, self.w) + self.b


class DenseLayer(tf.keras.layers.Layer):
    def __init__(self, units):
        super(DenseLayer, self).__init__(dtype='float64')
        self.units = units

    def build(self, input_shape):
        input_dim = input_shape[-1]
        self.w = self.add_weight(shape=(input_dim, self.units),
                                 initializer="random_normal",
                                 trainable=True)
        self.b = self.add_weight(shape=(self.units,),
                                 initializer=tf.zeros_initializer(),
                                 trainable=True)

    def call(self, inputs):
        x = tf.matmul(inputs, self.w) + self.b
        return tf.nn.elu(x)
I was wondering about this as well. It turns out that -2 is tf.data.UNKNOWN_CARDINALITY (https://www.tensorflow.org/api_docs/python/tf/data#UNKNOWN_CARDINALITY), which means that TF doesn't know how many elements the flat_map returns per item.
I just asked "Windowing a TensorFlow dataset without losing cardinality information?" to see if anyone knows a way to window datasets without losing cardinality.
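Not a way to make TF infer it, but if you can work out the element count yourself, one workaround (assuming TF 2.3 or newer) is to re-attach it with tf.data.experimental.assert_cardinality. A small sketch, using drop_remainder=True so the count is easy to compute:

import tensorflow as tf

range_ds = tf.data.Dataset.range(20)
window_size = 5

# drop_remainder=True keeps only full windows, so the count is predictable
windows = range_ds.window(window_size, shift=1, drop_remainder=True)
flat_windows = windows.flat_map(lambda x: x)

# 16 full windows of 5 elements each, flattened element-wise
n_elements = (20 - window_size + 1) * window_size
flat_windows = flat_windows.apply(
    tf.data.experimental.assert_cardinality(n_elements))

print(flat_windows.cardinality().numpy())  # 80

If the asserted number turns out to be wrong, the dataset raises an error when it is iterated, so this only shifts the bookkeeping onto you.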
def kl_divergence(p, p_hat):
    return (p * K.log(p / p_hat)) + ((1 - p) * K.log((1 - p) / (1 - p_hat)))

class SparseActivityRegularizer(Regularizer):
    sparsityBeta = None

    def __init__(self, l1=0., l2=0., p=0.01, sparsityBeta=0.1):
        self.p = p
        self.sparsityBeta = sparsityBeta

    def set_layer(self, layer):
        self.layer = layer

    def __call__(self, loss):
        # p_hat needs to be the average activation of the units in the hidden layer.
        p_hat = T.sum(T.mean(self.layer.get_output(True), axis=0))
        loss += self.sparsityBeta * kl_divergence(self.p, p_hat)
        return loss

    def get_config(self):
        return {"name": self.__class__.__name__,
                "p": self.l1}
When I call this custom regularizer in the model as shown below,
dr = 0.5
inputs = Input(shape=(392,))
x = Dense(1000, activation='relu', activity_regularizer=SparseActivityRegularizer())(inputs)
x = Dropout(dr)(x)
out = Dense(392, activation='sigmoid')(x)

model = Model(inputs=inputs, outputs=out)
model.compile(loss=euc_dist_keras,
              optimizer='adadelta', metrics=["accuracy"])
model.summary()

filepath = "weightdae.best.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1,
                             save_best_only=True, mode='min')
callbacks_list = [checkpoint, TensorBoard(log_dir='/tmp/autoencoder')]
hist = model.fit(ptilde, p,
                 nb_epoch=40,
                 shuffle=True,
                 validation_data=(ptilde_val, p_val),
                 batch_size=32,
                 callbacks=callbacks_list)
I get the following error:
AttributeError: 'SparseActivityRegularizer' object has no attribute 'layer'
Can someone please help me solve this error?
I have checked the implementation of the regularizer, and activity regularizers in Keras have been implemented in the same way.
But here it somehow cannot find the attribute 'layer' and throws this error.
This type of regularizer declaration was deprecated (see here). Since Keras 1.2.0, you must implement regularization as a function (like here) or as a callable class (like here).
Try this:
# Imports assumed for a tf.keras setup
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import backend as K


class SparseRegularizer(keras.regularizers.Regularizer):

    def __init__(self, rho=0.01, beta=1):
        """
        rho  : Desired average activation of the hidden units
        beta : Weight of the sparsity penalty term
        """
        self.rho = rho
        self.beta = beta

    def __call__(self, activation):
        rho = self.rho
        beta = self.beta
        # sigmoid because we need the probability distributions
        activation = tf.nn.sigmoid(activation)
        # average over the batch samples
        rho_bar = K.mean(activation, axis=0)
        # Avoid division by 0
        rho_bar = K.maximum(rho_bar, 1e-10)
        KLs = rho * K.log(rho / rho_bar) + (1 - rho) * K.log((1 - rho) / (1 - rho_bar))
        return beta * K.sum(KLs)  # sum over the layer units

    def get_config(self):
        return {
            'rho': self.rho,
            'beta': self.beta
        }
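A usage sketch wiring this class into a model shaped like the one in the question (layer sizes copied from there; rho and beta are example values, and a stock 'mse' loss stands in for the custom euc_dist_keras):

inputs = keras.Input(shape=(392,))
x = keras.layers.Dense(
    1000, activation='relu',
    activity_regularizer=SparseRegularizer(rho=0.01, beta=0.1))(inputs)
x = keras.layers.Dropout(0.5)(x)
out = keras.layers.Dense(392, activation='sigmoid')(x)

model = keras.Model(inputs=inputs, outputs=out)
model.compile(loss='mse', optimizer='adadelta')
model.summary()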