Model does not train properly when explicitly applying the gradients - python

I’m trying to constrain the weights of my model by explicitly applying the gradients; however, this is not working and I can’t figure out why.
I’m defining the model with the following function:
def init_model(num_hidden_layers=2, num_neurons_per_layer=64):
    model = tf.keras.Sequential()
    model.add(tf.keras.Input(shape=(2,)))
    for _ in range(num_hidden_layers):
        model.add(tf.keras.layers.Dense(num_neurons_per_layer, activation=tf.keras.layers.LeakyReLU(), kernel_initializer="glorot_uniform"))
    model.add(tf.keras.layers.Dense(1, kernel_initializer="glorot_uniform"))
    return model
When using the fit method, the loss function decreases and the model fits the data:
Nepochs = 1500
lr = 0.001

def my_loss(u_true, u_pred):
    return tf.math.reduce_mean(tf.math.square(u_true - u_pred))

model_0 = init_model(num_hidden_layers=2, num_neurons_per_layer=64)
optim_0 = tf.keras.optimizers.Adam(learning_rate=lr)
model_0.compile(loss=my_loss, optimizer=optim_0)
model_0.summary()
history_0 = model_0.fit(X_train, u_train, validation_data=(X_test.numpy(), u_test.numpy()), epochs=Nepochs, batch_size=X_train.shape[0])
When I explicitly specify and apply the gradient, the loss function stagnates and the output does not fit the data (it is uniform everywhere):
Nepochs = 1500
lr = 0.001

def compute_loss(model, X_data, u_data):
    u_pred = model(X_data)
    loss = tf.math.reduce_mean(tf.math.square(u_data - u_pred))
    return loss

@tf.function
def training(model, optim, X_train, u_train, X_test=None, u_test=None):
    if X_test is not None:
        validation_loss = compute_loss(model, X_test, u_test)
    else:
        validation_loss = None
    with tf.GradientTape(persistent=True) as tape:
        tape.watch(model.trainable_variables)
        loss = compute_loss(model, X_train, u_train)
    grad_theta = tape.gradient(loss, model.trainable_variables)
    optim.apply_gradients(zip(grad_theta, model.trainable_variables))
    return loss, validation_loss

model_G = init_model(num_hidden_layers=2, num_neurons_per_layer=64)
optim_G = tf.keras.optimizers.Adam(learning_rate=lr)
model_G.summary()

hist = {'val_loss': [], 'loss': []}

for i in range(Nepochs + 1):
    loss, val_loss = training(model_G, optim_G, X_train, u_train, X_test, u_test)
    hist['loss'].append(loss.numpy())
    hist['val_loss'].append(val_loss.numpy())
    if val_loss is not None:
        print('It {:05d}: loss = {:10.8e}, validation loss = {:10.8e} '.format(i, loss, val_loss))
    else:
        print('It {:05d}: loss = {:10.8e}'.format(i, loss))
Why do the two versions provide different results?
Thanks for the help.
Cesare

Finally, I found that expanding the dimension of the targets as follows:
u_train = tf.expand_dims(u_train, axis=-1)
u_test = tf.expand_dims(u_test, axis=-1)
makes the model train properly and the loss functions evaluate correctly.
u_train and u_test previously had shape (N,), i.e. just the number of entries; after expanding the dimension, the shape is (N, 1).
With fit the code works with both shapes; with the explicit gradient version, it only works with targets of shape (N, 1).
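A plausible explanation for the difference (my own note, not from the original thread): the model output has shape (N, 1), so inside the manual loss an (N,) target broadcasts against the (N, 1) prediction to an (N, N) matrix, and the mean is taken over the wrong quantity; Keras's fit applies some target/prediction shape matching before calling the loss, which hides the issue. A minimal sketch of the shape behaviour, with hypothetical values:
import tensorflow as tf

u_pred = tf.random.normal((5, 1))    # model output: shape (N, 1)
u_flat = tf.random.normal((5,))      # target as originally stored: shape (N,)
u_col = tf.expand_dims(u_flat, -1)   # target after expand_dims: shape (N, 1)

print((u_flat - u_pred).shape)  # (5, 5) -- broadcasted, silently wrong loss
print((u_col - u_pred).shape)   # (5, 1) -- elementwise, as intended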

Related

Can't backward pass two losses in Classification Transformer Model

For my model I'm using a RoBERTa transformer model and the Trainer from the Hugging Face Transformers library.
I calculate two losses:
lloss is a cross-entropy loss and dloss calculates the loss between hierarchy layers.
The total loss is the sum of lloss and dloss. (Based on this)
When calling total_loss.backward(), however, I get the error:
RuntimeError: Trying to backward through the graph a second time, but the buffers have already been freed
Any idea why that happens? Can I force it to only call backward once? Here is the loss calculation part:
dloss = calculate_dloss(prediction, labels, 3)
lloss = calculate_lloss(prediction, labels, 3)
total_loss = lloss + dloss
total_loss.backward()
def calculate_lloss(predictions, true_labels, total_level):
    '''Calculates the layer loss.'''
    loss_fct = nn.CrossEntropyLoss()
    lloss = 0
    for l in range(total_level):
        lloss += loss_fct(predictions[l], true_labels[l])
    return self.alpha * lloss

def calculate_dloss(predictions, true_labels, total_level):
    '''Calculates the dependence loss.'''
    dloss = 0
    for l in range(1, total_level):
        current_lvl_pred = torch.argmax(nn.Softmax(dim=1)(predictions[l]), dim=1)
        prev_lvl_pred = torch.argmax(nn.Softmax(dim=1)(predictions[l-1]), dim=1)
        D_l = self.check_hierarchy(current_lvl_pred, prev_lvl_pred, l)  # just a boolean tensor
        l_prev = torch.where(prev_lvl_pred == true_labels[l-1], torch.FloatTensor([0]).to(self.device), torch.FloatTensor([1]).to(self.device))
        l_curr = torch.where(current_lvl_pred == true_labels[l], torch.FloatTensor([0]).to(self.device), torch.FloatTensor([1]).to(self.device))
        dloss += torch.sum(torch.pow(self.p_loss, D_l*l_prev)*torch.pow(self.p_loss, D_l*l_curr) - 1)
    return self.beta * dloss
There is nothing wrong with having a loss that is the sum of two individual losses; here is a small proof of principle adapted from the docs:
import torch
import numpy
from sklearn.datasets import make_blobs

class Feedforward(torch.nn.Module):
    def __init__(self, input_size, hidden_size):
        super(Feedforward, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.fc1 = torch.nn.Linear(self.input_size, self.hidden_size)
        self.relu = torch.nn.ReLU()
        self.fc2 = torch.nn.Linear(self.hidden_size, 1)
        self.sigmoid = torch.nn.Sigmoid()

    def forward(self, x):
        hidden = self.fc1(x)
        relu = self.relu(hidden)
        output = self.fc2(relu)
        output = self.sigmoid(output)
        return output

def blob_label(y, label, loc):  # assign labels
    target = numpy.copy(y)
    for l in loc:
        target[y == l] = label
    return target

x_train, y_train = make_blobs(n_samples=40, n_features=2, cluster_std=1.5, shuffle=True)
x_train = torch.FloatTensor(x_train)
y_train = torch.FloatTensor(blob_label(y_train, 0, [0]))
y_train = torch.FloatTensor(blob_label(y_train, 1, [1, 2, 3]))

x_test, y_test = make_blobs(n_samples=10, n_features=2, cluster_std=1.5, shuffle=True)
x_test = torch.FloatTensor(x_test)
y_test = torch.FloatTensor(blob_label(y_test, 0, [0]))
y_test = torch.FloatTensor(blob_label(y_test, 1, [1, 2, 3]))

model = Feedforward(2, 10)
criterion = torch.nn.BCELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

model.eval()
y_pred = model(x_test)
before_train = criterion(y_pred.squeeze(), y_test)
print('Test loss before training', before_train.item())

model.train()
epoch = 20
for epoch in range(epoch):
    optimizer.zero_grad()
    # Forward pass
    y_pred = model(x_train)
    # Compute loss
    lossCE = criterion(y_pred.squeeze(), y_train)
    lossSQD = (y_pred.squeeze() - y_train).pow(2).mean()
    loss = lossCE + lossSQD
    print('Epoch {}: train loss: {}'.format(epoch, loss.item()))
    # Backward pass
    loss.backward()
    optimizer.step()
There must be a second time that you call backward, directly or indirectly, on some variable that then traverses through your graph. It is a bit too much to ask for the complete code here; only you can check this, or at least reduce it to a minimal example (while doing so, you might already find the issue). Apart from that, I would start by checking:
Does it already occur in the first iteration of training? If not: are you reusing any calculation results for the second iteration without a detach?
When you do backward on your losses individually lloss.backward() followed by dloss.backward() (this has the same effect as adding them together first as gradients are accumulated): what happens? This will let you track down for which of the two losses the error occurs.
After backward() your computation graph is freed, so for a second backward you need to create a new graph by providing inputs again. If you want to re-traverse the same graph after backward (for some reason), you need to pass retain_graph=True to backward. See retain_graph here.
P.S. As the summation of Tensors is automatically differentiable, summing the losses would not cause any issue in the backward.
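For completeness, here is a small sketch (my own, not taken from the asker's code) that reproduces the error when a shared intermediate is backpropagated twice, and shows the effect of retain_graph=True versus summing the losses first:
import torch

x = torch.randn(4, requires_grad=True)
shared = x ** 2               # intermediate reused by both losses

loss_a = shared.sum()
loss_b = (shared * 2).sum()

# Without retain_graph=True, the first backward frees the saved tensors of
# `shared`, and the second backward raises:
#   RuntimeError: Trying to backward through the graph a second time ...
loss_a.backward(retain_graph=True)
loss_b.backward()  # works because the graph was retained above

# Summing the losses and calling backward once traverses the graph a single
# time, so no retain_graph is needed:
x.grad = None
shared = x ** 2
total = shared.sum() + (shared * 2).sum()
total.backward()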

Custom Dynamic Loss function: No gradients provided for any variable

I am using an RGB dataset for my x train and the loss is calculated in a dynamic loss function that gets the distances of pairs and compares them against the ideal distance dist_train. Here is the model:
class MyModel(Model):
    def __init__(self):
        super(MyModel, self).__init__()
        self.d1 = Dense(3, activation='relu')
        self.flatten = Flatten()
        self.d2 = Dense(3, activation='relu')
        self.d3 = Dense(2)

    def call(self, x):
        x = self.d1(x)
        x = self.flatten(x)
        x = self.d2(x)
        return self.d3(x)

# Create an instance of the model
model = MyModel()
optimizer = tf.keras.optimizers.Adam()
train_loss = tf.keras.metrics.Mean(name='train_loss')
test_loss = tf.keras.metrics.Mean(name='test_loss')

@tf.function
def train_step(rgb):
    with tf.GradientTape() as tape:
        predictions = model(rgb, training=True)
        loss = tf_function(predictions)
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    train_loss(loss)
Here is the loss function and the tf.function wrapping it:
def mahal_loss(output):
    mahal = sp.spatial.distance.pdist(output, metric='mahalanobis')
    mahal = sp.spatial.distance.squareform(mahal, force='no', checks=True)
    new_distance = []
    mahal = np.ma.masked_array(mahal, mask=mahal==0)
    for i in range(len(mahal)):
        pw_dist = mahal[i, indices_train[i]]
        new_distance.append(pw_dist)
    mahal_loss = np.mean((dist_train - new_distance)**2)
    return mahal_loss

@tf.function(input_signature=[tf.TensorSpec(None, tf.float32)])
def tf_function(pred):
    y = tf.numpy_function(mahal_loss, [pred], tf.float32)
    return y
Running the model:
for epoch in range(EPOCHS):
    train_loss.reset_states()
    test_loss.reset_states()
    for i in x_train:
        train_step(i)
    print(
        f'Epoch {epoch + 1}, '
        f'Loss: {train_loss.result()}, '
        f'Test Loss: {test_loss.result()}, '
    )
I believe the reason I am running into problems lies in the dynamic loss function, as I need to calculate the distance between certain pairs to get the results I expect. This means that inside the loss function I have to calculate the mahalanobis distance of each pair to get the ones I will compare against the correct distances. The error I get is the following:
<ipython-input-23-0e975da5cbc2>:15 train_step  *
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
C:\Anaconda3\envs\colour_env\lib\site-packages\keras\optimizer_v2\optimizer_v2.py:622 apply_gradients  **
    grads_and_vars = optimizer_utils.filter_empty_gradients(grads_and_vars)
C:\Anaconda3\envs\colour_env\lib\site-packages\keras\optimizer_v2\utils.py:72 filter_empty_gradients
    raise ValueError("No gradients provided for any variable: %s." %
ValueError: No gradients provided for any variable: ['my_model/dense/kernel:0', 'my_model/dense/bias:0', 'my_model/dense_1/kernel:0', 'my_model/dense_1/bias:0', 'my_model/dense_2/kernel:0', 'my_model/dense_2/bias:0'].
The problem is the use of tf.numpy_function.
Specifically, everything that happens inside the with tf.GradientTape() as tape statement has to be differentiable. Because the conversion between tf.Tensor and numpy array is not differentiable, tf.numpy_function cannot be used for loss computation:
Since the function takes numpy arrays, you cannot take gradients through a numpy_function. If you require something that is differentiable, please consider using tf.py_function.
(Source: here in the official documentation)
So either wrap the loss computation in tf.py_function, as it accepts tf.Tensors, or consider implementing it in TensorFlow. Here is an example of that.
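In addition to the example referenced there, here is a rough sketch (my own, not from the original answer) of the second option: a pairwise-distance loss written entirely in TensorFlow ops, so that tape.gradient can trace from the loss back to the model's weights. It uses Euclidean rather than Mahalanobis distance, and target_dist stands in for the ideal distances (dist_train in the question):
import tensorflow as tf

def pairwise_distance_loss(predictions, target_dist):
    # predictions: (N, D) model outputs; target_dist: (N, N) ideal distances.
    # Squared pairwise distances via ||a - b||^2 = ||a||^2 + ||b||^2 - 2 a.b
    sq_norms = tf.reduce_sum(tf.square(predictions), axis=1, keepdims=True)  # (N, 1)
    sq_dists = sq_norms + tf.transpose(sq_norms) - 2.0 * tf.matmul(predictions, predictions, transpose_b=True)
    dists = tf.sqrt(tf.maximum(sq_dists, 1e-12))  # (N, N), differentiable
    return tf.reduce_mean(tf.square(target_dist - dists))
Because every operation here is a TensorFlow op, the loss stays inside the gradient tape and "No gradients provided" does not occur.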

Custom Training Loop with multiple passes through the model

Dear stackoverflow members,
I am currently trying to implement my own Keras Tuner training loop. In this loop I want to pass the input variable multiple times through the model, for example:
Y = Startvalue
for i in range(x):
    Y = model(Y)
I want to see if this method creates more stable simulations for my self-feedback problem.
When I implement it I get an OOM error, even when I do not loop. This error does not occur when I just do it normally.
Here is my class example (the OOM error occurs when I switch logits for logits2):
class MyTuner(kt.Tuner):
    def run_trial(self, trial, train_ds, validation_data):
        model = self.hypermodel.build(trial.hyperparameters)
        optimizer = tf.keras.optimizers.Adam()
        epoch_loss_metric = tf.keras.metrics.MeanSquaredError()

        def microbatch(T_IN, A_IN, D_IN):
            OUT_T = []
            OUT_A = []
            for i in range(len(T_IN)):
                A_IN_R = tf.expand_dims(tf.squeeze(A_IN[i]), 0)
                T_IN_R = tf.expand_dims(tf.squeeze(T_IN[i]), 0)
                D_IN_R = tf.expand_dims(tf.squeeze(D_IN[i]), 0)
                (OUT_T_R, OUT_A_R) = model((A_IN_R, T_IN_R, D_IN_R))
                OUT_T.append(tf.squeeze(OUT_T_R))
                OUT_A.append(tf.squeeze(OUT_A_R))
            return (tf.squeeze(tf.stack(OUT_T)), tf.squeeze(tf.stack(OUT_A)))

        def run_train_step(data):
            T_IN = tf.dtypes.cast(data[0][0], 'float32')
            A_IN = tf.dtypes.cast(data[0][1], 'float32')
            D_IN = tf.dtypes.cast(data[0][2], 'float32')
            A_Ta = tf.dtypes.cast(data[1][0], 'float32')
            T_Ta = tf.dtypes.cast(data[1][1], 'float32')
            mse = tf.keras.losses.MeanSquaredError()
            with tf.GradientTape() as tape:
                logits2 = microbatch(T_IN, A_IN, D_IN)
                logits = model([A_IN, T_IN, D_IN])
                loss = mse((T_Ta, A_Ta), logits2)
                # Add any regularization losses.
                if model.losses:
                    loss += tf.math.add_n(model.losses)
            gradients = tape.gradient(loss, model.trainable_variables)
            optimizer.apply_gradients(zip(gradients, model.trainable_variables))
            epoch_loss_metric.update_state((T_Ta, A_Ta), logits2)
            return loss

        for epoch in range(1000):
            print('Epoch: {}'.format(epoch))
            self.on_epoch_begin(trial, model, epoch, logs={})
            for batch, data in enumerate(train_ds):
                self.on_batch_begin(trial, model, batch, logs={})
                batch_loss = float(run_train_step(data))
                self.on_batch_end(trial, model, batch, logs={'loss': batch_loss})
                if batch % 100 == 0:
                    loss = epoch_loss_metric.result().numpy()
                    print('Batch: {}, Average Loss: {}'.format(batch, loss))
            epoch_loss = epoch_loss_metric.result().numpy()
            self.on_epoch_end(trial, model, epoch, logs={'loss': epoch_loss})
            epoch_loss_metric.reset_states()
In my understanding, the micro-batch function is not actually implementing a self-feedback loop (though that does not affect the OOM).
I guess what's happening is that because you are computing the output of the network k times, the memory consumed by the network increases by a factor of k (because it needs to store intermediate tensors for backprop).
What you can do is backpropagate the gradients at each self-feedback instance, so that the intermediate tensors do not accumulate beyond the memory limit.
Let me know if you have any doubts.
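A rough sketch of the per-step backprop idea mentioned above (my own, with placeholder names: model, optimizer, loss_fn, y0 and target stand in for the objects in the question): apply the gradients at every feedback step and detach the output before feeding it back, so each step builds and frees its own graph:
import tensorflow as tf

def self_feedback_train_step(model, optimizer, loss_fn, y0, target, k):
    y = y0
    for _ in range(k):
        with tf.GradientTape() as tape:
            y = model(y, training=True)
            loss = loss_fn(target, y)
        grads = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(grads, model.trainable_variables))
        # Stop the gradient so the next step's tape does not keep this step's
        # intermediate tensors alive.
        y = tf.stop_gradient(y)
    return loss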

Logistic regression model with L1 regularisation

I am trying to apply L1 regularization to a logistic regression model.
class LogisticRegression(nn.Module):
    def __init__(self):
        super().__init__()
        self.linear = nn.Linear(input_size, num_classes)

    def forward(self, x):
        x = x.reshape(-1, 784)
        output = self.linear(x)
        return output

    def training_step(self, batch):
        images, labels = batch
        output = self(images)
        loss = F.cross_entropy(output, labels)
        acc = accuracy(output, labels)
        return {'Training_loss': loss, 'Training_acc': acc}

    def training_epoch_end(self, outputs):
        batch_losses = [x['Training_loss'] for x in outputs]
        epoch_loss = torch.stack(batch_losses).mean()
        batch_accs = [x['Training_acc'] for x in outputs]
        epoch_acc = torch.stack(batch_accs).mean()
        return {'Training_loss': epoch_loss.item(), 'Training_acc': epoch_acc.item()}

    def epoch_end(self, epoch, result):
        print("Epoch [{}], Training_loss: {:.4f}, Training_acc: {:.4f}".format(epoch, result['Training_loss'], result['Training_acc']))

model = LogisticRegression()
But I think I am doing it wrong: the accuracy did not change.
L1 = 0.2

def evaluate(model_b, trainloader):
    outputs = [model_b.training_step(batch) for batch in trainloader]
    return model_b.training_epoch_end(outputs)

def fit(epochs, lr, model_b, trainloader, opt_func=torch.optim.SGD):
    history = []
    optimizer = opt_func(model_b.parameters(), lr)
    for epoch in range(epochs):
        ##### Training Phase
        for batch in trainloader:
            loss = model_b.training_step(batch)['Training_loss']
            loss_Lasso = loss + 0.5 * L1  # L1 reg
            loss_Lasso.backward()
            optimizer.step()
            optimizer.zero_grad()
        result = evaluate(model_b, trainloader)
        model_b.epoch_end(epoch, result)
        history.append(result)
    return history
Can anyone help me with what I am missing and how I can really apply L1 regularization?
Also, is L1 regularization called lasso?
I believe the l1-norm is a type of Lasso regularization, yes, but there are others.
In your snippet, L1 is set as a constant; instead, you should measure the l1-norm of your model's parameters and then sum it with your network's loss, as you did. In your example there is a single layer, so you only need self.linear's parameters. First gather all parameters, then measure the total norm with torch.norm. You could also use nn.L1Loss.
params = torch.cat([x.view(-1) for x in model.linear.parameters()])
L1 = lamb*torch.norm(params, p=1)
Where lamb is your lambda regularization parameter and model is initialized from the LogisticRegression class.
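Putting it together with the fit loop from the question, a minimal sketch (my own; lamb is a hypothetical regularization strength, and model_b, trainloader and optimizer are reused from the question) would look like this:
import torch

lamb = 0.2
for batch in trainloader:
    loss = model_b.training_step(batch)['Training_loss']
    # Recompute the L1 norm of the current parameters at every step.
    params = torch.cat([x.view(-1) for x in model_b.linear.parameters()])
    loss_lasso = loss + lamb * torch.norm(params, p=1)
    loss_lasso.backward()
    optimizer.step()
    optimizer.zero_grad()
The key difference from the original code is that the penalty depends on the parameters, so it contributes a gradient instead of being a constant offset.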

Loss on dev set is always increasing unlike training set loss

I designed a network for a text classification problem. To do this, I'm using Hugging Face Transformers' BERT model with a linear layer on top for fine-tuning. My problem is that the loss on the training set is decreasing, which is fine, but when it comes to the evaluation after each epoch on the development set, the loss increases with epochs. I'm posting my code to investigate whether there's something wrong with it.
for epoch in range(1, args.epochs + 1):
    total_train_loss = 0
    trainer.set_train()
    for step, batch in enumerate(train_dataloader):
        loss = trainer.step(batch)
        total_train_loss += loss
    avg_train_loss = total_train_loss / len(train_dataloader)
    logger.info(('Training loss for epoch %d/%d: %4.2f') % (epoch, args.epochs, avg_train_loss))
    print("\n-------------------------------")
    logger.info('Start validation ...')
    trainer.set_eval()
    y_hat = list()
    y = list()
    total_dev_loss = 0
    for step, batch_val in enumerate(dev_dataloader):
        true_labels_ids, predicted_labels_ids, loss = trainer.validate(batch_val)
        total_dev_loss += loss
        y.extend(true_labels_ids)
        y_hat.extend(predicted_labels_ids)
    avg_dev_loss = total_dev_loss / len(dev_dataloader)
    print(("\n-Total dev loss: %4.2f on epoch %d/%d\n") % (avg_dev_loss, epoch, args.epochs))
print("Training terminated!")
Following is the trainer file, which I use for doing a forward pass on a given batch and then backpropagating accordingly.
class Trainer(object):
    def __init__(self, args, model, device, data_points, is_test=False, train_stats=None):
        self.args = args
        self.model = model
        self.device = device
        self.loss = nn.CrossEntropyLoss(reduction='none')
        if is_test:
            # Should load the model from checkpoint
            self.model.eval()
            self.model.load_state_dict(torch.load(args.saved_model))
            logger.info('Loaded saved model from %s' % args.saved_model)
        else:
            self.model.train()
            self.optim = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
            total_steps = data_points * self.args.epochs
            self.scheduler = get_linear_schedule_with_warmup(self.optim, num_warmup_steps=0,
                                                             num_training_steps=total_steps)

    def step(self, batch):
        batch = tuple(t.to(self.device) for t in batch)
        batch_input_ids, batch_input_masks, batch_labels = batch
        self.model.zero_grad()
        outputs = self.model(batch_input_ids,
                             attention_mask=batch_input_masks,
                             labels=batch_labels)
        loss = self.loss(outputs, batch_labels)
        loss = loss.sum()
        (loss / loss.numel()).backward()
        torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
        self.optim.step()
        self.scheduler.step()
        return loss

    def validate(self, batch):
        batch = tuple(t.to(self.device) for t in batch)
        batch_input_ids, batch_input_masks, batch_labels = batch
        with torch.no_grad():
            model_output = self.model(batch_input_ids,
                                      attention_mask=batch_input_masks,
                                      labels=batch_labels)
            predicted_label_ids = self._predict(model_output)
            label_ids = batch_labels.to('cpu').numpy()
            loss = self.loss(model_output, batch_labels)
            loss = loss.sum()
        return label_ids, predicted_label_ids, loss

    def _predict(self, logits):
        return np.argmax(logits.to('cpu').numpy(), axis=1)
Finally, the following is my model (i.e., Classifier) class:
import torch.nn as nn
from transformers import BertModel

class Classifier(nn.Module):
    def __init__(self, args, is_eval=False):
        super(Classifier, self).__init__()
        self.bert_model = BertModel.from_pretrained(
            args.init_checkpoint,
            output_attentions=False,
            output_hidden_states=True,
        )
        self.is_eval_mode = is_eval
        self.linear = nn.Linear(768, 2)  # binary classification

    def switch_state(self):
        self.is_eval_mode = not self.is_eval_mode

    def forward(self, input_ids, attention_mask=None, labels=None):
        bert_outputs = self.bert_model(input_ids,
                                       token_type_ids=None,
                                       attention_mask=attention_mask)
        # Should give the logits to the linear layer
        model_output = self.linear(bert_outputs[1])
        return model_output
For visualization, here is the loss throughout the epochs (plot not shown):
When I've used BERT for text classification, my model has generally behaved as you describe. In part this is expected, because pre-trained models tend to require few epochs to fine-tune; in fact, if you check BERT's paper, the recommended number of epochs for fine-tuning is between 2 and 4.
On the other hand, I've usually found the optimum at just 1 or 2 epochs, which coincides with your case. My guess is that there is a trade-off when fine-tuning pre-trained models between fitting your downstream task and forgetting the weights learned at pre-training. Depending on the data you have, the equilibrium point may come sooner or later, and overfitting starts after that. But this paragraph is speculation based on my experience.
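One practical way to act on this (my suggestion, not part of the original answer) is to keep the checkpoint from the epoch with the lowest dev loss. A minimal sketch, where train_one_epoch and evaluate_dev are placeholders for the training and validation loops shown above, and args.saved_model and trainer come from the question:
import torch

best_dev_loss = float('inf')
for epoch in range(1, args.epochs + 1):
    train_one_epoch()              # placeholder: the training loop above
    avg_dev_loss = evaluate_dev()  # placeholder: the validation loop above
    if avg_dev_loss < best_dev_loss:
        best_dev_loss = avg_dev_loss
        # Keep only the weights from the best epoch so far.
        torch.save(trainer.model.state_dict(), args.saved_model)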
When the validation loss increases, it means your model is overfitting.
