Loss on dev set is always increasing unlike training set loss - python

I designed a network for a text classification problem. I'm using the Hugging Face Transformers BERT model with a linear layer on top for fine-tuning. My problem is that the loss on the training set is decreasing, which is fine, but when it comes to evaluation on the development set after each epoch, the loss is increasing with the epochs. I'm posting my code in case there's something wrong with it.
for epoch in range(1, args.epochs + 1):
total_train_loss = 0
trainer.set_train()
for step, batch in enumerate(train_dataloader):
loss = trainer.step(batch)
total_train_loss += loss
avg_train_loss = total_train_loss / len(train_dataloader)
logger.info(('Training loss for epoch %d/%d: %4.2f') % (epoch, args.epochs, avg_train_loss))
print("\n-------------------------------")
logger.info('Start validation ...')
trainer.set_eval()
y_hat = list()
y = list()
total_dev_loss = 0
for step, batch_val in enumerate(dev_dataloader):
true_labels_ids, predicted_labels_ids, loss = trainer.validate(batch_val)
total_dev_loss += loss
y.extend(true_labels_ids)
y_hat.extend(predicted_labels_ids)
avg_dev_loss = total_dev_loss / len(dev_dataloader)
print(("\n-Total dev loss: %4.2f on epoch %d/%d\n") % (avg_dev_loss, epoch, args.epochs))
print("Training terminated!")
The following is the trainer file, which I use to do a forward pass on a given batch and then backpropagate accordingly.
class Trainer(object):
def __init__(self, args, model, device, data_points, is_test=False, train_stats=None):
self.args = args
self.model = model
self.device = device
self.loss = nn.CrossEntropyLoss(reduction='none')
if is_test:
# Should load the model from checkpoint
self.model.eval()
self.model.load_state_dict(torch.load(args.saved_model))
logger.info('Loaded saved model from %s' % args.saved_model)
else:
self.model.train()
self.optim = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
total_steps = data_points * self.args.epochs
self.scheduler = get_linear_schedule_with_warmup(self.optim, num_warmup_steps=0,
num_training_steps=total_steps)
def step(self, batch):
batch = tuple(t.to(self.device) for t in batch)
batch_input_ids, batch_input_masks, batch_labels = batch
self.model.zero_grad()
outputs = self.model(batch_input_ids,
attention_mask=batch_input_masks,
labels=batch_labels)
loss = self.loss(outputs, batch_labels)
loss = loss.sum()
(loss / loss.numel()).backward()
torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
self.optim.step()
self.scheduler.step()
return loss
def validate(self, batch):
batch = tuple(t.to(self.device) for t in batch)
batch_input_ids, batch_input_masks, batch_labels = batch
with torch.no_grad():
model_output = self.model(batch_input_ids,
attention_mask=batch_input_masks,
labels=batch_labels)
predicted_label_ids = self._predict(model_output)
label_ids = batch_labels.to('cpu').numpy()
loss = self.loss(model_output, batch_labels)
loss = loss.sum()
return label_ids, predicted_label_ids, loss
def _predict(self, logits):
return np.argmax(logits.to('cpu').numpy(), axis=1)
Finally, the following is my model (i.e., Classifier) class:
import torch.nn as nn
from transformers import BertModel
class Classifier(nn.Module):
def __init__(self, args, is_eval=False):
super(Classifier, self).__init__()
self.bert_model = BertModel.from_pretrained(
args.init_checkpoint,
output_attentions=False,
output_hidden_states=True,
)
self.is_eval_mode = is_eval
self.linear = nn.Linear(768, 2) # binary classification
def switch_state(self):
self.is_eval_mode = not self.is_eval_mode
def forward(self, input_ids, attention_mask=None, labels=None):
bert_outputs = self.bert_model(input_ids,
token_type_ids=None,
attention_mask=attention_mask)
# Feed the pooled output (bert_outputs[1]) to the linear layer
model_output = self.linear(bert_outputs[1])
return model_output
For visualization, here is the loss throughout the epochs (plot not included in this post):

When I've used BERT for text classification, my models have generally behaved as you describe. In part this is expected, because pre-trained models tend to require few epochs to fine-tune; in fact, if you check the BERT paper, the recommended number of epochs for fine-tuning is between 2 and 4.
In my experience the optimum is usually found at just 1 or 2 epochs, which also matches your case. My guess is that there is a trade-off when fine-tuning pre-trained models between fitting the downstream task and forgetting the weights learned during pre-training. Depending on your data, the equilibrium point may come sooner or later, and overfitting starts after that. But this paragraph is speculation based on my experience.
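If it helps, here is a minimal sketch of keeping only the checkpoint from the best epoch by dev loss; the helper name and file path are hypothetical, and it would be called right after avg_dev_loss is computed in your epoch loop (e.g. best_dev_loss = save_if_best(trainer.model, avg_dev_loss, best_dev_loss)):

import torch

def save_if_best(model, avg_dev_loss, best_dev_loss, path="best_model.pt"):
    # Keep the checkpoint of the epoch with the lowest dev loss seen so far.
    if avg_dev_loss < best_dev_loss:
        torch.save(model.state_dict(), path)
        return avg_dev_loss
    return best_dev_loss

Initialize best_dev_loss = float("inf") before the loop and, once training ends, load the saved state_dict for testing instead of the last-epoch weights.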

When the validation loss increases, it means your model is overfitting.

Related

Walk Forward Validation in Pytorch LSTM

I'm currently building an LSTM model for predicting stock prices in PyTorch. I now want to implement a walk-forward validation method, but I couldn't find any resource on how to do that.
This is my current training loop:
#%%
lstm1 = LSTM1(num_classes, input_size, hidden_dim, num_layers, X_train_tensors_final.shape[1])
criterion = torch.nn.L1Loss()
optimizer = torch.optim.Adam(lstm1.parameters(), lr=learning_rate)
for epoch in range(num_epochs):
outputs = lstm1.forward(X_train_tensors_final)
optimizer.zero_grad() #clear gradients
loss = criterion(outputs, y_train_tensors)
loss.backward() # compute gradients of the loss w.r.t. the model parameters
optimizer.step() # update the parameters using the computed gradients
if epoch % 100 == 0:
print("Epoch: %d, loss: %1.5f" % (epoch, loss.item()))
df_X_ss = ss.transform(df.iloc[:, 0:-1])
df_y_mm = ss.transform(df.iloc[:, 0:1])
df_X_ss = Variable(torch.Tensor(df_X_ss))
df_y_mm = Variable(torch.Tensor(df_y_mm))
df_X_ss = torch.reshape(df_X_ss, (df_X_ss.shape[0], 1, df_X_ss.shape[1]))
train_predict = lstm1(df_X_ss)
data_predict = train_predict.data.numpy()
The model should now predict one step into the future and then calculate the absolute percentage error. For the next step, the model should use the actual y value instead of the predicted yhat to make its next prediction. What would be the best way of implementing this? Or is there some built-in function in PyTorch that would do this?
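A minimal sketch of such a one-step-ahead (walk-forward) evaluation is below. It assumes X holds the true input windows, shaped like X_train_tensors_final, and y the corresponding true targets; both names are placeholders for tensors built the same way as in the training code:

import torch

def walk_forward_validate(model, X, y):
    # At every step the model sees the actual history (never its own previous
    # prediction), predicts one step ahead, and the absolute percentage error
    # is recorded.
    model.eval()
    ape = []
    with torch.no_grad():
        for t in range(len(X)):
            x_t = X[t].unsqueeze(0)                 # shape (1, seq_len, n_features)
            y_hat = model(x_t).squeeze().item()
            y_true = y[t].item()
            ape.append(abs(y_true - y_hat) / max(abs(y_true), 1e-8))
    return sum(ape) / len(ape)                      # mean absolute percentage error

As far as I know there is no built-in walk-forward helper in PyTorch itself; this kind of rolling evaluation loop is usually written by hand.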

Training on a saved model after learning has stagnated?

I am training a single-layer neural network using PyTorch and saving the model after the validation loss decreases. Once the network has finished training, I load the saved model and pass my test set features through that (rather than the model from the last epoch) to see how well it does. However, more often than not, the validation loss will stop decreasing after about 150 epochs, and I'm worried that the network is overfitting the data. Would it be better for me to load the saved model during training if the validation loss has not decreased for some number of iterations (say, after 5 epochs), and then train on that saved model instead?
Also, are there any recommendations for how to avoid a situation where the validation loss stops decreasing? I've had some models where the validation loss continues to decrease even after 500 epochs and others where it stops decreasing after 100. Here is my code so far:
class NeuralNetwork(nn.Module):
def __init__(self, input_dim, output_dim, nodes):
super(NeuralNetwork, self).__init__()
self.linear1 = nn.Linear(input_dim, nodes)
self.tanh = nn.Tanh()
self.linear2 = nn.Linear(nodes, output_dim)
def forward(self, x):
output = self.linear1(x)
output = self.tanh(output)
output = self.linear2(output)
return output
epochs = 500 # (start small for now)
learning_rate = 0.01
w_decay = 0.1
momentum = 0.9
input_dim = 4
output_dim = 1
nodes = 8
model = NeuralNetwork(input_dim, output_dim, nodes)
criterion = nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, momentum=momentum, weight_decay=w_decay)
scheduler = ReduceLROnPlateau(optimizer, 'min', patience=5)
losses = []
val_losses = []
min_validation_loss = np.inf
means = [] # we want to store the mean and standard deviation for the test set later
stdevs = []
torch.save({
'epoch': 0,
'model_state_dict': model.state_dict(),
'optimizer_state_dict': optimizer.state_dict(),
'training_loss': 0.0,
'validation_loss': 0.0,
'means': [],
'stdevs': [],
}, new_model_path)
new_model_saved = True
for epoch in range(epochs):
curr_loss = 0.0
validation_loss = 0.0
if new_model_saved:
checkpoint = torch.load(new_model_path)
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
means = checkpoint['means']
stdevs = checkpoint['stdevs']
new_model_saved = False
model.train()
for i, batch in enumerate(train_dataloader):
x, y = batch
x, new_mean, new_std = normalize_data(x, means, stdevs)
means = new_mean
stdevs = new_std
optimizer.zero_grad()
predicted_outputs = model(x)
loss = criterion(torch.squeeze(predicted_outputs), y)
loss.backward()
optimizer.step()
curr_loss += loss.item()
model.eval()
for x_val, y_val in val_dataloader:
x_val, val_means, val_std = normalize_data(x_val, means, stdevs)
predicted_y = model(x_val)
loss = criterion(torch.squeeze(predicted_y), y_val)
validation_loss += loss.item()
curr_lr = optimizer.param_groups[0]['lr']
if epoch % 10 == 0:
print(f'Epoch {epoch} \t\t Training Loss: {curr_loss/len(train_dataloader)} \t\t Validation Loss: {validation_loss/len(val_dataloader)} \t\t Learning rate: {curr_lr}')
if min_validation_loss > validation_loss:
print(f' For epoch {epoch}, validation loss decreased ({min_validation_loss:.6f}--->{validation_loss:.6f}) \t learning rate: {curr_lr} \t saving the model')
min_validation_loss = validation_loss
torch.save({
'epoch': epoch,
'model_state_dict': model.state_dict(),
'optimizer_state_dict': optimizer.state_dict(),
'training_loss': curr_loss/len(train_dataloader),
'validation_loss': validation_loss/len(val_dataloader),
'means': means,
'stdevs': stdevs
}, new_model_path)
new_model_saved = True
losses.append(curr_loss/len(train_dataloader))
val_losses.append(validation_loss/len(val_dataloader))
scheduler.step(curr_loss/len(train_dataloader))
The phenomenon where the validation loss increases while the training loss decreases is called overfitting. Overfitting is a problem when training a model and should be avoided; please read more on this topic here. Overfitting may occur after any number of epochs and depends on many variables (learning rate, dataset size, dataset diversity, and more). As a rule of thumb, test your model at the "pivot point", i.e. exactly where the validation loss begins to increase (while the training loss continues to decrease). This means my recommendation is to save the model after each epoch where the validation loss decreases. If the validation loss keeps increasing for some number X of epochs, it probably means that you have reached a "deep" minimum of the loss and it will not be beneficial to keep training (again, this has some exceptions, but for this level of discussion it is enough).
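As a concrete illustration of the "X epochs without improvement" rule, here is a minimal sketch of a patience-based early-stopping helper (the class and its patience value are hypothetical, not part of the code above); it would be checked once per epoch next to the existing checkpoint-saving logic:

class EarlyStopping:
    """Signal a stop when the validation loss has not improved for `patience` epochs."""
    def __init__(self, patience=5):
        self.patience = patience
        self.best = float("inf")
        self.counter = 0

    def step(self, val_loss):
        # Returns True when training should stop.
        if val_loss < self.best:
            self.best = val_loss
            self.counter = 0
            return False
        self.counter += 1
        return self.counter >= self.patience

Create one instance before the epoch loop and break out of the loop when step(validation_loss / len(val_dataloader)) returns True; the best checkpoint saved earlier is then the model to keep.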
I encourage you to read and learn more about this subject; it is very interesting and has significant implications.

Validation of the dataset in GCN

I am using PyTorch. I am trying to do validation on my dataset to obtain the optimal number of channels in my neural network. I have the following code:
def train_during_validation():
for epoch in range (1, 201):
model.train()
optimizer.zero_grad()
out = model(data.x, data.edge_index)
loss = criterion(out[data.val_mask], data.y[data.val_mask])
loss.backward()
optimizer.step()
return loss
def validation():
loss_val = np.zeros(50, dtype = float)
model = GCN(hidden_channels = 1)
loss_val = train_during_validation()
print(loss_val)
validation()
In the code above I train the previously defined model with 16 channels and obtain a loss of 0.33. But as soon as I start doing validation over hidden_channels (see the code below), the loss does not go down (it remains at 1.95). I do not understand why. Can somebody explain?
def train_during_validation(model):
print(f'Model:{model}')
for epoch in range (1, 201):
model.train()
optimizer.zero_grad()
out = model(data.x, data.edge_index)
loss = criterion(out[data.val_mask], data.y[data.val_mask])
loss.backward()
optimizer.step()
return loss
def validation():
loss_val = np.zeros(50, dtype = float)
model = GCN(hidden_channels = 1)
for i in range (50):
model = GCN(hidden_channels = i)
#print(model)
loss_val[i] = train_during_validation(model)
print(loss_val[i])
validation()
Eventually I found the error:
the optimizer and the criterion must be defined (or re-created) inside train_during_validation(model); otherwise the optimizer built earlier still references the parameters of the old model, so the newly created GCN instances never actually train.
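A minimal sketch of that fix, rebuilding the optimizer (and the criterion) from the parameters of the model that was just created; the learning rate, weight decay, and CrossEntropyLoss are assumptions, since the original definitions are not shown:

import torch

def train_during_validation(model, data, epochs=200, lr=0.01):
    # The optimizer must be built from *this* model's parameters; an optimizer
    # created for an earlier GCN instance keeps updating the old model's weights,
    # which is why the loss appeared frozen.
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=5e-4)
    criterion = torch.nn.CrossEntropyLoss()
    for epoch in range(epochs):
        model.train()
        optimizer.zero_grad()
        out = model(data.x, data.edge_index)
        loss = criterion(out[data.val_mask], data.y[data.val_mask])
        loss.backward()
        optimizer.step()
    return loss.item()

In validation(), each GCN(hidden_channels=i) (starting from at least 1 channel) can then be passed to this function together with data, and its final loss recorded.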

Training Graph Neural Network (GNN) to create Embeddings using spektral

I am working to create a Graph Neural Network (GNN) which can create embeddings of an input graph for use in other applications, such as reinforcement learning.
I started with the example from the spektral library, TUDataset classification with GIN, and modified it to divide the network into two parts: the first part produces the embeddings and the second part produces the classification. My goal is to train this network with supervised learning on a dataset with graph labels, e.g. TUDataset, and, once trained, use the first part (embedding generation) in other applications.
I am getting different results with this approach on two different datasets. TUDataset shows improved loss and accuracy with the new approach, whereas the other, local dataset shows a significant increase in the loss.
Can I get any feedback on whether my approach to creating embeddings is appropriate, or any suggestions for further improvement?
Here is the code I use to generate graph embeddings:
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.metrics import categorical_accuracy
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.optimizers import Adam
from spektral.data import DisjointLoader
from spektral.datasets import TUDataset
from spektral.layers import GINConv, GlobalAvgPool
################################################################################
# PARAMETERS
################################################################################
learning_rate = 1e-3 # Learning rate
channels = 128 # Hidden units
layers = 3 # GIN layers
epochs = 300 # Number of training epochs
batch_size = 32 # Batch size
################################################################################
# LOAD DATA
################################################################################
dataset = TUDataset("PROTEINS", clean=True)
# Parameters
F = dataset.n_node_features # Dimension of node features
n_out = dataset.n_labels # Dimension of the target
# Train/test split
idxs = np.random.permutation(len(dataset))
split = int(0.9 * len(dataset))
idx_tr, idx_te = np.split(idxs, [split])
dataset_tr, dataset_te = dataset[idx_tr], dataset[idx_te]
loader_tr = DisjointLoader(dataset_tr, batch_size=batch_size, epochs=epochs)
loader_te = DisjointLoader(dataset_te, batch_size=batch_size, epochs=1)
################################################################################
# BUILD MODEL
################################################################################
class GIN0(Model):
def __init__(self, channels, n_layers):
super().__init__()
self.conv1 = GINConv(channels, epsilon=0, mlp_hidden=[channels, channels])
self.convs = []
for _ in range(1, n_layers):
self.convs.append(
GINConv(channels, epsilon=0, mlp_hidden=[channels, channels])
)
self.pool = GlobalAvgPool()
self.dense1 = Dense(channels, activation="relu")
def call(self, inputs):
x, a, i = inputs
x = self.conv1([x, a])
for conv in self.convs:
x = conv([x, a])
x = self.pool([x, i])
return self.dense1(x)
# Build model
model = GIN0(channels, layers)
model_op = Sequential()
model_op.add(Dropout(0.5, input_shape=(channels,)))
model_op.add(Dense(n_out, activation="softmax"))
opt = Adam(lr=learning_rate)
loss_fn = CategoricalCrossentropy()
################################################################################
# FIT MODEL
################################################################################
#tf.function(input_signature=loader_tr.tf_signature(), experimental_relax_shapes=True)
def train_step(inputs, target):
with tf.GradientTape(persistent=True) as tape:
node2vec = model(inputs, training=True)
predictions = model_op(node2vec, training=True)
loss = loss_fn(target, predictions)
loss += sum(model.losses)
gradients = tape.gradient(loss, model.trainable_variables)
opt.apply_gradients(zip(gradients, model.trainable_variables))
gradients2 = tape.gradient(loss, model_op.trainable_variables)
opt.apply_gradients(zip(gradients2, model_op.trainable_variables))
acc = tf.reduce_mean(categorical_accuracy(target, predictions))
return loss, acc
print("Fitting model")
current_batch = 0
model_lss = model_acc = 0
for batch in loader_tr:
lss, acc = train_step(*batch)
model_lss += lss.numpy()
model_acc += acc.numpy()
current_batch += 1
if current_batch == loader_tr.steps_per_epoch:
model_lss /= loader_tr.steps_per_epoch
model_acc /= loader_tr.steps_per_epoch
print("Loss: {}. Acc: {}".format(model_lss, model_acc))
model_lss = model_acc = 0
current_batch = 0
################################################################################
# EVALUATE MODEL
################################################################################
def tolist(predictions):
result = []
for item in predictions:
result.append((float(item[0]), float(item[1])))
return result
loss_data = []
print("Testing model")
model_lss = model_acc = 0
for batch in loader_te:
inputs, target = batch
node2vec = model(inputs, training=False)
predictions = model_op(node2vec, training=False)
predictions_list = tolist(predictions)
loss_data.append(zip(target,predictions_list))
model_lss += loss_fn(target, predictions)
model_acc += tf.reduce_mean(categorical_accuracy(target, predictions))
model_lss /= loader_te.steps_per_epoch
model_acc /= loader_te.steps_per_epoch
print("Done. Test loss: {}. Test acc: {}".format(model_lss, model_acc))
for batchi in loss_data:
for item in batchi:
print(list(item),'\n')
Your approach to generating graph embeddings is correct: the GIN0 model will return a vector given a graph.
This code here, however, seems weird:
gradients = tape.gradient(loss, model.trainable_variables)
opt.apply_gradients(zip(gradients, model.trainable_variables))
gradients2 = tape.gradient(loss, model_op.trainable_variables)
opt.apply_gradients(zip(gradients2, model_op.trainable_variables))
What you're doing here is computing the gradients of the same loss twice with a persistent tape and applying one set to model and the other to model_op. A single tape.gradient call over the combined list of trainable variables achieves the same update with half the gradient work.
When you compute the loss in the context of a tf.GradientTape, all computations that went into computing the final value are tracked. This means that if you call loss = foo(bar(x)) and then compute the training step using that loss, the weights of both foo and bar will be updated.
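For example, the two separate calls can be collapsed into one update over the combined variable list; this is a sketch that reuses the names from the question:

import tensorflow as tf
from tensorflow.keras.metrics import categorical_accuracy

def train_step(inputs, target):
    with tf.GradientTape() as tape:
        embeddings = model(inputs, training=True)           # GIN0 part
        predictions = model_op(embeddings, training=True)   # classification head
        loss = loss_fn(target, predictions) + sum(model.losses)
    variables = model.trainable_variables + model_op.trainable_variables
    gradients = tape.gradient(loss, variables)   # one gradient call covers both parts
    opt.apply_gradients(zip(gradients, variables))
    acc = tf.reduce_mean(categorical_accuracy(target, predictions))
    return loss, acc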
Besides this, I don't see issues with the code so it will mostly depend on the local dataset that you are using.
Cheers

Custom Training Loop with multiple model passes

Dear stackoverflow members,
I am currently trying to implement my own Keras Tuner training loop. In this loop I want to pass the input variable through the model multiple times, for example:
Y = Startvalue
for i in range(x):
Y = model(Y)
I want to see if this method creates more stable simulations for my self-feedback problem.
When I implement it, I get an OOM error even when I do not loop. This error does not occur when I just do it normally.
My class example (the OOM error occurs when I switch logits for logits2):
class MyTuner(kt.Tuner):
def run_trial(self, trial, train_ds, validation_data):
model = self.hypermodel.build(trial.hyperparameters)
optimizer = tf.keras.optimizers.Adam()
epoch_loss_metric = tf.keras.metrics.MeanSquaredError()
def microbatch(T_IN, A_IN, D_IN):
OUT_T = []
OUT_A = []
for i in range(len(T_IN)):
A_IN_R = tf.expand_dims(tf.squeeze(A_IN[i]), 0)
T_IN_R = tf.expand_dims(tf.squeeze(T_IN[i]), 0)
D_IN_R = tf.expand_dims(tf.squeeze(D_IN[i]), 0)
(OUT_T_R, OUT_A_R) = model((A_IN_R, T_IN_R, D_IN_R))
OUT_T.append(tf.squeeze(OUT_T_R))
OUT_A.append(tf.squeeze(OUT_A_R))
return(tf.squeeze(tf.stack(OUT_T)), tf.squeeze(tf.stack(OUT_A)))
def run_train_step(data):
T_IN = tf.dtypes.cast(data[0][0], 'float32')
A_IN = tf.dtypes.cast(data[0][1], 'float32')
D_IN = tf.dtypes.cast(data[0][2], 'float32')
A_Ta = tf.dtypes.cast(data[1][0], 'float32')
T_Ta = tf.dtypes.cast(data[1][1], 'float32')
mse = tf.keras.losses.MeanSquaredError()
with tf.GradientTape() as tape:
logits2 = microbatch(T_IN, A_IN, D_IN)
logits = model([A_IN, T_IN, D_IN])
loss = mse((T_Ta, A_Ta), logits2)
# Add any regularization losses.
if model.losses:
loss += tf.math.add_n(model.losses)
gradients = tape.gradient(loss, model.trainable_variables)
optimizer.apply_gradients(zip(gradients, model.trainable_variables))
epoch_loss_metric.update_state((T_Ta, A_Ta), logits2)
return loss
for epoch in range(1000):
print('Epoch: {}'.format(epoch))
self.on_epoch_begin(trial, model, epoch, logs={})
for batch, data in enumerate(train_ds):
self.on_batch_begin(trial, model, batch, logs={})
batch_loss = float(run_train_step(data))
self.on_batch_end(trial, model, batch, logs={'loss': batch_loss})
if batch % 100 == 0:
loss = epoch_loss_metric.result().numpy()
print('Batch: {}, Average Loss: {}'.format(batch, loss))
epoch_loss = epoch_loss_metric.result().numpy()
self.on_epoch_end(trial, model, epoch, logs={'loss': epoch_loss})
epoch_loss_metric.reset_states()
In my understanding, the micro-batch function does not implement a self-feedback loop (though that does not affect the OOM).
I guess what's happening is that because you are computing the output of the network k times, the memory consumption of the network increases by a factor of k (because it needs to store the intermediate tensors for backprop).
What you can do is backpropagate the gradients at each self-feedback instance, so that the intermediate tensors do not accumulate beyond the memory limit.
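A minimal sketch of that idea is below. It assumes, as in the pseudocode at the top of the question, a model whose output can be fed straight back as its input; model, optimizer, and loss_fn are the existing objects, while y0, targets, and n_steps are hypothetical names:

import tensorflow as tf

def self_feedback_train_step(y0, targets, n_steps):
    y = y0
    total_loss = 0.0
    for i in range(n_steps):
        with tf.GradientTape() as tape:
            y_next = model(y, training=True)
            loss = loss_fn(targets[i], y_next)
        grads = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(grads, model.trainable_variables))
        # Each step's tape is closed before the next one starts, so its
        # intermediate tensors can be freed; stop_gradient makes explicit that
        # the fed-back value carries no history.
        y = tf.stop_gradient(y_next)
        total_loss += float(loss)
    return total_loss / n_steps

Memory then stays roughly constant in the number of feedback steps, at the cost of not back-propagating through the whole chain.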
Let me know if you have any doubts.
