I have trained a BertClassifier model using PyTorch. After creating my best.pt I would like to put the model into production and use it to predict the class of a sample, so I resume it from the checkpoint. Then, after putting it in evaluation mode and freezing it, I call .predict on my sample, but I'm encountering this AttributeError. I also initialized the model before loading the checkpoint. Where am I going wrong? Thank you for your help!
def save_ckp(state, is_best, checkpoint_path, best_model_path):
    """
    Function created to save a checkpoint, the latest one and the best one.
    This creates flexibility: either you are interested in the state of the latest checkpoint or the best checkpoint.
    state: checkpoint we want to save
    is_best: is this the best checkpoint; min validation loss
    checkpoint_path: path to save checkpoint
    best_model_path: path to save best model
    """
    f_path = checkpoint_path
    # save checkpoint data to the path given, checkpoint_path
    torch.save(state, f_path)
    # if it is a best model, min validation loss
    if is_best:
        best_fpath = best_model_path
        # copy that checkpoint file to the best path given, best_model_path
        shutil.copyfile(f_path, best_fpath)
def load_ckp(checkpoint_fpath, model, optimizer):
    """
    checkpoint_fpath: path of the saved checkpoint to load
    model: model that we want to load checkpoint parameters into
    optimizer: optimizer we defined in previous training
    """
    # load the checkpoint
    checkpoint = torch.load(checkpoint_fpath)
    # initialize state_dict from checkpoint to model
    model.load_state_dict(checkpoint['state_dict'])
    # initialize optimizer from checkpoint to optimizer
    optimizer.load_state_dict(checkpoint['optimizer'])
    # initialize valid_loss_min from checkpoint to valid_loss_min
    valid_loss_min = checkpoint['valid_loss_min']
    # return model, optimizer, epoch value, min validation loss
    return model, optimizer, checkpoint['epoch'], valid_loss_min.item()
# Create the BertClassifier class
class BertClassifier(nn.Module):
    """Bert Model for Classification Tasks."""
    def __init__(self, freeze_bert=True):
        """
        @param bert: a BertModel object
        @param classifier: a torch.nn.Module classifier
        @param freeze_bert (bool): Set `False` to fine-tune the BERT model
        """
        super(BertClassifier, self).__init__()
        .......

    def forward(self, input_ids, attention_mask):
        '''
        Feed input to BERT and the classifier to compute logits.
        @param input_ids (torch.Tensor): an input tensor with shape (batch_size, max_length)
        @param attention_mask (torch.Tensor): a tensor that holds attention mask information with shape (batch_size, max_length)
        @return logits (torch.Tensor): an output tensor with shape (batch_size, num_labels)
        '''
        # Feed input to BERT
        outputs = self.bert(input_ids=input_ids,
                            attention_mask=attention_mask)
        # Extract the last hidden state of the `[CLS]` token for the classification task
        last_hidden_state_cls = outputs[0][:, 0, :]
        # Feed the [CLS] representation to the classifier to compute logits
        logits = self.classifier(last_hidden_state_cls)
        return logits
def initialize_model(epochs):
    """Initialize the Bert Classifier, the optimizer and the learning rate scheduler."""
    # Instantiate the Bert Classifier
    bert_classifier = BertClassifier(freeze_bert=False)
    # Tell PyTorch to run the model on the GPU
    bert_classifier = bert_classifier.to(device)
    # Create the optimizer
    optimizer = AdamW(bert_classifier.parameters(),
                      lr=lr,    # Default learning rate
                      eps=1e-8  # Default epsilon value
                      )
    # Total number of training steps
    total_steps = len(train_dataloader) * epochs
    # Set up the learning rate scheduler
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=0,  # Default value
                                                num_training_steps=total_steps)
    return bert_classifier, optimizer, scheduler
def train(model, train_dataloader, val_dataloader, valid_loss_min_input, checkpoint_path, best_model_path, start_epochs, epochs, evaluation=True):
    """Train the BertClassifier model."""
    # Start training loop
    logging.info("--Start training...\n")
    # Initialize tracker for minimum validation loss
    valid_loss_min = valid_loss_min_input
    for epoch_i in range(start_epochs, epochs):
        # =======================================
        #               Training
        # =======================================
        # Print the header of the result table
        logging.info(f"{'Epoch':^7} | {'Batch':^7} | {'Train Loss':^12} | {'Val Loss':^10} | {'Val Acc':^9} | {'Elapsed':^9}")
        # Measure the elapsed time of each epoch
        t0_epoch, t0_batch = time.time(), time.time()
        # Reset tracking variables at the beginning of each epoch
        total_loss, batch_loss, batch_counts = 0, 0, 0
        # Put the model into training mode
        model.train()
        # For each batch of training data...
        for step, batch in enumerate(train_dataloader):
            batch_counts += 1
            # Load batch to GPU
            b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)
            # Zero out any previously calculated gradients
            model.zero_grad()
            # Perform a forward pass. This will return logits.
            logits = model(b_input_ids, b_attn_mask)
            # Compute loss and accumulate the loss values
            loss = loss_fn(logits, b_labels)
            batch_loss += loss.item()
            total_loss += loss.item()
            # Perform a backward pass to calculate gradients
            loss.backward()
            # Clip the norm of the gradients to 1.0 to prevent "exploding gradients"
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            # Update parameters and the learning rate
            optimizer.step()
            scheduler.step()
            # Print the loss values and time elapsed every 500 batches and at the last batch
            if (step % 500 == 0 and step != 0) or (step == len(train_dataloader) - 1):
                # Calculate time elapsed since the last report
                time_elapsed = time.time() - t0_batch
                # Print training results
                logging.info(f"{epoch_i + 1:^7} | {step:^7} | {batch_loss / batch_counts:^12.6f} | {'-':^10} | {'-':^9} | {time_elapsed:^9.2f}")
                # Reset batch tracking variables
                batch_loss, batch_counts = 0, 0
                t0_batch = time.time()
        # Calculate the average loss over the entire training data
        avg_train_loss = total_loss / len(train_dataloader)
        logging.info("-" * 70)
        # =======================================
        #               Evaluation
        # =======================================
        if evaluation == True:
            # After the completion of each training epoch, measure the model's performance
            # on our validation set.
            val_loss, val_accuracy = evaluate(model, val_dataloader)
            # Print performance over the entire training data
            time_elapsed = time.time() - t0_epoch
            logging.info(f"{epoch_i + 1:^7} | {'-':^7} | {avg_train_loss:^12.6f} | {val_loss:^10.6f} | {val_accuracy:^10.6f} | {time_elapsed:^9.2f}")
            logging.info("-" * 70)
            logging.info("\n")
            # create checkpoint variable and add important data
            checkpoint = {
                'epoch': epoch_i + 1,
                'valid_loss_min': val_loss,
                'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict(),
            }
            # save the latest checkpoint
            save_ckp(checkpoint, False, checkpoint_path, best_model_path)
            # save the model if the validation loss has decreased
            if val_loss <= valid_loss_min:
                print('Validation loss decreased ({:.6f} --> {:.6f}). Saving model ...'.format(valid_loss_min, val_loss))
                # save checkpoint as the best model
                save_ckp(checkpoint, True, checkpoint_path, best_model_path)
                valid_loss_min = val_loss
    logging.info("-----------------Training complete--------------------------")
def evaluate(model, val_dataloader):
    """After the completion of each training epoch, measure the model's performance on our validation set."""
    # Put the model into evaluation mode. The dropout layers are disabled at test time.
    model.eval()
    # Tracking variables
    val_accuracy = []
    val_loss = []
    # For each batch in our validation set...
    for batch in val_dataloader:
        # Load batch to GPU
        b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)
        # Compute logits
        with torch.no_grad():
            logits = model(b_input_ids, b_attn_mask)
        # Compute loss
        loss = loss_fn(logits, b_labels)
        val_loss.append(loss.item())
        # Get the predictions
        preds = torch.argmax(logits, dim=1).flatten()
        # Calculate the accuracy rate
        accuracy = (preds == b_labels).cpu().numpy().mean() * 100
        val_accuracy.append(accuracy)
    # Compute the average accuracy and loss over the validation set.
    val_loss = np.mean(val_loss)
    val_accuracy = np.mean(val_accuracy)
    return val_loss, val_accuracy
bert_classifier, optimizer, scheduler = initialize_model(epochs=n_epochs)
train(model = bert_classifier ......)
bert_classifier, optimizer, scheduler = initialize_model(epochs=n_epochs)
model, optimizer, start_epoch, valid_loss_min = load_ckp(r"./best_model/best_model.pt", bert_classifier, optimizer)
model.eval()
model.freeze()
sample = {
    "seq": "ABCDE",
}
predictions = model.predict(sample)
AttributeError: 'BertClassifier' object has no attribute 'predict'
Generally, the prediction function is already written for you (by a library or the model's author). If not, you need to handle the low-level steps yourself.
After this line, you have loaded the trained parameters:
model, optimizer, start_epoch, valid_loss_min = load_ckp(r"./best_model/best_model.pt", bert_classifier, optimizer)
After that, you need to call model.forward(input_ids, attention_mask) yourself (the attention mask may be None, depending on the model).
You can see the forward method in your model: def forward(self, input_ids, attention_mask).
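For example, a minimal prediction helper could look like the sketch below. This is only an illustration under assumptions that are not in the question: it presumes you still have the tokenizer, max_len, and device used during training, and that a raw sample is a piece of text to classify.

# Hypothetical helper -- assumes the `tokenizer`, `max_len`, and `device` from training are available.
def predict_sample(model, text, tokenizer, max_len, device):
    model.eval()  # disable dropout for inference
    # Tokenize the raw text the same way the training data was tokenized
    encoded = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=max_len,
        padding="max_length",
        truncation=True,
        return_attention_mask=True,
        return_tensors="pt",
    )
    input_ids = encoded["input_ids"].to(device)
    attention_mask = encoded["attention_mask"].to(device)
    with torch.no_grad():                          # no gradients needed for inference
        logits = model(input_ids, attention_mask)  # this calls BertClassifier.forward
    probs = torch.softmax(logits, dim=1)
    return torch.argmax(probs, dim=1).item()       # index of the predicted class

# Usage (hypothetical):
# label = predict_sample(model, "ABCDE", tokenizer, max_len=64, device=device)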
Related
I'm currently building an LSTM model for predicting stock prices in PyTorch. I now want to implement a walk-forward validation method, but I couldn't find any resource on how to do that.
This is my current training loop:
#%%
lstm1 = LSTM1(num_classes, input_size, hidden_dim, num_layers, X_train_tensors_final.shape[1])
criterion = torch.nn.L1Loss()
optimizer = torch.optim.Adam(lstm1.parameters(), lr=learning_rate)

for epoch in range(num_epochs):
    outputs = lstm1.forward(X_train_tensors_final)
    optimizer.zero_grad()  # clear gradients
    loss = criterion(outputs, y_train_tensors)
    loss.backward()        # compute gradients of the loss
    optimizer.step()       # update parameters from the gradients, i.e. backprop
    if epoch % 100 == 0:
        print("Epoch: %d, loss: %1.5f" % (epoch, loss.item()))

df_X_ss = ss.transform(df.iloc[:, 0:-1])
df_y_mm = ss.transform(df.iloc[:, 0:1])
df_X_ss = Variable(torch.Tensor(df_X_ss))
df_y_mm = Variable(torch.Tensor(df_y_mm))
df_X_ss = torch.reshape(df_X_ss, (df_X_ss.shape[0], 1, df_X_ss.shape[1]))
train_predict = lstm1(df_X_ss)
data_predict = train_predict.data.numpy()
The model should now predict one step into the future, then calculate the absolute percentage error. For the next step, the model should use the actual y value instead of the predicted yhat to make its next prediction. What would be the best way of implementing this? Or is there some built-in function in PyTorch that would do this?
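For reference, there is no built-in walk-forward helper in PyTorch, so one common approach is to loop over the evaluation period manually. The sketch below is only an illustration under assumed names (a trained model, a scaled feature array X of shape (T, n_features), a target array y, and a window length): at each step the model predicts the next point from the actual history, the absolute percentage error is recorded, and the real observation, not the prediction, is used for the next step.

import numpy as np
import torch

# Hypothetical walk-forward evaluation sketch (names are assumptions, not the question's exact variables).
def walk_forward(model, X, y, start, window):
    model.eval()
    errors = []
    for t in range(start, len(y)):
        # Build the input from the *actual* history only; predictions are never fed back in
        history = X[t - window:t]
        x_t = torch.tensor(history, dtype=torch.float32).unsqueeze(0)  # shape (1, window, n_features)
        with torch.no_grad():
            y_hat = model(x_t).item()                    # one-step-ahead forecast
        errors.append(abs((y[t] - y_hat) / y[t]) * 100)  # absolute percentage error
    return np.mean(errors)                               # mean absolute percentage error

# mape = walk_forward(lstm1, X_scaled, y_values, start=window, window=window)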
I need to write code which trains a network on one single batch of training data and also computes the loss on the complete validation set for each epoch, with batch_size = 64.
I also need to plot the training and validation loss over epochs.
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.net_layer = Sequential(
            nn.Flatten(),
            nn.Linear(64*64, 30),
            nn.Sigmoid())

    def foward(self, x):
        x = self.net_layer(x)
        return x

model = Net()
nepochs = 2
losses = np.zeros(nepochs)
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

for epoch in range(nepochs):  # loop over the dataset multiple times
    # initialise variables for mean loss calculation
    running_loss = 0.0
    n = 0
    for data in train_loader:
        inputs, labels = data
        # Zero the parameter gradients to remove accumulated gradient from a previous iteration.
        optimizer.zero_grad()
        # Forward, backward, and update parameters
        outputs = model(inputs)          # running network
        loss = loss_fn(outputs, labels)  # calculating loss function
        loss.backward()                  # backpropagating network
        optimizer.step()                 # update model parameters with gradient descent
        # accumulate loss and increment minibatches
        running_loss += loss.item()
        n += 1
    # record the mean loss for this epoch and show progress
    losses[epoch] = running_loss / n
    print(f"epoch: {epoch+1} loss: {losses[epoch] : .3f}")
I got this far, but I'm getting an error. Any idea what I am doing wrong?
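For what it's worth, since the requirement is to train on a single fixed batch while tracking the loss on the complete validation set each epoch, one possible structure is sketched below. It is an assumption-laden illustration (it presumes train_loader and val_loader exist and yield (inputs, labels) pairs), not a fix for the specific error above.

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt

# Hypothetical sketch: train on ONE fixed batch, evaluate on the full validation set every epoch.
# Assumes `model`, `train_loader` (batch_size=64) and `val_loader` are already defined.
inputs, labels = next(iter(train_loader))   # grab a single training batch once
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

nepochs = 50
train_losses, val_losses = np.zeros(nepochs), np.zeros(nepochs)

for epoch in range(nepochs):
    model.train()
    optimizer.zero_grad()
    loss = loss_fn(model(inputs), labels)    # loss on the single training batch
    loss.backward()
    optimizer.step()
    train_losses[epoch] = loss.item()

    # Loss over the complete validation set
    model.eval()
    with torch.no_grad():
        batch_losses = [loss_fn(model(x), y).item() for x, y in val_loader]
    val_losses[epoch] = np.mean(batch_losses)

plt.plot(train_losses, label="train")
plt.plot(val_losses, label="validation")
plt.xlabel("epoch")
plt.ylabel("loss")
plt.legend()
plt.show()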
I am new to PyTorch and I am trying to run and test a GitHub model I found. The authors provided the model and the loss function, like this:
#1. Inference the model
model = PhysNet_padding_Encoder_Decoder_MAX(frames=128)
rPPG, x_visual, x_visual3232, x_visual1616 = model(inputs)
#2. Normalized the Predicted rPPG signal and GroundTruth BVP signal
rPPG = (rPPG-torch.mean(rPPG)) /torch.std(rPPG) # normalize
BVP_label = (BVP_label-torch.mean(BVP_label)) /torch.std(BVP_label) # normalize
#3. Calculate the loss
loss_ecg = Neg_Pearson(rPPG, BVP_label)
Dataloading
train_loader = torch.utils.data.DataLoader(train_set, batch_size = 20, shuffle = True)
batch = next(iter(train_loader))
data, label1, label2 = batch
inputs= data
Let's say I want to train this model for 15 epochs.
So this is what I have so far:
I am trying to set up the optimizer and the training loop, but I am not sure how to tie the custom loss and the data loading to the model, or how to set up the 15-epoch training correctly.
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
for epoch in range(15):
    ....
Any suggestions?
I assumed BVP_label is label1 of train_loader
train_loader = torch.utils.data.DataLoader(train_set, batch_size=20, shuffle=True)

# Using GPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

model = PhysNet_padding_Encoder_Decoder_MAX(frames=128)
model.to(device)

optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

for epoch in range(15):
    model.train()
    for inputs, label1, label2 in train_loader:
        # move the batch to the same device as the model
        inputs = inputs.to(device)
        BVP_label = label1.to(device)  # assumed BVP_label is label1
        rPPG, x_visual, x_visual3232, x_visual1616 = model(inputs)
        rPPG = (rPPG - torch.mean(rPPG)) / torch.std(rPPG)
        BVP_label = (BVP_label - torch.mean(BVP_label)) / torch.std(BVP_label)
        loss_ecg = Neg_Pearson(rPPG, BVP_label)
        optimizer.zero_grad()
        loss_ecg.backward()
        optimizer.step()
The PyTorch training steps are as follows:
1. Create the DataLoader.
2. Initialize the model and optimizer.
3. Create a device object and move the model to the device.
4. In the training loop:
   - select a mini-batch of data
   - use the model to make predictions
   - calculate the loss
   - call loss.backward() to compute the gradients of the model
   - update the parameters using the optimizer
As you may know, you can also check the official PyTorch tutorials:
Learning PyTorch with Examples
What is torch.nn really?
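For reference, here is a minimal, generic sketch of those steps. The dataset, model, and loss names below (train_set, MyModel, MSELoss) are placeholders, not the PhysNet code from the question.

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader

# Generic training skeleton following the steps above (placeholder names).
train_loader = DataLoader(train_set, batch_size=20, shuffle=True)        # 1. create the DataLoader
model = MyModel()                                                        # 2. initialize the model...
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)        #    ...and the optimizer
loss_fn = nn.MSELoss()                                                   #    (placeholder loss)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")  # 3. create a device object
model.to(device)                                                         #    and move the model to it

for epoch in range(15):                                                  # 4. training loop
    model.train()
    for inputs, targets in train_loader:                                 # select a mini-batch
        inputs, targets = inputs.to(device), targets.to(device)
        preds = model(inputs)                                            # make predictions
        loss = loss_fn(preds, targets)                                   # calculate the loss
        optimizer.zero_grad()
        loss.backward()                                                  # compute gradients
        optimizer.step()                                                 # update the parameters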
I was trying to create a neural network based on the model presented in this paper by Yoon Kim (https://arxiv.org/pdf/1408.5882.pdf) for sentence classification. I've built it in TensorFlow Keras, using padded sentences (with lemmatized words) as input and 3 categories ("positive", "neutral" or "negative") as output.
Below is the model I've built:
def create_CNN_model(window_sizes, feature_maps, sent_size, num_categs, embedding_matrix: np.array):
    inputs = Input(shape=(sent_size), dtype='float32', name='text_inputs')  # dim = (BATCH_SIZE, sent_size, embedding_dim)
    # initialize the embeddings with my own embeddings matrix
    embed = Embedding(embedding_matrix.shape[0], embedding_matrix.shape[1],
                      mask_zero=True, input_length=sent_size,
                      weights=[embedding_matrix])(inputs)
    # create array for max-pooled vectors of features
    ta = []
    # as we have multiple window sizes:
    for n_window in window_sizes:
        con = Conv1D(feature_maps, n_window, padding='causal',
                     activation="relu", use_bias=True)(embed)  # (BATCH_SIZE, sent_size-window_size+1, feature_maps)
        # the convolved tensor contains, for each window, a feature map of dimension feature_maps
        pooled = GlobalMaxPool1D(data_format='channels_last')(con)  # (BATCH_SIZE, sent_size-window_size+1)
        # then, the max pooling operation extracts the maximum of each feature map, reducing the rank of the tensor
        # the max-pooled tensor contains a feature for each window
        ta.append(pooled)
    concat = concatenate(ta, axis=1)
    dropped = Dropout(0.5)(concat)
    outputs = Dense(num_categs, activation="softmax", use_bias=True, kernel_regularizer=l2(l=3),
                    kernel_constraint=Dropout(0.5))(dropped)
    # create the model
    model = Model(inputs=[inputs], outputs=[outputs])
    # return the model
    return model
I've tried training this model with just 200 sentences, just to see whether it overfits the data. But instead of overfitting, the loss value just oscillates between 0 and 1. I've tried changing the learning rate to a value as small as 1e-8, but it did nothing.
Below is the function I've used for training:
def train_model(X_data, y_data, batch_sz, tf_model, max_patience, num_epochs, ln_rate):
    # Instantiate an optimizer to train the model.
    # optimizer = Adadelta(learning_rate=1e-3)
    optimizer = Adam(learning_rate=ln_rate)
    # Instantiate a loss function.
    loss_fn = CategoricalCrossentropy()
    # Prepare the metrics
    train_acc_metric = CategoricalAccuracy()
    val_acc_metric = CategoricalAccuracy()
    buffer_sz = len(X_data)
    patience = 0
    epochs = num_epochs
    last_val_acc = 0
    # Seed the random state for better reproducibility
    np.random.seed(123)
    # Create the checkpoints
    ckpt = train.Checkpoint(step=tf.Variable(1), optimizer=optimizer,
                            model=tf_model)
    manager = train.CheckpointManager(ckpt, './tf_ckpts', max_to_keep=3)
    # Create directory to save the trained model
    path = "./saved_model"
    print("\n----------------------------------------------")
    if not os.path.isdir(path):
        try:
            os.mkdir(path)
        except OSError:
            print("\nCreation of the directory %s failed \n" % path)
        else:
            print("\nSuccessfully created the directory %s \n" % path)
    else:
        print("\nDirectory %s already exists" % path)
    print("\n----------------------------------------------")
    print("\nStarting run script...\n",
          "Model will be saved to ", path, "\n",
          "Checkpoints will be restored from and saved to .\tf_ckpts")
    # Save model prior to training
    tf_model.save("./saved_model/tf_model")
    # Restart from the last checkpoint, if available
    ckpt.restore(manager.latest_checkpoint)
    print("\n----------------------------------------------")
    if manager.latest_checkpoint:
        print("\nRestored from {}".format(manager.latest_checkpoint))
    else:
        print("\nInitializing from scratch.")
    # beginning of training loop
    for epoch in range(epochs):
        print("\n----------------------------------------------")
        print('Start of epoch %d' % (epoch,))
        # re-shuffle data before each epoch
        np.random.shuffle(X_data)
        np.random.shuffle(y_data)
        # create the training dataset with 10-fold cross-validation
        train_dataset = make_dataset(X_data, y_data, 10)
        # Iterate over the batches of the dataset.
        for x_train, y_train, x_val, y_val in train_dataset:
            train_batches = tf.data.Dataset.from_tensor_slices((x_train, y_train))
            train_batches = train_batches.batch(batch_sz)
            for x_batch_train, y_batch_train in train_batches:
                with tf.GradientTape() as tape:
                    # calculate the forward run
                    logits = tf_model(x_batch_train)
                    # assert that the output and true label tensor shapes are equal
                    get_shape = y_batch_train.shape
                    tf.debugging.assert_shapes([
                        (logits, get_shape),
                    ], data=(y_batch_train, logits),
                        summarize=3, message="Inconsistent shape (labels,output): ",
                        name="assert_shapes")
                    # calculate the loss function
                    loss_value = loss_fn(y_batch_train, logits)
                    # add 1 to the step variable
                    ckpt.step.assign_add(1)
                    # Add extra losses created during this forward pass:
                    loss_value += sum(tf_model.losses)
                # calculate gradients
                grads = tape.gradient(loss_value, tf_model.trainable_weights)
                # backpropagate the gradients
                optimizer.apply_gradients(zip(grads, tf_model.trainable_weights))
                # Update training metric.
                train_acc_metric(y_batch_train, logits)
                # Save & log every 100 steps.
                if int(ckpt.step) % 100 == 0:
                    save_path = manager.save()
                    print("Saved checkpoint for step {}: {}".format(int(ckpt.step), save_path))
                    print("loss {:1.2f}".format(loss_value))
                    print('Seen so far: %s samples' % (int(ckpt.step) * batch_sz))
            # Run a cross-validation loop on each 10-fold dataset
            val_logits = tf_model(x_val)
            # Update val metrics
            val_acc_metric(y_val, val_logits)
        # Display metrics at the end of each epoch.
        train_acc = train_acc_metric.result()
        print('Training accuracy: ', float(train_acc))
        # Reset training metrics at the end of each epoch
        train_acc_metric.reset_states()
        print("----------")
        val_acc = val_acc_metric.result()
        print('Validation accuracy: ', float(val_acc))
        print("----------------------------------------------\n")
        val_acc_metric.reset_states()
        # Early stopping part
        if val_acc < last_val_acc:
            # If max_patience is exceeded, stop the training
            if patience >= max_patience:
                print("\n------------------------------------------------")
                print("Early stopping training to prevent over-fitting!")
                print("------------------------------------------------\n")
                break
            else:
                patience += 1
        # update the validation accuracy
        last_val_acc = val_acc
    # save the trained model
    tf_model.save("./saved_model/tf_model")
    print("\n------------------------------------------------")
    print("\nEnd of Training!\n")
And the result of the training:
----------------------------------------------
Successfully created the directory ./saved_model
----------------------------------------------
Starting run script...
Model will be saved to ./saved_model
Checkpoints will be restored from and saved to . f_ckpts
INFO:tensorflow:Assets written to: ./saved_model/tf_model/assets
----------------------------------------------
Initializing from scratch.
----------------------------------------------
Start of epoch 0
Training accuracy: 0.38999998569488525
----------
Validation accuracy: 0.38999998569488525
----------------------------------------------
----------------------------------------------
Start of epoch 1
Saved checkpoint for step 100: ./tf_ckpts/ckpt-1
loss 1.05
Seen so far: 2000 samples
Training accuracy: 0.4050000011920929
----------
Validation accuracy: 0.4050000011920929
----------------------------------------------
----------------------------------------------
Start of epoch 2
Saved checkpoint for step 200: ./tf_ckpts/ckpt-2
loss 1.10
Seen so far: 4000 samples
Training accuracy: 0.36000001430511475
----------
Validation accuracy: 0.36000001430511475
----------------------------------------------
----------------------------------------------
Start of epoch 3
Saved checkpoint for step 300: ./tf_ckpts/ckpt-3
loss 1.15
Seen so far: 6000 samples
Training accuracy: 0.375
----------
Validation accuracy: 0.375
----------------------------------------------
----------------------------------------------
Start of epoch 4
Saved checkpoint for step 400: ./tf_ckpts/ckpt-4
loss 1.17
Seen so far: 8000 samples
Training accuracy: 0.38999998569488525
----------
Validation accuracy: 0.38999998569488525
----------------------------------------------
----------------------------------------------
Start of epoch 5
Saved checkpoint for step 500: ./tf_ckpts/ckpt-5
loss 1.18
Seen so far: 10000 samples
Training accuracy: 0.3799999952316284
----------
Validation accuracy: 0.3799999952316284
----------------------------------------------
----------------------------------------------
Start of epoch 6
Saved checkpoint for step 600: ./tf_ckpts/ckpt-6
loss 1.09
Seen so far: 12000 samples
Training accuracy: 0.35499998927116394
----------
Validation accuracy: 0.35499998927116394
----------------------------------------------
----------------------------------------------
Start of epoch 7
Saved checkpoint for step 700: ./tf_ckpts/ckpt-7
loss 1.12
Seen so far: 14000 samples
Training accuracy: 0.3700000047683716
----------
Validation accuracy: 0.3700000047683716
----------------------------------------------
Any suggestions on how to make it converge?
I designed a network for a text classification problem. To do this, I'm using Hugging Face Transformers' BERT model with a linear layer on top of it for fine-tuning. My problem is that the loss on the training set is decreasing, which is fine, but when it comes to evaluation after each epoch on the development set, the loss increases with epochs. I'm posting my code so it can be checked for anything wrong.
for epoch in range(1, args.epochs + 1):
    total_train_loss = 0
    trainer.set_train()
    for step, batch in enumerate(train_dataloader):
        loss = trainer.step(batch)
        total_train_loss += loss
    avg_train_loss = total_train_loss / len(train_dataloader)
    logger.info(('Training loss for epoch %d/%d: %4.2f') % (epoch, args.epochs, avg_train_loss))
    print("\n-------------------------------")
    logger.info('Start validation ...')
    trainer.set_eval()
    y_hat = list()
    y = list()
    total_dev_loss = 0
    for step, batch_val in enumerate(dev_dataloader):
        true_labels_ids, predicted_labels_ids, loss = trainer.validate(batch_val)
        total_dev_loss += loss
        y.extend(true_labels_ids)
        y_hat.extend(predicted_labels_ids)
    avg_dev_loss = total_dev_loss / len(dev_dataloader)
    print(("\n-Total dev loss: %4.2f on epoch %d/%d\n") % (avg_dev_loss, epoch, args.epochs))
print("Training terminated!")
The following is the trainer file, which I use to do a forward pass on a given batch and then backpropagate accordingly.
class Trainer(object):
    def __init__(self, args, model, device, data_points, is_test=False, train_stats=None):
        self.args = args
        self.model = model
        self.device = device
        self.loss = nn.CrossEntropyLoss(reduction='none')
        if is_test:
            # Should load the model from checkpoint
            self.model.eval()
            self.model.load_state_dict(torch.load(args.saved_model))
            logger.info('Loaded saved model from %s' % args.saved_model)
        else:
            self.model.train()
            self.optim = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
            total_steps = data_points * self.args.epochs
            self.scheduler = get_linear_schedule_with_warmup(self.optim, num_warmup_steps=0,
                                                             num_training_steps=total_steps)

    def step(self, batch):
        batch = tuple(t.to(self.device) for t in batch)
        batch_input_ids, batch_input_masks, batch_labels = batch
        self.model.zero_grad()
        outputs = self.model(batch_input_ids,
                             attention_mask=batch_input_masks,
                             labels=batch_labels)
        loss = self.loss(outputs, batch_labels)
        loss = loss.sum()
        (loss / loss.numel()).backward()
        torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
        self.optim.step()
        self.scheduler.step()
        return loss

    def validate(self, batch):
        batch = tuple(t.to(self.device) for t in batch)
        batch_input_ids, batch_input_masks, batch_labels = batch
        with torch.no_grad():
            model_output = self.model(batch_input_ids,
                                      attention_mask=batch_input_masks,
                                      labels=batch_labels)
            predicted_label_ids = self._predict(model_output)
            label_ids = batch_labels.to('cpu').numpy()
            loss = self.loss(model_output, batch_labels)
            loss = loss.sum()
        return label_ids, predicted_label_ids, loss

    def _predict(self, logits):
        return np.argmax(logits.to('cpu').numpy(), axis=1)
Finally, the following is my model (i.e., Classifier) class:
import torch.nn as nn
from transformers import BertModel

class Classifier(nn.Module):
    def __init__(self, args, is_eval=False):
        super(Classifier, self).__init__()
        self.bert_model = BertModel.from_pretrained(
            args.init_checkpoint,
            output_attentions=False,
            output_hidden_states=True,
        )
        self.is_eval_mode = is_eval
        self.linear = nn.Linear(768, 2)  # binary classification

    def switch_state(self):
        self.is_eval_mode = not self.is_eval_mode

    def forward(self, input_ids, attention_mask=None, labels=None):
        bert_outputs = self.bert_model(input_ids,
                                       token_type_ids=None,
                                       attention_mask=attention_mask)
        # Feed the pooled output to the linear layer to produce the logits
        model_output = self.linear(bert_outputs[1])
        return model_output
(A plot of the training and development loss over the epochs was attached here.)
When I've used BERT for text classification, my model has generally behaved as you describe. In part this is expected, because pre-trained models tend to require few epochs to fine-tune; in fact, if you check BERT's paper, the number of epochs recommended for fine-tuning is between 2 and 4.
On the other hand, I've usually found the optimum at just 1 or 2 epochs, which also coincides with your case. My guess is that there is a trade-off when fine-tuning pre-trained models between fitting to your downstream task and forgetting the weights learned at pre-training. Depending on the data you have, the equilibrium point may happen sooner or later, and overfitting starts after that. But this paragraph is speculation based on my experience.
When the validation loss increases, it means your model is overfitting.
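A simple way to act on that is early stopping: keep the checkpoint with the lowest development loss and stop once it has not improved for a few epochs. Below is only a hedged sketch (the EarlyStopper helper and the patience value are my own, not part of the question's code); it would be driven by the avg_dev_loss computed in the question's validation loop.

import copy

# Hypothetical early-stopping helper, not part of the question's code.
class EarlyStopper:
    def __init__(self, model, patience=2):
        self.model = model
        self.patience = patience          # epochs to wait without improvement
        self.best_loss = float("inf")
        self.best_state = None
        self.bad_epochs = 0

    def update(self, dev_loss):
        """Record this epoch's dev loss; return True if training should stop."""
        if dev_loss < self.best_loss:
            self.best_loss = dev_loss
            self.best_state = copy.deepcopy(self.model.state_dict())  # remember the best weights
            self.bad_epochs = 0
            return False
        self.bad_epochs += 1
        return self.bad_epochs >= self.patience

    def restore_best(self):
        if self.best_state is not None:
            self.model.load_state_dict(self.best_state)

# Usage inside the epoch loop (hypothetical):
# stopper = EarlyStopper(trainer.model, patience=2)
# ...
# if stopper.update(avg_dev_loss):
#     break
# stopper.restore_best()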