Neural networks in PyTorch - Python

I am very new to PyTorch and am implementing my own image classifier network. However, for each epoch the training accuracy is very good while the validation accuracy is 0; I observed this up to the 5th epoch. I am using the Adam optimizer with a learning rate of 0.001, and I am also resampling the whole dataset into training and validation sets after each epoch. Please help me find where I am going wrong.
Here is my code:
import numpy as np
from torchvision import datasets
from torch.utils.data.sampler import SubsetRandomSampler

### where is data?
data_dir_train = '/home/sup/PycharmProjects/deep_learning/CNN_Data/training_set'
data_dir_test = '/home/sup/PycharmProjects/deep_learning/CNN_Data/test_set'

# Define your batch_size
batch_size = 64

# transformArr (the torchvision transforms pipeline) is defined elsewhere
allData = datasets.ImageFolder(root=data_dir_train, transform=transformArr)

# We need to further split our training dataset into training and validation sets.
def split_train_validation():
    # Define the indices
    num_train = len(allData)
    indices = list(range(num_train))  # start with all the indices in the training set
    split = int(np.floor(0.2 * num_train))  # define the split size
    #train_idx, valid_idx = indices[split:], indices[:split]
    # Random, non-contiguous split
    validation_idx = np.random.choice(indices, size=split, replace=False)
    train_idx = list(set(indices) - set(validation_idx))
    # define our samplers -- we use a SubsetRandomSampler because it will return
    # a random subset of the split defined by the given indices without replacement
    train_sampler = SubsetRandomSampler(train_idx)
    validation_sampler = SubsetRandomSampler(validation_idx)
    #train_loader = DataLoader(allData, batch_size=batch_size, sampler=train_sampler, shuffle=False, num_workers=4)
    #validation_loader = DataLoader(dataset=allData, batch_size=1, sampler=validation_sampler)
    return (train_sampler, validation_sampler)
Training
from torch.optim import Adam
import torch
import createNN
import torch.nn as nn
import loadData as ld
from torch.autograd import Variable
from torch.utils.data import DataLoader

# check if cuda - GPU support available
cuda = torch.cuda.is_available()

# create model, optimizer and loss function
model = createNN.ConvNet(class_num=2)
optimizer = Adam(model.parameters(), lr=.001, weight_decay=.0001)
loss_func = nn.CrossEntropyLoss()
if cuda:
    model.cuda()

# function to save model
def save_model(epoch):
    torch.save(model.load_state_dict(), 'imageClassifier_{}.model'.format(epoch))
    print('saved model at epoch', epoch)

def exp_lr_scheduler(epoch, init_lr=args.lr, weight_decay=args.weight_decay, lr_decay_epoch=cf.lr_decay_epoch):
    lr = init_lr * (0.5 ** (epoch // lr_decay_epoch))

def train(num_epochs):
    best_acc = 0.0
    for epoch in range(num_epochs):
        print('\n\nEpoch {}'.format(epoch))
        train_sampler, validation_sampler = ld.split_train_validation()
        train_loader = DataLoader(ld.allData, batch_size=30, sampler=train_sampler, shuffle=False)
        validation_loader = DataLoader(dataset=ld.allData, batch_size=1, sampler=validation_sampler)

        model.train()
        acc = 0.0
        loss = 0.0
        total = 0
        # train model with training data
        for i, (images, labels) in enumerate(train_loader):
            # if cuda then move to GPU
            if cuda:
                images = images.cuda()
                labels = labels.cuda()
            # Variable class wraps a tensor and we can calculate grad
            images = Variable(images)
            labels = Variable(labels)
            # reset accumulated gradients for each batch
            optimizer.zero_grad()
            # pass images to model which returns prediction
            output = model(images)
            # calculate the loss based on prediction and actual
            loss = loss_func(output, labels)
            # backpropagate the loss and compute gradient
            loss.backward()
            # update weights as per the computed gradients
            optimizer.step()

            # prediction class
            predVal, predClass = torch.max(output.data, 1)
            acc += torch.sum(predClass == labels.data)
            loss += loss.cpu().data[0]
            total += labels.size(0)

        # print the statistics
        train_acc = acc / total
        train_loss = loss / total
        print('Mean train acc = {} over epoch = {}'.format(epoch, acc))
        print('Mean train loss = {} over epoch = {}'.format(epoch, loss))

        # validate model with validation data
        model.eval()
        acc = 0.0
        loss = 0.0
        total = 0
        for i, (images, labels) in enumerate(validation_loader):
            # if cuda then move to GPU
            if cuda:
                images = images.cuda()
                labels = labels.cuda()
            # Variable class wraps a tensor and we can calculate grad
            images = Variable(images)
            labels = Variable(labels)
            # reset accumulated gradients for each batch
            optimizer.zero_grad()
            # pass images to model which returns prediction
            output = model(images)
            # calculate the loss based on prediction and actual
            loss = loss_func(output, labels)
            # backpropagate the loss and compute gradient
            loss.backward()
            # update weights as per the computed gradients
            optimizer.step()
            # prediction class
            predVal, predClass = torch.max(output.data, 1)
            acc += torch.sum(predClass == labels.data)
            loss += loss.cpu().data[0]
            total += labels.size(0)

        # print the statistics
        valid_acc = acc / total
        valid_loss = loss / total
        print('Mean train acc = {} over epoch = {}'.format(epoch, valid_acc))
        print('Mean train loss = {} over epoch = {}'.format(epoch, valid_loss))

        if best_acc < valid_acc:
            best_acc = valid_acc
            save_model(epoch)

        # at 30th epoch we save the model
        if epoch == 30:
            save_model(epoch)

train(20)

I think you did not take into account that acc += torch.sum(predClass == labels.data) returns a tensor instead of a float value. Depending on the version of PyTorch you are using, you should change it to:
acc += torch.sum(predClass == labels.data).cpu().data[0] #pytorch 0.3
acc += torch.sum(predClass == labels.data).item() #pytorch 0.4
Although your code seems to work with the old PyTorch version, I would recommend upgrading to version 0.4.
I also noticed some other problems/typos in your code.
You are re-splitting the dataset and re-creating the data loaders on every epoch:
for epoch in range(num_epochs):
    print('\n\nEpoch {}'.format(epoch))
    train_sampler, validation_sampler = ld.split_train_validation()
    train_loader = DataLoader(ld.allData, batch_size=30, sampler=train_sampler, shuffle=False)
    validation_loader = DataLoader(dataset=ld.allData, batch_size=1, sampler=validation_sampler)
    ...
That should not happen; doing it once, before the epoch loop, is enough:
train_sampler, validation_sampler = ld.split_train_validation()
train_loader = DataLoader(ld.allData, batch_size=30, sampler=train_sampler, shuffle=False)
validation_loader = DataLoader(dataset=ld.allData, batch_size=1, sampler=validation_sampler)

for epoch in range(num_epochs):
    print('\n\nEpoch {}'.format(epoch))
    ...
In the training part you have (this does not happen in the validation):
train_acc = acc/total
train_loss = loss / total
print('Mean train acc = {} over epoch = {}'.format(epoch,acc))
print('Mean train loss = {} over epoch = {}'.format(epoch, loss))
Here you are printing acc instead of train_acc (and the format arguments are swapped, so the epoch number ends up in the accuracy slot).
Also, in the validation part you print 'Mean train acc = {} over epoch = {}'.format(epoch, valid_acc) when the message should say something like 'Mean val acc'.
After changing these lines of code and using a standard model I created with the CIFAR dataset, the training seems to converge: accuracy increases at every epoch while the mean loss value decreases.
I hope this helps!
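Putting these fixes together, here is a minimal sketch of how the epoch loop could look with PyTorch 0.4+, reusing the names from the question (num_epochs, model, cuda, optimizer, loss_func, ld). Two further points are folded in: accuracy and loss are accumulated with .item(), and the validation pass does not call backward() or optimizer.step(), since validation data must not update the weights.

# Build the split and the loaders once, outside the epoch loop.
train_sampler, validation_sampler = ld.split_train_validation()
train_loader = DataLoader(ld.allData, batch_size=30, sampler=train_sampler, shuffle=False)
validation_loader = DataLoader(dataset=ld.allData, batch_size=1, sampler=validation_sampler)

for epoch in range(num_epochs):
    # ---- training ----
    model.train()
    acc, running_loss, total = 0.0, 0.0, 0
    for images, labels in train_loader:
        if cuda:
            images, labels = images.cuda(), labels.cuda()
        optimizer.zero_grad()
        output = model(images)
        loss = loss_func(output, labels)
        loss.backward()
        optimizer.step()
        _, predClass = torch.max(output, 1)
        acc += torch.sum(predClass == labels).item()   # .item() gives a Python number
        running_loss += loss.item()
        total += labels.size(0)
    print('Mean train acc = {} over epoch = {}'.format(acc / total, epoch))
    print('Mean train loss = {} over epoch = {}'.format(running_loss / total, epoch))

    # ---- validation: no backward() / optimizer.step() here ----
    model.eval()
    acc, running_loss, total = 0.0, 0.0, 0
    with torch.no_grad():
        for images, labels in validation_loader:
            if cuda:
                images, labels = images.cuda(), labels.cuda()
            output = model(images)
            loss = loss_func(output, labels)
            _, predClass = torch.max(output, 1)
            acc += torch.sum(predClass == labels).item()
            running_loss += loss.item()
            total += labels.size(0)
    print('Mean val acc = {} over epoch = {}'.format(acc / total, epoch))
    print('Mean val loss = {} over epoch = {}'.format(running_loss / total, epoch))

As a side note, save_model passes model.load_state_dict() to torch.save; it presumably should pass model.state_dict().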

Related

Understanding model training and evaluation in Pytorch

I am following some PyTorch deep learning code in which I saw model evaluation taking place within the training epoch!
Q) Should torch.no_grad() and model.eval() be outside of the training epoch loop?
Q) And how can I determine which parameters (weights) are being optimised by the optimizer during back-propagation?
...
for l in range(1):
    model = GTN(num_edge=A.shape[-1],
                num_channels=num_channels, w_in=node_features.shape[1], w_out=node_dim,
                num_class=num_classes, num_layers=num_layers, norm=norm)
    if adaptive_lr == 'false':
        optimizer = torch.optim.Adam(model.parameters(), lr=0.005, weight_decay=0.001)
    else:
        optimizer = torch.optim.Adam([{'params': model.weight},
                                      {'params': model.linear1.parameters()},
                                      {'params': model.linear2.parameters()},
                                      {"params": model.layers.parameters(), "lr": 0.5}],
                                     lr=0.005, weight_decay=0.001)
    loss = nn.CrossEntropyLoss()
    # Train & Valid & Test
    best_val_loss = 10000
    best_train_loss = 10000
    best_train_f1 = 0
    best_val_f1 = 0
    for i in range(epochs):
        print('Epoch: ', i+1)
        model.zero_grad()
        model.train()
        loss, y_train, Ws = model(A, node_features, train_node, train_target)
        train_f1 = torch.mean(f1_score(torch.argmax(y_train.detach(), dim=1), train_target, num_classes=num_classes)).cpu().numpy()
        print('Train - Loss: {}, Macro_F1: {}'.format(loss.detach().cpu().numpy(), train_f1))
        loss.backward()
        optimizer.step()
        model.eval()
        # Valid
        with torch.no_grad():
            val_loss, y_valid, _ = model.forward(A, node_features, valid_node, valid_target)
            val_f1 = torch.mean(f1_score(torch.argmax(y_valid, dim=1), valid_target, num_classes=num_classes)).cpu().numpy()
        if val_f1 > best_val_f1:
            best_val_loss = val_loss.detach().cpu().numpy()
            best_train_loss = loss.detach().cpu().numpy()
            best_train_f1 = train_f1
            best_val_f1 = val_f1
        print('---------------Best Results--------------------')
        print('Train - Loss: {}, Macro_F1: {}'.format(best_train_loss, best_train_f1))
        print('Valid - Loss: {}, Macro_F1: {}'.format(best_val_loss, best_val_f1))
    final_f1 += best_test_f1
For each epoch, you are doing training followed by validation/test. For validation/test you are moving the model to evaluation mode using model.eval() and then doing the forward pass under torch.no_grad(), which is correct. Likewise, you are moving the model back to training mode using model.train() at the start of the training step. There is no issue with the code, and you are using the model modes correctly.
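For reference, the usual pattern, stripped down to a minimal sketch with generic placeholder names (not the GTN code above), is:

for epoch in range(epochs):
    # training step: train mode enables dropout and batch-norm updates
    model.train()
    optimizer.zero_grad()
    out = model(x_train)                  # x_train / y_train are placeholder names
    loss = criterion(out, y_train)
    loss.backward()
    optimizer.step()

    # validation step: eval mode + no_grad, so no graph is built and no weights change
    model.eval()
    with torch.no_grad():
        val_out = model(x_val)
        val_loss = criterion(val_out, y_val)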
In your code, if adaptive_lr is 'false' then you are optimizing the parameters given by model.parameters(), and otherwise you are optimizing (see the sketch after this list for a way to verify it):
model.weight
model.linear1.parameters()
model.linear2.parameters()
model.layers.parameters()
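To check which tensors the optimizer will actually update, you can inspect optimizer.param_groups; a small sketch:

# Each param group holds its own hyperparameters (e.g. the {"lr": 0.5} group above)
# plus the list of tensors it updates. Only tensors with requires_grad=True receive updates.
for idx, group in enumerate(optimizer.param_groups):
    print('group', idx, '-> lr =', group['lr'], ', weight_decay =', group['weight_decay'])
    for p in group['params']:
        print('   shape:', tuple(p.shape), ', requires_grad:', p.requires_grad)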

Walk Forward Validation in Pytorch LSTM

I'm currently building an LSTM model for predicting stock prices in PyTorch. I now want to implement a walk-forward validation method, but I couldn't find any resources on how to do that.
This is my current training loop:
#%%
lstm1 = LSTM1(num_classes, input_size, hidden_dim, num_layers, X_train_tensors_final.shape[1])
criterion = torch.nn.L1Loss()
optimizer = torch.optim.Adam(lstm1.parameters(), lr=learning_rate)

for epoch in range(num_epochs):
    outputs = lstm1.forward(X_train_tensors_final)
    optimizer.zero_grad()   # clear gradients
    loss = criterion(outputs, y_train_tensors)
    loss.backward()         # calculates the loss of the loss function
    optimizer.step()        # improve from loss, i.e. backprop
    if epoch % 100 == 0:
        print("Epoch: %d, loss: %1.5f" % (epoch, loss.item()))

df_X_ss = ss.transform(df.iloc[:, 0:-1])
df_y_mm = ss.transform(df.iloc[:, 0:1])
df_X_ss = Variable(torch.Tensor(df_X_ss))
df_y_mm = Variable(torch.Tensor(df_y_mm))
df_X_ss = torch.reshape(df_X_ss, (df_X_ss.shape[0], 1, df_X_ss.shape[1]))
train_predict = lstm1(df_X_ss)
data_predict = train_predict.data.numpy()
The model should now predict one step into the future, then calculate the absolute percentage error. For the next step, the model should use the actual y value instead of the predicted yhat to make its next prediction. What would be the best way of implementing this? Or is there some built-in function in PyTorch that would do this?
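There is no built-in walk-forward helper in PyTorch, so the loop has to be written explicitly. A minimal sketch of the idea described above, purely illustrative: X and y are assumed to be the already scaled feature matrix and target series, window is a hypothetical look-back length, and the trained lstm1 from the snippet is reused.

import numpy as np
import torch

window = 30          # hypothetical look-back length
ape = []             # absolute percentage error per step

lstm1.eval()
with torch.no_grad():
    for t in range(window, len(y)):
        # Feed only *actual* history up to t (never the model's own predictions),
        # shaped (batch=1, seq_len=window, features) for the LSTM.
        hist = torch.tensor(X[t - window:t], dtype=torch.float32).unsqueeze(0)
        y_hat = lstm1(hist).squeeze().item()
        ape.append(abs(y[t] - y_hat) / abs(y[t]))

print('Walk-forward MAPE: {:.2%}'.format(np.mean(ape)))

If the model should also be re-fitted as the window rolls forward, the training loop from the snippet can be re-run inside this loop on the data up to t before predicting.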

invalid data type 'numpy.str_' when training bert model with pytorch

My data is divided into two parts, a training and a validation one. I used the load_dataset and DataLoader functions, and I convert the data in the dataset to torch format using traindataset.set_format.
When starting training I got the error
new(): invalid data type 'numpy.str_'
in this line:
for step,batch in enumerate(train_dataloader):
How can I fix this error?
model = MixModel()
#model.load_state_dict(torch.load(r"/media/sh/saved_weightscnnbert.pt"))
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
traindataset = load_dataset('csv', data_files='/content/drive//My Drive/Colab Notebooks/newdataset/newdata_train2', split='train')
testdataset = load_dataset('csv', data_files='/content/drive//My Drive/Colab Notebooks/newdataset/newdata_valid2', split='train')
traindataset = traindataset.map(encode)
testdataset1 = testdataset.map(encode)
traindataset = traindataset.map(lambda examples: {'labels': examples['symptoms']}, batched=True)
testdataset = testdataset1.map(lambda examples: {'labels': examples['symptoms']}, batched=True)
traindataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
testdataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
train_dataloader = torch.utils.data.DataLoader(traindataset, batch_size=64)
test_dataloader = torch.utils.data.DataLoader(testdataset, batch_size=64)

# function to train the model
def train():
    model.train()
    total_loss, total_accuracy = 0, 0
    # empty list to save model predictions
    total_preds = []
    Labels = []
    # iterate over batches
    for step, batch in enumerate(train_dataloader):
        # progress update after every 50 batches.
        if step % 100 == 0 and not step == 0:
            print(' Batch {:>5,} of {:>5,}.'.format(step, len(train_dataloader)))
        sent_id, mask, labels = batch['input_ids'], batch['attention_mask'], batch['labels']
        # clear previously calculated gradients
        model.zero_grad()
        # get model predictions for the current batch
        preds = model(sent_id, mask, labels)
        # compute the loss between actual and predicted values
        alpha = 0.25
        gamma = 2
        ce_loss = loss_fn(preds, labels)
        #pt = torch.exp(-ce_loss)
        #focal_loss = (alpha * (1-pt)**gamma * ce_loss).mean()  # mean over the batch
        # add on to the total loss
        total_loss = total_loss + ce_loss.item()
        # backward pass to calculate the gradients
        ce_loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        # update parameters
        optimizer.step()
        preds = torch.argmax(preds, dim=1)
        total_preds.append(preds)
        total_accuracy += (preds == labels).float().sum()
    # compute the training loss of the epoch
    avg_loss = total_loss / len(train_dataloader)
    avg_accuracy = total_accuracy / len(traindataset)
    # predictions are in the form of (no. of batches, size of batch, no. of classes).
    # reshape the predictions in form of (number of samples, no. of classes)
    total_preds = np.concatenate(total_preds, axis=0)
    # returns the loss and predictions
    return avg_loss, total_preds, avg_accuracy
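A likely cause, given the snippet: the 'labels' column is copied from the string column 'symptoms', and neither set_format(type='torch') nor the DataLoader's default collation can turn strings into tensors, which produces new(): invalid data type 'numpy.str_'. A hedged sketch of mapping the string labels to integer ids before set_format (label2id is a hypothetical mapping, replacing the two map(lambda examples: ...) calls above):

# Hypothetical mapping from string labels to integer ids.
label_names = sorted(set(traindataset['symptoms']))
label2id = {name: i for i, name in enumerate(label_names)}

# Store integer ids in 'labels' instead of the raw strings, so that
# set_format(type='torch') can build a tensor the DataLoader can collate.
traindataset = traindataset.map(lambda ex: {'labels': label2id[ex['symptoms']]})
testdataset = testdataset1.map(lambda ex: {'labels': label2id[ex['symptoms']]})

traindataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
testdataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])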

What is causing this NotImplementedError in Pytorch?

I need to write code which trains a network given one single batch of training data and also computes the loss on the complete validation set for each epoch. Set batch_size = 64.
I also need to plot the training and validation loss over the epochs.
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.net_layer = Sequential(
            nn.Flatten(),
            nn.Linear(64*64, 30),
            nn.Sigmoid())

    def foward(self, x):
        x = self.net_layer(x)
        return x

model = Net()
nepochs = 2
losses = np.zeros(nepochs)
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

for epoch in range(nepochs):  # loop over the dataset multiple times
    # initialise variables for mean loss calculation
    running_loss = 0.0
    n = 0
    for data in train_loader:
        inputs, labels = data
        # Zero the parameter gradients to remove accumulated gradient from a previous iteration.
        optimizer.zero_grad()
        # Forward, backward, and update parameters
        outputs = model(inputs)          # running network
        loss = loss_fn(outputs, labels)  # calculating loss function
        loss.backward()                  # backpropagating network
        optimizer.step()                 # update model parameters with gradient descent
        # accumulate loss and increment minibatches
        running_loss += loss.item()
        n += 1
    # record the mean loss for this epoch and show progress
    losses[epoch] = running_loss / n
    print(f"epoch: {epoch+1} loss: {losses[epoch] : .3f}")
I got this far and am getting the following error:
error message
Any idea what I am doing wrong?
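A likely culprit, judging from the snippet: the method is spelled foward, so Net never overrides nn.Module.forward, and calling model(inputs) falls back to the base class implementation, which raises NotImplementedError. A minimal sketch of the corrected class:

import torch.nn as nn

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.net_layer = nn.Sequential(
            nn.Flatten(),
            nn.Linear(64 * 64, 30),
            nn.Sigmoid())

    def forward(self, x):          # note the spelling: forward, not foward
        return self.net_layer(x)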

Efficient metrics evaluation in PyTorch

I am new to PyTorch and want to efficiently evaluate, among other metrics, F1 during my training and validation loops.
So far, my approach has been to calculate the predictions on the GPU, then push them to the CPU and append them to a vector for both training and validation. After training and validation, I would evaluate both for each epoch using sklearn. However, profiling my code showed that pushing to the CPU is quite a bottleneck.
for epoch in range(n_epochs):
    model.train()
    avg_loss = 0
    avg_val_loss = 0
    train_pred = np.array([])
    val_pred = np.array([])
    # Training loop (transpose X_batch to fit pretrained (features, samples) style)
    for X_batch, y_batch in train_loader:
        scores = model(X_batch)
        y_pred = F.softmax(scores, dim=1)
        train_pred = np.append(train_pred, self.get_vector(y_pred.detach().cpu().numpy()))
        loss = loss_fn(scores, self.get_vector(y_batch))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        avg_loss += loss.item() / len(train_loader)

    model.eval()
    # Validation loop
    for X_batch, y_batch in val_loader:
        with torch.no_grad():
            scores = model(X_batch)
            y_pred = F.softmax(scores, dim=1)
            val_pred = np.append(val_pred, self.get_vector(y_pred.detach().cpu().numpy()))
            loss = loss_fn(scores, self.get_vector(y_batch))
            avg_val_loss += loss.item() / len(val_loader)

    # Model Checkpoint for best validation f1
    val_f1 = self.calculate_metrics(train_targets[val_index], val_pred, f1_only=True)
    if val_f1 > best_val_f1:
        prev_best_val_f1 = best_val_f1
        best_val_f1 = val_f1
        torch.save(model.state_dict(), self.PATHS['xlm'])
        evaluated_epoch = epoch

    # Calc the metrics
    self.save_metrics(train_targets[train_index], train_pred, avg_loss, 'train')
    self.save_metrics(train_targets[val_index], val_pred, avg_val_loss, 'val')
I am certain there is a more efficient way to a) store the predictions without having to push them to the CPU each batch, and b) calculate the metrics directly on the GPU.
As I am new to PyTorch, I am very grateful for any hints and feedback :)
You can compute the F-score yourself in PyTorch. The F1-score is defined for single-class (true/false) classification only. The only things you need to aggregate are:
Count of the class in the ground truth target data;
Count of the class in the predictions;
Count how many times the class was correctly predicted.
Let's assume you want to compute the F1 score for the class with index 0 in your softmax. In every batch, you can do:
predicted_classes = torch.argmax(y_pred, dim=1) == 0
target_classes = self.get_vector(y_batch)
target_true += torch.sum(target_classes == 0).float()
predicted_true += torch.sum(predicted_classes).float()
# correctly predicted: the model predicted class 0 and the target is class 0
correct_true += torch.sum(
    predicted_classes & (target_classes == 0)).float()
When all batches are processed:
recall = correct_true / target_true
precision = correct_true / predicted_true
f1_score = 2 * precision * recall / (precision + recall)
Don't forget to take care of the cases when precision and recall are zero and when the desired class was not predicted at all.
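A minimal sketch of how those edge cases could be guarded, continuing from the counters accumulated above:

# Guard against empty denominators: if the class never occurs in the targets or
# is never predicted, report the corresponding metric (and F1) as 0.0.
recall = (correct_true / target_true).item() if target_true > 0 else 0.0
precision = (correct_true / predicted_true).item() if predicted_true > 0 else 0.0
f1_score = (2 * precision * recall / (precision + recall)) if (precision + recall) > 0 else 0.0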
