I have three datasets (training and test and validation). I combine training data set and test data set to do k fold cross validation. I did not use validation dataset. I am new to tensor board from previous question I able to do plot loss accuracy during training over each epoch. How I can do plot for loss and accuracy for also testing over each epoch. because I want to see the performance over each epoch. should I use validation set for set and how if yes?
# Prepare dataset by concatenating Train/Test part; we split later.
training_set = CustomDataset('one_hot_train_data.txt','train_3states_target.txt') #training_set = CustomDataset_3('one_hot_train_data.txt','train_5_target.txt')
training_generator = torch.utils.data.DataLoader(training_set, **params)
val_set = CustomDataset('one_hot_val_data.txt','val_3states_target.txt')
test_set = CustomDataset('one_hot_test_data.txt','test_3states_target.txt')
testloader_ = torch.utils.data.DataLoader(test_set, **params)
dataset = ConcatDataset([training_set, test_set])
kfold = KFold(n_splits=k_folds, shuffle=True)
# Start print
print('--------------------------------')
# K-fold Cross Validation model evaluation
for fold, (train_ids, test_ids) in enumerate(kfold.split(dataset)):
# Print
print(f'FOLD {fold}')
print('--------------------------------')
# Sample elements randomly from a given list of ids, no replacement.
train_subsampler = torch.utils.data.SubsetRandomSampler(train_ids)
test_subsampler = torch.utils.data.SubsetRandomSampler(test_ids)
# Define data loaders for training and testing data in this fold
trainloader = torch.utils.data.DataLoader(
dataset,**params, sampler=train_subsampler)
testloader = torch.utils.data.DataLoader(
dataset,
**params, sampler=test_subsampler)
# Init the neural network
model = PPS()
model.to(device)
# Initialize optimizer
optimizer = optim.SGD(model.parameters(), lr=LEARNING_RATE)
# Run the training loop for defined number of epochs
for epoch in range(0, N_EPOCHES):
# Print epoch
print(f'Starting epoch {epoch + 1}')
# Set current loss value
running_loss = 0.0
epoch_loss = 0.0
a = []
# Iterate over the DataLoader for training data
for i, data in enumerate(trainloader, 0):
inputs, targets = data
inputs = inputs.unsqueeze(-1)
#inputs = inputs.to(device)
targets = targets.to(device)
inputs = inputs.to(device)
# print(inputs.shape,targets.shape)
# Zero the gradients
optimizer.zero_grad()
# Perform forward pass
loss,outputs = model(inputs,targets)
outputs = outputs.to(device)
# Perform backward pass
loss.backward()
# Perform optimization
optimizer.step()
# print statistics
running_loss += loss.item()
epoch_loss += loss
a.append(torch.sum(outputs == targets))
# print(outputs.shape,outputs.shape[0])
if i % 2000 == 1999: # print every 2000 mini-batches
print('[%d, %5d] loss: %.3f' %
(epoch + 1, i + 1, running_loss / 2000), "acc",
torch.sum(outputs == targets) / float(outputs.shape[0]))
running_loss = 0.0
# sum_acc += (outputs == stat_batch.argmax(1)).float().sum()
print("epoch", epoch + 1, "acc", sum(a) / len(train_subsampler), "loss", epoch_loss / len(trainloader))
accuracy = 100 * sum(a) / len(training_set)
avg_loss = sum(a) / len(training_set)
writer.add_scalar('train/loss',
avg_loss.item(),
epoch)
writer.add_scalar('accuracy/loss',
accuracy,
epoch)
state = {'epoch': epoch + 1, 'state_dict': model.state_dict(),
'optimizer': optimizer.state_dict() }
torch.save(state, path + name_file + "model_epoch_i_" + str(epoch) + str(fold)+".cnn")
#torch.save(model.state_dict(), path + name_file + "model_epoch_i_" + str(epoch) + ".cnn")
# Print about testing
print('Starting testing')
# Evaluation for this fold
correct, total = 0, 0
with torch.no_grad():
# Iterate over the test data and generate predictions
for i, data in enumerate(testloader, 0):
# Get inputs
inputs, targets = data
#targets = targets.to(device)
inputs = inputs.unsqueeze(-1)
inputs = inputs.to(device)
# Generate outputs
loss,outputs = model(inputs,targets)
outputs.to(device)
print("out",outputs.shape)
print("target",targets.shape)
print("targetsize",targets.size(0))
print("sum",(outputs == targets).sum().item())
#print("sum",torch.sum(outputs == targets))
# Set total and correct
# _, predicted = torch.max(outputs.data, 1)
total += targets.size(0)
correct += (outputs == targets).sum().item()
#correct += torch.sum(outputs == targets)
# Print accuracy
print('Accuracy for fold %d: %d %%' % (fold,float( 100.0 * float(correct / total))))
print('--------------------------------')
results[fold] = 100.0 * float(correct / total)
# Print fold results
print(f'K-FOLD CROSS VALIDATION RESULTS FOR {k_folds} FOLDS')
print('--------------------------------')
sum = 0.0
for key, value in results.items():
print(f'Fold {key}: {value} %')
sum += value
print(f'Average: {float(sum / len(results.items()))} %')
Related
the original batch_size = 16, but I wanted to give accumulation = 2 so that I have a similar effect as when I used batch_size = 32.
The original training time lasted an hour, so I expected 2 hour training time with the gradient accumulation.
But the training ends at 50%, lasting an hour even with the gradient accumulation.
I don't know why it's stopping.. below is my code for training
def train_runner(model, train_dataset, valid_dataset , batch_size, num_train_epochs, learning_rate):
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)
model.train()
train_dataloader = DataLoader(dataset=train_dataset, batch_size=batch_size)
valid_dataloader = DataLoader(dataset = valid_dataset, batch_size = batch_size)
lowest_total_valid_loss = 9999.
step = 0
global_total_step = len(train_dataloader) * num_train_epochs
optimizer = AdamW(model.parameters(), lr=learning_rate, weight_decay=0)
print("TRAIN START")
with tqdm(total=global_total_step, unit='step') as t:
total = 0
total_loss = 0
for epoch in range(num_train_epochs):
for iteration,batch in enumerate(train_dataloader):
#optimizer.zero_grad()
input_ids = batch['input_ids'].to(device)
attention_mask = batch['attention_mask'].to(device)
start_positions = batch['start_positions'].to(device)
end_positions = batch['end_positions'].to(device)
outputs = model(input_ids,
attention_mask=attention_mask,
start_positions=start_positions,
end_positions=end_positions)
loss = outputs.loss
(loss / ACCUMULATION).backward()
step += 1
if step % ACCUMULATION:
continue
clip_grad_norm_(model.parameters(), max_norm=1.)
optimizer.step()
optimizer.zero_grad(set_to_none=True)
batch_loss = loss.item() * len(input_ids)
total += len(input_ids)
total_loss += batch_loss / ACCUMULATION
global_total_step += 1
t.set_postfix(loss="{:.6f}".format(total_loss / total), batch_loss="{:.6f}".format(batch_loss))
t.update(1)
del input_ids
del attention_mask
del start_positions
del end_positions
del outputs
del loss
## validation ##
if iteration != 0 and iteration % int(len(train_dataloader) / 10) == 0:
total_valid_loss = 0
for batch_val in valid_dataloader:
model.eval()
optimizer.zero_grad()
input_ids = batch_val['input_ids'].to(device)
attention_mask = batch_val['attention_mask'].to(device)
start_positions = batch_val['start_positions'].to(device)
end_positions = batch_val['end_positions'].to(device)
with torch.no_grad():
outputs = model(input_ids,
attention_mask=attention_mask,
start_positions=start_positions,
end_positions=end_positions)
loss = outputs.loss
total_valid_loss += loss.item()
if total_valid_loss < lowest_total_valid_loss:
print(f"lowest_total_valid_loss: {total_valid_loss} epoch : {epoch} iteration : {iteration}")
torch.save(model.state_dict(),'./output_model_best')
lowest_total_valid_loss = total_valid_loss
## validation ##
#model.save_pretrained("./klue_output_model")
print("TRAIN END")
for iteration,batch in enumerate(train_dataloader):
if step % ACCUMULATION:
t.update(1) # add one update here as well.
continue
...
t.update(1)
Half of the time you do not update the tqdm counter or set its value too high during initialization. So it can't go higher than 50%.
I am training a GAN and using a function to train the model. For the output, I would like to create lists, losses_G and losses_D, to track performance of the GAN which I can plot (using matplotlib).
def train(self, dataLoader, lr, num_epochs, criterion):
losses_G = []
losses_D = []
for epoch in range(num_epochs):
running_D_loss = 0
running_G_loss = 0
for index, real_X in enumerate(dataLoader):
# TODO device
labels = torch.ones(real_X.shape[0], 1)
self.discriminator.zero_grad()
real_X = real_X.type(torch.float32)
# real forward
output = self.discriminator(real_X)
error_D_real = criterion(output, labels)
error_D_real.backward()
D_x = output.mean().item()
# fake forward
labels_fake = torch.zeros(real_X.shape[0], 1)
noise = torch.randn(real_X.shape[0], 100)
fake_data = self.generator(noise)
output = self.discriminator(fake_data.detach().squeeze())
error_D_fake = criterion(output, labels_fake)
error_D_fake.backward()
D_G_z1 = output.mean().item()
optimizer_D.step()
# generator
labels.fill_(1)
self.generator.zero_grad()
output = self.discriminator(fake_data)
error_G = criterion(output, labels)
D_G_z2 = output.mean().item()
error_G.backward()
optimizer_G.step()
error_D = error_D_real + error_D_fake
running_D_loss += error_D.item()
running_G_loss += error_G.item()
if index % 1 == 0:
print(f'epoch {epoch + 1}, index {index + 1}: g_loss={running_G_loss:.4f}, d_loss={running_D_loss:.4f}')
print(f'D(x): {D_x:.3f}, D(G(z)): {D_G_z1:.3f} {D_G_z2:.3f}')
print()
#possible point of differences to get code to print losses
losses_G.append(running_G_loss)
losses_D.append(running_D_loss)
return losses_G, losses_D
x= train (self, dataLoader, lr, num_epochs, criterion)
print (x)
As a sanity check, I tried to print out the return value of the function but instead of getting the full list for each index that is run, I am only getting the final value.
Any help on this?
I am new to pytorch, and i would like to know how to display graphs of loss and accuraccy And how exactly should i store these values,knowing that i'm applying a cnn model for image classification using CIFAR10.
here is my current implementation :
def train(num_epochs,optimizer,criterion,model):
for epoch in range(num_epochs):
for i, (images, labels) in enumerate(trainloader):
# origin shape: [4, 3, 32, 32] = 4, 3, 1024
# input_layer: 3 input channels, 6 output channels, 5 kernel size
images = images.to(device)
labels = labels.to(device)
# Forward pass
outputs = model(images)
loss = criterion(outputs, labels)
# Backward and optimize
optimizer.zero_grad()
loss.backward()
optimizer.step()
if (i+1) % 2000 == 0:
print (f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{n_total_steps}], Loss: {loss.item():.4f}')
PATH = './cnn.pth'
torch.save(model.state_dict(), PATH)
def test ():
with torch.no_grad():
n_correct = 0
n_samples = 0
n_class_correct = [0 for i in range(10)]
n_class_samples = [0 for i in range(10)]
for images, labels in testloader:
images = images.to(device)
labels = labels.to(device)
outputs = model(images)
# max returns (value ,index)
_, predicted = torch.max(outputs, 1)
n_samples += labels.size(0)
n_correct += (predicted == labels).sum().item()
for i in range(batch_size):
label = labels[i]
pred = predicted[i]
if (label == pred):
n_class_correct[label] += 1
n_class_samples[label] += 1
acc = 100.0 * n_correct / n_samples
print(f'Accuracy of the network: {acc} %')
for i in range(10):
acc = 100.0 * n_class_correct[i] / n_class_samples[i]
print(f'Accuracy of {classes[i]}: {acc} %')
test_score = np.mean([100 * n_class_correct[i] / n_class_samples[i] for i in range(10)])
print("the score test is : {0:.3f}%".format(test_score))
return acc
What you need to do is: Average the loss over all the batches and then append it to a variable after every epoch and then plot it. Implementation would be something like this:
import matplotlib.pyplot as plt
def my_plot(epochs, loss):
plt.plot(epochs, loss)
def train(num_epochs,optimizer,criterion,model):
loss_vals= []
for epoch in range(num_epochs):
epoch_loss= []
for i, (images, labels) in enumerate(trainloader):
# rest of the code
loss.backward()
epoch_loss.append(loss.item())
# rest of the code
# rest of the code
loss_vals.append(sum(epoch_loss)/len(epoch_loss))
# rest of the code
# plotting
my_plot(np.linspace(1, num_epochs, num_epochs).astype(int), loss_vals)
my_plot([1, 2, 3, 4, 5], [100, 90, 60, 30, 10])
You can do a similar calculation for accuracy.
The code below calculates the MSE and MAE values but I have an issue where the values for MAE and MSE don't get store_MAE and store MSE after the end of each epoch. It appears to use the values of the last epoch only. Any idea what I need to do in the code to save the values for each epoch I hope this makes sense. Thanks for your help
global_step = 0
best_test_error = 10000
MAE_for_all_epochs = []
MSE_for_all_epochs = []
for epoch in range(4):
print("Epoch %d" % epoch)
model.train()
for images, paths in tqdm(loader_train):
images = images.to(device)
targets = torch.tensor([metadata['count'][os.path.split(path)[-1]] for path in paths]) # B
targets = targets.float().to(device)
# forward pass:
output = model(images) # B x 1 x 9 x 9 (analogous to a heatmap)
preds = output.sum(dim=[1,2,3]) # predicted cell counts (vector of length B)
# backward pass:
loss = torch.mean((preds - targets)**2)
optimizer.zero_grad()
loss.backward()
optimizer.step()
# logging:
count_error = torch.abs(preds - targets).mean()
writer.add_scalar('train_loss', loss.item(), global_step=global_step)
writer.add_scalar('train_count_error', count_error.item(), global_step=global_step)
print("Step %d, loss=%f, count error=%f" % (global_step,loss.item(),count_error.item()))
global_step += 1
mean_test_error = 0
model.eval()
for images, paths in tqdm(loader_test):
images = images.to(device)
targets = torch.tensor([metadata['count'][os.path.split(path)[-1]] for path in paths]) # B
targets = targets.float().to(device)
# forward pass:
output = model(images) # B x 1 x 9 x 9 (analogous to a heatmap)
preds = output.sum(dim=[1,2,3]) # predicted cell counts (vector of length B)
# logging:
#error = torch.abs(preds - targets).sum().data
#squared_error = ((preds - targets)*(preds - targets)).sum().data
#runnning_mae += error
#runnning_mse += squared_error
loss = torch.mean((preds - targets)**2)
count_error = torch.abs(preds - targets).mean()
mean_test_error += count_error
writer.add_scalar('test_loss', loss.item(), global_step=global_step)
writer.add_scalar('test_count_error', count_error.item(), global_step=global_step)
global_step += 1
#store_MAE = 0
#store_MSE = 0
mean_test_error = mean_test_error / len(loader_test)
#store_MAE += mean_test_error
MAE_for_all_epochs = np.append(MAE_for_all_epochs, mean_test_error)
mse = math.sqrt(loss / len(loader_test))
#store_MSE +=mse
MSE_for_all_epochs = np.append(MSE_for_all_epochs, mse)
print("Test count error: %f" % mean_test_error)
print("MSE: %f" % mse)
if mean_test_error < best_test_error:
best_test_error = mean_test_error
torch.save({'state_dict':model.state_dict(),
'optimizer_state_dict':optimizer.state_dict(),
'globalStep':global_step,
'train_paths':dataset_train.files,
'test_paths':dataset_test.files},checkpoint_path)
print("MAE Total: %f" % store_MAE)
print("MSE Total: %f" % store_MSE)
model_mae= MAE_for_all_epochs / epoch
model_mse= MSE_for_all_epochs / epoch
print("Model MAE: %f" % model_mae)
print("Model MSE: %f" % model_mse)
np.append() will work for your case like this,
#outside epochs loop
MAE_for_all_epochs = []
#inside loop
#replace this store_MAE with relevant variable
MAE_for_all_epochs = np.append(MAE_for_all_epochs, store_MAE)
Edit: Toy example: as per the usage
import numpy as np
all_var = []
for e in range(1, 10):
var1 = np.random.random(1)
all_var = np.append(all_var, var1)
print(all_var)
# output : [0.07660848 0.46824825 0.09432051 0.79462902 0.97798061 0.67299183 0.50996432 0.13084029 0.95100381]
I am very new in pytorch and implementing my own network of image classifier. However I see for each epoch training accuracy is very good but validation accuracy is 0.i noted till 5th epoch. I am using Adam optimizer and have learning rate .001. also resampling the whole data set after each epoch into training n validation set. Please help where I am going wrong.
Here is my code:
### where is data?
data_dir_train = '/home/sup/PycharmProjects/deep_learning/CNN_Data/training_set'
data_dir_test = '/home/sup/PycharmProjects/deep_learning/CNN_Data/test_set'
# Define your batch_size
batch_size = 64
allData = datasets.ImageFolder(root=data_dir_train,transform=transformArr)
# We need to further split our training dataset into training and validation sets.
def split_train_validation():
# Define the indices
num_train = len(allData)
indices = list(range(num_train)) # start with all the indices in training set
split = int(np.floor(0.2 * num_train)) # define the split size
#train_idx, valid_idx = indices[split:], indices[:split]
# Random, non-contiguous split
validation_idx = np.random.choice(indices, size=split, replace=False)
train_idx = list(set(indices) - set(validation_idx))
# define our samplers -- we use a SubsetRandomSampler because it will return
# a random subset of the split defined by the given indices without replacement
train_sampler = SubsetRandomSampler(train_idx)
validation_sampler = SubsetRandomSampler(validation_idx)
#train_loader = DataLoader(allData,batch_size=batch_size,sampler=train_sampler,shuffle=False,num_workers=4)
#validation_loader = DataLoader(dataset=allData,batch_size=1, sampler=validation_sampler)
return (train_sampler,validation_sampler)
Training
from torch.optim import Adam
import torch
import createNN
import torch.nn as nn
import loadData as ld
from torch.autograd import Variable
from torch.utils.data import DataLoader
# check if cuda - GPU support available
cuda = torch.cuda.is_available()
#create model, optimizer and loss function
model = createNN.ConvNet(class_num=2)
optimizer = Adam(model.parameters(),lr=.001,weight_decay=.0001)
loss_func = nn.CrossEntropyLoss()
if cuda:
model.cuda()
# function to save model
def save_model(epoch):
torch.save(model.load_state_dict(),'imageClassifier_{}.model'.format(epoch))
print('saved model at epoch',epoch)
def exp_lr_scheduler ( epoch , init_lr = args.lr, weight_decay = args.weight_decay, lr_decay_epoch = cf.lr_decay_epoch):
lr = init_lr * ( 0.5 ** (epoch // lr_decay_epoch))
def train(num_epochs):
best_acc = 0.0
for epoch in range(num_epochs):
print('\n\nEpoch {}'.format(epoch))
train_sampler, validation_sampler = ld.split_train_validation()
train_loader = DataLoader(ld.allData, batch_size=30, sampler=train_sampler, shuffle=False)
validation_loader = DataLoader(dataset=ld.allData, batch_size=1, sampler=validation_sampler)
model.train()
acc = 0.0
loss = 0.0
total = 0
# train model with training data
for i,(images,labels) in enumerate(train_loader):
# if cuda then move to GPU
if cuda:
images = images.cuda()
labels = labels.cuda()
# Variable class wraps a tensor and we can calculate grad
images = Variable(images)
labels = Variable(labels)
# reset accumulated gradients for each batch
optimizer.zero_grad()
# pass images to model which returns preiction
output = model(images)
#calculate the loss based on prediction and actual
loss = loss_func(output,labels)
# backpropagate the loss and compute gradient
loss.backward()
# update weights as per the computed gradients
optimizer.step()
# prediction class
predVal , predClass = torch.max(output.data, 1)
acc += torch.sum(predClass == labels.data)
loss += loss.cpu().data[0]
total += labels.size(0)
# print the statistics
train_acc = acc/total
train_loss = loss / total
print('Mean train acc = {} over epoch = {}'.format(epoch,acc))
print('Mean train loss = {} over epoch = {}'.format(epoch, loss))
# Valid model with validataion data
model.eval()
acc = 0.0
loss = 0.0
total = 0
for i,(images,labels) in enumerate(validation_loader):
# if cuda then move to GPU
if cuda:
images = images.cuda()
labels = labels.cuda()
# Variable class wraps a tensor and we can calculate grad
images = Variable(images)
labels = Variable(labels)
# reset accumulated gradients for each batch
optimizer.zero_grad()
# pass images to model which returns preiction
output = model(images)
#calculate the loss based on prediction and actual
loss = loss_func(output,labels)
# backpropagate the loss and compute gradient
loss.backward()
# update weights as per the computed gradients
optimizer.step()
# prediction class
predVal, predClass = torch.max(output.data, 1)
acc += torch.sum(predClass == labels.data)
loss += loss.cpu().data[0]
total += labels.size(0)
# print the statistics
valid_acc = acc / total
valid_loss = loss / total
print('Mean train acc = {} over epoch = {}'.format(epoch, valid_acc))
print('Mean train loss = {} over epoch = {}'.format(epoch, valid_loss))
if(best_acc<valid_acc):
best_acc = valid_acc
save_model(epoch)
# at 30th epoch we save the model
if (epoch == 30):
save_model(epoch)
train(20)
I think you did not take into account that acc += torch.sum(predClass == labels.data) returns a tensor instead of a float value. Depending on the version of pytorch you are using I think you should change it to:
acc += torch.sum(predClass == labels.data).cpu().data[0] #pytorch 0.3
acc += torch.sum(predClass == labels.data).item() #pytorch 0.4
Although your code seems to be working for old pytorch version, I would recommend you to upgrade to the 0.4 version.
Also, I mentioned other problems/typos in your code.
You are loading the dataset for every epoch.
for epoch in range(num_epochs):
print('\n\nEpoch {}'.format(epoch))
train_sampler, validation_sampler = ld.split_train_validation()
train_loader = DataLoader(ld.allData, batch_size=30, sampler=train_sampler, shuffle=False)
validation_loader = DataLoader(dataset=ld.allData, batch_size=1, sampler=validation_sampler)
...
That should not happen, it should be enough loading it once
train_sampler, validation_sampler = ld.split_train_validation()
train_loader = DataLoader(ld.allData, batch_size=30, sampler=train_sampler, shuffle=False)
validation_loader = DataLoader(dataset=ld.allData, batch_size=1, sampler=validation_sampler)
for epoch in range(num_epochs):
print('\n\nEpoch {}'.format(epoch))
...
In the training part you have (this does not happen in the validation):
train_acc = acc/total
train_loss = loss / total
print('Mean train acc = {} over epoch = {}'.format(epoch,acc))
print('Mean train loss = {} over epoch = {}'.format(epoch, loss))
Where you are printing acc instead of train_acc
Also, in the validation part I mentioned that you are printing print('Mean train acc = {} over epoch = {}'.format(epoch, valid_acc)) when it should be something like 'Mean val acc'.
Changing this lines of code, using a standard model I created and CIFAR dataset the training seems to converge, accuracy increases at every epoch while mean loss value decreases.
I Hope I could help you!