mxnet training loss never changes but accuracy oscillates

mxnet training loss never changes but accuracy oscillates - python

I am using mxnet to train a VQA model, the input is (6244,) vector and the output is a single label
During my epoch, the loss never change but the accuracy is oscillating in a small range, the first 5 epochs are
Epoch 1. Loss: 2.7262569132562255, Train_acc 0.06867348986554285
Epoch 2. Loss: 2.7262569132562255, Train_acc 0.06955649207304837
Epoch 3. Loss: 2.7262569132562255, Train_acc 0.06853301224162152
Epoch 4. Loss: 2.7262569132562255, Train_acc 0.06799116997792494
Epoch 5. Loss: 2.7262569132562255, Train_acc 0.06887417218543046
This is a multi-class classification problem, with each answer label stands for a class, so I use softmax as final layer and cross-entropy to evaluate the loss, the code of them are as follows
So why the loss never change?... I just directly get if from cross_entropy
trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.01})
loss = gluon.loss.SoftmaxCrossEntropyLoss()
epochs = 10
moving_loss = 0.
best_eva = 0
for e in range(epochs):
for i, batch in enumerate(data_train):
data1 = batch.data[0].as_in_context(ctx)
data2 = batch.data[1].as_in_context(ctx)
data = [data1, data2]
label = batch.label[0].as_in_context(ctx)
with autograd.record():
output = net(data)
cross_entropy = loss(output, label)
cross_entropy.backward()
trainer.step(data[0].shape[0])
moving_loss = np.mean(cross_entropy.asnumpy()[0])
train_accuracy = evaluate_accuracy(data_train, net)
print("Epoch %s. Loss: %s, Train_acc %s" % (e, moving_loss, train_accuracy))
The eval function is as follows
def evaluate_accuracy(data_iterator, net, ctx=mx.cpu()):
numerator = 0.
denominator = 0.
metric = mx.metric.Accuracy()
data_iterator.reset()
for i, batch in enumerate(data_iterator):
with autograd.record():
data1 = batch.data[0].as_in_context(ctx)
data2 = batch.data[1].as_in_context(ctx)
data = [data1, data2]
label = batch.label[0].as_in_context(ctx)
output = net(data)
metric.update([label], [output])
return metric.get()[1]

Question asked and answered on the mxnet discussion forum here. No need use the autograd.record scope to record computational graph when computing the accuracy. Try instead:
def evaluate_accuracy(data_iterator, net, ctx=mx.cpu()):
metric = mx.metric.Accuracy()
data_iterator.reset()
for i, batch in enumerate(data_iterator):
data1 = batch.data[0].as_in_context(ctx)
data2 = batch.data[1].as_in_context(ctx)
data = [data1, data2]
label = batch.label[0].as_in_context(ctx)
output = net(data)
metric.update([label], [output])
return metric.get()[1]

Related

how do I get to plot epoch vs accuracy and epoch vs avg loss

I'm trying to build a chatbot and need to display a plot for epochs vs accuracy and epoch vs avg loss and epoch vs final loss.
I'm only able to get epoch vs final loss. I'm not that great at code.
Any help is appreciated
epoc = []
los = []
for epoch in range(num_epochs):
for (words, labels) in train_loader:
words = words.to(device)
labels = labels.to(dtype=torch.long).to(device)
# Forward pass
outputs = model(words)
# if y would be one-hot, we must apply
# labels = torch.max(labels, 1)[1]
loss = criterion(outputs, labels)
# Backward and optimize
optimizer.zero_grad()
loss.backward()
optimizer.step()
epoc.append(epoch)
los.append(loss.item())
if (epoch + 1) % 100 == 0:
print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item():.4f}')
print(f'final loss: {loss.item():.4f}')
from matplotlib import pyplot as plt
plt.plot(epoc, los)
plt.show()
data = {
"model_state": model.state_dict(),
"input_size": input_size,
"hidden_size": hidden_size,
"output_size": output_size,
"all_words": all_words,
"tags": tags
}
FILE = "data.pth"
torch.save(data, FILE)
print(f'training complete. file saved to {FILE}')
I got the epoch vs final loss graph. Trying to get
Epoch vs accuracy
Epoch vs avg loss

how to set a threshold in pytorch

I am working on a very imbalanced data, 15% labeled as 1 and the rest as 0, using BERT.
the code i wrote uses maxing outputs which gives me predictions of 0 for everything.
How do I include thresholds in my code to maximise my predictions of 1.
nsteps=215
nepoch=3
best_val_acc = 0
for epoch in range(nepoch):
model.train()
print(f"epoch n°{epoch+1}:")
av_epoch_loss=0
progress_bar = tqdm(range(nsteps))
for batch in trainloader:
batch = {k:v.cuda() for k,v in batch.items()}
outputs = model(**batch)
loss = criterion(outputs, *batch)
av_epoch_loss += loss
loss.backward()
optim.step()
optim.zero_grad()
predictions=torch.argmax(outputs.logits, dim=-1)
f1.add_batch(predictions=predictions, references=batch["labels"])
acc.add_batch(predictions=predictions, references=batch["labels"])
progress_bar.update(1)
av_epoch_loss /= nsteps
print(f"Training Loss: {av_epoch_loss: .2f}")
acc_res = acc.compute()["accuracy"]
print(f"Training Accuracy: {acc_res:.2f}")
f_res = f1.compute()["f1"]
print(f"Training F1-score: {f_res:.2f}")
model.eval()
val_acc = validate(model)
if val_acc > best_val_acc:
print("Achieved best validation accuracy so far. Saving model.")
best_val_acc = val_acc
best_model_state = deepcopy(model.state_dict())
print("\n\n")
I looked in pytorch documentation but i couldn't figure it out.

Low accuracy binary classification with Pytorch

In practicing deep learning for binary classification with Pytorch on Breast-Cancer-Wisconsin-Diagnostic-DataSet.
I've tried different approaches, and the best I can get as below, the accuracy is still low at 61%.
What's the way to improve the accuracy?
Thank you.
import pandas as pd
import io
dataset = pd.read_excel(base_dir + "Breast-Cancer-Wisconsin-Diagnostic.xlsx")
number_of_columns = dataset.shape[1]
# training and testing split of 70:30
dataset['diagnosis'] = pd.Categorical(dataset['diagnosis']).codes
dataset = dataset.sample(frac=1, random_state=1234)
train_input = dataset.values[:398, :number_of_columns-1]
train_target = dataset.values[:398, number_of_columns-1]
test_input = dataset.values[398:, :number_of_columns-1]
test_target = dataset.values[398:, number_of_columns-1]
import torch
torch.manual_seed(1234)
hidden_units = 5
net = torch.nn.Sequential(
torch.nn.Linear(number_of_columns-1, hidden_units),
torch.nn.ReLU(),
torch.nn.Linear(hidden_units, 2))
# choose optimizer and loss function
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(net.parameters(), lr=0.1,momentum=0.9)
# train
epochs = 50
for epoch in range(epochs):
inputs = torch.autograd.Variable(torch.Tensor(train_input).float())
targets = torch.autograd.Variable(torch.Tensor(train_target).long())
optimizer.zero_grad()
out = net(inputs)
loss = criterion(out, targets)
loss.backward()
optimizer.step()
if epoch == 0 or (epoch + 1) % 10 == 0:
print('Epoch %d Loss: %.4f' % (epoch + 1, loss.item()))
# Epoch 1 Loss: 412063.1250
# Epoch 10 Loss: 0.6628
# Epoch 20 Loss: 0.6639
# Epoch 30 Loss: 0.6592
# Epoch 40 Loss: 0.6587
# Epoch 50 Loss: 0.6588
import numpy as np
inputs = torch.autograd.Variable(torch.Tensor(test_input).float())
targets = torch.autograd.Variable(torch.Tensor(test_target).long())
optimizer.zero_grad()
out = net(inputs)
_, predicted = torch.max(out.data, 1)
error_count = test_target.size - np.count_nonzero((targets == predicted).numpy())
print('Errors: %d; Accuracy: %d%%' % (error_count, 100 * torch.sum(targets == predicted) // test_target.size))
# Errors: 65; Accuracy: 61%

Features Representing samples are in different range. So, First thing you should do is to normalize the data.
You should plot the loss and acc over the training epochs for training and validation/test dataset to understand whether the model overfits on training data or underfit.
Furthermore, you can try with more complex (deeper) model. And since your training dataset has few number of samples, you can consider augmentation and transfer learning as well if possible.

PyTorch: accuracy of validation set greater than 100% during training

1 ) Problem
I observe an odd behaviour during training where my validation-accuracy is above 100% right from the start.
Epoch 0/3
----------
100%|██████████| 194/194 [00:50<00:00, 3.82it/s]
train Loss: 1.8653 Acc: 0.4796
100%|██████████| 194/194 [00:32<00:00, 5.99it/s]
val Loss: 1.7611 Acc: 1.2939
Epoch 1/3
----------
100%|██████████| 194/194 [00:42<00:00, 4.61it/s]
train Loss: 0.8704 Acc: 0.7467
100%|██████████| 194/194 [00:31<00:00, 6.11it/s]
val Loss: 1.0801 Acc: 1.4694
The output indicates that one epoch iterates over 194 batches, which does seem to be correct for the training data (which has a length of 6186, batch_size is 32, hence 32*194 = 6208 and this is ≈6186) but does not match the size of the validation-data (length of 3447, batch_size = 32).
Hence I would expect my validation-loop to generate 108 (3447 / 32 ≈ 108) batches insted of 194.
I thought this behaviour is handled within my for loop at:
for dataset in tqdm(dataloaders[phase]):
But somehow I can't figure out what is wrong here. See point 3) below for my entire code.
2 ) Question
If my assumption above is correct i.e. that this error stems from the for-loop within in my code then I would like to know the following:
How do I need to adjust the for-loop during the validation phase to handle the number of batches that are being used for validation correctly?
3 ) Background:
Following two tutorials, one on how to do transfer-learning (https://discuss.pytorch.org/t/transfer-learning-using-vgg16/20653) and one on how to do data-loading (https://pytorch.org/tutorials/beginner/data_loading_tutorial.html) in pytorch, I am trying to customize the code such that I can perform transfer-learning on a new custom dataset which I want to provide via pandas dataframes.
As such, my training- and validation-data is provided via two dataframes (df_train & df_val) which both contain two columns, one for the path and one for the target. E.g. like this:
url target
0 C:/Users/aaron/Desktop/pics/4ebd... 9
1 C:/Users/aaron/Desktop/pics/7153... 3
2 C:/Users/aaron/Desktop/pics/3ee6... 3
3 C:/Users/aaron/Desktop/pics/4652... 16
4 C:/Users/aaron/Desktop/pics/28ce... 15
...
And their respective length:
print(len(df_train))
print(len(df_val))
>> 6186
>> 3447
My pipeline looks like this:
class CustomDataset(Dataset):
def __init__(self, df, transform=None):
self.dataframe = df_train
self.transform = transform
def __len__(self):
return len(self.dataframe)
def __getitem__(self, idx):
img_name = self.dataframe.iloc[idx, 0]
img = Image.open(img_name)
img_normalized = self.transform(img)
landmarks = self.dataframe.iloc[idx, 1]
sample = {'data': img_normalized, 'label': int(landmarks)}
return sample
train_dataset = CustomDataset(df_train,transform=transforms.Compose([
transforms.Resize(224),
transforms.ToTensor(),transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])]))
val_dataset = CustomDataset(df_val,transform=transforms.Compose([
transforms.Resize(224),
transforms.ToTensor(),transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])]))
train_loader = torch.utils.data.DataLoader(train_dataset,batch_size=32,shuffle=True, num_workers=0)
val_loader = torch.utils.data.DataLoader(val_dataset,batch_size=32,shuffle=True, num_workers=0)
dataloaders = {'train': train_loader, 'val': val_loader}
dataset_sizes = {'train': len(df_train) ,'val': len(df_val)}
################### Training
from tqdm import tqdm
def train_model(model, criterion, optimizer, scheduler, num_epochs=25):
since = time.time()
best_model_wts = copy.deepcopy(model.state_dict())
best_acc = 0.0
for epoch in range(num_epochs):
print('Epoch {}/{}'.format(epoch, num_epochs - 1))
print('-' * 10)
# Each epoch has a training and validation phase
for phase in ['train', 'val']:
if phase == 'train':
scheduler.step()
model.train() # Set model to training mode
else:
model.eval() # Set model to evaluate mode
running_loss = 0.0
running_corrects = 0
# Iterate over data.
for dataset in tqdm(dataloaders[phase]):
inputs, labels = dataset["data"], dataset["label"]
#print(inputs.type())
inputs = inputs.to(device, dtype=torch.float)
labels = labels.to(device,dtype=torch.long)
# zero the parameter gradients
optimizer.zero_grad()
# forward
# track history if only in train
with torch.set_grad_enabled(phase == 'train'):
outputs = model(inputs)
_, preds = torch.max(outputs, 1)
loss = criterion(outputs, labels)
# backward + optimize only if in training phase
if phase == 'train':
loss.backward()
optimizer.step()
# statistics
running_loss += loss.item() * inputs.size(0)
running_corrects += torch.sum(preds == labels.data)
epoch_loss = running_loss / dataset_sizes[phase]
epoch_acc = running_corrects.double() / dataset_sizes[phase]
print('{} Loss: {:.4f} Acc: {:.4f}'.format(
phase, epoch_loss, epoch_acc))
# deep copy the model
if phase == 'val' and epoch_acc > best_acc:
best_acc = epoch_acc
best_model_wts = copy.deepcopy(model.state_dict())
print()
time_elapsed = time.time() - since
print('Training complete in {:.0f}m {:.0f}s'.format(
time_elapsed // 60, time_elapsed % 60))
print('Best val Acc: {:4f}'.format(best_acc))
# load best model weights
model.load_state_dict(best_model_wts)
return model
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model_ft = models.resnet18(pretrained=True)
num_ftrs = model_ft.fc.in_features
model_ft.fc = nn.Linear(num_ftrs, len(le.classes_))
model_ft = model_ft.to(device)
criterion = nn.CrossEntropyLoss()
# Observe that all parameters are being optimized
optimizer_ft = optim.SGD(model_ft.parameters(), lr=0.001, momentum=0.9)
# Decay LR by a factor of 0.1 every 7 epochs
exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=7, gamma=0.1)
model_ft = train_model(model_ft, criterion, optimizer_ft, exp_lr_scheduler,
num_epochs=4)

Your problem appears to be here:
class CustomDataset(Dataset):
def __init__(self, df, transform=None):
>>>>> self.dataframe = df_train
This should be
self.dataframe = df
In your case, you are inadvertently setting both the train and val CustomDataset to df_train ...

Tensorflow : my model doesn't register after the first iteration

I took a tutorial available here and I tried to run it on my dataset, it was able to compile and it begins the training but here what I got :
The model don't seem to be saved at each iteration.
And I tried with 100 epochs it didn't change anything, it gives the output of the first iteration.
Do you have an idea what would be the problem ? (I know the code is long sorry)
def train(model, epochs, log_string):
'''Train the RNN'''
saver = tf.train.Saver()
with tf.Session() as sess:
sess.run(tf.global_variables_initializer())
# Used to determine when to stop the training early
valid_loss_summary = []
# Keep track of which batch iteration is being trained
iteration = 0
print()
print("Training Model: {}".format(log_string))
train_writer = tf.summary.FileWriter('./logs/3/train/{}'.format(log_string), sess.graph)
valid_writer = tf.summary.FileWriter('./logs/3/valid/{}'.format(log_string))
for e in range(epochs):
state = sess.run(model.initial_state)
# Record progress with each epoch
train_loss = []
train_acc = []
val_acc = []
val_loss = []
with tqdm(total=len(x_train)) as pbar:
for _, (x, y) in enumerate(get_batches(x_train, y_train, batch_size), 1):
feed = {model.inputs: x,
model.labels: y[:, None],
model.keep_prob: dropout,
model.initial_state: state}
summary, loss, acc, state, _ = sess.run([model.merged,
model.cost,
model.accuracy,
model.final_state,
model.optimizer],
feed_dict=feed)
# Record the loss and accuracy of each training batch
train_loss.append(loss)
train_acc.append(acc)
# Record the progress of training
train_writer.add_summary(summary, iteration)
iteration += 1
pbar.update(batch_size)
avg_train_loss = np.mean(train_loss)
avg_train_acc = np.mean(train_acc)
val_state = sess.run(model.initial_state)
with tqdm(total=len(x_valid)) as pbar:
for x, y in get_batches(x_valid, y_valid, batch_size):
feed = {model.inputs: x,
model.labels: y[:, None],
model.keep_prob: 1,
model.initial_state: val_state}
summary, batch_loss, batch_acc, val_state = sess.run([model.merged,
model.cost,
model.accuracy,
model.final_state],
feed_dict=feed)
# Record the validation loss and accuracy of each epoch
val_loss.append(batch_loss)
val_acc.append(batch_acc)
pbar.update(batch_size)
# Average the validation loss and accuracy of each epoch
avg_valid_loss = np.mean(val_loss)
avg_valid_acc = np.mean(val_acc)
valid_loss_summary.append(avg_valid_loss)
# Record the validation data's progress
valid_writer.add_summary(summary, iteration)
# Print the progress of each epoch
print("Epoch: {}/{}".format(e, epochs),
"Train Loss: {:.3f}".format(avg_train_loss),
"Train Acc: {:.3f}".format(avg_train_acc),
"Valid Loss: {:.3f}".format(avg_valid_loss),
"Valid Acc: {:.3f}".format(avg_valid_acc))
# Stop training if the validation loss does not decrease after 3 epochs
if avg_valid_loss > min(valid_loss_summary):
print("No Improvement.")
stop_early += 1
if stop_early == 3:
break
# Reset stop_early if the validation loss finds a new low
# Save a checkpoint of the model
else:
print("New Record!")
stop_early = 0
checkpoint = "sauvegarde/controverse_{}.ckpt".format(log_string)
saver.save(sess,checkpoint)
Thank you very much for your answers :)

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

mxnet training loss never changes but accuracy oscillates - python

Related

how do I get to plot epoch vs accuracy and epoch vs avg loss

how to set a threshold in pytorch

Low accuracy binary classification with Pytorch

PyTorch: accuracy of validation set greater than 100% during training

Tensorflow : my model doesn't register after the first iteration

Categories

Resources