I'm trying to train an NN to develop a relationship between a certain range of inputs and some mathematical coefficients but I'm struggling to figure out why the loss is not decreasing over the course of training. I've made a simple custom MLP and dataset to interface with the PyTorch module. The dataset will take in a 6 x m matrix for the inputs(m for however many different combinations I have) and a r x m matrix for the coefficients. However, when running the training script the training loss doesn't decrease over the course of 100 epochs. All the shapes match when entering and exiting the NN so I'm lost as to what might be going wrong. These are the shapes of the input and coefficients matrices for the training and test phase respectively.
inputs_train = (6, 581)
coefficients_train = (259, 581)
inputs_test = (6, 145)
coefficients_test = (259, 145)
All the relevant code is shown below.
class MLP(nn.Module):
def __init__(self, r):
super(MLP, self).__init__()
self.flatten = nn.Flatten()
self.linear_relu_stack = nn.Sequential(
nn.Linear(6, 512),
nn.ReLU(),
nn.Linear(512, 512),
nn.ReLU(),
nn.Linear(512, 512),
nn.ReLU(),
nn.Linear(512, 512),
nn.ReLU(),
nn.Linear(512, r),
nn.ReLU()
)
def forward(self, x):
x = self.flatten(x)
logits = self.linear_relu_stack(x)
return logits
class ExpansionCoefficientsDataset(Dataset):
def __init__(self, inputs, alphas):
self.geom_params = inputs
self.expansion_coefficients = alphas
def __len__(self):
return self.expansion_coefficients.shape[1]
def __getitem__(self, idx):
if torch.is_tensor(idx):
idx = idx.tolist()
sample = {'inputs': self.geom_params[:,idx], 'alphas': self.expansion_coefficients[:,idx]}
return sample
train_dataset = ExpansionCoefficientsDataset(inputs_train, coefficients_train)
test_dataset = ExpansionCoefficientsDataset(inputs_test, coefficients_test)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=16)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=16)
#%%
model = MLP(r_ideal).double()
criterion = torch.nn.L1Loss()
num_epochs = 100
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
batch_size = len(train_loader)
# Training loop starts
for epoch in range(num_epochs):
print(f"-------------------------------\nEpoch {epoch+1}\n-------------------------------")
size = len(train_loader.dataset)
# Iterating over the training dataset
for i_batch, sample_batched in enumerate(train_loader):
pred = model(sample_batched['inputs'])
# Calculating loss
loss = criterion(pred, sample_batched['alphas'])
# Updating weights according
# to the calculated loss
optimizer.zero_grad()
loss.backward()
optimizer.step()
if i_batch % 8 == 0:
loss, current = loss.item(), i_batch * sample_batched['inputs'].size()[0]
print(f"loss: {loss:>7f} [{current:>5d}/{size:>5d}]")
Related
I'm trying to use pytorch in for the IMBD dataset, to predict the positive and negative reviews. When I get to the training state, the following error is given by the criterion function:
ValueError: Target size (torch.Size([64])) must be the same as input size (torch.Size([1136, 64, 1]))
After some research, I saw that the error is because the output of the model is returning a tensor of size [1136, 64, 1], and criterion is expecting only batch results.
Howerver, I don't know how to solve this error.
My code:
import torch
import spacy
import torch.nn as nn
from torchtext.legacy import data
import sys
import csv
import torch.optim as optim
import re
import nltk
from nltk.corpus import stopwords
from torchtext import vocab
from torchtext.legacy.data import Field
from torchtext.legacy import datasets
import pandas as pd
import re
import torch.nn.functional as F
import torch.nn as nn
import torch.optim as optim
import random
SEED = 1234
torch.manual_seed(SEED) # For reproducibility
torch.backends.cudnn.deterministic = True
import torch.nn.functional as F
import torch.nn as nn
class MLP(nn.Module):
def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
super().__init__()
self.embedding = nn.Embedding(input_dim, embedding_dim)
self.hidden_fc = nn.Linear(embedding_dim, hidden_dim)
self.fc = nn.Linear(hidden_dim, output_dim)
def forward(self, text):
#text = [sent len, batch size]
embedded = self.embedding(text)
h_1 = F.relu(self.hidden_fc(embedded))
# assert torch.equal(output[-1,:,:], h_1.squeeze(0))
# [batch size, output dim]
return self.fc(h_1.squeeze(0))
def binary_accuracy(preds, y):
"""
Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
"""
#round predictions to the closest integer
rounded_preds = torch.round(torch.sigmoid(preds)) # 0.75 --> 1 0.4 --> 0
correct = (rounded_preds == y).float() #convert into float for division
acc = correct.sum() / len(correct)
return acc
def train(model, iterator, optimizer, criterion):
epoch_loss = 0
epoch_acc = 0
model.train() #Train mode is on
for batch in iterator:
optimizer.zero_grad() #Reset the gradients
predictions = model(batch.text) ## forward propagation
print(predictions.shape)
loss = criterion(predictions, batch.label)
acc = binary_accuracy(predictions, batch.label)
loss.backward() ## backward propagation / calculate gradients
optimizer.step() ## update parameters
epoch_loss += loss.item()
epoch_acc += acc.item()
return epoch_loss / len(iterator), epoch_acc / len(iterator)
def evaluate(model, iterator, criterion):
epoch_loss = 0
epoch_acc = 0
model.eval() #Evaluation mode is on
with torch.no_grad():
for batch in iterator:
predictions = model(batch.text).squeeze(1)
loss = criterion(predictions, batch.label)
acc = binary_accuracy(predictions, batch.label)
epoch_loss += loss.item()
epoch_acc += acc.item()
return epoch_loss / len(iterator), epoch_acc / len(iterator)
TEXT = data.Field(tokenize = 'spacy',
tokenizer_language = 'en_core_web_sm',
lower = True)
LABEL = data.LabelField(dtype = torch.float)
train_data, test_data = datasets.IMDB.splits(TEXT, LABEL) ## IMDB reviews dataset
train_data, valid_data = train_data.split(random_state = random.seed(SEED))
MAX_VOCAB_SIZE = 25_000
TEXT.build_vocab(train_data, max_size = MAX_VOCAB_SIZE) #Build the vocabulary using the top frequent 25K words
LABEL.build_vocab(train_data)
BATCH_SIZE = 64
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
(train_data, valid_data, test_data),
batch_size = BATCH_SIZE)
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 1
model = MLP(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM)
import torch.optim as optim
optimizer = optim.SGD(model.parameters(), lr=1e-3)
criterion = nn.BCEWithLogitsLoss()
N_EPOCHS = 5
best_valid_loss = float('inf')
for epoch in range(N_EPOCHS):
train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
To summarize your problem, you have reviews you want to classify as positive or negative. To do so you train an embedding space to map each word to a vector, then output a probability for each sentence and supervised with the corresponding label using a binary cross-entropy loss nn.BCELossWithLogits.
You current model is comprised of:
nn.Embedding: embeds each word in the sequence independently thus converting the input tensor shape from (seq_len, batch_size) to (seq_len, batch_size, embedding_dim). Where seq_len is the number of tokens in your input sequence.
nn.Linear layer reduces the dimensionality by projecting the features, the tensor shape is converted from (seq_len, batch_size, embedding_dim) to (seq_len, batch_size, hidden_dim).
A non-linearity layer is applied to the sequence of word vectors. Note how the structure of the sentence is retained. And finally, apply a second linear layer to map from (seq_len, batch_size, hidden_dim) to (seq_len, batch_size, output_dim). Still with the sentence structure (cf. the dim=0 with seq_len).
This is the reason why you are getting (1136, 64, 1) as the predictions shape: 1136 must be your sequence length, 64 is BATCH_SIZE, while 1 is OUTPUT_DIM.
Yet you are trying to classify each sequence as a whole, what you would need instead is a single tensor or scalar value per sentence, i.e. a shape of (1, 64, 1). This implies reducing the first dimension corresponding to the sequence dimension, to a single value.
A straightforward and easy way to reduce the dimension such that you can represent the whole sentence with a single vector is by applying an average pool to the sentence. The average vector of the words in each sentence should give you the sentiment of the positiveness/negativeness of the overall sentence. You can apply this operator before the final projection to remain in a relatively high dimension, either with nn.AdaptiveAvgPool1d with an output size of 1 or simply torch.Tensor.mean.
Here is a possible implementation with nn.AdaptiveAvgPool1d:
class MLP(nn.Module):
def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
super().__init__()
self.embedding = nn.Embedding(input_dim, embedding_dim)
self.hidden_fc = nn.Linear(embedding_dim, hidden_dim)
self.avg = nn.AdaptiveAvgPool1d(1) # reduces (N, C, L_in) to (N, C, 1)
self.fc = nn.Linear(hidden_dim, output_dim)
def forward(self, text):
# (seq_len, batch_size) = (1136, 64)
embedded = self.embedding(text)
# (seq_len, batch_size, embedding_dim) = (1136, 64, 100)
h_1 = F.relu(self.hidden_fc(embedded))
# (seq_len, batch_size, hidden_dim) = (1136, 64, 256)
avg = self.avg(h_1.permute(1,2,0))
# (batch_size, hidden_dim, 1) = (64, 256, 1)
out = self.fc(avg.squeeze(-1))
# (batch_size, 1, 1) = (64, 1)
return out
Or with torch.Tensor.mean:
class MLP(nn.Module):
def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
super().__init__()
self.embedding = nn.Embedding(input_dim, embedding_dim)
self.hidden_fc = nn.Linear(embedding_dim, hidden_dim)
self.fc = nn.Linear(hidden_dim, output_dim)
def forward(self, text):
# (seq_len, batch_size) = (1136, 64)
embedded = self.embedding(text)
# (seq_len, batch_size, embedding_dim) = (1136, 64, 100)
h_1 = F.relu(self.hidden_fc(embedded))
# (seq_len, batch_size, hidden_dim) = (1136, 64, 256)
avg = h_1.mean(0)
# (batch_size, hidden_dim, 1) = (64, 256)
out = self.fc(avg)
# (batch_size, 1, 1) = (64, 1)
return out
Alternative methods involve using more sophisticated neural network layers such as recurrent neural network blocks (nn.RNN, nn.LSTM, nn.GRU)...
I'm having trouble understanding why this is throwing an error. This code is pulled directly from the PyTorch documentation for a NN classifier for the fashion MNIST dataset. However when I try to flip this to the MNIST handwritten digits data set it comes up with the following error:
RuntimeError: The size of tensor a (10) must match the size of tensor b (64) at non-singleton dimension 1
This occurs when using the loss function during the training loop function. Can anyone help me understand why this is happening. Thanks!
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor, Lambda
import torchvision.models as models
import matplotlib.pyplot as plt
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#device = "cpu"
print(f"Using {device} device")
training_data = datasets.MNIST(
root="data",
train=True,
download=True,
transform=ToTensor()
)
test_data = datasets.MNIST(
root="data",
train=False,
download=True,
transform=ToTensor()
)
train_dataloader = DataLoader(training_data, batch_size=64)
test_dataloader = DataLoader(test_data, batch_size=64)
class NeuralNetwork(nn.Module):
def __init__(self):
super(NeuralNetwork, self).__init__()
self.flatten = nn.Flatten()
self.linear_relu_stack = nn.Sequential(
nn.Linear(28*28, 512),
nn.ReLU(),
nn.Linear(512, 512),
nn.ReLU(),
nn.Linear(512, 10),
)
def forward(self, x):
x = self.flatten(x)
logits = self.linear_relu_stack(x)
return logits
def train_loop(dataloader, model, loss_fn, optimizer):
size = len(dataloader.dataset)
for batch, (X, y) in enumerate(dataloader):
# Compute prediction and loss
X, y = X.to(device), y.to(device)
pred = model(X)
loss = loss_fn(pred, y)
# Backpropagation
optimizer.zero_grad()
loss.backward()
optimizer.step()
if batch % 100 == 0:
loss, current = loss.item(), batch * len(X)
print(f"loss: {loss:>7f} [{current:>5d}/{size:>5d}]")
def test_loop(dataloader, model, loss_fn):
size = len(dataloader.dataset)
num_batches = len(dataloader)
test_loss, correct = 0, 0
with torch.no_grad():
for X, y in dataloader:
X, y = X.to(device), y.to(device)
pred = model(X)
test_loss += loss_fn(pred, y).item()
correct += (pred.argmax(1) == y).type(torch.float).sum().item()
test_loss /= num_batches
correct /= size
print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")
def save_checkpoint(state, filename = "checkpoint.pth.tar"):
print("=> Saving checkpoint")
torch.save(state, filename)
model = NeuralNetwork().to(device)
learning_rate = 1e-3
batch_size = 64
epochs = 10
# Initialize the loss function
loss_fn = nn.MSELoss()
optimiser = torch.optim.Adam(model.parameters(), lr=learning_rate)
for t in range(epochs):
print(f"Epoch {t+1}\n-------------------------------")
train_loop(train_dataloader, model, loss_fn, optimiser)
test_loop(test_dataloader, model, loss_fn)
print("Done!")
torch.nn.MSELoss is an implemention of mean squared error. You can't measure the difference between two tensors if they're different sizes (MSELoss does not allow for broadcasting). So if you're using MSELoss, then the predictions and the targets must be the same shape. In your case, preds is a tensor of shape [64, 10], and y is a tensor of shape [64].
The reason y is of shape [64] rather than [64, 10] is that most classification dataset implementations represent targets as integer labels rather than one-hot encoded vectors. In theory, you could convert these integer label targets to one-hot encoded targets.
But in reality, since this is a classification problem, you should probably be using something like nn.CrossEntropyLoss rather than nn.MSELoss. The former is a conventional classification loss function, and it allows the targets to be integer labels rather than one-hot labels (so just swapping out MSELoss for CrossEntropyLoss should solve your problem). MSELoss is better suited for regression tasks and such.
I am in the process of training my first CNN to solve a multi-class classification problem. I am feeding in images of animals corresponding to one of 182 classes, however I have ran into some issues. Firstly my code appears to get stuck on optimiser.step(), it has been calculating this for roughly 30 minutes. Secondly my training loss is increasing:
EPOCH: 0 BATCH: 1999 LOSS: 1.5790680234357715
EPOCH: 0 BATCH: 3999 LOSS: 2.9340945997834207
If any one would be able to provide some guidance that would be greatly appreciated. Below is my code
#loading data
train_data = dataset.get_subset(
"train",
transform=transforms.Compose(
[transforms.Resize((448, 448)), transforms.ToTensor()]
),
)
train_loader = get_train_loader("standard", train_data, batch_size=16)
#definind model
class ConvNet(nn.Module):
def __init__(self):
super(ConvNet, self).__init__()
self.conv1 = nn.Conv2d(3, 6, 3, 1)
self.conv2 = nn.Conv2d(6, 16, 3, 3)
self.fc1 = nn.Linear(37*37*16, 120)
self.fc2 = nn.Linear(120, 84)
self.fc3 = nn.Linear(84, 182)
def forward(self, X):
X = F.relu(self.conv1(X))
X = F.max_pool2d(X, 2, 2)
X = F.relu(self.conv2(X))
X = F.max_pool2d(X, 2, 2)
X = torch.flatten(X, 1)
X = F.relu(self.fc1((X)))
X = F.relu(self.fc2((X)))
X = self.fc3(X)
return F.log_softmax(X, dim=1)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(modell.parameters(), lr=0.001)
import time
start_time = time.time()
#VARIABLES (TRACKER)
epochs = 2
train_losses = []
test_losses = []
train_correct = []
test_correct = []
# FOR LOOP EPOCH
for i in range(epochs):
trn_corr = 0
tst_corr = 0
running_loss = 0.0
#TRAIN
for b, (X_train, Y_train, meta) in enumerate(train_loader):
b+=1 #batch starts at 1
#zero parameter gradients
optimizer.zero_grad()
# pass training to model as float (later compute loss)
output = modell(X_train.float())
#Calculate the loss of outputs with respect to ground truth values
loss = criterion(output, Y_train)
#Backpropagate the loss through the network
loss.backward()
#perform parameter update based on the current gradient
optimizer.step()
predicted = torch.max(output.data, 1)[1]
batch_corr = (predicted == Y_train).sum() # True (1) or False (0)
trn_corr += batch_corr
running_loss += loss.item()
if b%2000 == 1999:
print(f"EPOCH: {i} BATCH: {b} LOSS: {running_loss/2000}")
running_loss = 0.0
train_losses.append(loss)
train_correct.append(trn_corr)
As for the loss, it may be due to the model. The model has some rooms for improvement. Only 2 convolution layers is not sufficient for your data, as well as only expanding to 16 channels. Use more convolution layers with more channels. For example, 5 conv layers with channels of 16, 32, 32, 64, 64. Experiment with different numbers of layers and channels to see which one is best. Also, a good learning rate for Adam is 3e-4.To more easily track the models progress, I’d recommend decreasing the interval at which it prints the loss so you can more easily track progress.
About the data, are there enough instances of each class? Is it normalized between 0 and 1?
I am trying to train a model in pytorch.
input: 686-array
first layer: 64-array
second layer: 2-array
output: predition either 1 or 0
this is what I have so far:
class autoencoder(nn.Module):
def __init__(self):
super(autoencoder, self).__init__()
self.encoder_softmax = nn.Sequential(
nn.Linear(686, 256),
nn.ReLU(True),
nn.Linear(256, 2),
nn.Softmax()
)
def forward(self, x):
x = self.encoder_softmax(x)
return x
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
net = net.to(device)
iterations = 10
learning_rate = 0.98
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(
net.parameters(), lr=learning_rate, weight_decay=1e-5)
for epoch in range(iterations):
loss = 0.0
print("train_dl len: ", len(train_dl))
# net.train()
for i, data in enumerate(train_dl, 0):
inputs, labels, vectorize = data
labels = labels.long().to(device)
inputs = inputs.float().to(device)
optimizer.zero_grad()
outputs = net(inputs)
train_loss = criterion(outputs, labels)
train_loss.backward()
optimizer.step()
loss += train_loss.item()
loss = loss / len(train_dl)
but when I train the model, the loss is not going down. What am I doing wrong?
You're using nn.CrossEntropyLoss as the loss function, which applies log-softmax, but you also apply softmax in the model:
self.encoder_softmax = nn.Sequential(
nn.Linear(686, 256),
nn.ReLU(True),
nn.Linear(256, 2),
nn.Softmax() # <- needs to be removed
)
The output of your model should be the raw logits, without the nn.Softmax.
You should also lower the learning rate, because a learning rate of 0.98 is very high, which makes the training much less stable and you'll likely see the loss oscillate. Are more appropriate learning rate would be in the magnitude of 0.01 or 0.001.
Assume that I have 77 samples to train my CNN, and my batch size is 10. Then the last batch has a batch size of 7 instead of 10. Somehow when I pass it to the loss function such as nn.MSELoss(), it gives me the error:
RuntimeError: The size of tensor a (10) must match the size of tensor
b (7) at non-singleton dimension 1
So pytorch doesn't support batches with different sizes?
My code in doubt:
import numpy as np
import torch
from torch import nn
import torchvision
import torch.nn.functional as F
import torch.optim as optim
class Net(nn.Module):
def __init__(self):
super(Net, self).__init__()
self.conv1 = nn.Conv2d(1, 6, (5,4))
self.pool = nn.MaxPool2d(2, 2)
self.conv2 = nn.Conv2d(6, 16, 5)
self.fc1 = nn.Linear(64, 120)
self.fc2 = nn.Linear(120, 84)
self.fc3 = nn.Linear(84, 10)
def forward(self, x):
x = self.pool(F.relu(self.conv1(x)))
x = self.pool(F.relu(self.conv2(x)))
x = x.view(-1, x.shape[1] * x.shape[2] * x.shape[3])
x = F.relu(self.fc1(x))
x = F.relu(self.fc2(x))
x = self.fc3(x)
return x
model = Net()
batch_size = 10
# Generating Artifical data
x_train = torch.randn((77,1,20,20))
y_train = torch.randint(0,10,size=(77,),dtype=torch.float)
trainset = torch.utils.data.TensorDataset(x_train,y_train)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size, shuffle=True, num_workers=0)
# testloader = torch.utils.data.DataLoader(testset, batch_size=batch_size, shuffle=False, num_workers=0)
criterion = nn.MSELoss()
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
for epoch in range(20): # loop over the dataset multiple times
running_loss = 0.0
for i, data in enumerate(trainloader, 0):
# get the inputs
inputs, labels = data
# zero the parameter gradients
optimizer.zero_grad()
# forward + backward + optimize
outputs = model(inputs)
loss = criterion(outputs, labels)
loss.backward()
optimizer.step()
# print statistics
running_loss += loss.item()
if i%10==0:
print('epoch{}, step{}, loss: {}'.format(epoch + 1, i + 1, running_loss))
# print("frac post = {}".format(frac_post))
running_loss = 0.0
The problem is not due to the batch size, but to a failure to broadcast properly between the 10 outputs of your CNN and the single label provided in each example.
If you look at the model output and label tensor shapes during the batch where the error is thrown,
print(outputs.shape, labels.shape)
#out: torch.Size([7, 10]) torch.Size([7])
you'll see that the labels are stored in a singleton tensor. According to pytorch broadcasting rules, to be broadcastable two tensors have to be compatible in all trailing dimensions. In this case, the trailing dimension of the model output (10) is incompatible with that of the label (7).
To fix, either add a dummy dimension to the label (assuming you actually want to broadcast the labels to match your ten network outputs), or define a network with scalar outputs. For example:
y_train = torch.randint(0,10,size=(77,1),dtype=torch.float)
results in
print(outputs.shape, labels.shape)
#out: torch.Size([7, 10]) torch.Size([7,1])
# these are broadcastable