Training and evaluating a stacked auto-encoder model in PyTorch - Python

I am trying to train a model in pytorch.
input: 686-array
first layer: 64-array
second layer: 2-array
output: prediction, either 1 or 0
this is what I have so far:
class autoencoder(nn.Module):
    """Small feed-forward classifier: 686 inputs -> 256 hidden -> 2 outputs.

    The stack ends in ``nn.Softmax``, so ``forward`` returns softmax-normalised
    values rather than the raw outputs of the last linear layer.
    """

    def __init__(self):
        super(autoencoder, self).__init__()
        self.encoder_softmax = nn.Sequential(
            nn.Linear(686, 256),
            nn.ReLU(True),  # True selects the in-place variant of ReLU
            nn.Linear(256, 2),
            nn.Softmax()
        )

    def forward(self, x):
        """Pass ``x`` through the Linear/ReLU/Linear/Softmax stack."""
        x = self.encoder_softmax(x)
        return x
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
# NOTE(review): `net` is not created anywhere in this snippet; presumably
# `net = autoencoder()` ran earlier -- confirm.
net = net.to(device)
iterations = 10
learning_rate = 0.98
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(
    net.parameters(), lr=learning_rate, weight_decay=1e-5)
for epoch in range(iterations):
    loss = 0.0  # running sum of the per-batch losses for this epoch
    print("train_dl len: ", len(train_dl))
    # net.train()
    for i, data in enumerate(train_dl, 0):
        # Each batch unpacks into three items; the third is not used below.
        inputs, labels, vectorize = data
        labels = labels.long().to(device)
        inputs = inputs.float().to(device)
        optimizer.zero_grad()
        outputs = net(inputs)
        train_loss = criterion(outputs, labels)
        train_loss.backward()
        optimizer.step()
        loss += train_loss.item()
    # Mean loss per batch over the epoch.
    loss = loss / len(train_dl)
but when I train the model, the loss is not going down. What am I doing wrong?

You're using nn.CrossEntropyLoss as the loss function, which applies log-softmax, but you also apply softmax in the model:
self.encoder_softmax = nn.Sequential(
    nn.Linear(686, 256),
    nn.ReLU(True),
    nn.Linear(256, 2),
    nn.Softmax()  # <- needs to be removed
)
The output of your model should be the raw logits, without the nn.Softmax.
You should also lower the learning rate, because a learning rate of 0.98 is very high, which makes the training much less stable, and you'll likely see the loss oscillate. A more appropriate learning rate would be on the order of 0.01 or 0.001.

Related

Pytorch unsure of NN behaviour during training

I'm trying to train an NN to develop a relationship between a certain range of inputs and some mathematical coefficients but I'm struggling to figure out why the loss is not decreasing over the course of training. I've made a simple custom MLP and dataset to interface with the PyTorch module. The dataset will take in a 6 x m matrix for the inputs(m for however many different combinations I have) and a r x m matrix for the coefficients. However, when running the training script the training loss doesn't decrease over the course of 100 epochs. All the shapes match when entering and exiting the NN so I'm lost as to what might be going wrong. These are the shapes of the input and coefficients matrices for the training and test phase respectively.
inputs_train = (6, 581)
coefficients_train = (259, 581)
inputs_test = (6, 145)
coefficients_test = (259, 145)
All the relevant code is shown below.
class MLP(nn.Module):
    """Fully connected regressor mapping 6 input features to ``r`` outputs.

    Args:
        r: Number of values predicted per sample (width of the last layer).
    """

    def __init__(self, r):
        super(MLP, self).__init__()
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(6, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            # No activation after the output layer: a trailing ReLU would clamp
            # every prediction to >= 0, so negative regression targets could
            # never be matched and units stuck at zero receive no gradient.
            nn.Linear(512, r),
        )

    def forward(self, x):
        """Flatten ``x`` per sample and return the unbounded predictions."""
        x = self.flatten(x)
        logits = self.linear_relu_stack(x)
        return logits
class ExpansionCoefficientsDataset(Dataset):
    """Dataset pairing input columns with expansion-coefficient columns.

    Both arrays are indexed along their second axis: sample ``idx`` is column
    ``idx`` of ``inputs`` together with column ``idx`` of ``alphas``.

    Args:
        inputs: 2-D array whose columns are the per-sample inputs.
        alphas: 2-D array whose columns are the per-sample coefficients.
    """

    def __init__(self, inputs, alphas):
        self.geom_params = inputs
        self.expansion_coefficients = alphas

    def __len__(self):
        # One sample per column of the coefficient matrix.
        return self.expansion_coefficients.shape[1]

    def __getitem__(self, idx):
        """Return ``{'inputs': ..., 'alphas': ...}`` for column ``idx``."""
        if torch.is_tensor(idx):
            # Convert a tensor index to plain Python before slicing.
            idx = idx.tolist()
        sample = {
            'inputs': self.geom_params[:, idx],
            'alphas': self.expansion_coefficients[:, idx],
        }
        return sample
train_dataset = ExpansionCoefficientsDataset(inputs_train, coefficients_train)
test_dataset = ExpansionCoefficientsDataset(inputs_test, coefficients_test)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=16)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=16)
#%%
model = MLP(r_ideal).double()
criterion = torch.nn.L1Loss()
num_epochs = 100
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
# NOTE: len() of a DataLoader is the number of batches, not the batch size;
# this value is not used below.
batch_size = len(train_loader)
# Training loop starts
for epoch in range(num_epochs):
    print(f"-------------------------------\nEpoch {epoch+1}\n-------------------------------")
    size = len(train_loader.dataset)
    # Iterating over the training dataset
    for i_batch, sample_batched in enumerate(train_loader):
        pred = model(sample_batched['inputs'])
        # Calculating loss
        loss = criterion(pred, sample_batched['alphas'])
        # Updating weights according
        # to the calculated loss
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # Report progress on every 8th batch.
        if i_batch % 8 == 0:
            loss, current = loss.item(), i_batch * sample_batched['inputs'].size()[0]
            print(f"loss: {loss:>7f} [{current:>5d}/{size:>5d}]")

How do I fix this size of tensor error for my NN classifier PyTorch

I'm having trouble understanding why this is throwing an error. This code is pulled directly from the PyTorch documentation for a NN classifier for the fashion MNIST dataset. However when I try to flip this to the MNIST handwritten digits data set it comes up with the following error:
RuntimeError: The size of tensor a (10) must match the size of tensor b (64) at non-singleton dimension 1
This occurs when using the loss function during the training loop function. Can anyone help me understand why this is happening. Thanks!
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor, Lambda
import torchvision.models as models
import matplotlib.pyplot as plt
# Prefer CUDA when it is available; otherwise run on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#device = "cpu"
print(f"Using {device} device")

# MNIST training split, downloaded into ./data and converted to tensors.
training_data = datasets.MNIST(
    root="data",
    train=True,
    download=True,
    transform=ToTensor(),
)

# MNIST test split, same location and transform.
test_data = datasets.MNIST(
    root="data",
    train=False,
    download=True,
    transform=ToTensor(),
)

# Both loaders yield batches of 64 samples.
train_dataloader = DataLoader(training_data, batch_size=64)
test_dataloader = DataLoader(test_data, batch_size=64)
class NeuralNetwork(nn.Module):
    """MLP for 28x28 images: 784 -> 512 -> 512 -> 10, ReLU between layers."""

    def __init__(self):
        super(NeuralNetwork, self).__init__()
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(28*28, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, 10),
        )

    def forward(self, x):
        """Flatten each sample and return the 10 raw output scores."""
        x = self.flatten(x)
        logits = self.linear_relu_stack(x)
        return logits
def train_loop(dataloader, model, loss_fn, optimizer):
    """Train ``model`` for one pass over ``dataloader``.

    Args:
        dataloader: Iterable of ``(X, y)`` batches exposing ``.dataset``.
        model: Module being optimised.
        loss_fn: Callable taking ``(pred, y)`` and returning a scalar loss.
        optimizer: Optimizer bound to ``model``'s parameters.
    """
    size = len(dataloader.dataset)
    # Make sure train-mode layers (dropout, batch norm) behave as in training.
    model.train()
    for batch, (X, y) in enumerate(dataloader):
        # Compute prediction and loss
        X, y = X.to(device), y.to(device)
        pred = model(X)
        loss = loss_fn(pred, y)
        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # Log progress every 100 batches.
        if batch % 100 == 0:
            loss, current = loss.item(), batch * len(X)
            print(f"loss: {loss:>7f} [{current:>5d}/{size:>5d}]")
def test_loop(dataloader, model, loss_fn):
    """Evaluate ``model`` on ``dataloader`` and print accuracy and mean loss.

    Args:
        dataloader: Iterable of ``(X, y)`` batches exposing ``.dataset``.
        model: Module to evaluate.
        loss_fn: Callable taking ``(pred, y)`` and returning a scalar loss.
    """
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    test_loss, correct = 0, 0
    # Switch train-mode layers (dropout, batch norm) to inference behaviour.
    model.eval()
    # No gradients are needed for evaluation.
    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()
    test_loss /= num_batches  # mean loss per batch
    correct /= size  # fraction of correctly classified samples
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")
def save_checkpoint(state, filename="checkpoint.pth.tar"):
    """Serialise ``state`` to ``filename`` with ``torch.save``.

    Args:
        state: Object to persist (e.g. a dict of state dicts).
        filename: Destination passed straight to ``torch.save``.
    """
    print("=> Saving checkpoint")
    torch.save(state, filename)
model = NeuralNetwork().to(device)
learning_rate = 1e-3
batch_size = 64
epochs = 10
# Initialize the loss function
loss_fn = nn.MSELoss()
optimiser = torch.optim.Adam(model.parameters(), lr=learning_rate)
# One training pass followed by one evaluation pass per epoch.
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train_loop(train_dataloader, model, loss_fn, optimiser)
    test_loop(test_dataloader, model, loss_fn)
print("Done!")
torch.nn.MSELoss is an implementation of mean squared error. You can't measure the difference between two tensors if they're different sizes (MSELoss does not allow for broadcasting). So if you're using MSELoss, then the predictions and the targets must be the same shape. In your case, pred is a tensor of shape [64, 10], and y is a tensor of shape [64].
The reason y is of shape [64] rather than [64, 10] is that most classification dataset implementations represent targets as integer labels rather than one-hot encoded vectors. In theory, you could convert these integer label targets to one-hot encoded targets.
But in reality, since this is a classification problem, you should probably be using something like nn.CrossEntropyLoss rather than nn.MSELoss. The former is a conventional classification loss function, and it allows the targets to be integer labels rather than one-hot labels (so just swapping out MSELoss for CrossEntropyLoss should solve your problem). MSELoss is better suited for regression tasks and such.

Training loss is increasing in CNN?

I am in the process of training my first CNN to solve a multi-class classification problem. I am feeding in images of animals corresponding to one of 182 classes; however, I have run into some issues. Firstly, my code appears to get stuck on optimiser.step(); it has been calculating this for roughly 30 minutes. Secondly, my training loss is increasing:
EPOCH: 0 BATCH: 1999 LOSS: 1.5790680234357715
EPOCH: 0 BATCH: 3999 LOSS: 2.9340945997834207
If any one would be able to provide some guidance that would be greatly appreciated. Below is my code
# Loading data: take the "train" subset, resizing every image to 448x448
# and converting it to a tensor.
train_data = dataset.get_subset(
    "train",
    transform=transforms.Compose([
        transforms.Resize((448, 448)),
        transforms.ToTensor(),
    ]),
)
train_loader = get_train_loader("standard", train_data, batch_size=16)
# defining model
class ConvNet(nn.Module):
    """Two conv/pool stages followed by three linear layers (182 outputs).

    ``fc1`` expects 37*37*16 features, which is what a 3x448x448 image yields
    after the two convolution + 2x2 max-pool stages below.
    """

    def __init__(self):
        super(ConvNet, self).__init__()
        self.conv1 = nn.Conv2d(3, 6, 3, 1)   # 3 -> 6 channels, 3x3, stride 1
        self.conv2 = nn.Conv2d(6, 16, 3, 3)  # 6 -> 16 channels, 3x3, stride 3
        self.fc1 = nn.Linear(37*37*16, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 182)

    def forward(self, X):
        """Return per-class log-probabilities for a batch of images."""
        X = F.relu(self.conv1(X))
        X = F.max_pool2d(X, 2, 2)
        X = F.relu(self.conv2(X))
        X = F.max_pool2d(X, 2, 2)
        # Keep the batch dimension, flatten everything else.
        X = torch.flatten(X, 1)
        X = F.relu(self.fc1(X))
        X = F.relu(self.fc2(X))
        X = self.fc3(X)
        return F.log_softmax(X, dim=1)
criterion = nn.CrossEntropyLoss()
# NOTE(review): `modell` is not created anywhere in this snippet; presumably
# `modell = ConvNet()` ran earlier -- confirm.
optimizer = torch.optim.Adam(modell.parameters(), lr=0.001)
import time
start_time = time.time()
#VARIABLES (TRACKER)
epochs = 2
train_losses = []
test_losses = []
train_correct = []
test_correct = []
# FOR LOOP EPOCH
for i in range(epochs):
    trn_corr = 0
    tst_corr = 0
    running_loss = 0.0
    #TRAIN
    for b, (X_train, Y_train, meta) in enumerate(train_loader):
        b+=1 #batch starts at 1
        #zero parameter gradients
        optimizer.zero_grad()
        # pass training to model as float (later compute loss)
        output = modell(X_train.float())
        #Calculate the loss of outputs with respect to ground truth values
        loss = criterion(output, Y_train)
        #Backpropagate the loss through the network
        loss.backward()
        #perform parameter update based on the current gradient
        optimizer.step()
        predicted = torch.max(output.data, 1)[1]
        batch_corr = (predicted == Y_train).sum() # True (1) or False (0)
        trn_corr += batch_corr
        running_loss += loss.item()
        if b%2000 == 1999:
            print(f"EPOCH: {i} BATCH: {b} LOSS: {running_loss/2000}")
            running_loss = 0.0
    # Store a plain float: appending the tensor itself would keep it (and its
    # autograd history) alive for the whole run.
    train_losses.append(loss.item())
    train_correct.append(trn_corr)
As for the loss, it may be due to the model. The model has some room for improvement. Only 2 convolution layers are not sufficient for your data, and neither is expanding to only 16 channels. Use more convolution layers with more channels. For example, 5 conv layers with channels of 16, 32, 32, 64, 64. Experiment with different numbers of layers and channels to see which one is best. Also, a good learning rate for Adam is 3e-4. To more easily track the model's progress, I'd recommend decreasing the interval at which it prints the loss.
About the data, are there enough instances of each class? Is it normalized between 0 and 1?

PyTorch Input and hidden tensors not on the same device

I'm creating a simple LSTM model to predict some sales data. I am trying to train it on a GPU, but there seems to be a problem with casting the hidden state tensor to cuda.
I get the following error:
RuntimeError: Input and hidden tensors are not at the same device, found input tensor at cuda:0 and hidden tensor at cpu.
How can I train the model on a GPU? I cast the training data, initial hidden states, and the model to cuda, yet I still get the error.
Here's my code:
# Convert train_norm from an array to a tensor
# (view(-1) flattens it to 1-D and .cuda() moves it to the GPU)
train_norm = torch.FloatTensor(train_norm).view(-1).cuda()
# define a window size
# (the number of consecutive values in each training sequence)
window_size = 12
# Define function to create seq/label tuples
def input_data(seq, ws):  # ws is window size
    """Split ``seq`` into sliding ``(window, label)`` pairs.

    Each window holds ``ws`` consecutive items and its label is the
    length-1 slice that immediately follows it.

    Args:
        seq: Sliceable sequence (list, tensor, ...).
        ws: Window size.

    Returns:
        List of ``(window, label)`` tuples; empty when ``len(seq) <= ws``.
    """
    out = []
    L = len(seq)
    for i in range(L - ws):
        window = seq[i:i + ws]
        # Slice rather than index so the label keeps the sequence type.
        label = seq[i + ws:i + ws + 1]
        out.append((window, label))
    return out
# Apply the input_data function to the train_norm
# (yields a list of (window, label) tuples)
train_data = input_data(train_norm, window_size)
class LSTM(nn.Module):
    """Single-layer LSTM followed by a linear head; keeps its own hidden state.

    Args:
        input_size: Number of features per time step.
        hidden_size: Width of the LSTM hidden/cell state.
        output_size: Width of the linear output layer.
    """

    def __init__(self, input_size=1, hidden_size=100, output_size=1):
        super().__init__()
        self.hidden_size = hidden_size
        # Add an LSTM layer:
        self.lstm = nn.LSTM(input_size, hidden_size)
        # Add a fully connected linear layer:
        self.linear = nn.Linear(hidden_size, output_size)
        # Initialize h0 and c0:
        # (plain attributes placed on the GPU here, not registered buffers)
        self.hidden = (torch.zeros(1, 1, hidden_size).cuda(),
                       torch.zeros(1, 1, hidden_size).cuda())

    def forward(self, seq):
        """Feed ``seq`` as one batch-of-1 sequence; return the last prediction."""
        # The LSTM is given self.hidden and its new state is stored back.
        lstm_out, self.hidden = self.lstm(seq.view(len(seq), 1, -1), self.hidden)
        pred = self.linear(lstm_out.view(len(seq), -1))
        return pred[-1]  # get only the last value
model = LSTM().cuda()
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
epochs = 200
import time
start_time = time.time()
for epoch in range(epochs):
    # Extract the sequence and label from the training data
    for seq, y_train in train_data:
        # Reset the parameters and hidden states
        optimizer.zero_grad()
        # These zero tensors are created without .cuda().
        hidden = (torch.zeros(1, 1, model.hidden_size),
                  torch.zeros(1, 1, model.hidden_size))
        model.hidden = hidden
        # Predict the values
        y_pred = model(seq)
        # Calculate loss and perform backpropagation
        loss = criterion(y_pred, y_train)
        loss.backward()
        optimizer.step()
    # Loss of the last sequence processed in this epoch.
    print(f'epoch: {epoch+1:2} loss: {loss.item():10.8f}')
print(f'Training took {time.time() - start_time:.0f} seconds')
First of all you are initializing hidden when there is absolutely no point to do it. If hidden isn't passed to LSTM layer it will be zero by default, please see documentation. This gives us the following model:
class LSTM(nn.Module):
    """Single-layer LSTM followed by a linear head, with no stored hidden state.

    Args:
        input_size: Number of features per time step.
        hidden_size: Width of the LSTM hidden/cell state.
        output_size: Width of the linear output layer.
    """

    def __init__(self, input_size=1, hidden_size=100, output_size=1):
        super().__init__()
        self.hidden_size = hidden_size
        # Add an LSTM layer:
        self.lstm = nn.LSTM(input_size, hidden_size)
        # Add a fully connected linear layer:
        self.linear = nn.Linear(hidden_size, output_size)

    def forward(self, seq):
        """Return one prediction per time step of ``seq``."""
        # No initial state is passed, so nn.LSTM starts from zeros.
        lstm_out, _ = self.lstm(seq.view(len(seq), 1, -1))
        return self.linear(lstm_out.view(len(seq), -1))
Your pred[-1] is probably wrong as well as you are only returning the last element of batch from linear layer...
Also your training should be this (see hidden removed and added cuda to seq and y_train):
for epoch in range(epochs):
    # Extract the sequence and label from the training data
    for seq, y_train in train_data:
        # Reset the parameters and hidden states
        optimizer.zero_grad()
        # Predict the values
        # Add cuda to sequence
        y_pred = model(seq.cuda())
        # Calculate loss and perform backpropagation
        loss = criterion(y_pred, y_train.cuda())
        loss.backward()
        optimizer.step()
    # Loss of the last sequence processed in this epoch.
    print(f'epoch: {epoch+1:2} loss: {loss.item():10.8f}')
print(f'Training took {time.time() - start_time:.0f} seconds')
This alleviates problems with cuda (it's not a solution to hardcode it everywhere you possibly can...) and makes your code more readable.

pytorch training loss invariant with varying forward pass implementations

The following code (MNIST MLP in PyTorch) delivers approximately the same training loss regardless of having the last layer in the forward pass returning:
F.log_softmax(x)
(x)
Option 1 is incorrect because I use criterion = nn.CrossEntropyLoss() and yet the results are almost identical. Am I missing anything?
import torch
import numpy as np
import time
from torchvision import datasets
import torchvision.transforms as transforms
# Worker subprocesses used for data loading (0 = load in the main process).
num_workers = 0
# Samples per batch.
batch_size = 20
# Convert each image to a torch.FloatTensor.
transform = transforms.ToTensor()

# Training and test splits of MNIST, downloaded into ./data.
train_data = datasets.MNIST(
    root='data', train=True, download=True, transform=transform)
test_data = datasets.MNIST(
    root='data', train=False, download=True, transform=transform)

# Data loaders over the two splits.
train_loader = torch.utils.data.DataLoader(
    train_data, batch_size=batch_size, num_workers=num_workers)
test_loader = torch.utils.data.DataLoader(
    test_data, batch_size=batch_size, num_workers=num_workers)
import torch.nn as nn
import torch.nn.functional as F
class Net(nn.Module):
    """MLP for 28x28 images: 784 -> 512 -> 256 -> 10 with dropout (p=0.2)."""

    def __init__(self):
        super(Net, self).__init__()
        # linear layer (784 -> 512 hidden nodes)
        self.fc1 = nn.Linear(28 * 28, 512)
        self.dropout1 = nn.Dropout(p=0.2, inplace=False)
        # linear layer (512 -> 256 hidden nodes)
        self.fc2 = nn.Linear(512, 256)
        self.dropout2 = nn.Dropout(p=0.2, inplace=False)
        # Not used in forward().
        self.dropout = nn.Dropout(p=0.2, inplace=False)
        # output layer (256 -> 10)
        self.fc3 = nn.Linear(256, 10)

    def forward(self, x):
        """Return the 10 raw output scores for each image in ``x``."""
        # flatten image input
        x = x.view(-1, 28 * 28)
        # add hidden layer, with relu activation function
        x = F.relu(self.fc1(x))
        x = self.dropout1(x)
        x = F.relu(self.fc2(x))
        x = self.dropout2(x)
        x = self.fc3(x)
        # return F.log_softmax(x)
        return x
# initialize the NN
model = Net()
print(model)
model.to('cuda')
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
n_epochs = 10
model.train()  # prep model for training
for epoch in range(n_epochs):
    # monitor training loss
    train_loss = 0.0
    start = time.time()
    for data, target in train_loader:
        data, target = data.to('cuda'), target.to('cuda')
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        # Weight the batch-mean loss by the batch size so the division
        # below gives a per-sample average.
        train_loss += loss.item()*data.size(0)
    train_loss = train_loss/len(train_loader.dataset)
    print('Epoch: {} \tTraining Loss: {:.6f} \ttime: {:.6f}'.format(
        epoch+1,
        train_loss,
        time.time()-start
    ))
For numerical stability, the nn.CrossEntropyLoss() is implemented with the softmax layer inside it. So you should NOT use the softmax layer in your forward pass.
From the docs (https://pytorch.org/docs/stable/nn.html#crossentropyloss):
This criterion combines nn.LogSoftmax() and nn.NLLLoss() in one single class.
Using the softmax layer in the forward pass will lead to worse metrics because the gradient magnitudes are lowered (thus, the weight updates are also lowered). I learned it the hard way!
I guess your problem is that the losses are similar at the beginning of training, but by the end of training they should not be. It is usually a good sanity check to overfit your model on one batch of data. The model should reach 100% accuracy if the batch is small enough. If the model is taking too long to train, then you probably have a bug somewhere.
Hope that helps =)

Categories