Loss increases when training neural network - python

I am trying to write code for regression using neural networks (for learning purposes).
Here is my code:
# fixme: k-fold cross validation
n_crossVal = 10
kf = KFold(n_splits=n_crossVal)  # , random_state=1, shuffle=True  fixme

for p_t in key_set_1:
    cur_ds = []
    for i, roi in enumerate(key_set_2):
        if i == 0:
            cur_ds = brain_ds[p_t + '_' + roi]
        else:
            cur_ds = np.hstack((cur_ds, brain_ds[p_t + '_' + roi]))

    print(cur_ds.shape)
    print(n_train)
    size_input = cur_ds.shape[1]

    preds_case = np.zeros(glove_ds.shape)
    k_no = 0
    for k_train_index, k_test_index in kf.split(cur_ds):
        train_X_ds = torch.from_numpy(cur_ds[k_train_index, :])
        train_y_ds = torch.from_numpy(glove_ds[k_train_index, :])
        train_ds = TensorDataset(train_X_ds, train_y_ds)

        test_X_ds = torch.from_numpy(cur_ds[k_test_index, :])
        test_y_ds = torch.from_numpy(glove_ds[k_test_index, :])
        test_ds = TensorDataset(test_X_ds, test_y_ds)

        preds = fit_reg(train_ds, train_X_ds, train_y_ds, test_X_ds, test_y_ds, which_case, k_no, p_t)
        k_no += 1
        preds_case[k_test_index, :] = preds.detach().numpy()
and my model:
class RegressionNet(nn.Module):
    def __init__(self):
        super(RegressionNet, self).__init__()
        self.linear1 = nn.Linear(size_input, size_hidden)
        self.act1 = nn.ReLU()
        self.linear2 = nn.Linear(size_hidden, size_output)

    def forward(self, input_X):
        X = self.linear1(input_X)
        X = self.act1(X)
        X = self.linear2(X)
        return X
def fit_reg(train_ds, train_X_torch, train_y_torch, test_X_torch, test_y_torch, case_type, fold_no, p_t):
    # test_y_torch is accepted to match the call site above but is not used here
    num_epochs = 1
    loss_fn = F.mse_loss
    model = RegressionNet()
    opt = torch.optim.SGD(model.parameters(), lr=1e-5)

    for epoch in range(num_epochs):
        print("num epoch: ", epoch)
        for xb, yb in train_ds:
            # not batch? fixme
            # print(xb.shape, yb.shape, type(xb), type(yb))
            pred = model(xb.float())
            loss = loss_fn(pred, yb.float())

            loss.backward()
            opt.step()
            opt.zero_grad()

        print('Training loss: ', loss_fn(model(train_X_torch.float()), train_y_torch.float()))

    pred_test_here = model(test_X_torch.float())
    torch.save(model.state_dict(), './weights_' + case_type + '_' + str(fold_no) + '_' + p_t)
    return pred_test_here
So I am using 10-fold cross-validation. Each time, I pass nine tenths of my data into the network for training and test on the remaining tenth.
My questions:
Is this the correct way to perform regression?
How can I send batches of data for training instead of one sample at a time?
After training finishes after some number of epochs, I report the training loss computed over all training samples at once; is that correct?
Thanks in advance.

This question needs serious editing; no data is included, and the second block of code is used for data formatting, which is not easy to follow. But here's my example, if it can help. I took the suggested model with small changes and made a 1D regression example:
import torch
from sklearn.model_selection import KFold
from torch import nn
import math
import torch.nn.functional as F
import matplotlib.pyplot as plt
from matplotlib.animation import FuncAnimation
import numpy as np

predictionFull = []
lossFull = []

# data
x = torch.unsqueeze(torch.linspace(-math.pi, math.pi, 1000), dim=1)
y = torch.sin(x**2) + 0.3*torch.rand(x.size())

fig, ax = plt.subplots(figsize=(12, 7))
curve, = ax.plot(x, x, 'r-', linewidth=2)
time_text = ax.text(.5, .5, '', fontsize=15)

def update(i):
    # label = 'timestep {0}'.format(i)
    curve.set_ydata(predictionFull[i].data.numpy())
    time_text.set_text('Loss = %.4f' % lossFull[i].data.numpy())
    time_text.set_x(1.0)
    time_text.set_y(-3.0)
    time_text.set_color('red')
    return curve

class RegressionNet(nn.Module):
    def __init__(self, size_input, size_hidden, size_output):
        super(RegressionNet, self).__init__()
        self.linear1 = nn.Linear(size_input, size_hidden)
        self.linear2 = nn.Linear(size_hidden, size_output)

    def forward(self, input_X):
        X = F.relu(self.linear1(input_X))
        X = self.linear2(X)
        return X

def fit_reg(x, y):
    num_epochs = 2000
    loss_fn = torch.nn.MSELoss()
    model = RegressionNet(1, 500, 1)
    opt = torch.optim.Adam(model.parameters(), lr=0.002)

    for epoch in range(num_epochs):
        pred = model(x)          # input x and predict based on x
        loss = loss_fn(pred, y)  # must be (1. nn output, 2. target)

        opt.zero_grad()   # clear gradients for next train
        loss.backward()   # backpropagation, compute gradients
        opt.step()        # apply gradients

        predictionFull.append(pred)
        lossFull.append(loss)

fit_reg(x, y)

ax.scatter(x.data.numpy(), y.data.numpy(), color="orange")
ax.set_xlim(-math.pi, math.pi)
ax.set_ylim(-math.pi, math.pi)
if __name__ == '__main__':
    # FuncAnimation will call the 'update' function for each frame; here we
    # animate over the 2000 recorded training steps, sampling every 20th step
    # (100 frames), with an interval of 2 ms between frames.
    anim = FuncAnimation(fig, update, frames=np.arange(0, 2000, 20), interval=2)
    anim.save('./an.gif', writer='imagemagick', fps=500)
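On the batching part of the original question: a common pattern is to wrap the TensorDataset in a DataLoader and iterate over mini-batches instead of single samples. Below is a minimal sketch with synthetic stand-in data, since the original brain_ds/glove_ds arrays are not included; the sizes and the small Sequential model are illustrative assumptions, not the asker's actual setup.

import torch
from torch import nn
from torch.utils.data import TensorDataset, DataLoader

# synthetic stand-ins for the question's train_X_ds / train_y_ds tensors
train_X = torch.randn(90, 20)   # 90 samples, 20 input features
train_y = torch.randn(90, 5)    # 90 samples, 5 regression targets

train_ds = TensorDataset(train_X, train_y)
train_dl = DataLoader(train_ds, batch_size=16, shuffle=True)  # mini-batches of 16

model = nn.Sequential(nn.Linear(20, 32), nn.ReLU(), nn.Linear(32, 5))
opt = torch.optim.SGD(model.parameters(), lr=1e-3)
loss_fn = nn.functional.mse_loss

for epoch in range(5):
    for xb, yb in train_dl:          # each xb is [16, 20], yb is [16, 5]
        pred = model(xb)
        loss = loss_fn(pred, yb)
        loss.backward()
        opt.step()
        opt.zero_grad()
    # epoch-level training loss reported over the full training set
    with torch.no_grad():
        print(epoch, loss_fn(model(train_X), train_y).item())

Reporting the epoch-level loss over the whole training set, as in the last two lines, also answers the third question: it is a reasonable way to track training progress, as long as it is computed under torch.no_grad().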

Related

RuntimeError: mat1 and mat2 shapes cannot be multiplied CNN

While running this code on my dataset, I am getting errors. My data head looks like this:
fridge_temperature temp_condition label
0 13.10 high 0
1 8.65 high 0
2 2.00 low 0
3 4.80 low 0
4 10.70 high 0
and this is the shape of my data: (587076, 3)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split

# --------------- Dataset ---------------

class StudentsPerformanceDataset(Dataset):
    """Students Performance dataset."""

    def __init__(self, csv_file):
        """Initializes instance of class StudentsPerformanceDataset.

        Args:
            csv_file (str): Path to the csv file with the students data.
        """
        df = pd.read_csv("Z:/new_file.csv")

        # Drop the column you want to remove
        df = df.drop('date_time', axis=1)
        df = df.drop('type', axis=1)

        print(df.head())
        print(df.shape)

        # Grouping variable names
        self.categorical = ["temp_condition"]
        self.target = "label"

        # One-hot encoding of categorical variables
        self.students_frame = pd.get_dummies(df, prefix=self.categorical)

        # Save target and predictors
        self.X = self.students_frame.drop(self.target, axis=1)
        self.y = self.students_frame[self.target]

    def __len__(self):
        return len(self.students_frame)

    def __getitem__(self, idx):
        # Convert idx from tensor to list due to pandas bug (that arises when using pytorch's random_split)
        if isinstance(idx, torch.Tensor):
            idx = idx.tolist()

        return [self.X.iloc[idx].values, self.y[idx]]

# --------------- Model ---------------

class Net(nn.Module):
    def __init__(self, D_in, H=15, D_out=1):
        super().__init__()
        self.fc1 = nn.Linear(D_in, H)
        self.fc2 = nn.Linear(H, D_out)
        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x.squeeze()
""" class Net(nn.Module):
def __init__(self, D_in, H=15, D_out=1):
super().__init__()
self.fc1 = nn.Linear(D_in, H)
self.fc2 = nn.Linear(H, D_out)
self.relu = nn.ReLU()
def forward(self, x):
x = self.fc1(x)
x = self.relu(x)
x = self.fc2(x)
return x.squeeze() """
# --------------- Training ---------------

def train(csv_file, n_epochs=100):
    """Trains the model.

    Args:
        csv_file (str): Absolute path of the dataset used for training.
        n_epochs (int): Number of epochs to train.
    """
    # Load dataset
    dataset = StudentsPerformanceDataset(csv_file)

    # Split into training and test
    train_size = int(0.8 * len(dataset))
    test_size = len(dataset) - train_size
    trainset, testset = random_split(dataset, [train_size, test_size])

    # Dataloaders
    trainloader = DataLoader(trainset, batch_size=200, shuffle=True)
    testloader = DataLoader(testset, batch_size=200, shuffle=False)

    # Use gpu if available
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    # Define the model
    D_in, H = 3, 15
    net = Net(D_in, H).to(device)
    # D_in, H = 19, 15
    # net = Net(D_in, H).to(device)

    # Loss function
    criterion = nn.MSELoss()

    # Optimizer
    optimizer = optim.Adam(net.parameters(), weight_decay=0.0001)

    # Train the net
    loss_per_iter = []
    loss_per_batch = []
    for epoch in range(n_epochs):
        running_loss = 0.0
        for i, (inputs, labels) in enumerate(trainloader):
            inputs = inputs.to(device)
            labels = labels.to(device)

            # Zero the parameter gradients
            optimizer.zero_grad()

            # Forward + backward + optimize
            outputs = net(inputs.float())
            loss = criterion(outputs, labels.float())
            loss.backward()
            optimizer.step()

            # Save loss to plot
            running_loss += loss.item()
            loss_per_iter.append(loss.item())

        loss_per_batch.append(running_loss / (i + 1))
        running_loss = 0.0

    # Comparing training to test
    dataiter = iter(testloader)
    inputs, labels = dataiter.next()
    inputs = inputs.to(device)
    labels = labels.to(device)
    outputs = net(inputs.float())
    print("Root mean squared error")
    print("Training:", np.sqrt(loss_per_batch[-1]))
    print("Test", np.sqrt(criterion(labels.float(), outputs).detach().cpu().numpy()))

    # Plot training loss curve
    plt.plot(np.arange(len(loss_per_iter)), loss_per_iter, "-", alpha=0.5, label="Loss per epoch")
    plt.plot(np.arange(len(loss_per_iter), step=4) + 3, loss_per_batch, ".-", label="Loss per mini-batch")
    plt.xlabel("Number of epochs")
    plt.ylabel("Loss")
    plt.legend()
    plt.show()

if __name__ == "__main__":
    import os
    import sys
    import argparse

    # By default, read csv file in the same directory as this script
    csv_file = os.path.join(sys.path[0], "Z:/new_file.csv")

    # Parsing arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("--file", "-f", nargs="?", const=csv_file, default=csv_file,
                        help="Dataset file used for training")
    parser.add_argument("--epochs", "-e", type=int, nargs="?", default=100, help="Number of epochs to train")
    args = parser.parse_args()

    # Call the main function of the script
    train(args.file, args.epochs)
I am getting this error:
return F.linear(input, self.weight, self.bias)
RuntimeError: mat1 and mat2 shapes cannot be multiplied (200x7 and 3x15)
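One way to read the error (an observation, not a tested fix): the batch reaching the first layer has 7 feature columns (the 200x7 in the message, i.e. what remains after one-hot encoding and dropping the label), while the model was built with D_in = 3, so nn.Linear(3, 15) cannot multiply a 200x7 input by its 3x15 weight. A small sketch of deriving D_in from the encoded predictors instead of hard-coding it, assuming the StudentsPerformanceDataset and Net classes shown above:

import torch

# derive the input width from the encoded predictors instead of hard-coding D_in = 3
dataset = StudentsPerformanceDataset("Z:/new_file.csv")   # path as used in the question
D_in = dataset.X.shape[1]                                 # number of feature columns after one-hot encoding
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
net = Net(D_in, H=15).to(device)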

I should have a shape [150,2,2] but get a shape of [2,2,2] with DataLoader

I'm currently trying to train a Recurrent Neural Network with PyTorch and I am having trouble managing the DataLoader. Let's start from the beginning.
import matplotlib.pyplot as plt
import numpy as np
import torch

T = 50   # period
t = 300  # time
timeStep = np.linspace(0, t, 300)
mu = 0
sigma = np.sqrt(0.001)
x1 = []
x2 = []

for s in timeStep:
    eps1 = np.random.randn(1)*sigma + mu
    eps2 = np.random.randn(1)*sigma + mu
    x1.append(np.cos(2*s*np.pi/T) + eps1)
    x2.append(np.sin(4*s*np.pi/T) + eps2)

from torch import nn
from torch.optim.lr_scheduler import LambdaLR
from torch.utils.data import Dataset, DataLoader

class Data(torch.utils.data.Dataset):
    def __init__(self):
        for sample in range(10):
            self.X = torch.from_numpy(np.stack([x1, x2], axis=1).reshape([-1, 2, 2])).float()
            self.Y = torch.from_numpy(np.append(np.delete(self.X, 0, axis=0), self.X[1].reshape([1, 2, 2]), axis=0)).float()
        print(self.X.shape, self.Y.shape)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, index):
        feature = self.X[index]
        label = self.Y[index]
        return feature, label

dataset = Data()
At this point, dataset.X.shape, dataset.Y.shape gives [150, 2, 2], [150, 2, 2]. So up to here, that's what I need, so no problem. (I get 2 samples of 150 time-series points from a 300-point time series.)
from torch.autograd import Variable
from typing import Tuple

class Recurrent(nn.Module):
    def __init__(self, hidden_dim: int = 20):
        super().__init__()
        self.hidden_dim: int = hidden_dim
        self.hidden: Tuple[Variable, Variable] = self.init_hidden()
        self.rnn = nn.LSTM(2, self.hidden_dim)
        self.fc = nn.Sequential(
            nn.ReLU(),
            nn.Linear(self.hidden_dim, 2)
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x, hidden = self.rnn(x, self.hidden)
        self.hidden = (Variable(hidden[0].data), Variable(hidden[1].data))
        x = self.fc(x)
        return x

    def init_hidden(self) -> Tuple[Variable, Variable]:
        return (
            Variable(torch.zeros(1, 2, self.hidden_dim)),
            Variable(torch.zeros(1, 2, self.hidden_dim))
        )

def fit(model, dataset, batch_size=2, epochs=100, loss_print_per_epoch=10):
    def _w(worker_id):
        np.random.seed(np.random.get_state()[1][0] + worker_id)

    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
    scheduler = LambdaLR(optimizer, lr_lambda=lambda _e: 0.97 ** _e)
    train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    def _train_model(_m, _d):
        _m.train()
        train_batch_loss = []
        for x, y in _d:
            optimizer.zero_grad()
            output = _m(x)
            print(output.shape)
            loss = criterion(output, y)
            loss.backward(retain_graph=True)
            optimizer.step()
            train_batch_loss.append(loss.item())
        return _m, np.mean(train_batch_loss)

    for epoch in range(epochs+1):
        model, train_loss = _train_model(model, train_loader)
        if epoch % loss_print_per_epoch == 0:
            print(f'epoch: {epoch}/{epochs} loss: {train_loss} lr: {scheduler.get_last_lr()[0]}')
        scheduler.step()
    return model

model = fit(model=Recurrent(), dataset=dataset, batch_size=2, epochs=100)
When I iterate through the DataLoader with my for loop, this is where the problem comes up. output.shape should be [150, 2, 2] for batch size 2 and [150, 1, 2] for batch size 1. But the print in the for loop gives me a shape of [2, 2, 2], and I have no idea why. If anyone could help me understand what is going on here, it would be a great help.
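For context (an observation about the shapes rather than a full fix): __getitem__ returns one sample of shape [2, 2], and a DataLoader with batch_size=2 stacks two of them along a new first dimension, so each iteration yields [2, 2, 2]; the full [150, 2, 2] tensor would only appear if the whole dataset were a single batch. Note also that nn.LSTM built without batch_first=True interprets dimension 0 as the sequence length, not the batch. A small self-contained sketch of both points:

import torch
from torch.utils.data import DataLoader, TensorDataset

# each sample is [2, 2]; a loader with batch_size=2 stacks two samples -> [2, 2, 2]
X = torch.randn(150, 2, 2)
Y = torch.randn(150, 2, 2)
loader = DataLoader(TensorDataset(X, Y), batch_size=2, shuffle=True)
x, y = next(iter(loader))
print(x.shape)  # torch.Size([2, 2, 2])

# nn.LSTM created without batch_first=True expects input as (seq_len, batch, features);
# for a [batch, seq, features] tensor, pass batch_first=True or permute the dimensions.
rnn = torch.nn.LSTM(input_size=2, hidden_size=20, batch_first=True)
out, _ = rnn(x)
print(out.shape)  # torch.Size([2, 2, 20])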

Simple ANN for Prediction Using 6-6-3 Network Architecture

I am a beginner looking to code an ANN in PyTorch for the task of prediction for a dynamic engineering system, a free-piston Sterling engine. The dataset consists of 6 inputs and 3 outputs, as shown below:
Dataset
I have basic code which I believe should be able to accommodate this task; however, I think there may be an issue with the labelling of the dataset and the datatype used. I have tried converting to a LongTensor datatype, but it has not helped.
I receive the following error when changing the output datatype to float32:
"expected scalar type Long but found Float."
and when I set it to int64, I receive:
"Target 85 is out of bounds."
Please take a look; any advice would be much appreciated. I have included the code below:
import os
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F

SEED = 4096
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)
np.random.seed(SEED)

file_path = "./Dynamics of Sterling Engine Data(1).csv"
df = pd.read_csv(
    file_path,
    header=None,
    names=[
        "Kdp(N/m)",
        "Kpp(N/m)",
        "Cdp(Ns/m)",
        "Cl(Ns/m)",
        "mdp(kg)",
        "mpp(kg)",
        "f(Hz)",
        "γ(DP/PP)",
        "α(°)",
    ],
)

n = len(df.index)  # 55
shuffle_indices = np.random.permutation(n)
df = df.iloc[shuffle_indices]

x = df.iloc[:, :6].values.astype(np.float32)
y = df.iloc[:, -3].values.astype(np.float32)

mu = x.mean(axis=0)
span = x.max(axis=0) - x.min(axis=0)

def rescale(inputs):
    return (inputs - mu) / span

x = rescale(x)

num_train = int(n * 0.82)
num_test = n - num_train
x_train = x[:num_train]
y_train = y[:num_train]
x_test = x[-num_test:]
y_test = y[-num_test:]

class NpDataset(Dataset):
    def __init__(self, data, label):
        assert len(data) == len(label)
        self.data = torch.from_numpy(data)
        self.label = torch.from_numpy(label)

    def __getitem__(self, index):
        return self.data[index], self.label[index]

    def __len__(self):
        return len(self.label)

train_dataset = NpDataset(x_train, y_train)
test_dataset = NpDataset(x_test, y_test)

train_dataloader = DataLoader(train_dataset, batch_size=128, shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size=128, shuffle=False)

device = torch.device("cpu")
print(device)

class SterlingNN(nn.Module):
    def __init__(self):
        super(SterlingNN, self).__init__()
        # 6 input features per data point
        self.fn1 = nn.Linear(6, 6)  # 6 features, 6 nodes in hidden layer
        self.fn2 = nn.Linear(6, 3)  # 6 nodes in hidden layer, 3 outputs

    def forward(self, x):
        x = torch.sigmoid(self.fn1(x))  # sigmoid activation function
        x = self.fn2(x)
        return x

model = SterlingNN()
print(model.to(device))

loss_fn = nn.CrossEntropyLoss()
optimiser = torch.optim.Adam(
    model.parameters(), lr=0.01, weight_decay=0.01
)

x, y = next(iter(train_dataloader))
x = x[:5].to(device)
score = model(x)
print(score)

def train():
    model.train()  # model into training mode and iterate through data loader
    for x, y in train_dataloader:
        x = x.to(device)
        y = y.to(device)
        n = x.size(0)

        optimiser.zero_grad()
        score = model(x)
        loss = loss_fn(score, y)
        loss.backward()
        optimiser.step()

        predictions = score.max(1, keepdim=True)[1]
        num_correct = predictions.eq(y.view_as(predictions)).sum().item()
        acc = num_correct / n
    return loss, acc

def evaluate():
    model.eval()
    with torch.no_grad():
        for x, y in test_dataloader:
            x = x.to(device)
            y = y.to(device)
            n = x.size(0)
            score = model(x)
            loss = loss_fn(score, y)
            predictions = score.max(1, keepdim=True)[1]
            num_correct = predictions.eq(y.view_as(predictions)).sum().item()
            acc = num_correct / n
    return loss, acc

max_epochs = 128
for epoch in range(max_epochs):
    tr_loss, tr_acc = train()
    eva_loss, eva_acc = evaluate()
    print(
        f"[{epoch}/{max_epochs}] Train loss:{tr_loss:.4f} acc:{tr_acc*100:.2f}% - "
        f"Test loss:{eva_loss:.4f} acc:{eva_acc*100:.2f}%"
    )
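A possible reading of the two errors quoted above (hedged, since only part of the dataset is shown): nn.CrossEntropyLoss expects integer class indices in the range [0, num_classes - 1] as targets, so float32 targets trigger "expected scalar type Long but found Float", and an int64 target value of 85 is "out of bounds" for a 3-unit output. If the three outputs (f, γ, α) are continuous quantities, a regression loss such as nn.MSELoss may be the more natural fit; note also that df.iloc[:, -3] selects only the third-to-last column, whereas df.iloc[:, -3:] selects the last three. A minimal sketch under those assumptions, reusing df, model and optimiser from the code above:

# sketch assuming the three outputs f(Hz), γ(DP/PP), α(°) are continuous targets
y = df.iloc[:, -3:].values.astype(np.float32)   # last three columns, shape (n, 3)

loss_fn = nn.MSELoss()                          # regression loss instead of CrossEntropyLoss

def train_step(x_batch, y_batch):
    optimiser.zero_grad()
    pred = model(x_batch)            # shape (batch, 3)
    loss = loss_fn(pred, y_batch)    # both float32, same shape
    loss.backward()
    optimiser.step()
    return loss.item()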

LSTM implementation / overfitting

I am having a problem with an implementation of an LSTM. I am not sure whether I have the right implementation or this is just an overfitting problem. I am doing essay grading using an LSTM, scoring text with a score from 0 to 10 (or another score range). I am using the ASAP Kaggle competition data as one of the training datasets.
However, the main goal is to achieve good performance on a private dataset with around 500 samples. The 500 samples include the validation and training sets. I previously did some experiments and got the model to work, but after fiddling with something the model doesn't fit anymore. I have also re-implemented the code in a cleaner, much more object-oriented manner and still can't reproduce my previous results.
However, I am getting the model to fit my data, just with tremendous overfitting. I am not sure if this is an implementation problem of some sort or just overfitting, but I cannot get the model to work. The maximum I can get to is 0.35 kappa using an LSTM on ASAP essay set 1. For some bizarre reason, I can get a single-layer fully connected model to reach 0.75 kappa. I think this is an implementation problem, but I am not sure.
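For reference, since kappa values are the yardstick here (a side note, not part of the original question): the quadratic weighted kappa used by the ASAP competition can be computed with scikit-learn's cohen_kappa_score by passing weights="quadratic"; the training code below calls the same function without weighting. A toy example:

from sklearn.metrics import cohen_kappa_score

# toy example: true scores vs predicted scores for 6 essays
y_true = [2, 4, 4, 6, 8, 10]
y_pred = [2, 3, 5, 6, 7, 10]

# quadratic weighted kappa, the metric used by the ASAP competition
qwk = cohen_kappa_score(y_true, y_pred, weights="quadratic")
print(qwk)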
Here is my old code:
train.py
import gensim
import numpy as np
import pandas as pd
import torch
from sklearn.metrics import cohen_kappa_score
from torch import nn
import torch.utils.data as data_utils
from torch.optim import Adam

from dataset import AESDataset
from network import Network
from optimizer import Ranger
from qwk import quadratic_weighted_kappa, kappa

batch_size = 32
device = "cuda:0"
torch.manual_seed(1000)

# Load data from csv
file_name = "data/data_new.csv"
data = pd.read_csv(file_name)
arr = data.to_numpy()
text = arr[:, :2]
text = [str(line[0]) + str(line[1]) for line in text]
text = [gensim.utils.simple_preprocess(line) for line in text]
score = arr[:, 2]
score = [sco*6 for sco in score]
score = np.asarray(score, dtype=int)

train_dataset = AESDataset(text_arr=text[:400], scores=score[:400])
test_dataset = AESDataset(text_arr=text[400:], scores=score[400:])
score = torch.tensor(score).view(-1, 1).long().to(device)

train_loader = data_utils.DataLoader(train_dataset, shuffle=True, batch_size=batch_size, drop_last=True)
test_loader = data_utils.DataLoader(test_dataset, shuffle=True, batch_size=batch_size, drop_last=True)

out_class = 61
epochs = 1000

model = Network(out_class).to(device)
model.load_state_dict(torch.load("model/best_model"))
y_onehot = torch.FloatTensor(batch_size, out_class).to(device)
optimizer = Adam(model.parameters())
criti = torch.nn.CrossEntropyLoss()
# model, optimizer = amp.initialize(model, optimizer, opt_level="O2")

step = 0

for i in range(epochs):
    # Testing
    if i % 1 == 0:
        total_loss = 0
        total_kappa = 0
        total_batches = 0
        model.eval()
        for (text, score) in test_loader:
            out = model(text)
            out_score = torch.argmax(out, 1)
            y_onehot.zero_()
            y_onehot.scatter_(1, score, 1)
            kappa_l = cohen_kappa_score(score.view(batch_size).tolist(), out_score.view(batch_size).tolist())
            score = score.view(-1)
            loss = criti(out, score.view(-1))
            total_loss += loss
            total_kappa += kappa_l
            total_batches += 1
        print(f"Epoch {i} Testing kappa {total_kappa/total_batches} loss {total_loss/total_batches}")
        with open(f"model/epoch_{i}", "wb") as f:
            torch.save(model.state_dict(), f)
        model.train()
    # Training
    for (text, score) in train_loader:
        optimizer.zero_grad()
        step += 1
        out = model(text)
        out_score = torch.argmax(out, 1)
        y_onehot.zero_()
        y_onehot.scatter_(1, score, 1)
        kappa_l = cohen_kappa_score(score.view(batch_size).tolist(), out_score.view(batch_size).tolist())
        loss = criti(out, score.view(-1))
        print(f"Epoch {i} step {step} kappa {kappa_l} loss {loss}")
        loss.backward()
        optimizer.step()
dataset.py
import gensim
import torch
import numpy as np

class AESDataset(torch.utils.data.Dataset):
    def __init__(self, text_arr, scores):
        self.data = text_arr
        self.scores = scores
        self.w2v_model = ("w2vec_model_all")
        self.max_len = 500

    def __getitem__(self, item):
        vector = []
        essay = self.data[item]
        pad_vec = [1 for i in range(300)]
        for i in range(self.max_len - len(essay)):
            vector.append(pad_vec)
        for word in essay:
            word_vec = pad_vec
            try:
                word_vec = self.w2v_model[word]
            except:
                # print(f"Skipping word as word {word} not in dictionary")
                word_vec = pad_vec
            vector.append(word_vec)
        # print(len(vector))
        vector = np.stack(vector)
        tensor = torch.tensor(vector[:self.max_len]).float().to("cuda")
        score = self.scores[item]
        score = torch.tensor(score).long().to("cuda").view(1)
        return tensor, score

    def __len__(self):
        return len(self.scores)
network.py
import torch.nn as nn
import torch
import torch.nn.functional as F

class Network(nn.Module):
    def __init__(self, output_size):
        super(Network, self).__init__()
        self.lstm = nn.LSTM(300, 500, 1, batch_first=True)
        self.dropout = nn.Dropout(p=0.5)
        # self.l2 = nn.L2
        self.linear = nn.Linear(500, output_size)

    def forward(self, x):
        x, _ = self.lstm(x)
        x = x[:, -1, :]
        x = self.dropout(x)
        x = self.linear(x)
        return x
My new code: https://github.com/Clement-Hui/EssayGrading
I think the problem is in the training code: since you are using an LSTM, you are supposed to re-initialize the hidden and cell state every epoch and detach them from the computation graph after each batch.
network.py
import torch.nn as nn
import torch
import torch.nn.functional as F

class Network(nn.Module):
    def __init__(self, output_size):
        super(Network, self).__init__()
        self.lstm = nn.LSTM(300, 500, 1, batch_first=True)
        self.dropout = nn.Dropout(p=0.5)
        # self.l2 = nn.L2
        self.linear = nn.Linear(500, output_size)

    def forward(self, x, hidden):
        x, hidden = self.lstm(x, hidden)
        x = x.contiguous().view(-1, 500)
        x = self.dropout(x)
        x = self.linear(x)
        return x, hidden

    def init_hidden(self, batch_size):
        weights = next(self.parameters()).data
        hidden = (weights.new(1, batch_size, 500).zero_().cuda(),
                  weights.new(1, batch_size, 500).zero_().cuda())
        return hidden
train.py
# your code for initializing the model and data and all other stuff
for i in range(epochs):
    # Testing
    if i % 1 == 0:
        total_loss = 0
        total_kappa = 0
        total_batches = 0
        model.eval()
        val_h = model.init_hidden(batch_size)  # initialize the hidden state
        for (text, score) in test_loader:
            # Creating new variables for the hidden state, otherwise
            # we'd backprop through the entire training history
            val_h = tuple([each.data for each in val_h])
            out, val_h = model(text, val_h)
            out_score = torch.argmax(out, 1)
            y_onehot.zero_()
            y_onehot.scatter_(1, score, 1)
            kappa_l = cohen_kappa_score(score.view(batch_size).tolist(), out_score.view(batch_size).tolist())
            score = score.view(-1)
            loss = criti(out, score.view(-1))
            total_loss += loss
            total_kappa += kappa_l
            total_batches += 1
        print(f"Epoch {i} Testing kappa {total_kappa/total_batches} loss {total_loss/total_batches}")
        with open(f"model/epoch_{i}", "wb") as f:
            torch.save(model.state_dict(), f)
        model.train()
    # Training
    h = model.init_hidden(batch_size)  # initialize the hidden state
    for (text, score) in train_loader:
        optimizer.zero_grad()
        step += 1
        # Creating new variables for the hidden state, otherwise
        # we'd backprop through the entire training history
        h = tuple([each.data for each in h])
        out, h = model(text, h)
        out_score = torch.argmax(out, 1)
        y_onehot.zero_()
        y_onehot.scatter_(1, score, 1)
        kappa_l = cohen_kappa_score(score.view(batch_size).tolist(), out_score.view(batch_size).tolist())
        loss = criti(out, score.view(-1))
        print(f"Epoch {i} step {step} kappa {kappa_l} loss {loss}")
        loss.backward()
        optimizer.step()
Do let me know whether the changes mentioned work or not.
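A small side note on the hidden-state handling above (a suggestion rather than part of the original answer): rebuilding the tuple from .data works, but detaching is the more explicit idiom in current PyTorch and achieves the same cut of the graph:

# detach hidden and cell state from the graph between batches
h = tuple(each.detach() for each in h)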

Pytorch: Custom Loss only works for batch_size == 1

I am currently trying to port my existing (working) Keras BNN code to PyTorch.
To this end, I have to write a custom NegativeLogLikelihood loss function. My unit test for this loss passes (e.g. for fixed network weights I get the same results and gradients as in my old (working) Keras code), but in a simple dummy example (fitting a sinc function) my loss only gives okay results for batch_size == 1, and my network fails to fit sinc properly (at any number of training iterations) for larger values. Using nn.MSELoss instead works perfectly fine, so I assume there is an issue with my loss computation.
import matplotlib.pyplot as plt
from itertools import islice

try:
    from tqdm import tqdm
except ImportError:
    tqdm = lambda x, total: x

import numpy as np
import torch
from torch.utils import data as data_utils
import torch.nn as nn

class NLLLoss(torch.nn.modules.loss._Loss):
    def __init__(self, parameters, num_datapoints, size_average=False, reduce=True):
        super().__init__(size_average, reduce)
        self.parameters = tuple(parameters)
        self.num_datapoints = num_datapoints

    def log_variance_prior(self, log_variance, mean=1e-6, variance=0.01):
        return torch.mean(
            torch.sum(
                ((-(log_variance - torch.log(torch.tensor(mean))) ** 2) /
                 ((2. * variance))) - 0.5 * torch.log(torch.tensor(variance)),
                dim=1
            )
        )

    def weight_prior(self, parameters, wdecay=1.):
        num_parameters = torch.sum(torch.tensor([
            torch.prod(torch.tensor(parameter.size()))
            for parameter in parameters
        ]))

        log_likelihood = torch.sum(torch.tensor([
            torch.sum(-wdecay * 0.5 * (parameter ** 2))
            for parameter in parameters
        ]))

        return log_likelihood / (num_parameters.float() + 1e-16)

    def forward(self, input, target):
        torch.nn.modules.loss._assert_no_grad(target)

        batch_size, *_ = input.shape

        prediction_mean = input[:, 0].view(-1, 1)
        log_prediction_variance = input[:, 1].view(-1, 1)
        prediction_variance_inverse = 1. / (torch.exp(log_prediction_variance) + 1e-16)

        mean_squared_error = torch.pow(target - prediction_mean, 2)

        log_likelihood = (
            torch.sum(
                torch.sum(
                    -mean_squared_error * 0.5 * prediction_variance_inverse -
                    0.5 * log_prediction_variance,
                    dim=1
                )
            )
        )

        log_likelihood /= batch_size

        log_likelihood += (
            self.log_variance_prior(log_prediction_variance) / self.num_datapoints
        )

        log_likelihood += self.weight_prior(self.parameters) / self.num_datapoints

        return -log_likelihood

# Helper Functions {{{ #

def infinite_dataloader(dataloader):
    while True:
        yield from dataloader

def tanh_network(input_dimensionality: int):
    class AppendLayer(nn.Module):
        def __init__(self, bias=True, *args, **kwargs):
            super().__init__(*args, **kwargs)
            if bias:
                self.bias = nn.Parameter(torch.Tensor(1, 1))
            else:
                self.register_parameter('bias', None)

        def forward(self, x):
            return torch.cat((x, self.bias * torch.ones_like(x)), dim=1)

    def init_weights(module):
        if type(module) == AppendLayer:
            nn.init.constant_(module.bias, val=np.log(1e-3))
        elif type(module) == nn.Linear:
            nn.init.kaiming_normal_(module.weight, mode="fan_out", nonlinearity="linear")
            nn.init.constant_(module.bias, val=0.0)

    return nn.Sequential(
        nn.Linear(input_dimensionality, 50), nn.Tanh(),
        nn.Linear(50, 50), nn.Tanh(),
        nn.Linear(50, 50), nn.Tanh(),
        nn.Linear(50, 1),
        AppendLayer()
    ).apply(init_weights)

# }}} Helper Functions #

input_dimensionality, num_datapoints = 1, 100
num_train_steps = 13000

# Set up data
x_train = np.array([
    np.random.uniform(np.zeros(1), np.ones(1), input_dimensionality)
    for _ in range(num_datapoints)
])
y_train = np.sinc(x_train * 10 - 5).sum(axis=1)

# Data Normalization
x_train_, x_mean, x_std = (
    np.true_divide(x_train - np.mean(x_train), np.std(x_train)), np.mean(x_train), np.std(x_train)
)
y_train_, y_mean, y_std = (
    np.true_divide(y_train - np.mean(y_train), np.std(y_train)), np.mean(y_train), np.std(y_train)
)

model = tanh_network(input_dimensionality=input_dimensionality)

# TODO Why does setting batch_size to 1 work with NLL, but setting it to higher values fails?
batch_size = 20  # setting this to 1 gives okay results.
loss_function = NLLLoss(model.parameters(), num_datapoints=num_datapoints)

# NOTE: Using MSE like this also works:
# loss_function = lambda input, target: nn.MSELoss()(input=input[:, 0], target=target)

train_loader = infinite_dataloader(
    data_utils.DataLoader(
        data_utils.TensorDataset(
            torch.from_numpy(x_train_).float(),
            torch.from_numpy(y_train_).float()
        ), batch_size=batch_size
    )
)

optimizer = torch.optim.Adam(model.parameters())

# Train loop
for epoch, (x_batch, y_batch) in tqdm(enumerate(islice(train_loader, num_train_steps)), total=num_train_steps):
    optimizer.zero_grad()
    y_pred = model(x_batch)

    loss = loss_function(input=y_pred, target=y_batch)
    loss.backward()
    optimizer.step()

    if epoch % 100 == 0:
        mse_value = nn.MSELoss()(input=y_pred[:, 0], target=y_batch)
        print("Epoch: {}, Loss: {}, MSE: {}".format(epoch, loss, mse_value))

x_test = np.linspace(0, 1, 100)[:, None]
y_test = np.sinc(x_test * 10 - 5).sum(axis=1)

# Data Normalization
x_test_ = np.true_divide(x_test - x_mean, x_std)

x_test_torch = torch.from_numpy(x_test_).float()
y_test_torch = torch.from_numpy(y_test).float()

# Unnormalize predictions
y_pred = model(x_test_torch).detach().numpy() * y_std + y_mean

plt.plot(x_test[:, 0], y_test, label="true", color="black")
plt.plot(x_train[:, 0], y_train, "ro")
plt.plot(x_test[:, 0], y_pred[:, 0], label="Adam", color="blue")
plt.legend()
plt.show()
Any help or suggestions on what I could be doing wrong would be very much appreciated!
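One shape detail worth double-checking in the loss above (an observation, not a confirmed fix): prediction_mean has shape [batch_size, 1], while the targets coming out of the DataLoader have shape [batch_size], so target - prediction_mean broadcasts to [batch_size, batch_size] whenever the batch size is greater than 1. A minimal sketch that reproduces the broadcasting effect:

import torch

batch_size = 3
target = torch.arange(batch_size, dtype=torch.float32)                        # shape [3]
prediction_mean = torch.arange(batch_size, dtype=torch.float32).view(-1, 1)   # shape [3, 1]

mse = torch.pow(target - prediction_mean, 2)
print(mse.shape)  # torch.Size([3, 3]) -- every target paired with every prediction

# reshaping the target to [batch_size, 1] keeps the difference element-wise
mse_elementwise = torch.pow(target.view(-1, 1) - prediction_mean, 2)
print(mse_elementwise.shape)  # torch.Size([3, 1])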
