Purely for learning, I'd like to get the following code to work without a DataLoader. I use Huggingface transformers regularly, yet I struggle with PyTorch dimensions all the time, so I have started with some simple projects from the book "Deep Learning with PyTorch." One of the problems from the book suggests using a wine quality dataset with a super simple linear model. I have toiled with the dimensions of the data, which I think is the source of my error:
RuntimeError: mat1 and mat2 shapes cannot be multiplied (3919x1 and 11x100)
Data is available here
import csv
from collections import OrderedDict

import numpy as np
import torch
import torch.optim as optim
import torch.nn as nn

wine_path = "winequality-white.csv"
wine_quality_numpy = np.loadtxt(wine_path, dtype=np.float32, delimiter=";",
                                skiprows=1)
col_list = next(csv.reader(open(wine_path), delimiter=';'))

wineq = torch.from_numpy(wine_quality_numpy)
# print(wineq.shape, wineq.dtype)

data = wineq[:, :-1]
target = wineq[:, -1]
target = target.unsqueeze(1)

n_samples = wine_quality_numpy.shape[0]
n_val = int(0.2 * n_samples)

shuffled_indices = torch.randperm(n_samples)
train_indices = shuffled_indices[:-n_val]
val_indices = shuffled_indices[-n_val:]

target_train = target[train_indices]
data_train = data[train_indices]
target_val = target[val_indices]
data_val = data[val_indices]

seq_model = nn.Sequential(OrderedDict([
    ('hidden_linear', nn.Linear(11, 100)),
    ('hidden_activation', nn.Tanh()),
    ('output_linear', nn.Linear(100, 7))
]))

def training_loop(n_epochs, optimizer, model, loss_fn, target_train, target_val,
                  data_train, data_val):
    for epoch in range(1, n_epochs + 1):
        t_p_train = model(target_train)  # <1>
        loss_train = loss_fn(t_p_train, data_train)

        t_p_val = model(t_u_val)  # <1>
        loss_val = loss_fn(t_p_val, data_val)

        optimizer.zero_grad()
        loss_train.backward()  # <2>
        optimizer.step()

        if epoch == 1 or epoch % 1000 == 0:
            print(f"Epoch {epoch}, Training loss {loss_train.item():.4f},"
                  f" Validation loss {loss_val.item():.4f}")

optimizer = optim.SGD(seq_model.parameters(), lr=1e-3)  # <1>

training_loop(
    n_epochs = 5000,
    optimizer = optimizer,
    model = seq_model,
    loss_fn = nn.MSELoss(),
    target_train = target_train,
    target_val = target_val,
    data_train = data_train,
    data_val = data_val)
Thank you!
In my haste I had the training data and labels swapped. Here is the fixed section.
seq_model = nn.Sequential(OrderedDict([
    ('hidden_linear', nn.Linear(11, 100)),
    ('hidden_activation', nn.Tanh()),
    ('output_linear', nn.Linear(100, 7))
]))

def training_loop(n_epochs, optimizer, model, loss_fn, target_train, target_val,
                  data_train, data_val):
    for epoch in range(1, n_epochs + 1):
        t_p_train = model(data_train)  # <1>
        loss_train = loss_fn(t_p_train, target_train)

        t_p_val = model(data_val)  # <1>
        loss_val = loss_fn(t_p_val, target_val)

        optimizer.zero_grad()
        loss_train.backward()  # <2>
        optimizer.step()

        if epoch == 1 or epoch % 1000 == 0:
            print(f"Epoch {epoch}, Training loss {loss_train.item():.4f},"
                  f" Validation loss {loss_val.item():.4f}")
Related
I'm a real newbie with PyTorch and neural networks. I started working on these subjects this week, and my mentor gave me some code along with a few tasks to work on.
But the code he gave me is not working. I have tried to fix it all day but got no result. Because I don't know the background of NNs and PyTorch, it is hard for me to understand the problem.
I need your help with that.
Thank you!
import torch
import numpy as np
import torchvision.datasets as datasets
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
from torchsummary import summary

# DEFINE YOUR DEVICE
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)  # if cpu, go Runtime -> Change runtime type -> Hardware accelerator GPU -> Save -> Redo previous steps

# DOWNLOAD DATASET
train_data = datasets.Omniglot('./data', background=True, download=True, transform=transforms.ToTensor())
test_data = datasets.Omniglot('./data', background=False, download=True, transform=transforms.ToTensor())

# DEFINE DATA GENERATOR
batch_size = 50
train_generator = torch.utils.data.DataLoader(train_data, batch_size=batch_size, shuffle=True)
test_generator = torch.utils.data.DataLoader(test_data, batch_size=batch_size, shuffle=False)

# DEFINE NEURAL NETWORK MODEL
class CNN(torch.nn.Module):
    def __init__(self):
        super(CNN, self).__init__()
        self.conv1 = torch.nn.Conv2d(1, 8, kernel_size=4, stride=1)
        self.conv2 = torch.nn.Conv2d(8, 16, kernel_size=4, stride=1)
        self.mpool = torch.nn.MaxPool2d(2)
        self.fc1 = torch.nn.Linear(18432, 256)
        self.fc2 = torch.nn.Linear(256, 64)
        self.fc3 = torch.nn.Linear(64, 50)
        self.relu = torch.nn.ReLU()
        self.sigmoid = torch.nn.Sigmoid()

    def forward(self, x):
        hidden = self.mpool(self.relu(self.conv1(x)))
        hidden = self.mpool(self.relu(self.conv2(hidden)))
        hidden = hidden.view(-1, 18432)
        hidden = self.relu(self.fc1(hidden))
        hidden = self.relu(self.fc2(hidden))
        output = self.fc3(hidden)
        return output

# CREATE MODEL
model = CNN()
model.to(device)
summary(model, (1, 105, 105))

# DEFINE LOSS FUNCTION AND OPTIMIZER
learning_rate = 0.001
loss_fun = torch.nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# TRAIN THE MODEL
model.train()
epoch = 10
num_of_batch = np.int(len(train_generator.dataset) / batch_size)
loss_values = np.zeros(epoch * num_of_batch)

for i in range(epoch):
    for batch_idx, (x_train, y_train) in enumerate(train_generator):
        x_train, y_train = x_train.to(device), y_train.to(device)
        optimizer.zero_grad()
        y_pred = model(x_train)
        loss = loss_fun(y_pred, y_train)
        loss_values[num_of_batch * i + batch_idx] = loss.item()
        loss.backward()
        optimizer.step()
        if (batch_idx + 1) % batch_size == 0:
            print('Epoch: {}/{} [Batch: {}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                i + 1, epoch, (batch_idx + 1) * len(x_train), len(train_generator.dataset),
                100. * (batch_idx + 1) / len(train_generator), loss.item()))

# PLOT THE LEARNING CURVE
iterations = np.linspace(0, epoch, num_of_batch * epoch)
plt.plot(iterations, loss_values)
plt.title('Learning Curve')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.grid('on')

# TEST THE MODEL
model.eval()
correct = 0
total = 0
for x_val, y_val in test_generator:
    x_val = x_val.to(device)
    y_val = y_val.to(device)
    output = model(x_val)
    y_pred = output.argmax(dim=1)
    for i in range(y_pred.shape[0]):
        if y_val[i] == y_pred[i]:
            correct += 1
        total += 1
print('Validation accuracy: %.2f%%' % ((100 * correct) // (total)))
Here is the error that I receive.
/usr/local/lib/python3.7/dist-packages/torch/nn/modules/loss.py:528: UserWarning: Using a target size (torch.Size([50])) that is different to the input size (torch.Size([25, 50])). This will likely lead to incorrect results due to broadcasting. Please ensure they have the same size.
return F.mse_loss(input, target, reduction=self.reduction)
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<ipython-input-11-bffd863688df> in <module>()
13 loss = loss_fun(y_pred, y_train)
14 loss_values[num_of_batch*i+batch_idx] = loss.item()
---> 15 loss.backward()
16 optimizer.step()
17 if (batch_idx+1) % batch_size == 0:
1 frames
/usr/local/lib/python3.7/dist-packages/torch/autograd/__init__.py in backward(tensors, grad_tensors, retain_graph, create_graph, grad_variables, inputs)
147 Variable._execution_engine.run_backward(
148 tensors, grad_tensors_, retain_graph, create_graph, inputs,
--> 149 allow_unreachable=True, accumulate_grad=True) # allow_unreachable flag
150
151
RuntimeError: Found dtype Long but expected Float
Your dataset is returning integers for your labels; you should cast them to floating point. One way of solving it is to do:
loss = loss_fun(y_pred, y_train.float())
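As a side note, here is a minimal standalone sketch of the dtype issue with toy tensors (hedged: depending on the PyTorch version, the error surfaces either at the loss call or at backward()):

import torch
import torch.nn as nn

loss_fun = nn.MSELoss()
y_pred = torch.randn(4, requires_grad=True)  # float model outputs
y_train = torch.tensor([1, 0, 3, 2])         # integer (Long) labels, as a Dataset typically yields

# loss_fun(y_pred, y_train).backward()       # RuntimeError: Found dtype Long but expected Float
loss = loss_fun(y_pred, y_train.float())     # casting the target to float resolves it
loss.backward()
print(loss.item())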
Hello, I am building a Softmax Regression model, but I have run into an error.
First I needed to extract the gzipped MNIST files and then convert them so they can be used, but I got a problem with the array shape of the training label data.
I am not sure whether the problem is that the extraction does not work well or that there is a problem with the indices.
The error says:
batch_labels = TrainlableData[offset:(offset + batch_size), :]
IndexError: too many indices for array
How can this be solved? (The full code is below; a short sketch of the indexing issue follows it.)
import numpy as np
#import tensorflow as tf
import matplotlib.pyplot as plt
import struct as st
from array import array
# using struct to handle the binary files
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()
import idx2numpy
from skimage import img_as_float
from skimage import img_as_float32
import gzip

print(tf.__version__)

# number of features
num_features = 784
# number of target labels
num_labels = 10
# learning rate (alpha)
learning_rate = 0.05
# batch size
batch_size = 128
# number of epochs
num_steps = 5001

with gzip.open('train-images-idx3-ubyte.gz', 'rb') as f:
    magicTrainImage = st.unpack('>4B', f.read(4))
    trainImg = st.unpack('>I', f.read(4))[0]      # num of images
    trainImgnRow = st.unpack('>I', f.read(4))[0]  # num of rows
    trainImgnC = st.unpack('>I', f.read(4))[0]    # num of columns
    nBytesTotal = trainImg * trainImgnRow * trainImgnC * 1  # since each pixel is 1 byte
    trainImagesData = 255 - np.asarray(st.unpack('>' + 'B' * nBytesTotal, f.read(nBytesTotal))).reshape((trainImg, trainImgnRow * trainImgnC))
    print(trainImg)

with gzip.open('train-labels-idx1-ubyte.gz', 'rb') as S:
    magic, size = st.unpack(">II", S.read(8))
    TrainlableData = np.asarray(array("B", S.read()))
    print("label data")
    print(TrainlableData.shape)

with gzip.open('t10k-images-idx3-ubyte.gz', 'rb') as f:
    magicTrainImage = st.unpack('>4B', f.read(4))
    trainImg = st.unpack('>I', f.read(4))[0]      # num of images
    trainImgnRow = st.unpack('>I', f.read(4))[0]  # num of rows
    trainImgnC = st.unpack('>I', f.read(4))[0]    # num of columns
    nBytesTotal = trainImg * trainImgnRow * trainImgnC * 1  # since each pixel is 1 byte
    TestImageData = 255 - np.asarray(st.unpack('>' + 'B' * nBytesTotal, f.read(nBytesTotal))).reshape((trainImg, trainImgnRow * trainImgnC))
    print(TestImageData.shape)

with gzip.open('t10k-labels-idx1-ubyte.gz', 'rb') as S:
    magic, size = st.unpack(">II", S.read(8))
    TestlableData = np.asarray(array("B", S.read()))
    print(TestlableData.shape)

td = img_as_float32(TestImageData)
print(td.dtype)

# initialize a tensorflow graph
graph = tf.Graph()

with graph.as_default():
    """
    defining all the nodes
    """
    # Inputs
    tf_train_dataset = tf.placeholder(tf.float32, shape=(batch_size, num_features))
    tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
    #tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(td)

    # Variables.
    weights = tf.Variable(tf.truncated_normal([num_features, num_labels]))
    biases = tf.Variable(tf.zeros([num_labels]))

    # Training computation.
    logits = tf.matmul(tf_train_dataset, weights) + biases
    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(
        labels=tf_train_labels, logits=logits))

    # Optimizer.
    optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss)

    # Predictions for the training, validation, and test data.
    train_prediction = tf.nn.softmax(logits)
    #valid_prediction = tf.nn.softmax(tf.matmul(tf_valid_dataset, weights) + biases)
    test_prediction = tf.nn.softmax(tf.matmul(tf_test_dataset, weights) + biases)

# utility function to calculate accuracy
def accuracy(predictions, labels):
    correctly_predicted = np.sum(np.argmax(predictions, 1) == np.argmax(labels, 1))
    accu = (100.0 * correctly_predicted) / predictions.shape[0]
    return accu

with tf.Session(graph=graph) as session:
    # initialize weights and biases
    tf.global_variables_initializer().run()
    print("Initialized")

    for step in range(num_steps):
        # pick a randomized offset
        offset = np.random.randint(0, TrainlableData.shape[0] - batch_size - 1)

        # Generate a minibatch.
        batch_data = trainImagesData[offset:(offset + batch_size), :]
        batch_labels = TrainlableData[offset:(offset + batch_size), :]
        print(batch_data.shape)
        print(batch_labels.shape)

        # Prepare the feed dict
        feed_dict = {tf_train_dataset: batch_data,
                     tf_train_labels: batch_labels}

        # run one step of computation
        _, l, predictions = session.run([optimizer, loss, train_prediction],
                                        feed_dict=feed_dict)

        if (step % 500 == 0):
            print("Minibatch loss at step {0}: {1}".format(step, l))
            print("Minibatch accuracy: {:.1f}%".format(
                accuracy(predictions, batch_labels)))

    print("\nTest accuracy: {:.1f}%".format(
        accuracy(test_prediction.eval(), TestlableData)))
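On the IndexError above: TrainlableData is built from a flat byte array, so it is one-dimensional, and two-axis indexing such as [offset:(offset + batch_size), :] uses more indices than the array has axes. A minimal illustration with toy data (hedged; the one-hot conversion shown is just one common way to get labels into the (batch_size, num_labels) shape that the tf_train_labels placeholder expects):

import numpy as np

labels = np.array([5, 0, 4, 1, 9], dtype=np.uint8)  # toy stand-in for the 1-D TrainlableData

# labels[0:3, :]                     # IndexError: too many indices for array (no second axis)
batch_1d = labels[0:3]               # a 1-D array takes a single slice

one_hot = np.eye(10, dtype=np.float32)[batch_1d]  # shape (3, 10), matching (batch_size, num_labels)
print(batch_1d.shape, one_hot.shape)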
When I start to train my model, the loss values decrease but the accuracy values never change, and I don't know why.
# -*- coding: utf-8 -*-
# Libraries
import torch
import torch.nn.functional as F
from torch import autograd, nn
from torch.autograd import Variable
import numpy as np
import matplotlib.pyplot as plt
from torchvision import transforms, datasets
from torch.utils import data

"""
Olivetti face dataset
"""
from sklearn.datasets import fetch_olivetti_faces

# Olivetti dataset download
olivetti = fetch_olivetti_faces()
train = olivetti.images
label = olivetti.target

X = train
Y = label
print("Format for X:", X.shape)
print("Format for Y: ", Y.shape)
print("\nDownload Ok")

"""
Set for train
"""
train_rate = 0.8

X_train = np.zeros([int(train_rate * X.shape[0]), 64, 64], dtype=float)
Y_train = np.zeros([int(train_rate * X.shape[0])], dtype=int)
X_val = np.zeros([int((1 - train_rate) * X.shape[0] + 1), 64, 64], dtype=float)
Y_val = np.zeros([int((1 - train_rate) * X.shape[0] + 1)], dtype=int)

# Split data for train and validation
for i in range(X.shape[0]):
    ie = 0
    iv = 0
    if (i % 10) / 10 >= train_rate:
        X_train[ie] = X[i]
        Y_train[ie] = Y[i]
        ie += 1
    else:
        X_val[iv] = X[i]
        Y_val[iv] = Y[i]
        iv += 1

X_train = X_train.reshape(320, -1, 64, 64)
X_val = X_val.reshape(80, -1, 64, 64)
print(Y_train.shape)

X_train = torch.Tensor(X_train)
Y_train = torch.Tensor(Y_train)
X_val = torch.Tensor(X_val)
Y_val = torch.Tensor(Y_val)

batch_size = 20
train_loader = torch.utils.data.DataLoader(X_train,
                                           batch_size=batch_size,
                                           )
val_loader = torch.utils.data.DataLoader(X_val,
                                         batch_size=batch_size,
                                         )
class CNNModule(nn.Module):
    def __init__(self):
        super(CNNModule, self).__init__()
        self.conv1 = nn.Conv2d(1, 6, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1 = nn.Linear(16 * 13 * 13, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 40)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = x.view(-1, 16 * 13 * 13)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

def make_train(model, dataset, n_iters, gpu):
    # Organize data
    X_train, Y_train, X_val, Y_val = dataset

    kriter = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

    # Arrays to save loss and accuracy
    tl = np.zeros(n_iters)  # For train loss
    ta = np.zeros(n_iters)  # For train accuracy
    vl = np.zeros(n_iters)  # For validation loss
    va = np.zeros(n_iters)  # For validation accuracy

    # Convert labels to long
    Y_train = Y_train.long()
    Y_val = Y_val.long()

    # GPU control
    if gpu:
        X_train, Y_train = X_train.cuda(), Y_train.cuda()
        X_val, Y_val = X_val.cuda(), Y_val.cuda()
        model = model.cuda()  # Parameters to GPU!
        print("Using GPU")
    else:
        print("Using CPU")

    # print(X_train.shape)
    # print(Y_train.shape)

    for i in range(n_iters):
        # train forward
        train_out = model.forward(X_train)
        train_loss = kriter(train_out, Y_train)

        # Backward and optimization
        train_loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        # Compute train accuracy
        train_predict = train_out.cpu().detach().argmax(dim=1)
        train_accuracy = (train_predict.cpu().numpy() == Y_train.cpu().numpy()).mean()

        # For validation
        val_out = model.forward(X_val)
        val_loss = kriter(val_out, Y_val)

        # Compute validation accuracy
        val_predict = val_out.cpu().detach().argmax(dim=1)
        val_accuracy = (val_predict.cpu().numpy() == Y_val.cpu().numpy()).mean()

        tl[i] = train_loss.cpu().detach().numpy()
        ta[i] = train_accuracy
        vl[i] = val_loss.cpu().detach().numpy()
        va[i] = val_accuracy

        # Show result each 5 loops
        if i % 5 == 0:
            print("Loop --> ", i)
            print("Train Loss :", train_loss.cpu().detach().numpy())
            print("Train Accuracy :", train_accuracy)
            print("Validation Loss :", val_loss.cpu().detach().numpy())
            print("Validation Accuracy :", val_accuracy)

    model = model.cpu()

    # Print result
    plt.subplot(2, 2, 1)
    plt.plot(np.arange(n_iters), tl, 'r-')
    plt.subplot(2, 2, 2)
    plt.plot(np.arange(n_iters), ta, 'b--')
    plt.subplot(2, 2, 3)
    plt.plot(np.arange(n_iters), vl, 'r-')
    plt.subplot(2, 2, 4)
    plt.plot(np.arange(n_iters), va, 'b--')

dataset = X_train, Y_train, X_val, Y_val
gpu = True
gpu = gpu and torch.cuda.is_available()

model = CNNModule()
make_train(model, dataset, 100, gpu)
OUTPUT:
Using CPU
Loop --> 0
Train Loss : 3.6302185
Train Accuracy : 0.0
Validation Loss : 3.6171098
Validation Accuracy : 0.0
Loop --> 5
Train Loss : 3.557933
Train Accuracy : 0.996875
Validation Loss : 3.545982
Validation Accuracy : 0.9875
.
.
.
Loop --> 95
Train Loss : 0.04211783
Train Accuracy : 0.996875
Validation Loss : 0.13397054
Validation Accuracy : 0.9875
From your code,
train_accuracy = (train_predict.cpu().numpy()==Y_train.cpu().numpy()).mean()
you are taking the mean of the correct values, which is why you get the same answer in every loop. Instead, you should divide the total number of correct predictions by the total number of examples to find the accuracy.
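A minimal sketch of the counting pattern this suggests, with toy tensors standing in for train_predict and Y_train:

import torch

def accuracy(predictions, labels):
    # Count correct predictions and divide by the total number of examples.
    correct = (predictions == labels).sum().item()
    total = labels.numel()
    return correct / total

preds = torch.tensor([3, 1, 2, 2])
labels = torch.tensor([3, 0, 2, 1])
print(accuracy(preds, labels))  # 0.5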
I'm currently learning how to use TensorFlow and I'm having some issues implementing this Softmax Regression application.
There's no error when compiling, but for some reason the validation and test predictions show no improvement; only the train prediction shows improvement.
I'm using Stochastic Gradient Descent (SGD) with minibatches in order to converge faster, but I don't know if this could be causing the trouble somehow.
I'll be thankful if you could share some ideas; here's the full code:
import input_data
import numpy as np
import random as ran
import tensorflow as tf
import matplotlib.pyplot as plt

mnist = input_data.read_data_sets('MNIST_Data/', one_hot=True)

# Features & Data
num_features = 784
num_labels = 10
learning_rate = 0.05
batch_size = 128
num_steps = 5001

train_dataset = mnist.train.images
train_labels = mnist.train.labels
test_dataset = mnist.test.images
test_labels = mnist.test.labels
valid_dataset = mnist.validation.images
valid_labels = mnist.validation.labels

graph = tf.Graph()
with graph.as_default():
    tf_train_data = tf.placeholder(tf.float32, shape=(batch_size, num_features))
    tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
    tf_valid_data = tf.constant(valid_dataset)
    tf_test_data = tf.constant(test_dataset)

    W = tf.Variable(tf.truncated_normal([num_features, num_labels]))
    b = tf.Variable(tf.zeros([num_labels]))

    score_vector = tf.matmul(tf_train_data, W) + b
    cost_func = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(
        labels=tf_train_labels, logits=score_vector))
    score_valid = tf.matmul(tf_test_data, W) + b
    score_test = tf.matmul(tf_valid_data, W) + b

    optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost_func)

    train_pred = tf.nn.softmax(score_vector)
    valid_pred = tf.nn.softmax(score_valid)
    test_pred = tf.nn.softmax(score_test)

def accuracy(predictions, labels):
    correct_pred = np.sum(np.argmax(predictions, 1) == np.argmax(labels, 1))
    accu = (100.0 * correct_pred) / predictions.shape[0]
    return accu

with tf.Session(graph=graph) as sess:
    sess.run(tf.global_variables_initializer())
    print("Initialized")

    for step in range(num_steps):
        offset = np.random.randint(0, train_labels.shape[0] - batch_size - 1)
        batch_data = train_dataset[offset:(offset+batch_size), :]
        batch_labels = train_labels[offset:(offset+batch_size), :]

        feed_dict = {tf_train_data: batch_data,
                     tf_train_labels: batch_labels
                     }

        _, l, predictions = sess.run([optimizer, cost_func, train_pred],
                                     feed_dict=feed_dict)

        if (step % 500 == 0):
            print("Minibatch loss at step {0}: {1}".format(step, l))
            print("Minibatch accuracy: {:.1f}%".format(
                accuracy(predictions, batch_labels)))
            print("Validation accuracy: {:.1f}%".format(
                accuracy(valid_pred.eval(), valid_labels)))

    print("\nTest accuracy: {:.1f}%".format(
        accuracy(test_pred.eval(), test_labels)))
It sounds like overfitting, which isn't surprising since this model is basically a linear regression model.
There are a few options you can try:
1. Add hidden layers + activation functions (https://arxiv.org/abs/1511.07289: the ELU paper works on the MNIST dataset with a vanilla DNN).
2. Use either a CNN or an RNN, although a CNN is more apt for image problems.
3. Use a better optimizer. If you are new, try the Adam optimizer (https://www.tensorflow.org/api_docs/python/tf/train/AdamOptimizer), and then move on to momentum with Nesterov (https://www.tensorflow.org/api_docs/python/tf/train/MomentumOptimizer).
Without feature engineering, it'll be hard to pull off image classification using just linear regression. Also, you do not need to run softmax on your outcomes, since softmax is designed to smooth argmax. Lastly, you should give your placeholders shape (None, num_features) so the batch size can vary; this lets you feed your validation and test datasets directly through feed_dict without having to create additional tensors.
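A minimal sketch of that last point, assuming the same TF1 graph-mode API used in the question (placeholder shapes with None in the batch dimension so one graph serves both minibatches and full splits):

import numpy as np
import tensorflow as tf  # TF1-style graph mode, as in the question

num_features, num_labels = 784, 10

graph = tf.Graph()
with graph.as_default():
    # None in the batch dimension lets the same placeholder accept a
    # 128-row minibatch or an entire validation/test split.
    tf_data = tf.placeholder(tf.float32, shape=(None, num_features))
    W = tf.Variable(tf.truncated_normal([num_features, num_labels]))
    b = tf.Variable(tf.zeros([num_labels]))
    pred = tf.nn.softmax(tf.matmul(tf_data, W) + b)

with tf.Session(graph=graph) as sess:
    sess.run(tf.global_variables_initializer())
    minibatch = np.random.rand(128, num_features).astype(np.float32)
    full_split = np.random.rand(5000, num_features).astype(np.float32)
    print(sess.run(pred, feed_dict={tf_data: minibatch}).shape)   # (128, 10)
    print(sess.run(pred, feed_dict={tf_data: full_split}).shape)  # (5000, 10)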
I am very new to PyTorch and am implementing my own network for an image classifier. However, I see that for each epoch the training accuracy is very good but the validation accuracy is 0 (I noted this up to the 5th epoch). I am using the Adam optimizer with a learning rate of .001, and I am also resampling the whole dataset into training and validation sets after each epoch. Please help me find where I am going wrong.
Here is my code:
### where is data?
data_dir_train = '/home/sup/PycharmProjects/deep_learning/CNN_Data/training_set'
data_dir_test = '/home/sup/PycharmProjects/deep_learning/CNN_Data/test_set'

# Define your batch_size
batch_size = 64

allData = datasets.ImageFolder(root=data_dir_train, transform=transformArr)

# We need to further split our training dataset into training and validation sets.
def split_train_validation():
    # Define the indices
    num_train = len(allData)
    indices = list(range(num_train))  # start with all the indices in training set
    split = int(np.floor(0.2 * num_train))  # define the split size
    #train_idx, valid_idx = indices[split:], indices[:split]

    # Random, non-contiguous split
    validation_idx = np.random.choice(indices, size=split, replace=False)
    train_idx = list(set(indices) - set(validation_idx))

    # define our samplers -- we use a SubsetRandomSampler because it will return
    # a random subset of the split defined by the given indices without replacement
    train_sampler = SubsetRandomSampler(train_idx)
    validation_sampler = SubsetRandomSampler(validation_idx)

    #train_loader = DataLoader(allData,batch_size=batch_size,sampler=train_sampler,shuffle=False,num_workers=4)
    #validation_loader = DataLoader(dataset=allData,batch_size=1, sampler=validation_sampler)

    return (train_sampler, validation_sampler)
Training
from torch.optim import Adam
import torch
import createNN
import torch.nn as nn
import loadData as ld
from torch.autograd import Variable
from torch.utils.data import DataLoader

# check if cuda - GPU support available
cuda = torch.cuda.is_available()

# create model, optimizer and loss function
model = createNN.ConvNet(class_num=2)
optimizer = Adam(model.parameters(), lr=.001, weight_decay=.0001)
loss_func = nn.CrossEntropyLoss()

if cuda:
    model.cuda()

# function to save model
def save_model(epoch):
    torch.save(model.load_state_dict(), 'imageClassifier_{}.model'.format(epoch))
    print('saved model at epoch', epoch)

def exp_lr_scheduler(epoch, init_lr=args.lr, weight_decay=args.weight_decay, lr_decay_epoch=cf.lr_decay_epoch):
    lr = init_lr * (0.5 ** (epoch // lr_decay_epoch))

def train(num_epochs):
    best_acc = 0.0

    for epoch in range(num_epochs):
        print('\n\nEpoch {}'.format(epoch))
        train_sampler, validation_sampler = ld.split_train_validation()
        train_loader = DataLoader(ld.allData, batch_size=30, sampler=train_sampler, shuffle=False)
        validation_loader = DataLoader(dataset=ld.allData, batch_size=1, sampler=validation_sampler)

        model.train()
        acc = 0.0
        loss = 0.0
        total = 0
        # train model with training data
        for i, (images, labels) in enumerate(train_loader):
            # if cuda then move to GPU
            if cuda:
                images = images.cuda()
                labels = labels.cuda()
            # Variable class wraps a tensor and we can calculate grad
            images = Variable(images)
            labels = Variable(labels)
            # reset accumulated gradients for each batch
            optimizer.zero_grad()
            # pass images to model which returns prediction
            output = model(images)
            # calculate the loss based on prediction and actual
            loss = loss_func(output, labels)
            # backpropagate the loss and compute gradient
            loss.backward()
            # update weights as per the computed gradients
            optimizer.step()

            # prediction class
            predVal, predClass = torch.max(output.data, 1)
            acc += torch.sum(predClass == labels.data)
            loss += loss.cpu().data[0]
            total += labels.size(0)

        # print the statistics
        train_acc = acc / total
        train_loss = loss / total
        print('Mean train acc = {} over epoch = {}'.format(epoch, acc))
        print('Mean train loss = {} over epoch = {}'.format(epoch, loss))

        # Validate model with validation data
        model.eval()
        acc = 0.0
        loss = 0.0
        total = 0
        for i, (images, labels) in enumerate(validation_loader):
            # if cuda then move to GPU
            if cuda:
                images = images.cuda()
                labels = labels.cuda()
            # Variable class wraps a tensor and we can calculate grad
            images = Variable(images)
            labels = Variable(labels)
            # reset accumulated gradients for each batch
            optimizer.zero_grad()
            # pass images to model which returns prediction
            output = model(images)
            # calculate the loss based on prediction and actual
            loss = loss_func(output, labels)
            # backpropagate the loss and compute gradient
            loss.backward()
            # update weights as per the computed gradients
            optimizer.step()
            # prediction class
            predVal, predClass = torch.max(output.data, 1)
            acc += torch.sum(predClass == labels.data)
            loss += loss.cpu().data[0]
            total += labels.size(0)

        # print the statistics
        valid_acc = acc / total
        valid_loss = loss / total
        print('Mean train acc = {} over epoch = {}'.format(epoch, valid_acc))
        print('Mean train loss = {} over epoch = {}'.format(epoch, valid_loss))

        if (best_acc < valid_acc):
            best_acc = valid_acc
            save_model(epoch)

        # at 30th epoch we save the model
        if (epoch == 30):
            save_model(epoch)

train(20)
I think you did not take into account that acc += torch.sum(predClass == labels.data) returns a tensor instead of a float value. Depending on the version of PyTorch you are using, you should change it to:
acc += torch.sum(predClass == labels.data).cpu().data[0]  # pytorch 0.3
acc += torch.sum(predClass == labels.data).item()  # pytorch 0.4
Although your code seems to work for older PyTorch versions, I would recommend you upgrade to version 0.4.
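For illustration, a small self-contained sketch of the accumulation pattern with toy tensors standing in for predClass and labels (on PyTorch >= 0.4, .item() extracts a Python number so acc stays a plain float):

import torch

acc = 0.0
total = 0

# Toy batches standing in for the (predClass, labels.data) pairs from the loader.
batches = [
    (torch.tensor([1, 0, 1]), torch.tensor([1, 1, 1])),
    (torch.tensor([0, 0, 1]), torch.tensor([0, 1, 1])),
]

for predClass, labels in batches:
    acc += torch.sum(predClass == labels).item()  # .item() -> Python int, not a tensor
    total += labels.size(0)

print(acc / total)  # 0.666...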
Also, I noticed other problems/typos in your code.
You are loading the dataset for every epoch.
for epoch in range(num_epochs):
    print('\n\nEpoch {}'.format(epoch))
    train_sampler, validation_sampler = ld.split_train_validation()
    train_loader = DataLoader(ld.allData, batch_size=30, sampler=train_sampler, shuffle=False)
    validation_loader = DataLoader(dataset=ld.allData, batch_size=1, sampler=validation_sampler)
    ...
That should not happen; loading it once is enough:
train_sampler, validation_sampler = ld.split_train_validation()
train_loader = DataLoader(ld.allData, batch_size=30, sampler=train_sampler, shuffle=False)
validation_loader = DataLoader(dataset=ld.allData, batch_size=1, sampler=validation_sampler)

for epoch in range(num_epochs):
    print('\n\nEpoch {}'.format(epoch))
    ...
In the training part you have (this does not happen in the validation):
train_acc = acc/total
train_loss = loss / total
print('Mean train acc = {} over epoch = {}'.format(epoch,acc))
print('Mean train loss = {} over epoch = {}'.format(epoch, loss))
where you are printing acc instead of train_acc.
Also, in the validation part, you are printing print('Mean train acc = {} over epoch = {}'.format(epoch, valid_acc)) when it should be something like 'Mean val acc'.
After changing these lines of code and using a standard model I created on the CIFAR dataset, training seems to converge: accuracy increases at every epoch while the mean loss value decreases.
I hope I could help you!