I'm a newbie in Deep Learning with Pytorch. I am using the Housing Prices dataset from Kaggle here. I tried sampling with first 50 rows. But the model.parameters() is not updating as I perform the training. Can anyone help?
import torch
import numpy as np
from torch.utils.data import TensorDataset
import torch.nn as nn
from torch.utils.data import DataLoader
import torch.nn.functional as F
inputs = np.array(label_X_train[:50])
targets = np.array(train_y[:50])
# Tensors
inputs = torch.from_numpy(inputs)
targets = torch.from_numpy(targets)
targets = targets.view(-1, 1)
train_ds = TensorDataset(inputs, targets)
batch_size = 5
train_dl = DataLoader(train_ds, batch_size, shuffle=True)
model = nn.Linear(10, 1)
# Define Loss func
loss_fn = F.mse_loss
# Optimizer
opt = torch.optim.SGD(model.parameters(), lr = 1e-5)
num_epochs = 100
model.train()
for epoch in range(num_epochs):
# Train with batches of data
for xb, yb in train_dl:
# 1. Generate predictions
pred = model(xb.float())
# 2. Calculate loss
loss = loss_fn(pred, yb.float())
# 3. Compute gradients
loss.backward()
# 4. Update parameters using gradients
opt.step()
# 5. Reset the gradients to zero
opt.zero_grad()
if (epoch+1) % 10 == 0:
print('Epoch [{}/{}], Loss: {:.4f}'.format(epoch +
1, num_epochs,
loss.item()))
The weight does update, but you weren't capturing it correctly. model.weight.data is a torch tensor, but the name of the variable is just a reference, so setting w = model.weight.data does not create a copy but another reference to the object. Hence changing model.weight.data would change w too.
So by setting w = model.weight.data and w_new = model.weight data in different part of the loops means you're assigning two reference to the same object making their value equal at all time.
In order to assess that the model weight are changing, either print(model.weight.data) before and after the loop (since you got one linear layer of 10 parameters it's still okay to do that) or simply set w = model.weight.data.clone(). In that case your output will be:
tensor([[False, False, False, False, False, False, False, False, False, False]])
Here's an example that shows you that your weights are changing:
import torch
import numpy as np
from torch.utils.data import TensorDataset
import torch.nn as nn
from torch.utils.data import DataLoader
import torch.nn.functional as F
inputs = np.random.rand(50, 10)
targets = np.random.randint(0, 2, 50)
# Tensors
inputs = torch.from_numpy(inputs)
targets = torch.from_numpy(targets)
targets = targets.view(-1, 1)
train_ds = TensorDataset(inputs, targets.squeeze())
batch_size = 5
train_dl = DataLoader(train_ds, batch_size, shuffle=True)
model = nn.Linear(10, 1)
# Define Loss func
loss_fn = F.mse_loss
# Optimizer
opt = torch.optim.SGD(model.parameters(), lr = 1e-1)
num_epochs = 100
model.train()
w = model.weight.data.clone()
for epoch in range(num_epochs):
# Train with batches of data
for xb, yb in train_dl:
# 1. Generate predictions
pred = model(xb.float())
# 2. Calculate loss
loss = loss_fn(pred, yb.float())
# 3. Compute gradients
loss.backward()
# 4. Update parameters using gradients
opt.step()
# 5. Reset the gradients to zero
opt.zero_grad()
if (epoch+1) % 10 == 0:
print('Epoch [{}/{}], Loss: {:.4f}'.format(epoch +
1, num_epochs,
loss.item()))
print(w == model.weight.data)
Related
I am wondering what I am doing wrong when looking to see how the weights changed during training.
My loss goes down considerably but it appears that the initialized weights are the same as trained weights. Am I looking in the wrong location? I would appreciate any insight that you might have!
import torch
import numpy as np
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
import torch.nn.functional as F
# setup GPU/CPU processing
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# initialize model
class mlp1(torch.nn.Module):
def __init__(self, num_features, num_hidden, num_classes):
super(mlp1, self).__init__()
self.num_classes = num_classes
self.input_layer = torch.nn.Linear(num_features, num_hidden)
self.out_layer = torch.nn.Linear(num_hidden, num_classes)
def forward(self, x):
x = self.input_layer(x)
x = torch.sigmoid(x)
logits = self.out_layer(x)
probas = torch.softmax(logits, dim=1)
return logits, probas
# instantiate model
model = mlp1(num_features=28*28, num_hidden=100, num_classes=10).to(device)
# check initial weights
weight_check_pre = model.state_dict()['input_layer.weight'][0][0:25]
# optim
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
# download data
train_dataset = datasets.MNIST(root='data',
train=True,
transform=transforms.ToTensor(),
download=True)
# data loader
train_dataloader = DataLoader(dataset=train_dataset,
batch_size=100,
shuffle=True)
# train
NUM_EPOCHS = 1
for epoch in range(NUM_EPOCHS):
model.train()
for batch_idx, (features, targets) in enumerate(train_dataloader):
# send data to device
features = features.view(-1, 28*28).to(device)
targets = targets.to(device)
# forward
logits, probas = model(features)
# loss
loss = F.cross_entropy(logits, targets)
optimizer.zero_grad()
loss.backward()
# now update weights
optimizer.step()
### LOGGING
if not batch_idx % 50:
print ('Epoch: %03d/%03d | Batch %03d/%03d | Loss: %.4f'
%(epoch+1, NUM_EPOCHS, batch_idx,
len(train_dataloader), loss))
# check post training
weight_check_post = model.state_dict()['input_layer.weight'][0][0:25]
# compare
weight_check_pre == weight_check_post # all equal
That is because both variables are referencing the same object (dictionary) in memory and so will always equal to each other.
You can do this to get actual copies of the state_dict.
import copy
# check initial weights
weight_check_pre = copy.deepcopy(model.state_dict()['input_layer.weight'][0][0:25])
...
# check post training
weight_check_post = copy.deepcopy(model.state_dict()['input_layer.weight'][0][0:25])
I was trying to build a neural network with 4 input nodes/ features and just one output feature(0/1). I wrote this code and it runs but while training the model returns NaN. I debugged too and weights and biases are fine until they go through the model.
From what I've searched so far, this could be a problem in the way I am passing the data.
My input data is : tensor([[0.0000e+00, 0.0000e+00, 0.0000e+00, 1.5340e+00],
[1.5000e+01, 1.0000e-01, 2.4210e+00, 3.0000e+01],
[3.0000e+00, 2.2000e-01, 2.2000e-01, 4.5000e+01],
...,
[1.0000e+00, 2.0000e-02, 2.0000e-02, 1.5000e+01],
[6.0000e+00, 2.0000e-01, 2.0000e-01, 1.5000e+01],
[1.7000e+01, 5.2400e-01, 5.2400e-01, 2.0000e+00]], dtype=torch.float64)
import torch
from torchvision import datasets, transforms
import pandas as pd
import numpy as np
from torch.autograd import Variable
# Import tensor dataset & data loader
from torch.utils.data import TensorDataset, DataLoader
from torch import nn, optim
import torch.nn.functional as F
file = pd.read_csv('ks-projects-201801.csv')
array = np.array(file.values)
result = np.empty(len(array))
input_data = np.empty((len(array), 4))
for i in range(len(array)):
input_data[i] = np.array([array[i][10], array[i][12]/1000, array[i][13]/1000, array[i][14]/1000])
if array[i][9] == 'successful':
result[i] = 1
else:
result[i] = 0
input_node = Variable(torch.from_numpy(input_data))
output = torch.from_numpy(result)
print(input_node)
print(output)
train_ds = TensorDataset(input_node.squeeze(), output.squeeze())
batch_size = 5
train_dl = DataLoader(train_ds, batch_size, shuffle=True)
This is the actual model and training
model = nn.Linear(4, 1)
print(model.weight)
print(model.bias)
criterion = nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.003)
epochs = 5
model = model.double()
for e in range(epochs):
running_loss = 0
for xb, yb in train_dl:
optimizer.zero_grad()
res = model(xb)
loss = criterion(res, yb)
loss.backward()
optimizer.step()
running_loss += loss.item()
else:
print(f"model : {loss}")
This prints out model: nan for every epoch and terminates. I am very new to pytorch and I'm not sure how to handle this problem.
If you see NaN's in loss try gradient clipping and data normalisation. Normalising data is a must (i.e normalize input data such that mean = 0 and variance =1)
Let's say I have the following constraints and the network:
The architecture is fixed (see this image) (note that there are no biases)
Activation function for the hidden layer is ReLU
There's no activation function for the output layer (should just return the sum of the inputs it receive).
I tried to implement this in pytorch with various initialization schemes and different data sets but I failed (the code is at the bottom).
My questions are:
Is there anything wrong with my NN training process?
Is this a feasible problem? If yes, how?
If this is doable, can we still achieve that by constraining the weights to be in the set {-1, 0, 1}
Code:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data_utils
import numpy as np
class Network(nn.Module):
def __init__(self):
super(Network, self).__init__()
self.fc1 = nn.Linear(2,2,bias=False)
self.fc2 = nn.Linear(2,1, bias=False)
self.rl = nn.ReLU()
def forward(self, x):
x = self.fc1(x)
x = self.rl(x)
x = self.fc2(x)
return x
#create an XOR data set to train
rng = np.random.RandomState(0)
X = rng.randn(200, 2)
y = np.logical_xor(X[:, 0] > 0, X[:, 1] > 0).astype('int32')
# test data set
X_test = np.array([[0,0],[0,1], [1,0], [1,1]])
train = data_utils.TensorDataset(torch.from_numpy(X).float(), \
torch.from_numpy(y).float())
train_loader = data_utils.DataLoader(train, batch_size=50, shuffle=True)
test = torch.from_numpy(X_test).float()
# training the network
num_epoch = 10000
net = Network()
net.fc1.weight.data.clamp_(min=-1, max=1)
net.fc2.weight.data.clamp_(min=-1, max=1)
# define loss and optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(net.parameters())
for epoch in range(num_epoch):
running_loss = 0 # loss per epoch
for (X, y)in train_loader:
# make the grads zero
optimizer.zero_grad()
# forward propagate
out = net(X)
# calculate loss and update
loss = criterion(out, y)
loss.backward()
optimizer.step()
running_loss += loss.data
if epoch%500== 0:
print("Epoch: {0} Loss: {1}".format(epoch, running_loss))
The loss doesn't improve. It gets stuck in some value after a few epochs ( i'm not sure how to make this reproducible as I'm getting different values every time)
net(test) returns a set of predictions that are no way close to XOR output.
You need to use a nonlinear activation function such as sigmoid in your hidden and output layers . because xor is not linearly separable.Also biases are required.
I'm currently learning how to use Tensorflow and I'm having some issues to implement this Softmax Regression aplication.
There's no error when compiling but, for some reasson text validation and test predictions shows no improvement, only the train prediction is showing improvement.
I'm using Stocastic Gradient Descent(SGD) with minibatches in order to converge faster, but don't know if this could be causing a trouble somehow.
I'll be thankful if you could share some ideas, here's the full code:
import input_data
import numpy as np
import random as ran
import tensorflow as tf
import matplotlib.pyplot as plt
mnist = input_data.read_data_sets('MNIST_Data/', one_hot=True)
#Features & Data
num_features = 784
num_labels = 10
learning_rate = 0.05
batch_size = 128
num_steps = 5001
train_dataset = mnist.train.images
train_labels = mnist.train.labels
test_dataset = mnist.test.images
test_labels = mnist.test.labels
valid_dataset = mnist.validation.images
valid_labels = mnist.validation.labels
graph = tf.Graph()
with graph.as_default():
tf_train_data = tf.placeholder(tf.float32, shape=(batch_size, num_features))
tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
tf_valid_data = tf.constant(valid_dataset)
tf_test_data = tf.constant(test_dataset)
W = tf.Variable(tf.truncated_normal([num_features, num_labels]))
b = tf.Variable(tf.zeros([num_labels]))
score_vector = tf.matmul(tf_train_data, W) + b
cost_func = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(
labels=tf_train_labels, logits=score_vector))
score_valid = tf.matmul(tf_test_data, W) + b
score_test = tf.matmul(tf_valid_data, W) + b
optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost_func)
train_pred = tf.nn.softmax(score_vector)
valid_pred = tf.nn.softmax(score_valid)
test_pred = tf.nn.softmax(score_test)
def accuracy(predictions, labels):
correct_pred = np.sum(np.argmax(predictions, 1) == np.argmax(labels, 1))
accu = (100.0 * correct_pred) / predictions.shape[0]
return accu
with tf.Session(graph=graph) as sess:
sess.run(tf.global_variables_initializer())
print("Initialized")
for step in range(num_steps):
offset = np.random.randint(0, train_labels.shape[0] - batch_size - 1)
batch_data = train_dataset[offset:(offset+batch_size), :]
batch_labels = train_labels[offset:(offset+batch_size), :]
feed_dict = {tf_train_data : batch_data,
tf_train_labels : batch_labels
}
_, l, predictions = sess.run([optimizer, cost_func, train_pred],
feed_dict=feed_dict)
if (step % 500 == 0):
print("Minibatch loss at step {0}: {1}".format(step, l))
print("Minibatch accuracy: {:.1f}%".format(
accuracy(predictions, batch_labels)))
print("Validation accuracy: {:.1f}%".format(
accuracy(valid_pred.eval(), valid_labels)))
print("\nTest accuracy: {:.1f}%".format(
accuracy(test_pred.eval(), test_labels)))
It sounds like overfitting, which isn't surprising since this model is basically a linear regression model.
There are few options you can try:
1. add hidden layers + activation functions(https://arxiv.org/abs/1511.07289: elu paper works on mnist data set with vanilla DNN).
2. Use either CNN or RNN, although CNN is more apt for image problems.
3. Use a better optimizer. If you are new, try ADAM optimizer (https://www.tensorflow.org/api_docs/python/tf/train/AdamOptimizer), and then move onto using momentum with nestrov(https://www.tensorflow.org/api_docs/python/tf/train/MomentumOptimizer)
Without feature engineering, it'll be hard to pull off image classification using just linear regression. Also, you do not need to run softmax on your outcomes since softmax is designed to smooth argmax. Lastly, you should input (None,num_features) into shape of placeholders instead to have variational batch size. This will allow you to directly feed your valid and test datasets into feed_dict without having to create additional tensors.
I am trying to train a sparse data with an MLP to predict a forecast. However, the forecast on the test data is giving the same value for all observations. Once I omit the activation function from each layer, the outcome starts being different.
my code is below:
# imports
import numpy as np
import tensorflow as tf
import random
import json
from scipy.sparse import rand
# Parameters
learning_rate= 0.1
training_epochs = 50
batch_size = 100
# Network Parameters
m= 1000 #number of features
n= 5000 # number of observations
hidden_layers = [5,2,4,1,6]
n_layers = len(hidden_layers)
n_input = m
n_classes = 1 # it's a regression problem
X_train = rand(n, m, density=0.2,format = 'csr').todense().astype(np.float32)
Y_train = np.random.randint(4, size=n)
X_test = rand(200, m, density=0.2,format = 'csr').todense().astype(np.float32)
Y_test = np.random.randint(4, size=200)
# tf Graph input
x = tf.placeholder("float", [None, n_input])
y = tf.placeholder("float", [None])
# Store layers weight & bias
weights = {}
biases = {}
weights['h1']=tf.Variable(tf.random_normal([n_input, hidden_layers[0]])) #first matrice
biases['b1'] = tf.Variable(tf.random_normal([hidden_layers[0]]))
for i in xrange(2,n_layers+1):
weights['h'+str(i)]= tf.Variable(tf.random_normal([hidden_layers[i-2], hidden_layers[i-1]]))
biases['b'+str(i)] = tf.Variable(tf.random_normal([hidden_layers[i-1]]))
weights['out']=tf.Variable(tf.random_normal([hidden_layers[-1], 1])) #matrice between last layer and output
biases['out']= tf.Variable(tf.random_normal([1]))
# Create model
def multilayer_perceptron(_X, _weights, _biases):
layer_begin = tf.nn.relu(tf.add(tf.matmul(_X, _weights['h1'],a_is_sparse=True), _biases['b1']))
for layer in xrange(2,n_layers+1):
layer_begin = tf.nn.relu(tf.add(tf.matmul(layer_begin, _weights['h'+str(layer)]), _biases['b'+str(layer)]))
#layer_end = tf.nn.dropout(layer_begin, 0.3)
return tf.matmul(layer_begin, _weights['out'])+ _biases['out']
# Construct model
pred = multilayer_perceptron(x, weights, biases)
# Define loss and optimizer
rmse = tf.reduce_sum(tf.abs(y-pred))/tf.reduce_sum(tf.abs(y)) # rmse loss
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(rmse) # Adam Optimizer
# Initializing the variables
init = tf.initialize_all_variables()
with tf.Session() as sess:
sess.run(init)
#training
for step in xrange(training_epochs):
# Generate a minibatch.
start = random.randrange(1, n - batch_size)
#print start
batch_xs=X_train[start:start+batch_size,:]
batch_ys =Y_train[start:start+batch_size]
#printing
_,rmseRes = sess.run([optimizer, rmse] , feed_dict={x: batch_xs, y: batch_ys} )
if step % 20 == 0:
print "rmse [%s] = %s" % (step, rmseRes)
#testing
pred_test = multilayer_perceptron(X_test, weights, biases)
print "prediction", pred_test.eval()[:20]
print "actual = ", Y_test[:20]
PS: I am generating randomly my data just to reproduce the error. My data is sparse in fact, pretty similar to the one generated randomly. The problem I want to solve is: MLP is giving the same prediction for all observations in the test data.
That's a sign that your training failed. With GoogeLeNet Imagenet training I've seen it label everything as "nematode" when started with a bad choice of hyper-parameters. Things to check -- does your training loss decrease? If it doesn't decrease, try different learning rates/architectures. If it decreases to zero maybe your loss is wrong like was case here