torch.Linear weight doesn't update - python

# import blah blah
# activation function
Linear = torch.nn.Linear(6, 1)
sig = torch.nn.Sigmoid()
# optimizer
optim = torch.optim.SGD(Linear.parameters(), lr=0.001)
# input
# x => (891, 6)
# output
y = y.reshape(891, 1)
# cost function
loss_f = torch.nn.BCELoss()

for iter in range(10):
    for i in range(1000):
        optim.zero_grad()
        forward = sig(Linear(x)) > 0.5
        forward = forward.to(torch.float32)
        forward.requires_grad = True
        loss = loss_f(forward, y)
        loss.backward()
        optim.step()
In this code I want to update Linear.weight and Linear.bias, but they don't update.
I thought the optimizer might not know which weight and bias to use, so I tried changing
optim = torch.optim.SGD(Linear.parameters(), lr=0.001)
to
optim = torch.optim.SGD([Linear.weight, Linear.bias], lr=0.001)
but it still didn't work.
// I want to explain my problem in more detail, but my English level is low 🥲 sorry

BCELoss is defined as
loss(x, y) = -[y * log(x) + (1 - y) * log(1 - x)]
As you can see, the input x is expected to be probabilities, so your use of sig(Linear(x)) > 0.5 is wrong. Moreover, sig(Linear(x)) > 0.5 returns a boolean tensor with no autograd history, which breaks the computation graph. You are explicitly setting requires_grad=True, but since the graph is broken, backpropagation cannot reach the linear layer, so its weights are never learned/changed.
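To see the break directly, here is a small sketch (the variable names are illustrative, not from the question):
import torch

linear = torch.nn.Linear(6, 1)
x = torch.rand(4, 6)
probs = torch.sigmoid(linear(x))
print(probs.grad_fn)          # <SigmoidBackward0 ...> -- still connected to the graph
mask = probs > 0.5
print(mask.grad_fn)           # None -- the comparison is not differentiable
print(mask.requires_grad)     # False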
Correct sample usage:
import torch

Linear = torch.nn.Linear(6, 1)
sig = torch.nn.Sigmoid()
# optimizer
optim = torch.optim.SGD(Linear.parameters(), lr=0.001)
# sample data
x = torch.rand(891, 6)
y = torch.rand(891, 1)
loss_f = torch.nn.BCELoss()

for iter in range(10):
    optim.zero_grad()
    output = sig(Linear(x))   # keep the probabilities, do not threshold before the loss
    loss = loss_f(output, y)
    loss.backward()
    optim.step()
    print(Linear.bias.item())
Output:
0.10717090964317322
0.10703673213720322
0.10690263658761978
0.10676861554384232
0.10663467645645142
0.10650081932544708
0.10636703670024872
0.10623333603143692
0.10609971731901169
0.10596618056297302

Related

Loss going to NaN after few iterations

In my model, the input is graph data in the form of an edge-index and the node features. After a few iterations of training on the graph data, the loss (EDIT: which is a combination of an MSE loss and a negative loss, i.e. L1 + (-L2)) becomes NaN. Both L1 and -L2 become NaN after around 40 iterations.
The learning rate is 0.00001. I also checked the input data for invalid values, but found none.
from torch.nn.parameter import Parameter
from torch.nn.modules.module import Module
import torch.optim as optim
import torch.nn.functional as F
import torch.nn as nn
import networkx as nx
import time
from torch_geometric.nn import GCNConv
from torch_geometric.data import Data
class Model(nn.Module):
    def __init__(self, nin, nhid1, nout, inp_l, hid_l, out_l=1):
        super(Model, self).__init__()
        self.g1 = GCNConv(in_channels=nin, out_channels=nhid1)
        self.g2 = GCNConv(in_channels=nhid1, out_channels=nout)
        self.dropout = 0.5
        self.lay1 = nn.Linear(inp_l, hid_l)
        self.lay2 = nn.Linear(hid_l, out_l)

    def forward(self, x, adj):
        x = F.relu(self.g1(x, adj))
        x = F.dropout(x, self.dropout, training=self.training)
        x = self.g2(x, adj)
        x = self.lay1(x)
        x = F.relu(x)
        x = self.lay2(x)
        x = F.relu(x)
        return x
The inputs to the model:
x (Tensor, optional) – Node feature matrix with shape [num_nodes, num_node_features].
edge_index (LongTensor, optional) – Graph connectivity in COO format with shape [2, num_edges].
Here num_nodes = 1000, num_node_features = 1, num_edges = 5000.
GCNConv is a graph embedder that takes the edge list and the node features and returns a [num_nodes, dim] matrix.
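For reference, here is a minimal hedged sketch (random data, not from the question) of inputs with exactly these shapes being passed through a GCNConv layer:
import torch
from torch_geometric.nn import GCNConv

num_nodes, num_node_features, num_edges = 1000, 1, 5000
x = torch.rand(num_nodes, num_node_features)              # node feature matrix [num_nodes, num_node_features]
edge_index = torch.randint(0, num_nodes, (2, num_edges))  # connectivity in COO format [2, num_edges]
conv = GCNConv(in_channels=num_node_features, out_channels=128)
out = conv(x, edge_index)
print(out.shape)  # torch.Size([1000, 128])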
EDIT 2: Added how the loss is calculated
def train_model(epoch):
    model = Model(nin=1, nhid1=128, nout=128, inp_l=128, hid_l=64, out_l=1).to(device)
    optimizer = optim.Adam(model.parameters(), lr=0.00001)
    model.train()
    t = time.time()
    optimizer.zero_grad()
    Y = model(features, adjacency_list)
    Y1 = func(Y)  # Y1 values are calculated from Y by passing through a function func to obtain a same-sized vector as Y
    loss1 = ((Y1 - Y)**2).mean()  # MSE loss function
    loss2 = -Y.abs().mean()       # this loss is implemented to prevent Y values going to 0; notice the "-" sign
    loss_train = loss1 + loss2
    loss_train.backward(retain_graph=True)
    nn.utils.clip_grad_norm_(model.parameters(), 0.5)
    optimizer.step()
    if epoch % 20 == 0:
        print("MSE loss = ", loss1, "\t", "Mean Loss = ", loss2)
        print('Epoch: {:04d}'.format(epoch + 1),
              'loss_train: {:.4f}'.format(loss_train.item()),
              'time: {:.4f}s'.format(time.time() - t))
        print("\n\n")
    return Y
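For reference, a check like the following (a hedged sketch, not from the original post; `features` stands in for the node-feature tensor above) is one way to make the "checked for invalid input data" step explicit:
import torch

features = torch.rand(1000, 1)  # hypothetical stand-in for the question's node features
print(torch.isnan(features).any().item())  # True if any NaN values are present
print(torch.isinf(features).any().item())  # True if any infinite values are present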

Why does regularization in PyTorch and scratch code not match, and what is the formula used for regularization in PyTorch?

I have been trying to do L2 regularization on a binary classification model in PyTorch, but when I compare the results of the PyTorch code and the scratch code, they don't match.
Pytorch code:
class LogisticRegression(nn.Module):
    def __init__(self, n_input_features):
        super(LogisticRegression, self).__init__()
        self.linear = nn.Linear(4, 1)
        self.linear.weight.data.fill_(0.0)
        self.linear.bias.data.fill_(0.0)

    def forward(self, x):
        y_predicted = torch.sigmoid(self.linear(x))
        return y_predicted

model = LogisticRegression(4)
criterion = nn.BCELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.05, weight_decay=0.1)
dataset = Data()
train_data = DataLoader(dataset=dataset, batch_size=1096, shuffle=False)
num_epochs = 1000
for epoch in range(num_epochs):
    for x, y in train_data:
        y_pred = model(x)
        loss = criterion(y_pred, y)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
Scratch Code:
def sigmoid(z):
    s = 1 / (1 + np.exp(-z))
    return s

def yinfer(X, beta):
    return sigmoid(beta[0] + np.dot(X, beta[1:]))

def cost(X, Y, beta, lam):
    sum = 0
    sum1 = 0
    n = len(beta)
    m = len(Y)
    for i in range(m):
        sum = sum + Y[i] * (np.log(yinfer(X[i], beta))) + (1 - Y[i]) * np.log(1 - yinfer(X[i], beta))
    for i in range(0, n):
        sum1 = sum1 + beta[i]**2
    return (-sum + (lam / 2) * sum1) / (1.0 * m)

def pred(X, beta):
    if yinfer(X, beta) > 0.5:
        ypred = 1
    else:
        ypred = 0
    return ypred

beta = np.zeros(5)
iterations = 1000
arr_cost = np.zeros((iterations, 4))
print(beta)
n = len(Y_train)
for i in range(iterations):
    Y_prediction_train = np.zeros(len(Y_train))
    Y_prediction_test = np.zeros(len(Y_test))
    for l in range(len(Y_train)):
        Y_prediction_train[l] = pred(X[l, :], beta)
    for l in range(len(Y_test)):
        Y_prediction_test[l] = pred(X_test[l, :], beta)
    train_acc = format(100 - np.mean(np.abs(Y_prediction_train - Y_train)) * 100)
    test_acc = 100 - np.mean(np.abs(Y_prediction_test - Y_test)) * 100
    arr_cost[i, :] = [i, cost(X, Y_train, beta, lam), train_acc, test_acc]
    temp_beta = np.zeros(len(beta))
    ''' main code from below '''
    for j in range(n):
        temp_beta[0] = temp_beta[0] + yinfer(X[j, :], beta) - Y_train[j]
        temp_beta[1:] = temp_beta[1:] + (yinfer(X[j, :], beta) - Y_train[j]) * X[j, :]
    for k in range(0, len(beta)):
        temp_beta[k] = temp_beta[k] + lam * beta[k]  # regularization here
    temp_beta = temp_beta / (1.0 * n)
    beta = beta - alpha * temp_beta
(The post also included graphs of the losses and of the training and testing accuracy.)
Can someone please tell me why this is happening?
The L2 regularization value is lam = 0.1.
Great question. I dug through the PyTorch documentation a lot and found the answer. It is quite tricky: there are basically two ways to calculate the regularization. (For a summary, jump to the last section.)
PyTorch uses the first type, in which the regularization factor is not divided by the batch size.
Here's a sample code which demonstrates that:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import torch.optim as optim
class model(nn.Module):
    def __init__(self):
        super().__init__()
        self.linear = nn.Linear(1, 1)
        self.linear.weight.data.fill_(1.0)
        self.linear.bias.data.fill_(1.0)

    def forward(self, x):
        return self.linear(x)

model = model()
optimizer = optim.SGD(model.parameters(), lr=0.1, weight_decay=1.0)

input = torch.tensor([[2], [4]], dtype=torch.float32)
target = torch.tensor([[7], [11]], dtype=torch.float32)

optimizer.zero_grad()
pred = model(input)
loss = F.mse_loss(pred, target)

print(f'input: {input[0].data, input[1].data}')
print(f'prediction: {pred[0].data, pred[1].data}')
print(f'target: {target[0].data, target[1].data}')
print(f'\nMSEloss: {loss.item()}\n')

loss.backward()

print('Before updation:')
print('--------------------------------------------------------------------------')
print(f'weight [data, gradient]: {model.linear.weight.data, model.linear.weight.grad}')
print(f'bias [data, gradient]: {model.linear.bias.data, model.linear.bias.grad}')
print('--------------------------------------------------------------------------')

optimizer.step()

print('After updation:')
print('--------------------------------------------------------------------------')
print(f'weight [data]: {model.linear.weight.data}')
print(f'bias [data]: {model.linear.bias.data}')
print('--------------------------------------------------------------------------')
which outputs:
input: (tensor([2.]), tensor([4.]))
prediction: (tensor([3.]), tensor([5.]))
target: (tensor([7.]), tensor([11.]))
MSEloss: 26.0
Before updation:
--------------------------------------------------------------------------
weight [data, gradient]: (tensor([[1.]]), tensor([[-32.]]))
bias [data, gradient]: (tensor([1.]), tensor([-10.]))
--------------------------------------------------------------------------
After updation:
--------------------------------------------------------------------------
weight [data]: tensor([[4.1000]])
bias [data]: tensor([1.9000])
--------------------------------------------------------------------------
Here m = batch size = 2, lr = alpha = 0.1, and lambda = weight_decay = 1.
Now consider the weight tensor, which has value = 1 and grad = -32.
Case 1 (type-1 regularization):
weight = weight - lr * (grad + weight_decay * weight)
weight = 1 - 0.1 * (-32 + 1 * 1)
weight = 4.1
Case 2 (type-2 regularization):
weight = weight - lr * (grad + (weight_decay / batch_size) * weight)
weight = 1 - 0.1 * (-32 + (1 / 2) * 1)
weight = 4.15
From the output we can see that the updated weight is 4.1000, which confirms that PyTorch uses type-1 regularization.
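A quick numeric check of the type-1 update rule, using the values from the printout above (plain Python, just to verify the arithmetic):
lr, weight_decay = 0.1, 1.0
weight, weight_grad = 1.0, -32.0
bias, bias_grad = 1.0, -10.0
print(weight - lr * (weight_grad + weight_decay * weight))  # 4.1 -> matches tensor([[4.1000]])
print(bias - lr * (bias_grad + weight_decay * bias))        # 1.9 -> matches tensor([1.9000])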
So, finally: in your code you are following type-2 regularization. To match PyTorch, just change the last lines to this:
# for k in range(0, len(beta)):
#     temp_beta[k] = temp_beta[k] + lam * beta[k]  # regularization here
temp_beta = temp_beta / (1.0 * n)
beta = beta - alpha * (temp_beta + lam * beta)
Also, PyTorch loss functions don't include the regularization term (it is implemented inside the optimizers), so remove the regularization term from your custom cost function as well.
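For example, the scratch cost function from the question would become something like this (a sketch; only the lam-dependent term is dropped):
def cost(X, Y, beta):
    total = 0
    m = len(Y)
    for i in range(m):
        total = total + Y[i] * np.log(yinfer(X[i], beta)) + (1 - Y[i]) * np.log(1 - yinfer(X[i], beta))
    return -total / (1.0 * m)  # no (lam/2) * sum(beta**2) term any more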
In summary:
PyTorch uses this regularization rule (not divided by the batch size): weight = weight - lr * (gradient + weight_decay * weight).
Regularization is implemented inside the optimizers (the weight_decay parameter).
PyTorch loss functions don't include the regularization term.
The bias is also regularized if regularization is used.
To use regularization, try:
torch.optim.<optimizer_name>(model.parameters(), lr, weight_decay=lam).

PyTorch does not converge when approximating square function with linear model

I'm trying to learn some PyTorch and am referencing this discussion here
The author provides a minimum working piece of code that illustrates how you can use PyTorch to solve for an unknown linear function that has been polluted with random noise.
This code runs fine for me.
However, when I change the function so that the target is t = X^2, the parameters do not seem to converge.
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
# Let's make some data for a linear regression.
A = 3.1415926
b = 2.7189351
error = 0.1
N = 100 # number of data points
# Data
X = Variable(torch.randn(N, 1))
# (noisy) Target values that we want to learn.
t = X * X + Variable(torch.randn(N, 1) * error)
# Creating a model, making the optimizer, defining loss
model = nn.Linear(1, 1)
optimizer = optim.SGD(model.parameters(), lr=0.05)
loss_fn = nn.MSELoss()
# Run training
niter = 50
for _ in range(0, niter):
    optimizer.zero_grad()
    predictions = model(X)
    loss = loss_fn(predictions, t)
    loss.backward()
    optimizer.step()

    print("-" * 50)
    print("error = {}".format(loss.data[0]))
    print("learned A = {}".format(list(model.parameters())[0].data[0, 0]))
    print("learned b = {}".format(list(model.parameters())[1].data[0]))
When I execute this code, the new A and b parameters are seemingly random, so it does not converge. I thought this should converge because you can approximate any function with a slope-and-offset function. My theory is that I'm using PyTorch incorrectly.
Can anyone identify a problem with my t = X * X + Variable(torch.randn(N, 1) * error) line of code?
You cannot fit a 2nd-degree polynomial with a linear function, so you cannot expect more than a random-looking fit (you have random samples from the polynomial).
What you can do is try and have two inputs, x and x^2 and fit from them:
model = nn.Linear(2, 1) # you have 2 inputs now
X_input = torch.cat((X, X**2), dim=1) # have 2 inputs per entry
# ...
predictions = model(X_input) # 2 inputs -> 1 output
loss = loss_fn(predictions, t)
# ...
# learning t = c*x^2 + a*x + b
print("learned a = {}".format(list(model.parameters())[0].data[0, 0]))
print("learned c = {}".format(list(model.parameters())[0].data[0, 1]))
print("learned b = {}".format(list(model.parameters())[1].data[0]))

Neural network can learn |sin(x)| for [0,pi] but not [0,2pi] or [0, 4pi]

My neural network can learn |sin(x)| for [0,pi], but not larger intervals than that. I tried changing the quantity and widths of hidden layers in various ways, but none of the changes leads to a good result.
I train the NN on thousands of random values from a uniform distribution over the chosen interval, using backpropagation with gradient descent.
I am starting to think there is a fundamental problem in my network.
For the following examples I used a 1-10-10-1 layer structure:
[0, pi], [0, 2pi], [0, 4pi]: (plots of the network output on each interval were attached)
Here is the code for the neural network:
import math
import numpy
import random
import copy
import matplotlib.pyplot as plt
def sigmoid(x):
    return 1.0 / (1 + numpy.exp(-x))

def sigmoid_derivative(x):
    return x * (1.0 - x)

class NeuralNetwork:
    def __init__(self, weight_dimensions, x=None, y=None):
        self.weights = []
        self.layers = [[]] * len(weight_dimensions)
        self.weight_gradients = []
        self.learning_rate = 1
        self.layers[0] = x
        for i in range(len(weight_dimensions) - 1):
            self.weights.append(numpy.random.rand(weight_dimensions[i], weight_dimensions[i+1]) - 0.5)
        self.y = y

    def feed_forward(self):
        # calculate an output using feed forward layer-by-layer
        for i in range(len(self.layers) - 1):
            self.layers[i + 1] = sigmoid(numpy.dot(self.layers[i], self.weights[i]))

    def print_loss(self):
        loss = numpy.square(self.layers[-1] - self.y).sum()
        print(loss)

    def get_weight_gradients(self):
        return self.weight_gradients

    def apply_weight_gradients(self):
        for i in range(len(self.weight_gradients)):
            self.weights[i] += self.weight_gradients[i] * self.learning_rate
            if self.learning_rate > 0.001:
                self.learning_rate -= 0.0001

    def back_prop(self):
        # find derivative of the loss function with respect to weights
        self.weight_gradients = []
        deltas = []
        output_error = (self.y - self.layers[-1])
        output_delta = output_error * sigmoid_derivative(self.layers[-1])
        deltas.append(output_delta)
        self.weight_gradients.append(self.layers[-2].T.dot(output_delta))
        for i in range(len(self.weights) - 1):
            i_error = deltas[i].dot(self.weights[-(i+1)].T)
            i_delta = i_error * sigmoid_derivative(self.layers[-(i+2)])
            self.weight_gradients.append(self.layers[-(i+3)].T.dot(i_delta))
            deltas.append(copy.deepcopy(i_delta))
        # Unreverse weight gradient list
        self.weight_gradients = self.weight_gradients[::-1]

    def get_output(self, inp):
        self.layers[0] = inp
        self.feed_forward()
        return self.layers[-1]

def sin_test():
    interval = numpy.random.uniform(0, 2*math.pi, int(1000*(2*math.pi)))
    x_values = []
    y_values = []
    for i in range(len(interval)):
        y_values.append([abs(math.sin(interval[i]))])
        x_values.append([interval[i]])
    x = numpy.array(x_values)
    y = numpy.array(y_values)

    nn = NeuralNetwork([1, 10, 10, 1], x, y)
    for i in range(10000):
        tmp_input = []
        tmp_output = []
        mini_batch_indexes = random.sample(range(0, len(x)), 10)
        for j in mini_batch_indexes:
            tmp_input.append(x[j])
            tmp_output.append(y[j])
        nn.layers[0] = numpy.array(tmp_input)
        nn.y = numpy.array(tmp_output)
        nn.feed_forward()
        nn.back_prop()
        nn.apply_weight_gradients()
        nn.print_loss()

    nn.layers[0] = numpy.array(numpy.array(x))
    nn.y = numpy.array(numpy.array(y))
    nn.feed_forward()

    axis_1 = []
    axis_2 = []
    for i in range(len(nn.layers[-1])):
        axis_1.append(nn.layers[0][i][0])
        axis_2.append(nn.layers[-1][i][0])

    true_axis_2 = []
    for x in axis_1:
        true_axis_2.append(abs(math.sin(x)))

    axises = []
    for i in range(len(axis_1)):
        axises.append([axis_1[i], axis_2[i], true_axis_2[i]])
    axises.sort(key=lambda x: x[0], reverse=False)

    axis_1_new = []
    axis_2_new = []
    true_axis_2_new = []
    for elem in axises:
        axis_1_new.append(elem[0])
        axis_2_new.append(elem[1])
        true_axis_2_new.append(elem[2])

    plt.plot(axis_1_new, axis_2_new, label="nn")
    plt.plot(axis_1_new, true_axis_2_new, 'k--', label="sin(x)")
    plt.grid()
    plt.axis([0, 2*math.pi, -1, 2.5])
    plt.show()

sin_test()
The main issue with your network seems to be that you apply the activation function to the final "layer" of your network. The final output of your network should be a linear combination without any sigmoid applied.
As a warning though, do not expect the model to generalize outside of the region included in the training data.
Here is an example in PyTorch:
import torch
import torch.nn as nn
import math
import numpy as np
import matplotlib.pyplot as plt
N = 1000
p = 2.5
x = 2 * p * math.pi * torch.rand(N, 1)
y = np.abs(np.sin(x))

with torch.no_grad():
    plt.plot(x.numpy(), y.numpy(), '.')
    plt.savefig("training_data.png")

inner = 20
model = nn.Sequential(
    nn.Linear(1, inner, bias=True),
    nn.Sigmoid(),
    nn.Linear(inner, 1, bias=True)#,
    #nn.Sigmoid()
)

loss_fn = nn.MSELoss()
learning_rate = 1e-3
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

for t in range(500000):
    y_pred = model(x)
    loss = loss_fn(y_pred, y)
    if t % 1000 == 0:
        print("MSE: {}".format(t), loss.item())
    model.zero_grad()
    loss.backward()
    optimizer.step()

with torch.no_grad():
    X = torch.arange(0, p * 2 * math.pi, step=0.01).reshape(-1, 1)
    Y = model(X)
    Y_TRUTH = np.abs(np.sin(X))
    print(Y.shape)
    print(Y_TRUTH.shape)
    loss = loss_fn(Y, Y_TRUTH)
    plt.clf()
    plt.plot(X.numpy(), Y_TRUTH.numpy())
    plt.plot(X.numpy(), Y.numpy())
    plt.title("MSE: {}".format(loss.item()))
    plt.savefig("output.png")
The output is available here: Image showing neural network prediction and ground truth. The yellow line is the line predicted by the neural network and the blue line is the ground truth.
First and foremost, you've chosen a topology suited for a different class of problems. A simple, fully-connected NN such as this is great for trivial classification (e.g. Boolean operators) or functions with at least two continuous derivatives. You've tried to apply it to a function that is simply one step beyond its capabilities.
Try your model on sin(x) and see how it performs at larger ranges. Try it on max(sin(x), 0). Do you see how the model has trouble with certain periodicity and irruptions? These are an emergent feature of the many linear equations struggling to predict the proper functional value: the linear combinations have trouble emulating non-linearities past a simple level.
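If you want to run those suggested experiments, a tiny sketch of the target data (NumPy only; the rest of the training code stays the same as in the question) might look like this:
import numpy as np

interval = np.random.uniform(0, 2 * np.pi, 2000)
targets_sin = np.sin(interval)                      # plain sin(x)
targets_clipped = np.maximum(np.sin(interval), 0)   # max(sin(x), 0)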

How to train a simple linear regression model with SGD in pytorch successfully?

I was trying to train a simple polynomial linear regression model in PyTorch with SGD. I wrote some self-contained code (what I thought would be an extremely simple example), but for some reason my model does not train as I thought it should.
I have 5 points sampled from a sine curve and try to fit them with a polynomial of degree 4. This is a convex problem, so GD or SGD should eventually find a solution with zero train error as long as we have enough iterations and a small enough step size. For some reason, however, my model does not train well (even though it seems that it is changing the parameters of the model). Does anyone have an idea why? Here is the code (I tried making it self-contained and minimal):
import numpy as np
from sklearn.preprocessing import PolynomialFeatures
import torch
from torch.autograd import Variable
from maps import NamedDict
from plotting_utils import *
def index_batch(X, batch_indices, dtype):
    '''
    returns the batch indexed/sliced batch
    '''
    if len(X.shape) == 1:  # i.e. dimension (M,) just a vector
        batch_xs = torch.FloatTensor(X[batch_indices]).type(dtype)
    else:
        batch_xs = torch.FloatTensor(X[batch_indices, :]).type(dtype)
    return batch_xs

def get_batch2(X, Y, M, dtype):
    '''
    get batch for pytorch model
    '''
    # TODO fix and make it nicer, there is pytorch forum question
    X, Y = X.data.numpy(), Y.data.numpy()
    N = len(Y)
    valid_indices = np.array(range(N))
    batch_indices = np.random.choice(valid_indices, size=M, replace=False)
    batch_xs = index_batch(X, batch_indices, dtype)
    batch_ys = index_batch(Y, batch_indices, dtype)
    return Variable(batch_xs, requires_grad=False), Variable(batch_ys, requires_grad=False)

def get_sequential_lifted_mdl(nb_monomials, D_out, bias=False):
    return torch.nn.Sequential(torch.nn.Linear(nb_monomials, D_out, bias=bias))

def train_SGD(mdl, M, eta, nb_iter, logging_freq, dtype, X_train, Y_train):
    ##
    N_train, _ = tuple(X_train.size())
    #print(N_train)
    for i in range(nb_iter):
        # Forward pass: compute predicted Y using operations on Variables
        batch_xs, batch_ys = get_batch2(X_train, Y_train, M, dtype)  # [M, D], [M, 1]
        ## FORWARD PASS
        y_pred = mdl.forward(batch_xs)
        ## LOSS + Regularization
        batch_loss = (1/M)*(y_pred - batch_ys).pow(2).sum()
        ## BACKWARD PASS
        batch_loss.backward()  # Use autograd to compute the backward pass. Now w will have gradients
        ## SGD update
        for W in mdl.parameters():
            delta = eta*W.grad.data
            W.data.copy_(W.data - delta)
        ## train stats
        if i % (nb_iter/10) == 0 or i == 0:
            current_train_loss = (1/N_train)*(mdl.forward(X_train) - Y_train).pow(2).sum().data.numpy()
            print('i = {}, current_loss = {}'.format(i, current_train_loss))
        ## Manually zero the gradients after updating weights
        mdl.zero_grad()
##
logging_freq = 100
dtype = torch.FloatTensor
## SGD params
M = 3
eta = 0.0002
nb_iter = 20*1000
##
lb,ub = 0,1
f_target = lambda x: np.sin(2*np.pi*x)
N_train = 5
X_train = np.linspace(lb,ub,N_train)
Y_train = f_target(X_train)
## degree of mdl
Degree_mdl = 4
## pseudo-inverse solution
c_pinv = np.polyfit( X_train, Y_train , Degree_mdl )[::-1]
## linear mdl to train with SGD
nb_terms = c_pinv.shape[0]
mdl_sgd = get_sequential_lifted_mdl(nb_monomials=nb_terms,D_out=1, bias=False)
## Make polynomial Kernel
poly_feat = PolynomialFeatures(degree=Degree_mdl)
Kern_train = poly_feat.fit_transform(X_train.reshape(N_train,1))
Kern_train_pt, Y_train_pt = Variable(torch.FloatTensor(Kern_train).type(dtype), requires_grad=False), Variable(torch.FloatTensor(Y_train).type(dtype), requires_grad=False)
train_SGD(mdl_sgd, M,eta,nb_iter,logging_freq ,dtype, Kern_train_pt,Y_train_pt)
The error seems to hover around 2:
i = 0, current_loss = [ 2.08996224]
i = 2000, current_loss = [ 2.03536892]
i = 4000, current_loss = [ 2.02014995]
i = 6000, current_loss = [ 2.01307297]
i = 8000, current_loss = [ 2.01300406]
i = 10000, current_loss = [ 2.01125693]
i = 12000, current_loss = [ 2.01162267]
i = 14000, current_loss = [ 2.01296973]
i = 16000, current_loss = [ 2.00951076]
i = 18000, current_loss = [ 2.00967121]
which is weird because it should be able to reach zero.
I also plotted the learned function:
the code for the plotting:
##
x_horizontal = np.linspace(lb,ub,1000).reshape(1000,1)
X_plot = poly_feat.fit_transform(x_horizontal)
X_plot_pytorch = Variable( torch.FloatTensor(X_plot), requires_grad=False)
##
fig1 = plt.figure()
#plots objs
p_sgd, = plt.plot(x_horizontal, [ float(f_val) for f_val in mdl_sgd.forward(X_plot_pytorch).data.numpy() ])
p_pinv, = plt.plot(x_horizontal, np.dot(X_plot,c_pinv))
p_data, = plt.plot(X_train,Y_train,'ro')
## legend
nb_terms = c_pinv.shape[0]
legend_mdl = f'SGD solution standard parametrization, number of monomials={nb_terms}, batch-size={M}, iterations={nb_iter}, step size={eta}'
plt.legend(
    [p_sgd, p_pinv, p_data],
    [legend_mdl, f'linear algebra soln, number of monomials={nb_terms}', f'data points = {N_train}']
)
##
plt.xlabel('x'), plt.ylabel('f(x)')
plt.show()
I actually went ahead and implemented a TensorFlow version. That one does seem to train the model. I tried having both of them match by giving them the same initialization:
mdl_sgd[0].weight.data.fill_(0)
but that still didn't work. Tensorflow code:
graph = tf.Graph()
with graph.as_default():
    X = tf.placeholder(tf.float32, [None, nb_terms])
    Y = tf.placeholder(tf.float32, [None, 1])
    w = tf.Variable(tf.zeros([nb_terms, 1]))
    #w = tf.Variable( tf.truncated_normal([Degree_mdl,1],mean=0.0,stddev=1.0) )
    #w = tf.Variable( 1000*tf.ones([Degree_mdl,1]) )
    ##
    f = tf.matmul(X, w)  # [N,1] = [N,D] x [D,1]
    #loss = tf.reduce_sum(tf.square(Y - f))
    loss = tf.reduce_sum(tf.reduce_mean(tf.square(Y - f), 0))
    l2loss_tf = (1/N_train)*2*tf.nn.l2_loss(Y - f)
    ##
    learning_rate = eta
    #global_step = tf.Variable(0, trainable=False)
    #learning_rate = tf.train.exponential_decay(learning_rate=eta, global_step=global_step, decay_steps=nb_iter/2, decay_rate=1, staircase=True)
    train_step = tf.train.GradientDescentOptimizer(learning_rate=learning_rate).minimize(loss)

with tf.Session(graph=graph) as sess:
    Y_train = Y_train.reshape(N_train, 1)
    tf.global_variables_initializer().run()
    # Train
    for i in range(nb_iter):
        #if i % (nb_iter/10) == 0:
        if i % (nb_iter/10) == 0 or i == 0:
            current_loss = sess.run(fetches=loss, feed_dict={X: Kern_train, Y: Y_train})
            print(f'i = {i}, current_loss = {current_loss}')
        ## train
        batch_xs, batch_ys = get_batch(Kern_train, Y_train, M)
        sess.run(train_step, feed_dict={X: batch_xs, Y: batch_ys})
I also tried changing the initialization, but it didn't change anything, which makes sense because it shouldn't make a big difference:
mdl_sgd[0].weight.data.normal_(mean=0,std=0.001)
Original post:
https://discuss.pytorch.org/t/how-to-train-a-simple-linear-regression-model-with-sgd-in-pytorch-successfully/9620
This is how it should look:
SOLUTION:
It seems the issue was the result being returned as a column vector instead of a flat vector, and that was causing the problem; the following code fixed things:
y_pred = model(batch_xs).view(-1) # change this to "y_pred = model(batch_xs)" to get the incorrect results
loss = (y_pred - batch_ys).pow(2).mean()
which seems completely mysterious to me. Does someone know why this fixed the issue? It just seems like magic.
The bug is really subtle, but essentially it's because PyTorch follows NumPy broadcasting rules. When you subtract a column vector of shape (3, 1) and an array of shape (3,), broadcasting produces a (3, 3) matrix (note this wouldn't happen when you subtract a row vector of shape (1, 3) from a (3,) array; arrays are effectively treated as row vectors). This is really bad because it means we compute the matrix of all pairwise differences between every label and every prediction, which is nonsensical: we don't want the prediction for one point to be compared against the label of every other point in the data set, and that won't produce anything sensible.
So the answer is simply to avoid unintended NumPy broadcasting, either by reshaping things during training or before the data is fed in. Either one should work.
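A minimal demonstration of the broadcasting described above (the shapes are just illustrative):
import torch

y_pred = torch.tensor([[1.0], [2.0], [3.0]])  # column vector, shape (3, 1)
batch_ys = torch.tensor([1.0, 2.0, 3.0])      # flat vector, shape (3,)
diff = y_pred - batch_ys
print(diff.shape)                          # torch.Size([3, 3]) -- all pairwise differences
print((y_pred.view(-1) - batch_ys).shape)  # torch.Size([3])    -- what was intended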
To avoid the error, one can use this code:
def check_vectors_have_same_dimensions(Y, Y_):
    '''
    Checks that vectors Y and Y_ have the same dimensions. If they don't,
    then there might be an error that could be caused by wrong broadcasting.
    '''
    DY = tuple(Y.size())
    DY_ = tuple(Y_.size())
    if len(DY) != len(DY_):
        return True
    for i in range(len(DY)):
        if DY[i] != DY_[i]:
            return True
    return False
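A possible way to use it in the training loop (a sketch; where exactly to place the check is up to you; note the function returns True when the shapes differ):
y_pred = mdl.forward(batch_xs).view(-1)
assert not check_vectors_have_same_dimensions(y_pred, batch_ys), 'shape mismatch -> risk of a broadcasting bug'
batch_loss = (y_pred - batch_ys).pow(2).mean()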
