Error in backpropagation python neural net

Error in backpropagation python neural net - python

Darn thing just won't learn. Sometimes weights seem to become nan.
I haven't played with different numbers of hidden layers/inputs/outputs but the bug appears consistent across different sizes of hidden layer.
from __future__ import division
import numpy
import matplotlib
import random
class Net:
def __init__(self, *sizes):
sizes = list(sizes)
sizes[0] += 1
self.sizes = sizes
self.weights = [numpy.random.uniform(-1, 1, (sizes[i+1],sizes[i])) for i in range(len(sizes)-1)]
#staticmethod
def activate(x):
return 1/(1+numpy.exp(-x))
def y(self, x_):
x = numpy.concatenate(([1], numpy.atleast_1d(x_.copy())))
o = [x] #o[i] is the (activated) output of hidden layer i, "hidden layer 0" is inputs
for weight in self.weights[:-1]:
x = weight.dot(x)
x = Net.activate(x)
o.append(x)
o.append(self.weights[-1].dot(x))
return o
def __call__(self, x):
return self.y(x)[-1]
def delta(self, x, t):
o = self.y(x)
delta = [(o[-1]-t) * o[-1] * (1-o[-1])]
for i, weight in enumerate(reversed(self.weights)):
delta.append(weight.T.dot(delta[-1]) * o[-i-2] * (1-o[-i-2]))
delta.reverse()
return o, delta
def train(self, inputs, outputs, epochs=100, rate=.1):
for epoch in range(epochs):
pairs = zip(inputs, outputs)
random.shuffle(pairs)
for x, t in pairs: #shuffle? subset?
o, d = self.delta(x, t)
for layer in range(len(self.sizes)-1):
self.weights[layer] -= rate * numpy.outer(o[layer+1], d[layer])
n = Net(1, 4, 1)
x = numpy.linspace(0, 2*3.14, 10)
t = numpy.sin(x)
matplotlib.pyplot.plot(x, t, 'g')
matplotlib.pyplot.plot(x, map(n, x), 'r')
n.train(x, t)
print n.weights
matplotlib.pyplot.plot(x, map(n, x), 'b')
matplotlib.pyplot.show()

I haven't looked for a particular bug in your code, but can you please try the following things to narrow down your problem further? Otherwise it is very tedious to find the needle in the haystack.
1) Please try to use a real dataset to have an idea what to expect, e.g., MNIST, and/or standardize your data, because your weights may become NaN if they become too small.
2) Try different learning rates and plot the cost function vs. epochs to check if you are converging. It should look somewhat like this (note that I used minibatch learning and averaged the minibatch chunks for each epoch).
3) I see that you are using a sigmoid activation, your implementation is correct, but to make it numerically more stable, replace 1.0 / (1.0 + np.exp(-z)) by expit(z) from scipy.special (same function but more efficient).
4) Implement gradient checking. Here, you compare the analytical solution to a numerically approximated gradient
Or an even better approach that yields a more accurate approximation of the gradient is to compute the symmetric (or centered) difference quotient given by the two-point formula
PS: If you are interested and find it useful, I have a working vanilla NumPy neural net implemented here.

I fixed it! Thanks for all the suggestions. I worked out numeric partials and found that my o and deltas were correct, but I was multiplying the wrong ones. That's why I now take numpy.outer(d[layer+1], o[layer]) instead of numpy.outer(d[layer], o[layer+1]).
I was also skipping the update on one layer. That's why I changed for layer in range(self.hidden_layers) to for layer in range(self.hidden_layers+1).
I'll add that I caught a bug just before posting originally. My output layer delta was incorrect because my net (intentionally) doesn't activate the final outputs, but my delta was computed as though it did.
Debugged primarily with a one hidden layer, one hidden unit net, then moved to a 2 input, 3 hidden layers of 2 neurons each, 2 output model.
from __future__ import division
import numpy
import scipy
import scipy.special
import matplotlib
#from pylab import *
#numpy.random.seed(23)
def nmap(f, x):
return numpy.array(map(f, x))
class Net:
def __init__(self, *sizes):
self.hidden_layers = len(sizes)-2
self.weights = [numpy.random.uniform(-1, 1, (sizes[i+1],sizes[i])) for i in range(self.hidden_layers+1)]
#staticmethod
def activate(x):
return scipy.special.expit(x)
#return 1/(1+numpy.exp(-x))
#staticmethod
def activate_(x):
s = scipy.special.expit(x)
return s*(1-s)
def y(self, x):
o = [numpy.array(x)] #o[i] is the (activated) output of hidden layer i, "hidden layer 0" is inputs and not activated
for weight in self.weights[:-1]:
o.append(Net.activate(weight.dot(o[-1])))
o.append(self.weights[-1].dot(o[-1]))
# for weight in self.weights:
# o.append(Net.activate(weight.dot(o[-1])))
return o
def __call__(self, x):
return self.y(x)[-1]
def delta(self, x, t):
x = numpy.array(x)
t = numpy.array(t)
o = self.y(x)
#delta = [(o[-1]-t) * o[-1] * (1-o[-1])]
delta = [o[-1]-t]
for i, weight in enumerate(reversed(self.weights)):
delta.append(weight.T.dot(delta[-1]) * o[-i-2] * (1-o[-i-2]))
delta.reverse() #surely i need this
return o, delta
def train(self, inputs, outputs, epochs=1000, rate=.1):
errors = []
for epoch in range(epochs):
for x, t in zip(inputs, outputs): #shuffle? subset?
o, d = self.delta(x, t)
for layer in range(self.hidden_layers+1):
grad = numpy.outer(d[layer+1], o[layer])
self.weights[layer] -= rate * grad
return errors
def rmse(self, inputs, outputs):
return ((outputs - nmap(self, inputs))**2).sum()**.5/len(inputs)
n = Net(1, 8, 1)
X = numpy.linspace(0, 2*3.1415, 10)
T = numpy.sin(X)
Y = map(n, X)
Y = numpy.array([y[0,0] for y in Y])
matplotlib.pyplot.plot(X, T, 'g')
matplotlib.pyplot.plot(X, Y, 'r')
print 'output successful'
print n.rmse(X, T)
errors = n.train(X, T)
print 'tried to train successfully'
print n.rmse(X, T)
Y = map(n, X)
Y = numpy.array([y[0,0] for y in Y])
matplotlib.pyplot.plot(x, Y, 'b')
matplotlib.pyplot.show()

Related

How to solve 1D heat equation using neural networks in pytorch

I want to solve a 1D heat conduction using neural netwroks in pytorch. The PDE represeting the heat conduction is as follows:
du/dt = k d2u/dx2
where, k is a constant, u represent temperature and x is also the space. I also include a boundary condition like 0 temperature at x=0 and initial condition like t=0. I am quite new in the field of PINN and only can solve normal ODEs with it. The following code is tryin to solve a very simple ODE like du/dx=u with boundary condition like u=0 at x=0. The answer is simply u = u2/2. The following code is a simple PINN that solve this ODE:
import torch
import torch.nn as nn
import numpy as np
# N is a Neural Network with three layers
N = nn.Sequential(nn.Linear(1, 50), nn.Sigmoid(), nn.Linear(50,1, bias=False))
BC = 0. # Boundary condition
g_f = lambda x: BC + x * N(x) # a general function satisfying the BC
f = lambda x: x # Undolvable ODE!!! : du/dx = x ----> u = x2/2
# The loss function
def loss(x):
x.requires_grad = True
outputs = g_f(x)
grdnt = torch.autograd.grad(outputs, x, grad_outputs=torch.ones_like(outputs),
create_graph=True)[0]
return torch.mean((grdnt - f(x)) ** 2)
optimizer = torch.optim.LBFGS(N.parameters())
x = torch.Tensor(np.linspace(-4, 4, 100)[:, None])
# Run the optimizer
def closure():
optimizer.zero_grad()
l = loss(x)
l.backward()
return l
for i in range(10):
optimizer.step(closure)
x_test = np.linspace(-4, 4, 100)[:, None]
with torch.no_grad():
y_test = g_f(torch.Tensor(x_test)).numpy()
How to replicate the code with 1D heat conduction?

Got stuck while make a ML model from scratch

I have a CSV file of various persons with 8 parameters to determine whether the person is diabetic or not.
You will get the CSV file from here
I am making a model that will train and predict if a person is diabetic or not without using of third-party applications like Tensorlfow Scikitlearn etc. I am making it from scratch.
here is my code:
from numpy import genfromtxt
import numpy as np
my_data = genfromtxt('E:/diabaties.csv', delimiter=',')
X,Y = my_data[1: ,:-1], my_data[1: ,-1:] #striping data and output from my_data
def sigmoid(x):
return (1/(1+np.exp(-x)))
m = X.shape[0]
def propagate(W, b, X, Y):
#forward propagation
A = sigmoid(np.dot(X, W) + b)
cost = (- 1 / m) * np.sum(Y * np.log(A) + (1 - Y) * (np.log(1 - A)))
print(cost)
#backward propagation
dw = (1 / m) * np.dot(X.T, (A - Y))
db = (1 / m) * np.sum(A - Y)
return(dw, db, cost)
def optimizer(W,b,X,Y,number_of_iterration,learning_rate):
for i in range(number_of_iterration):
dw, db, cost = propagate(W,b,X,Y)
W = W - learning_rate*dw
b = b - learning_rate*db
return(W, b)
W = np.zeros((X.shape[1],1))
b = 0
W,b = optimizer(W, b, X, Y, 100, 0.05)
The output which is getting generated is:
It is in this link please take a look.
I have tried to -
initialize the value of W with random numbers.
spent a lot of time to debug but cannot find what I have done wrong

This short answer is that your learning rate is about 500x too big for this problem. Think about it like you're trying to pilot your W vector into a canyon in the cost function. At each step, the gradient tells you which way is down hill, but the steps you take in that direction are so big that you jump over the canyon and end up on the other side. Each time this happens, your cost goes up because you're getting farther and farther out of the canyon until after 2 iterations, it blows up.
If you replace the line
W,b = optimizer(W, b, X, Y, 100, 0.05)
with
W,b = optimizer(W, b, X, Y, 100, 0.0001)
It will converge, though still not at a reasonable speed. (Side note, there's no good way to know the learning rate you need for a given problem. You just try lower and lower values until your cost value doesn't diverge.)
The longer answer is that the problem is that your features are all on different scales.
col_means = X.mean(axis=0)
col_stds = X.std(axis=0)
print('column means: ', col_means)
print('column stdevs: ', col_stds)
yields
column means: [ 3.84505208 120.89453125 69.10546875 20.53645833 79.79947917
31.99257812 0.4718763 33.24088542]
column stdevs: [ 3.36738361 31.95179591 19.34320163 15.94182863 115.16894926
7.87902573 0.33111282 11.75257265]
This means that the variations in the numbers of the second feature are about 100x as large as the variations in the numbers of the second to last feature which in turn means that the number of the second value in your W vector will have to be tuned to about 100x the precision of the value of the second to last number in your W vector.
There are two ways to deal with this in practice. First, you could use a fancier optimizer. Instead of basic gradient descent, you could use gradient descent with momentum, but that would change all your code. The second, simpler, way is just to scale your features so they're all about the same size.
col_means = X.mean(axis=0)
col_stds = X.std(axis=0)
print('column means: ', col_means)
print('column stdevs: ', col_stds)
X -= col_means
X /= col_stds
W, b = optimizer(W, b, X, Y, 100, 1.0)
Here we subtract the mean value of each feature and divide each feature's value by its standard deviation. Sometimes newbies are thrown off by this -- "you can't change your data values, that changes the problem" -- but it makes sense if you realize that it's just another mathematical transformation, just like multiplying by W, adding b, taking the sigmoid, etc. The only catch is that you've got to make sure you do the same thing for any future data. Just like the values of your W vector are learned parameters of your model, the values of the col_means and col_stds are too, so you've got to save them like W and b and use them if you want to perform inference with this model on new data in the future.
That lets us use a much bigger learning rater of 1.0 because now all the features are approximately the same size.
Now if you try, you'll get the following output:
column means: [ 3.84505208 120.89453125 69.10546875 20.53645833 79.79947917
31.99257812 0.4718763 33.24088542]
column stdevs: [ 3.36738361 31.95179591 19.34320163 15.94182863 115.16894926
7.87902573 0.33111282 11.75257265]
0.6931471805599452
0.5902957589079032
0.5481784378158732
0.5254804089153315
...
0.4709931321295562
0.4709931263193595
0.47099312122176273
0.4709931167488006
0.470993112823447
This is what you want. Your cost function is going down at each step and at the end of your 100 iterations, the cost is stable to ~8 significant figures, so dropping it more probably won't do much.
Welcome to machine learning!

The problem is with your initialization of the weight and bias. It’s important that you don’t initialize at least the weights to zero and instead initialize them with some random small numbers. The value of A is coming out to be zero making your cost function undefined
Update:
Try something like this:
from numpy import genfromtxt
import numpy as np
# my_data = genfromtxt('E:/diabaties.csv', delimiter=',')
# X,Y = my_data[1: ,:-1], my_data[1: ,-1:] #striping data and output from my_data
# Using random data
n_points = 100
n_neurons = 5
X = np.random.rand(n_points, n_neurons) # 5 dimensional data from uniform distribution [0, 1)
Y = np.random.randint(low=0, high=2, size=(n_points, 1)) # Binary labels
def sigmoid(x):
return (1/(1+np.exp(-x)))
m = X.shape[0]
def propagate(W, b, X, Y):
#forward propagation
A = sigmoid(np.dot(X, W) + b)
cost = (- 1 / m) * np.sum(Y * np.log(A) + (1 - Y) * (np.log(1 - A)))
print(cost)
#backward propagation
dw = (1 / m) * np.dot(X.T, (A - Y))
db = (1 / m) * np.sum(A - Y)
return(dw, db, cost)
def optimizer(W,b,X,Y,number_of_iterration,learning_rate):
for i in range(number_of_iterration):
dw, db, cost = propagate(W,b,X,Y)
W = W - learning_rate*dw
b = b - learning_rate*db
return(W, b)
W = np.random.normal(loc=0, scale=0.01, size=(n_neurons, 1)) # Drawing random initialization from gaussian
b = 0
W,b = optimizer(W, b, X, Y, 100, 0.05)

Your NaN problem is simply due to np.log encountering a zero value. You always want to scale your X values. Statistical (mean, std) normalization will work, but I find min-max scaling works best. Here is code for that:
def minmax_scaler(x):
min = np.nanmin(x, axis=0)
max = np.nanmax(x, axis=0)
return (x-min)/(max-min)
Also, your neural net has only one neuron. When you call np.dot(X, W) these should be matrices of shape (cases, features) and (features, neurons) respectively. So, now your initialization code looks like this:
X = minmax_scaler(X)
neurons = 10
learning_rate = 0.05
W = np.random.random((X.shape[1], neurons))
b = np.zeros((1, neurons)) # b width to match W
I got decent convergence without needing to change the learning rate. See chart:
This is such a small dataset that, even with 10-20 neurons, you are in danger of overfitting it. Ordinarily, you would code a predict() method and an accuracy check, and then set aside some of the data to test for overfitting.

Linear Regression with python - gradient descent error

I have been trying to implement my own Linear Regression from scratch using python but have been facing a issue during the last days.
This is the code I am using :
Import modules
import pandas as pd
import numpy as np
from sklearn.datasets import load_boston
import matplotlib.pyplot as plt
Initialize parameters
def initialize_parameters(n):
w = np.zeros(n,)
b = 0.0
return w,b
Predictor/Hypothesis
def predictor(x, w, b):
return np.dot(x,w) + b
Cost function
def calculate_cost(X, y, theta, b):
m = len(y)
predictions = np.dot(X, theta)
error = predictions - y
cost = (1/2*m) * np.sum(np.power(error,2))
return cost
Gradient descent
def gradient_descent(X, W, b, y, learning_rate = 0.0001, epochs = 25):
m = len(y)
final_cost = 0
for _ in range(epochs):
predictions = predictor(X, W, b)
error = predictions - y
derivate = np.dot(error, X)
print(derivate)
W = W - (learning_rate/m) * derivate
b = b - (learning_rate/m) * error.sum()
Test run :
# Load dataset
boston = load_boston()
data = pd.DataFrame(boston.data)
data.columns = boston.feature_names
data['PRICE'] = boston.target
# Split dataset
X = data.drop(columns=['PRICE']).values
Y = data['PRICE'].values
w, b = initialize_parameters(X.shape[1])
gradient_descent(X, w, b, Y)
During the test run, I can see that the values for the derivate is growing insanely fast :
[1.41239553e+06 3.20162679e+06 3.84829686e+06 2.17737688e+04
1.81667467e+05 1.99565485e+06 2.27660208e+07 1.15045731e+06
3.50107975e+06 1.40396525e+08 5.96494458e+06 1.14447329e+08
4.25947931e+06]
[-4.33362969e+07 -9.66008831e+07 -1.16941872e+08 -6.62733008e+05
-5.50761913e+06 -6.04452389e+07 -6.90425672e+08 -3.46792848e+07
-1.06967561e+08 -4.26847914e+09 -1.80579130e+08 -3.45024565e+09
-1.29016170e+08]
...
[-2.01209195e+34 -4.47742185e+34 -5.42629282e+34 -3.07294644e+32
-2.55503032e+33 -2.80363423e+34 -3.20314565e+35 -1.60824109e+34
-4.96433806e+34 -1.98052568e+36 -8.37673498e+34 -1.60024763e+36
-5.98654489e+34]
[6.09700758e+35 1.35674093e+36 1.64426623e+36 9.31159124e+33
7.74221040e+34 8.49552585e+35 9.70611871e+36 4.87326542e+35
1.50428547e+36 6.00135600e+37 2.53830431e+36 4.84904376e+37
1.81403288e+36]
[-1.84750510e+37 -4.11117381e+37 -4.98242821e+37 -2.82158290e+35
-2.34603173e+36 -2.57430013e+37 -2.94113196e+38 -1.47668879e+37
-4.55826082e+37 -1.81852092e+39 -7.69152754e+37 -1.46934918e+39
-5.49685229e+37]
[5.59827926e+38 1.24576106e+39 1.50976712e+39 8.54991361e+36
7.10890636e+37 7.80060146e+38 8.91216919e+39 4.47463782e+38
1.38123662e+39 5.51045187e+40 2.33067389e+39 4.45239747e+40
1.66564705e+39]
[-1.69638128e+40 -3.77488445e+40 -4.57487122e+40 -2.59078061e+38
-2.15412899e+39 -2.36372529e+40 -2.70055070e+41 -1.35589732e+40
-4.18540025e+40 -1.66976797e+42 -7.06236930e+40 -1.34915808e+42
-5.04721600e+40]
And then, the gradient descent run stops before all interactions due to the high values.
At a certain point, the values form the derivate assume values as NaN.
As expected, when I try to predict a test case, I get 0.0 as output:
sample_house = [[2.29690000e-01, 0.00000000e+00, 1.05900000e+01, 0.00000000e+00, 4.89000000e-01,
6.32600000e+00, 5.25000000e+01, 4.35490000e+00, 4.00000000e+00, 2.77000000e+02,
1.86000000e+01, 3.94870000e+02, 1.09700000e+01]]
test_predict = predictor(sample_house, w, b)
test_predict
------------------------------------------------
out : array([0.])
Thanks!

Your cost function is wrong, it should be:
cost = 1/(2*m) * np.sum(np.power(error,2))
Also, try to initialize your weights as random values between 0 an 1 and scale your inputs to range 0-1.

I had the same issue which I resolved by normalizing the x values.

I think that you are making a mistake in the gradient descent algorithm. When updating the values for "W" vector it should be:
W = W - (learning_rate/m) * derivate.sum()

The learning rate is too large.
I try learning_rate = 0.000001, and it converges normally.

Why don't my 2 functions give the same results?

Below I have the code of my attempt to make a neural network with 2 inputs and 3 outputs. While the training gives good results, when I try to input the numbers, the results are way off. After I made some small changes, I observed that, even though they return the output from the function which should be the same, again, the results were different. The only explanation I can think of is that there is a bug.
The functions that I'm talking about are "train" and "result".
Here is the code:
from numpy import dot, exp, max, sum, random, array
class Network:
def __init__(self):
self.w = random.random((2,3))
def sigmoid(self, x, derivate = False):
if(derivate == True):
return x * (1 - x)
return 1 /(1 + exp(-x))
def train(self):
trainingInput = array([[0,0],[0,1],[1,0],[1,1]])
trainingOutput = array([[0,0,0],[0,1,0],[0,0,1],[1,0,0]])
n = 0
while(n < 10000):
exOutput = self.sigmoid(dot(trainingInput, self.w) - 0.1)
error = trainingOutput - exOutput
self.w += dot(trainingInput.T, error *
self.sigmoid(exOutput,True))
n += 1
return exOutput
def result(self):
trainingInput = array([[0,0],[0,1],[1,0],[1,1]])
exOutput = self.sigmoid(dot(trainingInput, self.w) - 0.1)
return exOutput
network = Network()
c = 0
d = 1
o = network.result()
output = network.train()
print(o)
print(output)

you should first train and then check the results.
If you check it before training, obviously two results will be different.
you can just once again calculate the results after training, hopefully, this will solve your bug.

T.hessian gives NotImplementedError() in theano

How it can be that it works
g_W = T.grad(cost=cost, wrt=classifier.vparamW)
whereas this
H_W=T.hessian(cost=cost, wrt=classifier.vparamW)
gives NotImplementedError()
may it be that the problem in such cost function:
-T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]), y])
Here y is the vector of class labels from 0 to n-1 and
self.p_y_given_x = T.nnet.softmax(T.dot(input, self.W) + self.b)

I am unable to reproduce this problem with the limited code that has been provided. However, here is a fully working demo of T.grad and T.hessian.
import numpy
import theano
import theano.tensor as T
x = T.matrix()
w_flat = theano.shared(numpy.random.randn(3, 2).astype(theano.config.floatX).flatten())
w = w_flat.reshape((3, 2))
cost = T.pow(theano.dot(x, w), 2).sum()
g_w = T.grad(cost=cost, wrt=[w])
h_w = T.hessian(cost=cost, wrt=[w_flat])
f = theano.function([x], outputs=g_w + h_w)
for output in f(numpy.random.randn(4, 3).astype(theano.config.floatX)):
print output.shape, '\n', output
Note that the wrt value for T.hessian needs to be a vector.

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.