Below I have the code of my attempt to make a neural network with 2 inputs and 3 outputs. While the training gives good results, when I try to input the numbers, the results are way off. After I made some small changes, I observed that, even though they return the output from the function which should be the same, again, the results were different. The only explanation I can think of is that there is a bug.
The functions that I'm talking about are "train" and "result".
Here is the code:
from numpy import dot, exp, max, sum, random, array
class Network:
def __init__(self):
self.w = random.random((2,3))
def sigmoid(self, x, derivate = False):
if(derivate == True):
return x * (1 - x)
return 1 /(1 + exp(-x))
def train(self):
trainingInput = array([[0,0],[0,1],[1,0],[1,1]])
trainingOutput = array([[0,0,0],[0,1,0],[0,0,1],[1,0,0]])
n = 0
while(n < 10000):
exOutput = self.sigmoid(dot(trainingInput, self.w) - 0.1)
error = trainingOutput - exOutput
self.w += dot(trainingInput.T, error *
self.sigmoid(exOutput,True))
n += 1
return exOutput
def result(self):
trainingInput = array([[0,0],[0,1],[1,0],[1,1]])
exOutput = self.sigmoid(dot(trainingInput, self.w) - 0.1)
return exOutput
network = Network()
c = 0
d = 1
o = network.result()
output = network.train()
print(o)
print(output)
you should first train and then check the results.
If you check it before training, obviously two results will be different.
you can just once again calculate the results after training, hopefully, this will solve your bug.
Related
A simple neural net of 2 inputs and one output without a bias, like this - doesn't seem to work.
|input1||weight1 weight2| = Z
|input2|
output = sigmoid(Z)
Whereas, it works perfectly when BIAS is added, why does it work & what is the math behind it?
|input1||weight1 weight2| = Z
|input2|
output = sigmoid(Z - BIAS)
Here's the CODE to working version with BIAS:
import numpy as np
import random as r
import sys
def sigmoid(ip, derivate=False):
if derivate:
return ip*(1-ip)
return 1.0/(1+np.exp(-1*ip))
class NeuralNet:
global sigmoid
def __init__(self):
self.inputLayers = 2
self.outputLayer = 1
self.bias = r.random()
def setup(self):
self.i = np.array([r.random(), r.random()], dtype=float).reshape(2,)
self.w = np.array([r.random(), r.random()], dtype=float).reshape(2,)
def forward_propogate(self):
self.z = self.w*self.i
self.o = sigmoid(sum(self.z)-self.bias)
def optimize_cost(self, desired):
i=0
current_cost = pow(desired - self.o, 2)
for weight in self.w:
dpdw = -1*(desired-self.o) * (sigmoid(self.o, derivate=True)) * self.i[i]
self.w[i] = self.w[i] - 2*dpdw
i+=1
#calculate dp/dB
dpdB = -1*(desired-self.o) * (sigmoid(self.o, derivate=True)) * -1
self.bias = self.bias - 2*dpdB
self.forward_propogate()
def train(self, ip, op):
self.i = np.array(ip).reshape(2,)
self.forward_propogate()
self.optimize_cost(op[0])
n = NeuralNet()
n.setup()
# while sys.stdin.read(1):
success_rate = 0
trial=0
done = False
while not done:
a = [0.1,1,0.1,1]
b = [0.1,0.1,1,1]
c = [0,0,0,1]
for i in range(len(a)):
trial +=1
n.train([a[i],b[i]],[c[i]])
if c[i] - n.o < 0.01:
success_rate +=1
print(100*success_rate/trial, "%")
if 100*success_rate/trial > 99 and trial > 4:
print(100*success_rate/trial, "%")
print("Network trained, took: {} trials".format(trial))
print("Network weights:{}, bias:{}".format(n.w, n.bias))
done = True
break
A bias is just a shift of the intercept. The NN you have set up in this example appears to be a single layer neural network with no hidden layers, which is effectively a logistic regression, which is just a linear model.
When you don't learn an intercept value, the intercept defaults to 0, so it always passes through the origin and you're just learning the slope of the line. To correctly classify the AND of your data, i.e. the top right corner at (1,1), but not any of the other points, you need a non zero intercept because there is no line that passes through the origin that will only have the top right corner on one side and the other three points on the other side.
I'm trying to solve XOR problem using neural network. For training I'm using genetic algorithm.
population size : 200
max_generations: 10000
crossover rate : 0,8
mutation rate : 0.1
number of weights : 9
activation function : sigmoid
selection method : high percentance for the ones with best fits
Code:
def crossover(self,wfather,wmother):
r = np.random.random()
if r <= self.crossover_perc:
new_weight= self.crossover_perc*wfather+(1-self.crossover_perc)*wmother
new_weight2=self.crossover_perc*wmother+(1-self.crossover_perc)*wfather
return new_weight,new_weight2
else:
return wfather,wmother
def select(self,fits):
percentuais = np.array(fits) / float(sum(fits))
vet = [percentuais[0]]
for p in percentuais[1:]:
vet.append(vet[-1] + p)
r = np.random.random()
#print(len(vet), r)
for i in range(len(vet)):
if r <= vet[i]:
return i
def mutate(self, weight):
r = np.random.random()
if r <= self.mut_perc:
mutr=np.random.randint(self.number_weights)
weight[mutr] = weight[mutr] + np.random.normal()
return weight
def activation_fuction(self, net):
return 1 / (1 + math.exp(-net))
Problem:
~5/10 tests works fine
Expected Output:
0,0 0
0,1 1
1,0 1
1,1 0
Tests:
Its inconsistent, sometimes i got four 0's, three 1's, multiple results
Could you help me find the error?
**Edit
All Code:
def create_initial_population(self):
population = np.random.uniform(-40, 40, [self.population_size, self.number_weights])
return population
def feedforward(self, inp1, inp2, weights):
bias = 1
x = self.activation_fuction(bias * weights[0] + (inp1 * weights[1]) + (inp2 * weights[2]))
x2 = self.activation_fuction(bias * weights[3] + (inp1 * weights[4]) + (inp2 * weights[5]))
out = self.activation_fuction(bias * weights[6] + (x * weights[7]) + (x2 * weights[8]))
print(inp1, inp2, out)
return out
def fitness(self, weights):
y1 = abs(0.0 - self.feedforward(0.0, 0.0, weights))
y2 = abs(1.0 - self.feedforward(0.0, 1.0, weights))
y3 = abs(1.0 - self.feedforward(1.0, 0.0, weights))
y4 = abs(0.0 - self.feedforward(1.0, 1.0, weights))
error = (y1 + y2 + y3 + y4) ** 2
# print("Error: ", 1/error)
return 1 / error
def sortpopbest(self, pop):
pop_with_fit = [(weights,self.fitness(weights)) for weights in pop]
sorted_population=sorted(pop_with_fit, key=lambda weights_fit: weights_fit[1]) #Worst->Best One
fits = []
pop = []
for i in sorted_population:
pop.append(i[0])
fits.append(i[1])
return pop,fits
def execute(self):
pop = self.create_initial_population()
for g in range(self.max_generations): # maximo de geracoes
pop, fits = self.sortpopbest(pop)
nova_pop=[]
for c in range(int(self.population_size/2)):
weights = pop[self.select(fits)]
weights2 = pop[self.select(fits)]
new_weights,new_weights2=self.crossover(weights,weights2)
new_weights=self.mutate(new_weights)
new_weights2=self.mutate(new_weights2)
#print(fits)
nova_pop.append(new_weights) # adiciona na nova_pop
nova_pop.append(new_weights2)
pop = nova_pop
print(len(fits),fits)
Some input:
XOR is a simple problem. With a few hundreds of random initialization, you should have some lucky ones that solve it immediately (if "solved" means that they output is correct after doing a threshold). This is a good test to see if your initialization and feed-forward pass is correct, without debugging the whole GA all at once. Or you chould just hand-craft the correct weights and biases, and see if that works.
Your initial weights (uniform -40...+40) are way too large. I guess for XOR this maybe okay-ish. But initial weights should be such that most neurons don't saturate, but aren't fully in the linear zone of sigmoid either.
After your implementation works, have a look at this numpy implementation of the feed-foward pass of a neural network for how to do it with less code.
I am following Andrew's Coursera course on machine learning. I am trying to build a 3 layers neural net for digit recognition in Python (784 input, 25 hidden, 10 output). However, I am unable to get the predictions (of the training data) correct (accuracy < 5% at 100 iter, accuracy not increasing with iteration).
J (the cost function) seems to be going down (see photo 1) and I have done gradient checking (before minimizing) and it seems to match to around 1e-11 (see photo 2).
I have compared the theta1 and theta2 after 100 iterations to my working matlab code (see code snippet 1 for octave and code snippet 2 for python). It seems theta1 is reasonably similar but theta2 is very different -- see code snippet 2. (I know they should differ because of the different optimisation routines. However, firstly, I have place the same initial thetas into both codes. Secondly, my reasoning is that they should start to converge, or at least get close, after 100 iterations)
The only error I see is:
-c:32: RuntimeWarning: overflow encountered in exp
when running the sigmoid during the optimising. However, I was told that this is not essential and it is normal to encounter this error during optimising? Furthermore, because it is a sigmoid, anytime the input is large, it will tend towards 1 anyways.
I have also attached my code in snippet 3. I have cut out all the other non-essential bits (like gradient checking) to make it as short as possible.
I would appreciate any help into this as I cannot even find where it is going wrong, let alone fix it. Thank you.
Photos:
J (cost function) decreasing to 1.8 after 12 iterations
Gradient checking before optimizing, they look very similar
Code snippet:
Initializing Neural Network Parameters ...
initial1
-0.0100100
-0.0771400
-0.1113800
-0.0230100
0.0547800
-0.0505500
-0.0731200
-0.0988700
0.0128000
-0.0855400
-0.1002500
-0.1137200
-0.0669300
-0.0999900
0.0084500
-0.0363200
-0.0588600
-0.0431100
-0.1133700
-0.0326300
0.0282800
0.0052400
-0.1134600
-0.0617700
0.0267600
initial2
0.0273700
0.1026000
-0.0502100
-0.0699100
0.0190600
0.1004000
0.0784600
-0.0075900
-0.0362100
0.0286200
Doing fminunc
Training Neural Network...
Iteration 100 | Cost: 6.219605e-01
theta1
-0.0099719
-0.0768462
-0.1109559
-0.0229224
0.0545714
-0.0503575
-0.0728415
-0.0984935
0.0127513
-0.0852143
-0.0998682
-0.1132869
-0.0666751
-0.0996092
0.0084178
-0.0361817
-0.0586359
-0.0429458
-0.1129383
-0.0325057
0.0281723
0.0052200
-0.1130279
-0.0615348
0.0266581
theta2
1.124918
1.603780
-1.266390
-0.848874
0.037956
-1.360841
2.145562
-1.448657
-1.262285
-1.357635
theta1_initial
[-0.01001 -0.07714 -0.11138 -0.02301 0.05478 -0.05055 -0.07312 -0.09887
0.0128 -0.08554 -0.10025 -0.11372 -0.06693 -0.09999 0.00845 -0.03632
-0.05886 -0.04311 -0.11337 -0.03263 0.02828 0.00524 -0.11346 -0.06177
0.02676]
theta2_initial
[ 0.02737 0.1026 -0.05021 -0.06991 0.01906 0.1004 0.07846 -0.00759
-0.03621 0.02862]
Doing fminunc
-c:32: RuntimeWarning: overflow encountered in exp
theta1
[-0.00997202 -0.07680716 -0.11086841 -0.02292044 0.05455335 -0.05034252
-0.07280686 -0.09842603 0.01275117 -0.08516515 -0.0997987 -0.11319546
-0.06664666 -0.09954009 0.00841804 -0.03617494 -0.05861458 -0.04293555
-0.1128474 -0.0325006 0.02816879 0.00522031 -0.1129369 -0.06151103
0.02665508]
theta2
[ 0.27954826 -0.08007496 -0.36449273 -0.22988024 0.06849659 -0.47803973
1.09023041 -0.25570559 -0.24537494 -0.40341995]
#-----------------BEGIN HEADERS-----------------
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
import csv
import scipy
#-----------------END HEADERS-----------------
#-----------------BEGIN FUNCTION 1-----------------
def randinitialize(L_in, L_out):
w = np.zeros((L_out, 1 + L_in))
epsilon_init = 0.12
w = np.random.rand(L_out, 1 + L_in) * 2 * epsilon_init - epsilon_init
return w
#-----------------END FUNCTION 1-----------------
#-----------------BEGIN FUNCTION 2-----------------
def sigmoid(lz):
g = 1.0/(1.0+np.exp(-lz))
return g
#-----------------END FUNCTION 2-----------------
#-----------------BEGIN FUNCTION 3-----------------
def sigmoidgradient(lz):
g = np.multiply(sigmoid(lz),(1-sigmoid(lz)))
return g
#-----------------END FUNCTION 3-----------------
#-----------------BEGIN FUNCTION 4-----------------
def nncostfunction(ltheta_ravel, linput_layer_size, lhidden_layer_size, lnum_labels, lx, ly, llambda_reg):
ltheta1 = np.array(np.reshape(ltheta_ravel[:lhidden_layer_size * (linput_layer_size + 1)], (lhidden_layer_size, (linput_layer_size + 1))))
ltheta2 = np.array(np.reshape(ltheta_ravel[lhidden_layer_size * (linput_layer_size + 1):], (lnum_labels, (lhidden_layer_size + 1))))
ltheta1_grad = np.zeros((np.shape(ltheta1)))
ltheta2_grad = np.zeros((np.shape(ltheta2)))
y_matrix = []
lm = np.shape(lx)[0]
eye_matrix = np.eye(lnum_labels)
for i in range(len(ly)):
y_matrix.append(eye_matrix[int(ly[i])-1,:]) #The minus one as python is zero based
y_matrix = np.array(y_matrix)
a1 = np.hstack((np.ones((lm,1)), lx)).astype(float)
z2 = sigmoid(ltheta1.dot(a1.T))
a2 = (np.concatenate((np.ones((np.shape(z2)[1], 1)), z2.T), axis=1)).astype(float)
a3 = sigmoid(ltheta2.dot(a2.T))
h = a3
J_unreg = 0
J = 0
J_unreg = (1/float(lm))*np.sum(\
-np.multiply(y_matrix,np.log(h.T))\
-np.multiply((1-y_matrix),np.log(1-h.T))\
,axis=None)
J = J_unreg + (llambda_reg/(2*float(lm)))*\
(np.sum(\
np.multiply(ltheta1[:,1:],ltheta1[:,1:])\
,axis=None)+np.sum(\
np.multiply(ltheta2[:,1:],ltheta2[:,1:])\
,axis=None))
delta3 = a3.T - y_matrix
delta2 = np.multiply((delta3.dot(ltheta2[:,1:])), (sigmoidgradient(ltheta1.dot(a1.T))).T)
cdelta2 = ((a2.T).dot(delta3)).T
cdelta1 = ((a1.T).dot(delta2)).T
ltheta1_grad = (1/float(lm))*cdelta1
ltheta2_grad = (1/float(lm))*cdelta2
theta1_hold = ltheta1
theta2_hold = ltheta2
theta1_hold[:,0] = 0;
theta2_hold[:,0] = 0;
ltheta1_grad = ltheta1_grad + (llambda_reg/float(lm))*theta1_hold;
ltheta2_grad = ltheta2_grad + (llambda_reg/float(lm))*theta2_hold;
thetagrad_ravel = np.concatenate((np.ravel(ltheta1_grad), np.ravel(ltheta2_grad)))
return (J, thetagrad_ravel)
#-----------------END FUNCTION 4-----------------
#-----------------BEGIN FUNCTION 5-----------------
def predict(ltheta1, ltheta2, x):
m, n = np.shape(x)
p = np.zeros(m)
h1 = sigmoid((np.hstack((np.ones((m,1)),x.astype(float)))).dot(ltheta1.T))
h2 = sigmoid((np.hstack((np.ones((m,1)),h1))).dot(ltheta2.T))
for i in range(0,np.shape(h2)[0]):
p[i] = np.argmax(h2[i,:])
return p
#-----------------END FUNCTION 5-----------------
## Setup the parameters you will use for this exercise
input_layer_size = 784; # 28x28 Input Images of Digits
hidden_layer_size = 25; # 25 hidden units
num_labels = 10; # 10 labels, from 0 to 9
data = []
#Reading in data, split into X and y, rewrite label 0 to 10 (for easy comparison to course)
with open('train.csv', 'rb') as csvfile:
has_header = csv.Sniffer().has_header(csvfile.read(1024))
csvfile.seek(0) # rewind
data_csv = csv.reader(csvfile, delimiter=',')
if has_header:
next(data_csv)
for row in data_csv:
data.append(row)
data = np.array(data)
x = data[:,1:]
y = data[:,0]
y = y.astype(int)
for i in range(len(y)):
if y[i] == 0:
y[i] = 10
#Set basic parameters
m, n = np.shape(x)
lambda_reg = 1.0
#Randomly initalize weights for Theta_initial
#theta1_initial = np.genfromtxt('tt1.csv', delimiter=',')
#theta2_initial = np.genfromtxt('tt2.csv', delimiter=',')
theta1_initial = randinitialize(input_layer_size, hidden_layer_size);
theta2_initial = randinitialize(hidden_layer_size, num_labels);
theta_initial_ravel = np.concatenate((np.ravel(theta1_initial), np.ravel(theta2_initial)))
#Doing optimize
fmin = scipy.optimize.minimize(fun=nncostfunction, x0=theta_initial_ravel, args=(input_layer_size, hidden_layer_size, num_labels, x, y, lambda_reg), method='L-BFGS-B', jac=True, options={'maxiter': 10, 'disp': True})
fmin
theta1 = np.array(np.reshape(fmin.x[:hidden_layer_size * (input_layer_size + 1)], (hidden_layer_size, (input_layer_size + 1))))
theta2 = np.array(np.reshape(fmin.x[hidden_layer_size * (input_layer_size + 1):], (num_labels, (hidden_layer_size + 1))))
p = predict(theta1, theta2, x);
for i in range(len(y)):
if y[i] == 10:
y[i] = 0
correct = [1 if a == b else 0 for (a, b) in zip(p,y)]
accuracy = (sum(map(int, correct)) / float(len(correct)))
print 'accuracy = {0}%'.format(accuracy * 100)
I think I have fixed the problem: it seems I messed up the index
should be:
y_matrix.append(eye_matrix[int(ly[i]),:])
instead of:
y_matrix.append(eye_matrix[int(ly[i])-1,:])
I'm trying to make a XOR gate by using 2 perceptron network but for some reason the network is not learning, when I plot the change of error in a graph the error comes to a static level and oscillates in that region.
I did not add any bias to the network at the moment.
import numpy as np
def S(x):
return 1/(1+np.exp(-x))
win = np.random.randn(2,2)
wout = np.random.randn(2,1)
eta = 0.15
# win = [[1,1], [2,2]]
# wout = [[1],[2]]
obj = [[0,0],[1,0],[0,1],[1,1]]
target = [0,1,1,0]
epoch = int(10000)
emajor = ""
for r in range(0,epoch):
for xy in range(len(target)):
tar = target[xy]
fdata = obj[xy]
fdata = S(np.dot(1,fdata))
hnw = np.dot(fdata,win)
hnw = S(np.dot(fdata,win))
out = np.dot(hnw,wout)
out = S(out)
diff = tar-out
E = 0.5 * np.power(diff,2)
emajor += str(E[0]) + ",\n"
delta_out = (out-tar)*(out*(1-out))
nindelta_out = delta_out * eta
wout_change = np.dot(nindelta_out[0], hnw)
for x in range(len(wout_change)):
change = wout_change[x]
wout[x] -= change
delta_in = np.dot(hnw,(1-hnw)) * np.dot(delta_out[0], wout)
nindelta_in = eta * delta_in
for x in range(len(nindelta_in)):
midway = np.dot(nindelta_in[x][0], fdata)
for y in range(len(win)):
win[y][x] -= midway[y]
f = open('xor.csv','w')
f.write(emajor) # python will convert \n to os.linesep
f.close() # you can omit in most cases as the destructor will call it
This is the error changing by the number of learning rounds. Is this correct? The red color line is the line I was expecting how the error should change.
Anything wrong I'm doing in the code? As I can't seem to figure out what's causing the error. Help much appreciated.
Thanks in advance
Here is a one hidden layer network with backpropagation which can be customized to run experiments with relu, sigmoid and other activations. After several experiments it was concluded that with relu the network performed better and reached convergence sooner, while with sigmoid the loss value fluctuated. This happens because, "the gradient of sigmoids becomes increasingly small as the absolute value of x increases".
import numpy as np
import matplotlib.pyplot as plt
from operator import xor
class neuralNetwork():
def __init__(self):
# Define hyperparameters
self.noOfInputLayers = 2
self.noOfOutputLayers = 1
self.noOfHiddenLayerNeurons = 2
# Define weights
self.W1 = np.random.rand(self.noOfInputLayers,self.noOfHiddenLayerNeurons)
self.W2 = np.random.rand(self.noOfHiddenLayerNeurons,self.noOfOutputLayers)
def relu(self,z):
return np.maximum(0,z)
def sigmoid(self,z):
return 1/(1+np.exp(-z))
def forward (self,X):
self.z2 = np.dot(X,self.W1)
self.a2 = self.relu(self.z2)
self.z3 = np.dot(self.a2,self.W2)
yHat = self.relu(self.z3)
return yHat
def costFunction(self, X, y):
#Compute cost for given X,y, use weights already stored in class.
self.yHat = self.forward(X)
J = 0.5*sum((y-self.yHat)**2)
return J
def costFunctionPrime(self,X,y):
# Compute derivative with respect to W1 and W2
delta3 = np.multiply(-(y-self.yHat),self.sigmoid(self.z3))
djw2 = np.dot(self.a2.T, delta3)
delta2 = np.dot(delta3,self.W2.T)*self.sigmoid(self.z2)
djw1 = np.dot(X.T,delta2)
return djw1,djw2
if __name__ == "__main__":
EPOCHS = 6000
SCALAR = 0.01
nn= neuralNetwork()
COST_LIST = []
inputs = [ np.array([[0,0]]), np.array([[0,1]]), np.array([[1,0]]), np.array([[1,1]])]
for epoch in xrange(1,EPOCHS):
cost = 0
for i in inputs:
X = i #inputs
y = xor(X[0][0],X[0][1])
cost += nn.costFunction(X,y)[0]
djw1,djw2 = nn.costFunctionPrime(X,y)
nn.W1 = nn.W1 - SCALAR*djw1
nn.W2 = nn.W2 - SCALAR*djw2
COST_LIST.append(cost)
plt.plot(np.arange(1,EPOCHS),COST_LIST)
plt.ylim(0,1)
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.title(str('Epochs: '+str(EPOCHS)+', Scalar: '+str(SCALAR)))
plt.show()
inputs = [ np.array([[0,0]]), np.array([[0,1]]), np.array([[1,0]]), np.array([[1,1]])]
print "X\ty\ty_hat"
for inp in inputs:
print (inp[0][0],inp[0][1]),"\t",xor(inp[0][0],inp[0][1]),"\t",round(nn.forward(inp)[0][0],4)
End Result:
X y y_hat
(0, 0) 0 0.0
(0, 1) 1 0.9997
(1, 0) 1 0.9997
(1, 1) 0 0.0005
The weights obtained after training were:
nn.w1
[ [-0.81781753 0.71323677]
[ 0.48803631 -0.71286155] ]
nn.w2
[ [ 2.04849235]
[ 1.40170791] ]
I found the following youtube series extremely helpful for understanding neural nets: Neural networks demystified
There is only little which I know and also that can be explained in this answer. If you want an even better understanding of neural nets, then I would suggest you to go through the following link: cs231n: Modelling one neuron
The error calculated in each epoch should be a sum total of all sum squared errors (i.e. error for every target)
import numpy as np
def S(x):
return 1/(1+np.exp(-x))
win = np.random.randn(2,2)
wout = np.random.randn(2,1)
eta = 0.15
# win = [[1,1], [2,2]]
# wout = [[1],[2]]
obj = [[0,0],[1,0],[0,1],[1,1]]
target = [0,1,1,0]
epoch = int(10000)
emajor = ""
for r in range(0,epoch):
# ***** initialize final error *****
finalError = 0
for xy in range(len(target)):
tar = target[xy]
fdata = obj[xy]
fdata = S(np.dot(1,fdata))
hnw = np.dot(fdata,win)
hnw = S(np.dot(fdata,win))
out = np.dot(hnw,wout)
out = S(out)
diff = tar-out
E = 0.5 * np.power(diff,2)
# ***** sum all errors *****
finalError += E
delta_out = (out-tar)*(out*(1-out))
nindelta_out = delta_out * eta
wout_change = np.dot(nindelta_out[0], hnw)
for x in range(len(wout_change)):
change = wout_change[x]
wout[x] -= change
delta_in = np.dot(hnw,(1-hnw)) * np.dot(delta_out[0], wout)
nindelta_in = eta * delta_in
for x in range(len(nindelta_in)):
midway = np.dot(nindelta_in[x][0], fdata)
for y in range(len(win)):
win[y][x] -= midway[y]
# ***** Save final error *****
emajor += str(finalError[0]) + ",\n"
f = open('xor.csv','w')
f.write(emajor) # python will convert \n to os.linesep
f.close() # you can omit in most cases as the destructor will call it
Darn thing just won't learn. Sometimes weights seem to become nan.
I haven't played with different numbers of hidden layers/inputs/outputs but the bug appears consistent across different sizes of hidden layer.
from __future__ import division
import numpy
import matplotlib
import random
class Net:
def __init__(self, *sizes):
sizes = list(sizes)
sizes[0] += 1
self.sizes = sizes
self.weights = [numpy.random.uniform(-1, 1, (sizes[i+1],sizes[i])) for i in range(len(sizes)-1)]
#staticmethod
def activate(x):
return 1/(1+numpy.exp(-x))
def y(self, x_):
x = numpy.concatenate(([1], numpy.atleast_1d(x_.copy())))
o = [x] #o[i] is the (activated) output of hidden layer i, "hidden layer 0" is inputs
for weight in self.weights[:-1]:
x = weight.dot(x)
x = Net.activate(x)
o.append(x)
o.append(self.weights[-1].dot(x))
return o
def __call__(self, x):
return self.y(x)[-1]
def delta(self, x, t):
o = self.y(x)
delta = [(o[-1]-t) * o[-1] * (1-o[-1])]
for i, weight in enumerate(reversed(self.weights)):
delta.append(weight.T.dot(delta[-1]) * o[-i-2] * (1-o[-i-2]))
delta.reverse()
return o, delta
def train(self, inputs, outputs, epochs=100, rate=.1):
for epoch in range(epochs):
pairs = zip(inputs, outputs)
random.shuffle(pairs)
for x, t in pairs: #shuffle? subset?
o, d = self.delta(x, t)
for layer in range(len(self.sizes)-1):
self.weights[layer] -= rate * numpy.outer(o[layer+1], d[layer])
n = Net(1, 4, 1)
x = numpy.linspace(0, 2*3.14, 10)
t = numpy.sin(x)
matplotlib.pyplot.plot(x, t, 'g')
matplotlib.pyplot.plot(x, map(n, x), 'r')
n.train(x, t)
print n.weights
matplotlib.pyplot.plot(x, map(n, x), 'b')
matplotlib.pyplot.show()
I haven't looked for a particular bug in your code, but can you please try the following things to narrow down your problem further? Otherwise it is very tedious to find the needle in the haystack.
1) Please try to use a real dataset to have an idea what to expect, e.g., MNIST, and/or standardize your data, because your weights may become NaN if they become too small.
2) Try different learning rates and plot the cost function vs. epochs to check if you are converging. It should look somewhat like this (note that I used minibatch learning and averaged the minibatch chunks for each epoch).
3) I see that you are using a sigmoid activation, your implementation is correct, but to make it numerically more stable, replace 1.0 / (1.0 + np.exp(-z)) by expit(z) from scipy.special (same function but more efficient).
4) Implement gradient checking. Here, you compare the analytical solution to a numerically approximated gradient
Or an even better approach that yields a more accurate approximation of the gradient is to compute the symmetric (or centered) difference quotient given by the two-point formula
PS: If you are interested and find it useful, I have a working vanilla NumPy neural net implemented here.
I fixed it! Thanks for all the suggestions. I worked out numeric partials and found that my o and deltas were correct, but I was multiplying the wrong ones. That's why I now take numpy.outer(d[layer+1], o[layer]) instead of numpy.outer(d[layer], o[layer+1]).
I was also skipping the update on one layer. That's why I changed for layer in range(self.hidden_layers) to for layer in range(self.hidden_layers+1).
I'll add that I caught a bug just before posting originally. My output layer delta was incorrect because my net (intentionally) doesn't activate the final outputs, but my delta was computed as though it did.
Debugged primarily with a one hidden layer, one hidden unit net, then moved to a 2 input, 3 hidden layers of 2 neurons each, 2 output model.
from __future__ import division
import numpy
import scipy
import scipy.special
import matplotlib
#from pylab import *
#numpy.random.seed(23)
def nmap(f, x):
return numpy.array(map(f, x))
class Net:
def __init__(self, *sizes):
self.hidden_layers = len(sizes)-2
self.weights = [numpy.random.uniform(-1, 1, (sizes[i+1],sizes[i])) for i in range(self.hidden_layers+1)]
#staticmethod
def activate(x):
return scipy.special.expit(x)
#return 1/(1+numpy.exp(-x))
#staticmethod
def activate_(x):
s = scipy.special.expit(x)
return s*(1-s)
def y(self, x):
o = [numpy.array(x)] #o[i] is the (activated) output of hidden layer i, "hidden layer 0" is inputs and not activated
for weight in self.weights[:-1]:
o.append(Net.activate(weight.dot(o[-1])))
o.append(self.weights[-1].dot(o[-1]))
# for weight in self.weights:
# o.append(Net.activate(weight.dot(o[-1])))
return o
def __call__(self, x):
return self.y(x)[-1]
def delta(self, x, t):
x = numpy.array(x)
t = numpy.array(t)
o = self.y(x)
#delta = [(o[-1]-t) * o[-1] * (1-o[-1])]
delta = [o[-1]-t]
for i, weight in enumerate(reversed(self.weights)):
delta.append(weight.T.dot(delta[-1]) * o[-i-2] * (1-o[-i-2]))
delta.reverse() #surely i need this
return o, delta
def train(self, inputs, outputs, epochs=1000, rate=.1):
errors = []
for epoch in range(epochs):
for x, t in zip(inputs, outputs): #shuffle? subset?
o, d = self.delta(x, t)
for layer in range(self.hidden_layers+1):
grad = numpy.outer(d[layer+1], o[layer])
self.weights[layer] -= rate * grad
return errors
def rmse(self, inputs, outputs):
return ((outputs - nmap(self, inputs))**2).sum()**.5/len(inputs)
n = Net(1, 8, 1)
X = numpy.linspace(0, 2*3.1415, 10)
T = numpy.sin(X)
Y = map(n, X)
Y = numpy.array([y[0,0] for y in Y])
matplotlib.pyplot.plot(X, T, 'g')
matplotlib.pyplot.plot(X, Y, 'r')
print 'output successful'
print n.rmse(X, T)
errors = n.train(X, T)
print 'tried to train successfully'
print n.rmse(X, T)
Y = map(n, X)
Y = numpy.array([y[0,0] for y in Y])
matplotlib.pyplot.plot(x, Y, 'b')
matplotlib.pyplot.show()