I'm trying to build an XOR gate using a 2-perceptron network, but for some reason the network is not learning: when I plot the error over the training rounds, it settles at a fixed level and just oscillates around it.
I did not add any bias to the network at the moment.
import numpy as np
def S(x):
    """Return the logistic sigmoid ``1 / (1 + exp(-x))`` of ``x``.

    ``x`` is negated and handed to ``np.exp``, so array input is processed
    element-wise.
    """
    decay = np.exp(-x)
    return 1 / (1 + decay)
# Train a bias-free 2-2-1 sigmoid network on XOR, one sample at a time,
# and dump the per-sample squared error to xor.csv.
win = np.random.randn(2,2)   # input -> hidden weights
wout = np.random.randn(2,1)  # hidden -> output weights
eta = 0.15                   # learning rate
# win = [[1,1], [2,2]]
# wout = [[1],[2]]
obj = [[0,0],[1,0],[0,1],[1,1]]
target = [0,1,1,0]
epoch = int(10000)
emajor = ""  # accumulates one "error,\n" CSV row per presented sample
for r in range(0,epoch):
    for xy in range(len(target)):
        tar = target[xy]
        fdata = obj[xy]
        # The raw 0/1 inputs are themselves squashed through the sigmoid.
        fdata = S(np.dot(1,fdata))
        # This first assignment is overwritten by the next line.
        hnw = np.dot(fdata,win)
        hnw = S(np.dot(fdata,win))
        out = np.dot(hnw,wout)
        out = S(out)
        diff = tar-out
        E = 0.5 * np.power(diff,2)
        emajor += str(E[0]) + ",\n"
        # Output delta: error times the sigmoid derivative out*(1-out).
        delta_out = (out-tar)*(out*(1-out))
        nindelta_out = delta_out * eta
        wout_change = np.dot(nindelta_out[0], hnw)
        for x in range(len(wout_change)):
            change = wout_change[x]
            wout[x] -= change
        # NOTE(review): hnw and (1-hnw) are both 1-D here, so np.dot collapses
        # them into a single scalar (sum of h*(1-h) over the hidden units)
        # rather than the per-unit derivative h*(1-h). Also, wout has already
        # been modified by the loop above when it is read here.
        delta_in = np.dot(hnw,(1-hnw)) * np.dot(delta_out[0], wout)
        nindelta_in = eta * delta_in
        for x in range(len(nindelta_in)):
            # Step for every weight feeding hidden unit x.
            midway = np.dot(nindelta_in[x][0], fdata)
            for y in range(len(win)):
                win[y][x] -= midway[y]
f = open('xor.csv','w')
f.write(emajor) # python will convert \n to os.linesep
f.close() # you can omit in most cases as the destructor will call it
This is the error changing by the number of learning rounds. Is this correct? The red color line is the line I was expecting how the error should change.
Anything wrong I'm doing in the code? As I can't seem to figure out what's causing the error. Help much appreciated.
Thanks in advance
Here is a one hidden layer network with backpropagation which can be customized to run experiments with relu, sigmoid and other activations. After several experiments it was concluded that with relu the network performed better and reached convergence sooner, while with sigmoid the loss value fluctuated. This happens because, "the gradient of sigmoids becomes increasingly small as the absolute value of x increases".
import numpy as np
import matplotlib.pyplot as plt
from operator import xor
class neuralNetwork():
    """Bias-free 2-2-1 feed-forward network whose forward pass uses ReLU.

    ``forward`` caches the intermediate values (``z2``, ``a2``, ``z3``) and
    ``costFunction`` caches ``yHat``; ``costFunctionPrime`` reads those cached
    attributes, so it must be called after ``costFunction`` for the same
    sample.
    """

    def __init__(self):
        # Layer sizes.
        self.noOfInputLayers = 2
        self.noOfOutputLayers = 1
        self.noOfHiddenLayerNeurons = 2
        # Weights drawn uniformly from [0, 1): W1 first, then W2.
        self.W1 = np.random.rand(self.noOfInputLayers,
                                 self.noOfHiddenLayerNeurons)
        self.W2 = np.random.rand(self.noOfHiddenLayerNeurons,
                                 self.noOfOutputLayers)

    def relu(self, z):
        """Element-wise ``max(0, z)``."""
        return np.maximum(0, z)

    def sigmoid(self, z):
        """Element-wise logistic function ``1 / (1 + exp(-z))``."""
        decay = np.exp(-z)
        return 1 / (1 + decay)

    def forward(self, X):
        """Propagate ``X`` through both layers and return the output."""
        self.z2 = np.dot(X, self.W1)
        self.a2 = self.relu(self.z2)
        self.z3 = np.dot(self.a2, self.W2)
        return self.relu(self.z3)

    def costFunction(self, X, y):
        """Return half the summed squared error for ``X`` against ``y``.

        Uses the weights stored on the instance and stores the prediction in
        ``self.yHat``.
        """
        self.yHat = self.forward(X)
        residual = y - self.yHat
        return 0.5 * sum(residual ** 2)

    def costFunctionPrime(self, X, y):
        """Return the update directions ``(djw1, djw2)`` for W1 and W2.

        NOTE(review): the error terms are scaled by ``sigmoid(z)`` although
        ``forward`` applies ReLU; this is kept exactly as written.
        """
        output_error = -(y - self.yHat)
        delta3 = np.multiply(output_error, self.sigmoid(self.z3))
        djw2 = np.dot(self.a2.T, delta3)
        back_signal = np.dot(delta3, self.W2.T)
        delta2 = back_signal * self.sigmoid(self.z2)
        djw1 = np.dot(X.T, delta2)
        return djw1, djw2
if __name__ == "__main__":
    # Train the network on the four XOR samples with plain per-sample
    # gradient descent, plot the summed loss per epoch, then print the
    # learned truth table.
    EPOCHS = 6000
    SCALAR = 0.01  # learning rate
    nn = neuralNetwork()
    COST_LIST = []
    inputs = [np.array([[0, 0]]), np.array([[0, 1]]),
              np.array([[1, 0]]), np.array([[1, 1]])]
    # range(1, EPOCHS) runs EPOCHS - 1 epochs, matching the x-axis below.
    for epoch in range(1, EPOCHS):
        cost = 0
        for X in inputs:
            y = xor(X[0][0], X[0][1])
            # costFunction must run first: it caches the forward pass that
            # costFunctionPrime reads.
            cost += nn.costFunction(X, y)[0]
            djw1, djw2 = nn.costFunctionPrime(X, y)
            nn.W1 = nn.W1 - SCALAR * djw1
            nn.W2 = nn.W2 - SCALAR * djw2
        COST_LIST.append(cost)
    plt.plot(np.arange(1, EPOCHS), COST_LIST)
    plt.ylim(0, 1)
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.title('Epochs: {}, Scalar: {}'.format(EPOCHS, SCALAR))
    plt.show()
    print("X\ty\ty_hat")
    for inp in inputs:
        # Plain ints keep the printed tuple as "(0, 1)" on every NumPy version.
        first, second = int(inp[0][0]), int(inp[0][1])
        print((first, second), "\t", xor(first, second), "\t",
              round(float(nn.forward(inp)[0][0]), 4))
End Result:
X y y_hat
(0, 0) 0 0.0
(0, 1) 1 0.9997
(1, 0) 1 0.9997
(1, 1) 0 0.0005
The weights obtained after training were:
nn.w1
[ [-0.81781753 0.71323677]
[ 0.48803631 -0.71286155] ]
nn.w2
[ [ 2.04849235]
[ 1.40170791] ]
I found the following youtube series extremely helpful for understanding neural nets: Neural networks demystified
My own knowledge is limited, and there is only so much that can be explained in a single answer. If you want a deeper understanding of neural nets, I suggest going through the following link: cs231n: Modelling one neuron
The error calculated in each epoch should be a sum total of all sum squared errors (i.e. error for every target)
import numpy as np
def S(x):
    """Logistic sigmoid of ``x``; element-wise when ``x`` is an array."""
    denominator = 1 + np.exp(-x)
    return 1 / denominator
# Train a bias-free 2-2-1 sigmoid network on XOR with per-sample gradient
# descent and write the summed squared error of every epoch to xor.csv.
win = np.random.randn(2, 2)   # input -> hidden weights
wout = np.random.randn(2, 1)  # hidden -> output weights
eta = 0.15                    # learning rate
obj = [[0, 0], [1, 0], [0, 1], [1, 1]]
target = [0, 1, 1, 0]
epoch = int(10000)
epoch_errors = []
for r in range(0, epoch):
    # ***** initialize final error *****
    finalError = 0
    for tar, sample in zip(target, obj):
        # Forward pass. The raw inputs are squashed through the sigmoid too.
        fdata = S(np.dot(1, sample))   # shape (2,)
        hnw = S(np.dot(fdata, win))    # hidden activations, shape (2,)
        out = S(np.dot(hnw, wout))     # network output, shape (1,)
        diff = tar - out
        E = 0.5 * np.power(diff, 2)
        # ***** sum all errors *****
        finalError += E
        # Backward pass. Both deltas are computed before any weight changes
        # so the hidden delta sees the same wout that produced `out`.
        delta_out = (out - tar) * (out * (1 - out))
        # Element-wise sigmoid derivative per hidden unit. np.dot(hnw, 1-hnw)
        # would collapse it into one scalar shared by both units.
        delta_in = hnw * (1 - hnw) * (delta_out[0] * wout[:, 0])
        wout -= eta * delta_out[0] * hnw.reshape(-1, 1)
        # outer[y][x] = fdata[y] * delta_in[x]: the step for win[y][x].
        win -= eta * np.outer(fdata, delta_in)
    # ***** Save final error *****
    epoch_errors.append(str(finalError[0]))
# Build the CSV text once instead of growing a string inside the loop.
emajor = "".join(err + ",\n" for err in epoch_errors)
with open('xor.csv', 'w') as f:
    f.write(emajor)  # python will convert \n to os.linesep
Related
I am trying to create my own RNN with PyTorch and am following some simple tutorials on the .backward function. When I run my code, I get "None" as the result for .grad and I cannot figure out why. From this post, it looks like it may be because I am setting up the inputs as tensors, so they are getting detached. If so, I am not sure how to correct this while ensuring they can still be multiplied in the matrices.
import math
import numpy as np
import torch
from collections import deque
# Builds a toy "predict the next integer" data set, initializes RNN weight
# matrices with NumPy, wraps them as tensors, unrolls the recurrence over the
# training rows while accumulating a squared-error loss, and finally inspects
# the gradient of Wxh_t.
#set up the inputs
lists = deque()
for i in range(0, 13, 1):
    lists.append(range(i, i + 4))
x = np.array(lists)
# set up the y vector
y = []
for i in range(len(x)):
    # target is the last value of the row plus one
    y.append((x[i,3])+1)
#set up the validation input
lists = deque()
for i in range(13, 19, 1):
    lists.append(range(i, i + 4))
x_val = np.array(lists)
#set up the validation y vector
y_val = []
for i in range(len(x_val)):
    y_val.append((x_val[i,3])+1)
#set params
input_dimension = len(x[0])
hidden_dimension = 100
output_dimension = 1
#set up the weighted matrices
np.random.seed(99)
Wxh = np.random.uniform(0, 1, (hidden_dimension, input_dimension)) # weights from input to hidden layer
Whh = np.random.uniform(0, 1, (hidden_dimension, hidden_dimension)) # weights inside cell - recurrant
Why = np.random.uniform(0, 1, (output_dimension, hidden_dimension)) # weights from hidden to output layer
#set up the input tensor
xt = torch.tensor(x[[0]], dtype=torch.float) #do I want to keep a float here? or force an int? think: float - understand why
# NOTE(review): requires_grad is set on the tensor created by torch.tensor,
# but the name is bound to the result of .float(). Presumably that is a new,
# non-leaf tensor when the dtype changes (np.random.uniform yields float64),
# in which case .grad on it stays None -- confirm against the autograd docs.
Wxh_t = torch.tensor(Wxh, requires_grad = True).float()
Whh_t = torch.tensor(Whh, requires_grad = True).float()
Why_t = torch.tensor(Why, requires_grad = True).float()
loss = 0
for i in range(len(x)):
    xt = torch.tensor(x[[i]], dtype=torch.float)
    print(xt)
    current_affine_3 = torch.mm(xt,Wxh_t.T)
    # NOTE(review): h_prev_t is read here, but its only visible assignment is
    # at the bottom of this loop body; no initial hidden state is defined in
    # this snippet.
    hidden_t = torch.mm(h_prev_t, Whh_t.T)
    ht_t = torch.tanh(current_affine_3 + hidden_t)
    y_hat_t = torch.mm(ht_t, Why_t.T)
    loss += (y[i] - y_hat_t)**2
    print(y[i])
    print(loss)
    # carry the hidden state into the next time step
    h_prev_t = ht_t
# NOTE(review): this only references the bound method; without call
# parentheses no backward pass is run.
loss.backward
print(Wxh_t.grad)
loss.backward returns <bound method Tensor.backward of tensor([[18672.0215]], grad_fn=<AddBackward0>)>
And if I view the weighted matrices, I notice something different than the tutorials. Instead of grad_fn=<AddBackward0> after calculating with a requires_grad = True tensor, I get grad_fn=<MmBackward0>. I assume it's because I am using torch.mm, but I'm not sure if this matters. This is an example of some code I was using for a tutorial:
# Minimal autograd walkthrough: y = w * x + b, with gradients tracked for w
# and b only.
x = torch.tensor(2., requires_grad=False)
w = torch.tensor(3., requires_grad=True)
b = torch.tensor(1., requires_grad=True)
for label, tensor in (("x:", x), ("w:", w), ("b:", b)):
    print(label, tensor)
# define a function of the above defined tensors
y = w * x + b
print("y:", y)
# take the backward() for y
y.backward()
# print the gradients w.r.t. above x, w, and b
for label, tensor in (("x.grad:", x), ("w.grad:", w), ("b.grad:", b)):
    print(label, tensor.grad)
Thank you!
I'm using the L-BFGS-B optimizer to find the minima of a function. This will help me calculate sharpness for the function. However, I'm not sure if this following message is considered a normal message (i.e. Is there something wrong with my program or is this message typical?) See below:
RUNNING THE L-BFGS-B CODE
* * *
Machine precision = 2.220D-16
N = 28149514 M = 10
At X0 0 variables are exactly at the bounds
^[[C
At iterate 0 f= -3.59325D+00 |proj g|= 2.10249D-03
At iterate 1 f= -2.47853D+01 |proj g|= 4.20499D-03
Bad direction in the line search;
refresh the lbfgs memory and restart the iteration.
At iterate 2 f= -2.53202D+01 |proj g|= 4.17686D-03
At iterate 3 f= -2.53202D+01 |proj g|= 4.17686D-03
* * *
Tit = total number of iterations
Tnf = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip = number of BFGS updates skipped
Nact = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F = final function value
* * *
N Tit Tnf Tnint Skip Nact Projg F
***** 3 43 ****** 0 ***** 4.177D-03 -2.532D+01
F = -25.320247650146484
CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH
Warning: more than 10 function and gradient
evaluations in the last line search. Termination
may possibly be caused by a bad search direction.
I got the following sharpness anyway which is relatively consistent with the paper I'm trying to reproduce: It's just that I'm a bit concerned with the above message.
tensor(473.0201)
Here is my code for computing sharpness:
def get_sharpness(data_loader, model, criterion, epsilon, manifolds=0):
    """Estimate the sharpness of the loss around the model's current weights.

    The negated cross entropy is minimized with L-BFGS-B (at most 10
    iterations) inside a box around the current parameter vector ``x0``, and
    the relative increase of the loss is returned as a percentage:
    ``(f_x - f_x0) / (1 + f_x0) * 100``.

    Args:
        data_loader, model, criterion: forwarded unchanged to
            ``get_minus_cross_entropy``.
        epsilon: relative box size. In the full space the bounds are
            ``x0 +/- epsilon * (|x0| + 1)``.
        manifolds: ``0`` searches the full parameter space. A positive value
            searches a random subspace of that many dimensions.

    Returns:
        The sharpness value (also printed).

    Side effects:
        After the search, every parameter of ``model`` is reset to the values
        captured in ``x0`` as float32, on the device that parameter was
        already on.
    """
    # Flatten all parameters with one concatenation. Growing the vector with
    # a torch.cat per parameter re-copies it every time.
    x0 = torch.cat([p.data.view(-1) for p in model.parameters()])
    x0 = x0.cpu().numpy()

    # Loss at the current point. The helper returns the negated loss.
    f_x0, _ = get_minus_cross_entropy(x0, data_loader, model, criterion)
    f_x0 = -f_x0
    logging.info('min loss f_x0 = {loss:.4f}'.format(loss=f_x0))

    # Set up the box-constrained minimization of the negated loss.
    if 0 == manifolds:
        x_min = np.reshape(x0 - epsilon * (np.abs(x0) + 1), (x0.shape[0], 1))
        x_max = np.reshape(x0 + epsilon * (np.abs(x0) + 1), (x0.shape[0], 1))
        bounds = np.concatenate([x_min, x_max], 1)

        def func(x):
            return get_minus_cross_entropy(x, data_loader, model, criterion,
                                           training=True)

        init_guess = x0
    else:
        warnings.warn("Small manifolds may not be able to explore the space.")
        assert(manifolds <= x0.shape[0])
        A_plus = np.random.rand(manifolds, x0.shape[0]) * 2. - 1.
        # Normalize each row of A_plus to unit length (the norm is taken
        # along axis=1).
        A_plus_norm = np.linalg.norm(A_plus, axis=1)
        A_plus = A_plus / np.reshape(A_plus_norm, (manifolds, 1))
        A = np.linalg.pinv(A_plus)
        abs_bound = epsilon * (np.abs(np.dot(A_plus, x0)) + 1)
        abs_bound = np.reshape(abs_bound, (abs_bound.shape[0], 1))
        bounds = np.concatenate([-abs_bound, abs_bound], 1)

        def func(y):
            # Evaluate at x0 + A y and project the gradient back onto y.
            floss, fg = get_minus_cross_entropy(
                x0 + np.dot(A, y), data_loader, model, criterion,
                training=True)
            return floss, np.dot(np.transpose(A), fg)

        init_guess = np.zeros(manifolds)

    minimum_x, f_x, d = sciopt.fmin_l_bfgs_b(
        func, init_guess, maxiter=10, bounds=list(bounds), disp=1, iprint=101)
    f_x = -f_x
    logging.info('max loss f_x = {loss:.4f}'.format(loss=f_x))
    sharpness = (f_x - f_x0) / (1 + f_x0) * 100
    print(sharpness)

    # Recover the model. NOTE(review): this assumes the objective evaluations
    # overwrote the parameters; the helper is not visible here. Each slice is
    # placed on the parameter's own device, so a CPU model stays on the CPU.
    flat = torch.from_numpy(x0).float()
    x_start = 0
    for p in model.parameters():
        psize = p.data.size()
        peltnum = p.data.numel()
        x_part = flat[x_start:x_start + peltnum]
        p.data = x_part.view(psize).to(p.data.device)
        x_start += peltnum
    return sharpness
Which was taken from this repository:
https://github.com/wenwei202/smoothout/blob/master/measure_sharpness.py
I'm concerned about exact accuracy.
First, l-bfgs-b will only give a global minimum for a convex function.
the message
CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH
is the normal convergence message.
The warning you are getting says that there are a lot of function/gradient evaluations in the line search - this can often happen when you use l-bfgs-b on non convex functions. So if the thing you're minimizing is non convex (and it seems like it might be just by glancing at the code), I would say this is normal.
A simple neural net of 2 inputs and one output without a bias, like this - doesn't seem to work.
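$$Z = \begin{bmatrix} \text{weight1} & \text{weight2} \end{bmatrix} \begin{bmatrix} \text{input1} \\ \text{input2} \end{bmatrix} = \text{weight1}\cdot\text{input1} + \text{weight2}\cdot\text{input2}$$
$$\text{output} = \operatorname{sigmoid}(Z)$$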
Whereas, it works perfectly when BIAS is added, why does it work & what is the math behind it?
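$$Z = \begin{bmatrix} \text{weight1} & \text{weight2} \end{bmatrix} \begin{bmatrix} \text{input1} \\ \text{input2} \end{bmatrix} = \text{weight1}\cdot\text{input1} + \text{weight2}\cdot\text{input2}$$
$$\text{output} = \operatorname{sigmoid}(Z - \text{BIAS})$$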
Here's the CODE to working version with BIAS:
import numpy as np
import random as r
import sys
def sigmoid(ip, derivate=False):
    """Logistic function, or its derivative in terms of the output.

    With ``derivate=True`` the argument is treated as an already-computed
    sigmoid output and ``ip * (1 - ip)`` is returned.
    """
    if not derivate:
        return 1.0 / (1 + np.exp(-1 * ip))
    return ip * (1 - ip)


class NeuralNet:
    """Single sigmoid neuron with two inputs and a subtractive bias."""

    global sigmoid

    def __init__(self):
        self.inputLayers = 2
        self.outputLayer = 1
        self.bias = r.random()

    def setup(self):
        """Draw random inputs first, then random weights, each of shape (2,)."""
        self.i = np.array([r.random(), r.random()], dtype=float).reshape(2,)
        self.w = np.array([r.random(), r.random()], dtype=float).reshape(2,)

    def forward_propogate(self):
        """Store the weighted inputs in ``z`` and the neuron output in ``o``."""
        self.z = self.w * self.i
        self.o = sigmoid(sum(self.z) - self.bias)

    def optimize_cost(self, desired):
        """Take one gradient step (factor 2) on weights and bias, then re-run
        the forward pass."""
        # Part of the gradient that weights and bias share:
        # -(desired - o) * o * (1 - o).
        shared = -1 * (desired - self.o) * (sigmoid(self.o, derivate=True))
        for idx in range(len(self.w)):
            self.w[idx] = self.w[idx] - 2 * (shared * self.i[idx])
        # The bias enters the pre-activation with a minus sign, hence * -1.
        self.bias = self.bias - 2 * (shared * -1)
        self.forward_propogate()

    def train(self, ip, op):
        """Present one sample ``ip`` with target ``op[0]`` and update."""
        self.i = np.array(ip).reshape(2,)
        self.forward_propogate()
        self.optimize_cost(op[0])
# Train the single neuron on an AND-style truth table (inputs 0.1 / 1) until
# the cumulative success rate exceeds 99%.
n = NeuralNet()
n.setup()
# while sys.stdin.read(1):
success_rate = 0
trial=0
done = False
while not done:
    # Columns of the truth table: inputs a, b and the desired output c.
    a = [0.1,1,0.1,1]
    b = [0.1,0.1,1,1]
    c = [0,0,0,1]
    for i in range(len(a)):
        trial +=1
        n.train([a[i],b[i]],[c[i]])
        # NOTE(review): the difference is signed, so any output larger than
        # the target also counts as a success; confirm whether an absolute
        # difference was intended.
        if c[i] - n.o < 0.01:
            success_rate +=1
        print(100*success_rate/trial, "%")
        # The rate is cumulative over all trials since the start.
        if 100*success_rate/trial > 99 and trial > 4:
            print(100*success_rate/trial, "%")
            print("Network trained, took: {} trials".format(trial))
            print("Network weights:{}, bias:{}".format(n.w, n.bias))
            done = True
            break
A bias is just a shift of the intercept. The NN you have set up in this example appears to be a single layer neural network with no hidden layers, which is effectively a logistic regression, which is just a linear model.
When you don't learn an intercept value, the intercept defaults to 0, so it always passes through the origin and you're just learning the slope of the line. To correctly classify the AND of your data, i.e. the top right corner at (1,1), but not any of the other points, you need a non zero intercept because there is no line that passes through the origin that will only have the top right corner on one side and the other three points on the other side.
I've tried to make a script in python able to recognize handwritten digits, using this data set: http://deeplearning.net/data/mnist/mnist.pkl.gz.
More information about this problem and about the algorithm that I'm trying to implement can be found at this link: http://neuralnetworksanddeeplearning.com/chap1.html
I've implemented a classification algorithm using a perceptron for each digit.
import cPickle, gzip
import numpy as np
# Load the three MNIST splits. The context manager closes the archive even if
# unpickling raises.
# NOTE: unpickling can execute arbitrary code; only load an archive obtained
# from a trusted source.
with gzip.open('mnist.pkl.gz', 'rb') as f:
    train_set, valid_set, test_set = cPickle.load(f)
def activation(x):
    """Step activation: 1 when ``x`` is strictly positive, otherwise 0."""
    return 1 if x > 0 else 0
# Train one perceptron (weight row) per digit on the training split, then
# classify each test image by the largest raw weighted sum.
bias = 0.5
learningRate = 0.01
images = train_set[0]
targets = train_set[1]
weights = np.random.uniform(0,1,(10,784))
for nr in range(0,10):
    for i in range(0,49999):
        x = images[i]
        t = targets[i]
        z = np.dot(weights[nr],x) + bias
        output = activation(z)
        # NOTE(review): t is used directly as the desired output while
        # `output` is 0 or 1. If targets holds digit labels, perceptron `nr`
        # presumably needs a 0/1 target such as (t == nr) -- confirm.
        weights[nr] = weights[nr] + (t - output) * x * learningRate
        # NOTE(review): a single bias is shared by all ten perceptrons and is
        # not added when scoring the test images below.
        bias = bias + (t - output) * learningRate
images = test_set[0]
targets = test_set[1]
OK = 0
# The duplicated `range` keyword that made this line a syntax error is removed.
for i in range(0, 10000):
    vec = []
    for j in range(0,10):
        vec.append(np.dot(weights[j],images[i]))
    if np.argmax(vec) == targets[i]:
        OK = OK + 1
print("The network recognized " + str(OK) +'/'+ "10000")
I usually recognized 10% of the digits, which means that my algorithm is doing nothing, is the same as a random algorithm.
Even though I know that this problem is popular and that I can easily find another solution on the web, I'm still asking for help identifying the mistakes in my code.
Maybe I've initialized the values of learningRate, bias and weights wrongly.
thanks to #Kevinj22 and the other ones, I was able to solve this problem in the end.
import cPickle, gzip
import numpy as np
# Load the three MNIST splits. The `with` block closes the archive even when
# unpickling fails.
# NOTE: unpickling can execute arbitrary code; only load an archive obtained
# from a trusted source.
with gzip.open('mnist.pkl.gz', 'rb') as f:
    train_set, valid_set, test_set = cPickle.load(f)
def activation(x):
    """Threshold unit: returns 1 for ``x > 0`` and 0 for everything else."""
    fired = x > 0
    return 1 if fired else 0
# One-vs-rest perceptrons: weight row `nr` is trained to fire only for digit
# `nr`; test images are labelled by the row with the largest weighted sum.
learningRate = 0.01
images = train_set[0]
targets = train_set[1]
weights = np.random.uniform(0, 1, (10, 784))
for nr in range(0, 10):
    for i in range(0, 50000):
        x = images[i]
        t = targets[i]
        z = np.dot(weights[nr], x)
        output = activation(z)
        # Binary target for this perceptron: is the sample digit `nr`?
        target = 1 if nr == t else 0
        adjust = np.multiply((target - output) * learningRate, x)
        weights[nr] = np.add(weights[nr], adjust)

images = test_set[0]
targets = test_set[1]
OK = 0
for i in range(0, 10000):
    vec = [np.dot(weights[j], images[i]) for j in range(0, 10)]
    if np.argmax(vec) == targets[i]:
        OK = OK + 1
print("The network recognized {}/10000".format(OK))
Here is my updated code. I didn't introduce loss computation in my first attempt. I also got rid of the bias because I didn't find it useful in my implementation.
I ran this piece of code 10 times and got an average accuracy of 88%.
I am following Andrew's Coursera course on machine learning. I am trying to build a 3 layers neural net for digit recognition in Python (784 input, 25 hidden, 10 output). However, I am unable to get the predictions (of the training data) correct (accuracy < 5% at 100 iter, accuracy not increasing with iteration).
J (the cost function) seems to be going down (see photo 1) and I have done gradient checking (before minimizing) and it seems to match to around 1e-11 (see photo 2).
I have compared theta1 and theta2 after 100 iterations to my working MATLAB code (see code snippet 1 for Octave and code snippet 2 for Python). theta1 seems reasonably similar, but theta2 is very different -- see code snippet 2. (I know they should differ because of the different optimisation routines. However, firstly, I have placed the same initial thetas into both codes. Secondly, my reasoning is that they should start to converge, or at least get close, after 100 iterations.)
The only error I see is:
-c:32: RuntimeWarning: overflow encountered in exp
when running the sigmoid during the optimising. However, I was told that this is not essential and it is normal to encounter this error during optimising? Furthermore, because it is a sigmoid, anytime the input is large, it will tend towards 1 anyways.
I have also attached my code in snippet 3. I have cut out all the other non-essential bits (like gradient checking) to make it as short as possible.
I would appreciate any help into this as I cannot even find where it is going wrong, let alone fix it. Thank you.
Photos:
J (cost function) decreasing to 1.8 after 12 iterations
Gradient checking before optimizing, they look very similar
Code snippet:
Initializing Neural Network Parameters ...
initial1
-0.0100100
-0.0771400
-0.1113800
-0.0230100
0.0547800
-0.0505500
-0.0731200
-0.0988700
0.0128000
-0.0855400
-0.1002500
-0.1137200
-0.0669300
-0.0999900
0.0084500
-0.0363200
-0.0588600
-0.0431100
-0.1133700
-0.0326300
0.0282800
0.0052400
-0.1134600
-0.0617700
0.0267600
initial2
0.0273700
0.1026000
-0.0502100
-0.0699100
0.0190600
0.1004000
0.0784600
-0.0075900
-0.0362100
0.0286200
Doing fminunc
Training Neural Network...
Iteration 100 | Cost: 6.219605e-01
theta1
-0.0099719
-0.0768462
-0.1109559
-0.0229224
0.0545714
-0.0503575
-0.0728415
-0.0984935
0.0127513
-0.0852143
-0.0998682
-0.1132869
-0.0666751
-0.0996092
0.0084178
-0.0361817
-0.0586359
-0.0429458
-0.1129383
-0.0325057
0.0281723
0.0052200
-0.1130279
-0.0615348
0.0266581
theta2
1.124918
1.603780
-1.266390
-0.848874
0.037956
-1.360841
2.145562
-1.448657
-1.262285
-1.357635
theta1_initial
[-0.01001 -0.07714 -0.11138 -0.02301 0.05478 -0.05055 -0.07312 -0.09887
0.0128 -0.08554 -0.10025 -0.11372 -0.06693 -0.09999 0.00845 -0.03632
-0.05886 -0.04311 -0.11337 -0.03263 0.02828 0.00524 -0.11346 -0.06177
0.02676]
theta2_initial
[ 0.02737 0.1026 -0.05021 -0.06991 0.01906 0.1004 0.07846 -0.00759
-0.03621 0.02862]
Doing fminunc
-c:32: RuntimeWarning: overflow encountered in exp
theta1
[-0.00997202 -0.07680716 -0.11086841 -0.02292044 0.05455335 -0.05034252
-0.07280686 -0.09842603 0.01275117 -0.08516515 -0.0997987 -0.11319546
-0.06664666 -0.09954009 0.00841804 -0.03617494 -0.05861458 -0.04293555
-0.1128474 -0.0325006 0.02816879 0.00522031 -0.1129369 -0.06151103
0.02665508]
theta2
[ 0.27954826 -0.08007496 -0.36449273 -0.22988024 0.06849659 -0.47803973
1.09023041 -0.25570559 -0.24537494 -0.40341995]
#-----------------BEGIN HEADERS-----------------
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
import csv
import scipy
#-----------------END HEADERS-----------------
#-----------------BEGIN FUNCTION 1-----------------
def randinitialize(L_in, L_out, epsilon_init=0.12):
    """Return random initial weights for a layer, including its bias column.

    Args:
        L_in: number of incoming connections (without the bias unit).
        L_out: number of outgoing connections.
        epsilon_init: the weights are drawn uniformly from
            ``[-epsilon_init, epsilon_init)``.

    Returns:
        Array of shape ``(L_out, 1 + L_in)``.
    """
    return np.random.rand(L_out, 1 + L_in) * 2 * epsilon_init - epsilon_init
#-----------------END FUNCTION 1-----------------
#-----------------BEGIN FUNCTION 2-----------------
def sigmoid(lz):
    """Element-wise logistic function ``1 / (1 + exp(-lz))``."""
    return 1.0 / (1.0 + np.exp(-lz))
#-----------------END FUNCTION 2-----------------
#-----------------BEGIN FUNCTION 3-----------------
def sigmoidgradient(lz):
    """Return the derivative of the sigmoid evaluated at ``lz``.

    Uses ``g'(z) = g(z) * (1 - g(z))`` and evaluates the sigmoid only once.
    """
    activation = sigmoid(lz)
    return np.multiply(activation, 1 - activation)
#-----------------END FUNCTION 3-----------------
#-----------------BEGIN FUNCTION 4-----------------
def nncostfunction(ltheta_ravel, linput_layer_size, lhidden_layer_size, lnum_labels, lx, ly, llambda_reg):
    """Regularized cross-entropy cost and gradient of a 3-layer network.

    Args:
        ltheta_ravel: both weight matrices flattened and concatenated
            (theta1 first, then theta2).
        linput_layer_size, lhidden_layer_size, lnum_labels: layer sizes,
            not counting bias units.
        lx: one sample per row.
        ly: labels. Label ``k`` is encoded as row ``k - 1`` of the identity
            matrix.
        llambda_reg: regularization strength. Bias columns are not
            regularized.

    Returns:
        ``(J, thetagrad_ravel)``: the cost and the flattened gradient, laid
        out like ``ltheta_ravel``.
    """
    split = lhidden_layer_size * (linput_layer_size + 1)
    ltheta1 = np.array(np.reshape(ltheta_ravel[:split],
                                  (lhidden_layer_size, linput_layer_size + 1)))
    ltheta2 = np.array(np.reshape(ltheta_ravel[split:],
                                  (lnum_labels, lhidden_layer_size + 1)))
    lm = np.shape(lx)[0]
    inv_m = 1 / float(lm)

    # One-hot targets, one row per sample.
    eye_matrix = np.eye(lnum_labels)
    y_matrix = np.array([eye_matrix[int(label) - 1, :] for label in ly])

    # Forward pass. The hidden and output activations are kept as
    # (units, samples).
    a1 = np.hstack((np.ones((lm, 1)), lx)).astype(float)
    hidden_input = ltheta1.dot(a1.T)
    hidden = sigmoid(hidden_input)
    a2 = (np.concatenate((np.ones((np.shape(hidden)[1], 1)), hidden.T),
                         axis=1)).astype(float)
    a3 = sigmoid(ltheta2.dot(a2.T))
    h = a3

    # Cost: cross entropy plus an L2 penalty on the non-bias weights.
    cross_entropy = (-np.multiply(y_matrix, np.log(h.T))
                     - np.multiply((1 - y_matrix), np.log(1 - h.T)))
    J_unreg = inv_m * np.sum(cross_entropy, axis=None)
    weights1 = ltheta1[:, 1:]
    weights2 = ltheta2[:, 1:]
    penalty = (np.sum(np.multiply(weights1, weights1), axis=None)
               + np.sum(np.multiply(weights2, weights2), axis=None))
    J = J_unreg + (llambda_reg / (2 * float(lm))) * penalty

    # Backpropagation.
    delta3 = a3.T - y_matrix
    delta2 = np.multiply(delta3.dot(ltheta2[:, 1:]),
                         (sigmoidgradient(hidden_input)).T)
    cdelta2 = ((a2.T).dot(delta3)).T
    cdelta1 = ((a1.T).dot(delta2)).T
    ltheta1_grad = inv_m * cdelta1
    ltheta2_grad = inv_m * cdelta2

    # Regularization term of the gradient, with the bias column zeroed.
    theta1_hold = ltheta1.copy()
    theta2_hold = ltheta2.copy()
    theta1_hold[:, 0] = 0
    theta2_hold[:, 0] = 0
    ltheta1_grad = ltheta1_grad + (llambda_reg / float(lm)) * theta1_hold
    ltheta2_grad = ltheta2_grad + (llambda_reg / float(lm)) * theta2_hold

    thetagrad_ravel = np.concatenate((np.ravel(ltheta1_grad),
                                      np.ravel(ltheta2_grad)))
    return (J, thetagrad_ravel)
#-----------------END FUNCTION 4-----------------
#-----------------BEGIN FUNCTION 5-----------------
def predict(ltheta1, ltheta2, x):
    """Return, per row of ``x``, the index of the strongest output unit.

    The result is a float array of length ``m`` holding zero-based column
    indices of the output layer.
    """
    m, _ = np.shape(x)
    bias_column = np.ones((m, 1))
    h1 = sigmoid((np.hstack((bias_column, x.astype(float)))).dot(ltheta1.T))
    h2 = sigmoid((np.hstack((bias_column, h1))).dot(ltheta2.T))
    return np.argmax(h2, axis=1).astype(float)
#-----------------END FUNCTION 5-----------------
# Driver: read train.csv, relabel digit 0 as 10, run 10 L-BFGS-B iterations
# on nncostfunction, then report the accuracy on the training data.
## Setup the parameters you will use for this exercise
input_layer_size = 784; # 28x28 Input Images of Digits
hidden_layer_size = 25; # 25 hidden units
num_labels = 10; # 10 labels, from 0 to 9
data = []
#Reading in data, split into X and y, rewrite label 0 to 10 (for easy comparison to course)
with open('train.csv', 'rb') as csvfile:
    has_header = csv.Sniffer().has_header(csvfile.read(1024))
    csvfile.seek(0) # rewind
    data_csv = csv.reader(csvfile, delimiter=',')
    if has_header:
        next(data_csv)
    for row in data_csv:
        data.append(row)
data = np.array(data)
# First column is the label, the remaining columns are the features.
x = data[:,1:]
y = data[:,0]
y = y.astype(int)
for i in range(len(y)):
    if y[i] == 0:
        y[i] = 10
#Set basic parameters
m, n = np.shape(x)
lambda_reg = 1.0
#Randomly initalize weights for Theta_initial
#theta1_initial = np.genfromtxt('tt1.csv', delimiter=',')
#theta2_initial = np.genfromtxt('tt2.csv', delimiter=',')
theta1_initial = randinitialize(input_layer_size, hidden_layer_size);
theta2_initial = randinitialize(hidden_layer_size, num_labels);
theta_initial_ravel = np.concatenate((np.ravel(theta1_initial), np.ravel(theta2_initial)))
#Doing optimize
# jac=True: nncostfunction returns the cost and its gradient together.
fmin = scipy.optimize.minimize(fun=nncostfunction, x0=theta_initial_ravel, args=(input_layer_size, hidden_layer_size, num_labels, x, y, lambda_reg), method='L-BFGS-B', jac=True, options={'maxiter': 10, 'disp': True})
fmin
# Unflatten the optimized vector with the same layout nncostfunction uses.
theta1 = np.array(np.reshape(fmin.x[:hidden_layer_size * (input_layer_size + 1)], (hidden_layer_size, (input_layer_size + 1))))
theta2 = np.array(np.reshape(fmin.x[hidden_layer_size * (input_layer_size + 1):], (num_labels, (hidden_layer_size + 1))))
p = predict(theta1, theta2, x);
# Map label 10 back to digit 0 before comparing.
for i in range(len(y)):
    if y[i] == 10:
        y[i] = 0
# NOTE(review): predict returns the zero-based output column, while
# nncostfunction trains column k-1 for label k (label 10 -> column 9). p is
# therefore shifted by one against y here, which would explain a very low
# reported accuracy -- confirm.
correct = [1 if a == b else 0 for (a, b) in zip(p,y)]
accuracy = (sum(map(int, correct)) / float(len(correct)))
print 'accuracy = {0}%'.format(accuracy * 100)
I think I have fixed the problem: it seems I messed up the index
should be:
y_matrix.append(eye_matrix[int(ly[i]),:])
instead of:
y_matrix.append(eye_matrix[int(ly[i])-1,:])