My data is 4123 rows of inputs and outputs to an xor gate.
I want to write a Neural Network with three input layer neurons (the third one is bias), a hidden layer, and an output layer.
Here's my implementation
import numpy as np
class TwoLayerNetwork:
def __init__(self, input_size, hidden_size, output_size):
"""
input_size: the number of neurons in the input layer
hidden_size: the number of neurons in the hidden layer
output_size: the number of neurons in the output layer
"""
self.input_size = input_size
self.hidden_size = hidden_size
self.output_size = output_size
self.params = {}
self.params['W1'] = 0.01 * np.random.randn(input_size, hidden_size) # FxH
self.params['b1'] = np.zeros((hidden_size, 1)) # Hx1
self.params['W2'] = 0.01 * np.random.randn(hidden_size, output_size) # HxO
self.params['b2'] = np.zeros((output_size, 1)) # Ox1
self.optimal_weights = []
self.errors = {}
def train(self, X, y, epochs):
"""
X: input data matrix, NxF
y: output vector, Nx1
returns:
the optimal set of parameters that best minimize the loss function
"""
W1, b1 = self.params['W1'], self.params['b1']
W2, b2 = self.params['W2'], self.params['b2']
for iteration in range(epochs):
forward_to_hidden = X.dot(W1) # NxH
activate_hidden = sigmoid(forward_to_hidden) # NxH
forward_to_output = activate_hidden.dot(W2) # NxO
output = sigmoid(forward_to_output) # NxO
self.errors[iteration] = np.mean(0.5 * (y**2 - output**2))
output_error = y - output # NxO
output_layer_delta = output_error * sigmoidPrime(output) # NxO
hidden_layer_error = output_layer_delta.dot(W2.T) # NxO . OxH = NxH
hidden_layer_delta = hidden_layer_error * sigmoidPrime(activate_hidden) # NxH
W1_update = X.T.dot(hidden_layer_delta) # FxN . NxH = FxH
W2_update = activate_hidden.T.dot(output_layer_delta) # HxN . NxO = HxO
W1 += W1_update
W2 += W2_update
self.optimal_weights.append(W1)
self.optimal_weights.append(W2)
def predict(self, X):
W1, W2 = self.optimal_weights[0], self.optimal_weights[1]
forward = sigmoid(X.dot(W1)) # NxH
forward = forward.dot(W2) # NxO
forward = sigmoid(forward) # NxO
return forward
def sigmoid(x):
return 1 / (1 + np.exp(-x))
def sigmoidPrime(x):
return sigmoid(x) * (1 - sigmoid(x))
I realize that's very vanilla, but that's intentional. I want to understand the most basic form of NN architecture first.
Now, my problem is that my error plot is confusing.
The neural network just stops learning.
My second problem is that my weights are blowing up reaching up to -10000, which causes overflow because of exp in the sigmoid function.
My third problem is that my output vector only outputs 0.5 instead of 1 or 0
import pandas as pd
data = pd.read_csv('xor.csv').sample(frac=1)
X = data.iloc[:, [0, 1]] # 1st and 2nd cols are the input
X = np.hstack((X, np.ones((data.shape[0], 1)))) # adding the bias 1's
y = data.iloc[:, 2][:, np.newaxis] # 3rd col is the output
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)
nn.train(X_train, y_train, 100)
plt.plot(range(100), [i for i in nn.errors.values()])
plt.show()
The link for the dataset
So, if I read your code correctly, your network is specified correctly, but is missing a few key points in order to learn XOR by backpropagation.
The fun part is, your error specification is weird.
I made it into
self.errors[iteration] = np.mean(0.5 * (y - output)**2)
for visualization.
With x-axis denoting epoch and y-axis denoting error:
So what happens, the backpropagation hits a plateau, then rapidly blows up the weights. To slow down the blowing up of the weights and allow the network some time to re-evaluate its mistakes, you can add a so-called "learning rate" != 1. This adresses one of the pitfalls.
Another one is the second figure: you hit oscillatory behaviour in the updates and the program will never reach its optimum state. To adress this, you can deliberately enter an imperfection in the form of a "momentum".
Additionally, the initial conditions matter for the speed at which you converge, so you need to have enough epochs to overcome the local plateaux:
Last, but certainly not least, I did find an error with your specification, but all of the above still applies.
In your layer_deltas you do sigmoidPrime(sigmoid(forwards)) which is one call to sigmoid too many.
last_update = np.zeros((X.shape[1], W1.shape[1]))
last_update2 = np.zeros((W1.shape[1], W2.shape[1]))
output_layer_delta = output_error * sigmoidPrime(forward_to_output) # NxO
hidden_layer_delta = hidden_layer_error * sigmoidPrime(forward_to_hidden) # NxH
W1 += 0.001*(W1_update + last_update * 0.5)
W2 += 0.001*(W2_update + last_update2 * 0.5)
# W1 = 0.001*W1_update
# W2 = 0.001*W2_update
last_update = W1_update.copy()
last_update2 = W2_update.copy()
Did the final trick for me. Now please verify and appease this grumbling man who spent the better part of a night and day on figuring it out. ;)
Related
My neural network is stuck at 11.35 percent accuracy and i am unable to trace the error.
low accuracy at 11.35 percent
I am following this code https://github.com/MLForNerds/DL_Projects/blob/main/mnist_ann.ipynb which I found in a youtube video.
Here is my code for the neural network(I have defined Xavier weight initialization in a module called nn):
"""1. 784 neurons in input layer
2. 128 neurons in hidden layer 1
3. 64 neurons in hidden layer 2
4. 10 neurons in output layer"""
def softmax(input):
y = np.exp(input - input.max())
activated = y/ np.sum(y, axis=0)
return activated
def softmax_grad(x):
exps = np.exp(x-x.max())
return exps / np.sum(exps,axis = 0) * (1 - exps /np.sum(exps,axis = 0))
def sigmoid(input):
activated = 1/(1 + np.exp(-input))
return activated
def sigmoid_grad(input):
grad = input*(1-input)
return grad
class DenseNN:
def __init__(self,d0,d1,d2,d3):
self.params = {'w1': nn.Xavier.initialize(d0, d1),
'w2': nn.Xavier.initialize(d1, d2),
'w3': nn.Xavier.initialize(d2, d3)}
def forward(self,a0):
params = self.params
params['a0'] = a0
params['z1'] = np.dot(params['w1'],params['a0'])
params['a1'] = sigmoid(params['z1'])
params['z2'] = np.dot(params['w2'],params['a1'])
params['a2'] = sigmoid(params['z2'])
params['z3'] = np.dot(params['w3'],params['a2'])
params['a3'] = softmax(params['z3'])
return params['a3']
def backprop(self,y_true,y_pred):
params = self.params
w_change = {}
error = softmax_grad(params['z3'])*((y_pred - y_true)/y_true.shape[0])
w_change['w3'] = np.outer(error,params['a2'])
error = np.dot(params['w3'].T,error)*sigmoid_grad(params['a2'])
w_change['w2'] = np.outer(error,params['a1'])
error = np.dot(params['w2'].T,error)*sigmoid_grad(params['a1'])
w_change['w1'] = np.outer(error,params['a0'])
return w_change
def update_weights(self,learning_rate,w_change):
self.params['w1'] -= learning_rate*w_change['w1']
self.params['w2'] -= learning_rate*w_change['w2']
self.params['w3'] -= learning_rate*w_change['w3']
def train(self,epochs,lr):
for epoch in range(epochs):
for i in range(60000):
a0 = np.array([x_train[i]]).T
o = np.array([y_train[i]]).T
y_pred = self.forward(a0)
w_change = self.backprop(o,y_pred)
self.update_weights(lr,w_change)
# print(self.compute_accuracy()*100)
# print(calc_mse(a3, o))
print((self.compute_accuracy())*100)
def compute_accuracy(self):
'''
This function does a forward pass of x, then checks if the indices
of the maximum value in the output equals the indices in the label
y. Then it sums over each prediction and calculates the accuracy.
'''
predictions = []
for i in range(10000):
idx = i
a0 = x_test[idx]
a0 = np.array([a0]).T
#print("acc a1",np.shape(a1))
o = y_test[idx]
o = np.array([o]).T
#print("acc o",np.shape(o))
output = self.forward(a0)
pred = np.argmax(output)
predictions.append(pred == np.argmax(o))
return np.mean(predictions)
Here is the code for loading the data:
#load dataset csv
train_data = pd.read_csv('../Datasets/MNIST/mnist_train.csv')
test_data = pd.read_csv('../Datasets/MNIST/mnist_test.csv')
#train data
x_train = train_data.drop('label',axis=1).to_numpy()
y_train = pd.get_dummies(train_data['label']).values
#test data
x_test = test_data.drop('label',axis=1).to_numpy()
y_test = pd.get_dummies(test_data['label']).values
fac = 0.99 / 255
x_train = np.asfarray(x_train) * fac + 0.01
x_test = np.asfarray(x_test) * fac + 0.01
# train_labels = np.asfarray(train_data[:, :1])
# test_labels = np.asfarray(test_data[:, :1])
#printing dimensions
print(np.shape(x_train)) #(60000,784)
print(np.shape(y_train)) #(60000,10)
print(np.shape(x_test)) #(10000,784)
print(np.shape(y_test)) #(10000,10)
print((x_train))
Kindly help
I am a newbie in machine learning so any help would be appreciated.I am unable to figure out where i am going wrong.Most of the code is almost similar to https://github.com/MLForNerds/DL_Projects/blob/main/mnist_ann.ipynb but it manages to get 60 percent accuracy.
EDIT
I found the mistake :
Thanks to Bartosz Mikulski.
The problem was with how the weights were initialized in my Xavier weights initialization algorithm.
I changed the code for weights initialization to this:
self.params = {
'w1':np.random.randn(d1, d0) * np.sqrt(1. / d1),
'w2':np.random.randn(d2, d1) * np.sqrt(1. / d2),
'w3':np.random.randn(d3, d2) * np.sqrt(1. / d3),
'b1':np.random.randn(d1, 1) * np.sqrt(1. / d1),
'b2':np.random.randn(d2, 1) * np.sqrt(1. / d2),
'b3':np.random.randn(d3, 1) * np.sqrt(1. / d3),
}
then i got the output:
After changing weights initialization
after adding the bias parameters i got the output:
After changing weights initialization and adding bias
3: After changing weights initialization and adding bias
The one problem that I can see is that you are using only weights but no biases. They are very important because they allow your model to change the position of the decision plane (boundary) in the solution space. If you only have weights you can only angle the solution.
I guess that basically, this is the best fit you can get without biases. The dense layer is basically a linear function: w*x + b and you are missing the b. See the PyTorch documentation for the example: https://pytorch.org/docs/stable/generated/torch.nn.Linear.html#linear.
Also, can you show your Xavier initialization? In your case, even the simple normal distributed values would be enough as initialization, no need to rush into more advanced topics.
I would also suggest you start from the smaller problem (for example Iris dataset) and no hidden layers (just a simple linear regression that learns by using gradient descent). Then you can expand it by adding hidden layers, and then by trying harder problems with the code you already have.
I followed an article here: TowardsDataScience.
I wrote math equations about the network, everything made sense.
However, after writing the code, results are pretty strange, like it is predicting always same class...
I spent a lot of time on it, changed many things, but I still cannot understand what I did wrong.
Here is the code:
# coding: utf-8
from mnist import MNIST
import numpy as np
import math
import os
import pdb
DATASETS_PREFIX = '../Datasets/MNIST'
mndata = MNIST(DATASETS_PREFIX)
TRAINING_IMAGES, TRAINING_LABELS = mndata.load_training()
TESTING_IMAGES , TESTING_LABELS = mndata.load_testing()
### UTILS
def sigmoid(x):
return 1 / (1 + np.exp(-x))
def d_sigmoid(x):
return x.T * (1 - x)
#return np.dot(x.T, 1.0 - x)
def softmax(x):
e_x = np.exp(x - np.max(x))
return e_x / e_x.sum()
def d_softmax(x):
#This function has not yet been tested.
return x.T * (1 - x)
def tanh(x):
return np.tanh(x)
def d_tanh(x):
return 1 - x.T * x
def normalize(image):
return image / (255.0 * 0.99 + 0.01)
### !UTILS
class NeuralNetwork(object):
"""
This is a 3-layer neural network (1 hidden layer).
#_input : input layer
#_weights1: weights between input layer and hidden layer (matrix shape (input.shape[1], 4))
#_weights2: weights between hidden layer and output layer (matrix shape (4, 1))
#_y : output
#_output : computed output
#_alpha : learning rate
"""
def __init__(self, xshape, yshape):
self._neurones_nb = 20
self._input = None
self._weights1 = np.random.randn(xshape, self._neurones_nb)
self._weights2 = np.random.randn(self._neurones_nb, yshape)
self._y = np.mat(np.zeros(yshape))
self._output = np.mat(np.zeros(yshape))
self._alpha1 = 0.1
self._alpha2 = 0.1
self._function = sigmoid
self._derivative = d_sigmoid
self._epoch = 1
def Train(self, xs, ys):
for j in range(self._epoch):
for i in range(len(xs)):
self._input = normalize(np.mat(xs[i]))
self._y[0, ys[i]] = 1
self.feedforward()
self.backpropagation()
self._y[0, ys[i]] = 0
def Predict(self, image):
self._input = normalize(image)
out = self.feedforward()
return out
def feedforward(self):
self._layer1 = self._function(np.dot(self._input, self._weights1))
self._output = self._function(np.dot(self._layer1, self._weights2))
return self._output
def backpropagation(self):
d_weights2 = np.dot(
self._layer1.T,
2 * (self._y - self._output) * self._derivative(self._output)
)
d_weights1 = np.dot(
self._input.T,
np.dot(
2 * (self._y - self._output) * self._derivative(self._output),
self._weights2.T
) * self._derivative(self._layer1)
)
self._weights1 += self._alpha1 * d_weights1
self._weights2 += self._alpha2 * d_weights2
if __name__ == '__main__':
neural_network = NeuralNetwork(len(TRAINING_IMAGES[0]), 10)
print('* training neural network')
neural_network.Train(TRAINING_IMAGES, TRAINING_LABELS)
print('* testing neural network')
count = 0
for i in range(len(TESTING_IMAGES)):
image = np.mat(TESTING_IMAGES[i])
expected = TESTING_LABELS[i]
prediction = neural_network.Predict(image)
if i % 100 == 0: print(expected, prediction)
#print(f'* results: {count} / {len(TESTING_IMAGES)}')
Thank you for your help, really appreciated.
Julien
Well, I don't see any error in the implementation so considering your network, this could be improved by doing two things :
One epoch is not enough. Like not a all ! You need to pass over your data multiple times (a great minimum is 10 times, average might be around 100 epochs and this could go up to 5000 or more)
You network is a shallow network, e.g. really simple. To detect difficult things (like images), you could implement a CNN (Convolutional Neural Network) or first trying to deepen your network and complexify it
=> Try to add layers (3, 4, 5 etc..) and then add neurons to each layers (50, 60, ..) depending of the size of your input. You can still go up to 800, 900 or more.
My neural network can learn |sin(x)| for [0,pi], but not larger intervals than that. I tried changing the quantity and widths of hidden layers in various ways, but none of the changes leads to a good result.
I train the NN on thousands of random values from a uniform distribution in the chosen interval. using back propagation with gradient descent.
I am starting to think there is a fundamental problem in my network.
For the following examples I used a 1-10-10-1 layer structure:
[0, pi]:
[0, 2pi]:
[0, 4pi]:
Here is the code for the neural network:
import math
import numpy
import random
import copy
import matplotlib.pyplot as plt
def sigmoid(x):
return 1.0/(1+ numpy.exp(-x))
def sigmoid_derivative(x):
return x * (1.0 - x)
class NeuralNetwork:
def __init__(self, weight_dimensions, x=None, y=None):
self.weights = []
self.layers = [[]] * len(weight_dimensions)
self.weight_gradients = []
self.learning_rate = 1
self.layers[0] = x
for i in range(len(weight_dimensions) - 1):
self.weights.append(numpy.random.rand(weight_dimensions[i],weight_dimensions[i+1]) - 0.5)
self.y = y
def feed_forward(self):
# calculate an output using feed forward layer-by-layer
for i in range(len(self.layers) - 1):
self.layers[i + 1] = sigmoid(numpy.dot(self.layers[i], self.weights[i]))
def print_loss(self):
loss = numpy.square(self.layers[-1] - self.y).sum()
print(loss)
def get_weight_gradients(self):
return self.weight_gradients
def apply_weight_gradients(self):
for i in range(len(self.weight_gradients)):
self.weights[i] += self.weight_gradients[i] * self.learning_rate
if self.learning_rate > 0.001:
self.learning_rate -= 0.0001
def back_prop(self):
# find derivative of the loss function with respect to weights
self.weight_gradients = []
deltas = []
output_error = (self.y - self.layers[-1])
output_delta = output_error * sigmoid_derivative(self.layers[-1])
deltas.append(output_delta)
self.weight_gradients.append(self.layers[-2].T.dot(output_delta))
for i in range(len(self.weights) - 1):
i_error = deltas[i].dot(self.weights[-(i+1)].T)
i_delta = i_error * sigmoid_derivative(self.layers[-(i+2)])
self.weight_gradients.append(self.layers[-(i+3)].T.dot(i_delta))
deltas.append(copy.deepcopy(i_delta))
# Unreverse weight gradient list
self.weight_gradients = self.weight_gradients[::-1]
def get_output(self, inp):
self.layers[0] = inp
self.feed_forward()
return self.layers[-1]
def sin_test():
interval = numpy.random.uniform(0, 2*math.pi, int(1000*(2*math.pi)))
x_values = []
y_values = []
for i in range(len(interval)):
y_values.append([abs(math.sin(interval[i]))])
x_values.append([interval[i]])
x = numpy.array(x_values)
y = numpy.array(y_values)
nn = NeuralNetwork([1, 10, 10, 1], x, y)
for i in range(10000):
tmp_input = []
tmp_output = []
mini_batch_indexes = random.sample(range(0, len(x)), 10)
for j in mini_batch_indexes:
tmp_input.append(x[j])
tmp_output.append(y[j])
nn.layers[0] = numpy.array(tmp_input)
nn.y = numpy.array(tmp_output)
nn.feed_forward()
nn.back_prop()
nn.apply_weight_gradients()
nn.print_loss()
nn.layers[0] = numpy.array(numpy.array(x))
nn.y = numpy.array(numpy.array(y))
nn.feed_forward()
axis_1 = []
axis_2 = []
for i in range(len(nn.layers[-1])):
axis_1.append(nn.layers[0][i][0])
axis_2.append(nn.layers[-1][i][0])
true_axis_2 = []
for x in axis_1:
true_axis_2.append(abs(math.sin(x)))
axises = []
for i in range(len(axis_1)):
axises.append([axis_1[i], axis_2[i], true_axis_2[i]])
axises.sort(key=lambda x: x[0], reverse=False)
axis_1_new = []
axis_2_new = []
true_axis_2_new = []
for elem in axises:
axis_1_new.append(elem[0])
axis_2_new.append(elem[1])
true_axis_2_new.append(elem[2])
plt.plot(axis_1_new, axis_2_new, label="nn")
plt.plot(axis_1_new, true_axis_2_new, 'k--', label="sin(x)")
plt.grid()
plt.axis([0, 2*math.pi, -1, 2.5])
plt.show()
sin_test()
The main issue with your network seem to be that you apply the activation function to the final "layer" of your network. The final output of your network should be a linear combination without any sigmoid applied.
As a warning though, do not expect the model to generalize outside of the region included in the training data.
Here is an example in PyTorch:
import torch
import torch.nn as nn
import math
import numpy as np
import matplotlib.pyplot as plt
N = 1000
p = 2.5
x = 2 * p * math.pi * torch.rand(N, 1)
y = np.abs(np.sin(x))
with torch.no_grad():
plt.plot(x.numpy(), y.numpy(), '.')
plt.savefig("training_data.png")
inner = 20
model = nn.Sequential(
nn.Linear(1, inner, bias=True),
nn.Sigmoid(),
nn.Linear(inner, 1, bias=True)#,
#nn.Sigmoid()
)
loss_fn = nn.MSELoss()
learning_rate = 1e-3
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
for t in range(500000):
y_pred = model(x)
loss = loss_fn(y_pred, y)
if t % 1000 == 0:
print("MSE: {}".format(t), loss.item())
model.zero_grad()
loss.backward()
optimizer.step()
with torch.no_grad():
X = torch.arange(0, p * 2 * math.pi, step=0.01).reshape(-1, 1)
Y = model(X)
Y_TRUTH = np.abs(np.sin(X))
print(Y.shape)
print(Y_TRUTH.shape)
loss = loss_fn(Y, Y_TRUTH)
plt.clf()
plt.plot(X.numpy(), Y_TRUTH.numpy())
plt.plot(X.numpy(), Y.numpy())
plt.title("MSE: {}".format(loss.item()))
plt.savefig("output.png")
The output is available here: Image showing neural network prediction and ground truth. The yellow line is the predicted line by the neural network and the blue line is the ground truth.
First and foremost, you've chosen a topology suited for a different class of problems. A simple, fully-connected NN such as this is great with trivial classification (e.g. Boolean operators) or functions with at least two continuous derivatives. You've tried to apply it to a function that is simply one step beyond its capabilities.
Try your model on sin(x) and see how it performs at larger ranges. Try it on max(sin(x), 0). Do you see how the model has trouble with certain periodicity and irruptions? These are an emergent feature of the many linear equations struggling to predict the proper functional value: the linear combinations have trouble emulating non-linearities past a simple level.
I wanted to predict heart disease using backpropagation algorithm for neural networks. For this I used UCI heart disease data set linked here: processed cleveland. To do this, I used the cde found on the following blog: Build a flexible Neural Network with Backpropagation in Python and changed it little bit according to my own dataset. My code is as follows:
import numpy as np
import csv
reader = csv.reader(open("cleveland_data.csv"), delimiter=",")
x = list(reader)
result = np.array(x).astype("float")
X = result[:, :13]
y0 = result[:, 13]
y1 = np.array([y0])
y = y1.T
# scale units
X = X / np.amax(X, axis=0) # maximum of X array
class Neural_Network(object):
def __init__(self):
# parameters
self.inputSize = 13
self.outputSize = 1
self.hiddenSize = 13
# weights
self.W1 = np.random.randn(self.inputSize, self.hiddenSize)
self.W2 = np.random.randn(self.hiddenSize, self.outputSize)
def forward(self, X):
# forward propagation through our network
self.z = np.dot(X, self.W1)
self.z2 = self.sigmoid(self.z) # activation function
self.z3 = np.dot(self.z2, self.W2)
o = self.sigmoid(self.z3) # final activation function
return o
def sigmoid(self, s):
# activation function
return 1 / (1 + np.exp(-s))
def sigmoidPrime(self, s):
# derivative of sigmoid
return s * (1 - s)
def backward(self, X, y, o):
# backward propgate through the network
self.o_error = y - o # error in output
self.o_delta = self.o_error * self.sigmoidPrime(o) # applying derivative of sigmoid to error
self.z2_error = self.o_delta.dot(
self.W2.T) # z2 error: how much our hidden layer weights contributed to output error
self.z2_delta = self.z2_error * self.sigmoidPrime(self.z2) # applying derivative of sigmoid to z2 error
self.W1 += X.T.dot(self.z2_delta) # adjusting first set (input --> hidden) weights
self.W2 += self.z2.T.dot(self.o_delta) # adjusting second set (hidden --> output) weights
def train(self, X, y):
o = self.forward(X)
self.backward(X, y, o)
NN = Neural_Network()
for i in range(100): # trains the NN 100 times
print("Input: \n" + str(X))
print("Actual Output: \n" + str(y))
print("Predicted Output: \n" + str(NN.forward(X)))
print("Loss: \n" + str(np.mean(np.square(y - NN.forward(X))))) # mean sum squared loss
print("\n")
NN.train(X, y)
But when I run this code, my all predicted outputs become = 1 after few iterations and then stays the same for up to all 100 iterations. what is the problem in the code?
Few mistakes that I've noticed:
The output of your network is a sigmoid, i.e. a value between [0, 1] -- suits for predicting probabilities. But the target seems to be a value between [0, 4]. This explains the desire of the network to maximize the output to get as close as possible to large labels. But it can't go more than 1.0 and gets stuck.
You should either get rid of the final sigmoid or pre-process the label and scale it to [0, 1]. Both options will make it learn better.
You don't use the learning rate (effectively setting it to 1.0), which is probably a bit high, so it's possible for the NN to diverge. My experiments showed that 0.01 is a good learning rate, but you can play around with that.
Other than this, your backprop seems working right.
For past few days I have been debugging my NN but I can't find an issue.
I've created total raw implementation of multi-layer perceptron for identifying MNIST dataset images.
Network seems to learn because after train cycle test data accuracy is above 94% accuracy. I have problem with loss function - it starts increasing after a while, when test/val accuracy reaches ~76%.
Can someone please check my forward/backprop math and tell me if my loss function is properly implemented, or suggest what might be wrong?
NN structure:
input layer: 758 nodes, (1 node per pixel)
hidden layer 1: 300 nodes
hidden layer 2: 75 nodes
output layer: 10 nodes
NN activation functions:
input layer -> hidden layer 1: ReLU
hidden layer 1 -> hidden layer 2: ReLU
hidden layer 2 -> output layer 3: Softmax
NN Loss function:
Categorial Cross-Entropy
Full CLEAN code available here as Jupyter Notebook.
Neural Network forward/backward pass:
def train(self, features, targets):
n_records = features.shape[0]
# placeholders for weights and biases change values
delta_weights_i_h1 = np.zeros(self.weights_i_to_h1.shape)
delta_weights_h1_h2 = np.zeros(self.weights_h1_to_h2.shape)
delta_weights_h2_o = np.zeros(self.weights_h2_to_o.shape)
delta_bias_i_h1 = np.zeros(self.bias_i_to_h1.shape)
delta_bias_h1_h2 = np.zeros(self.bias_h1_to_h2.shape)
delta_bias_h2_o = np.zeros(self.bias_h2_to_o.shape)
for X, y in zip(features, targets):
### forward pass
# input to hidden 1
inputs_to_h1_layer = np.dot(X, self.weights_i_to_h1) + self.bias_i_to_h1
inputs_to_h1_layer_activated = self.activation_ReLU(inputs_to_h1_layer)
# hidden 1 to hidden 2
h1_to_h2_layer = np.dot(inputs_to_h1_layer_activated, self.weights_h1_to_h2) + self.bias_h1_to_h2
h1_to_h2_layer_activated = self.activation_ReLU(h1_to_h2_layer)
# hidden 2 to output
h2_to_output_layer = np.dot(h1_to_h2_layer_activated, self.weights_h2_to_o) + self.bias_h2_to_o
h2_to_output_layer_activated = self.softmax(h2_to_output_layer)
# output
final_outputs = h2_to_output_layer_activated
### backpropagation
# output to hidden2
error = y - final_outputs
output_error_term = error.dot(self.dsoftmax(h2_to_output_layer_activated))
h2_error = np.dot(output_error_term, self.weights_h2_to_o.T)
h2_error_term = h2_error * self.activation_dReLU(h1_to_h2_layer_activated)
# hidden2 to hidden1
h1_error = np.dot(h2_error_term, self.weights_h1_to_h2.T)
h1_error_term = h1_error * self.activation_dReLU(inputs_to_h1_layer_activated)
# weight & bias step (input to hidden)
delta_weights_i_h1 += h1_error_term * X[:, None]
delta_bias_i_h1 = np.sum(h1_error_term, axis=0)
# weight & bias step (hidden1 to hidden2)
delta_weights_h1_h2 += h2_error_term * inputs_to_h1_layer_activated[:, None]
delta_bias_h1_h2 = np.sum(h2_error_term, axis=0)
# weight & bias step (hidden2 to output)
delta_weights_h2_o += output_error_term * h1_to_h2_layer_activated[:, None]
delta_bias_h2_o = np.sum(output_error_term, axis=0)
# update the weights and biases
self.weights_i_to_h1 += self.lr * delta_weights_i_h1 / n_records
self.weights_h1_to_h2 += self.lr * delta_weights_h1_h2 / n_records
self.weights_h2_to_o += self.lr * delta_weights_h2_o / n_records
self.bias_i_to_h1 += self.lr * delta_bias_i_h1 / n_records
self.bias_h1_to_h2 += self.lr * delta_bias_h1_h2 / n_records
self.bias_h2_to_o += self.lr * delta_bias_h2_o / n_records
Activation function implementation:
def activation_ReLU(self, x):
return x * (x > 0)
def activation_dReLU(self, x):
return 1. * (x > 0)
def softmax(self, x):
z = x - np.max(x)
return np.exp(z) / np.sum(np.exp(z))
def dsoftmax(self, x):
# TODO: vectorise math
vec_len = len(x)
J = np.zeros((vec_len, vec_len))
for i in range(vec_len):
for j in range(vec_len):
if i == j:
J[i][j] = x[i] * (1 - x[j])
else:
J[i][j] = -x[i] * x[j]
return J
Loss function implementation:
def categorical_cross_entropy(pred, target):
return (1/len(pred)) * -np.sum(target * np.log(pred))
I managed to find the problem.
Neural Network is large so I couldn't stick everything to this question. Though if you check my Jupiter Notebook you could see implementation of my Softmax activation function and how do I use it in train cycle.
Problem with Loss miscalculation was caused by the fact my Softmax implementation worked only for ndarray dim == 1.
During training step I have put only ndarray with dim 1 to activtion function so NN learned well, but my run() function was returning wrong predictions as I have inserted whole test data to it, not only single row of it in for loop. Because of that it calculated Softmax "matrix-wise" rather than "row-wise".
This is very fast fix for it:
def softmax(self, x):
# TODO: vectorise math to speed up computation
softmax_result = None
if x.ndim == 1:
z = x - np.max(x)
softmax_result = np.exp(z) / np.sum(np.exp(z))
return softmax_result
else:
softmax_result = []
for row in x:
z = row - np.max(row)
row_softmax_result = np.exp(z) / np.sum(np.exp(z))
softmax_result.append(row_softmax_result)
return np.array(softmax_result)
Yet this code should be vectorised to avoid for loops and ifs if possible because currently it's ugly and takes too much PC resources.