Neural network backpropagation algorithm not working in Python - python

I am writing a neural network in Python, following the example here. It seems that the backpropagation algorithm isn't working, given that the neural network fails to produce the right value (within a margin of error) after being trained 10 thousand times. Specifically, I am training it to compute the sine function in the following example:
import numpy as np
class Neuralnet:
def __init__(self, neurons):
self.weights = []
self.inputs = []
self.outputs = []
self.errors = []
self.rate = .1
for layer in range(len(neurons)):
self.inputs.append(np.empty(neurons[layer]))
self.outputs.append(np.empty(neurons[layer]))
self.errors.append(np.empty(neurons[layer]))
for layer in range(len(neurons)-1):
self.weights.append(
np.random.normal(
scale=1/np.sqrt(neurons[layer]),
size=[neurons[layer], neurons[layer + 1]]
)
)
def feedforward(self, inputs):
self.inputs[0] = inputs
for layer in range(len(self.weights)):
self.outputs[layer] = np.tanh(self.inputs[layer])
self.inputs[layer + 1] = np.dot(self.weights[layer].T, self.outputs[layer])
self.outputs[-1] = np.tanh(self.inputs[-1])
def backpropagate(self, targets):
gradient = 1 - self.outputs[-1] * self.outputs[-1]
self.errors[-1] = gradient * (self.outputs[-1] - targets)
for layer in reversed(range(len(self.errors) - 1)):
gradient = 1 - self.outputs[layer] * self.outputs[layer]
self.errors[layer] = gradient * np.dot(self.weights[layer], self.errors[layer + 1])
for layer in range(len(self.weights)):
self.weights[layer] -= self.rate * np.outer(self.outputs[layer], self.errors[layer + 1])
def xor_example():
net = Neuralnet([2, 2, 1])
for step in range(100000):
net.feedforward([0, 0])
net.backpropagate([-1])
net.feedforward([0, 1])
net.backpropagate([1])
net.feedforward([1, 0])
net.backpropagate([1])
net.feedforward([1, 1])
net.backpropagate([-1])
net.feedforward([1, 1])
print(net.outputs[-1])
def identity_example():
net = Neuralnet([1, 3, 1])
for step in range(100000):
x = np.random.normal()
net.feedforward([x])
net.backpropagate([np.tanh(x)])
net.feedforward([-2])
print(net.outputs[-1])
def sine_example():
net = Neuralnet([1, 6, 1])
for step in range(100000):
x = np.random.normal()
net.feedforward([x])
net.backpropagate([np.tanh(np.sin(x))])
net.feedforward([3])
print(net.outputs[-1])
sine_example()
The output fails to be close to tanh(sin(3)) = 0.140190616. I suspected a mistake involving wrong indices or alignment, but Numpy isn't raising any errors like these. Any tips on where I went wrong?
EDIT: I forgot to add the bias neurons. Here is the updated code:
import numpy as np
class Neuralnet:
def __init__(self, neurons):
self.weights = []
self.outputs = []
self.inputs = []
self.errors = []
self.offsets = []
self.rate = .01
for layer in range(len(neurons)-1):
self.weights.append(
np.random.normal(
scale=1/np.sqrt(neurons[layer]),
size=[neurons[layer], neurons[layer + 1]]
)
)
self.outputs.append(np.empty(neurons[layer]))
self.inputs.append(np.empty(neurons[layer]))
self.errors.append(np.empty(neurons[layer]))
self.offsets.append(np.random.normal(scale=1/np.sqrt(neurons[layer]), size=neurons[layer + 1]))
self.inputs.append(np.empty(neurons[-1]))
self.errors.append(np.empty(neurons[-1]))
def feedforward(self, inputs):
self.inputs[0] = inputs
for layer in range(len(self.weights)):
self.outputs[layer] = np.tanh(self.inputs[layer])
self.inputs[layer + 1] = self.offsets[layer] + np.dot(self.weights[layer].T, self.outputs[layer])
def backpropagate(self, targets):
self.errors[-1] = self.inputs[-1] - targets
for layer in reversed(range(len(self.errors) - 1)):
gradient = 1 - self.outputs[layer] * self.outputs[layer]
self.errors[layer] = gradient * np.dot(self.weights[layer], self.errors[layer + 1])
for layer in range(len(self.weights)):
self.weights[layer] -= self.rate * np.outer(self.outputs[layer], self.errors[layer + 1])
self.offsets[layer] -= self.rate * self.errors[layer + 1]
def sine_example():
net = Neuralnet([1, 5, 1])
for step in range(10000):
x = np.random.uniform(-5, 5)
net.feedforward([x])
net.backpropagate([np.sin(x)])
net.feedforward([np.pi])
print(net.inputs[-1])
def xor_example():
net = Neuralnet([2, 2, 1])
for step in range(10000):
net.feedforward([0, 0])
net.backpropagate([-1])
net.feedforward([0, 1])
net.backpropagate([1])
net.feedforward([1, 0])
net.backpropagate([1])
net.feedforward([1, 1])
net.backpropagate([-1])
net.feedforward([1, 1])
print(net.outputs[-1])
def identity_example():
net = Neuralnet([1, 3, 1])
for step in range(10000):
x = np.random.normal()
net.feedforward([x])
net.backpropagate([x])
net.feedforward([-2])
print(net.outputs[-1])
identity_example()

I think you train the NN in the wrong way. You have a loop over 10000 iterations and feed a new sample in each cycle. The NN will never get trained in this case.
(the statement is wrong! See the update! )
What you need to do is to generate a large array of true samples Y = sin(X), give it to your network ONCE and iterate over the training set forwards and backwards, in order to minimize the cost function. To check the algorithm you may need to plot the cost function depending on the iteration number and make sure the cost goes down.
Another important point is the initialization of the weights. Your numbers are pretty large and the network will take a lot of time to converge, especially when using low rates. It's a good practice to generate the initial weights in some small range [-eps .. eps] uniformly.
In my code I implemented two different activation functions: sigmoid() and tanh(). You need to scale your inputs depending on the selected function: [0 .. 1] and [-1 .. 1] respectively.
Here are some images which show the cost function and the resulting predictions for sigmoid() and tanh() activation functions:
As you can see the sigmoid() activation gives a little bit better results, than the tanh().
Also I got much better predictions when using a network [1, 6, 1], compared to a bigger network with 4 layers [1, 6, 4, 1]. So the size of the NN is not always the crucial factor. Here is the prediction for the mentioned network with 4 layers:
Here is my code with some comments. I tried to use your notations where it was possible.
import numpy as np
import math
import matplotlib.pyplot as plt
class Neuralnet:
def __init__(self, neurons, activation):
self.weights = []
self.inputs = []
self.outputs = []
self.errors = []
self.rate = 0.5
self.activation = activation #sigmoid or tanh
self.neurons = neurons
self.L = len(self.neurons) #number of layers
eps = 0.12; # range for uniform distribution -eps..+eps
for layer in range(len(neurons)-1):
self.weights.append(np.random.uniform(-eps,eps,size=(neurons[layer+1], neurons[layer]+1)))
###################################################################################################
def train(self, X, Y, iter_count):
m = X.shape[0];
for layer in range(self.L):
self.inputs.append(np.empty([m, self.neurons[layer]]))
self.errors.append(np.empty([m, self.neurons[layer]]))
if (layer < self.L -1):
self.outputs.append(np.empty([m, self.neurons[layer]+1]))
else:
self.outputs.append(np.empty([m, self.neurons[layer]]))
#accumulate the cost function
J_history = np.zeros([iter_count, 1])
for i in range(iter_count):
self.feedforward(X)
J = self.cost(Y, self.outputs[self.L-1])
J_history[i, 0] = J
self.backpropagate(Y)
#plot the cost function to check the descent
plt.plot(J_history)
plt.show()
###################################################################################################
def cost(self, Y, H):
J = np.sum(np.sum(np.power((Y - H), 2), axis=0))/(2*m)
return J
###################################################################################################
def feedforward(self, X):
m = X.shape[0];
self.outputs[0] = np.concatenate( (np.ones([m, 1]), X), axis=1)
for i in range(1, self.L):
self.inputs[i] = np.dot( self.outputs[i-1], self.weights[i-1].T )
if (self.activation == 'sigmoid'):
output_temp = self.sigmoid(self.inputs[i])
elif (self.activation == 'tanh'):
output_temp = np.tanh(self.inputs[i])
if (i < self.L - 1):
self.outputs[i] = np.concatenate( (np.ones([m, 1]), output_temp), axis=1)
else:
self.outputs[i] = output_temp
###################################################################################################
def backpropagate(self, Y):
self.errors[self.L-1] = self.outputs[self.L-1] - Y
for i in range(self.L - 2, 0, -1):
if (self.activation == 'sigmoid'):
self.errors[i] = np.dot( self.errors[i+1], self.weights[i][:, 1:] ) * self.sigmoid_prime(self.inputs[i])
elif (self.activation == 'tanh'):
self.errors[i] = np.dot( self.errors[i+1], self.weights[i][:, 1:] ) * (1 - self.outputs[i][:, 1:]*self.outputs[i][:, 1:])
for i in range(0, self.L-1):
grad = np.dot(self.errors[i+1].T, self.outputs[i]) / m
self.weights[i] = self.weights[i] - self.rate*grad
###################################################################################################
def sigmoid(self, z):
s = 1.0/(1.0 + np.exp(-z))
return s
###################################################################################################
def sigmoid_prime(self, z):
s = self.sigmoid(z)*(1 - self.sigmoid(z))
return s
###################################################################################################
def predict(self, X, weights):
m = X.shape[0];
self.inputs = []
self.outputs = []
self.weights = weights
for layer in range(self.L):
self.inputs.append(np.empty([m, self.neurons[layer]]))
if (layer < self.L -1):
self.outputs.append(np.empty([m, self.neurons[layer]+1]))
else:
self.outputs.append(np.empty([m, self.neurons[layer]]))
self.feedforward(X)
return self.outputs[self.L-1]
###################################################################################################
# MAIN PART
activation1 = 'sigmoid' # the input should be scaled into [ 0..1]
activation2 = 'tanh' # the input should be scaled into [-1..1]
activation = activation1
net = Neuralnet([1, 6, 1], activation) # structure of the NN and its activation function
##########################################################################################
# TRAINING
m = 1000 #size of the training set
X = np.linspace(0, 4*math.pi, num = m).reshape(m, 1); # input training set
Y = np.sin(X) # target
kx = 0.1 # noise parameter
noise = (2.0*np.random.uniform(0, kx, m) - kx).reshape(m, 1)
Y = Y + noise # noisy target
# scaling of the target depending on the activation function
if (activation == 'sigmoid'):
Y_scaled = (Y/(1+kx) + 1)/2.0
elif (activation == 'tanh'):
Y_scaled = Y/(1+kx)
# number of the iteration for the training stage
iter_count = 20000
net.train(X, Y_scaled, iter_count) #training
# gained weights
trained_weights = net.weights
##########################################################################################
# PREDICTION
m_new = 40 #size of the prediction set
X_new = np.linspace(0, 4*math.pi, num = m_new).reshape(m_new, 1);
Y_new = net.predict(X_new, trained_weights) # prediction
#rescaling of the result
if (activation == 'sigmoid'):
Y_new = (2.0*Y_new - 1.0) * (1+kx)
elif (activation == 'tanh'):
Y_new = Y_new * (1+kx)
# visualization
plt.plot(X, Y)
plt.plot(X_new, Y_new, 'ro')
plt.show()
raw_input('press any key to exit')
UPDATE
I would like to take back the statement regarding the training method used in your code. The network can be indeed trained using only one sample per iteration. I got interesting results in online-training using both sigmoid and tanh activation functions:
Online-training using Sigmoid (cost function and prediction)
Online-training using Tanh (cost function and prediction)
As can be seen the choice of Sigmoid as activation function gives better performance. The cost function looks not that good as during the offline-training, but at least it tends to go down.
I plotted the cost function in your implementation, it looks pretty jerky as well:
May be it is a good idea to try your code with the sigmoid or even the ReLU function.
Here is the updated source code. To switch between online and offline training modes just change the method variable.
import numpy as np
import math
import matplotlib.pyplot as plt
class Neuralnet:
def __init__(self, neurons, activation):
self.weights = []
self.inputs = []
self.outputs = []
self.errors = []
self.rate = 0.2
self.activation = activation #sigmoid or tanh
self.neurons = neurons
self.L = len(self.neurons) #number of layers
eps = 0.12; #range for uniform distribution -eps..+eps
for layer in range(len(neurons)-1):
self.weights.append(np.random.uniform(-eps,eps,size=(neurons[layer+1], neurons[layer]+1)))
###################################################################################################
def train(self, X, Y, iter_count):
m = X.shape[0];
for layer in range(self.L):
self.inputs.append(np.empty([m, self.neurons[layer]]))
self.errors.append(np.empty([m, self.neurons[layer]]))
if (layer < self.L -1):
self.outputs.append(np.empty([m, self.neurons[layer]+1]))
else:
self.outputs.append(np.empty([m, self.neurons[layer]]))
#accumulate the cost function
J_history = np.zeros([iter_count, 1])
for i in range(iter_count):
self.feedforward(X)
J = self.cost(Y, self.outputs[self.L-1])
J_history[i, 0] = J
self.backpropagate(Y)
#plot the cost function to check the descent
#plt.plot(J_history)
#plt.show()
###################################################################################################
def cost(self, Y, H):
J = np.sum(np.sum(np.power((Y - H), 2), axis=0))/(2*m)
return J
###################################################################################################
def cost_online(self, min_x, max_x, iter_number):
h_arr = np.zeros([iter_number, 1])
y_arr = np.zeros([iter_number, 1])
for step in range(iter_number):
x = np.random.uniform(min_x, max_x, 1).reshape(1, 1)
self.feedforward(x)
h_arr[step, 0] = self.outputs[-1]
y_arr[step, 0] = np.sin(x)
J = np.sum(np.sum(np.power((y_arr - h_arr), 2), axis=0))/(2*iter_number)
return J
###################################################################################################
def feedforward(self, X):
m = X.shape[0];
self.outputs[0] = np.concatenate( (np.ones([m, 1]), X), axis=1)
for i in range(1, self.L):
self.inputs[i] = np.dot( self.outputs[i-1], self.weights[i-1].T )
if (self.activation == 'sigmoid'):
output_temp = self.sigmoid(self.inputs[i])
elif (self.activation == 'tanh'):
output_temp = np.tanh(self.inputs[i])
if (i < self.L - 1):
self.outputs[i] = np.concatenate( (np.ones([m, 1]), output_temp), axis=1)
else:
self.outputs[i] = output_temp
###################################################################################################
def backpropagate(self, Y):
self.errors[self.L-1] = self.outputs[self.L-1] - Y
for i in range(self.L - 2, 0, -1):
if (self.activation == 'sigmoid'):
self.errors[i] = np.dot( self.errors[i+1], self.weights[i][:, 1:] ) * self.sigmoid_prime(self.inputs[i])
elif (self.activation == 'tanh'):
self.errors[i] = np.dot( self.errors[i+1], self.weights[i][:, 1:] ) * (1 - self.outputs[i][:, 1:]*self.outputs[i][:, 1:])
for i in range(0, self.L-1):
grad = np.dot(self.errors[i+1].T, self.outputs[i]) / m
self.weights[i] = self.weights[i] - self.rate*grad
###################################################################################################
def sigmoid(self, z):
s = 1.0/(1.0 + np.exp(-z))
return s
###################################################################################################
def sigmoid_prime(self, z):
s = self.sigmoid(z)*(1 - self.sigmoid(z))
return s
###################################################################################################
def predict(self, X, weights):
m = X.shape[0];
self.inputs = []
self.outputs = []
self.weights = weights
for layer in range(self.L):
self.inputs.append(np.empty([m, self.neurons[layer]]))
if (layer < self.L -1):
self.outputs.append(np.empty([m, self.neurons[layer]+1]))
else:
self.outputs.append(np.empty([m, self.neurons[layer]]))
self.feedforward(X)
return self.outputs[self.L-1]
###################################################################################################
# MAIN PART
activation1 = 'sigmoid' #the input should be scaled into [0..1]
activation2 = 'tanh' #the input should be scaled into [-1..1]
activation = activation1
net = Neuralnet([1, 6, 1], activation) # structure of the NN and its activation function
method1 = 'online'
method2 = 'offline'
method = method1
kx = 0.1 #noise parameter
###################################################################################################
# TRAINING
if (method == 'offline'):
m = 1000 #size of the training set
X = np.linspace(0, 4*math.pi, num = m).reshape(m, 1); #input training set
Y = np.sin(X) #target
noise = (2.0*np.random.uniform(0, kx, m) - kx).reshape(m, 1)
Y = Y + noise #noisy target
#scaling of the target depending on the activation function
if (activation == 'sigmoid'):
Y_scaled = (Y/(1+kx) + 1)/2.0
elif (activation == 'tanh'):
Y_scaled = Y/(1+kx)
#number of the iteration for the training stage
iter_count = 20000
net.train(X, Y_scaled, iter_count) #training
elif (method == 'online'):
sampling_count = 100000 # number of samplings during the training stage
m = 1 #batch size
iter_count = sampling_count/m
for layer in range(net.L):
net.inputs.append(np.empty([m, net.neurons[layer]]))
net.errors.append(np.empty([m, net.neurons[layer]]))
if (layer < net.L -1):
net.outputs.append(np.empty([m, net.neurons[layer]+1]))
else:
net.outputs.append(np.empty([m, net.neurons[layer]]))
J_history = []
step_history = []
for i in range(iter_count):
X = np.random.uniform(0, 4*math.pi, m).reshape(m, 1)
Y = np.sin(X) #target
noise = (2.0*np.random.uniform(0, kx, m) - kx).reshape(m, 1)
Y = Y + noise #noisy target
#scaling of the target depending on the activation function
if (activation == 'sigmoid'):
Y_scaled = (Y/(1+kx) + 1)/2.0
elif (activation == 'tanh'):
Y_scaled = Y/(1+kx)
net.feedforward(X)
net.backpropagate(Y_scaled)
if (np.remainder(i, 1000) == 0):
J = net.cost_online(0, 4*math.pi, 1000)
J_history.append(J)
step_history.append(i)
plt.plot(step_history, J_history)
plt.title('Batch size ' + str(m) + ', rate ' + str(net.rate) + ', samples ' + str(sampling_count))
#plt.ylim([0, 0.1])
plt.show()
#gained weights
trained_weights = net.weights
##########################################################################################
# PREDICTION
m_new = 40 #size of the prediction set
X_new = np.linspace(0, 4*math.pi, num = m_new).reshape(m_new, 1);
Y_new = net.predict(X_new, trained_weights) #prediction
#rescaling of the result
if (activation == 'sigmoid'):
Y_new = (2.0*Y_new - 1.0) * (1+kx)
elif (activation == 'tanh'):
Y_new = Y_new * (1+kx)
#visualization
#fake sine curve to show the ideal signal
if (method == 'online'):
X = np.linspace(0, 4*math.pi, num = 100)
Y = np.sin(X)
plt.plot(X, Y)
plt.plot(X_new, Y_new, 'ro')
if (method == 'online'):
plt.title('Batch size ' + str(m) + ', rate ' + str(net.rate) + ', samples ' + str(sampling_count))
plt.ylim([-1.5, 1.5])
plt.show()
raw_input('press any key to exit')
Now I have some remarks to your current code:
Your sine function looks like this:
def sine_example():
net = Neuralnet([1, 6, 1])
for step in range(100000):
x = np.random.normal()
net.feedforward([x])
net.backpropagate([np.tanh(np.sin(x))])
net.feedforward([3])
print(net.outputs[-1])
I don't know why you use tanh in your target input. If you really want to use tanh of sine as target, you need to scale it to [-1..1], because tanh(sin(x)) returns values in range [-0.76..0.76].
The next thing is the range of your training set. You use x = np.random.normal() to generate the samples. Here is the distribution of such an input:
After it you want your network to predict the sine of 3, but the network has almost never seen this number during the training stage. I would use the uniform distribution in a wider range for sample generation instead.

Related

Scipy fails to minimize cost function

Currently I'm learning from Andrew Ng course on Coursera called "Machine Learning". In exercise 5, we built a model that can predict digits, trained by the MNIST dataset. This task was completed successfully in Matlab by me, but I wanted to migrate that code to Python, just to see how different things are and maybe continue to play around with the model.
I managed to implement the cost function and the back propagation algorithm correctly. I know that because I compared the metrics with my working model in Matlab and it emits the same numbers.
Now, because in the course we train the model using fmincg, I tried to do the same using Scipy fmin_cg
function.
My problem is, the cost function takes extra small steps and fails to converge.
Here is my code for the network:
import numpy as np
import utils
import scipy.optimize as op
class Network:
def __init__(self, layers):
self.layers = layers
self.weights = self.generate_params()
# Function for generating theta multidimensional matrix
def generate_params(self):
theta = []
epsilon = 0.12
for i in range(len(self.layers) - 1):
current_layer_units = self.layers[i]
next_layer_units = self.layers[i + 1]
theta_i = np.multiply(
np.random.rand(next_layer_units, current_layer_units + 1),
2 * epsilon - epsilon
)
# Appending the params to the theta matrix
theta.append(theta_i)
return theta
# Function to append bias row/column to matrix X
def append_bias(self, X, d):
m = X.shape[0]
n = 1 if len(X.shape) == 1 else X.shape[1]
if (d == 'column'):
ones = np.ones((m, n + 1))
ones[:, 1:] = X.reshape((m, n))
elif (d == 'row'):
ones = np.ones((m + 1, n))
ones[1:, :] = X.reshape((m, n))
return ones
# Function for computing the gradient for 1 training example
def back_prop(self, y, feed, theta):
activations = feed["activations"]
weighted_layers = feed["weighted_layers"]
delta_output = activations[-1] - y.reshape(len(y), 1)
current_delta = delta_output
# Initializing gradients
gradients = []
for i, theta_i in enumerate(theta):
gradients.append(np.zeros(theta_i.shape))
# Peforming delta calculations.
# Here, we continue to propagate the delta values backwards
# until we arrive to the second layer.
for i in reversed(range(len(theta))):
theta_i = theta[i]
if (i > 0):
i_weighted_inputs = self.append_bias(weighted_layers[i - 1], 'row')
t_theta_i = np.transpose(theta_i)
delta_i = np.multiply(np.dot(t_theta_i, current_delta), utils.sigmoidGradient(i_weighted_inputs))
delta_i = delta_i[1:]
gradients[i] = current_delta * np.transpose(activations[i])
# Setting current delta for the next layer
current_delta = delta_i
else:
gradients[i] = current_delta * np.transpose(activations[i])
return gradients
# Function for computing the cost and the derivatives
def compute_cost(self, theta, X, y, r12n = 0):
m = len(X)
num_labels = self.layers[-1]
costs = np.zeros(m)
# Initializing gradients
gradients = []
for i, theta_i in enumerate(theta):
gradients.append(np.zeros(theta_i.shape))
# Iterating over the training set
for i in range(m):
inputs = X[i]
observed = utils.create_output_vector(y[i], num_labels)
feed = self.feed_forward(inputs)
predicted = feed["activations"][-1]
total_cost = 0
for k, o in enumerate(observed):
if (o == 1):
total_cost += np.log(predicted[k])
else:
total_cost += np.log(1 - predicted[k])
cost = -1 * total_cost
# Storing the cost for the i-th training example
costs[i] = cost
# Calculating the gradient for this training example
# using back propagation algorithm
gradients_i = self.back_prop(observed, feed, theta)
for i, gradient in enumerate(gradients_i):
gradients[i] += gradient
# Calculating the avg regularization term for the cost
sum_of_theta = 0
for i, theta_i in enumerate(theta):
squared_theta = np.power(theta_i[:, 1:], 2)
sum_of_theta += np.sum(squared_theta)
r12n_avg = r12n * sum_of_theta / (2 * m)
total_cost = np.sum(costs) / m + r12n_avg
# Applying regularization terms to the gradients
for i, theta_i in enumerate(theta):
lambda_i = np.copy(theta_i)
lambda_i[:, 0] = 0
lambda_i = np.multiply((r12n / m), lambda_i)
# Adding the r12n matrix to the gradient
gradients[i] = gradients[i] / m + lambda_i
return total_cost, gradients
# Function for training the neural network using conjugate gradient algorithm
def train_cg(self, X, y, r12n = 0, iterations = 50):
weights = self.weights
def Cost(theta, X, y):
theta = utils.roll_theta(theta, self.layers)
cost, _ = self.compute_cost(theta, X, y, r12n)
print(cost);
return cost
def Gradient(theta, X, y):
theta = utils.roll_theta(theta, self.layers)
_, gradient = self.compute_cost(theta, X, y, r12n)
return utils.unroll_theta(gradient)
unrolled_theta = utils.unroll_theta(weights)
result = op.fmin_cg(f = Cost,
x0 = unrolled_theta,
args=(X, y),
fprime=Gradient,
maxiter = iterations)
self.weights = utils.roll_theta(result, self.layers)
# Function for feeding forward the network
def feed_forward(self, X):
# Useful variables
activations = []
weighted_layers = []
weights = self.weights
currentActivations = self.append_bias(X, 'row')
activations.append(currentActivations)
for i in range(len(self.layers) - 1):
layer_weights = weights[i]
weighted_inputs = np.dot(layer_weights, currentActivations)
# Storing the weighted inputs
weighted_layers.append(weighted_inputs)
activation_nodes = []
# If the next layer is not the output layer, we'd like to add a bias unit to it
# (Excluding the input and the output layer)
if (i < len(self.layers) - 2):
activation_nodes = self.append_bias(utils.sigmoid(weighted_inputs), 'row')
else:
activation_nodes = utils.sigmoid(weighted_inputs)
# Appending the layer of nodes to the activations array
activations.append(activation_nodes)
currentActivations = activation_nodes
data = {
"activations": activations,
"weighted_layers": weighted_layers
}
return data
def predict(self, X):
data = self.feed_forward(X)
output = data["activations"][-1]
# Finding the max index in the output layer
return np.argmax(output, axis=0)
Here is the invocation of the code:
import numpy as np
from network import Network
# %% Load data
X = np.genfromtxt('data/mnist_data.csv', delimiter=',')
y = np.genfromtxt('data/mnist_outputs.csv', delimiter=',').astype(int)
# %% Create network
num_labels = 10
input_layer = 400
hidden_layer = 25
output_layer = num_labels
layers = [input_layer, hidden_layer, output_layer]
# Create a new neural network
network = Network(layers)
# %% Train the network and save the weights
network.train_cg(X, y, r12n = 1, iterations = 20)
This is what the code emits after each iteration:
15.441233231650283
15.441116436313076
15.441192262452514
15.44122384651483
15.441231216030646
15.441232804294314
15.441233141284435
15.44123321255294
15.441233227614855
As you can see, the changes to the cost are very small.
I checked for the shapes of the vectors and gradient and they both seem fine, just like in my Matlab implementation. I'm not sure what I do wrong here.
If you guys could help me, that'd be great :)

What can I do to improve the results I have with linear outputs?

With sigmoid activation function on my output the network learns the task much better than with linear activation function.
I am using L2 regularization with my cost function, I have a learning rate and momentum term yet it learns better with the sigmoid activation function.
What can I do to improve the results?
import csv
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, r2_score, mean_squared_error
import numpy as np
import numpy.random as r
import matplotlib.pyplot as plt
import random
random.seed(25)
def readcsv(filename):
ifile = open(filename, "rU")
reader = csv.reader(ifile, delimiter=",")
rownum = 0
dataset = []
for row in reader:
dataset.append(row)
rownum += 1
data = []
for s in dataset:
Dataset = [float(i) for i in s]
data.append(Dataset)
return [data, rownum]
def sigmoid(x):
return 1 / (1 + np.exp(-x))
def sigmoid_deriv(x):
return x * (1 - x)
def initialise_weights(nn_structure):
weights = {}
bias = {}
c_weights = {1: np.ones((nn_structure[1], nn_structure[1]))}
context = {0: r.random_sample((nn_structure[1]))}
for l in range(1, len(nn_structure)):
q = []
for j in range(1, nn_structure[l] + 1):
w = [random.uniform(-0.09, 0.09) for i in range(nn_structure[l-1])]
q.append(w)
weights[l] = np.array(q)
bias[l] = r.random_sample((nn_structure[l],))
print(weights)
return weights, bias, c_weights, context
def initialise_weights_changes(nn_structure):
deltaweights = {}
deltabias = {}
deltac_weights = {1: np.zeros((nn_structure[1], nn_structure[1]))}
for l in range(1, len(nn_structure)):
deltaweights[l] = np.zeros((nn_structure[l], nn_structure[l-1]))
deltabias[l] = np.zeros((nn_structure[l],))
return deltaweights, deltabias, deltac_weights
def feed_forward(x, weights, bias, c_weights, context, i, hidden_layer):
hidden_layer[i] = {1: x}
activations = {}
for l in range(1, len(weights) + 1):
node_in = hidden_layer[i][l]
if l == 1:
activations[l+1] = sigmoid(weights[l].dot(node_in) + c_weights[l].dot(context[i]) + bias[l])
else:
activations[l+1] = weights[l].dot(node_in) + bias[l]
hidden_layer[i][l+1] = activations[l+1]
if l == 1:
context[i + 1] = hidden_layer[i][l+1]
return hidden_layer, activations
def calculate_out_layer_delta(y, hidden_layer):#, activations):
return -(y - hidden_layer)
def calculate_hidden_delta(delta_plus_1, weights_l):#, z_l):
return np.dot(np.transpose(weights_l), delta_plus_1)
def train_nn(nn_structure, X, y, iter_num=1000, alpha=0.6, momentum = 0.4, bptt = 5, reg = 0.0000009):
weights, bias, c_weights, context = initialise_weights(nn_structure)
cnt = 0
m = len(y)
avg_cost_func = []
print('Starting gradient descent for {} iterations'.format(iter_num))
while cnt < iter_num:
if cnt%1000 == 0:
print('Iteration {} of {}'.format(cnt, iter_num))
deltaweights, deltabias, deltac_weights = initialise_weights_changes(nn_structure)
avg_cost = 0
hidden_layer = {}
delta = {}
bp = []
for i in range(len(y)):
# perform the feed forward pass and return the stored h and z values, to be used in the gradient descent step
hidden_layer, activations = feed_forward(X[i], weights, bias, c_weights, context, i, hidden_layer)
bp.append(i)
# loop from nl-1 to 1 backpropagating the errors
if len(bp) == bptt:
for j in reversed(bp):
delta[j] = {}
if j == bp[-1]:
for l in range(len(nn_structure), 0, -1):
if l == len(nn_structure):
delta[j][l] = calculate_out_layer_delta(y[j], hidden_layer[j][l])
avg_cost += mean_squared_error(y[j] , hidden_layer[j][l]) + 0.5 * reg * np.linalg.norm(np.power(weights[l - 1], 2)) + 0.5 * reg * np.linalg.norm(np.power(weights[l - 2], 2))
else:
if l > 1:
delta[j][l] = calculate_hidden_delta(delta[j][l+1], weights[l]) * sigmoid_deriv(hidden_layer[j][l])
else:
for l in range(len(nn_structure), 0, -1):
if l == len(nn_structure):
delta[j][l] = calculate_out_layer_delta(y[j], hidden_layer[j][l])
avg_cost += np.linalg.norm((y[j] - hidden_layer[j][l]))
else:
if l > 1:
delta[j][l] = calculate_hidden_delta(delta[j][l + 1], weights[l]) + calculate_hidden_delta(delta[j + 1][l], c_weights[1]) * sigmoid_deriv(hidden_layer[j][l])
for l in range(len(nn_structure) - 1, 0, -1):
deltaweights[l] = (-alpha * np.dot(delta[j][l + 1][:, np.newaxis], np.transpose(hidden_layer[j][l][:, np.newaxis]))) + (momentum * deltaweights[l]) + (reg / bptt * weights[l])
deltabias[l] = (-alpha * delta[j][l + 1]) + ((momentum * deltabias[l])) + (reg / bptt * bias[l])
if l == 1:
deltac_weights[l] += (-alpha * np.dot(delta[j][1 + 1][:, np.newaxis], np.transpose(context[j][:, np.newaxis]))) + (momentum * deltac_weights[l]) + (reg / bptt * c_weights[l])
# perform the gradient descent step for the weights in each layer
for l in range(len(nn_structure) - 1, 0, -1):
weights[l] += (1 / bptt * deltaweights[l]) - (reg / bptt * weights[l])
bias[l] += (1 / bptt * deltabias[l]) - (reg / bptt * bias[l])
if l == 1:
c_weights[l] += (1 / bptt * deltac_weights[l]) - (reg / bptt * c_weights[l])
bp = []
deltaweights, deltabias, deltac_weights = initialise_weights_changes(nn_structure)
# complete the average cost calculation
if cnt % 500 == 0:
print(weights)
avg_cost = 1.0 / m * avg_cost
if cnt % 1000 == 0:
print('Error', avg_cost)
avg_cost_func.append(avg_cost)
cnt += 1
alpha = alpha - (alpha/iter_num)
return weights, bias, avg_cost_func, c_weights, context
def predict_y(weights, bias, X, c_weights, context):
m = X.shape[0]
y = np.zeros((m,))
for i in range(m):
hidden_layer = {1: X[i]}
for l in range(1, len(weights) + 1):
node_in = hidden_layer[l]
if l == 1:
activations = weights[l].dot(node_in) + c_weights[l].dot(context[l]) + bias[l]
else:
activations = weights[l].dot(node_in) + bias[l]
hidden_layer[l + 1] = sigmoid(activations)
y[i] = hidden_layer[3]
return y
if __name__ == "__main__":
# load data and scale
filename = 'C:/Users/n0762538/Documents/Data/MackeyGlass/MackeyGlass.csv'
dataset, rownum = readcsv(filename)
#np.random.shuffle(dataset)
dataset = np.array(dataset)
# define data
no = int(0.70 * len(dataset))
train_data = dataset[0:no]
test_data = dataset[no:-1]
train_output = dataset[1:no + 1]
test_output = dataset[no + 1:]
X = train_data
y = train_output
# setup the NN structure
nn_structure = [len(dataset[0]), 3, len(dataset[0])]
# train the NN
weights, bias, avg_cost_func, c_weights, context = train_nn(nn_structure, X, y)
# plot the avg_cost_func
plt.plot(avg_cost_func)
plt.ylabel('Average J')
plt.xlabel('Iteration number')
plt.show()
# get the prediction accuracy and print
#print(weights)
#print('test:', test_output)
y_pred = predict_y(weights, bias, test_data, c_weights, context)
print('Prediction accuracy is {}%'.format(r2_score(test_output, y_pred) * 100))
#plt.plot(train_data)
plt.plot(y_pred)
plt.plot(test_output)
plt.title('Approach 1')
plt.ylabel('Predicted')
plt.xlabel('Iteration number')
plt.show()
By results, if you mean Error Rate or some Evaluation Metric Improvement. Here maybe some guidelines.
Let us make it clear that Error = 100 - Evaluation Metric. If your accuracy is 99%, that means your Error is 1% (GOOD). If your accuracy is 85%, your error is 15% (NOT TOO GOOD. IT DEPENDS)
If your Train Error is very high (Test Error = 16%, Train Error = 15%): You have a problem with BIAS. You need to get a BIGGER/DEEPER neural network with more layers, train using gradient descent for more number of iterations or just try a different neural network architecture (CNN, RNN).
If your Test Error is much higher than your Train Error (Test Error = 11%, Train Error = 1%): It means you have a VARIANCE problem. You need to get more training data, and apply regularization or just try a different neural network architecture (CNN, RNN).
If both Train Error as well as Test Error is high (Train Error: 15%, Test Error: 16%, then you have both HIGH BIAS and HIGH VARIANCE, you need to do both of what I told you above.
To see if you have a BIAS problem, look at the Training Set Performance. To see if you have a VARIANCE problem, look at your Dev Set/Test Set Performance. More importantly, You need to understand, Bayes Error or Human Level Performance. If 15% is Bayes Error, meaning that NO MACHINE LEARNING ALGORITHM can do better than 15%, and if your Training Error is 16%, it means there is very little room for improvement in terms of fixing BIAS.

Trouble implementing softmax activation and cross-entropy loss, and their derivatives in a neural net

I have implemented a simple multi-layer perceptron (with just 1 hidden layer) which can learn regression problems. I have written it so that the choice between sigmoid, tanh and relu activations can be specified. The squared error is then implemented as the loss function with each of these.
I now want to allow the choice to use the same model to learn multi-class classification problems, and so would like to implement the choice to use the softmax activation along with the cross-entropy loss. In my code below, the only changes that would need to be made (I hope) is to implement these in the activation() and loss() functions, and this should then work out of the box in both the forward pass and the backprop. This code runs a simulation of my model learning the XOR function, where the chosen activation function should be uncommented at the top.
However, I am really lost with implementing both of these functions, and even more so their derivatives. Any help and guidance is appreciated.
import sys
import numpy as np
activation = 'sigmoid'
# activation = 'tanh'
# activation = 'relu'
# activation = 'softmax'
numEpochs = 10000
class DataSet:
def __init__(self, data, trainSplit=1):
self.size = len(data)
self.trainSize = int(self.size * trainSplit)
self.testSize = self.size - self.trainSize
self.inputs, self.labels = [], []
for i in range(len(data)):
self.inputs.append(data[i][0])
self.labels.append(data[i][1])
self.trainInputs = self.inputs[:self.trainSize]
self.trainLabels = self.labels[:self.trainSize]
self.testInputs = self.inputs[self.trainSize:]
self.testLabels = self.labels[self.trainSize:]
try:
self.numInputs = len(self.inputs[0])
except TypeError:
self.numInputs = 1
try:
self.numOutputs = len(self.labels[0])
except TypeError:
self.numOutputs = 1
class MLP:
def __init__(self, numInputs, numHidden, numOutputs, activationFunction):
# MLP architecture sizes
self.numInputs = numInputs
self.numHidden = numHidden
self.numOutputs = numOutputs
self.activationFunction = activationFunction.lower()
# MLP weights
self.IH_weights = np.random.rand(numInputs, numHidden) # Input -> Hidden
self.HO_weights = np.random.rand(numHidden, numOutputs) # Hidden -> Output
# MLP biases
self.IH_bias = np.zeros((1, numHidden))
self.HO_bias = np.zeros((1, numOutputs))
# Gradients corresponding to weight matrices computed during backprop
self.IH_w_gradients = np.zeros_like(self.IH_weights)
self.HO_w_gradients = np.zeros_like(self.HO_weights)
# Gradients corresponding to biases computed during backprop
self.IH_b_gradients = np.zeros_like(self.IH_bias)
self.HO_b_gradients = np.zeros_like(self.HO_bias)
# Input, hidden and output layer neuron values
self.I = np.zeros(numInputs) # Inputs
self.L = np.zeros(numOutputs) # Labels
self.H = np.zeros(numHidden) # Hidden
self.O = np.zeros(numOutputs) # Output
def activation(self, x, derivative=False):
if self.activationFunction == 'sigmoid':
if derivative:
return x * (1 - x)
return 1 / (1 + np.exp(-x))
if self.activationFunction == 'tanh':
if derivative:
return 1 - np.tanh(x) ** 2
return np.tanh(x)
if self.activationFunction == 'relu':
if derivative:
return (x > 0).astype(float)
return np.maximum(0, x)
# TO DO ################################################################
if self.activationFunction == 'softmax':
if derivative:
return 0
return 0
print("ERROR: Activation function not found.")
sys.exit()
def loss(self, labels, predictions, derivative=False):
# TO DO ################################################################
# Cross-Entropy
if self.activationFunction == 'softmax':
if derivative:
return 0
return 0
# Squared Error
else:
if derivative:
return (-2 * labels) + (2 * predictions)
return (labels - predictions) ** 2
def forward(self, inputs):
# Ensure that inputs is a list
try:
len(inputs)
except TypeError:
inputs = [inputs]
self.I = np.array(inputs).reshape(1, self.numInputs)
self.H = self.I.dot(self.IH_weights) + self.IH_bias
self.H = self.activation(self.H)
self.O = self.H.dot(self.HO_weights) + self.HO_bias
self.O = self.activation(self.O)
def backwards(self, labels):
# Ensure that labels is a list
try:
len(labels)
except TypeError:
labels = [labels]
self.L = np.array(labels)
self.O_error = self.loss(self.O, self.L)
self.O_delta = self.loss(self.O, self.L, derivative=True) * self.activation(self.O, derivative=True)
self.H_error = self.O_delta.dot(self.HO_weights.T)
self.H_delta = self.H_error * self.activation(self.H, derivative=True)
self.IH_w_gradients += self.I.T.dot(self.H_delta)
self.HO_w_gradients += self.H.T.dot(self.O_delta)
self.IH_b_gradients += self.H_delta
self.HO_b_gradients += self.O_delta
return self.O_error
def updateWeights(self, learningRate):
self.IH_weights += learningRate * self.IH_w_gradients
self.HO_weights += learningRate * self.HO_w_gradients
self.IH_bias += learningRate * self.IH_b_gradients
self.HO_bias += learningRate * self.HO_b_gradients
self.IH_w_gradients = np.zeros_like(self.IH_weights)
self.HO_w_gradients = np.zeros_like(self.HO_weights)
self.IH_b_gradients = np.zeros_like(self.IH_bias)
self.HO_b_gradients = np.zeros_like(self.HO_bias)
def process(self, data, train=False, learningRate=0):
if train:
size = data.trainSize
inputs = data.trainInputs
labels = data.trainLabels
else:
size = data.testSize
inputs = data.testInputs
labels = data.testLabels
errors = []
for i in range(size):
self.forward(inputs[i])
errors.append(self.backwards(labels[i]))
if train:
self.updateWeights(learningRate)
return np.mean(errors)
data1 = DataSet([
[[0, 0], 0],
[[0, 1], 1],
[[1, 0], 1],
[[1, 1], 0]
])
data2 = DataSet([
[[0, 0], -1],
[[0, 1], 1],
[[1, 0], 1],
[[1, 1], -1]
])
data3 = DataSet([
[[0, 0], [1, 0]],
[[0, 1], [0, 1]],
[[1, 0], [0, 1]],
[[1, 1], [1, 0]]
])
if activation == 'sigmoid':
data = data1
mlp = MLP(data.numInputs, 2, data.numOutputs, 'sigmoid')
learningRate = 1
if activation == 'tanh':
data = data2
mlp = MLP(data.numInputs, 2, data.numOutputs, 'tanh')
learningRate = 0.1
if activation == 'relu':
data = data1
mlp = MLP(data.numInputs, 2, data.numOutputs, 'relu')
learningRate = 0.001
if activation == 'softmax':
data = data3
mlp = MLP(data.numInputs, 2, data.numOutputs, 'softmax')
learningRate = 0.01
################################################################################
# TO DO: UPDATE WEIGHTS AT INTERVALS, NOT EVERY EPOCH
################################################################################
losses = []
for epoch in range(numEpochs):
epochLoss = mlp.process(data, train=True, learningRate=learningRate)
losses.append(epochLoss)
if epoch % 1000 == 0 or epoch == numEpochs - 1:
print("EPOCH:", epoch)
print("LOSS: ", epochLoss, "\n")
Unfortunately, softmax is not as easy as the other activation functions you have posted. For the activation function, you must calculate the exp(y_i) and then divide by the sum exp(y_k) for every y_k in Y. For the derivative, you must calculate every combination (n^2 combinations) of partial derivatives of every output wrt every input of the neuron. Luckily, the loss it is something a little bit easier to understand, since you can think about the softmax giving you some probabilities (so it resembles a probability distribution) and you calculate the Cross Entropy as is between the returned values and the target ones.

Why does cost function for MLP flatten?

I am very new to machine learning and am trying to implement an MLP however the cost function seems to be reaching a local minimum before reaching the global minimum. I plotted the cost as a function of iteration (including a 0 value as to not be fooled by where the y-axis starts). Here is the code that I am using at my attempt:
import numpy as np
class NNet(object):
def __init__(self, n_in, n_hidden, n_out):
self.n_in = n_in
self.n_hidden = n_hidden
self.n_out = n_out
self.W1 = np.random.randn(n_in, n_hidden)
self.W2 = np.random.randn(n_hidden, n_out)
self.b1 = np.random.randn(n_hidden,)
self.b2 = np.random.randn(n_out,)
def sigmoid(self, z):
return 1/(1 + np.exp(-z))
def sig_prime(self, z):
return (np.exp(-z))/((1+np.exp(-z))**2)
def propagate_forward(self, X):
self.z1 = np.dot(self.W1.T, X) + self.b1
self.a1 = self.sigmoid(self.z1)
self.z2 = np.dot(self.W2.T, self.a1) + self.b2
self.a2 = self.sigmoid(self.z2)
return self.a2
def cost(self, y, y_hat):
return np.mean([np.sum((y[i] - y_hat[i])**2) for i in range(y.shape[0])])/2
def cost_grad(self, X, y):
y_hat = self.propagate_forward(X)
d2 = np.multiply(self.sig_prime(self.z2), -(y - y_hat))
gJ_W2 = np.matrix(np.multiply(self.a1.T, d2))
d1 = np.dot(self.W2, d2)*self.sig_prime(self.z1)
gJ_W1 = np.dot(np.matrix(X).T, np.matrix(d1))
return [gJ_W1, d1, gJ_W2, d2]
m = 1000
n = 1
X = np.zeros((m, n))
y = np.zeros((m,1))
import random
import math
i = 0
for r, theta in zip(np.linspace(0, 5, num=m), np.linspace(0, 8 * math.pi, num=m)):
r += random.random()
X[i] = [r * math.cos(theta), r * math.sin(theta)]
if i < 333:
y[i] = 0
elif i < 666:
y[i] = 1
else:
y[i] = 2
i += 1
nnet = NNet(n, 5, 1)
learning_rate = 0.2
improvement_threshold = 0.995
cost = np.inf
xs = []
ys = []
iter = 0
while cost > 0.2:
cost = nnet.cost(y, [nnet.propagate_forward(x_train) for x_train
if iter % 100 == 0:
xs.append(iter)
ys.append(cost)
print("Cost", cost)
if iter >= 1000:
print("Gradient descent is taking too long, giving up.")
break
cost_grads = [nnet.cost_grad(x_train, y_train) for x_train, y_train in zip(X, y)]
gW1 = [grad[0] for grad in cost_grads]
gb1 = [grad[1] for grad in cost_grads]
gW2 = [grad[2] for grad in cost_grads]
gb2 = [grad[3] for grad in cost_grads]
nnet.W1 -= np.mean(gW1, axis=0)/2 * learning_rate
nnet.b1 -= np.mean(gb1, axis=0)/2 * learning_rate
nnet.W2 -= np.mean(gW2, axis=0).T/2 * learning_rate
nnet.b2 -= np.mean(gb2, axis=0)/2 * learning_rate
iter += 1
Why is the cost not improving after a certain point? Also any other tips are highly appreciated.
The generated toy dataset looks like this
Your goal seems to be to predict to which class {0,1,2} belongs the data.
The output of your net is a sigmoid (sigm(x) in [0,1]) and you're
training using mean squared error (MSE), it's impossible for the model to predict a value above 1. So it's always wrong when the class to predict is 2.
The cost probably flattens because your sigmoid unit saturate (when trying to predict 2) and the gradient for saturating sigmoid is 0
For classification neural net normally end with a softmax layer and
are trained using cross-entropy.
If you want to keep using MSE and sigmoids unit for classification, you should consider predicting only two classes at a time in a One-vs-(One/All) kinda way.
Anyway, if you only do bi-class classification by rounding output to 0 or 1,it seems to work. Cost is decreasing and accuracy rising (quickly modified code):

Implement bias neurons neural network

I implemented bias units for my neural network with gradient descent. But I'm not 100% sure If I've implemented it the right way. Would be glade if you can quickly look through my code. Only the parts with
if bias:
are important.
And my second question:
Shouldn't the derivate of the softmax function be 1-x, because x is the output of the softmax function?
I tried my net with 1-x but its performance was worse.
Every help is appreciated.
Thanks in advance.
import numpy as np
import pickle
import time
import math
class FeedForwardNetwork():
def __init__(self, input_dim, hidden_dim, output_dim, dropout=False, dropout_prop=0.5, bias=False):
np.random.seed(1)
self.input_layer = np.array([])
self.hidden_layer = np.array([])
self.output_layer = np.array([])
self.hidden_dim = hidden_dim
self.dropout = dropout
self.dropout_prop = dropout_prop
self.bias = bias
r_input_hidden = math.sqrt(6 / (input_dim + hidden_dim))
r_hidden_output = math.sqrt(6 / (hidden_dim + output_dim))
#self.weights_input_hidden = np.random.uniform(low=-r_input_hidden, high=r_input_hidden, size=(input_dim, hidden_dim))
#self.weights_hidden_output = np.random.uniform(low=-r_hidden_output, high=r_hidden_output, size=(hidden_dim, output_dim))
self.weights_input_hidden = np.random.uniform(low=-0.01, high=0.01, size=(input_dim, hidden_dim))
self.weights_hidden_output = np.random.uniform(low=-0.01, high=0.01, size=(hidden_dim, output_dim))
self.validation_data = np.array([])
self.validation_data_solution = np.array([])
self.velocities_input_hidden = np.zeros(self.weights_input_hidden.shape)
self.velocities_hidden_output = np.zeros(self.weights_hidden_output.shape)
if bias:
self.weights_bias_hidden = np.random.uniform(low=-0.01, high=0.01, size=((1, hidden_dim)))
self.weights_bias_output = np.random.uniform(low=-0.01, high=0.01, size=((1, output_dim)))
self.velocities_bias_hidden = np.zeros(self.weights_bias_hidden.shape)
self.velocities_bias_output = np.zeros(self.weights_bias_output.shape)
def _tanh(self, x, deriv=False):
#The derivate is: 1-np.tanh(x)**2; Because x is already the output of tanh(x) 1-x*x is the correct derivate.
if not deriv:
return np.tanh(x)
return 1-x*x
def _softmax(self, x, deriv=False):
if not deriv:
return np.exp(x) / np.sum(np.exp(x), axis=0)
return 1 - np.exp(x) / np.sum(np.exp(x), axis=0)
def set_training_data(self, training_data_input, training_data_target, validation_data_input=None, validation_data_target=None):
"""Splits the data up into training and validation data with a ratio of 0.85/0.15 if no validation data is given.
Sets the data for training."""
if len(training_data_input) != len(training_data_target):
raise ValueError(
'Number of training examples and'
' training targets does not match!'
)
if (validation_data_input is None) and (validation_data_target is None):
len_training_data = int((len(training_data_input)/100*85//1))
self.input_layer = training_data_input[:len_training_data]
self.output_layer = training_data_target[:len_training_data]
self.validation_data = training_data_input[len_training_data:]
self.validation_data_solution = training_data_target[len_training_data:]
else:
self.input_layer = training_data_input
self.output_layer = training_data_target
self.validation_data = validation_data_input
self.validation_data_solution = validation_data_target
def save(self, filename):
"""Saves the weights into a pickle file."""
with open(filename, "wb") as network_file:
pickle.dump(self.weights_input_hidden, network_file)
pickle.dump(self.weights_hidden_output, network_file)
def load(self, filename):
"""Loads network weights from a pickle file."""
with open(filename, "rb") as network_file:
weights_input_hidden = pickle.load(network_file)
weights_hidden_output = pickle.load(network_file)
if (
len(weights_input_hidden) != len(self.weights_input_hidden)
or len(weights_hidden_output) != len(self.weights_hidden_output)
):
raise ValueError(
'File contains weights that does not'
' match the current networks size!'
)
self.weights_input_hidden = weights_input_hidden
self.weights_hidden_output = weights_hidden_output
def measure_error(self, input_data, output_data):
return 1/2 * np.sum((output_data - self.forward_propagate(input_data))**2)
#return np.sum(np.nan_to_num(-output_data*np.log(self.forward_propagate(input_data))-(1-output_data)*np.log(1-self.forward_propagate(input_data))))
def forward_propagate(self, input_data, dropout=False):
"""Proceds the input data from input neurons up to output neurons and returns the output layer.
If dropout is True some of the neurons are randomly turned off."""
input_layer = input_data
self.hidden_layer = self._tanh(np.dot(input_layer, self.weights_input_hidden))
if self.bias:
self.hidden_layer += self.weights_bias_hidden
if dropout:
self.hidden_layer *= np.random.binomial([np.ones((len(input_data),self.hidden_dim))],1-self.dropout_prop)[0] * (1.0/(1-self.dropout_prop))
if self.bias:
return self._softmax((np.dot(self.hidden_layer, self.weights_hidden_output) + self.weights_bias_output).T).T
else:
return self._softmax(np.dot(self.hidden_layer, self.weights_hidden_output).T).T
#return self._softmax(output_layer.T).T
def back_propagate(self, input_data, output_data, alpha, beta, momentum):
"""Calculates the difference between target output and output and adjusts the weights to fit the target output better.
The parameter alpha is the learning rate.
Beta is the parameter for weight decay which penaltizes large weights."""
sample_count = len(input_data)
output_layer = self.forward_propagate(input_data, dropout=self.dropout)
output_layer_error = output_layer - output_data
output_layer_delta = output_layer_error * self._softmax(output_layer, deriv=True)
print("Error: ", np.mean(np.abs(output_layer_error)))
#How much did each hidden neuron contribute to the output error?
#Multiplys delta term with weights
hidden_layer_error = output_layer_delta.dot(self.weights_hidden_output.T)
#If the prediction is good, the second term will be small and the change will be small
#Ex: target: 1 -> Slope will be 1 so the second term will be big
hidden_layer_delta = hidden_layer_error * self._tanh(self.hidden_layer, deriv=True)
#The both lines return a matrix. A row stands for all weights connected to one neuron.
#E.g. [1, 2, 3] -> Weights to Neuron A
# [4, 5, 6] -> Weights to Neuron B
hidden_weights_gradient = input_data.T.dot(hidden_layer_delta)/sample_count
output_weights_gradient = self.hidden_layer.T.dot(output_layer_delta)/sample_count
velocities_input_hidden = self.velocities_input_hidden
velocities_hidden_output = self.velocities_hidden_output
self.velocities_input_hidden = velocities_input_hidden * momentum - alpha * hidden_weights_gradient
self.velocities_hidden_output = velocities_hidden_output * momentum - alpha * output_weights_gradient
#Includes momentum term and weight decay; The weight decay parameter is beta
#Weight decay penalizes large weights to prevent overfitting
self.weights_input_hidden += -velocities_input_hidden * momentum + (1 + momentum) * self.velocities_input_hidden
- alpha * beta * self.weights_input_hidden / sample_count
self.weights_hidden_output += -velocities_hidden_output * momentum + (1 + momentum) * self.velocities_hidden_output
- alpha * beta * self.weights_hidden_output / sample_count
if self.bias:
velocities_bias_hidden = self.velocities_bias_hidden
velocities_bias_output = self.velocities_bias_output
hidden_layer_delta = np.sum(hidden_layer_delta, axis=0)
output_layer_delta = np.sum(output_layer_delta, axis=0)
self.velocities_bias_hidden = velocities_bias_hidden * momentum - alpha * hidden_layer_delta
self.velocities_bias_output = velocities_bias_output * momentum - alpha * output_layer_delta
self.weights_bias_hidden += -velocities_bias_hidden * momentum + (1 + momentum) * self.velocities_bias_hidden
- alpha * beta * self.weights_bias_hidden / sample_count
self.weights_bias_output += -velocities_bias_output * momentum + (1 + momentum) * self.velocities_bias_output
- alpha * beta * self.weights_bias_output / sample_count
def batch_train(self, epochs, alpha, beta, momentum, patience=10):
"""Trains the network in batch mode that means the weights are updated after showing all training examples.
alpha is the learning rate and patience is the number of epochs that the validation error is allowed to increase before aborting.
Beta is the parameter for weight decay which penaltizes large weights."""
#The weight decay parameter is beta
validation_error = self.measure_error(self.validation_data, self.validation_data_solution)
for epoch in range(epochs):
self.back_propagate(self.input_layer, self.output_layer, alpha, beta, momentum)
validation_error_new = self.measure_error(self.validation_data, self.validation_data_solution)
if validation_error_new < validation_error:
validation_error = validation_error_new
else:
patience -= 1
if patience == 0:
print("Abort Training. Overfitting has started! Epoch: {0}. Error: {1}".format(epoch, validation_error_new))
return
print("Epoch: {0}, Validation Error: {1}".format(epoch, validation_error))
self.save("Network_Mnist.net")
def mini_batch_train(self, batch_size, epochs, alpha, beta, momentum, patience=10):
"""Trains the network in mini batch mode, that means the weights are updated after showing only a bunch of training examples.
alpha is the learning rate and patience is the number of epochs that the validation error is allowed to increase before aborting."""
validation_error = self.measure_error(self.validation_data, self.validation_data_solution)
sample_count = len(self.input_layer)
epoch_counter = 0
for epoch in range(0, epochs*batch_size, batch_size):
epoch_counter += 1
self.back_propagate(self.input_layer[epoch%sample_count:(epoch%sample_count)+batch_size],
self.output_layer[epoch%sample_count:(epoch%sample_count)+batch_size], alpha, beta, momentum)
validation_error_new = self.measure_error(self.validation_data, self.validation_data_solution)
if validation_error_new < validation_error:
validation_error = validation_error_new
patience = 20
else:
patience -= 1
if patience == 0:
print("Abort Training. Overfitting has started! Epoch: {0}. Error: {1}".format(epoch_counter, validation_error_new))
return
print("Epoch: {0}, Validation Error: {1}".format(epoch_counter, validation_error))
self.save("Network_Mnist.net")
if __name__ == "__main__":
#If the first row is a one the first output neuron should be on the second off
x = np.array([ [0, 0, 1, 1, 0],
[0, 1, 1, 1, 1],
[1, 0, 1, 1, 1],
[1, 1, 1, 1, 0],
[0, 1, 1, 1, 0],
[1, 1, 0, 0, 0],
[1, 1, 0, 0, 0],
[1, 0, 1, 0, 0] ])
y = np.array([ [0, 1],
[0, 1],
[1, 0],
[1, 0],
[0, 1],
[1, 0],
[1, 0],
[1, 0] ])
#x = np.array([ [0, 0, 1, 1] ])
#y = np.array([[0]]).T
a = FeedForwardNetwork(input_dim=5, hidden_dim=200, output_dim=2, bias=False)
a.set_training_data(x, y)
start = time.time()
a.batch_train(epochs=2000, alpha=0.05, beta=0.0001, momentum=0.99, patience=20)
print(time.time()-start)
In relation with the derivatives...
If you are using the tanh activation function, i.e. the derivative is: y' = 1 - y^2. The tanh is commonly used because it is zero-centered.
If you are using the logistic equation, then the derivative is: y' = y(1+y). The softmax has a similar derivative.
The nice thing is that all these can be expressed as functions of themselves, so you need to have a look at the def _softmax(self, x, deriv=False) function, to define it in a similar way than def _tanh(self, x, deriv=False).

Categories