I am trying to implement a NN from scratch in Python. It has 2 layers: input layer –
output layer. The input layer will have 4 neurons and the output layer will have only a
single node (+biases). I have the following code but I get the error message: ValueError: shapes (4,2) and (4,1) not aligned: 2 (dim 1) != 4 (dim 0). Can someone help me?
import numpy as np
# Step 1: Define input and output data
X = np.array([[0, 0, 1, 1], [0, 1, 0, 1]])
y = np.array([[0, 1, 0, 1]])
# Step 2: Define the number of input neurons, hidden neurons, and output neurons
input_neurons = 4
output_neurons = 1
# Step 3: Define the weights and biases for the network
weights = np.random.rand(input_neurons, output_neurons)
biases = np.random.rand(output_neurons, 1)
# Step 4: Define the sigmoid activation function
def sigmoid(x):
return 1 / (1 + np.exp(-x))
# Step 5: Define the derivative of the sigmoid function
def sigmoid_derivative(x):
return sigmoid(x) * (1 - sigmoid(x))
# Step 6: Define the forward propagation function
def forward_propagation(X, weights, biases):
output = sigmoid(np.dot(X.T, weights) + biases)
return output
# Step 7: Define the backward propagation function
def backward_propagation(X, y, output, weights, biases):
error = output - y
derivative = sigmoid_derivative(output)
delta = error * derivative
weights_derivative = np.dot(X, delta.T)
biases_derivative = np.sum(delta, axis=1, keepdims=True)
return delta, weights_derivative, biases_derivative
# Step 8: Define the train function
def train(X, y, weights, biases, epochs, learning_rate):
for i in range(epochs):
output = forward_propagation(X, weights, biases)
delta, weights_derivative, biases_derivative = backward_propagation(X, y, output, weights, biases)
weights -= learning_rate * weights_derivative
biases -= learning_rate * biases_derivative
error = np.mean(np.abs(delta))
print("Epoch ", i, " error: ", error)
# Step 9: Train the network
epochs = 5000
learning_rate = 0.1
train(X, y, weights, biases, epochs, learning_rate)
You have an output layer with one neuron, so your output should be of one dimension.
You're assuming that the output has 4 dims:
y = np.array([[0, 1, 0, 1]])
Since you are giving two inputs (a pair of 4 dim inputs) like this,
X = np.array([[0, 0, 1, 1], [0, 1, 0, 1]])
You need also give two outputs (in one dim), for example like this:
y = np.array([[0],[1]])
Hope this helps.
Related
I am trying to fit a neuronal network code to make it use the sklearn breast cancer database. This is the code I have tryed:
from numpy import exp, array, random, dot
class NeuronLayer():
def __init__(self, number_of_neurons, number_of_inputs_per_neuron):
self.synaptic_weights = 2 * random.random((number_of_inputs_per_neuron, number_of_neurons)) - 1
class NeuralNetwork():
def __init__(self, layer1, layer2):
self.layer1 = layer1
self.layer2 = layer2
# The Sigmoid function, which describes an S shaped curve.
# We pass the weighted sum of the inputs through this function to
# normalise them between 0 and 1.
def __sigmoid(self, x):
return 1 / (1 + exp(-x))
# The derivative of the Sigmoid function.
# This is the gradient of the Sigmoid curve.
# It indicates how confident we are about the existing weight.
def __sigmoid_derivative(self, x):
return x * (1 - x)
# We train the neural network through a process of trial and error.
# Adjusting the synaptic weights each time.
def train(self, training_set_inputs, training_set_outputs, number_of_training_iterations):
for iteration in range(0, number_of_training_iterations):
# Pass the training set through our neural network
output_from_layer_1, output_from_layer_2 = self.think(training_set_inputs)
# Calculate the error for layer 2 (The difference between the desired output
# and the predicted output).
layer2_error = training_set_outputs - output_from_layer_2
layer2_delta = layer2_error * self.__sigmoid_derivative(output_from_layer_2)
# Calculate the error for layer 1 (By looking at the weights in layer 1,
# we can determine by how much layer 1 contributed to the error in layer 2).
layer1_error = layer2_delta.dot(self.layer2.synaptic_weights.T)
layer1_delta = layer1_error * self.__sigmoid_derivative(output_from_layer_1)
# Calculate how much to adjust the weights by
layer1_adjustment = training_set_inputs.T.dot(layer1_delta)
layer2_adjustment = output_from_layer_1.T.dot(layer2_delta)
# Adjust the weights.
self.layer1.synaptic_weights += layer1_adjustment
self.layer2.synaptic_weights += layer2_adjustment
# The neural network thinks.
def think(self, inputs):
output_from_layer1 = self.__sigmoid(dot(inputs, self.layer1.synaptic_weights))
output_from_layer2 = self.__sigmoid(dot(output_from_layer1, self.layer2.synaptic_weights))
return output_from_layer1, output_from_layer2
# The neural network prints its weights
def print_weights(self):
print (" Layer 1 (4 neurons, each with 3 inputs): ")
print (self.layer1.synaptic_weights)
print (" Layer 2 (1 neuron, with 4 inputs):")
print (self.layer2.synaptic_weights)
if __name__ == "__main__":
# Leer el dataset
db_breast_cancer = datasets.load_breast_cancer()
# Creamos los conjuntos de entrenamiento y test.
conjunto_de_datos = db_breast_cancer.data # Las características del conjunto de datos
target = db_breast_cancer.target # Los targets del conjunto de datos
tamanio_conjunto_test = 0.30 # Tamaño para el conjunto de test en %
numero_semilla = 7 # Semilla para generar aleatoriedad
rango_a_estudiar = range(0, 30) # Seleccionar un rango de ejecuciones del algoritmo
# Separar conjunto de datos en entrenamiento y test
training_set_inputs, \
test_set_inputs, \
training_set_outputs, \
test_set_outputs = \
model_selection.train_test_split(
conjunto_de_datos,
target,
test_size=tamanio_conjunto_test,
random_state=numero_semilla
)
#Seed the random number generator
random.seed(1)
# Create layer 1 (4 neurons, each with 30 inputs)
layer1 = NeuronLayer(4, 30)
# Create layer 2 (a single neuron with 4 inputs)
layer2 = NeuronLayer(1, 4)
# Combine the layers to create a neural network
neural_network = NeuralNetwork(layer1, layer2)
print ("Stage 1) Random starting synaptic weights: ")
neural_network.print_weights()
# The training set. We have 7 examples, each consisting of 3 input values
# and 1 output value.
#training_set_inputs = array([[0, 0, 1], [0, 1, 1], [1, 0, 1], [0, 1, 0], [1, 0, 0], [1, 1, 1], [0, 0, 0]])
#training_set_outputs = array([[0, 1, 1, 1, 1, 0, 0]]).T
# Train the neural network using the training set.
# Do it 60,000 times and make small adjustments each time.
neural_network.train(training_set_inputs, training_set_outputs.T, 60000)
print ("Stage 2) New synaptic weights after training: ")
neural_network.print_weights()
# Test the neural network with a new situation.
print ("Stage 3) Considering a new situation [1, 1, 0] -> ?: ")
hidden_state, output = neural_network.think(test_set_input[0])
print (output)
But I get next error:
ValueError: shapes (398,398) and (1,4) not aligned: 398 (dim 1) != 1
(dim 0).
More explecitly, the error is:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-9-335a2d34ac99> in <module>()
103 # Train the neural network using the training set.
104 # Do it 60,000 times and make small adjustments each time.
--> 105 neural_network.train(training_set_inputs, training_set_outputs.T, 60000)
106
107 print ("Stage 2) New synaptic weights after training: ")
<ipython-input-9-335a2d34ac99> in train(self, training_set_inputs, training_set_outputs, number_of_training_iterations)
38 # Calculate the error for layer 1 (By looking at the weights in layer 1,
39 # we can determine by how much layer 1 contributed to the error in layer 2).
---> 40 layer1_error = layer2_delta.dot(self.layer2.synaptic_weights.T)
41 layer1_delta = layer1_error * self.__sigmoid_derivative(output_from_layer_1)
42
ValueError: shapes (398,398) and (1,4) not aligned: 398 (dim 1) != 1 (dim 0)
Any idea how could I solve this?
Thank you.
I'm currently building my 3-4-1 neural network from scratch using numpy (I avoided using keras and tensorflow for the purpose of learning and trying to demonstrate my knowledge instead of using pre-built libraries to do all the work), the problems I find when I run the program are:
1/ getting "nan" values after a certain number of iterations in the "updated" weights, lowering the learning rate only delays the problem and doesn't solve it.
2/ the second problem is the very low predicting accuracy.
I would like to know what causes these bugs on my program and would appreciate any help.
here is the code:
# Import our dependencies
from numpy import exp, array, random, dot, ones_like, where
# Create our Artificial Neural Network class
class ArtificialNeuralNetwork():
# initializing the class
def __init__(self):
# generating the same synaptic weights every time the program runs
random.seed(1)
# synaptic weights (3 × 4 Matrix) of the hidden layer
self.w_ij = 2 * random.rand(3, 4) - 1
# synaptic weights (4 × 1 Matrix) of the output layer
self.w_jk = 2 * random.rand(4, 1) - 1
def LeakyReLU(self, x):
# The Leaky ReLU (short for Rectified Linear Unit) activation function will be applied to the inputs of the hidden layer
# The activation function will return the same value of x if x is positive
# while it will multiply the negative values of x by the alpha parameter
# we used in this example the Leaky ReLU instead of the standard ReLU activation function to avoid the dying ReLU problem
return where(x > 0, x, x * 0.01)
def LeakyReLUDerivative(self, x, α = 0.01):
# The Leaky ReLU Derivative will return 1 for every positive value in the x array
# while returning the value of the parameter alpha for every negative value
x[x > 0] = 1 # returns 1 for every positive value in the x array
x[x <= 0] = α # returns α for every negative value in the x array
return x
def Sigmoid(self, x):
# The Sigmoid activation function will turn every input value into probabilities between 0 and 1
# the probabilistic values help us assert which class x belongs to
return 1 / (1 + exp(-x))
def SigmoidDerivative(self, x):
# The derivative of the Sigmoid activation function will be used to calculate the gradient during the backpropagation process
# and help optimize the random starting synaptic weights
return x * (1 - x)
def train(self, x, y, learning_rate, iterations):
# x: training set of data
# y: the actual output of the training data
for i in range(iterations):
z_ij = dot(x, self.w_ij) # the dot product of the weights of the hidden layer and the inputs
a_ij = self.LeakyReLU(z_ij) # using the Leaky ReLU activation function to introduce non-linearity to our Neural Network
z_jk = dot(a_ij, self.w_jk) # the same precedent process will be applied to find the last input of the output layer
a_jk = self.Sigmoid(z_jk) # this time the Sigmoid activation function will be used instead of Leaky ReLU
dl_jk = -y/a_jk + (1 - y)/(1 - a_jk) # calculating the derivative of the cross entropy loss wrt output
da_jk = self.SigmoidDerivative(a_jk) # calculating the derivative of the Sigmoid activation function wrt the input (before activation) of the output layer
dz_jk = a_ij # calculating the derivative of the inputs of the hidden layer (before activation) wrt weights of the output layer
dl_ij = dot(da_jk * dl_jk, self.w_jk.T) # calculating the derivative of the cross entropy loss wrt activated input of the hidden layer
# to do so we multiply the derivative of the cross entropy loss wrt output by the derivative of the Sigmoid activation function wrt the input (before activation) of the output layer by the derivative of the inputs of the hidden layer (before activation) wrt weights of the output layer
da_ij = self.LeakyReLUDerivative(z_ij) # calculating the derivative of the Leaky ReLU activation function wrt the inputs of the hidden layer (before activation)
dz_ij = x # calculating the derivative of the inputs of the hidden layer (before activation) wrt weights of the hidden layer
# calculating the gradient using the chain rule
gradient_ij = dot(dz_ij.T , dl_ij * da_ij)
gradient_jk = dot(dz_jk.T , dl_jk * da_jk)
# calculating the new optimal weights
self.w_ij = self.w_ij - learning_rate * gradient_ij
self.w_jk = self.w_jk - learning_rate * gradient_jk
def predict(self, inputs):
# predicting the class of the input data after weights optimization
output_from_layer1 = self.LeakyReLU(dot(inputs, self.w_ij)) # the output of the hidden layer
output_from_layer2 = self.Sigmoid(dot(output_from_layer1, self.w_jk)) # the output of the output layer
return output_from_layer1, output_from_layer2
# the function will print the initial starting weights before training
def SynapticWeights(self):
print("Layer 1 (4 neurons, each with 3 inputs): ")
print("w_ij: ", self.w_ij)
print("Layer 2 (1 neuron, with 4 inputs): ")
print("w_jk: ", self.w_jk)
def main():
ANN = ArtificialNeuralNetwork()
ANN.SynapticWeights()
# the training inputs
x = array([[0, 0, 1], [0, 1, 1], [1, 0, 1], [0, 1, 0], [1, 0, 0], [1, 1, 1], [0, 0, 0]])
# the training outputs
y = array([[0, 1, 1, 1, 1, 0, 0]]).T
ANN.train(x, y, 1, 10000)
# Printing the new synaptic weights after training
print("New synaptic weights after training: ")
print("w_ij: ", ANN.w_ij)
print("w_jk: ", ANN.w_jk)
# Our prediction after feeding the ANN with new set of data
print("Considering new situation [1, 1, 0] -> ?: ")
print(ANN.predict(array([[1, 1, 0]])))
if __name__=="__main__":
main()
So, I changed a few things. (Disclaimer: I didn't check the correctness of the code)
Weight initialization: initialize to much smaller weights.
# synaptic weights (3 × 4 Matrix) of the hidden layer
self.w_ij = (2 * random.rand(3, 4) - 1)*0.1
# synaptic weights (4 × 1 Matrix) of the output layer
self.w_jk = (2 * random.rand(4, 1) - 1)*0.1
Weight initialization really matter.
I reduced the learning rate to 0.1.
ANN.train(x, y, .1, 500000)
I see the neural network perfectly fitting your data and not giving Nan even after 500,000 iterations.
print(ANN.predict(array([[0, 0, 1],
[0, 1, 1],
[1, 0, 1],
[0, 1, 0],
[1, 0, 0],
[1, 1, 1],
[0, 0, 0]])))
I have implemented a simple Neural Net with just a single sigmoid hidden layer, with the choice of a sigmoid or softmax output layer and squared error or cross entropy loss function, respectively. After much research on the softmax activation function, the cross entropy loss, and their derivatives (and with following this blog) I believe that my implementation seems correct.
When attempting to learn the simple XOR function, the NN with the sigmoid output learns to a very small loss very quickly when using single binary outputs of 0 and 1. However, when changing the labels to one-hot encodings of [1, 0] = 0 and [0, 1] = 1, the softmax implementation does not work. The loss consistently increases as the network's outputs converge to exactly [0, 1] for the two outputs on every input, yet the labels of the data set is perfectly balanced between [0, 1] and [1, 0].
My code is below, where the choice of using sigmoid or softmax at the output layer can be chosen by uncommenting the necessary two lines near the bottom of the code. I cannot figure out why the softmax implementation is not working.
import numpy as np
class MLP:
def __init__(self, numInputs, numHidden, numOutputs, activation):
self.numInputs = numInputs
self.numHidden = numHidden
self.numOutputs = numOutputs
self.activation = activation.upper()
self.IH_weights = np.random.rand(numInputs, numHidden) # Input -> Hidden
self.HO_weights = np.random.rand(numHidden, numOutputs) # Hidden -> Output
self.IH_bias = np.zeros((1, numHidden))
self.HO_bias = np.zeros((1, numOutputs))
# Gradients corresponding to weight matrices computed during backprop
self.IH_w_gradients = np.zeros_like(self.IH_weights)
self.HO_w_gradients = np.zeros_like(self.HO_weights)
# Gradients corresponding to biases computed during backprop
self.IH_b_gradients = np.zeros_like(self.IH_bias)
self.HO_b_gradients = np.zeros_like(self.HO_bias)
# Input, hidden and output layer neuron values
self.I = np.zeros(numInputs) # Inputs
self.L = np.zeros(numOutputs) # Labels
self.H = np.zeros(numHidden) # Hidden
self.O = np.zeros(numOutputs) # Output
# ##########################################################################
# ACIVATION FUNCTIONS
# ##########################################################################
def sigmoid(self, x, derivative=False):
if derivative:
return x * (1 - x)
return 1 / (1 + np.exp(-x))
def softmax(self, prediction, label=None, derivative=False):
if derivative:
return prediction - label
return np.exp(prediction) / np.sum(np.exp(prediction))
# ##########################################################################
# LOSS FUNCTIONS
# ##########################################################################
def squaredError(self, prediction, label, derivative=False):
if derivative:
return (-2 * prediction) + (2 * label)
return (prediction - label) ** 2
def crossEntropy(self, prediction, label, derivative=False):
if derivative:
return [-(y / x) for x, y in zip(prediction, label)] # NOT NEEDED ###############################
return - np.sum([y * np.log(x) for x, y in zip(prediction, label)])
# ##########################################################################
def forward(self, inputs):
self.I = np.array(inputs).reshape(1, self.numInputs) # [numInputs, ] -> [1, numInputs]
self.H = self.I.dot(self.IH_weights) + self.IH_bias
self.H = self.sigmoid(self.H)
self.O = self.H.dot(self.HO_weights) + self.HO_bias
if self.activation == 'SIGMOID':
self.O = self.sigmoid(self.O)
elif self.activation == 'SOFTMAX':
self.O = self.softmax(self.O) + 1e-10 # allows for log(0)
return self.O
def backward(self, labels):
self.L = np.array(labels).reshape(1, self.numOutputs) # [numOutputs, ] -> [1, numOutputs]
if self.activation == 'SIGMOID':
self.O_error = self.squaredError(self.O, self.L)
self.O_delta = self.squaredError(self.O, self.L, derivative=True) * self.sigmoid(self.O, derivative=True)
elif self.activation == 'SOFTMAX':
self.O_error = self.crossEntropy(self.O, self.L)
self.O_delta = self.softmax(self.O, self.L, derivative=True)
self.H_error = self.O_delta.dot(self.HO_weights.T)
self.H_delta = self.H_error * self.sigmoid(self.H, derivative=True)
self.IH_w_gradients += self.I.T.dot(self.H_delta)
self.HO_w_gradients += self.H.T.dot(self.O_delta)
self.IH_b_gradients += self.H_delta
self.HO_b_gradients += self.O_delta
return self.O_error
def updateWeights(self, learningRate):
self.IH_weights += learningRate * self.IH_w_gradients
self.HO_weights += learningRate * self.HO_w_gradients
self.IH_bias += learningRate * self.IH_b_gradients
self.HO_bias += learningRate * self.HO_b_gradients
self.IH_w_gradients = np.zeros_like(self.IH_weights)
self.HO_w_gradients = np.zeros_like(self.HO_weights)
self.IH_b_gradients = np.zeros_like(self.IH_bias)
self.HO_b_gradients = np.zeros_like(self.HO_bias)
sigmoidData = [
[[0, 0], 0],
[[0, 1], 1],
[[1, 0], 1],
[[1, 1], 0]
]
softmaxData = [
[[0, 0], [1, 0]],
[[0, 1], [0, 1]],
[[1, 0], [0, 1]],
[[1, 1], [1, 0]]
]
sigmoidMLP = MLP(2, 10, 1, 'SIGMOID')
softmaxMLP = MLP(2, 10, 2, 'SOFTMAX')
# SIGMOID #######################
# data = sigmoidData
# mlp = sigmoidMLP
# ###############################
# SOFTMAX #######################
data = softmaxData
mlp = softmaxMLP
# ###############################
numEpochs = 5000
for epoch in range(numEpochs):
losses = []
for i in range(len(data)):
print(mlp.forward(data[i][0])) # Print outputs
# mlp.forward(data[i][0]) # Don't print outputs
loss = mlp.backward(data[i][1])
losses.append(loss)
mlp.updateWeights(0.001)
# if epoch % 1000 == 0 or epoch == numEpochs - 1: # Print loss every 1000 epochs
print(np.mean(losses)) # Print loss every epoch
Contrary to all the information online, simply changing the derivative of the softmax cross entropy from prediction - label to label - prediction solved the problem. Perhaps I have something else backwards somewhere since every source I have come across has it as prediction - label.
I've implemented the following neural network to solve the XOR problem in Python. My neural network consists of an input layer of 2 neurons, 1 hidden layer of 2 neurons and an output layer of 1 neuron. I am using the Sigmoid function as the activation function for both the hidden layer and output layer. Can someone please explain what I have done wrong.
import numpy
import scipy.special
class NeuralNetwork:
def __init__(self, inputNodes, hiddenNodes, outputNodes, learningRate):
self.iNodes = inputNodes
self.hNodes = hiddenNodes
self.oNodes = outputNodes
self.wIH = numpy.random.normal(0.0, pow(self.iNodes, -0.5), (self.hNodes, self.iNodes))
self.wOH = numpy.random.normal(0.0, pow(self.hNodes, -0.5), (self.oNodes, self.hNodes))
self.lr = learningRate
self.activationFunction = lambda x: scipy.special.expit(x)
def train(self, inputList, targetList):
inputs = numpy.array(inputList, ndmin=2).T
targets = numpy.array(targetList, ndmin=2).T
#print(inputs, targets)
hiddenInputs = numpy.dot(self.wIH, inputs)
hiddenOutputs = self.activationFunction(hiddenInputs)
finalInputs = numpy.dot(self.wOH, hiddenOutputs)
finalOutputs = self.activationFunction(finalInputs)
outputErrors = targets - finalOutputs
hiddenErrors = numpy.dot(self.wOH.T, outputErrors)
self.wOH += self.lr * numpy.dot((outputErrors * finalOutputs * (1.0 - finalOutputs)), numpy.transpose(hiddenOutputs))
self.wIH += self.lr * numpy.dot((hiddenErrors * hiddenOutputs * (1.0 - hiddenOutputs)), numpy.transpose(inputs))
def query(self, inputList):
inputs = numpy.array(inputList, ndmin=2).T
hiddenInputs = numpy.dot(self.wIH, inputs)
hiddenOutputs = self.activationFunction(hiddenInputs)
finalInputs = numpy.dot(self.wOH, hiddenOutputs)
finalOutputs = self.activationFunction(finalInputs)
return finalOutputs
nn = NeuralNetwork(2, 2, 1, 0.01)
data = [[0, 0, 0], [0, 1, 1], [1, 0, 1], [1, 1, 0]]
epochs = 10
for e in range(epochs):
for record in data:
inputs = numpy.asfarray(record[1:])
targets = record[0]
#print(targets)
#print(inputs, targets)
nn.train(inputs, targets)
print(nn.query([0, 0]))
print(nn.query([1, 0]))
print(nn.query([0, 1]))
print(nn.query([1, 1]))
Several reasons.
I don't think you should be taking the activation function of everything, especially in your query function. I think you have muddled up the ideas of neuron to neuron weightings (wIH and wOH) with the activation values.
Because of your muddle you have missed the idea of re-using your query function as part of your training. You should think of it as feed forward activation levels to the output, compare the result with the target output to give an array of errors which are then fed backwards using the derivative of the sigmoid function to adjust the weightings.
I would put the function and it's derivative in rather than importing from scipy as they are so simple. Also "it's recommended" to use tanh and d/dx.tanh for the hidden layer functions (can't remember why, probably not needed for this simple net)
# transfer functions
def sigmoid(x):
return 1 / (1 + np.exp(-x))
# derivative of sigmoid
def dsigmoid(y):
return y * (1.0 - y)
# using tanh over logistic sigmoid for the hidden layer is recommended
def tanh(x):
return np.tanh(x)
# derivative for tanh sigmoid
def dtanh(y):
return 1 - y*y
Finally, you might be able to figure out what I did a while ago with a neural net using just numpy here https://github.com/paddywwoof/Machine-Learning/blob/master/perceptron.py
I implemented bias units for my neural network with gradient descent. But I'm not 100% sure If I've implemented it the right way. Would be glade if you can quickly look through my code. Only the parts with
if bias:
are important.
And my second question:
Shouldn't the derivate of the softmax function be 1-x, because x is the output of the softmax function?
I tried my net with 1-x but its performance was worse.
Every help is appreciated.
Thanks in advance.
import numpy as np
import pickle
import time
import math
class FeedForwardNetwork():
def __init__(self, input_dim, hidden_dim, output_dim, dropout=False, dropout_prop=0.5, bias=False):
np.random.seed(1)
self.input_layer = np.array([])
self.hidden_layer = np.array([])
self.output_layer = np.array([])
self.hidden_dim = hidden_dim
self.dropout = dropout
self.dropout_prop = dropout_prop
self.bias = bias
r_input_hidden = math.sqrt(6 / (input_dim + hidden_dim))
r_hidden_output = math.sqrt(6 / (hidden_dim + output_dim))
#self.weights_input_hidden = np.random.uniform(low=-r_input_hidden, high=r_input_hidden, size=(input_dim, hidden_dim))
#self.weights_hidden_output = np.random.uniform(low=-r_hidden_output, high=r_hidden_output, size=(hidden_dim, output_dim))
self.weights_input_hidden = np.random.uniform(low=-0.01, high=0.01, size=(input_dim, hidden_dim))
self.weights_hidden_output = np.random.uniform(low=-0.01, high=0.01, size=(hidden_dim, output_dim))
self.validation_data = np.array([])
self.validation_data_solution = np.array([])
self.velocities_input_hidden = np.zeros(self.weights_input_hidden.shape)
self.velocities_hidden_output = np.zeros(self.weights_hidden_output.shape)
if bias:
self.weights_bias_hidden = np.random.uniform(low=-0.01, high=0.01, size=((1, hidden_dim)))
self.weights_bias_output = np.random.uniform(low=-0.01, high=0.01, size=((1, output_dim)))
self.velocities_bias_hidden = np.zeros(self.weights_bias_hidden.shape)
self.velocities_bias_output = np.zeros(self.weights_bias_output.shape)
def _tanh(self, x, deriv=False):
#The derivate is: 1-np.tanh(x)**2; Because x is already the output of tanh(x) 1-x*x is the correct derivate.
if not deriv:
return np.tanh(x)
return 1-x*x
def _softmax(self, x, deriv=False):
if not deriv:
return np.exp(x) / np.sum(np.exp(x), axis=0)
return 1 - np.exp(x) / np.sum(np.exp(x), axis=0)
def set_training_data(self, training_data_input, training_data_target, validation_data_input=None, validation_data_target=None):
"""Splits the data up into training and validation data with a ratio of 0.85/0.15 if no validation data is given.
Sets the data for training."""
if len(training_data_input) != len(training_data_target):
raise ValueError(
'Number of training examples and'
' training targets does not match!'
)
if (validation_data_input is None) and (validation_data_target is None):
len_training_data = int((len(training_data_input)/100*85//1))
self.input_layer = training_data_input[:len_training_data]
self.output_layer = training_data_target[:len_training_data]
self.validation_data = training_data_input[len_training_data:]
self.validation_data_solution = training_data_target[len_training_data:]
else:
self.input_layer = training_data_input
self.output_layer = training_data_target
self.validation_data = validation_data_input
self.validation_data_solution = validation_data_target
def save(self, filename):
"""Saves the weights into a pickle file."""
with open(filename, "wb") as network_file:
pickle.dump(self.weights_input_hidden, network_file)
pickle.dump(self.weights_hidden_output, network_file)
def load(self, filename):
"""Loads network weights from a pickle file."""
with open(filename, "rb") as network_file:
weights_input_hidden = pickle.load(network_file)
weights_hidden_output = pickle.load(network_file)
if (
len(weights_input_hidden) != len(self.weights_input_hidden)
or len(weights_hidden_output) != len(self.weights_hidden_output)
):
raise ValueError(
'File contains weights that does not'
' match the current networks size!'
)
self.weights_input_hidden = weights_input_hidden
self.weights_hidden_output = weights_hidden_output
def measure_error(self, input_data, output_data):
return 1/2 * np.sum((output_data - self.forward_propagate(input_data))**2)
#return np.sum(np.nan_to_num(-output_data*np.log(self.forward_propagate(input_data))-(1-output_data)*np.log(1-self.forward_propagate(input_data))))
def forward_propagate(self, input_data, dropout=False):
"""Proceds the input data from input neurons up to output neurons and returns the output layer.
If dropout is True some of the neurons are randomly turned off."""
input_layer = input_data
self.hidden_layer = self._tanh(np.dot(input_layer, self.weights_input_hidden))
if self.bias:
self.hidden_layer += self.weights_bias_hidden
if dropout:
self.hidden_layer *= np.random.binomial([np.ones((len(input_data),self.hidden_dim))],1-self.dropout_prop)[0] * (1.0/(1-self.dropout_prop))
if self.bias:
return self._softmax((np.dot(self.hidden_layer, self.weights_hidden_output) + self.weights_bias_output).T).T
else:
return self._softmax(np.dot(self.hidden_layer, self.weights_hidden_output).T).T
#return self._softmax(output_layer.T).T
def back_propagate(self, input_data, output_data, alpha, beta, momentum):
"""Calculates the difference between target output and output and adjusts the weights to fit the target output better.
The parameter alpha is the learning rate.
Beta is the parameter for weight decay which penaltizes large weights."""
sample_count = len(input_data)
output_layer = self.forward_propagate(input_data, dropout=self.dropout)
output_layer_error = output_layer - output_data
output_layer_delta = output_layer_error * self._softmax(output_layer, deriv=True)
print("Error: ", np.mean(np.abs(output_layer_error)))
#How much did each hidden neuron contribute to the output error?
#Multiplys delta term with weights
hidden_layer_error = output_layer_delta.dot(self.weights_hidden_output.T)
#If the prediction is good, the second term will be small and the change will be small
#Ex: target: 1 -> Slope will be 1 so the second term will be big
hidden_layer_delta = hidden_layer_error * self._tanh(self.hidden_layer, deriv=True)
#The both lines return a matrix. A row stands for all weights connected to one neuron.
#E.g. [1, 2, 3] -> Weights to Neuron A
# [4, 5, 6] -> Weights to Neuron B
hidden_weights_gradient = input_data.T.dot(hidden_layer_delta)/sample_count
output_weights_gradient = self.hidden_layer.T.dot(output_layer_delta)/sample_count
velocities_input_hidden = self.velocities_input_hidden
velocities_hidden_output = self.velocities_hidden_output
self.velocities_input_hidden = velocities_input_hidden * momentum - alpha * hidden_weights_gradient
self.velocities_hidden_output = velocities_hidden_output * momentum - alpha * output_weights_gradient
#Includes momentum term and weight decay; The weight decay parameter is beta
#Weight decay penalizes large weights to prevent overfitting
self.weights_input_hidden += -velocities_input_hidden * momentum + (1 + momentum) * self.velocities_input_hidden
- alpha * beta * self.weights_input_hidden / sample_count
self.weights_hidden_output += -velocities_hidden_output * momentum + (1 + momentum) * self.velocities_hidden_output
- alpha * beta * self.weights_hidden_output / sample_count
if self.bias:
velocities_bias_hidden = self.velocities_bias_hidden
velocities_bias_output = self.velocities_bias_output
hidden_layer_delta = np.sum(hidden_layer_delta, axis=0)
output_layer_delta = np.sum(output_layer_delta, axis=0)
self.velocities_bias_hidden = velocities_bias_hidden * momentum - alpha * hidden_layer_delta
self.velocities_bias_output = velocities_bias_output * momentum - alpha * output_layer_delta
self.weights_bias_hidden += -velocities_bias_hidden * momentum + (1 + momentum) * self.velocities_bias_hidden
- alpha * beta * self.weights_bias_hidden / sample_count
self.weights_bias_output += -velocities_bias_output * momentum + (1 + momentum) * self.velocities_bias_output
- alpha * beta * self.weights_bias_output / sample_count
def batch_train(self, epochs, alpha, beta, momentum, patience=10):
"""Trains the network in batch mode that means the weights are updated after showing all training examples.
alpha is the learning rate and patience is the number of epochs that the validation error is allowed to increase before aborting.
Beta is the parameter for weight decay which penaltizes large weights."""
#The weight decay parameter is beta
validation_error = self.measure_error(self.validation_data, self.validation_data_solution)
for epoch in range(epochs):
self.back_propagate(self.input_layer, self.output_layer, alpha, beta, momentum)
validation_error_new = self.measure_error(self.validation_data, self.validation_data_solution)
if validation_error_new < validation_error:
validation_error = validation_error_new
else:
patience -= 1
if patience == 0:
print("Abort Training. Overfitting has started! Epoch: {0}. Error: {1}".format(epoch, validation_error_new))
return
print("Epoch: {0}, Validation Error: {1}".format(epoch, validation_error))
self.save("Network_Mnist.net")
def mini_batch_train(self, batch_size, epochs, alpha, beta, momentum, patience=10):
"""Trains the network in mini batch mode, that means the weights are updated after showing only a bunch of training examples.
alpha is the learning rate and patience is the number of epochs that the validation error is allowed to increase before aborting."""
validation_error = self.measure_error(self.validation_data, self.validation_data_solution)
sample_count = len(self.input_layer)
epoch_counter = 0
for epoch in range(0, epochs*batch_size, batch_size):
epoch_counter += 1
self.back_propagate(self.input_layer[epoch%sample_count:(epoch%sample_count)+batch_size],
self.output_layer[epoch%sample_count:(epoch%sample_count)+batch_size], alpha, beta, momentum)
validation_error_new = self.measure_error(self.validation_data, self.validation_data_solution)
if validation_error_new < validation_error:
validation_error = validation_error_new
patience = 20
else:
patience -= 1
if patience == 0:
print("Abort Training. Overfitting has started! Epoch: {0}. Error: {1}".format(epoch_counter, validation_error_new))
return
print("Epoch: {0}, Validation Error: {1}".format(epoch_counter, validation_error))
self.save("Network_Mnist.net")
if __name__ == "__main__":
#If the first row is a one the first output neuron should be on the second off
x = np.array([ [0, 0, 1, 1, 0],
[0, 1, 1, 1, 1],
[1, 0, 1, 1, 1],
[1, 1, 1, 1, 0],
[0, 1, 1, 1, 0],
[1, 1, 0, 0, 0],
[1, 1, 0, 0, 0],
[1, 0, 1, 0, 0] ])
y = np.array([ [0, 1],
[0, 1],
[1, 0],
[1, 0],
[0, 1],
[1, 0],
[1, 0],
[1, 0] ])
#x = np.array([ [0, 0, 1, 1] ])
#y = np.array([[0]]).T
a = FeedForwardNetwork(input_dim=5, hidden_dim=200, output_dim=2, bias=False)
a.set_training_data(x, y)
start = time.time()
a.batch_train(epochs=2000, alpha=0.05, beta=0.0001, momentum=0.99, patience=20)
print(time.time()-start)
In relation with the derivatives...
If you are using the tanh activation function, i.e. the derivative is: y' = 1 - y^2. The tanh is commonly used because it is zero-centered.
If you are using the logistic equation, then the derivative is: y' = y(1+y). The softmax has a similar derivative.
The nice thing is that all these can be expressed as functions of themselves, so you need to have a look at the def _softmax(self, x, deriv=False) function, to define it in a similar way than def _tanh(self, x, deriv=False).