I have this implementation of a feed forward neural network with stochastic gradient descent in python. When training a NN instance with the xor gate, it trains just fine. But when I train the instance with stock change data, it throws this error:
Traceback (most recent call last):
File "F:\predict\test.py", line 21, in <module>
stock.train(stockData, 30, 10, 3.0)
File "F:\predict\network.py", line 46, in train
self.updateMiniBatch(miniBatch, eta)
File "F:\predict\network.py", line 62, in updateMiniBatch
delta_nabla_b, delta_nabla_w = self.backPropagate(x, y)
File "F:\predict\network.py", line 102, in backPropagate
nabla_w[-l] = numpy.dot(delta, activations[-l - 1].transpose())
ValueError: objects are not aligned
The XOR Gate data, and stock data looks as such (Mind you the stock data has been truncated greatly):
xorData = [
([0, 0], [0]),
([0, 1], [1]),
([1, 0], [1]),
([1, 1], [0])
]
stockData = [
([0.0, 0.0, 0.003927144694969353], [-0.0038780602954071597]),
([0.0, 0.003927144694969353, -0.0038780602954071597], [0.0012018010088359343]),
([0.003927144694969353, -0.0038780602954071597, 0.0012018010088359343], [-0.0115302033846727])
]
I then create a network with 2 inputs, 2 hidden, and 1 output for the XOR Gate. And 3 inputs, 15 hidden, and 1 output for the Stock data.
xor = network.NN([2, 2, 1])
xor.train(xorData, 30, 10, 3.0)
stock = network.NN([3, 15, 1])
stock.train(stockData, 30, 10, 3.0)
Both of the training sets have the exact same structure, so why would this error occur?
network.py:
import numpy
import random
import json
# the sigmoid function
def sigmoid(z):
return 1.0 / (1.0 + numpy.exp(-z))
# the derivative of the sigmoid function
def sigmoidPrime(z):
return sigmoid(z) * (1 - sigmoid(z))
# sigmoid vectors
sigmoidVector = numpy.vectorize(sigmoid)
sigmoidPrimeVector = numpy.vectorize(sigmoidPrime)
#A class that implements stochastic gradient descent learning algorithm for a feedforward neural network
class NN:
def __init__(self, sizes):
self.numLayers = len(sizes)
self.sizes = sizes
# the biases and weights for the network are initialized randomly, using a Guassian distribution with mean 0, and variance 1
self.biases = [numpy.random.randn(y, 1) for y in sizes[1:]]
self.weights = [numpy.random.randn(y, x) for x, y in zip(sizes[:-1], sizes[1:])]
# feedForward function - return the output of the network
def feedForward(self, inputs):
for b, w in zip(self.biases, self.weights):
inputs = sigmoidVector(numpy.dot(w, inputs) + b)
return inputs
# train function - train the neural network using mini-batch stochastic gradient descent
# the trainingData is a list of tuples "(x, y)" representing the training inputs and the desired outputs
# if testData is provided then the network will be evaluated against the test data after each epoch
def train(self, trainingData, epochs, miniBatchSize, eta, testData = None):
if testData:
nTest = len(testData)
n = len(trainingData)
for j in xrange(epochs):
random.shuffle(trainingData)
miniBatches = [trainingData[k:k + miniBatchSize] for k in xrange(0, n, miniBatchSize)]
for miniBatch in miniBatches:
self.updateMiniBatch(miniBatch, eta)
if testData:
print "Epoch %i: %i / %i" % (j, self.evaluate(testData), nTest)
else:
print "Epoch %i complete" % j
# updateMiniBatch function - update the network's weights and biases by applying gradient descent using backpropagation
# to a single mini batch
# the miniBatch is a list of tuples "(x, y)" and eta is the learning rate
def updateMiniBatch(self, miniBatch, eta):
nabla_b = [numpy.zeros(b.shape) for b in self.biases]
nabla_w = [numpy.zeros(w.shape) for w in self.weights]
for x, y in miniBatch:
delta_nabla_b, delta_nabla_w = self.backPropagate(x, y)
nabla_b = [nb + dnb for nb, dnb in zip(nabla_b, delta_nabla_b)]
nabla_w = [nw + dnw for nw, dnw in zip(nabla_w, delta_nabla_w)]
self.weights = [w - (eta / len(miniBatch)) * nw for w, nw in zip(self.weights, nabla_w)]
self.biases = [b - (eta / len(miniBatch)) * nb for b, nb in zip(self.biases, nabla_b)]
# backPropagate function - returns a tuple "(nabla_b, nabla_w)" representing the gradient for the cost function C_x
# nabla_b and nabla_w are layer-by-layer lists of numpy arrays, similar to self.biases and self.weights
def backPropagate(self, x, y):
nabla_b = [numpy.zeros(b.shape) for b in self.biases]
nabla_w = [numpy.zeros(w.shape) for w in self.weights]
x = numpy.array(x)
y = numpy.array(y)
# feedForward
activation = x
activations = [x] # list to store all of the activations, layer by layer
zs = [] # list to store all of the z vectors, layer by layer
for b, w in zip(self.biases, self.weights):
z = numpy.dot(w, activation) + b
zs.append(z)
activation = sigmoidVector(z)
activations.append(activation)
# backward pass
delta = self.costDerivative(activations[-1], y) * sigmoidPrimeVector(zs[-1])
nabla_b[-1] = delta
nabla_w[-1] = numpy.dot(delta, activations[-2].transpose())
for l in xrange(2, self.numLayers):
spv = sigmoidPrimeVector(zs[-l])
delta = numpy.dot(self.weights[-l + 1].transpose(), delta) * spv
nabla_b[-l] = delta
nabla_w[-l] = numpy.dot(delta, activations[-l - 1].transpose())
return (nabla_b, nabla_w)
# evaluate function - return the number of test inputs for which the neural network outputs the correct result
def evaluate(self, testData):
testResults = [(numpy.argmax(self.feedForward(x)), y) for (x, y) in testData]
return sum(int(x == y) for (x, y) in testResults)
# costDerivative function - return the vector of partial derivatives for the output activations
def costDerivative(self, outputActivations, y):
return (outputActivations - y)
# save function - save the neural network to filename
def save(self, filename):
data = {
"sizes": self.sizes,
"weights": [w.tolist() for w in self.weights],
"biases": [b.tolist() for b in self.biases]
}
with open(filename, "w") as handle:
json.dump(data, handle)
# load function - load a neural network from the file filename
# returns a network instance
def load(filename):
with open(filename, "r") as handle:
data = json.load(handle)
network = NN(data["sizes"])
network.weights = [numpy.array(w) for w in data["weights"]]
network.biases = [numpy.array(b) for b in data["biases"]]
return network
EDIT:
I believe it might have something to do with the hidden layer value. I changed the XOR Gate hidden layer number and it threw the same error. It seems as if the hidden layer value must be the exact same as the amount for the input layer.
Related
I could not understand well especially how gradients were computed with regards to matrix transposes. My question is for DW2 but if you want also to discuss about the computation of the other gradients and extend my question I am open to discussion. Mathematically things seem a little bit different but this code is reliable and on github so I trust this code.
from __future__ import print_function
from builtins import range
from builtins import object
import numpy as np
import matplotlib.pyplot as plt
from past.builtins import xrange
class TwoLayerNet(object):
"""
A two-layer fully-connected neural network. The net has an input dimension of
D* (correction), a hidden layer dimension of H, and performs classification over C classes.
We train the network with a softmax loss function and L2 regularization on the
weight matrices. The network uses a ReLU nonlinearity after the first fully
connected layer.
In other words, the network has the following architecture:
input - fully connected layer - ReLU - fully connected layer - softmax
The outputs of the second fully-connected layer are the scores for each class.
"""
def __init__(self, input_size, hidden_size, output_size, std=1e-4):
"""
Initialize the model. Weights are initialized to small random values and
biases are initialized to zero. Weights and biases are stored in the
variable self.params, which is a dictionary with the following keys:
W1: First layer weights; has shape (D, H)
b1: First layer biases; has shape (H,)
W2: Second layer weights; has shape (H, C)
b2: Second layer biases; has shape (C,)
Inputs:
- input_size: The dimension D of the input data.
- hidden_size: The number of neurons H in the hidden layer.
- output_size: The number of classes C.
"""
self.params = {}
self.params['W1'] = std * np.random.randn(input_size, hidden_size)
self.params['b1'] = np.zeros(hidden_size)
self.params['W2'] = std * np.random.randn(hidden_size, output_size)
self.params['b2'] = np.zeros(output_size)
def loss(self, X, y=None, reg=0.0):
"""
Compute the loss and gradients for a two layer fully connected neural
network.
Inputs:
- X: Input data of shape (N, D). Each X[i] is a training sample.
- y: Vector of training labels. y[i] is the label for X[i], and each y[i] is
an integer in the range 0 <= y[i] < C. This parameter is optional; if it
is not passed then we only return scores, and if it is passed then we
instead return the loss and gradients.
- reg: Regularization strength.
Returns:
If y is None, return a matrix scores of shape (N, C) where scores[i, c] is
the score for class c on input X[i].
If y is not None, instead return a tuple of:
- loss: Loss (data loss and regularization loss) for this batch of training
samples.
- grads: Dictionary mapping parameter names to gradients of those parameters
with respect to the loss function; has the same keys as self.params.
"""
# Unpack variables from the params dictionary
W1, b1 = self.params['W1'], self.params['b1']
W2, b2 = self.params['W2'], self.params['b2']
N, D = X.shape
# Compute the forward pass
scores = None
#############################################################################
# TODO: Perform the forward pass, computing the class scores for the input. #
# Store the result in the scores variable, which should be an array of #
# shape (N, C). #
#############################################################################
# *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
# perform the forward pass and compute the class scores for the input
# input - fully connected layer - ReLU - fully connected layer - softmax
# define lamba function for relu
relu = lambda x: np.maximum(0, x)
# a1 = X x W1 = (N x D) x (D x H) = N x H
a1 = relu(X.dot(W1) + b1) # activations of fully connected layer #1
# store the result in the scores variable, which should be an array of
# shape (N, C).
# scores = a1 x W2 = (N x H) x (H x C) = N x C
scores = a1.dot(W2) + b2 # output of softmax
# *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
# If the targets are not given then jump out, we're done
if y is None:
return scores
# Compute the loss
loss = None
#############################################################################
# TODO: Finish the forward pass, and compute the loss. This should include #
# both the data loss and L2 regularization for W1 and W2. Store the result #
# in the variable loss, which should be a scalar. Use the Softmax #
# classifier loss. #
#############################################################################
# *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
# shift values for 'scores' for numeric reasons (over-flow cautious)
# figure out the max score across all classes
# scores.shape is N x C
scores -= scores.max(axis = 1, keepdims = True)
# probs.shape is N x C
probs = np.exp(scores)/np.sum(np.exp(scores), axis = 1, keepdims = True)
loss = -np.log(probs[np.arange(N), y])
# loss is a single number
loss = np.sum(loss)
# Right now the loss is a sum over all training examples, but we want it
# to be an average instead so we divide by N.
loss /= N
# Add regularization to the loss.
loss += reg * (np.sum(W1 * W1) + np.sum(W2 * W2))
# *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
# Backward pass: compute gradients
grads = {}
#############################################################################
# TODO: Compute the backward pass, computing the derivatives of the weights #
# and biases. Store the results in the grads dictionary. For example, #
# grads['W1'] should store the gradient on W1, and be a matrix of same size #
#############################################################################
# *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
# since dL(i)/df(k) = p(k) - 1 (if k = y[i]), where f is a vector of scores for the given example
# i is the training sample and k is the class
dscores = probs.reshape(N, -1) # dscores is (N x C)
dscores[np.arange(N), y] -= 1
# since scores = a1.dot(W2), we get dW2 by multiplying a1.T and dscores
# W2 is H x C so dW2 should also match those dimensions
# a1.T x dscores = (H x N) x (N x C) = H x C
dW2 = np.dot(a1.T, dscores)
# Right now the gradient is a sum over all training examples, but we want it
# to be an average instead so we divide by N.
dW2 /= N
# b2 gradient: sum dscores over all N and C
db2 = dscores.sum(axis = 0)/N
# since a1 = X.dot(W1), we get dW1 by multiplying X.T and da1
# W1 is D x H so dW1 should also match those dimensions
# X.T x da1 = (D x N) x (N x H) = D x H
# first get da1 using scores = a1.dot(W2)
# a1 is N x H so da1 should also match those dimensions
# dscores x W2.T = (N x C) x (C x H) = N x H
da1 = dscores.dot(W2.T)
da1[a1 == 0] = 0 # set gradient of units that did not activate to 0
dW1 = X.T.dot(da1)
# Right now the gradient is a sum over all training examples, but we want it
# to be an average instead so we divide by N.
dW1 /= N
# b1 gradient: sum da1 over all N and H
db1 = da1.sum(axis = 0)/N
# Add regularization loss to the gradient
dW1 += 2 * reg * W1
dW2 += 2 * reg * W2
grads = {'W1': dW1, 'b1': db1, 'W2': dW2, 'b2': db2}
# *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
return loss, grads
def train(self, X, y, X_val, y_val,
learning_rate=1e-3, learning_rate_decay=0.95,
reg=5e-6, num_iters=100,
batch_size=200, verbose=False):
"""
Train this neural network using stochastic gradient descent.
Inputs:
- X: A numpy array of shape (N, D) giving training data.
- y: A numpy array f shape (N,) giving training labels; y[i] = c means that
X[i] has label c, where 0 <= c < C.
- X_val: A numpy array of shape (N_val, D) giving validation data.
- y_val: A numpy array of shape (N_val,) giving validation labels.
- learning_rate: Scalar giving learning rate for optimization.
- learning_rate_decay: Scalar giving factor used to decay the learning rate
after each epoch.
- reg: Scalar giving regularization strength.
- num_iters: Number of steps to take when optimizing.
- batch_size: Number of training examples to use per step.
- verbose: boolean; if true print progress during optimization.
"""
num_train = X.shape[0]
iterations_per_epoch = max(num_train / batch_size, 1)
# Use SGD to optimize the parameters in self.model
loss_history = []
train_acc_history = []
val_acc_history = []
for it in range(num_iters):
X_batch = None
y_batch = None
#########################################################################
# TODO: Create a random minibatch of training data and labels, storing #
# them in X_batch and y_batch respectively. #
#########################################################################
# *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
# generate random indices
indices = np.random.choice(num_train, batch_size)
X_batch, y_batch = X[indices], y[indices]
# *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
# Compute loss and gradients using the current minibatch
loss, grads = self.loss(X_batch, y=y_batch, reg=reg)
loss_history.append(loss)
#########################################################################
# TODO: Use the gradients in the grads dictionary to update the #
# parameters of the network (stored in the dictionary self.params) #
# using stochastic gradient descent. You'll need to use the gradients #
# stored in the grads dictionary defined above. #
#########################################################################
# *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
self.params['W1'] -= learning_rate * grads['W1']
self.params['W2'] -= learning_rate * grads['W2']
self.params['b1'] -= learning_rate * grads['b1']
self.params['b2'] -= learning_rate * grads['b2']
# *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
if verbose and it % 100 == 0:
print('iteration %d / %d: loss %f' % (it, num_iters, loss))
# Every epoch, check train and val accuracy and decay learning rate.
if it % iterations_per_epoch == 0:
# Check accuracy
train_acc = (self.predict(X_batch) == y_batch).mean()
val_acc = (self.predict(X_val) == y_val).mean()
train_acc_history.append(train_acc)
val_acc_history.append(val_acc)
# Decay learning rate
learning_rate *= learning_rate_decay
return {
'loss_history': loss_history,
'train_acc_history': train_acc_history,
'val_acc_history': val_acc_history,
}
def predict(self, X):
"""
Use the trained weights of this two-layer network to predict labels for
data points. For each data point we predict scores for each of the C
classes, and assign each data point to the class with the highest score.
Inputs:
- X: A numpy array of shape (N, D) giving N D-dimensional data points to
classify.
Returns:
- y_pred: A numpy array of shape (N,) giving predicted labels for each of
the elements of X. For all i, y_pred[i] = c means that X[i] is predicted
to have class c, where 0 <= c < C.
"""
y_pred = None
###########################################################################
# TODO: Implement this function; it should be VERY simple! #
###########################################################################
# *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
# define lamba function for relu
relu = lambda x: np.maximum(0, x)
# activations of fully connected layer #1
a1 = relu(X.dot(self.params['W1']) + self.params['b1'])
# output of softmax
# scores = a1 x W2 = (N x H) x (H x C) = N x C
scores = a1.dot(self.params['W2']) + self.params['b2']
y_pred = np.argmax(scores, axis = 1)
# *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
return y_pred
With regards to above code, I could not understand how DW2 was computed well. I took picture of the point I need to clarify and need an explanation for the difference.enter image description here
My ideas
I am following a tutorial on number recognition in images with neural networks, but I just encountered this error when I want to call a method of the Network class
TypeError: 'tuple' object does not support item assignment
I can't figure out what's causing it, please help.
this is the tutorial link
http://neuralnetworksanddeeplearning.com/chap1.html
this is my code
import numpy as np
import random
from keras.datasets import mnist
"""
A module to implement the stochastic gradient descent learning
algorithm for a feedforward neural network. Gradients are calculated
using backpropagation.
"""
class Network(object):
"""Making neural network"""
def __init__(self, sizes):
self.num_layers = len(sizes)
self.sizes = sizes
self.biases = [np.random.randn(y, 1) for y in sizes[1:]]
self.weights = [np.random.randn(y, x)
for x, y in zip(sizes[:-1], sizes[1:])]
def feedforward(self, a):
"""Return the output of the network if a is input"""
for b, w in zip(self.biases, self. weights):
a = sigmoid(np.dot(w, a) + b)
return a
def SGD(self, training_data, epochs, mini_batch_size, eta, test_data=None):
"""Train the neural network using mini-batch stochastic
gradient descent. The ``training_data`` is a list of tuples
``(x, y)`` representing the training inputs and the desired
outputs. The other non-optional parameters are
self-explanatory. If ``test_data`` is provided then the
network will be evaluated agains the test data after each
epoch, and partial progress printed out. This is useful for
tracking progress, but slows things down substantially."""
if test_data:
n_test = len(test_data)
n = len(training_data)
for j in range(epochs):
random.shuffle(training_data)
mini_batches = [
training_data[k: k + mini_batch_size]
for k in range(0, n, mini_batch_size)]
for mini_batch in mini_batches:
self.update_mini_batch(mini_batch, eta)
if test_data:
print("Epoch {0}: {1} / {2}".format(
j, self.evaluate(test_data), n_test))
else:
print("Epoch {0} complete".format(j))
def update_mini_batch(self, mini_batch, eta):
"""Update the network's wrigths and biases by applying
gradient descent using backpropagation to a single mini batch.
The ``mini_batch`` is a list of tuples ``(x, y)``, and ``eta``
is the learning rate"""
nabla_b = [np.zeros(b.shape) for b in self.biases]
nabla_w = [np.zeros(w.shape) for w in self.weights]
for x, y in mini_batch:
delta_nabla_b, delta_nabla_w = self.backprop(x, y)
nabla_b = [nb + dnb for nb, dnb in zip(nabla_b, delta_nabla_b)]
nabla_w = [nw + dnw for nw, dnw in zip(nabla_w, delta_nabla_w)]
self.weights = [w - (eta / len(mini_batch)) * nb
for b, nb in zip(self.biases, nabla_b)]
def backprop(self, x, y):
"""Return a tuple ``(nabla_b, nabla_w`` representing the
gradient for the cost function C_x. ``nabla_b`` and
``nabla_w`` are layer-by-layer lists of numpy arrays, similar
to ``self.biases`` and ``self.weights``."""
nabla_b = [np.zeros(b.shape) for b in self.biases]
nabla_w = [np.zeros(w.shape) for w in self.weights]
# feedforward
activation = x
activations = [x] # list to store all the activations, layer by layer
zs = [] # list to store all the z vectors, layer by layer
for b, w in zip(self.biases, self.weights):
z = np.dot(w, activation) + b
zs.append(z)
activation = sigmoid(z)
activations.append(activation)
# backward pass
delta = self.cost_derivative(activations[-1], y) * sigmoid_prime(zs[-1])
nabla_b[-1] = delta
nabla_w[-1] = np.dot(delta, activations[-2].transpose())
# l = 1 means the last layer of neurons, l = 2 is the
# secod-last layer, and so on. It's a renumbering of the
# scheme in the book, used here to take advantage of the fact
# that python can use negative indices in lists.
for l in range(2, self.num_layers):
z = zs[-l]
sp = sigmoid_prime(z)
delta = np.dot(self.weights[-l + 1].transpose(), delta) * sp
nabla_b[-l] = delta
nabla_w[-l] = np.dot(delta, activations[-l, -1].transpose())
return (nabla_b, nabla_w)
def evaluate(self, test_data):
"""Return the number of test inputs for which the neural
network outputs the correct result. Note that the neural
network's output is assumed to be the index of wicheve
neuron in the final layer has the highest activation"""
test_results = [(np.argmax(self.feedforward(x)), y)
for (x, y) in test_data]
return sum(int(x == y) for (x, y) in test_results)
def cost_derivative(self, output_activations, y):
"""Return the vector of partial derivatives ``\partial C_x / \partial a``
for the output activations"""
return (output_activations - y)
def sigmoid(z):
"""Sigmoid function"""
return 1 / (1.0 + np.exp(-z))
def sigmoid_prime(z):
"""Derivative of the sigmoid function"""
return sigmoid(z) * (1 - sigmoid(z))
training_data, test_data = mnist.load_data()
net = Network([784, 30, 10])
net.SGD(training_data=training_data, epochs=30, mini_batch_size=10, eta=3.0, test_data=test_data)
test_data=test_data)
TypeError: 'tuple' object does not support item assignment
Do you have any idea why this network doesn't want to learn? The idea is that it uses ReLU as an activation function in earlier layers and sigmoid as an activation function in the last layer. The network learned fine when I used only sigmoid. To verify the network I used MNIST.
def sigmoid( z ):
return 1.0 / (1.0 + np.exp(-z))
def sigmoid_prime(z):
return sigmoid(z)*(1-sigmoid(z))
def RELU(z):
return z*(z>0)
def RELU_Prime(z):
return (z>0)
# x - training data in mnist for example (1,784) vector
# y - training label in mnist for example (1,10) vector
# nabla is gradient for the current x and y
def backprop(self, x, y):
nabla_b = [np.zeros(b.shape) for b in self.biases]
nabla_w = [np.zeros(w.shape) for w in self.weights]
# feedforward
activation = x
activations = [x] # list to store all the activations, layer by layer
zs = [] # list to store all the z vectors, layer by layer
index =0
for b, w in zip(self.biases, self.weights):
z = np.dot(w, activation)+b
zs.append(z)
if index == len(self.weights)-1:
activation = sigmoid(z)
#previous layers are RELU
else:
activation = RELU(z)
activations.append(activation)
index +=1
# backward pass
delta = self.cost_derivative(activations[-1], y) *\
sigmoid_prime(zs[-1])
nabla_b[-1] = delta
nabla_w[-1] = np.dot(delta, activations[-2].transpose())
for l in range(2, self.num_layers):
z = zs[-l]
sp = RELU_Prime(z)
delta = np.dot(self.weights[-l+1].transpose(), delta) * sp
nabla_b[-l] = delta
nabla_w[-l] = np.dot(delta, activations[-l-1].transpose())
return (nabla_b, nabla_w)
--------------- Edit -----------------------------
def cost_derivative(self, output_activations, y):
return (output_activations-y)
--------------- Edit 2 -----------------------------
self.weights = [w-(eta/len(mini_batch))*nw
for w, nw in zip(self.weights, nabla_w)]
self.biases = [b-(eta/len(mini_batch))*nb
for b, nb in zip(self.biases, nabla_b)]
eta > 0
For those in future the answer for this problem is simple but hidden :). It turns out the weight initialization was wrong. To make it work you have to use Xavier initialization and multiply it by 2.
Currently I'm learning from Andrew Ng course on Coursera called "Machine Learning". In exercise 5, we built a model that can predict digits, trained by the MNIST dataset. This task was completed successfully in Matlab by me, but I wanted to migrate that code to Python, just to see how different things are and maybe continue to play around with the model.
I managed to implement the cost function and the back propagation algorithm correctly. I know that because I compared the metrics with my working model in Matlab and it emits the same numbers.
Now, because in the course we train the model using fmincg, I tried to do the same using Scipy fmin_cg
function.
My problem is, the cost function takes extra small steps and fails to converge.
Here is my code for the network:
import numpy as np
import utils
import scipy.optimize as op
class Network:
def __init__(self, layers):
self.layers = layers
self.weights = self.generate_params()
# Function for generating theta multidimensional matrix
def generate_params(self):
theta = []
epsilon = 0.12
for i in range(len(self.layers) - 1):
current_layer_units = self.layers[i]
next_layer_units = self.layers[i + 1]
theta_i = np.multiply(
np.random.rand(next_layer_units, current_layer_units + 1),
2 * epsilon - epsilon
)
# Appending the params to the theta matrix
theta.append(theta_i)
return theta
# Function to append bias row/column to matrix X
def append_bias(self, X, d):
m = X.shape[0]
n = 1 if len(X.shape) == 1 else X.shape[1]
if (d == 'column'):
ones = np.ones((m, n + 1))
ones[:, 1:] = X.reshape((m, n))
elif (d == 'row'):
ones = np.ones((m + 1, n))
ones[1:, :] = X.reshape((m, n))
return ones
# Function for computing the gradient for 1 training example
def back_prop(self, y, feed, theta):
activations = feed["activations"]
weighted_layers = feed["weighted_layers"]
delta_output = activations[-1] - y.reshape(len(y), 1)
current_delta = delta_output
# Initializing gradients
gradients = []
for i, theta_i in enumerate(theta):
gradients.append(np.zeros(theta_i.shape))
# Peforming delta calculations.
# Here, we continue to propagate the delta values backwards
# until we arrive to the second layer.
for i in reversed(range(len(theta))):
theta_i = theta[i]
if (i > 0):
i_weighted_inputs = self.append_bias(weighted_layers[i - 1], 'row')
t_theta_i = np.transpose(theta_i)
delta_i = np.multiply(np.dot(t_theta_i, current_delta), utils.sigmoidGradient(i_weighted_inputs))
delta_i = delta_i[1:]
gradients[i] = current_delta * np.transpose(activations[i])
# Setting current delta for the next layer
current_delta = delta_i
else:
gradients[i] = current_delta * np.transpose(activations[i])
return gradients
# Function for computing the cost and the derivatives
def compute_cost(self, theta, X, y, r12n = 0):
m = len(X)
num_labels = self.layers[-1]
costs = np.zeros(m)
# Initializing gradients
gradients = []
for i, theta_i in enumerate(theta):
gradients.append(np.zeros(theta_i.shape))
# Iterating over the training set
for i in range(m):
inputs = X[i]
observed = utils.create_output_vector(y[i], num_labels)
feed = self.feed_forward(inputs)
predicted = feed["activations"][-1]
total_cost = 0
for k, o in enumerate(observed):
if (o == 1):
total_cost += np.log(predicted[k])
else:
total_cost += np.log(1 - predicted[k])
cost = -1 * total_cost
# Storing the cost for the i-th training example
costs[i] = cost
# Calculating the gradient for this training example
# using back propagation algorithm
gradients_i = self.back_prop(observed, feed, theta)
for i, gradient in enumerate(gradients_i):
gradients[i] += gradient
# Calculating the avg regularization term for the cost
sum_of_theta = 0
for i, theta_i in enumerate(theta):
squared_theta = np.power(theta_i[:, 1:], 2)
sum_of_theta += np.sum(squared_theta)
r12n_avg = r12n * sum_of_theta / (2 * m)
total_cost = np.sum(costs) / m + r12n_avg
# Applying regularization terms to the gradients
for i, theta_i in enumerate(theta):
lambda_i = np.copy(theta_i)
lambda_i[:, 0] = 0
lambda_i = np.multiply((r12n / m), lambda_i)
# Adding the r12n matrix to the gradient
gradients[i] = gradients[i] / m + lambda_i
return total_cost, gradients
# Function for training the neural network using conjugate gradient algorithm
def train_cg(self, X, y, r12n = 0, iterations = 50):
weights = self.weights
def Cost(theta, X, y):
theta = utils.roll_theta(theta, self.layers)
cost, _ = self.compute_cost(theta, X, y, r12n)
print(cost);
return cost
def Gradient(theta, X, y):
theta = utils.roll_theta(theta, self.layers)
_, gradient = self.compute_cost(theta, X, y, r12n)
return utils.unroll_theta(gradient)
unrolled_theta = utils.unroll_theta(weights)
result = op.fmin_cg(f = Cost,
x0 = unrolled_theta,
args=(X, y),
fprime=Gradient,
maxiter = iterations)
self.weights = utils.roll_theta(result, self.layers)
# Function for feeding forward the network
def feed_forward(self, X):
# Useful variables
activations = []
weighted_layers = []
weights = self.weights
currentActivations = self.append_bias(X, 'row')
activations.append(currentActivations)
for i in range(len(self.layers) - 1):
layer_weights = weights[i]
weighted_inputs = np.dot(layer_weights, currentActivations)
# Storing the weighted inputs
weighted_layers.append(weighted_inputs)
activation_nodes = []
# If the next layer is not the output layer, we'd like to add a bias unit to it
# (Excluding the input and the output layer)
if (i < len(self.layers) - 2):
activation_nodes = self.append_bias(utils.sigmoid(weighted_inputs), 'row')
else:
activation_nodes = utils.sigmoid(weighted_inputs)
# Appending the layer of nodes to the activations array
activations.append(activation_nodes)
currentActivations = activation_nodes
data = {
"activations": activations,
"weighted_layers": weighted_layers
}
return data
def predict(self, X):
data = self.feed_forward(X)
output = data["activations"][-1]
# Finding the max index in the output layer
return np.argmax(output, axis=0)
Here is the invocation of the code:
import numpy as np
from network import Network
# %% Load data
X = np.genfromtxt('data/mnist_data.csv', delimiter=',')
y = np.genfromtxt('data/mnist_outputs.csv', delimiter=',').astype(int)
# %% Create network
num_labels = 10
input_layer = 400
hidden_layer = 25
output_layer = num_labels
layers = [input_layer, hidden_layer, output_layer]
# Create a new neural network
network = Network(layers)
# %% Train the network and save the weights
network.train_cg(X, y, r12n = 1, iterations = 20)
This is what the code emits after each iteration:
15.441233231650283
15.441116436313076
15.441192262452514
15.44122384651483
15.441231216030646
15.441232804294314
15.441233141284435
15.44123321255294
15.441233227614855
As you can see, the changes to the cost are very small.
I checked for the shapes of the vectors and gradient and they both seem fine, just like in my Matlab implementation. I'm not sure what I do wrong here.
If you guys could help me, that'd be great :)
I am working through Nielsen's Neural Networks and Deep Learning. To develop my understanding Nielsen suggests rewriting his back-propagation algorithm to take a matrix based approach (supposedly much quicker due to optimizations in linear algebra libraries).
Currently I get a very low/fluctuating accuracy between 9-10% every single time. Normally, I'd continue working on my understanding, but I have worked this algorithm for the better part of 3 days and I feel like I have a pretty good handle on the math behind backprop. Regardless, I continue to generate mediocre results for accuracy, so any insight would be greatly appreciated!!!
I'm using the MNIST handwritten digits database.
neural_net_batch.py
the neural network functions (backprop in here)
"""
neural_net_batch.py
neural_net.py modified to use matrix operations
"""
# Libs
import random
import numpy as np
# Neural Network
class Network(object):
def __init__(self, sizes):
self.num_layers = len(sizes) # Number of layers in network
self.sizes = sizes # Number of neurons in each layer
self.biases = [np.random.randn(y, 1) for y in sizes[1:]] # Bias vector, 1 bias for each neuron in each layer, except input neurons
self.weights = [np.random.randn(y, x) for x, y in zip(sizes[:-1], sizes[1:])] # Weight matrix
# Feed Forward Function
# Returns netowrk output for input a
def feedforward(self, a):
for b, w in zip(self.biases, self.weights): # a’ = σ(wa + b)
a = sigmoid(np.dot(w, a)+b)
return a
# Stochastic Gradient Descent
def SGD(self, training_set, epochs, m, eta, test_data):
if test_data: n_test = len(test_data)
n = len(training_set)
# Epoch loop
for j in range(epochs):
# Shuffle training data & parcel out mini batches
random.shuffle(training_set)
mini_batches = [training_set[k:k+m] for k in range(0, n, m)]
# Pass mini batches one by one to be updated
for mini_batch in mini_batches:
self.update_mini_batch(mini_batch, eta)
# End of Epoch (optional epoch testing)
if test_data:
evaluation = self.evaluate(test_data)
print("Epoch %6i: %5i / %5i" % (j, evaluation, n_test))
else:
print("Epoch %5i complete" % (j))
# Update Mini Batch (Matrix approach)
def update_mini_batch(self, mini_batch, eta):
m = len(mini_batch)
nabla_b = []
nabla_w = []
# Build activation & answer matrices
x = np.asarray([_x.ravel() for _x,_y in mini_batch]) # 10x784 where each row is an input vector
y = np.asarray([_y.ravel() for _x,_y in mini_batch]) # 10x10 where each row is an desired output vector
nabla_b, nabla_w = self.backprop(x, y) # Feed matrices into backpropagation
# Train Biases & weights
self.biases = [b-(eta/m)*nb for b, nb in zip(self.biases, nabla_b)]
self.weights = [w-(eta/m)*nw for w, nw in zip(self.weights, nabla_w)]
def backprop(self, x, y):
# Gradient arrays
nabla_b = [0 for i in self.biases]
nabla_w = [0 for i in self.weights]
w = self.weights
# Vars
m = len(x) # Mini batch size
a = x # Activation matrix temp variable
a_s = [x] # Activation matrix record
z_s = [] # Weighted Activation matrix record
special_b = [] # Special bias matrix to facilitate matrix operations
# Build special bias matrix (repeating biases for each example)
for j in range(len(self.biases)):
special_b.append([])
for k in range(m):
special_b[j].append(self.biases[j].flatten())
special_b[j] = np.asarray(special_b[j])
# Forward pass
# Starting at the input layer move through each layer
for l in range(len(self.sizes)-1):
z = a # w[l].transpose() + special_b[l]
z_s.append(z)
a = sigmoid(z)
a_s.append(a)
# Backward pass
delta = cost_derivative(a_s[-1], y) * sigmoid_prime(z_s[-1])
nabla_b[-1] = delta
nabla_w[-1] = delta # a_s[-2]
for n in range(2, self.num_layers):
z = z_s[-n]
sp = sigmoid_prime(z)
delta = self.weights[-n+1].transpose() # delta * sp.transpose()
nabla_b[-n] = delta
nabla_w[-n] = delta # a_s[-n-1]
# Create bias vectors by summing bias columns elementwise
for i in range(len(nabla_b)):
temp = []
for j in nabla_b[i]:
temp.append(sum(j))
nabla_b[i] = np.asarray(temp).reshape(-1,1)
return [nabla_b, nabla_w]
def evaluate(self, test_data):
test_results = [(np.argmax(self.feedforward(t[0])), t[1]) for t in test_data]
return sum(int(x==y) for (x, y) in test_results)
# Cost Derivative Function
# Returns the vector of partial derivatives C_x, a for the output activations y
def cost_derivative(output_activations, y):
return(output_activations-y)
# Sigmoid Function
def sigmoid(z):
return 1.0/(1.0+np.exp(-z))
# Sigmoid Prime (Derivative) Function
def sigmoid_prime(z):
return sigmoid(z)*(1-sigmoid(z))
MNIST_TEST.py
test script
import mnist_data
import neural_net_batch as nn
# Data Sets
training_data, validation_data, test_data = mnist_data.load_data_wrapper()
training_data = list(training_data)
validation_data = list(validation_data)
test_data = list(test_data)
# Network
net = nn.Network([784, 30, 10])
# Perform Stochastic Gradient Descent using MNIST training & test data,
# 30 epochs, mini_batch size of 10, and learning rate of 3.0
net.SGD(list(training_data), 30, 10, 3.0, test_data=test_data)
A very helpful Reddit (u/xdaimon) helped me to get the following answer (on Reddit):
Your backward pass should be
# Backward pass
delta = cost_derivative(a_s[-1], y) * sigmoid_prime(z_s[-1])
nabla_b[-1] = delta.T
nabla_w[-1] = delta.T # a_s[-2]
for n in range(2, self.num_layers):
z = z_s[-n]
sp = sigmoid_prime(z)
delta = delta # self.weights[-n+1] * sp
nabla_b[-n] = delta.T
nabla_w[-n] = delta.T # a_s[-n-1]
One way to find this bug is to remember that there should be a
transpose somewhere in the product that computes nabla_w.
And if you're interested, the transpose shows up in the matrix
implementation of backprop because AB is the same as the sum of outer
products of the columns of A and the rows of B. In this case A=delta.T
and B=a_s[-n-1] and so the outer products are between the rows of
delta and the rows of a_s[-n-1]. Each term in the sum is nabla_w for a
single element in the batch which is exactly what we want. If the
minibatch size is 1 you can easily see that delta.T#a_s[-n-1] is just
the outer product of the delta vector and activation vector.
Testing shows not only is the network accurate again, the expected speedup is present.