I am following a tutorial on recognizing handwritten digits in images with neural networks, but I just encountered this error when calling a method of the Network class:
TypeError: 'tuple' object does not support item assignment
I can't figure out what's causing it; please help.
This is the tutorial link:
http://neuralnetworksanddeeplearning.com/chap1.html
This is my code:
import numpy as np
import random
from keras.datasets import mnist
"""
A module to implement the stochastic gradient descent learning
algorithm for a feedforward neural network. Gradients are calculated
using backpropagation.
"""
class Network(object):
"""Making neural network"""
def __init__(self, sizes):
self.num_layers = len(sizes)
self.sizes = sizes
self.biases = [np.random.randn(y, 1) for y in sizes[1:]]
self.weights = [np.random.randn(y, x)
for x, y in zip(sizes[:-1], sizes[1:])]
def feedforward(self, a):
"""Return the output of the network if a is input"""
        for b, w in zip(self.biases, self.weights):
a = sigmoid(np.dot(w, a) + b)
return a
def SGD(self, training_data, epochs, mini_batch_size, eta, test_data=None):
"""Train the neural network using mini-batch stochastic
gradient descent. The ``training_data`` is a list of tuples
``(x, y)`` representing the training inputs and the desired
outputs. The other non-optional parameters are
self-explanatory. If ``test_data`` is provided then the
        network will be evaluated against the test data after each
epoch, and partial progress printed out. This is useful for
tracking progress, but slows things down substantially."""
if test_data:
n_test = len(test_data)
n = len(training_data)
for j in range(epochs):
random.shuffle(training_data)
mini_batches = [
training_data[k: k + mini_batch_size]
for k in range(0, n, mini_batch_size)]
for mini_batch in mini_batches:
self.update_mini_batch(mini_batch, eta)
if test_data:
print("Epoch {0}: {1} / {2}".format(
j, self.evaluate(test_data), n_test))
else:
print("Epoch {0} complete".format(j))
def update_mini_batch(self, mini_batch, eta):
"""Update the network's wrigths and biases by applying
gradient descent using backpropagation to a single mini batch.
The ``mini_batch`` is a list of tuples ``(x, y)``, and ``eta``
is the learning rate"""
nabla_b = [np.zeros(b.shape) for b in self.biases]
nabla_w = [np.zeros(w.shape) for w in self.weights]
for x, y in mini_batch:
delta_nabla_b, delta_nabla_w = self.backprop(x, y)
nabla_b = [nb + dnb for nb, dnb in zip(nabla_b, delta_nabla_b)]
nabla_w = [nw + dnw for nw, dnw in zip(nabla_w, delta_nabla_w)]
        self.weights = [w - (eta / len(mini_batch)) * nw
                        for w, nw in zip(self.weights, nabla_w)]
        self.biases = [b - (eta / len(mini_batch)) * nb
                       for b, nb in zip(self.biases, nabla_b)]
def backprop(self, x, y):
"""Return a tuple ``(nabla_b, nabla_w`` representing the
gradient for the cost function C_x. ``nabla_b`` and
``nabla_w`` are layer-by-layer lists of numpy arrays, similar
to ``self.biases`` and ``self.weights``."""
nabla_b = [np.zeros(b.shape) for b in self.biases]
nabla_w = [np.zeros(w.shape) for w in self.weights]
# feedforward
activation = x
activations = [x] # list to store all the activations, layer by layer
zs = [] # list to store all the z vectors, layer by layer
for b, w in zip(self.biases, self.weights):
z = np.dot(w, activation) + b
zs.append(z)
activation = sigmoid(z)
activations.append(activation)
# backward pass
delta = self.cost_derivative(activations[-1], y) * sigmoid_prime(zs[-1])
nabla_b[-1] = delta
nabla_w[-1] = np.dot(delta, activations[-2].transpose())
# l = 1 means the last layer of neurons, l = 2 is the
        # second-last layer, and so on. It's a renumbering of the
# scheme in the book, used here to take advantage of the fact
# that python can use negative indices in lists.
for l in range(2, self.num_layers):
z = zs[-l]
sp = sigmoid_prime(z)
delta = np.dot(self.weights[-l + 1].transpose(), delta) * sp
nabla_b[-l] = delta
            nabla_w[-l] = np.dot(delta, activations[-l - 1].transpose())
return (nabla_b, nabla_w)
def evaluate(self, test_data):
"""Return the number of test inputs for which the neural
network outputs the correct result. Note that the neural
        network's output is assumed to be the index of whichever
neuron in the final layer has the highest activation"""
test_results = [(np.argmax(self.feedforward(x)), y)
for (x, y) in test_data]
return sum(int(x == y) for (x, y) in test_results)
def cost_derivative(self, output_activations, y):
"""Return the vector of partial derivatives ``\partial C_x / \partial a``
for the output activations"""
return (output_activations - y)
def sigmoid(z):
"""Sigmoid function"""
return 1 / (1.0 + np.exp(-z))
def sigmoid_prime(z):
"""Derivative of the sigmoid function"""
return sigmoid(z) * (1 - sigmoid(z))
training_data, test_data = mnist.load_data()
net = Network([784, 30, 10])
net.SGD(training_data=training_data, epochs=30, mini_batch_size=10, eta=3.0, test_data=test_data)
TypeError: 'tuple' object does not support item assignment
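For reference, keras's mnist.load_data() returns tuples of numpy arrays rather than the list of (x, y) pairs the tutorial's SGD expects (random.shuffle needs a mutable list). A rough conversion sketch, not tested against the code above, with the one-hot helper written inline as an assumption:
import numpy as np
from keras.datasets import mnist

# keras returns tuples of arrays: x_train has shape (60000, 28, 28), y_train (60000,)
(x_train, y_train), (x_test, y_test) = mnist.load_data()

def vectorize(j):
    """One-hot encode a digit label as a (10, 1) column vector."""
    e = np.zeros((10, 1))
    e[j] = 1.0
    return e

# The tutorial's SGD expects a list of (x, y) tuples with x shaped (784, 1)
training_data = [(x.reshape(784, 1) / 255.0, vectorize(y))
                 for x, y in zip(x_train, y_train)]
# evaluate() compares argmax(output) with an integer label, so keep y as an int here
test_data = [(x.reshape(784, 1) / 255.0, int(y))
             for x, y in zip(x_test, y_test)]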
Related
Do you have any idea why this network doesn't want to learn? The idea is that it uses ReLU as the activation function in the earlier layers and sigmoid in the last layer. The network learned fine when I used only sigmoid. To verify the network I used MNIST.
def sigmoid(z):
return 1.0 / (1.0 + np.exp(-z))
def sigmoid_prime(z):
return sigmoid(z)*(1-sigmoid(z))
def RELU(z):
return z*(z>0)
def RELU_Prime(z):
return (z>0)
# x - training data in mnist for example (1,784) vector
# y - training label in mnist for example (1,10) vector
# nabla is gradient for the current x and y
def backprop(self, x, y):
nabla_b = [np.zeros(b.shape) for b in self.biases]
nabla_w = [np.zeros(w.shape) for w in self.weights]
# feedforward
activation = x
activations = [x] # list to store all the activations, layer by layer
zs = [] # list to store all the z vectors, layer by layer
index =0
for b, w in zip(self.biases, self.weights):
z = np.dot(w, activation)+b
zs.append(z)
if index == len(self.weights)-1:
activation = sigmoid(z)
#previous layers are RELU
else:
activation = RELU(z)
activations.append(activation)
index +=1
# backward pass
delta = self.cost_derivative(activations[-1], y) *\
sigmoid_prime(zs[-1])
nabla_b[-1] = delta
nabla_w[-1] = np.dot(delta, activations[-2].transpose())
for l in range(2, self.num_layers):
z = zs[-l]
sp = RELU_Prime(z)
delta = np.dot(self.weights[-l+1].transpose(), delta) * sp
nabla_b[-l] = delta
nabla_w[-l] = np.dot(delta, activations[-l-1].transpose())
return (nabla_b, nabla_w)
--------------- Edit -----------------------------
def cost_derivative(self, output_activations, y):
return (output_activations-y)
--------------- Edit 2 -----------------------------
self.weights = [w-(eta/len(mini_batch))*nw
for w, nw in zip(self.weights, nabla_w)]
self.biases = [b-(eta/len(mini_batch))*nb
for b, nb in zip(self.biases, nabla_b)]
eta > 0
For those in the future: the answer to this problem is simple but hidden :). It turns out the weight initialization was wrong. To make it work you have to use Xavier initialization and multiply it by 2.
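In case it helps, here is a minimal sketch of what that initialization might look like, assuming the same sizes list as in the Network class above and reading "Xavier multiplied by 2" as He-style scaling (i.e. a standard deviation of sqrt(2 / fan_in)):
import numpy as np

def he_initialize(sizes):
    # Gaussian weights scaled by sqrt(2 / fan_in): roughly Xavier with the variance
    # doubled, which is the usual recommendation for ReLU layers
    weights = [np.random.randn(y, x) * np.sqrt(2.0 / x)
               for x, y in zip(sizes[:-1], sizes[1:])]
    biases = [np.zeros((y, 1)) for y in sizes[1:]]
    return weights, biases

# Example: same layer sizes as the MNIST network above
weights, biases = he_initialize([784, 30, 10])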
While learning neural nets I've written my own class.
import numpy as np
import random
def sigmoid(x):
return 1.0 / (1.0 + np.exp(-x))
def sigmoid_prime(x):
return sigmoid(x) * (1 - sigmoid(x))
def linear(x):
return x
def linear_prime(x):
return 1
def tanh(x):
return (np.exp(x) - np.exp(-x))/(np.exp(x) + np.exp(-x))
def tanh_prime(x):
return 1 - tanh(x)*tanh(x)
class Network:
def __init__(self, sizes, activation_func = sigmoid, activation_prime = sigmoid_prime):
self.biases = [np.random.randn(x, 1) for x in sizes[1:]]
self.weights = [np.random.randn(y, x) for x, y in zip(sizes, sizes[1:])]
self.num_layers = len(sizes)
self.sizes = sizes
self.activation_function = activation_func
        self.activation_prime = activation_prime
def forward_prop(self, a):
for w, b in zip(self.weights, self.biases):
a = self.activation_function(np.dot(w, a) + b)
return a
def cost_derivative(self, output_activations, y):
return (output_activations - y)
def backprop(self, x, y):
nabla_b = [np.zeros(b.shape) for b in self.biases]
nabla_w = [np.zeros(w.shape) for w in self.weights]
# forward pass
activation = x # first activation, which is input layer
a_mas = [x]
z_mas = []
for b, w in zip(self.biases, self.weights):
z = np.dot(w, activation) + b
activation = self.activation_function(z)
z_mas.append(z)
a_mas.append(activation)
pass
# backward pass
        delta = self.cost_derivative(a_mas[-1], y) * self.activation_prime(z_mas[-1])
nabla_b[-1] = delta
nabla_w[-1] = np.dot(delta, a_mas[-2].T)
for l in range(2, self.num_layers): # there is 2 such as we've already done for last layer
            delta = np.dot(self.weights[-l + 1].transpose(), delta) * self.activation_prime(z_mas[-l])
nabla_b[-l] = delta
nabla_w[-l] = np.dot(delta, a_mas[-l - 1].T)
return nabla_b, nabla_w
def update_mini_batch(self, mini_batch, eta):
nabla_b = [np.zeros(b.shape) for b in self.biases]
nabla_w = [np.zeros(w.shape) for w in self.weights]
for x, y in mini_batch:
delta_nabla_b, delta_nabla_w = self.backprop(x, y)
nabla_b = [nb + dnb for nb, dnb in zip(nabla_b, delta_nabla_b)]
nabla_w = [nw + dnw for nw, dnw in zip(nabla_w, delta_nabla_w)]
eps = eta / len(mini_batch)
self.weights = [w - eps * nw for w, nw in zip(self.weights, nabla_w)]
self.biases = [b - eps * nb for b, nb in zip(self.biases, nabla_b)]
def SGD(self, training_data, epochs, mini_batch_size, eta):
n = len(training_data)
for j in range(epochs):
random.shuffle(training_data)
mini_batches = [training_data[k:k + mini_batch_size]
for k in range(0, n, mini_batch_size)]
for mini_batch in mini_batches:
self.update_mini_batch(mini_batch, eta)
Now I'm trying to approximate sin() with the help of this net, but the code below doesn't work correctly.
%matplotlib inline
import matplotlib.pyplot as plt
net2 = Network([1,100,1],tanh,tanh_prime)
x = np.linspace(0,10,1000)
y = np.sin(x)
train = [(np.array(x[i]).reshape(1,1),np.array(y[i]).reshape(1,1)) for i in range(len(x))]
net2.SGD(train,10,10,0.1)
y_pred = []
y_tmp = []
for i in range(len(x)):
y_tmp.append(net2.forward_prop(train[i][0]))
y_pred.append(float(net2.forward_prop(train[i][0])))
plt.plot(x, y, 'r', x, y_pred)
plt.grid()
Here is what I got.
I've already tried this net on digit recognition with the MNIST dataset, and there everything worked. I couldn't get better accuracy than 70%, but that's not a problem. Here, though, I have no clue what is wrong... The activation function is tanh().
From what I see, you are minimising the function f(x) - y. You might want to change that to the mean squared error ((f(x) - y)^2) or the mean absolute error (|f(x) - y|), which are appropriate for a regression problem like yours. For a classification problem such as MNIST, cross-entropy is a good choice.
Also, you might try removing the tanh function from the output layer. I do not think that tanh is the problem, since the output is between -1 and 1, but generally, for regression problems, we use linear activations and keep squashing functions such as sigmoid and tanh for classification problems.
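Here is a tiny standalone sketch of what a linear output with a mean-squared-error gradient could look like (this is not your Network class; the hyperparameters are illustrative and may need tuning):
import numpy as np

rng = np.random.default_rng(0)
x = np.linspace(0, 10, 1000).reshape(1, -1)   # inputs as a (1, n) row of samples
y = np.sin(x)

n_hidden = 100
W1 = rng.standard_normal((n_hidden, 1)) * 0.5
b1 = np.zeros((n_hidden, 1))
W2 = rng.standard_normal((1, n_hidden)) * 0.5
b2 = np.zeros((1, 1))

eta = 0.01
for epoch in range(5000):
    # forward pass: tanh hidden layer, linear output
    z1 = W1 @ x + b1              # (n_hidden, n)
    a1 = np.tanh(z1)
    y_hat = W2 @ a1 + b2          # (1, n), no squashing on the output

    # backward pass: gradient of the mean squared error
    delta2 = (y_hat - y) / x.shape[1]
    delta1 = (W2.T @ delta2) * (1 - a1 ** 2)   # tanh'(z) = 1 - tanh(z)^2

    W2 -= eta * (delta2 @ a1.T)
    b2 -= eta * delta2.sum(axis=1, keepdims=True)
    W1 -= eta * (delta1 @ x.T)
    b1 -= eta * delta1.sum(axis=1, keepdims=True)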
Currently I'm taking Andrew Ng's Coursera course "Machine Learning". In exercise 5, we built a model that can predict digits, trained on the MNIST dataset. I completed this task successfully in Matlab, but I wanted to migrate that code to Python, just to see how different things are and maybe continue to play around with the model.
I managed to implement the cost function and the back propagation algorithm correctly. I know that because I compared the metrics with my working model in Matlab and it emits the same numbers.
Now, because in the course we train the model using fmincg, I tried to do the same using SciPy's fmin_cg function.
My problem is that the optimizer takes extremely small steps and fails to converge.
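As a sanity check that I am calling fmin_cg correctly, here is a minimal toy usage of the interface, completely separate from my network code (a quadratic with its analytic gradient):
import numpy as np
import scipy.optimize as op

# Toy cost: f(theta) = ||theta - target||^2, with its analytic gradient
target = np.array([1.0, -2.0, 3.0])

def cost(theta):
    return np.sum((theta - target) ** 2)

def gradient(theta):
    return 2 * (theta - target)

result = op.fmin_cg(f=cost, x0=np.zeros(3), fprime=gradient, maxiter=50)
print(result)  # converges to something close to [1, -2, 3]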
Here is my code for the network:
import numpy as np
import utils
import scipy.optimize as op
class Network:
def __init__(self, layers):
self.layers = layers
self.weights = self.generate_params()
# Function for generating theta multidimensional matrix
def generate_params(self):
theta = []
epsilon = 0.12
for i in range(len(self.layers) - 1):
current_layer_units = self.layers[i]
next_layer_units = self.layers[i + 1]
theta_i = np.multiply(
np.random.rand(next_layer_units, current_layer_units + 1),
2 * epsilon - epsilon
)
# Appending the params to the theta matrix
theta.append(theta_i)
return theta
# Function to append bias row/column to matrix X
def append_bias(self, X, d):
m = X.shape[0]
n = 1 if len(X.shape) == 1 else X.shape[1]
if (d == 'column'):
ones = np.ones((m, n + 1))
ones[:, 1:] = X.reshape((m, n))
elif (d == 'row'):
ones = np.ones((m + 1, n))
ones[1:, :] = X.reshape((m, n))
return ones
# Function for computing the gradient for 1 training example
def back_prop(self, y, feed, theta):
activations = feed["activations"]
weighted_layers = feed["weighted_layers"]
delta_output = activations[-1] - y.reshape(len(y), 1)
current_delta = delta_output
# Initializing gradients
gradients = []
for i, theta_i in enumerate(theta):
gradients.append(np.zeros(theta_i.shape))
        # Performing delta calculations.
# Here, we continue to propagate the delta values backwards
# until we arrive to the second layer.
for i in reversed(range(len(theta))):
theta_i = theta[i]
if (i > 0):
i_weighted_inputs = self.append_bias(weighted_layers[i - 1], 'row')
t_theta_i = np.transpose(theta_i)
delta_i = np.multiply(np.dot(t_theta_i, current_delta), utils.sigmoidGradient(i_weighted_inputs))
delta_i = delta_i[1:]
gradients[i] = current_delta * np.transpose(activations[i])
# Setting current delta for the next layer
current_delta = delta_i
else:
gradients[i] = current_delta * np.transpose(activations[i])
return gradients
# Function for computing the cost and the derivatives
def compute_cost(self, theta, X, y, r12n = 0):
m = len(X)
num_labels = self.layers[-1]
costs = np.zeros(m)
# Initializing gradients
gradients = []
for i, theta_i in enumerate(theta):
gradients.append(np.zeros(theta_i.shape))
# Iterating over the training set
for i in range(m):
inputs = X[i]
observed = utils.create_output_vector(y[i], num_labels)
feed = self.feed_forward(inputs)
predicted = feed["activations"][-1]
total_cost = 0
for k, o in enumerate(observed):
if (o == 1):
total_cost += np.log(predicted[k])
else:
total_cost += np.log(1 - predicted[k])
cost = -1 * total_cost
# Storing the cost for the i-th training example
costs[i] = cost
# Calculating the gradient for this training example
# using back propagation algorithm
gradients_i = self.back_prop(observed, feed, theta)
for i, gradient in enumerate(gradients_i):
gradients[i] += gradient
# Calculating the avg regularization term for the cost
sum_of_theta = 0
for i, theta_i in enumerate(theta):
squared_theta = np.power(theta_i[:, 1:], 2)
sum_of_theta += np.sum(squared_theta)
r12n_avg = r12n * sum_of_theta / (2 * m)
total_cost = np.sum(costs) / m + r12n_avg
# Applying regularization terms to the gradients
for i, theta_i in enumerate(theta):
lambda_i = np.copy(theta_i)
lambda_i[:, 0] = 0
lambda_i = np.multiply((r12n / m), lambda_i)
# Adding the r12n matrix to the gradient
gradients[i] = gradients[i] / m + lambda_i
return total_cost, gradients
# Function for training the neural network using conjugate gradient algorithm
def train_cg(self, X, y, r12n = 0, iterations = 50):
weights = self.weights
def Cost(theta, X, y):
theta = utils.roll_theta(theta, self.layers)
cost, _ = self.compute_cost(theta, X, y, r12n)
print(cost);
return cost
def Gradient(theta, X, y):
theta = utils.roll_theta(theta, self.layers)
_, gradient = self.compute_cost(theta, X, y, r12n)
return utils.unroll_theta(gradient)
unrolled_theta = utils.unroll_theta(weights)
result = op.fmin_cg(f = Cost,
x0 = unrolled_theta,
args=(X, y),
fprime=Gradient,
maxiter = iterations)
self.weights = utils.roll_theta(result, self.layers)
# Function for feeding forward the network
def feed_forward(self, X):
# Useful variables
activations = []
weighted_layers = []
weights = self.weights
currentActivations = self.append_bias(X, 'row')
activations.append(currentActivations)
for i in range(len(self.layers) - 1):
layer_weights = weights[i]
weighted_inputs = np.dot(layer_weights, currentActivations)
# Storing the weighted inputs
weighted_layers.append(weighted_inputs)
activation_nodes = []
# If the next layer is not the output layer, we'd like to add a bias unit to it
# (Excluding the input and the output layer)
if (i < len(self.layers) - 2):
activation_nodes = self.append_bias(utils.sigmoid(weighted_inputs), 'row')
else:
activation_nodes = utils.sigmoid(weighted_inputs)
# Appending the layer of nodes to the activations array
activations.append(activation_nodes)
currentActivations = activation_nodes
data = {
"activations": activations,
"weighted_layers": weighted_layers
}
return data
def predict(self, X):
data = self.feed_forward(X)
output = data["activations"][-1]
# Finding the max index in the output layer
return np.argmax(output, axis=0)
Here is the invocation of the code:
import numpy as np
from network import Network
# %% Load data
X = np.genfromtxt('data/mnist_data.csv', delimiter=',')
y = np.genfromtxt('data/mnist_outputs.csv', delimiter=',').astype(int)
# %% Create network
num_labels = 10
input_layer = 400
hidden_layer = 25
output_layer = num_labels
layers = [input_layer, hidden_layer, output_layer]
# Create a new neural network
network = Network(layers)
# %% Train the network and save the weights
network.train_cg(X, y, r12n = 1, iterations = 20)
This is what the code emits after each iteration:
15.441233231650283
15.441116436313076
15.441192262452514
15.44122384651483
15.441231216030646
15.441232804294314
15.441233141284435
15.44123321255294
15.441233227614855
As you can see, the changes to the cost are very small.
I checked the shapes of the vectors and the gradient and they both seem fine, just like in my Matlab implementation. I'm not sure what I'm doing wrong here.
If you could help me, that'd be great :)
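One thing I could still try (sketched below, not yet part of my code) is a finite-difference gradient check against compute_cost; the roll_theta/unroll_theta helpers are the same ones assumed by the code above:
import numpy as np

def numerical_gradient(cost_fn, theta_flat, eps=1e-4):
    # Central-difference approximation of d(cost)/d(theta) for a flat parameter vector
    grad = np.zeros_like(theta_flat)
    for i in range(len(theta_flat)):
        step = np.zeros_like(theta_flat)
        step[i] = eps
        grad[i] = (cost_fn(theta_flat + step) - cost_fn(theta_flat - step)) / (2 * eps)
    return grad

# Usage sketch against the Network above (a small slice of the data keeps it fast):
# flat = utils.unroll_theta(network.weights)
# cost_fn = lambda t: network.compute_cost(utils.roll_theta(t, network.layers), X[:5], y[:5])[0]
# analytic = utils.unroll_theta(network.compute_cost(network.weights, X[:5], y[:5])[1])
# print(np.max(np.abs(numerical_gradient(cost_fn, flat) - analytic)))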
I am working through Nielsen's Neural Networks and Deep Learning. To develop my understanding, Nielsen suggests rewriting his backpropagation algorithm to take a matrix-based approach (supposedly much quicker due to optimizations in linear algebra libraries).
Currently I get a very low, fluctuating accuracy of 9-10% every single time. Normally I'd keep working on my understanding, but I have worked on this algorithm for the better part of 3 days and I feel like I have a pretty good handle on the math behind backprop. Regardless, I continue to get poor accuracy, so any insight would be greatly appreciated!
I'm using the MNIST handwritten digits database.
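To make the matrix idea concrete before the full code, here is a tiny standalone illustration (separate from the module below): stacking a mini-batch as rows lets a single np.dot replace the per-example loop.
import numpy as np

rng = np.random.default_rng(0)
W = rng.standard_normal((30, 784))       # weights for one layer
b = rng.standard_normal((30, 1))         # biases for that layer
batch = rng.standard_normal((10, 784))   # a mini-batch: one example per row

# Looped version: one (784, 1) column vector at a time
looped = np.vstack([(np.dot(W, x.reshape(-1, 1)) + b).ravel() for x in batch])

# Matrix version: the whole mini-batch in a single product
batched = np.dot(batch, W.T) + b.ravel() # shape (10, 30), one row per example

print(np.allclose(looped, batched))      # True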
neural_net_batch.py
the neural network functions (backprop in here)
"""
neural_net_batch.py
neural_net.py modified to use matrix operations
"""
# Libs
import random
import numpy as np
# Neural Network
class Network(object):
def __init__(self, sizes):
self.num_layers = len(sizes) # Number of layers in network
self.sizes = sizes # Number of neurons in each layer
self.biases = [np.random.randn(y, 1) for y in sizes[1:]] # Bias vector, 1 bias for each neuron in each layer, except input neurons
self.weights = [np.random.randn(y, x) for x, y in zip(sizes[:-1], sizes[1:])] # Weight matrix
# Feed Forward Function
    # Returns network output for input a
def feedforward(self, a):
for b, w in zip(self.biases, self.weights): # a’ = σ(wa + b)
a = sigmoid(np.dot(w, a)+b)
return a
# Stochastic Gradient Descent
def SGD(self, training_set, epochs, m, eta, test_data):
if test_data: n_test = len(test_data)
n = len(training_set)
# Epoch loop
for j in range(epochs):
# Shuffle training data & parcel out mini batches
random.shuffle(training_set)
mini_batches = [training_set[k:k+m] for k in range(0, n, m)]
# Pass mini batches one by one to be updated
for mini_batch in mini_batches:
self.update_mini_batch(mini_batch, eta)
# End of Epoch (optional epoch testing)
if test_data:
evaluation = self.evaluate(test_data)
print("Epoch %6i: %5i / %5i" % (j, evaluation, n_test))
else:
print("Epoch %5i complete" % (j))
# Update Mini Batch (Matrix approach)
def update_mini_batch(self, mini_batch, eta):
m = len(mini_batch)
nabla_b = []
nabla_w = []
# Build activation & answer matrices
x = np.asarray([_x.ravel() for _x,_y in mini_batch]) # 10x784 where each row is an input vector
        y = np.asarray([_y.ravel() for _x,_y in mini_batch]) # 10x10 where each row is a desired output vector
nabla_b, nabla_w = self.backprop(x, y) # Feed matrices into backpropagation
# Train Biases & weights
self.biases = [b-(eta/m)*nb for b, nb in zip(self.biases, nabla_b)]
self.weights = [w-(eta/m)*nw for w, nw in zip(self.weights, nabla_w)]
def backprop(self, x, y):
# Gradient arrays
nabla_b = [0 for i in self.biases]
nabla_w = [0 for i in self.weights]
w = self.weights
# Vars
m = len(x) # Mini batch size
a = x # Activation matrix temp variable
a_s = [x] # Activation matrix record
z_s = [] # Weighted Activation matrix record
special_b = [] # Special bias matrix to facilitate matrix operations
# Build special bias matrix (repeating biases for each example)
for j in range(len(self.biases)):
special_b.append([])
for k in range(m):
special_b[j].append(self.biases[j].flatten())
special_b[j] = np.asarray(special_b[j])
# Forward pass
# Starting at the input layer move through each layer
for l in range(len(self.sizes)-1):
            z = a @ w[l].transpose() + special_b[l]
z_s.append(z)
a = sigmoid(z)
a_s.append(a)
# Backward pass
delta = cost_derivative(a_s[-1], y) * sigmoid_prime(z_s[-1])
nabla_b[-1] = delta
        nabla_w[-1] = delta @ a_s[-2]
for n in range(2, self.num_layers):
z = z_s[-n]
sp = sigmoid_prime(z)
            delta = self.weights[-n+1].transpose() @ delta * sp.transpose()
nabla_b[-n] = delta
            nabla_w[-n] = delta @ a_s[-n-1]
# Create bias vectors by summing bias columns elementwise
for i in range(len(nabla_b)):
temp = []
for j in nabla_b[i]:
temp.append(sum(j))
nabla_b[i] = np.asarray(temp).reshape(-1,1)
return [nabla_b, nabla_w]
def evaluate(self, test_data):
test_results = [(np.argmax(self.feedforward(t[0])), t[1]) for t in test_data]
return sum(int(x==y) for (x, y) in test_results)
# Cost Derivative Function
# Returns the vector of partial derivatives C_x, a for the output activations y
def cost_derivative(output_activations, y):
return(output_activations-y)
# Sigmoid Function
def sigmoid(z):
return 1.0/(1.0+np.exp(-z))
# Sigmoid Prime (Derivative) Function
def sigmoid_prime(z):
return sigmoid(z)*(1-sigmoid(z))
MNIST_TEST.py
test script
import mnist_data
import neural_net_batch as nn
# Data Sets
training_data, validation_data, test_data = mnist_data.load_data_wrapper()
training_data = list(training_data)
validation_data = list(validation_data)
test_data = list(test_data)
# Network
net = nn.Network([784, 30, 10])
# Perform Stochastic Gradient Descent using MNIST training & test data,
# 30 epochs, mini_batch size of 10, and learning rate of 3.0
net.SGD(list(training_data), 30, 10, 3.0, test_data=test_data)
A very helpful Redditor (u/xdaimon) helped me get the following answer:
Your backward pass should be
# Backward pass
delta = cost_derivative(a_s[-1], y) * sigmoid_prime(z_s[-1])
nabla_b[-1] = delta.T
        nabla_w[-1] = delta.T @ a_s[-2]
for n in range(2, self.num_layers):
z = z_s[-n]
sp = sigmoid_prime(z)
            delta = delta @ self.weights[-n+1] * sp
nabla_b[-n] = delta.T
            nabla_w[-n] = delta.T @ a_s[-n-1]
One way to find this bug is to remember that there should be a
transpose somewhere in the product that computes nabla_w.
And if you're interested, the transpose shows up in the matrix
implementation of backprop because AB is the same as the sum of outer
products of the columns of A and the rows of B. In this case A=delta.T
and B=a_s[-n-1] and so the outer products are between the rows of
delta and the rows of a_s[-n-1]. Each term in the sum is nabla_w for a
single element in the batch which is exactly what we want. If the
minibatch size is 1 you can easily see that delta.T @ a_s[-n-1] is just
the outer product of the delta vector and activation vector.
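A quick numpy check of that outer-product identity (arbitrary shapes, m examples in the batch):
import numpy as np

rng = np.random.default_rng(0)
m, hidden, prev = 10, 30, 784            # batch size and layer widths
delta = rng.standard_normal((m, hidden))
a_prev = rng.standard_normal((m, prev))

# Matrix product over the whole batch
nabla_w = delta.T @ a_prev               # shape (hidden, prev)

# Sum of per-example outer products
summed = sum(np.outer(delta[i], a_prev[i]) for i in range(m))

print(np.allclose(nabla_w, summed))      # True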
Testing shows that not only is the network accurate again, but the expected speedup is present as well.
I have this implementation of a feedforward neural network with stochastic gradient descent in Python. When I train an NN instance on the XOR gate data, it trains just fine. But when I train an instance on stock change data, it throws this error:
Traceback (most recent call last):
File "F:\predict\test.py", line 21, in <module>
stock.train(stockData, 30, 10, 3.0)
File "F:\predict\network.py", line 46, in train
self.updateMiniBatch(miniBatch, eta)
File "F:\predict\network.py", line 62, in updateMiniBatch
delta_nabla_b, delta_nabla_w = self.backPropagate(x, y)
File "F:\predict\network.py", line 102, in backPropagate
nabla_w[-l] = numpy.dot(delta, activations[-l - 1].transpose())
ValueError: objects are not aligned
The XOR gate data and the stock data look like this (mind you, the stock data has been truncated greatly):
xorData = [
([0, 0], [0]),
([0, 1], [1]),
([1, 0], [1]),
([1, 1], [0])
]
stockData = [
([0.0, 0.0, 0.003927144694969353], [-0.0038780602954071597]),
([0.0, 0.003927144694969353, -0.0038780602954071597], [0.0012018010088359343]),
([0.003927144694969353, -0.0038780602954071597, 0.0012018010088359343], [-0.0115302033846727])
]
I then create a network with 2 inputs, 2 hidden neurons, and 1 output for the XOR gate, and 3 inputs, 15 hidden neurons, and 1 output for the stock data.
xor = network.NN([2, 2, 1])
xor.train(xorData, 30, 10, 3.0)
stock = network.NN([3, 15, 1])
stock.train(stockData, 30, 10, 3.0)
Both of the training sets have the exact same structure, so why would this error occur?
network.py:
import numpy
import random
import json
# the sigmoid function
def sigmoid(z):
return 1.0 / (1.0 + numpy.exp(-z))
# the derivative of the sigmoid function
def sigmoidPrime(z):
return sigmoid(z) * (1 - sigmoid(z))
# sigmoid vectors
sigmoidVector = numpy.vectorize(sigmoid)
sigmoidPrimeVector = numpy.vectorize(sigmoidPrime)
#A class that implements stochastic gradient descent learning algorithm for a feedforward neural network
class NN:
def __init__(self, sizes):
self.numLayers = len(sizes)
self.sizes = sizes
        # the biases and weights for the network are initialized randomly, using a Gaussian distribution with mean 0, and variance 1
self.biases = [numpy.random.randn(y, 1) for y in sizes[1:]]
self.weights = [numpy.random.randn(y, x) for x, y in zip(sizes[:-1], sizes[1:])]
# feedForward function - return the output of the network
def feedForward(self, inputs):
for b, w in zip(self.biases, self.weights):
inputs = sigmoidVector(numpy.dot(w, inputs) + b)
return inputs
# train function - train the neural network using mini-batch stochastic gradient descent
# the trainingData is a list of tuples "(x, y)" representing the training inputs and the desired outputs
# if testData is provided then the network will be evaluated against the test data after each epoch
def train(self, trainingData, epochs, miniBatchSize, eta, testData = None):
if testData:
nTest = len(testData)
n = len(trainingData)
for j in xrange(epochs):
random.shuffle(trainingData)
miniBatches = [trainingData[k:k + miniBatchSize] for k in xrange(0, n, miniBatchSize)]
for miniBatch in miniBatches:
self.updateMiniBatch(miniBatch, eta)
if testData:
print "Epoch %i: %i / %i" % (j, self.evaluate(testData), nTest)
else:
print "Epoch %i complete" % j
# updateMiniBatch function - update the network's weights and biases by applying gradient descent using backpropagation
# to a single mini batch
# the miniBatch is a list of tuples "(x, y)" and eta is the learning rate
def updateMiniBatch(self, miniBatch, eta):
nabla_b = [numpy.zeros(b.shape) for b in self.biases]
nabla_w = [numpy.zeros(w.shape) for w in self.weights]
for x, y in miniBatch:
delta_nabla_b, delta_nabla_w = self.backPropagate(x, y)
nabla_b = [nb + dnb for nb, dnb in zip(nabla_b, delta_nabla_b)]
nabla_w = [nw + dnw for nw, dnw in zip(nabla_w, delta_nabla_w)]
self.weights = [w - (eta / len(miniBatch)) * nw for w, nw in zip(self.weights, nabla_w)]
self.biases = [b - (eta / len(miniBatch)) * nb for b, nb in zip(self.biases, nabla_b)]
# backPropagate function - returns a tuple "(nabla_b, nabla_w)" representing the gradient for the cost function C_x
# nabla_b and nabla_w are layer-by-layer lists of numpy arrays, similar to self.biases and self.weights
def backPropagate(self, x, y):
nabla_b = [numpy.zeros(b.shape) for b in self.biases]
nabla_w = [numpy.zeros(w.shape) for w in self.weights]
x = numpy.array(x)
y = numpy.array(y)
# feedForward
activation = x
activations = [x] # list to store all of the activations, layer by layer
zs = [] # list to store all of the z vectors, layer by layer
for b, w in zip(self.biases, self.weights):
z = numpy.dot(w, activation) + b
zs.append(z)
activation = sigmoidVector(z)
activations.append(activation)
# backward pass
delta = self.costDerivative(activations[-1], y) * sigmoidPrimeVector(zs[-1])
nabla_b[-1] = delta
nabla_w[-1] = numpy.dot(delta, activations[-2].transpose())
for l in xrange(2, self.numLayers):
spv = sigmoidPrimeVector(zs[-l])
delta = numpy.dot(self.weights[-l + 1].transpose(), delta) * spv
nabla_b[-l] = delta
nabla_w[-l] = numpy.dot(delta, activations[-l - 1].transpose())
return (nabla_b, nabla_w)
# evaluate function - return the number of test inputs for which the neural network outputs the correct result
def evaluate(self, testData):
testResults = [(numpy.argmax(self.feedForward(x)), y) for (x, y) in testData]
return sum(int(x == y) for (x, y) in testResults)
# costDerivative function - return the vector of partial derivatives for the output activations
def costDerivative(self, outputActivations, y):
return (outputActivations - y)
# save function - save the neural network to filename
def save(self, filename):
data = {
"sizes": self.sizes,
"weights": [w.tolist() for w in self.weights],
"biases": [b.tolist() for b in self.biases]
}
with open(filename, "w") as handle:
json.dump(data, handle)
# load function - load a neural network from the file filename
# returns a network instance
def load(filename):
with open(filename, "r") as handle:
data = json.load(handle)
network = NN(data["sizes"])
network.weights = [numpy.array(w) for w in data["weights"]]
network.biases = [numpy.array(b) for b in data["biases"]]
return network
EDIT:
I believe it might have something to do with the hidden layer size. I changed the XOR gate's hidden layer size and it threw the same error. It seems as if the hidden layer size must be exactly the same as the input layer size.
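For what it's worth, a small standalone shape experiment (independent of the class above) shows how a plain 1-D input interacts with the (n, 1) bias vectors through numpy broadcasting; this is only an observation about the shapes involved, not a confirmed diagnosis:
import numpy

x = numpy.array([0.1, 0.2, 0.3])         # plain 3-element input, shape (3,)
w = numpy.random.randn(15, 3)            # 15 hidden neurons, 3 inputs
b = numpy.random.randn(15, 1)            # bias column vector

z = numpy.dot(w, x) + b                  # (15,) + (15, 1) broadcasts to (15, 15)
print(z.shape)                           # (15, 15)

x_col = numpy.reshape(x, (3, 1))         # column vector, shape (3, 1)
z_col = numpy.dot(w, x_col) + b          # (15, 1) + (15, 1) stays (15, 1)
print(z_col.shape)                       # (15, 1)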