First Neural Network (MLP) from Scratch in Python -- Questions

I understand how a neural network with backpropagation is supposed to work. I know how to use sklearn's MLPClassifier and its fit function. I am writing my own because I'd like to understand the details better. I will first show my code (with comments) and then discuss my problems.
import numpy as np
import scipy as sp
import sklearn as ML

# z: the linear combination of the previous layer
#
# returns the activation for the node
#
def sigmoid(z):
    a = 1 / (1 + np.exp(-z))
    return a

# z: the contribution of a layer
#
# returns the derivative of the sigmoid evaluated at z
#
def sig_grad(z):
    d = (1 - sigmoid(z))*sigmoid(z)
    return d

# input: the data we want to train the network with
# hidden_layers: the number of nodes in the hidden layers
# num_layers: how many hidden layers between the input layer and the output layer
# num_output: how many outputs there are... this becomes relevant when we input many features.
#
# returns the activations determined
# and the linear combinations of the previous layer's nodes for each layer
#
def feedforward(input, hidden_layers, num_layers, num_output, thresh, weights):
    #initialize the vector for inputs AND threshold values
    X = np.hstack([thresh[0], input])
    #initialize the activations list
    A = []
    #initialize the linear combos for each layer
    Z = []
    w = list(weights)
    #place ones in the first row of each layer of weights for the threshold
    w[0] = np.vstack([np.ones([1,hidden_layers]), w[0]])
    for i in range(1,num_layers):
        w[i] = np.vstack([np.ones([1,hidden_layers]), weights[i]])
    w[-1] = np.vstack([np.ones([1,num_output]), w[-1]])
    #the first layer of weights is initialized outside the function
    #cycle through the hidden layers
    for i in range(1, num_layers+1):
        Z.append(np.dot(X, w[i-1])); S = sigmoid(Z[i-1]); A.append(S); X = np.hstack([thresh[i], A[i-1]])
    #find the output/last layer activations
    Z.append(np.dot(X, w[-1])); S = sigmoid(Z[-1]); A.append(S)
    return A, Z

# truth: what we know the output should be
# activations: the activations determined at each node by the sigmoid
#              function in the previous feedforward pass
# combos: the linear combinations at each layer in the prev. ff pass
# num_layers: the number of hidden layers
#
# returns the errors determined at each layer as gradients w.r.t. the weights
# and thresholds; these will be needed for gradient descent
#
def backprop(input, truth, activations, combos, num_layers, weights):
    #initialize an array of errors for each hidden layer and the output layer
    error = [0 for x in range(0,num_layers+1)]
    #initialize the lists containing the gradients w.r.t. weights and threshold
    derivW = []; derivb = []
    #set the output layer since its error is computed differently than the others
    error[num_layers] = (activations[num_layers] - truth)*sig_grad(combos[num_layers])
    #find the rate of change for weights and thresh for connections to output
    derivW.append(activations[num_layers-1]*error[num_layers]); derivb.append(np.sum(error[num_layers]))
    if(num_layers > 1):
        #find the errors for each of the hidden layers
        for i in range(num_layers - 1, 0, -1):
            error[i] = np.dot(weights[i+1],error[i+1])*sig_grad(combos[i])
            derivW.append(np.outer(activations[i-1], error[i])); derivb.append(np.sum(error[i]))
    #finding the derivative for weights of input to next layer
    error[0] = np.dot(weights[i],error[i])*sig_grad(combos[0])
    derivW.append(np.outer(input, error[0])); derivb.append(np.sum(error[0]))
    return derivW, derivb

# weights: our network's weights to update via gradient descent
# thresh: the threshold values to update for our system
# derivb: the derivative of our cost function with respect to b for each layer
# derivW: the derivative of our cost function with respect to W for each layer
# stepsize: the stepsize we want to take; determines how big of a step we take
#
# returns the updated weights and threshold values for our network
#
def gradDesc(weights, thresh, derivb, derivW, stepsize, num_layers):
    #perform gradient descent
    for j in range(100):
        for i in range(0, num_layers + 1):
            weights[i] = weights[i] - stepsize*derivW[num_layers-i]
            thresh[i] = thresh[i] - stepsize*derivb[num_layers-i]
    return weights, thresh

# input: the data to send through the network
# hidden_layers: the number of nodes in each hidden layer
# num_layers: the number of hidden layers between the input layer and the output layer
# num_output: the number of nodes in the output layer
#
# returns the output of the network
#
def nNetwork(input, truth, hidden_layers, num_layers, num_output, maxiter, stepsize):
    #assuming that input is an array where each element is an input/sample,
    #we also need to know the size of each sample itself
    m = input.size
    thresh = np.random.randn(num_layers + 1, 1)
    thresh_weights = np.ones([num_layers + 1, 1])
    #initialize the weights as a list because each layer might have
    #a different number of weights
    weights = []; weights.append(np.random.randn(m,hidden_layers))
    if(num_layers > 1):
        for i in range(1, num_layers):
            weights.append(np.random.randn(hidden_layers, hidden_layers))
    weights.append(np.random.randn(hidden_layers, num_output))
    for i in range(maxiter):
        activations, combos = feedforward(input, hidden_layers, num_layers, num_output, thresh, weights)
        derivW, derivb = backprop(input, truth, activations, combos, num_layers, weights)
        weights, thresh = gradDesc(weights, thresh, derivb, derivW, stepsize, num_layers)
    return weights, thresh

def main():
    #a very, very simple neural network
    input = np.array([1,0,0])
    truth = 0
    hidden_layers = 3
    num_layers = 2
    num_output = 1
    #train the network
    w, t = nNetwork(input, truth, hidden_layers, num_layers, num_output, maxiter = 10, stepsize = 0.001)
    #test the network on a new set of arguments
    #activations, combos = feedforward(new_input, hidden_layers = 3, num_layers = 2, thresh = t, weights = w)

main()
I've tested this code on simple examples with n one-dimensional inputs and an n-dimensional output (I haven't yet worked out the bugs that appear when I type import NN.py into the console, but it works when I run it piece by piece). I have a few questions to help me better understand what goes on when each of the n inputs has m dimensions, for example the digits data in sklearn (there are 1797 samples and each sample is 64x1 -- an 8x8 image vectorized).
1) Is each of the 64 pixels considered an input? If so, is the neural net trained one image at a time? This would be an easy fix for me.
2) If the neural net is trained on all images at once, what are your suggestions for modifying my code?
3) Obviously the output for an image is 0, 1, 2, 3, ..., or 9. But does the output come in the form of a 10x1 vector, with a 1 in the position of the digit the image represents and 0's elsewhere? So my prediction vector would have its highest value where the 1 should be, right? (See the sketch after this list.)
4) I'm not quite sure what #3 would look like if #2 is true.
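For concreteness, here is a minimal sketch (assuming sklearn's load_digits and the one-hot convention from question 3) of how the data and labels would be shaped:
import numpy as np
from sklearn.datasets import load_digits

digits = load_digits()
X = digits.data    # shape (1797, 64): each row is one image, so each of the 64 pixels is one input
y = digits.target  # shape (1797,): integer labels 0..9

# One-hot encode the labels: a length-10 vector with a 1 at the digit's index
Y = np.eye(10)[y]  # shape (1797, 10)

# With 10 sigmoid outputs, the predicted digit would be the index of the largest output
# pred_digit = np.argmax(network_output)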
I apologize for the long note. Thanks for taking a look and helping me understand better!

Related

Gradient and Loss function

I could not understand well how the gradients are computed, especially with regard to the matrix transposes. My question is about dW2, but if you also want to discuss the computation of the other gradients and extend my question, I am open to that. Mathematically things look a little different to me, but this code is reliable and on GitHub, so I trust it.
from __future__ import print_function
from builtins import range
from builtins import object
import numpy as np
import matplotlib.pyplot as plt
from past.builtins import xrange

class TwoLayerNet(object):
    """
    A two-layer fully-connected neural network. The net has an input dimension of
    D, a hidden layer dimension of H, and performs classification over C classes.
    We train the network with a softmax loss function and L2 regularization on the
    weight matrices. The network uses a ReLU nonlinearity after the first fully
    connected layer.
    In other words, the network has the following architecture:
    input - fully connected layer - ReLU - fully connected layer - softmax
    The outputs of the second fully-connected layer are the scores for each class.
    """

    def __init__(self, input_size, hidden_size, output_size, std=1e-4):
        """
        Initialize the model. Weights are initialized to small random values and
        biases are initialized to zero. Weights and biases are stored in the
        variable self.params, which is a dictionary with the following keys:
        W1: First layer weights; has shape (D, H)
        b1: First layer biases; has shape (H,)
        W2: Second layer weights; has shape (H, C)
        b2: Second layer biases; has shape (C,)
        Inputs:
        - input_size: The dimension D of the input data.
        - hidden_size: The number of neurons H in the hidden layer.
        - output_size: The number of classes C.
        """
        self.params = {}
        self.params['W1'] = std * np.random.randn(input_size, hidden_size)
        self.params['b1'] = np.zeros(hidden_size)
        self.params['W2'] = std * np.random.randn(hidden_size, output_size)
        self.params['b2'] = np.zeros(output_size)

    def loss(self, X, y=None, reg=0.0):
        """
        Compute the loss and gradients for a two layer fully connected neural
        network.
        Inputs:
        - X: Input data of shape (N, D). Each X[i] is a training sample.
        - y: Vector of training labels. y[i] is the label for X[i], and each y[i] is
          an integer in the range 0 <= y[i] < C. This parameter is optional; if it
          is not passed then we only return scores, and if it is passed then we
          instead return the loss and gradients.
        - reg: Regularization strength.
        Returns:
        If y is None, return a matrix scores of shape (N, C) where scores[i, c] is
        the score for class c on input X[i].
        If y is not None, instead return a tuple of:
        - loss: Loss (data loss and regularization loss) for this batch of training
          samples.
        - grads: Dictionary mapping parameter names to gradients of those parameters
          with respect to the loss function; has the same keys as self.params.
        """
        # Unpack variables from the params dictionary
        W1, b1 = self.params['W1'], self.params['b1']
        W2, b2 = self.params['W2'], self.params['b2']
        N, D = X.shape

        # Compute the forward pass
        scores = None
        #############################################################################
        # TODO: Perform the forward pass, computing the class scores for the input. #
        # Store the result in the scores variable, which should be an array of      #
        # shape (N, C).                                                             #
        #############################################################################
        # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
        # perform the forward pass and compute the class scores for the input
        # input - fully connected layer - ReLU - fully connected layer - softmax
        # define lambda function for relu
        relu = lambda x: np.maximum(0, x)
        # a1 = X x W1 = (N x D) x (D x H) = N x H
        a1 = relu(X.dot(W1) + b1)  # activations of fully connected layer #1
        # store the result in the scores variable, which should be an array of
        # shape (N, C).
        # scores = a1 x W2 = (N x H) x (H x C) = N x C
        scores = a1.dot(W2) + b2  # input to the softmax
        # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

        # If the targets are not given then jump out, we're done
        if y is None:
            return scores

        # Compute the loss
        loss = None
        #############################################################################
        # TODO: Finish the forward pass, and compute the loss. This should include  #
        # both the data loss and L2 regularization for W1 and W2. Store the result  #
        # in the variable loss, which should be a scalar. Use the Softmax           #
        # classifier loss.                                                          #
        #############################################################################
        # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
        # shift values of 'scores' for numeric reasons (overflow cautious)
        # figure out the max score across all classes
        # scores.shape is N x C
        scores -= scores.max(axis = 1, keepdims = True)
        # probs.shape is N x C
        probs = np.exp(scores)/np.sum(np.exp(scores), axis = 1, keepdims = True)
        loss = -np.log(probs[np.arange(N), y])
        # loss is a single number
        loss = np.sum(loss)
        # Right now the loss is a sum over all training examples, but we want it
        # to be an average instead so we divide by N.
        loss /= N
        # Add regularization to the loss.
        loss += reg * (np.sum(W1 * W1) + np.sum(W2 * W2))
        # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

        # Backward pass: compute gradients
        grads = {}
        #############################################################################
        # TODO: Compute the backward pass, computing the derivatives of the weights #
        # and biases. Store the results in the grads dictionary. For example,       #
        # grads['W1'] should store the gradient on W1, and be a matrix of same size #
        #############################################################################
        # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
        # since dL(i)/df(k) = p(k) - 1 (if k = y[i]), where f is a vector of scores for the given example
        # i is the training sample and k is the class
        dscores = probs.reshape(N, -1)  # dscores is (N x C)
        dscores[np.arange(N), y] -= 1
        # since scores = a1.dot(W2), we get dW2 by multiplying a1.T and dscores
        # W2 is H x C so dW2 should also match those dimensions
        # a1.T x dscores = (H x N) x (N x C) = H x C
        dW2 = np.dot(a1.T, dscores)
        # Right now the gradient is a sum over all training examples, but we want it
        # to be an average instead so we divide by N.
        dW2 /= N
        # b2 gradient: sum dscores over all N
        db2 = dscores.sum(axis = 0)/N
        # since a1 = relu(X.dot(W1) + b1), we get dW1 by multiplying X.T and da1
        # W1 is D x H so dW1 should also match those dimensions
        # X.T x da1 = (D x N) x (N x H) = D x H
        # first get da1 using scores = a1.dot(W2)
        # a1 is N x H so da1 should also match those dimensions
        # dscores x W2.T = (N x C) x (C x H) = N x H
        da1 = dscores.dot(W2.T)
        da1[a1 == 0] = 0  # set gradient of units that did not activate to 0
        dW1 = X.T.dot(da1)
        # Right now the gradient is a sum over all training examples, but we want it
        # to be an average instead so we divide by N.
        dW1 /= N
        # b1 gradient: sum da1 over all N
        db1 = da1.sum(axis = 0)/N
        # Add the gradient of the regularization term
        dW1 += 2 * reg * W1
        dW2 += 2 * reg * W2
        grads = {'W1': dW1, 'b1': db1, 'W2': dW2, 'b2': db2}
        # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
        return loss, grads

    def train(self, X, y, X_val, y_val,
              learning_rate=1e-3, learning_rate_decay=0.95,
              reg=5e-6, num_iters=100,
              batch_size=200, verbose=False):
        """
        Train this neural network using stochastic gradient descent.
        Inputs:
        - X: A numpy array of shape (N, D) giving training data.
        - y: A numpy array of shape (N,) giving training labels; y[i] = c means that
          X[i] has label c, where 0 <= c < C.
        - X_val: A numpy array of shape (N_val, D) giving validation data.
        - y_val: A numpy array of shape (N_val,) giving validation labels.
        - learning_rate: Scalar giving learning rate for optimization.
        - learning_rate_decay: Scalar giving factor used to decay the learning rate
          after each epoch.
        - reg: Scalar giving regularization strength.
        - num_iters: Number of steps to take when optimizing.
        - batch_size: Number of training examples to use per step.
        - verbose: boolean; if true print progress during optimization.
        """
        num_train = X.shape[0]
        iterations_per_epoch = max(num_train / batch_size, 1)

        # Use SGD to optimize the parameters in self.model
        loss_history = []
        train_acc_history = []
        val_acc_history = []

        for it in range(num_iters):
            X_batch = None
            y_batch = None
            #########################################################################
            # TODO: Create a random minibatch of training data and labels, storing  #
            # them in X_batch and y_batch respectively.                             #
            #########################################################################
            # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
            # generate random indices
            indices = np.random.choice(num_train, batch_size)
            X_batch, y_batch = X[indices], y[indices]
            # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

            # Compute loss and gradients using the current minibatch
            loss, grads = self.loss(X_batch, y=y_batch, reg=reg)
            loss_history.append(loss)
            #########################################################################
            # TODO: Use the gradients in the grads dictionary to update the         #
            # parameters of the network (stored in the dictionary self.params)      #
            # using stochastic gradient descent. You'll need to use the gradients   #
            # stored in the grads dictionary defined above.                         #
            #########################################################################
            # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
            self.params['W1'] -= learning_rate * grads['W1']
            self.params['W2'] -= learning_rate * grads['W2']
            self.params['b1'] -= learning_rate * grads['b1']
            self.params['b2'] -= learning_rate * grads['b2']
            # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

            if verbose and it % 100 == 0:
                print('iteration %d / %d: loss %f' % (it, num_iters, loss))

            # Every epoch, check train and val accuracy and decay learning rate.
            if it % iterations_per_epoch == 0:
                # Check accuracy
                train_acc = (self.predict(X_batch) == y_batch).mean()
                val_acc = (self.predict(X_val) == y_val).mean()
                train_acc_history.append(train_acc)
                val_acc_history.append(val_acc)
                # Decay learning rate
                learning_rate *= learning_rate_decay

        return {
            'loss_history': loss_history,
            'train_acc_history': train_acc_history,
            'val_acc_history': val_acc_history,
        }

    def predict(self, X):
        """
        Use the trained weights of this two-layer network to predict labels for
        data points. For each data point we predict scores for each of the C
        classes, and assign each data point to the class with the highest score.
        Inputs:
        - X: A numpy array of shape (N, D) giving N D-dimensional data points to
          classify.
        Returns:
        - y_pred: A numpy array of shape (N,) giving predicted labels for each of
          the elements of X. For all i, y_pred[i] = c means that X[i] is predicted
          to have class c, where 0 <= c < C.
        """
        y_pred = None
        ###########################################################################
        # TODO: Implement this function; it should be VERY simple!                #
        ###########################################################################
        # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
        # define lambda function for relu
        relu = lambda x: np.maximum(0, x)
        # activations of fully connected layer #1
        a1 = relu(X.dot(self.params['W1']) + self.params['b1'])
        # class scores (the softmax inputs)
        # scores = a1 x W2 = (N x H) x (H x C) = N x C
        scores = a1.dot(self.params['W2']) + self.params['b2']
        y_pred = np.argmax(scores, axis = 1)
        # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
        return y_pred
With regard to the above code, I could not understand well how dW2 is computed. I took a picture of the point I need clarified and would like an explanation for the difference. (The original post included an image of the derivation here.)
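For reference, here is a small numeric check (my own sketch, not part of the course code) showing why dW2 = a1.T.dot(dscores): the matrix product accumulates, over the N samples, the outer product of each sample's hidden activations with its score gradient:
import numpy as np

N, H, C = 4, 5, 3                 # tiny batch, hidden, and class sizes
a1 = np.random.rand(N, H)         # hidden activations, one row per sample
dscores = np.random.rand(N, C)    # dL/dscores, one row per sample

# Matrix form used in the code above
dW2 = a1.T.dot(dscores)           # (H x N) x (N x C) = H x C

# Equivalent sum of per-sample outer products
dW2_sum = sum(np.outer(a1[i], dscores[i]) for i in range(N))

print(np.allclose(dW2, dW2_sum))  # True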

Global deep neural network with Tensorflow.keras in Python

I am working on implementing an algorithm which approximates the solution of a partial differential equation. The main idea behind this is that I start at time 0 with a guess of the solution u[0], and its gradient z[0], and then use a recursive formula to approximately calculate the solution up to the last time point in a forward manner. The formula looks like this
u[i+1] = u[i] + f(t[i],x[i],u[i],z[i])*dt + z[i]*dW[i]
where the function f, the time discretization t, the time step dt, and the increments of a Brownian motion dW are given. The gradient z[i] at time point i is approximated by a deep neural network with input x[i], which I have already implemented with tf.keras using two hidden dense layers. These networks perform quite well. So far, I have N (the number of time points) independent neural networks, one approximating z[i] for each time point.
My task is to form a global neural network with input (x, W), where (u[0], z[0]) are given to this network as trainable parameters, such that the network can then optimize its parameters by minimizing the expected quadratic loss between the output/approximation u[N] and the given terminal condition of the partial differential equation, g(x). u[0] will then be the solution of the PDE. So while my approximating neural networks have 2 hidden layers each, the global network should have 2*(N-1) layers in total.
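To make the recursion concrete before introducing the networks, here is a plain-numpy sketch (my own illustration; the driver f, the path x, and the z values are placeholders):
import numpy as np

N = 50                                 # number of time steps
t = np.linspace(0.0, 1.0, N + 1)       # time discretization
dt = t[1] - t[0]                       # time step
x = np.random.rand(N + 1)              # placeholder state path x[i]
dW = np.sqrt(dt) * np.random.randn(N)  # Brownian increments dW[i]
z = np.zeros(N + 1)                    # placeholder for the networks' z[i]

def f(t_i, x_i, u_i, z_i):
    return 0.0                         # stand-in for the given driver f

u = np.empty(N + 1)
u[0] = 0.5                             # initial guess u[0]
# forward recursion: u[i+1] = u[i] + f(t[i], x[i], u[i], z[i])*dt + z[i]*dW[i]
for i in range(N):
    u[i + 1] = u[i] + f(t[i], x[i], u[i], z[i]) * dt + z[i] * dW[i]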
My neural networks for the gradients look like this:
# Input dimension
d = 1
# Output dimension
d_1 = 1
# Number of neurons
m = d + 10
# Batch size
batch_size = 32
# Training data
x_tr = some_simulation()
z_tr = calculated_given(x_tr)
# Test data
x_te = some_simulation()
z_te = calculated_given(x_te)
model = tf.keras.Sequential()
model.add(tf.keras.Input(shape=(d,), dtype=tf.float32))
model.add(tf.keras.layers.Dense(m, activation=tf.nn.tanh))
model.add(tf.keras.layers.Dense(m, activation=tf.nn.tanh))
model.add(tf.keras.layers.Dense(d_1, activation=tf.keras.activations.linear))
model.compile(optimizer='adam',
              loss='MeanSquaredError',
              metrics=[])
model.fit(x_tr, z_tr, batch_size=batch_size, epochs=10)
val_loss = model.evaluate(x_te, z_te)
print(val_loss)
So I have trained N of them, and saved each as a file using
model.save(path_to_model)
So, given the approximations of the gradients, I now want to stack all the subnetworks together into a global deep neural network based on the recursive formula above, which takes only the N-dimensional vectors x and W as input data, gives the final value u[N] as output, and uses (u[0], z[0]) as parameters. I have been trying to wrap my head around how such a global network should be implemented in Python using tensorflow.keras for two days now, so maybe someone can give me a push in the right direction?
I assume that you'll pass the tuple (x, dW, t) as input to your model, since t is also indexed. Furthermore, you can always create dW from W using np.diff. I also assume that u0 and z0 are scalars (common to all batches).
With all of that in mind, you can subclass the base Model and override its call() as follows
class GlobalModel(tf.keras.models.Model):
    def __init__(self, u0, z0, dt, subnet_list, **kwargs):
        super().__init__(**kwargs)
        self.u0 = tf.Variable(u0, trainable=True, dtype=tf.float32)
        self.z0 = tf.Variable(z0, trainable=True, dtype=tf.float32)
        self.dt = tf.constant(dt, dtype=tf.float32)
        self.subnet_list = subnet_list
        # Freeze the pre-trained subnets
        for subnet in subnet_list:
            subnet.trainable = False

    def f(self, t, x, u, z):
        pass  # code of your function f() goes here

    def step_update(self, t, x, u, z, dW):
        return u + self.f(t, x, u, z) * self.dt + z * dW

    def call(self, inputs, training=None):
        x, dW, t = inputs
        # First step
        x_i = tf.gather(x, 0, axis=1)
        dW_i = tf.gather(dW, 0, axis=1)
        t_i = tf.gather(t, 0, axis=1)
        u_i = self.step_update(t_i, x_i, self.u0, self.z0, dW_i)
        # Subsequent steps
        for i, subnet in enumerate(self.subnet_list):
            x_i = tf.gather(x, i+1, axis=1)
            dW_i = tf.gather(dW, i+1, axis=1)
            t_i = tf.gather(t, i+1, axis=1)
            z_i = subnet(x_i, training=False)
            u_i = self.step_update(t_i, x_i, u_i, z_i, dW_i)
        return u_i
You initialize this model by
global_model = GlobalModel(init_u0, init_z0, dt, subnet_list)
where subnet_list is a list of your pre-trained subnets, ordered by time index. That is, the subnet responsible for predicting z_i should be at index i-1 in this list.
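Since each subnet was saved with model.save(path_to_model), one way to build that list (a sketch; the file paths are placeholders) is:
import tensorflow as tf

# Load the pre-trained subnets in time order; with N time points, the subnet
# predicting z_i goes at index i-1 (the paths below are placeholders)
subnet_list = [tf.keras.models.load_model(f"subnet_{i}") for i in range(1, N)]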
After compiling, you call fit() on the model by
global_model.fit(x=(x_tr, dW_tr, t_tr), y=y_tr, batch_size=batch_size, epochs=epochs)
where y_tr is your target.
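Concretely, since the objective is the expected quadratic loss between the output u[N] and the terminal condition g(x), one way to finish the setup (a sketch; g and the terminal slice of x_tr are assumptions on my side) is:
global_model.compile(optimizer='adam', loss='mse')

# Target: the terminal condition g evaluated at the last time point of each path
y_tr = g(x_tr[:, -1])
global_model.fit(x=(x_tr, dW_tr, t_tr), y=y_tr, batch_size=32, epochs=10)

# After training, the learned parameter u0 approximates the PDE solution at time 0
print(global_model.u0.numpy())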

What is wrong with my matrix-based backpropagation algorithm?

I am working through Nielsen's Neural Networks and Deep Learning. To develop my understanding, Nielsen suggests rewriting his backpropagation algorithm to take a matrix-based approach (supposedly much quicker due to optimizations in linear algebra libraries).
Currently I get a very low, fluctuating accuracy of 9-10% every single time. Normally I'd keep working on my understanding, but I have worked on this algorithm for the better part of 3 days and I feel like I have a pretty good handle on the math behind backprop. Regardless, I continue to get mediocre accuracy, so any insight would be greatly appreciated!
I'm using the MNIST handwritten digits database.
neural_net_batch.py
the neural network functions (backprop in here)
"""
neural_net_batch.py
neural_net.py modified to use matrix operations
"""
# Libs
import random
import numpy as np
# Neural Network
class Network(object):
def __init__(self, sizes):
self.num_layers = len(sizes) # Number of layers in network
self.sizes = sizes # Number of neurons in each layer
self.biases = [np.random.randn(y, 1) for y in sizes[1:]] # Bias vector, 1 bias for each neuron in each layer, except input neurons
self.weights = [np.random.randn(y, x) for x, y in zip(sizes[:-1], sizes[1:])] # Weight matrix
# Feed Forward Function
# Returns netowrk output for input a
def feedforward(self, a):
for b, w in zip(self.biases, self.weights): # a’ = σ(wa + b)
a = sigmoid(np.dot(w, a)+b)
return a
# Stochastic Gradient Descent
def SGD(self, training_set, epochs, m, eta, test_data):
if test_data: n_test = len(test_data)
n = len(training_set)
# Epoch loop
for j in range(epochs):
# Shuffle training data & parcel out mini batches
random.shuffle(training_set)
mini_batches = [training_set[k:k+m] for k in range(0, n, m)]
# Pass mini batches one by one to be updated
for mini_batch in mini_batches:
self.update_mini_batch(mini_batch, eta)
# End of Epoch (optional epoch testing)
if test_data:
evaluation = self.evaluate(test_data)
print("Epoch %6i: %5i / %5i" % (j, evaluation, n_test))
else:
print("Epoch %5i complete" % (j))
# Update Mini Batch (Matrix approach)
def update_mini_batch(self, mini_batch, eta):
m = len(mini_batch)
nabla_b = []
nabla_w = []
# Build activation & answer matrices
x = np.asarray([_x.ravel() for _x,_y in mini_batch]) # 10x784 where each row is an input vector
y = np.asarray([_y.ravel() for _x,_y in mini_batch]) # 10x10 where each row is an desired output vector
nabla_b, nabla_w = self.backprop(x, y) # Feed matrices into backpropagation
# Train Biases & weights
self.biases = [b-(eta/m)*nb for b, nb in zip(self.biases, nabla_b)]
self.weights = [w-(eta/m)*nw for w, nw in zip(self.weights, nabla_w)]
def backprop(self, x, y):
# Gradient arrays
nabla_b = [0 for i in self.biases]
nabla_w = [0 for i in self.weights]
w = self.weights
# Vars
m = len(x) # Mini batch size
a = x # Activation matrix temp variable
a_s = [x] # Activation matrix record
z_s = [] # Weighted Activation matrix record
special_b = [] # Special bias matrix to facilitate matrix operations
# Build special bias matrix (repeating biases for each example)
for j in range(len(self.biases)):
special_b.append([])
for k in range(m):
special_b[j].append(self.biases[j].flatten())
special_b[j] = np.asarray(special_b[j])
# Forward pass
# Starting at the input layer move through each layer
for l in range(len(self.sizes)-1):
z = a # w[l].transpose() + special_b[l]
z_s.append(z)
a = sigmoid(z)
a_s.append(a)
# Backward pass
delta = cost_derivative(a_s[-1], y) * sigmoid_prime(z_s[-1])
nabla_b[-1] = delta
nabla_w[-1] = delta # a_s[-2]
for n in range(2, self.num_layers):
z = z_s[-n]
sp = sigmoid_prime(z)
delta = self.weights[-n+1].transpose() # delta * sp.transpose()
nabla_b[-n] = delta
nabla_w[-n] = delta # a_s[-n-1]
# Create bias vectors by summing bias columns elementwise
for i in range(len(nabla_b)):
temp = []
for j in nabla_b[i]:
temp.append(sum(j))
nabla_b[i] = np.asarray(temp).reshape(-1,1)
return [nabla_b, nabla_w]
def evaluate(self, test_data):
test_results = [(np.argmax(self.feedforward(t[0])), t[1]) for t in test_data]
return sum(int(x==y) for (x, y) in test_results)
# Cost Derivative Function
# Returns the vector of partial derivatives C_x, a for the output activations y
def cost_derivative(output_activations, y):
return(output_activations-y)
# Sigmoid Function
def sigmoid(z):
return 1.0/(1.0+np.exp(-z))
# Sigmoid Prime (Derivative) Function
def sigmoid_prime(z):
return sigmoid(z)*(1-sigmoid(z))
MNIST_TEST.py
test script
import mnist_data
import neural_net_batch as nn
# Data Sets
training_data, validation_data, test_data = mnist_data.load_data_wrapper()
training_data = list(training_data)
validation_data = list(validation_data)
test_data = list(test_data)
# Network
net = nn.Network([784, 30, 10])
# Perform Stochastic Gradient Descent using MNIST training & test data,
# 30 epochs, mini_batch size of 10, and learning rate of 3.0
net.SGD(list(training_data), 30, 10, 3.0, test_data=test_data)
A very helpful Redditor (u/xdaimon) helped me get the following answer:
Your backward pass should be
# Backward pass
delta = cost_derivative(a_s[-1], y) * sigmoid_prime(z_s[-1])
nabla_b[-1] = delta.T
nabla_w[-1] = delta.T @ a_s[-2]
for n in range(2, self.num_layers):
    z = z_s[-n]
    sp = sigmoid_prime(z)
    delta = delta @ self.weights[-n+1] * sp
    nabla_b[-n] = delta.T
    nabla_w[-n] = delta.T @ a_s[-n-1]
One way to find this bug is to remember that there should be a transpose somewhere in the product that computes nabla_w.
And if you're interested, the transpose shows up in the matrix implementation of backprop because AB is the same as the sum of outer products of the columns of A and the rows of B. In this case A=delta.T and B=a_s[-n-1], so the outer products are between the rows of delta and the rows of a_s[-n-1]. Each term in the sum is nabla_w for a single element in the batch, which is exactly what we want. If the minibatch size is 1 you can easily see that delta.T @ a_s[-n-1] is just the outer product of the delta vector and the activation vector.
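A quick way to apply that debugging hint (my own sketch) is to check the gradient shapes against the weight shapes; note that with this [784, 30, 10] network and a mini batch of 10, the buggy version's shapes matched only by accident:
import numpy as np

m = 10                           # mini batch size
delta = np.random.rand(m, 10)    # output-layer delta, one row per example
a_prev = np.random.rand(m, 30)   # previous layer's activations
W = np.random.randn(10, 30)      # self.weights[-1] has shape (10, 30)

nabla_w = delta.T @ a_prev       # (10 x m) @ (m x 30) = (10, 30)
assert nabla_w.shape == W.shape  # correct version: shapes match for any m

# The buggy version, delta @ a_prev, is (m x 10) @ (m x 30); it only runs
# and produces the right shape because m happens to equal the number of
# outputs (both are 10 here)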
Testing shows that not only is the network accurate again, but the expected speedup is also present.

Keras ERROR: "cannot import name '_time_distributed_dense'"

Since Keras does not support attention models yet, I'd like to use the following custom attention implementation:
https://github.com/datalogue/keras-attention/blob/master/models/custom_recurrents.py
But the problem is, when I run the code above, it returns the following error:
ImportError: cannot import name '_time_distributed_dense'
It looks like _time_distributed_dense is no longer supported by Keras after 2.0.0.
The only part that uses _time_distributed_dense is the part below:
def call(self, x):
    # store the whole sequence so we can "attend" to it at each timestep
    self.x_seq = x
    # apply a dense layer over the time dimension of the sequence
    # do it here because it doesn't depend on any previous steps
    # therefore we can save computation time:
    self._uxpb = _time_distributed_dense(self.x_seq, self.U_a, b=self.b_a,
                                         input_dim=self.input_dim,
                                         timesteps=self.timesteps,
                                         output_dim=self.units)
    return super(AttentionDecoder, self).call(x)
In what way should I change the _time_distributed_dense(...) part?
This is from An Chen's answer on the GitHub issue (the page or his answer might be deleted in the future):
from keras import backend as K  # the helper only needs the Keras backend

def _time_distributed_dense(x, w, b=None, dropout=None,
                            input_dim=None, output_dim=None,
                            timesteps=None, training=None):
    """Apply `y . w + b` for every temporal slice y of x.
    # Arguments
        x: input tensor.
        w: weight matrix.
        b: optional bias vector.
        dropout: whether to apply dropout (same dropout mask
            for every temporal slice of the input).
        input_dim: integer; optional dimensionality of the input.
        output_dim: integer; optional dimensionality of the output.
        timesteps: integer; optional number of timesteps.
        training: training phase tensor or boolean.
    # Returns
        Output tensor.
    """
    if not input_dim:
        input_dim = K.shape(x)[2]
    if not timesteps:
        timesteps = K.shape(x)[1]
    if not output_dim:
        output_dim = K.shape(w)[1]
    if dropout is not None and 0. < dropout < 1.:
        # apply the same dropout pattern at every timestep
        ones = K.ones_like(K.reshape(x[:, 0, :], (-1, input_dim)))
        dropout_matrix = K.dropout(ones, dropout)
        expanded_dropout_matrix = K.repeat(dropout_matrix, timesteps)
        x = K.in_train_phase(x * expanded_dropout_matrix, x, training=training)
    # collapse time dimension and batch dimension together
    x = K.reshape(x, (-1, input_dim))
    x = K.dot(x, w)
    if b is not None:
        x = K.bias_add(x, b)
    # reshape to 3D tensor
    if K.backend() == 'tensorflow':
        x = K.reshape(x, K.stack([-1, timesteps, output_dim]))
        x.set_shape([None, None, output_dim])
    else:
        x = K.reshape(x, (-1, timesteps, output_dim))
    return x
You could just add this to your Python code.
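As a quick sanity check (my own sketch, assuming an old TensorFlow-backend Keras, since this helper predates tf 2.x), you can run the helper on a small random tensor and confirm that the dense map is applied at every timestep:
import numpy as np
from keras import backend as K

# Dummy sequence batch: 2 samples, 5 timesteps, 4 features
x = K.variable(np.random.rand(2, 5, 4))
w = K.variable(np.random.rand(4, 3))  # maps 4 features to 3 outputs

y = _time_distributed_dense(x, w, input_dim=4, timesteps=5, output_dim=3)
print(K.eval(y).shape)  # (2, 5, 3): one 3-vector per timestep per sample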

MLP Neural Network: calculating the gradient (matrices)

What is a good implementation for calculating the gradient in an n-layered neural network?
Weight layers:
First layer weights:  (n_inputs+1, n_units_layer)-matrix
Hidden layer weights: (n_units_layer+1, n_units_layer)-matrix
Last layer weights:   (n_units_layer+1, n_outputs)-matrix
Notes:
If there is only one hidden layer we would represent the net by using just two (weight) layers:
inputs --first_layer-> network_unit --second_layer-> output
For an n-layer network with more than one hidden layer, we need to implement step (2).
A bit vague pseudocode:
weight_layers = [ layer1, layer2 ]             # a list of layers as described above
input_values = [ [0,0], [0,0], [1,0], [0,1] ]  # our test set (corresponds to XOR)
target_output = [ 0, 0, 1, 1 ]                 # what we want to train our net to output
output_layers = []                             # output for the corresponding layers

for layer in weight_layers:
    output <-- calculate the output            # calculate the output from the current layer
    output_layers <-- output                   # store the output from each layer

n_samples = input_values.shape[0]
n_outputs = target_output.shape[1]
error = ( output-target_output )/( n_samples*n_outputs )

""" calculate the gradient here """
Final implementation
The final implementation is available at github.
With Python and numpy that is easy. You have two options:
1) You can compute everything in parallel for num_instances instances, or
2) you can compute the gradient for one instance (which is actually a special case of 1).
I will now give some hints on how to implement option 1). I would suggest that you create a new class called Layer. It should have two functions:
forward:
    inputs:
        X: shape = [num_instances, num_inputs]
            inputs
        W: shape = [num_outputs, num_inputs]
            weights
        b: shape = [num_outputs]
            biases
        g: function
            activation function
    outputs:
        Y: shape = [num_instances, num_outputs]
            outputs

backprop:
    inputs:
        dE/dY: shape = [num_instances, num_outputs]
            backpropagated gradient
        W: shape = [num_outputs, num_inputs]
            weights
        b: shape = [num_outputs]
            biases
        gd: function
            calculates the derivative of g(A) = Y
            based on Y, i.e. gd(Y) = g'(A)
        Y: shape = [num_instances, num_outputs]
            outputs
        X: shape = [num_instances, num_inputs]
            inputs
    outputs:
        dE/dX: shape = [num_instances, num_inputs]
            will be backpropagated (dE/dY of lower layer)
        dE/dW: shape = [num_outputs, num_inputs]
            accumulated derivative with respect to weights
        dE/db: shape = [num_outputs]
            accumulated derivative with respect to biases
The implementation is simple:
def forward(X, W, b):
    A = X.dot(W.T) + b  # will be broadcasted
    Y = g(A)
    return Y

def backprop(dEdY, W, b, gd, Y, X):
    Deltas = gd(Y) * dEdY  # element-wise multiplication
    dEdX = Deltas.dot(W)
    dEdW = Deltas.T.dot(X)
    dEdb = Deltas.sum(axis=0)
    return dEdX, dEdW, dEdb
The X of the first layer is taken from your dataset, and then you pass each Y as the X of the next layer in the forward pass.
The dE/dY of the output layer is computed (either for softmax activation function and cross entropy error function or for linear activation function and sum of squared errors) as Y-T, where Y is the output of the network (shape = [num_instances, num_outputs]) and T (shape = [num_instances, num_outputs]) is the desired output. Then you can backpropagate, i.e. dE/dX of each layer is dE/dY of the previous layer.
Now you can use dE/dW and dE/db of each layer to update W and b.
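Putting the pieces together, here is a minimal sketch (my own, using a sigmoid g and the Y - T output rule described above) of one full forward/backward pass and gradient step on the XOR data from the question:
import numpy as np

def g(A):
    return 1.0 / (1.0 + np.exp(-A))  # sigmoid activation

def gd(Y):
    return Y * (1.0 - Y)             # sigmoid derivative expressed via Y

def forward(X, W, b):
    A = X.dot(W.T) + b               # will be broadcasted
    return g(A)

def backprop(dEdY, W, b, gd, Y, X):
    Deltas = gd(Y) * dEdY            # element-wise multiplication
    return Deltas.dot(W), Deltas.T.dot(X), Deltas.sum(axis=0)

X = np.array([[0,0],[0,1],[1,0],[1,1]], dtype=float)  # 4 instances, 2 inputs
T = np.array([[0],[1],[1],[0]], dtype=float)          # XOR targets

W1, b1 = np.random.randn(3, 2), np.zeros(3)           # 2 inputs -> 3 hidden units
W2, b2 = np.random.randn(1, 3), np.zeros(1)           # 3 hidden -> 1 output

# Forward pass: each layer's Y becomes the next layer's X
Y1 = forward(X, W1, b1)
Y2 = forward(Y1, W2, b2)

# Output-layer gradient, then backpropagate layer by layer
dEdY1, dEdW2, dEdb2 = backprop(Y2 - T, W2, b2, gd, Y2, Y1)
_, dEdW1, dEdb1 = backprop(dEdY1, W1, b1, gd, Y1, X)

# Gradient descent step on each layer's W and b
lr = 0.5
W1 -= lr * dEdW1; b1 -= lr * dEdb1
W2 -= lr * dEdW2; b2 -= lr * dEdb2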
Here is an example for C++: OpenANN.
Btw. you can compare the speed of instance-wise and batch-wise forward propagation:
In [1]: import timeit
In [2]: setup = """import numpy
...: W = numpy.random.rand(10, 5000)
...: X = numpy.random.rand(1000, 5000)"""
In [3]: timeit.timeit('[W.dot(x) for x in X]', setup=setup, number=10)
Out[3]: 0.5420958995819092
In [4]: timeit.timeit('X.dot(W.T)', setup=setup, number=10)
Out[4]: 0.22001314163208008
