Learning Neural Nets I've written my own class.
import numpy as np
import random
def sigmoid(x):
return 1.0 / (1.0 + np.exp(-x))
def sigmoid_prime(x):
return sigmoid(x) * (1 - sigmoid(x))
def linear(x):
return x
def linear_prime(x):
return 1
def tanh(x):
return (np.exp(x) - np.exp(-x))/(np.exp(x) + np.exp(-x))
def tanh_prime(x):
return 1 - tanh(x)*tanh(x)
class Network:
def __init__(self, sizes, activation_func = sigmoid, activation_prime = sigmoid_prime):
self.biases = [np.random.randn(x, 1) for x in sizes[1:]]
self.weights = [np.random.randn(y, x) for x, y in zip(sizes, sizes[1:])]
self.num_layers = len(sizes)
self.sizes = sizes
self.activation_function = activation_func
self.actiovation_prime = activation_prime
def forward_prop(self, a):
for w, b in zip(self.weights, self.biases):
a = self.activation_function(np.dot(w, a) + b)
return a
def cost_derivative(self, output_activations, y):
return (output_activations - y)
def backprop(self, x, y):
nabla_b = [np.zeros(b.shape) for b in self.biases]
nabla_w = [np.zeros(w.shape) for w in self.weights]
# forward pass
activation = x # first activation, which is input layer
a_mas = [x]
z_mas = []
for b, w in zip(self.biases, self.weights):
z = np.dot(w, activation) + b
activation = self.activation_function(z)
z_mas.append(z)
a_mas.append(activation)
pass
# backward pass
delta = self.cost_derivative(a_mas[-1], y) * self.actiovation_prime(z_mas[-1])
nabla_b[-1] = delta
nabla_w[-1] = np.dot(delta, a_mas[-2].T)
for l in range(2, self.num_layers): # there is 2 such as we've already done for last layer
delta = np.dot(self.weights[-l + 1].transpose(), delta) * self.actiovation_prime(z_mas[-l])
nabla_b[-l] = delta
nabla_w[-l] = np.dot(delta, a_mas[-l - 1].T)
return nabla_b, nabla_w
def update_mini_batch(self, mini_batch, eta):
nabla_b = [np.zeros(b.shape) for b in self.biases]
nabla_w = [np.zeros(w.shape) for w in self.weights]
for x, y in mini_batch:
delta_nabla_b, delta_nabla_w = self.backprop(x, y)
nabla_b = [nb + dnb for nb, dnb in zip(nabla_b, delta_nabla_b)]
nabla_w = [nw + dnw for nw, dnw in zip(nabla_w, delta_nabla_w)]
eps = eta / len(mini_batch)
self.weights = [w - eps * nw for w, nw in zip(self.weights, nabla_w)]
self.biases = [b - eps * nb for b, nb in zip(self.biases, nabla_b)]
def SGD(self, training_data, epochs, mini_batch_size, eta):
n = len(training_data)
for j in range(epochs):
random.shuffle(training_data)
mini_batches = [training_data[k:k + mini_batch_size]
for k in range(0, n, mini_batch_size)]
for mini_batch in mini_batches:
self.update_mini_batch(mini_batch, eta)
And now I'm trying to approximate sin() with a help of this net. But the following code below doesn't work correctly.
%matplotlib inline
import matplotlib.pyplot as plt
net2 = Network([1,100,1],tanh,tanh_prime)
x = np.linspace(0,10,1000)
y = np.sin(x)
train = [(np.array(x[i]).reshape(1,1),np.array(y[i]).reshape(1,1)) for i in range(len(x))]
net2.SGD(train,10,10,0.1)
y_pred = []
y_tmp = []
for i in range(len(x)):
y_tmp.append(net2.forward_prop(train[i][0]))
y_pred.append(float(net2.forward_prop(train[i][0])))
plt.plot(x,y_look,'r',x,y_pred)
plt.grid()
Here is what I got.
I've already tried to implement this net in digit recognition with a help of MNIST dataset. There everything worked. But I couldn't get better accuracy then 70%, but That's not a problem. But here I have no clue what is wrong... Activation function is tanh().
From what I see, you are minimising the function f(x) - y, you might want to change that to mean squared error ((f(x) - y)^2) or mean absolute error (|f(x) - y|), which are adequate for a regression problem like yours. For a classification problem such as MNIST, a cross-entropy is a good choice.
Also, you might try to remove the tanh function in the output layer. I do not think that tanh is a problem since the output is between -1 and 1 but generally, for regression problems, we use linear activations and we keep squashing functions such as sigmoid and tanh for classification problems.
Related
I am following a tutorial on number recognition in images with neural networks, but I just encountered this error when I want to call a method of the Network class
TypeError: 'tuple' object does not support item assignment
I can't figure out what's causing it, please help.
this is the tutorial link
http://neuralnetworksanddeeplearning.com/chap1.html
this is my code
import numpy as np
import random
from keras.datasets import mnist
"""
A module to implement the stochastic gradient descent learning
algorithm for a feedforward neural network. Gradients are calculated
using backpropagation.
"""
class Network(object):
"""Making neural network"""
def __init__(self, sizes):
self.num_layers = len(sizes)
self.sizes = sizes
self.biases = [np.random.randn(y, 1) for y in sizes[1:]]
self.weights = [np.random.randn(y, x)
for x, y in zip(sizes[:-1], sizes[1:])]
def feedforward(self, a):
"""Return the output of the network if a is input"""
for b, w in zip(self.biases, self. weights):
a = sigmoid(np.dot(w, a) + b)
return a
def SGD(self, training_data, epochs, mini_batch_size, eta, test_data=None):
"""Train the neural network using mini-batch stochastic
gradient descent. The ``training_data`` is a list of tuples
``(x, y)`` representing the training inputs and the desired
outputs. The other non-optional parameters are
self-explanatory. If ``test_data`` is provided then the
network will be evaluated agains the test data after each
epoch, and partial progress printed out. This is useful for
tracking progress, but slows things down substantially."""
if test_data:
n_test = len(test_data)
n = len(training_data)
for j in range(epochs):
random.shuffle(training_data)
mini_batches = [
training_data[k: k + mini_batch_size]
for k in range(0, n, mini_batch_size)]
for mini_batch in mini_batches:
self.update_mini_batch(mini_batch, eta)
if test_data:
print("Epoch {0}: {1} / {2}".format(
j, self.evaluate(test_data), n_test))
else:
print("Epoch {0} complete".format(j))
def update_mini_batch(self, mini_batch, eta):
"""Update the network's wrigths and biases by applying
gradient descent using backpropagation to a single mini batch.
The ``mini_batch`` is a list of tuples ``(x, y)``, and ``eta``
is the learning rate"""
nabla_b = [np.zeros(b.shape) for b in self.biases]
nabla_w = [np.zeros(w.shape) for w in self.weights]
for x, y in mini_batch:
delta_nabla_b, delta_nabla_w = self.backprop(x, y)
nabla_b = [nb + dnb for nb, dnb in zip(nabla_b, delta_nabla_b)]
nabla_w = [nw + dnw for nw, dnw in zip(nabla_w, delta_nabla_w)]
self.weights = [w - (eta / len(mini_batch)) * nb
for b, nb in zip(self.biases, nabla_b)]
def backprop(self, x, y):
"""Return a tuple ``(nabla_b, nabla_w`` representing the
gradient for the cost function C_x. ``nabla_b`` and
``nabla_w`` are layer-by-layer lists of numpy arrays, similar
to ``self.biases`` and ``self.weights``."""
nabla_b = [np.zeros(b.shape) for b in self.biases]
nabla_w = [np.zeros(w.shape) for w in self.weights]
# feedforward
activation = x
activations = [x] # list to store all the activations, layer by layer
zs = [] # list to store all the z vectors, layer by layer
for b, w in zip(self.biases, self.weights):
z = np.dot(w, activation) + b
zs.append(z)
activation = sigmoid(z)
activations.append(activation)
# backward pass
delta = self.cost_derivative(activations[-1], y) * sigmoid_prime(zs[-1])
nabla_b[-1] = delta
nabla_w[-1] = np.dot(delta, activations[-2].transpose())
# l = 1 means the last layer of neurons, l = 2 is the
# secod-last layer, and so on. It's a renumbering of the
# scheme in the book, used here to take advantage of the fact
# that python can use negative indices in lists.
for l in range(2, self.num_layers):
z = zs[-l]
sp = sigmoid_prime(z)
delta = np.dot(self.weights[-l + 1].transpose(), delta) * sp
nabla_b[-l] = delta
nabla_w[-l] = np.dot(delta, activations[-l, -1].transpose())
return (nabla_b, nabla_w)
def evaluate(self, test_data):
"""Return the number of test inputs for which the neural
network outputs the correct result. Note that the neural
network's output is assumed to be the index of wicheve
neuron in the final layer has the highest activation"""
test_results = [(np.argmax(self.feedforward(x)), y)
for (x, y) in test_data]
return sum(int(x == y) for (x, y) in test_results)
def cost_derivative(self, output_activations, y):
"""Return the vector of partial derivatives ``\partial C_x / \partial a``
for the output activations"""
return (output_activations - y)
def sigmoid(z):
"""Sigmoid function"""
return 1 / (1.0 + np.exp(-z))
def sigmoid_prime(z):
"""Derivative of the sigmoid function"""
return sigmoid(z) * (1 - sigmoid(z))
training_data, test_data = mnist.load_data()
net = Network([784, 30, 10])
net.SGD(training_data=training_data, epochs=30, mini_batch_size=10, eta=3.0, test_data=test_data)
test_data=test_data)
TypeError: 'tuple' object does not support item assignment
Do you have any idea why this network doesn't want to learn? The idea is that it uses ReLU as an activation function in earlier layers and sigmoid as an activation function in the last layer. The network learned fine when I used only sigmoid. To verify the network I used MNIST.
def sigmoid( z ):
return 1.0 / (1.0 + np.exp(-z))
def sigmoid_prime(z):
return sigmoid(z)*(1-sigmoid(z))
def RELU(z):
return z*(z>0)
def RELU_Prime(z):
return (z>0)
# x - training data in mnist for example (1,784) vector
# y - training label in mnist for example (1,10) vector
# nabla is gradient for the current x and y
def backprop(self, x, y):
nabla_b = [np.zeros(b.shape) for b in self.biases]
nabla_w = [np.zeros(w.shape) for w in self.weights]
# feedforward
activation = x
activations = [x] # list to store all the activations, layer by layer
zs = [] # list to store all the z vectors, layer by layer
index =0
for b, w in zip(self.biases, self.weights):
z = np.dot(w, activation)+b
zs.append(z)
if index == len(self.weights)-1:
activation = sigmoid(z)
#previous layers are RELU
else:
activation = RELU(z)
activations.append(activation)
index +=1
# backward pass
delta = self.cost_derivative(activations[-1], y) *\
sigmoid_prime(zs[-1])
nabla_b[-1] = delta
nabla_w[-1] = np.dot(delta, activations[-2].transpose())
for l in range(2, self.num_layers):
z = zs[-l]
sp = RELU_Prime(z)
delta = np.dot(self.weights[-l+1].transpose(), delta) * sp
nabla_b[-l] = delta
nabla_w[-l] = np.dot(delta, activations[-l-1].transpose())
return (nabla_b, nabla_w)
--------------- Edit -----------------------------
def cost_derivative(self, output_activations, y):
return (output_activations-y)
--------------- Edit 2 -----------------------------
self.weights = [w-(eta/len(mini_batch))*nw
for w, nw in zip(self.weights, nabla_w)]
self.biases = [b-(eta/len(mini_batch))*nb
for b, nb in zip(self.biases, nabla_b)]
eta > 0
For those in future the answer for this problem is simple but hidden :). It turns out the weight initialization was wrong. To make it work you have to use Xavier initialization and multiply it by 2.
I am trying to create a basic Linear Regression Model implementing Coordinate Descent (I have made it inherit from OrdinaryLinearRegression, because it implements the same predict and score functions).
Using the loss function as the Residual Sum of squares:
πΏπ
ππ= 1βN βππ€βπ¦β2
Our gradient descent should be:
π€β²= π€ β π 2βN ππ(ππ€βπ¦)
Implementing the code:
def scalingfeatures(X):
scaler = StandardScaler()
scaler.fit(X)
return scaler.transform(X)
class OrdinaryLinearRegressionCoordinateDescent(OrdinaryLinearRegression):
def __init__(self,lr,num_iter):
self.lr = lr
self.num_iter = num_iter
def lossfunction(self,X,y,w):
m = np.size(y)
#Cost function in vectorized form
y_pred = X # w
# J = 1/N * Sum((αΊ - y)**2)
J = float((1./(2*m)) * (y_pred - y).T # (y_pred - y))
return J
def fit(self,X,y):
X = scalingfeatures(X)
X = np.concatenate((np.ones((X.shape[0],1)),X),axis=1)
m,n = X.shape
np.random.seed(42)
w = np.random.randn(n,1)
y = y.reshape(-1,1)
for iter in range(self.num_iter):
for j in range(n):
#Coordinate descent in vectorized form
X_j = X[:,j].reshape(-1,1)
y_pred = X # w
gradient = X_j.T # (y_pred-y)
w[j] = w[j] - self.lr * (2/n) * gradient
loss = self.lossfunction(X,y,w)
print(loss)
self.w = w
return self
OLRCD = OrdinaryLinearRegressionCoordinateDescent(lr=0.05,num_iter=500)
train = OLRCD.fit(X,y)
print("The training MSE for ORLGD is: ",train.score(X,y))
When I run the code, I get that with every iteration the loss only increases...
I am writing a neural network in Python, following the example here. It seems that the backpropagation algorithm isn't working, given that the neural network fails to produce the right value (within a margin of error) after being trained 10 thousand times. Specifically, I am training it to compute the sine function in the following example:
import numpy as np
class Neuralnet:
def __init__(self, neurons):
self.weights = []
self.inputs = []
self.outputs = []
self.errors = []
self.rate = .1
for layer in range(len(neurons)):
self.inputs.append(np.empty(neurons[layer]))
self.outputs.append(np.empty(neurons[layer]))
self.errors.append(np.empty(neurons[layer]))
for layer in range(len(neurons)-1):
self.weights.append(
np.random.normal(
scale=1/np.sqrt(neurons[layer]),
size=[neurons[layer], neurons[layer + 1]]
)
)
def feedforward(self, inputs):
self.inputs[0] = inputs
for layer in range(len(self.weights)):
self.outputs[layer] = np.tanh(self.inputs[layer])
self.inputs[layer + 1] = np.dot(self.weights[layer].T, self.outputs[layer])
self.outputs[-1] = np.tanh(self.inputs[-1])
def backpropagate(self, targets):
gradient = 1 - self.outputs[-1] * self.outputs[-1]
self.errors[-1] = gradient * (self.outputs[-1] - targets)
for layer in reversed(range(len(self.errors) - 1)):
gradient = 1 - self.outputs[layer] * self.outputs[layer]
self.errors[layer] = gradient * np.dot(self.weights[layer], self.errors[layer + 1])
for layer in range(len(self.weights)):
self.weights[layer] -= self.rate * np.outer(self.outputs[layer], self.errors[layer + 1])
def xor_example():
net = Neuralnet([2, 2, 1])
for step in range(100000):
net.feedforward([0, 0])
net.backpropagate([-1])
net.feedforward([0, 1])
net.backpropagate([1])
net.feedforward([1, 0])
net.backpropagate([1])
net.feedforward([1, 1])
net.backpropagate([-1])
net.feedforward([1, 1])
print(net.outputs[-1])
def identity_example():
net = Neuralnet([1, 3, 1])
for step in range(100000):
x = np.random.normal()
net.feedforward([x])
net.backpropagate([np.tanh(x)])
net.feedforward([-2])
print(net.outputs[-1])
def sine_example():
net = Neuralnet([1, 6, 1])
for step in range(100000):
x = np.random.normal()
net.feedforward([x])
net.backpropagate([np.tanh(np.sin(x))])
net.feedforward([3])
print(net.outputs[-1])
sine_example()
The output fails to be close to tanh(sin(3)) = 0.140190616. I suspected a mistake involving wrong indices or alignment, but Numpy isn't raising any errors like these. Any tips on where I went wrong?
EDIT: I forgot to add the bias neurons. Here is the updated code:
import numpy as np
class Neuralnet:
def __init__(self, neurons):
self.weights = []
self.outputs = []
self.inputs = []
self.errors = []
self.offsets = []
self.rate = .01
for layer in range(len(neurons)-1):
self.weights.append(
np.random.normal(
scale=1/np.sqrt(neurons[layer]),
size=[neurons[layer], neurons[layer + 1]]
)
)
self.outputs.append(np.empty(neurons[layer]))
self.inputs.append(np.empty(neurons[layer]))
self.errors.append(np.empty(neurons[layer]))
self.offsets.append(np.random.normal(scale=1/np.sqrt(neurons[layer]), size=neurons[layer + 1]))
self.inputs.append(np.empty(neurons[-1]))
self.errors.append(np.empty(neurons[-1]))
def feedforward(self, inputs):
self.inputs[0] = inputs
for layer in range(len(self.weights)):
self.outputs[layer] = np.tanh(self.inputs[layer])
self.inputs[layer + 1] = self.offsets[layer] + np.dot(self.weights[layer].T, self.outputs[layer])
def backpropagate(self, targets):
self.errors[-1] = self.inputs[-1] - targets
for layer in reversed(range(len(self.errors) - 1)):
gradient = 1 - self.outputs[layer] * self.outputs[layer]
self.errors[layer] = gradient * np.dot(self.weights[layer], self.errors[layer + 1])
for layer in range(len(self.weights)):
self.weights[layer] -= self.rate * np.outer(self.outputs[layer], self.errors[layer + 1])
self.offsets[layer] -= self.rate * self.errors[layer + 1]
def sine_example():
net = Neuralnet([1, 5, 1])
for step in range(10000):
x = np.random.uniform(-5, 5)
net.feedforward([x])
net.backpropagate([np.sin(x)])
net.feedforward([np.pi])
print(net.inputs[-1])
def xor_example():
net = Neuralnet([2, 2, 1])
for step in range(10000):
net.feedforward([0, 0])
net.backpropagate([-1])
net.feedforward([0, 1])
net.backpropagate([1])
net.feedforward([1, 0])
net.backpropagate([1])
net.feedforward([1, 1])
net.backpropagate([-1])
net.feedforward([1, 1])
print(net.outputs[-1])
def identity_example():
net = Neuralnet([1, 3, 1])
for step in range(10000):
x = np.random.normal()
net.feedforward([x])
net.backpropagate([x])
net.feedforward([-2])
print(net.outputs[-1])
identity_example()
I think you train the NN in the wrong way. You have a loop over 10000 iterations and feed a new sample in each cycle. The NN will never get trained in this case.
(the statement is wrong! See the update! )
What you need to do is to generate a large array of true samples Y = sin(X), give it to your network ONCE and iterate over the training set forwards and backwards, in order to minimize the cost function. To check the algorithm you may need to plot the cost function depending on the iteration number and make sure the cost goes down.
Another important point is the initialization of the weights. Your numbers are pretty large and the network will take a lot of time to converge, especially when using low rates. It's a good practice to generate the initial weights in some small range [-eps .. eps] uniformly.
In my code I implemented two different activation functions: sigmoid() and tanh(). You need to scale your inputs depending on the selected function: [0 .. 1] and [-1 .. 1] respectively.
Here are some images which show the cost function and the resulting predictions for sigmoid() and tanh() activation functions:
As you can see the sigmoid() activation gives a little bit better results, than the tanh().
Also I got much better predictions when using a network [1, 6, 1], compared to a bigger network with 4 layers [1, 6, 4, 1]. So the size of the NN is not always the crucial factor. Here is the prediction for the mentioned network with 4 layers:
Here is my code with some comments. I tried to use your notations where it was possible.
import numpy as np
import math
import matplotlib.pyplot as plt
class Neuralnet:
def __init__(self, neurons, activation):
self.weights = []
self.inputs = []
self.outputs = []
self.errors = []
self.rate = 0.5
self.activation = activation #sigmoid or tanh
self.neurons = neurons
self.L = len(self.neurons) #number of layers
eps = 0.12; # range for uniform distribution -eps..+eps
for layer in range(len(neurons)-1):
self.weights.append(np.random.uniform(-eps,eps,size=(neurons[layer+1], neurons[layer]+1)))
###################################################################################################
def train(self, X, Y, iter_count):
m = X.shape[0];
for layer in range(self.L):
self.inputs.append(np.empty([m, self.neurons[layer]]))
self.errors.append(np.empty([m, self.neurons[layer]]))
if (layer < self.L -1):
self.outputs.append(np.empty([m, self.neurons[layer]+1]))
else:
self.outputs.append(np.empty([m, self.neurons[layer]]))
#accumulate the cost function
J_history = np.zeros([iter_count, 1])
for i in range(iter_count):
self.feedforward(X)
J = self.cost(Y, self.outputs[self.L-1])
J_history[i, 0] = J
self.backpropagate(Y)
#plot the cost function to check the descent
plt.plot(J_history)
plt.show()
###################################################################################################
def cost(self, Y, H):
J = np.sum(np.sum(np.power((Y - H), 2), axis=0))/(2*m)
return J
###################################################################################################
def feedforward(self, X):
m = X.shape[0];
self.outputs[0] = np.concatenate( (np.ones([m, 1]), X), axis=1)
for i in range(1, self.L):
self.inputs[i] = np.dot( self.outputs[i-1], self.weights[i-1].T )
if (self.activation == 'sigmoid'):
output_temp = self.sigmoid(self.inputs[i])
elif (self.activation == 'tanh'):
output_temp = np.tanh(self.inputs[i])
if (i < self.L - 1):
self.outputs[i] = np.concatenate( (np.ones([m, 1]), output_temp), axis=1)
else:
self.outputs[i] = output_temp
###################################################################################################
def backpropagate(self, Y):
self.errors[self.L-1] = self.outputs[self.L-1] - Y
for i in range(self.L - 2, 0, -1):
if (self.activation == 'sigmoid'):
self.errors[i] = np.dot( self.errors[i+1], self.weights[i][:, 1:] ) * self.sigmoid_prime(self.inputs[i])
elif (self.activation == 'tanh'):
self.errors[i] = np.dot( self.errors[i+1], self.weights[i][:, 1:] ) * (1 - self.outputs[i][:, 1:]*self.outputs[i][:, 1:])
for i in range(0, self.L-1):
grad = np.dot(self.errors[i+1].T, self.outputs[i]) / m
self.weights[i] = self.weights[i] - self.rate*grad
###################################################################################################
def sigmoid(self, z):
s = 1.0/(1.0 + np.exp(-z))
return s
###################################################################################################
def sigmoid_prime(self, z):
s = self.sigmoid(z)*(1 - self.sigmoid(z))
return s
###################################################################################################
def predict(self, X, weights):
m = X.shape[0];
self.inputs = []
self.outputs = []
self.weights = weights
for layer in range(self.L):
self.inputs.append(np.empty([m, self.neurons[layer]]))
if (layer < self.L -1):
self.outputs.append(np.empty([m, self.neurons[layer]+1]))
else:
self.outputs.append(np.empty([m, self.neurons[layer]]))
self.feedforward(X)
return self.outputs[self.L-1]
###################################################################################################
# MAIN PART
activation1 = 'sigmoid' # the input should be scaled into [ 0..1]
activation2 = 'tanh' # the input should be scaled into [-1..1]
activation = activation1
net = Neuralnet([1, 6, 1], activation) # structure of the NN and its activation function
##########################################################################################
# TRAINING
m = 1000 #size of the training set
X = np.linspace(0, 4*math.pi, num = m).reshape(m, 1); # input training set
Y = np.sin(X) # target
kx = 0.1 # noise parameter
noise = (2.0*np.random.uniform(0, kx, m) - kx).reshape(m, 1)
Y = Y + noise # noisy target
# scaling of the target depending on the activation function
if (activation == 'sigmoid'):
Y_scaled = (Y/(1+kx) + 1)/2.0
elif (activation == 'tanh'):
Y_scaled = Y/(1+kx)
# number of the iteration for the training stage
iter_count = 20000
net.train(X, Y_scaled, iter_count) #training
# gained weights
trained_weights = net.weights
##########################################################################################
# PREDICTION
m_new = 40 #size of the prediction set
X_new = np.linspace(0, 4*math.pi, num = m_new).reshape(m_new, 1);
Y_new = net.predict(X_new, trained_weights) # prediction
#rescaling of the result
if (activation == 'sigmoid'):
Y_new = (2.0*Y_new - 1.0) * (1+kx)
elif (activation == 'tanh'):
Y_new = Y_new * (1+kx)
# visualization
plt.plot(X, Y)
plt.plot(X_new, Y_new, 'ro')
plt.show()
raw_input('press any key to exit')
UPDATE
I would like to take back the statement regarding the training method used in your code. The network can be indeed trained using only one sample per iteration. I got interesting results in online-training using both sigmoid and tanh activation functions:
Online-training using Sigmoid (cost function and prediction)
Online-training using Tanh (cost function and prediction)
As can be seen the choice of Sigmoid as activation function gives better performance. The cost function looks not that good as during the offline-training, but at least it tends to go down.
I plotted the cost function in your implementation, it looks pretty jerky as well:
May be it is a good idea to try your code with the sigmoid or even the ReLU function.
Here is the updated source code. To switch between online and offline training modes just change the method variable.
import numpy as np
import math
import matplotlib.pyplot as plt
class Neuralnet:
def __init__(self, neurons, activation):
self.weights = []
self.inputs = []
self.outputs = []
self.errors = []
self.rate = 0.2
self.activation = activation #sigmoid or tanh
self.neurons = neurons
self.L = len(self.neurons) #number of layers
eps = 0.12; #range for uniform distribution -eps..+eps
for layer in range(len(neurons)-1):
self.weights.append(np.random.uniform(-eps,eps,size=(neurons[layer+1], neurons[layer]+1)))
###################################################################################################
def train(self, X, Y, iter_count):
m = X.shape[0];
for layer in range(self.L):
self.inputs.append(np.empty([m, self.neurons[layer]]))
self.errors.append(np.empty([m, self.neurons[layer]]))
if (layer < self.L -1):
self.outputs.append(np.empty([m, self.neurons[layer]+1]))
else:
self.outputs.append(np.empty([m, self.neurons[layer]]))
#accumulate the cost function
J_history = np.zeros([iter_count, 1])
for i in range(iter_count):
self.feedforward(X)
J = self.cost(Y, self.outputs[self.L-1])
J_history[i, 0] = J
self.backpropagate(Y)
#plot the cost function to check the descent
#plt.plot(J_history)
#plt.show()
###################################################################################################
def cost(self, Y, H):
J = np.sum(np.sum(np.power((Y - H), 2), axis=0))/(2*m)
return J
###################################################################################################
def cost_online(self, min_x, max_x, iter_number):
h_arr = np.zeros([iter_number, 1])
y_arr = np.zeros([iter_number, 1])
for step in range(iter_number):
x = np.random.uniform(min_x, max_x, 1).reshape(1, 1)
self.feedforward(x)
h_arr[step, 0] = self.outputs[-1]
y_arr[step, 0] = np.sin(x)
J = np.sum(np.sum(np.power((y_arr - h_arr), 2), axis=0))/(2*iter_number)
return J
###################################################################################################
def feedforward(self, X):
m = X.shape[0];
self.outputs[0] = np.concatenate( (np.ones([m, 1]), X), axis=1)
for i in range(1, self.L):
self.inputs[i] = np.dot( self.outputs[i-1], self.weights[i-1].T )
if (self.activation == 'sigmoid'):
output_temp = self.sigmoid(self.inputs[i])
elif (self.activation == 'tanh'):
output_temp = np.tanh(self.inputs[i])
if (i < self.L - 1):
self.outputs[i] = np.concatenate( (np.ones([m, 1]), output_temp), axis=1)
else:
self.outputs[i] = output_temp
###################################################################################################
def backpropagate(self, Y):
self.errors[self.L-1] = self.outputs[self.L-1] - Y
for i in range(self.L - 2, 0, -1):
if (self.activation == 'sigmoid'):
self.errors[i] = np.dot( self.errors[i+1], self.weights[i][:, 1:] ) * self.sigmoid_prime(self.inputs[i])
elif (self.activation == 'tanh'):
self.errors[i] = np.dot( self.errors[i+1], self.weights[i][:, 1:] ) * (1 - self.outputs[i][:, 1:]*self.outputs[i][:, 1:])
for i in range(0, self.L-1):
grad = np.dot(self.errors[i+1].T, self.outputs[i]) / m
self.weights[i] = self.weights[i] - self.rate*grad
###################################################################################################
def sigmoid(self, z):
s = 1.0/(1.0 + np.exp(-z))
return s
###################################################################################################
def sigmoid_prime(self, z):
s = self.sigmoid(z)*(1 - self.sigmoid(z))
return s
###################################################################################################
def predict(self, X, weights):
m = X.shape[0];
self.inputs = []
self.outputs = []
self.weights = weights
for layer in range(self.L):
self.inputs.append(np.empty([m, self.neurons[layer]]))
if (layer < self.L -1):
self.outputs.append(np.empty([m, self.neurons[layer]+1]))
else:
self.outputs.append(np.empty([m, self.neurons[layer]]))
self.feedforward(X)
return self.outputs[self.L-1]
###################################################################################################
# MAIN PART
activation1 = 'sigmoid' #the input should be scaled into [0..1]
activation2 = 'tanh' #the input should be scaled into [-1..1]
activation = activation1
net = Neuralnet([1, 6, 1], activation) # structure of the NN and its activation function
method1 = 'online'
method2 = 'offline'
method = method1
kx = 0.1 #noise parameter
###################################################################################################
# TRAINING
if (method == 'offline'):
m = 1000 #size of the training set
X = np.linspace(0, 4*math.pi, num = m).reshape(m, 1); #input training set
Y = np.sin(X) #target
noise = (2.0*np.random.uniform(0, kx, m) - kx).reshape(m, 1)
Y = Y + noise #noisy target
#scaling of the target depending on the activation function
if (activation == 'sigmoid'):
Y_scaled = (Y/(1+kx) + 1)/2.0
elif (activation == 'tanh'):
Y_scaled = Y/(1+kx)
#number of the iteration for the training stage
iter_count = 20000
net.train(X, Y_scaled, iter_count) #training
elif (method == 'online'):
sampling_count = 100000 # number of samplings during the training stage
m = 1 #batch size
iter_count = sampling_count/m
for layer in range(net.L):
net.inputs.append(np.empty([m, net.neurons[layer]]))
net.errors.append(np.empty([m, net.neurons[layer]]))
if (layer < net.L -1):
net.outputs.append(np.empty([m, net.neurons[layer]+1]))
else:
net.outputs.append(np.empty([m, net.neurons[layer]]))
J_history = []
step_history = []
for i in range(iter_count):
X = np.random.uniform(0, 4*math.pi, m).reshape(m, 1)
Y = np.sin(X) #target
noise = (2.0*np.random.uniform(0, kx, m) - kx).reshape(m, 1)
Y = Y + noise #noisy target
#scaling of the target depending on the activation function
if (activation == 'sigmoid'):
Y_scaled = (Y/(1+kx) + 1)/2.0
elif (activation == 'tanh'):
Y_scaled = Y/(1+kx)
net.feedforward(X)
net.backpropagate(Y_scaled)
if (np.remainder(i, 1000) == 0):
J = net.cost_online(0, 4*math.pi, 1000)
J_history.append(J)
step_history.append(i)
plt.plot(step_history, J_history)
plt.title('Batch size ' + str(m) + ', rate ' + str(net.rate) + ', samples ' + str(sampling_count))
#plt.ylim([0, 0.1])
plt.show()
#gained weights
trained_weights = net.weights
##########################################################################################
# PREDICTION
m_new = 40 #size of the prediction set
X_new = np.linspace(0, 4*math.pi, num = m_new).reshape(m_new, 1);
Y_new = net.predict(X_new, trained_weights) #prediction
#rescaling of the result
if (activation == 'sigmoid'):
Y_new = (2.0*Y_new - 1.0) * (1+kx)
elif (activation == 'tanh'):
Y_new = Y_new * (1+kx)
#visualization
#fake sine curve to show the ideal signal
if (method == 'online'):
X = np.linspace(0, 4*math.pi, num = 100)
Y = np.sin(X)
plt.plot(X, Y)
plt.plot(X_new, Y_new, 'ro')
if (method == 'online'):
plt.title('Batch size ' + str(m) + ', rate ' + str(net.rate) + ', samples ' + str(sampling_count))
plt.ylim([-1.5, 1.5])
plt.show()
raw_input('press any key to exit')
Now I have some remarks to your current code:
Your sine function looks like this:
def sine_example():
net = Neuralnet([1, 6, 1])
for step in range(100000):
x = np.random.normal()
net.feedforward([x])
net.backpropagate([np.tanh(np.sin(x))])
net.feedforward([3])
print(net.outputs[-1])
I don't know why you use tanh in your target input. If you really want to use tanh of sine as target, you need to scale it to [-1..1], because tanh(sin(x)) returns values in range [-0.76..0.76].
The next thing is the range of your training set. You use x = np.random.normal() to generate the samples. Here is the distribution of such an input:
After it you want your network to predict the sine of 3, but the network has almost never seen this number during the training stage. I would use the uniform distribution in a wider range for sample generation instead.
I have this implementation of a feed forward neural network with stochastic gradient descent in python. When training a NN instance with the xor gate, it trains just fine. But when I train the instance with stock change data, it throws this error:
Traceback (most recent call last):
File "F:\predict\test.py", line 21, in <module>
stock.train(stockData, 30, 10, 3.0)
File "F:\predict\network.py", line 46, in train
self.updateMiniBatch(miniBatch, eta)
File "F:\predict\network.py", line 62, in updateMiniBatch
delta_nabla_b, delta_nabla_w = self.backPropagate(x, y)
File "F:\predict\network.py", line 102, in backPropagate
nabla_w[-l] = numpy.dot(delta, activations[-l - 1].transpose())
ValueError: objects are not aligned
The XOR Gate data, and stock data looks as such (Mind you the stock data has been truncated greatly):
xorData = [
([0, 0], [0]),
([0, 1], [1]),
([1, 0], [1]),
([1, 1], [0])
]
stockData = [
([0.0, 0.0, 0.003927144694969353], [-0.0038780602954071597]),
([0.0, 0.003927144694969353, -0.0038780602954071597], [0.0012018010088359343]),
([0.003927144694969353, -0.0038780602954071597, 0.0012018010088359343], [-0.0115302033846727])
]
I then create a network with 2 inputs, 2 hidden, and 1 output for the XOR Gate. And 3 inputs, 15 hidden, and 1 output for the Stock data.
xor = network.NN([2, 2, 1])
xor.train(xorData, 30, 10, 3.0)
stock = network.NN([3, 15, 1])
stock.train(stockData, 30, 10, 3.0)
Both of the training sets have the exact same structure, so why would this error occur?
network.py:
import numpy
import random
import json
# the sigmoid function
def sigmoid(z):
return 1.0 / (1.0 + numpy.exp(-z))
# the derivative of the sigmoid function
def sigmoidPrime(z):
return sigmoid(z) * (1 - sigmoid(z))
# sigmoid vectors
sigmoidVector = numpy.vectorize(sigmoid)
sigmoidPrimeVector = numpy.vectorize(sigmoidPrime)
#A class that implements stochastic gradient descent learning algorithm for a feedforward neural network
class NN:
def __init__(self, sizes):
self.numLayers = len(sizes)
self.sizes = sizes
# the biases and weights for the network are initialized randomly, using a Guassian distribution with mean 0, and variance 1
self.biases = [numpy.random.randn(y, 1) for y in sizes[1:]]
self.weights = [numpy.random.randn(y, x) for x, y in zip(sizes[:-1], sizes[1:])]
# feedForward function - return the output of the network
def feedForward(self, inputs):
for b, w in zip(self.biases, self.weights):
inputs = sigmoidVector(numpy.dot(w, inputs) + b)
return inputs
# train function - train the neural network using mini-batch stochastic gradient descent
# the trainingData is a list of tuples "(x, y)" representing the training inputs and the desired outputs
# if testData is provided then the network will be evaluated against the test data after each epoch
def train(self, trainingData, epochs, miniBatchSize, eta, testData = None):
if testData:
nTest = len(testData)
n = len(trainingData)
for j in xrange(epochs):
random.shuffle(trainingData)
miniBatches = [trainingData[k:k + miniBatchSize] for k in xrange(0, n, miniBatchSize)]
for miniBatch in miniBatches:
self.updateMiniBatch(miniBatch, eta)
if testData:
print "Epoch %i: %i / %i" % (j, self.evaluate(testData), nTest)
else:
print "Epoch %i complete" % j
# updateMiniBatch function - update the network's weights and biases by applying gradient descent using backpropagation
# to a single mini batch
# the miniBatch is a list of tuples "(x, y)" and eta is the learning rate
def updateMiniBatch(self, miniBatch, eta):
nabla_b = [numpy.zeros(b.shape) for b in self.biases]
nabla_w = [numpy.zeros(w.shape) for w in self.weights]
for x, y in miniBatch:
delta_nabla_b, delta_nabla_w = self.backPropagate(x, y)
nabla_b = [nb + dnb for nb, dnb in zip(nabla_b, delta_nabla_b)]
nabla_w = [nw + dnw for nw, dnw in zip(nabla_w, delta_nabla_w)]
self.weights = [w - (eta / len(miniBatch)) * nw for w, nw in zip(self.weights, nabla_w)]
self.biases = [b - (eta / len(miniBatch)) * nb for b, nb in zip(self.biases, nabla_b)]
# backPropagate function - returns a tuple "(nabla_b, nabla_w)" representing the gradient for the cost function C_x
# nabla_b and nabla_w are layer-by-layer lists of numpy arrays, similar to self.biases and self.weights
def backPropagate(self, x, y):
nabla_b = [numpy.zeros(b.shape) for b in self.biases]
nabla_w = [numpy.zeros(w.shape) for w in self.weights]
x = numpy.array(x)
y = numpy.array(y)
# feedForward
activation = x
activations = [x] # list to store all of the activations, layer by layer
zs = [] # list to store all of the z vectors, layer by layer
for b, w in zip(self.biases, self.weights):
z = numpy.dot(w, activation) + b
zs.append(z)
activation = sigmoidVector(z)
activations.append(activation)
# backward pass
delta = self.costDerivative(activations[-1], y) * sigmoidPrimeVector(zs[-1])
nabla_b[-1] = delta
nabla_w[-1] = numpy.dot(delta, activations[-2].transpose())
for l in xrange(2, self.numLayers):
spv = sigmoidPrimeVector(zs[-l])
delta = numpy.dot(self.weights[-l + 1].transpose(), delta) * spv
nabla_b[-l] = delta
nabla_w[-l] = numpy.dot(delta, activations[-l - 1].transpose())
return (nabla_b, nabla_w)
# evaluate function - return the number of test inputs for which the neural network outputs the correct result
def evaluate(self, testData):
testResults = [(numpy.argmax(self.feedForward(x)), y) for (x, y) in testData]
return sum(int(x == y) for (x, y) in testResults)
# costDerivative function - return the vector of partial derivatives for the output activations
def costDerivative(self, outputActivations, y):
return (outputActivations - y)
# save function - save the neural network to filename
def save(self, filename):
data = {
"sizes": self.sizes,
"weights": [w.tolist() for w in self.weights],
"biases": [b.tolist() for b in self.biases]
}
with open(filename, "w") as handle:
json.dump(data, handle)
# load function - load a neural network from the file filename
# returns a network instance
def load(filename):
with open(filename, "r") as handle:
data = json.load(handle)
network = NN(data["sizes"])
network.weights = [numpy.array(w) for w in data["weights"]]
network.biases = [numpy.array(b) for b in data["biases"]]
return network
EDIT:
I believe it might have something to do with the hidden layer value. I changed the XOR Gate hidden layer number and it threw the same error. It seems as if the hidden layer value must be the exact same as the amount for the input layer.