Implement bias neurons neural network - python

I implemented bias units for my neural network with gradient descent. But I'm not 100% sure If I've implemented it the right way. Would be glade if you can quickly look through my code. Only the parts with
if bias:
are important.
And my second question:
Shouldn't the derivate of the softmax function be 1-x, because x is the output of the softmax function?
I tried my net with 1-x but its performance was worse.
Every help is appreciated.
Thanks in advance.
import numpy as np
import pickle
import time
import math
class FeedForwardNetwork():
def __init__(self, input_dim, hidden_dim, output_dim, dropout=False, dropout_prop=0.5, bias=False):
np.random.seed(1)
self.input_layer = np.array([])
self.hidden_layer = np.array([])
self.output_layer = np.array([])
self.hidden_dim = hidden_dim
self.dropout = dropout
self.dropout_prop = dropout_prop
self.bias = bias
r_input_hidden = math.sqrt(6 / (input_dim + hidden_dim))
r_hidden_output = math.sqrt(6 / (hidden_dim + output_dim))
#self.weights_input_hidden = np.random.uniform(low=-r_input_hidden, high=r_input_hidden, size=(input_dim, hidden_dim))
#self.weights_hidden_output = np.random.uniform(low=-r_hidden_output, high=r_hidden_output, size=(hidden_dim, output_dim))
self.weights_input_hidden = np.random.uniform(low=-0.01, high=0.01, size=(input_dim, hidden_dim))
self.weights_hidden_output = np.random.uniform(low=-0.01, high=0.01, size=(hidden_dim, output_dim))
self.validation_data = np.array([])
self.validation_data_solution = np.array([])
self.velocities_input_hidden = np.zeros(self.weights_input_hidden.shape)
self.velocities_hidden_output = np.zeros(self.weights_hidden_output.shape)
if bias:
self.weights_bias_hidden = np.random.uniform(low=-0.01, high=0.01, size=((1, hidden_dim)))
self.weights_bias_output = np.random.uniform(low=-0.01, high=0.01, size=((1, output_dim)))
self.velocities_bias_hidden = np.zeros(self.weights_bias_hidden.shape)
self.velocities_bias_output = np.zeros(self.weights_bias_output.shape)
def _tanh(self, x, deriv=False):
#The derivate is: 1-np.tanh(x)**2; Because x is already the output of tanh(x) 1-x*x is the correct derivate.
if not deriv:
return np.tanh(x)
return 1-x*x
def _softmax(self, x, deriv=False):
if not deriv:
return np.exp(x) / np.sum(np.exp(x), axis=0)
return 1 - np.exp(x) / np.sum(np.exp(x), axis=0)
def set_training_data(self, training_data_input, training_data_target, validation_data_input=None, validation_data_target=None):
"""Splits the data up into training and validation data with a ratio of 0.85/0.15 if no validation data is given.
Sets the data for training."""
if len(training_data_input) != len(training_data_target):
raise ValueError(
'Number of training examples and'
' training targets does not match!'
)
if (validation_data_input is None) and (validation_data_target is None):
len_training_data = int((len(training_data_input)/100*85//1))
self.input_layer = training_data_input[:len_training_data]
self.output_layer = training_data_target[:len_training_data]
self.validation_data = training_data_input[len_training_data:]
self.validation_data_solution = training_data_target[len_training_data:]
else:
self.input_layer = training_data_input
self.output_layer = training_data_target
self.validation_data = validation_data_input
self.validation_data_solution = validation_data_target
def save(self, filename):
"""Saves the weights into a pickle file."""
with open(filename, "wb") as network_file:
pickle.dump(self.weights_input_hidden, network_file)
pickle.dump(self.weights_hidden_output, network_file)
def load(self, filename):
"""Loads network weights from a pickle file."""
with open(filename, "rb") as network_file:
weights_input_hidden = pickle.load(network_file)
weights_hidden_output = pickle.load(network_file)
if (
len(weights_input_hidden) != len(self.weights_input_hidden)
or len(weights_hidden_output) != len(self.weights_hidden_output)
):
raise ValueError(
'File contains weights that does not'
' match the current networks size!'
)
self.weights_input_hidden = weights_input_hidden
self.weights_hidden_output = weights_hidden_output
def measure_error(self, input_data, output_data):
return 1/2 * np.sum((output_data - self.forward_propagate(input_data))**2)
#return np.sum(np.nan_to_num(-output_data*np.log(self.forward_propagate(input_data))-(1-output_data)*np.log(1-self.forward_propagate(input_data))))
def forward_propagate(self, input_data, dropout=False):
"""Proceds the input data from input neurons up to output neurons and returns the output layer.
If dropout is True some of the neurons are randomly turned off."""
input_layer = input_data
self.hidden_layer = self._tanh(np.dot(input_layer, self.weights_input_hidden))
if self.bias:
self.hidden_layer += self.weights_bias_hidden
if dropout:
self.hidden_layer *= np.random.binomial([np.ones((len(input_data),self.hidden_dim))],1-self.dropout_prop)[0] * (1.0/(1-self.dropout_prop))
if self.bias:
return self._softmax((np.dot(self.hidden_layer, self.weights_hidden_output) + self.weights_bias_output).T).T
else:
return self._softmax(np.dot(self.hidden_layer, self.weights_hidden_output).T).T
#return self._softmax(output_layer.T).T
def back_propagate(self, input_data, output_data, alpha, beta, momentum):
"""Calculates the difference between target output and output and adjusts the weights to fit the target output better.
The parameter alpha is the learning rate.
Beta is the parameter for weight decay which penaltizes large weights."""
sample_count = len(input_data)
output_layer = self.forward_propagate(input_data, dropout=self.dropout)
output_layer_error = output_layer - output_data
output_layer_delta = output_layer_error * self._softmax(output_layer, deriv=True)
print("Error: ", np.mean(np.abs(output_layer_error)))
#How much did each hidden neuron contribute to the output error?
#Multiplys delta term with weights
hidden_layer_error = output_layer_delta.dot(self.weights_hidden_output.T)
#If the prediction is good, the second term will be small and the change will be small
#Ex: target: 1 -> Slope will be 1 so the second term will be big
hidden_layer_delta = hidden_layer_error * self._tanh(self.hidden_layer, deriv=True)
#The both lines return a matrix. A row stands for all weights connected to one neuron.
#E.g. [1, 2, 3] -> Weights to Neuron A
# [4, 5, 6] -> Weights to Neuron B
hidden_weights_gradient = input_data.T.dot(hidden_layer_delta)/sample_count
output_weights_gradient = self.hidden_layer.T.dot(output_layer_delta)/sample_count
velocities_input_hidden = self.velocities_input_hidden
velocities_hidden_output = self.velocities_hidden_output
self.velocities_input_hidden = velocities_input_hidden * momentum - alpha * hidden_weights_gradient
self.velocities_hidden_output = velocities_hidden_output * momentum - alpha * output_weights_gradient
#Includes momentum term and weight decay; The weight decay parameter is beta
#Weight decay penalizes large weights to prevent overfitting
self.weights_input_hidden += -velocities_input_hidden * momentum + (1 + momentum) * self.velocities_input_hidden
- alpha * beta * self.weights_input_hidden / sample_count
self.weights_hidden_output += -velocities_hidden_output * momentum + (1 + momentum) * self.velocities_hidden_output
- alpha * beta * self.weights_hidden_output / sample_count
if self.bias:
velocities_bias_hidden = self.velocities_bias_hidden
velocities_bias_output = self.velocities_bias_output
hidden_layer_delta = np.sum(hidden_layer_delta, axis=0)
output_layer_delta = np.sum(output_layer_delta, axis=0)
self.velocities_bias_hidden = velocities_bias_hidden * momentum - alpha * hidden_layer_delta
self.velocities_bias_output = velocities_bias_output * momentum - alpha * output_layer_delta
self.weights_bias_hidden += -velocities_bias_hidden * momentum + (1 + momentum) * self.velocities_bias_hidden
- alpha * beta * self.weights_bias_hidden / sample_count
self.weights_bias_output += -velocities_bias_output * momentum + (1 + momentum) * self.velocities_bias_output
- alpha * beta * self.weights_bias_output / sample_count
def batch_train(self, epochs, alpha, beta, momentum, patience=10):
"""Trains the network in batch mode that means the weights are updated after showing all training examples.
alpha is the learning rate and patience is the number of epochs that the validation error is allowed to increase before aborting.
Beta is the parameter for weight decay which penaltizes large weights."""
#The weight decay parameter is beta
validation_error = self.measure_error(self.validation_data, self.validation_data_solution)
for epoch in range(epochs):
self.back_propagate(self.input_layer, self.output_layer, alpha, beta, momentum)
validation_error_new = self.measure_error(self.validation_data, self.validation_data_solution)
if validation_error_new < validation_error:
validation_error = validation_error_new
else:
patience -= 1
if patience == 0:
print("Abort Training. Overfitting has started! Epoch: {0}. Error: {1}".format(epoch, validation_error_new))
return
print("Epoch: {0}, Validation Error: {1}".format(epoch, validation_error))
self.save("Network_Mnist.net")
def mini_batch_train(self, batch_size, epochs, alpha, beta, momentum, patience=10):
"""Trains the network in mini batch mode, that means the weights are updated after showing only a bunch of training examples.
alpha is the learning rate and patience is the number of epochs that the validation error is allowed to increase before aborting."""
validation_error = self.measure_error(self.validation_data, self.validation_data_solution)
sample_count = len(self.input_layer)
epoch_counter = 0
for epoch in range(0, epochs*batch_size, batch_size):
epoch_counter += 1
self.back_propagate(self.input_layer[epoch%sample_count:(epoch%sample_count)+batch_size],
self.output_layer[epoch%sample_count:(epoch%sample_count)+batch_size], alpha, beta, momentum)
validation_error_new = self.measure_error(self.validation_data, self.validation_data_solution)
if validation_error_new < validation_error:
validation_error = validation_error_new
patience = 20
else:
patience -= 1
if patience == 0:
print("Abort Training. Overfitting has started! Epoch: {0}. Error: {1}".format(epoch_counter, validation_error_new))
return
print("Epoch: {0}, Validation Error: {1}".format(epoch_counter, validation_error))
self.save("Network_Mnist.net")
if __name__ == "__main__":
#If the first row is a one the first output neuron should be on the second off
x = np.array([ [0, 0, 1, 1, 0],
[0, 1, 1, 1, 1],
[1, 0, 1, 1, 1],
[1, 1, 1, 1, 0],
[0, 1, 1, 1, 0],
[1, 1, 0, 0, 0],
[1, 1, 0, 0, 0],
[1, 0, 1, 0, 0] ])
y = np.array([ [0, 1],
[0, 1],
[1, 0],
[1, 0],
[0, 1],
[1, 0],
[1, 0],
[1, 0] ])
#x = np.array([ [0, 0, 1, 1] ])
#y = np.array([[0]]).T
a = FeedForwardNetwork(input_dim=5, hidden_dim=200, output_dim=2, bias=False)
a.set_training_data(x, y)
start = time.time()
a.batch_train(epochs=2000, alpha=0.05, beta=0.0001, momentum=0.99, patience=20)
print(time.time()-start)

In relation with the derivatives...
If you are using the tanh activation function, i.e. the derivative is: y' = 1 - y^2. The tanh is commonly used because it is zero-centered.
If you are using the logistic equation, then the derivative is: y' = y(1+y). The softmax has a similar derivative.
The nice thing is that all these can be expressed as functions of themselves, so you need to have a look at the def _softmax(self, x, deriv=False) function, to define it in a similar way than def _tanh(self, x, deriv=False).

Related

Pricing american options using deep learning, put instead of max-call

So I'm trying to learn to optimally stop options in a Black-Scholes setting along the lines of the article: "Solving high-dimensional optimal stopping problems using deep learning" by Sebastian Becker, Patrick Cheridito, Arnulf Jentzen, and Timo Welti.
The framework used to price options is the following:
import tensorflow as tf
from tensorflow.python.training.moving_averages import assign_moving_average
def neural_net(x, neurons, is_training, dtype=tf.float32, decay=0.9):
def batch_normalization(y):
shape = y.get_shape().as_list()
y = tf.reshape(y, [-1, shape[1] * shape[2]])
#variables for batch normalization
beta = tf.compat.v1.get_variable(
name='beta', shape=[shape[1] * shape[2]],
dtype=dtype, initializer=tf.zeros_initializer())
gamma = tf.compat.v1.get_variable(
name='gamma', shape=[shape[1] * shape[2]],
dtype=dtype, initializer=tf.ones_initializer())
mv_mean = tf.compat.v1.get_variable(
'mv_mean', [shape[1]*shape[2]],
dtype = dtype, initializer=tf.zeros_initializer(),
trainable = False)
mv_var = tf.compat.v1.get_variable(
'mv_var', [shape[1]*shape[2]],
dtype = dtype, initializer =tf.ones_initializer(),
trainable = False)
mean,variance = tf.nn.moments(y, [0], name = 'moments')
tf.compat.v1.add_to_collection(
tf.compat.v1.GraphKeys.UPDATE_OPS,
assign_moving_average(mv_mean, mean, decay,
zero_debias=True))
tf.compat.v1.add_to_collection(
tf.compat.v1.GraphKeys.UPDATE_OPS,
assign_moving_average(mv_var, variance, decay,
zero_debias=False))
mean, variance = tf.cond(is_training, lambda: (mean, variance),
lambda: (mv_mean, mv_var))
y = tf.nn.batch_normalization(y, mean, variance, beta, gamma, 1e-6)
return tf.reshape(y, [-1, shape[1], shape[2]])
def fc_layer(y, out_size, activation, is_single):
shape = y.get_shape().as_list()
w = tf.compat.v1.get_variable(
name='weights',
shape=[shape[2], shape[1], out_size],
dtype=dtype,
initializer=tf.initializers.glorot_uniform())
y = tf.transpose(tf.matmul(tf.transpose(y, [2, 0, 1]), w),
[1, 2, 0])
if is_single:
b = tf.compat.v1.get_variable(
name='bias',
shape=[out_size, shape[2]],
dtype = dtype,
initializer=tf.zeros_initializer())
return activation(y + b)
return activation(batch_normalization(y))
x = batch_normalization(x)
for i in range(len(neurons)):
with tf.compat.v1.variable_scope('layer_' + str(i)):
x = fc_layer(x, neurons[i],
tf.nn.relu if i < len(neurons) - 1
else tf.nn.sigmoid, False)
return x
#then Deep optimal stopping
def deep_optimal_stopping(x, t, n, g, neurons, batch_size, train_steps,
mc_runs, lr_boundaries, lr_values, beta1=0.9,
beta2=0.999, epsilon=1e-8, decay=0.9):
is_training = tf.compat.v1.placeholder(tf.bool, []) # a variable used to distinguish between training and Monte Carlo simulation, used for batch noralization
p = g(t, x) # we evaluate the payoff for the whole batch at every point in time
nets = neural_net(tf.concat([x[:, :, :-1], p[:, :, :-1]], axis=1),
neurons, is_training, decay=decay)
u_list = [nets[:, :, 0]]
u_sum = u_list[-1]
for k in range(1, n - 1): #range(start, stop)
u_list.append(nets[:, :, k] * (1. - u_sum)) # we build a neural network to approximate the stopping decision at time n*T/N
u_sum += u_list[-1]
#last iteration?
u_list.append(1. - u_sum)
u_stack = tf.concat(u_list, axis=1)
p = tf.squeeze(p, axis=1) #removes dimension of size 1
loss = tf.reduce_mean(tf.reduce_sum(-u_stack * p, axis=1)) #loss function
idx = tf.argmax(tf.cast(tf.cumsum(u_stack, axis=1) + u_stack >= 1,
dtype=tf.uint8), #idx for index?, argmax takes index for largest value
axis=1, output_type=tf.int32)
stopped_payoffs = tf.reduce_mean(
tf.gather_nd(p, tf.stack([tf.range(0, batch_size, dtype=tf.int32),
idx], axis=1))) # this is the approximation of the price for one batch, we will calculate the mean over MC-runs of those numbers
global_step = tf.Variable(0) # a variable used to apply the learning rate schedule, without it the optimizer would not know at which training step we are
learning_rate = tf.compat.v1.train.piecewise_constant(global_step,
lr_boundaries,
lr_values) # this gives us a piecewise constant learning rate, according to the schedule
optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate,
beta1=beta1,
beta2=beta2,# define the optimizer, we use Adam with our learning rate schedule and a small tweak of one of its parameters
epsilon=epsilon)
update_ops = tf.compat.v1.get_collection(
tf.compat.v1.GraphKeys.UPDATE_OPS)
with tf.control_dependencies(update_ops):
train_op = optimizer.minimize(loss, global_step=global_step)
with tf.compat.v1.Session() as sess:
sess.run(tf.compat.v1.global_variables_initializer())
for _ in range(train_steps):
sess.run(train_op, feed_dict={is_training: True})
px_mean = 0. # value that will hold the price
for _ in range(mc_runs): # loop over the number of MC runs
px_mean += sess.run(stopped_payoffs,
feed_dict={is_training: False})# we stop training, this is used for the batch normalization, from now on we will use the sampled moving averages
return px_mean / mc_runs
Now we define the various variables and simulate paths of a stock as X. Then we run use deep_optimal_stopping function to price the option, defined in the following code
import tensorflow as tf
import numpy as np
import time
from tensorflow.python.framework.ops import disable_eager_execution
disable_eager_execution()
T, N, K = 3., 9, 100.
r, delta, beta = 0.05, 0.1, 0.2
batch_size = 800#8192
lr_values = [0.05, 0.005, 0.0005]
mc_runs = 50#500
def g(s, x):
return tf.exp(-r * s) \
* tf.maximum(tf.reduce_max(x, axis=1, keepdims=True) - K, 0.)
_file = open('example_4_4_1_1.csv', 'w')
_file.write('dim, run, mean, time\n')
for d in [2, 3, 5, 10, 20, 30, 50, 100, 200, 500]:
for s_0 in [40.]:#[90., 100., 110.]:
for run in range(5):
tf.compat.v1.reset_default_graph()
t0 = time.time()
neurons = [d + 50, d + 50, 1]
train_steps = 1500 + d
lr_boundaries = [int(500 + d / 5), int(1500 + 3 * d / 5)]
W = tf.cumsum(tf.compat.v1.random_normal(
shape=[batch_size, d, N],
stddev=np.sqrt(T / N)), axis=2)
t = tf.constant(np.linspace(start=T / N, stop=T, num=N,
endpoint=True, dtype=np.float32))
#X = tf.exp((r - delta - beta ** 2 / 2.) * t + beta * W) * s_0
px_mean = deep_optimal_stopping(
W, t, N, g, neurons, batch_size,
train_steps, mc_runs,
lr_boundaries, lr_values, epsilon=0.1)
t1 = time.time()
print("")
_file.write('%i, %i, %f, %f\n' % (d, run, px_mean, t1 - t0))
_file.close()
So here the option is a bermudan max-call defined by the payoff function g(s,x). My understanding would be, if I wanted the price of an American put, I instead changed the payoff function g to be:
def g(s, x):
return tf.exp(-r * s) * tf.maximum(K-x, 0.)
and otherwise changing nothing. But instead of getting a price of 5.31 as reported in their article, I get 4.02.
Can someone explain where I'm going wrong with my understanding of the problem?

What can I do to improve the results I have with linear outputs?

With sigmoid activation function on my output the network learns the task much better than with linear activation function.
I am using L2 regularization with my cost function, I have a learning rate and momentum term yet it learns better with the sigmoid activation function.
What can I do to improve the results?
import csv
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, r2_score, mean_squared_error
import numpy as np
import numpy.random as r
import matplotlib.pyplot as plt
import random
random.seed(25)
def readcsv(filename):
ifile = open(filename, "rU")
reader = csv.reader(ifile, delimiter=",")
rownum = 0
dataset = []
for row in reader:
dataset.append(row)
rownum += 1
data = []
for s in dataset:
Dataset = [float(i) for i in s]
data.append(Dataset)
return [data, rownum]
def sigmoid(x):
return 1 / (1 + np.exp(-x))
def sigmoid_deriv(x):
return x * (1 - x)
def initialise_weights(nn_structure):
weights = {}
bias = {}
c_weights = {1: np.ones((nn_structure[1], nn_structure[1]))}
context = {0: r.random_sample((nn_structure[1]))}
for l in range(1, len(nn_structure)):
q = []
for j in range(1, nn_structure[l] + 1):
w = [random.uniform(-0.09, 0.09) for i in range(nn_structure[l-1])]
q.append(w)
weights[l] = np.array(q)
bias[l] = r.random_sample((nn_structure[l],))
print(weights)
return weights, bias, c_weights, context
def initialise_weights_changes(nn_structure):
deltaweights = {}
deltabias = {}
deltac_weights = {1: np.zeros((nn_structure[1], nn_structure[1]))}
for l in range(1, len(nn_structure)):
deltaweights[l] = np.zeros((nn_structure[l], nn_structure[l-1]))
deltabias[l] = np.zeros((nn_structure[l],))
return deltaweights, deltabias, deltac_weights
def feed_forward(x, weights, bias, c_weights, context, i, hidden_layer):
hidden_layer[i] = {1: x}
activations = {}
for l in range(1, len(weights) + 1):
node_in = hidden_layer[i][l]
if l == 1:
activations[l+1] = sigmoid(weights[l].dot(node_in) + c_weights[l].dot(context[i]) + bias[l])
else:
activations[l+1] = weights[l].dot(node_in) + bias[l]
hidden_layer[i][l+1] = activations[l+1]
if l == 1:
context[i + 1] = hidden_layer[i][l+1]
return hidden_layer, activations
def calculate_out_layer_delta(y, hidden_layer):#, activations):
return -(y - hidden_layer)
def calculate_hidden_delta(delta_plus_1, weights_l):#, z_l):
return np.dot(np.transpose(weights_l), delta_plus_1)
def train_nn(nn_structure, X, y, iter_num=1000, alpha=0.6, momentum = 0.4, bptt = 5, reg = 0.0000009):
weights, bias, c_weights, context = initialise_weights(nn_structure)
cnt = 0
m = len(y)
avg_cost_func = []
print('Starting gradient descent for {} iterations'.format(iter_num))
while cnt < iter_num:
if cnt%1000 == 0:
print('Iteration {} of {}'.format(cnt, iter_num))
deltaweights, deltabias, deltac_weights = initialise_weights_changes(nn_structure)
avg_cost = 0
hidden_layer = {}
delta = {}
bp = []
for i in range(len(y)):
# perform the feed forward pass and return the stored h and z values, to be used in the gradient descent step
hidden_layer, activations = feed_forward(X[i], weights, bias, c_weights, context, i, hidden_layer)
bp.append(i)
# loop from nl-1 to 1 backpropagating the errors
if len(bp) == bptt:
for j in reversed(bp):
delta[j] = {}
if j == bp[-1]:
for l in range(len(nn_structure), 0, -1):
if l == len(nn_structure):
delta[j][l] = calculate_out_layer_delta(y[j], hidden_layer[j][l])
avg_cost += mean_squared_error(y[j] , hidden_layer[j][l]) + 0.5 * reg * np.linalg.norm(np.power(weights[l - 1], 2)) + 0.5 * reg * np.linalg.norm(np.power(weights[l - 2], 2))
else:
if l > 1:
delta[j][l] = calculate_hidden_delta(delta[j][l+1], weights[l]) * sigmoid_deriv(hidden_layer[j][l])
else:
for l in range(len(nn_structure), 0, -1):
if l == len(nn_structure):
delta[j][l] = calculate_out_layer_delta(y[j], hidden_layer[j][l])
avg_cost += np.linalg.norm((y[j] - hidden_layer[j][l]))
else:
if l > 1:
delta[j][l] = calculate_hidden_delta(delta[j][l + 1], weights[l]) + calculate_hidden_delta(delta[j + 1][l], c_weights[1]) * sigmoid_deriv(hidden_layer[j][l])
for l in range(len(nn_structure) - 1, 0, -1):
deltaweights[l] = (-alpha * np.dot(delta[j][l + 1][:, np.newaxis], np.transpose(hidden_layer[j][l][:, np.newaxis]))) + (momentum * deltaweights[l]) + (reg / bptt * weights[l])
deltabias[l] = (-alpha * delta[j][l + 1]) + ((momentum * deltabias[l])) + (reg / bptt * bias[l])
if l == 1:
deltac_weights[l] += (-alpha * np.dot(delta[j][1 + 1][:, np.newaxis], np.transpose(context[j][:, np.newaxis]))) + (momentum * deltac_weights[l]) + (reg / bptt * c_weights[l])
# perform the gradient descent step for the weights in each layer
for l in range(len(nn_structure) - 1, 0, -1):
weights[l] += (1 / bptt * deltaweights[l]) - (reg / bptt * weights[l])
bias[l] += (1 / bptt * deltabias[l]) - (reg / bptt * bias[l])
if l == 1:
c_weights[l] += (1 / bptt * deltac_weights[l]) - (reg / bptt * c_weights[l])
bp = []
deltaweights, deltabias, deltac_weights = initialise_weights_changes(nn_structure)
# complete the average cost calculation
if cnt % 500 == 0:
print(weights)
avg_cost = 1.0 / m * avg_cost
if cnt % 1000 == 0:
print('Error', avg_cost)
avg_cost_func.append(avg_cost)
cnt += 1
alpha = alpha - (alpha/iter_num)
return weights, bias, avg_cost_func, c_weights, context
def predict_y(weights, bias, X, c_weights, context):
m = X.shape[0]
y = np.zeros((m,))
for i in range(m):
hidden_layer = {1: X[i]}
for l in range(1, len(weights) + 1):
node_in = hidden_layer[l]
if l == 1:
activations = weights[l].dot(node_in) + c_weights[l].dot(context[l]) + bias[l]
else:
activations = weights[l].dot(node_in) + bias[l]
hidden_layer[l + 1] = sigmoid(activations)
y[i] = hidden_layer[3]
return y
if __name__ == "__main__":
# load data and scale
filename = 'C:/Users/n0762538/Documents/Data/MackeyGlass/MackeyGlass.csv'
dataset, rownum = readcsv(filename)
#np.random.shuffle(dataset)
dataset = np.array(dataset)
# define data
no = int(0.70 * len(dataset))
train_data = dataset[0:no]
test_data = dataset[no:-1]
train_output = dataset[1:no + 1]
test_output = dataset[no + 1:]
X = train_data
y = train_output
# setup the NN structure
nn_structure = [len(dataset[0]), 3, len(dataset[0])]
# train the NN
weights, bias, avg_cost_func, c_weights, context = train_nn(nn_structure, X, y)
# plot the avg_cost_func
plt.plot(avg_cost_func)
plt.ylabel('Average J')
plt.xlabel('Iteration number')
plt.show()
# get the prediction accuracy and print
#print(weights)
#print('test:', test_output)
y_pred = predict_y(weights, bias, test_data, c_weights, context)
print('Prediction accuracy is {}%'.format(r2_score(test_output, y_pred) * 100))
#plt.plot(train_data)
plt.plot(y_pred)
plt.plot(test_output)
plt.title('Approach 1')
plt.ylabel('Predicted')
plt.xlabel('Iteration number')
plt.show()
By results, if you mean Error Rate or some Evaluation Metric Improvement. Here maybe some guidelines.
Let us make it clear that Error = 100 - Evaluation Metric. If your accuracy is 99%, that means your Error is 1% (GOOD). If your accuracy is 85%, your error is 15% (NOT TOO GOOD. IT DEPENDS)
If your Train Error is very high (Test Error = 16%, Train Error = 15%): You have a problem with BIAS. You need to get a BIGGER/DEEPER neural network with more layers, train using gradient descent for more number of iterations or just try a different neural network architecture (CNN, RNN).
If your Test Error is much higher than your Train Error (Test Error = 11%, Train Error = 1%): It means you have a VARIANCE problem. You need to get more training data, and apply regularization or just try a different neural network architecture (CNN, RNN).
If both Train Error as well as Test Error is high (Train Error: 15%, Test Error: 16%, then you have both HIGH BIAS and HIGH VARIANCE, you need to do both of what I told you above.
To see if you have a BIAS problem, look at the Training Set Performance. To see if you have a VARIANCE problem, look at your Dev Set/Test Set Performance. More importantly, You need to understand, Bayes Error or Human Level Performance. If 15% is Bayes Error, meaning that NO MACHINE LEARNING ALGORITHM can do better than 15%, and if your Training Error is 16%, it means there is very little room for improvement in terms of fixing BIAS.

Softmax activation with cross entropy loss results in the outputs converging to exactly 0 and 1 for both classes, respectively

I have implemented a simple Neural Net with just a single sigmoid hidden layer, with the choice of a sigmoid or softmax output layer and squared error or cross entropy loss function, respectively. After much research on the softmax activation function, the cross entropy loss, and their derivatives (and with following this blog) I believe that my implementation seems correct.
When attempting to learn the simple XOR function, the NN with the sigmoid output learns to a very small loss very quickly when using single binary outputs of 0 and 1. However, when changing the labels to one-hot encodings of [1, 0] = 0 and [0, 1] = 1, the softmax implementation does not work. The loss consistently increases as the network's outputs converge to exactly [0, 1] for the two outputs on every input, yet the labels of the data set is perfectly balanced between [0, 1] and [1, 0].
My code is below, where the choice of using sigmoid or softmax at the output layer can be chosen by uncommenting the necessary two lines near the bottom of the code. I cannot figure out why the softmax implementation is not working.
import numpy as np
class MLP:
def __init__(self, numInputs, numHidden, numOutputs, activation):
self.numInputs = numInputs
self.numHidden = numHidden
self.numOutputs = numOutputs
self.activation = activation.upper()
self.IH_weights = np.random.rand(numInputs, numHidden) # Input -> Hidden
self.HO_weights = np.random.rand(numHidden, numOutputs) # Hidden -> Output
self.IH_bias = np.zeros((1, numHidden))
self.HO_bias = np.zeros((1, numOutputs))
# Gradients corresponding to weight matrices computed during backprop
self.IH_w_gradients = np.zeros_like(self.IH_weights)
self.HO_w_gradients = np.zeros_like(self.HO_weights)
# Gradients corresponding to biases computed during backprop
self.IH_b_gradients = np.zeros_like(self.IH_bias)
self.HO_b_gradients = np.zeros_like(self.HO_bias)
# Input, hidden and output layer neuron values
self.I = np.zeros(numInputs) # Inputs
self.L = np.zeros(numOutputs) # Labels
self.H = np.zeros(numHidden) # Hidden
self.O = np.zeros(numOutputs) # Output
# ##########################################################################
# ACIVATION FUNCTIONS
# ##########################################################################
def sigmoid(self, x, derivative=False):
if derivative:
return x * (1 - x)
return 1 / (1 + np.exp(-x))
def softmax(self, prediction, label=None, derivative=False):
if derivative:
return prediction - label
return np.exp(prediction) / np.sum(np.exp(prediction))
# ##########################################################################
# LOSS FUNCTIONS
# ##########################################################################
def squaredError(self, prediction, label, derivative=False):
if derivative:
return (-2 * prediction) + (2 * label)
return (prediction - label) ** 2
def crossEntropy(self, prediction, label, derivative=False):
if derivative:
return [-(y / x) for x, y in zip(prediction, label)] # NOT NEEDED ###############################
return - np.sum([y * np.log(x) for x, y in zip(prediction, label)])
# ##########################################################################
def forward(self, inputs):
self.I = np.array(inputs).reshape(1, self.numInputs) # [numInputs, ] -> [1, numInputs]
self.H = self.I.dot(self.IH_weights) + self.IH_bias
self.H = self.sigmoid(self.H)
self.O = self.H.dot(self.HO_weights) + self.HO_bias
if self.activation == 'SIGMOID':
self.O = self.sigmoid(self.O)
elif self.activation == 'SOFTMAX':
self.O = self.softmax(self.O) + 1e-10 # allows for log(0)
return self.O
def backward(self, labels):
self.L = np.array(labels).reshape(1, self.numOutputs) # [numOutputs, ] -> [1, numOutputs]
if self.activation == 'SIGMOID':
self.O_error = self.squaredError(self.O, self.L)
self.O_delta = self.squaredError(self.O, self.L, derivative=True) * self.sigmoid(self.O, derivative=True)
elif self.activation == 'SOFTMAX':
self.O_error = self.crossEntropy(self.O, self.L)
self.O_delta = self.softmax(self.O, self.L, derivative=True)
self.H_error = self.O_delta.dot(self.HO_weights.T)
self.H_delta = self.H_error * self.sigmoid(self.H, derivative=True)
self.IH_w_gradients += self.I.T.dot(self.H_delta)
self.HO_w_gradients += self.H.T.dot(self.O_delta)
self.IH_b_gradients += self.H_delta
self.HO_b_gradients += self.O_delta
return self.O_error
def updateWeights(self, learningRate):
self.IH_weights += learningRate * self.IH_w_gradients
self.HO_weights += learningRate * self.HO_w_gradients
self.IH_bias += learningRate * self.IH_b_gradients
self.HO_bias += learningRate * self.HO_b_gradients
self.IH_w_gradients = np.zeros_like(self.IH_weights)
self.HO_w_gradients = np.zeros_like(self.HO_weights)
self.IH_b_gradients = np.zeros_like(self.IH_bias)
self.HO_b_gradients = np.zeros_like(self.HO_bias)
sigmoidData = [
[[0, 0], 0],
[[0, 1], 1],
[[1, 0], 1],
[[1, 1], 0]
]
softmaxData = [
[[0, 0], [1, 0]],
[[0, 1], [0, 1]],
[[1, 0], [0, 1]],
[[1, 1], [1, 0]]
]
sigmoidMLP = MLP(2, 10, 1, 'SIGMOID')
softmaxMLP = MLP(2, 10, 2, 'SOFTMAX')
# SIGMOID #######################
# data = sigmoidData
# mlp = sigmoidMLP
# ###############################
# SOFTMAX #######################
data = softmaxData
mlp = softmaxMLP
# ###############################
numEpochs = 5000
for epoch in range(numEpochs):
losses = []
for i in range(len(data)):
print(mlp.forward(data[i][0])) # Print outputs
# mlp.forward(data[i][0]) # Don't print outputs
loss = mlp.backward(data[i][1])
losses.append(loss)
mlp.updateWeights(0.001)
# if epoch % 1000 == 0 or epoch == numEpochs - 1: # Print loss every 1000 epochs
print(np.mean(losses)) # Print loss every epoch
Contrary to all the information online, simply changing the derivative of the softmax cross entropy from prediction - label to label - prediction solved the problem. Perhaps I have something else backwards somewhere since every source I have come across has it as prediction - label.

Trouble implementing softmax activation and cross-entropy loss, and their derivatives in a neural net

I have implemented a simple multi-layer perceptron (with just 1 hidden layer) which can learn regression problems. I have written it so that the choice between sigmoid, tanh and relu activations can be specified. The squared error is then implemented as the loss function with each of these.
I now want to allow the choice to use the same model to learn multi-class classification problems, and so would like to implement the choice to use the softmax activation along with the cross-entropy loss. In my code below, the only changes that would need to be made (I hope) is to implement these in the activation() and loss() functions, and this should then work out of the box in both the forward pass and the backprop. This code runs a simulation of my model learning the XOR function, where the chosen activation function should be uncommented at the top.
However, I am really lost with implementing both of these functions, and even more so their derivatives. Any help and guidance is appreciated.
import sys
import numpy as np
activation = 'sigmoid'
# activation = 'tanh'
# activation = 'relu'
# activation = 'softmax'
numEpochs = 10000
class DataSet:
def __init__(self, data, trainSplit=1):
self.size = len(data)
self.trainSize = int(self.size * trainSplit)
self.testSize = self.size - self.trainSize
self.inputs, self.labels = [], []
for i in range(len(data)):
self.inputs.append(data[i][0])
self.labels.append(data[i][1])
self.trainInputs = self.inputs[:self.trainSize]
self.trainLabels = self.labels[:self.trainSize]
self.testInputs = self.inputs[self.trainSize:]
self.testLabels = self.labels[self.trainSize:]
try:
self.numInputs = len(self.inputs[0])
except TypeError:
self.numInputs = 1
try:
self.numOutputs = len(self.labels[0])
except TypeError:
self.numOutputs = 1
class MLP:
def __init__(self, numInputs, numHidden, numOutputs, activationFunction):
# MLP architecture sizes
self.numInputs = numInputs
self.numHidden = numHidden
self.numOutputs = numOutputs
self.activationFunction = activationFunction.lower()
# MLP weights
self.IH_weights = np.random.rand(numInputs, numHidden) # Input -> Hidden
self.HO_weights = np.random.rand(numHidden, numOutputs) # Hidden -> Output
# MLP biases
self.IH_bias = np.zeros((1, numHidden))
self.HO_bias = np.zeros((1, numOutputs))
# Gradients corresponding to weight matrices computed during backprop
self.IH_w_gradients = np.zeros_like(self.IH_weights)
self.HO_w_gradients = np.zeros_like(self.HO_weights)
# Gradients corresponding to biases computed during backprop
self.IH_b_gradients = np.zeros_like(self.IH_bias)
self.HO_b_gradients = np.zeros_like(self.HO_bias)
# Input, hidden and output layer neuron values
self.I = np.zeros(numInputs) # Inputs
self.L = np.zeros(numOutputs) # Labels
self.H = np.zeros(numHidden) # Hidden
self.O = np.zeros(numOutputs) # Output
def activation(self, x, derivative=False):
if self.activationFunction == 'sigmoid':
if derivative:
return x * (1 - x)
return 1 / (1 + np.exp(-x))
if self.activationFunction == 'tanh':
if derivative:
return 1 - np.tanh(x) ** 2
return np.tanh(x)
if self.activationFunction == 'relu':
if derivative:
return (x > 0).astype(float)
return np.maximum(0, x)
# TO DO ################################################################
if self.activationFunction == 'softmax':
if derivative:
return 0
return 0
print("ERROR: Activation function not found.")
sys.exit()
def loss(self, labels, predictions, derivative=False):
# TO DO ################################################################
# Cross-Entropy
if self.activationFunction == 'softmax':
if derivative:
return 0
return 0
# Squared Error
else:
if derivative:
return (-2 * labels) + (2 * predictions)
return (labels - predictions) ** 2
def forward(self, inputs):
# Ensure that inputs is a list
try:
len(inputs)
except TypeError:
inputs = [inputs]
self.I = np.array(inputs).reshape(1, self.numInputs)
self.H = self.I.dot(self.IH_weights) + self.IH_bias
self.H = self.activation(self.H)
self.O = self.H.dot(self.HO_weights) + self.HO_bias
self.O = self.activation(self.O)
def backwards(self, labels):
# Ensure that labels is a list
try:
len(labels)
except TypeError:
labels = [labels]
self.L = np.array(labels)
self.O_error = self.loss(self.O, self.L)
self.O_delta = self.loss(self.O, self.L, derivative=True) * self.activation(self.O, derivative=True)
self.H_error = self.O_delta.dot(self.HO_weights.T)
self.H_delta = self.H_error * self.activation(self.H, derivative=True)
self.IH_w_gradients += self.I.T.dot(self.H_delta)
self.HO_w_gradients += self.H.T.dot(self.O_delta)
self.IH_b_gradients += self.H_delta
self.HO_b_gradients += self.O_delta
return self.O_error
def updateWeights(self, learningRate):
self.IH_weights += learningRate * self.IH_w_gradients
self.HO_weights += learningRate * self.HO_w_gradients
self.IH_bias += learningRate * self.IH_b_gradients
self.HO_bias += learningRate * self.HO_b_gradients
self.IH_w_gradients = np.zeros_like(self.IH_weights)
self.HO_w_gradients = np.zeros_like(self.HO_weights)
self.IH_b_gradients = np.zeros_like(self.IH_bias)
self.HO_b_gradients = np.zeros_like(self.HO_bias)
def process(self, data, train=False, learningRate=0):
if train:
size = data.trainSize
inputs = data.trainInputs
labels = data.trainLabels
else:
size = data.testSize
inputs = data.testInputs
labels = data.testLabels
errors = []
for i in range(size):
self.forward(inputs[i])
errors.append(self.backwards(labels[i]))
if train:
self.updateWeights(learningRate)
return np.mean(errors)
data1 = DataSet([
[[0, 0], 0],
[[0, 1], 1],
[[1, 0], 1],
[[1, 1], 0]
])
data2 = DataSet([
[[0, 0], -1],
[[0, 1], 1],
[[1, 0], 1],
[[1, 1], -1]
])
data3 = DataSet([
[[0, 0], [1, 0]],
[[0, 1], [0, 1]],
[[1, 0], [0, 1]],
[[1, 1], [1, 0]]
])
if activation == 'sigmoid':
data = data1
mlp = MLP(data.numInputs, 2, data.numOutputs, 'sigmoid')
learningRate = 1
if activation == 'tanh':
data = data2
mlp = MLP(data.numInputs, 2, data.numOutputs, 'tanh')
learningRate = 0.1
if activation == 'relu':
data = data1
mlp = MLP(data.numInputs, 2, data.numOutputs, 'relu')
learningRate = 0.001
if activation == 'softmax':
data = data3
mlp = MLP(data.numInputs, 2, data.numOutputs, 'softmax')
learningRate = 0.01
################################################################################
# TO DO: UPDATE WEIGHTS AT INTERVALS, NOT EVERY EPOCH
################################################################################
losses = []
for epoch in range(numEpochs):
epochLoss = mlp.process(data, train=True, learningRate=learningRate)
losses.append(epochLoss)
if epoch % 1000 == 0 or epoch == numEpochs - 1:
print("EPOCH:", epoch)
print("LOSS: ", epochLoss, "\n")
Unfortunately, softmax is not as easy as the other activation functions you have posted. For the activation function, you must calculate the exp(y_i) and then divide by the sum exp(y_k) for every y_k in Y. For the derivative, you must calculate every combination (n^2 combinations) of partial derivatives of every output wrt every input of the neuron. Luckily, the loss it is something a little bit easier to understand, since you can think about the softmax giving you some probabilities (so it resembles a probability distribution) and you calculate the Cross Entropy as is between the returned values and the target ones.

Neural network backpropagation algorithm not working in Python

I am writing a neural network in Python, following the example here. It seems that the backpropagation algorithm isn't working, given that the neural network fails to produce the right value (within a margin of error) after being trained 10 thousand times. Specifically, I am training it to compute the sine function in the following example:
import numpy as np
class Neuralnet:
def __init__(self, neurons):
self.weights = []
self.inputs = []
self.outputs = []
self.errors = []
self.rate = .1
for layer in range(len(neurons)):
self.inputs.append(np.empty(neurons[layer]))
self.outputs.append(np.empty(neurons[layer]))
self.errors.append(np.empty(neurons[layer]))
for layer in range(len(neurons)-1):
self.weights.append(
np.random.normal(
scale=1/np.sqrt(neurons[layer]),
size=[neurons[layer], neurons[layer + 1]]
)
)
def feedforward(self, inputs):
self.inputs[0] = inputs
for layer in range(len(self.weights)):
self.outputs[layer] = np.tanh(self.inputs[layer])
self.inputs[layer + 1] = np.dot(self.weights[layer].T, self.outputs[layer])
self.outputs[-1] = np.tanh(self.inputs[-1])
def backpropagate(self, targets):
gradient = 1 - self.outputs[-1] * self.outputs[-1]
self.errors[-1] = gradient * (self.outputs[-1] - targets)
for layer in reversed(range(len(self.errors) - 1)):
gradient = 1 - self.outputs[layer] * self.outputs[layer]
self.errors[layer] = gradient * np.dot(self.weights[layer], self.errors[layer + 1])
for layer in range(len(self.weights)):
self.weights[layer] -= self.rate * np.outer(self.outputs[layer], self.errors[layer + 1])
def xor_example():
net = Neuralnet([2, 2, 1])
for step in range(100000):
net.feedforward([0, 0])
net.backpropagate([-1])
net.feedforward([0, 1])
net.backpropagate([1])
net.feedforward([1, 0])
net.backpropagate([1])
net.feedforward([1, 1])
net.backpropagate([-1])
net.feedforward([1, 1])
print(net.outputs[-1])
def identity_example():
net = Neuralnet([1, 3, 1])
for step in range(100000):
x = np.random.normal()
net.feedforward([x])
net.backpropagate([np.tanh(x)])
net.feedforward([-2])
print(net.outputs[-1])
def sine_example():
net = Neuralnet([1, 6, 1])
for step in range(100000):
x = np.random.normal()
net.feedforward([x])
net.backpropagate([np.tanh(np.sin(x))])
net.feedforward([3])
print(net.outputs[-1])
sine_example()
The output fails to be close to tanh(sin(3)) = 0.140190616. I suspected a mistake involving wrong indices or alignment, but Numpy isn't raising any errors like these. Any tips on where I went wrong?
EDIT: I forgot to add the bias neurons. Here is the updated code:
import numpy as np
class Neuralnet:
def __init__(self, neurons):
self.weights = []
self.outputs = []
self.inputs = []
self.errors = []
self.offsets = []
self.rate = .01
for layer in range(len(neurons)-1):
self.weights.append(
np.random.normal(
scale=1/np.sqrt(neurons[layer]),
size=[neurons[layer], neurons[layer + 1]]
)
)
self.outputs.append(np.empty(neurons[layer]))
self.inputs.append(np.empty(neurons[layer]))
self.errors.append(np.empty(neurons[layer]))
self.offsets.append(np.random.normal(scale=1/np.sqrt(neurons[layer]), size=neurons[layer + 1]))
self.inputs.append(np.empty(neurons[-1]))
self.errors.append(np.empty(neurons[-1]))
def feedforward(self, inputs):
self.inputs[0] = inputs
for layer in range(len(self.weights)):
self.outputs[layer] = np.tanh(self.inputs[layer])
self.inputs[layer + 1] = self.offsets[layer] + np.dot(self.weights[layer].T, self.outputs[layer])
def backpropagate(self, targets):
self.errors[-1] = self.inputs[-1] - targets
for layer in reversed(range(len(self.errors) - 1)):
gradient = 1 - self.outputs[layer] * self.outputs[layer]
self.errors[layer] = gradient * np.dot(self.weights[layer], self.errors[layer + 1])
for layer in range(len(self.weights)):
self.weights[layer] -= self.rate * np.outer(self.outputs[layer], self.errors[layer + 1])
self.offsets[layer] -= self.rate * self.errors[layer + 1]
def sine_example():
net = Neuralnet([1, 5, 1])
for step in range(10000):
x = np.random.uniform(-5, 5)
net.feedforward([x])
net.backpropagate([np.sin(x)])
net.feedforward([np.pi])
print(net.inputs[-1])
def xor_example():
net = Neuralnet([2, 2, 1])
for step in range(10000):
net.feedforward([0, 0])
net.backpropagate([-1])
net.feedforward([0, 1])
net.backpropagate([1])
net.feedforward([1, 0])
net.backpropagate([1])
net.feedforward([1, 1])
net.backpropagate([-1])
net.feedforward([1, 1])
print(net.outputs[-1])
def identity_example():
net = Neuralnet([1, 3, 1])
for step in range(10000):
x = np.random.normal()
net.feedforward([x])
net.backpropagate([x])
net.feedforward([-2])
print(net.outputs[-1])
identity_example()
I think you train the NN in the wrong way. You have a loop over 10000 iterations and feed a new sample in each cycle. The NN will never get trained in this case.
(the statement is wrong! See the update! )
What you need to do is to generate a large array of true samples Y = sin(X), give it to your network ONCE and iterate over the training set forwards and backwards, in order to minimize the cost function. To check the algorithm you may need to plot the cost function depending on the iteration number and make sure the cost goes down.
Another important point is the initialization of the weights. Your numbers are pretty large and the network will take a lot of time to converge, especially when using low rates. It's a good practice to generate the initial weights in some small range [-eps .. eps] uniformly.
In my code I implemented two different activation functions: sigmoid() and tanh(). You need to scale your inputs depending on the selected function: [0 .. 1] and [-1 .. 1] respectively.
Here are some images which show the cost function and the resulting predictions for sigmoid() and tanh() activation functions:
As you can see the sigmoid() activation gives a little bit better results, than the tanh().
Also I got much better predictions when using a network [1, 6, 1], compared to a bigger network with 4 layers [1, 6, 4, 1]. So the size of the NN is not always the crucial factor. Here is the prediction for the mentioned network with 4 layers:
Here is my code with some comments. I tried to use your notations where it was possible.
import numpy as np
import math
import matplotlib.pyplot as plt
class Neuralnet:
def __init__(self, neurons, activation):
self.weights = []
self.inputs = []
self.outputs = []
self.errors = []
self.rate = 0.5
self.activation = activation #sigmoid or tanh
self.neurons = neurons
self.L = len(self.neurons) #number of layers
eps = 0.12; # range for uniform distribution -eps..+eps
for layer in range(len(neurons)-1):
self.weights.append(np.random.uniform(-eps,eps,size=(neurons[layer+1], neurons[layer]+1)))
###################################################################################################
def train(self, X, Y, iter_count):
m = X.shape[0];
for layer in range(self.L):
self.inputs.append(np.empty([m, self.neurons[layer]]))
self.errors.append(np.empty([m, self.neurons[layer]]))
if (layer < self.L -1):
self.outputs.append(np.empty([m, self.neurons[layer]+1]))
else:
self.outputs.append(np.empty([m, self.neurons[layer]]))
#accumulate the cost function
J_history = np.zeros([iter_count, 1])
for i in range(iter_count):
self.feedforward(X)
J = self.cost(Y, self.outputs[self.L-1])
J_history[i, 0] = J
self.backpropagate(Y)
#plot the cost function to check the descent
plt.plot(J_history)
plt.show()
###################################################################################################
def cost(self, Y, H):
J = np.sum(np.sum(np.power((Y - H), 2), axis=0))/(2*m)
return J
###################################################################################################
def feedforward(self, X):
m = X.shape[0];
self.outputs[0] = np.concatenate( (np.ones([m, 1]), X), axis=1)
for i in range(1, self.L):
self.inputs[i] = np.dot( self.outputs[i-1], self.weights[i-1].T )
if (self.activation == 'sigmoid'):
output_temp = self.sigmoid(self.inputs[i])
elif (self.activation == 'tanh'):
output_temp = np.tanh(self.inputs[i])
if (i < self.L - 1):
self.outputs[i] = np.concatenate( (np.ones([m, 1]), output_temp), axis=1)
else:
self.outputs[i] = output_temp
###################################################################################################
def backpropagate(self, Y):
self.errors[self.L-1] = self.outputs[self.L-1] - Y
for i in range(self.L - 2, 0, -1):
if (self.activation == 'sigmoid'):
self.errors[i] = np.dot( self.errors[i+1], self.weights[i][:, 1:] ) * self.sigmoid_prime(self.inputs[i])
elif (self.activation == 'tanh'):
self.errors[i] = np.dot( self.errors[i+1], self.weights[i][:, 1:] ) * (1 - self.outputs[i][:, 1:]*self.outputs[i][:, 1:])
for i in range(0, self.L-1):
grad = np.dot(self.errors[i+1].T, self.outputs[i]) / m
self.weights[i] = self.weights[i] - self.rate*grad
###################################################################################################
def sigmoid(self, z):
s = 1.0/(1.0 + np.exp(-z))
return s
###################################################################################################
def sigmoid_prime(self, z):
s = self.sigmoid(z)*(1 - self.sigmoid(z))
return s
###################################################################################################
def predict(self, X, weights):
m = X.shape[0];
self.inputs = []
self.outputs = []
self.weights = weights
for layer in range(self.L):
self.inputs.append(np.empty([m, self.neurons[layer]]))
if (layer < self.L -1):
self.outputs.append(np.empty([m, self.neurons[layer]+1]))
else:
self.outputs.append(np.empty([m, self.neurons[layer]]))
self.feedforward(X)
return self.outputs[self.L-1]
###################################################################################################
# MAIN PART
activation1 = 'sigmoid' # the input should be scaled into [ 0..1]
activation2 = 'tanh' # the input should be scaled into [-1..1]
activation = activation1
net = Neuralnet([1, 6, 1], activation) # structure of the NN and its activation function
##########################################################################################
# TRAINING
m = 1000 #size of the training set
X = np.linspace(0, 4*math.pi, num = m).reshape(m, 1); # input training set
Y = np.sin(X) # target
kx = 0.1 # noise parameter
noise = (2.0*np.random.uniform(0, kx, m) - kx).reshape(m, 1)
Y = Y + noise # noisy target
# scaling of the target depending on the activation function
if (activation == 'sigmoid'):
Y_scaled = (Y/(1+kx) + 1)/2.0
elif (activation == 'tanh'):
Y_scaled = Y/(1+kx)
# number of the iteration for the training stage
iter_count = 20000
net.train(X, Y_scaled, iter_count) #training
# gained weights
trained_weights = net.weights
##########################################################################################
# PREDICTION
m_new = 40 #size of the prediction set
X_new = np.linspace(0, 4*math.pi, num = m_new).reshape(m_new, 1);
Y_new = net.predict(X_new, trained_weights) # prediction
#rescaling of the result
if (activation == 'sigmoid'):
Y_new = (2.0*Y_new - 1.0) * (1+kx)
elif (activation == 'tanh'):
Y_new = Y_new * (1+kx)
# visualization
plt.plot(X, Y)
plt.plot(X_new, Y_new, 'ro')
plt.show()
raw_input('press any key to exit')
UPDATE
I would like to take back the statement regarding the training method used in your code. The network can be indeed trained using only one sample per iteration. I got interesting results in online-training using both sigmoid and tanh activation functions:
Online-training using Sigmoid (cost function and prediction)
Online-training using Tanh (cost function and prediction)
As can be seen the choice of Sigmoid as activation function gives better performance. The cost function looks not that good as during the offline-training, but at least it tends to go down.
I plotted the cost function in your implementation, it looks pretty jerky as well:
May be it is a good idea to try your code with the sigmoid or even the ReLU function.
Here is the updated source code. To switch between online and offline training modes just change the method variable.
import numpy as np
import math
import matplotlib.pyplot as plt
class Neuralnet:
def __init__(self, neurons, activation):
self.weights = []
self.inputs = []
self.outputs = []
self.errors = []
self.rate = 0.2
self.activation = activation #sigmoid or tanh
self.neurons = neurons
self.L = len(self.neurons) #number of layers
eps = 0.12; #range for uniform distribution -eps..+eps
for layer in range(len(neurons)-1):
self.weights.append(np.random.uniform(-eps,eps,size=(neurons[layer+1], neurons[layer]+1)))
###################################################################################################
def train(self, X, Y, iter_count):
m = X.shape[0];
for layer in range(self.L):
self.inputs.append(np.empty([m, self.neurons[layer]]))
self.errors.append(np.empty([m, self.neurons[layer]]))
if (layer < self.L -1):
self.outputs.append(np.empty([m, self.neurons[layer]+1]))
else:
self.outputs.append(np.empty([m, self.neurons[layer]]))
#accumulate the cost function
J_history = np.zeros([iter_count, 1])
for i in range(iter_count):
self.feedforward(X)
J = self.cost(Y, self.outputs[self.L-1])
J_history[i, 0] = J
self.backpropagate(Y)
#plot the cost function to check the descent
#plt.plot(J_history)
#plt.show()
###################################################################################################
def cost(self, Y, H):
J = np.sum(np.sum(np.power((Y - H), 2), axis=0))/(2*m)
return J
###################################################################################################
def cost_online(self, min_x, max_x, iter_number):
h_arr = np.zeros([iter_number, 1])
y_arr = np.zeros([iter_number, 1])
for step in range(iter_number):
x = np.random.uniform(min_x, max_x, 1).reshape(1, 1)
self.feedforward(x)
h_arr[step, 0] = self.outputs[-1]
y_arr[step, 0] = np.sin(x)
J = np.sum(np.sum(np.power((y_arr - h_arr), 2), axis=0))/(2*iter_number)
return J
###################################################################################################
def feedforward(self, X):
m = X.shape[0];
self.outputs[0] = np.concatenate( (np.ones([m, 1]), X), axis=1)
for i in range(1, self.L):
self.inputs[i] = np.dot( self.outputs[i-1], self.weights[i-1].T )
if (self.activation == 'sigmoid'):
output_temp = self.sigmoid(self.inputs[i])
elif (self.activation == 'tanh'):
output_temp = np.tanh(self.inputs[i])
if (i < self.L - 1):
self.outputs[i] = np.concatenate( (np.ones([m, 1]), output_temp), axis=1)
else:
self.outputs[i] = output_temp
###################################################################################################
def backpropagate(self, Y):
self.errors[self.L-1] = self.outputs[self.L-1] - Y
for i in range(self.L - 2, 0, -1):
if (self.activation == 'sigmoid'):
self.errors[i] = np.dot( self.errors[i+1], self.weights[i][:, 1:] ) * self.sigmoid_prime(self.inputs[i])
elif (self.activation == 'tanh'):
self.errors[i] = np.dot( self.errors[i+1], self.weights[i][:, 1:] ) * (1 - self.outputs[i][:, 1:]*self.outputs[i][:, 1:])
for i in range(0, self.L-1):
grad = np.dot(self.errors[i+1].T, self.outputs[i]) / m
self.weights[i] = self.weights[i] - self.rate*grad
###################################################################################################
def sigmoid(self, z):
s = 1.0/(1.0 + np.exp(-z))
return s
###################################################################################################
def sigmoid_prime(self, z):
s = self.sigmoid(z)*(1 - self.sigmoid(z))
return s
###################################################################################################
def predict(self, X, weights):
m = X.shape[0];
self.inputs = []
self.outputs = []
self.weights = weights
for layer in range(self.L):
self.inputs.append(np.empty([m, self.neurons[layer]]))
if (layer < self.L -1):
self.outputs.append(np.empty([m, self.neurons[layer]+1]))
else:
self.outputs.append(np.empty([m, self.neurons[layer]]))
self.feedforward(X)
return self.outputs[self.L-1]
###################################################################################################
# MAIN PART
activation1 = 'sigmoid' #the input should be scaled into [0..1]
activation2 = 'tanh' #the input should be scaled into [-1..1]
activation = activation1
net = Neuralnet([1, 6, 1], activation) # structure of the NN and its activation function
method1 = 'online'
method2 = 'offline'
method = method1
kx = 0.1 #noise parameter
###################################################################################################
# TRAINING
if (method == 'offline'):
m = 1000 #size of the training set
X = np.linspace(0, 4*math.pi, num = m).reshape(m, 1); #input training set
Y = np.sin(X) #target
noise = (2.0*np.random.uniform(0, kx, m) - kx).reshape(m, 1)
Y = Y + noise #noisy target
#scaling of the target depending on the activation function
if (activation == 'sigmoid'):
Y_scaled = (Y/(1+kx) + 1)/2.0
elif (activation == 'tanh'):
Y_scaled = Y/(1+kx)
#number of the iteration for the training stage
iter_count = 20000
net.train(X, Y_scaled, iter_count) #training
elif (method == 'online'):
sampling_count = 100000 # number of samplings during the training stage
m = 1 #batch size
iter_count = sampling_count/m
for layer in range(net.L):
net.inputs.append(np.empty([m, net.neurons[layer]]))
net.errors.append(np.empty([m, net.neurons[layer]]))
if (layer < net.L -1):
net.outputs.append(np.empty([m, net.neurons[layer]+1]))
else:
net.outputs.append(np.empty([m, net.neurons[layer]]))
J_history = []
step_history = []
for i in range(iter_count):
X = np.random.uniform(0, 4*math.pi, m).reshape(m, 1)
Y = np.sin(X) #target
noise = (2.0*np.random.uniform(0, kx, m) - kx).reshape(m, 1)
Y = Y + noise #noisy target
#scaling of the target depending on the activation function
if (activation == 'sigmoid'):
Y_scaled = (Y/(1+kx) + 1)/2.0
elif (activation == 'tanh'):
Y_scaled = Y/(1+kx)
net.feedforward(X)
net.backpropagate(Y_scaled)
if (np.remainder(i, 1000) == 0):
J = net.cost_online(0, 4*math.pi, 1000)
J_history.append(J)
step_history.append(i)
plt.plot(step_history, J_history)
plt.title('Batch size ' + str(m) + ', rate ' + str(net.rate) + ', samples ' + str(sampling_count))
#plt.ylim([0, 0.1])
plt.show()
#gained weights
trained_weights = net.weights
##########################################################################################
# PREDICTION
m_new = 40 #size of the prediction set
X_new = np.linspace(0, 4*math.pi, num = m_new).reshape(m_new, 1);
Y_new = net.predict(X_new, trained_weights) #prediction
#rescaling of the result
if (activation == 'sigmoid'):
Y_new = (2.0*Y_new - 1.0) * (1+kx)
elif (activation == 'tanh'):
Y_new = Y_new * (1+kx)
#visualization
#fake sine curve to show the ideal signal
if (method == 'online'):
X = np.linspace(0, 4*math.pi, num = 100)
Y = np.sin(X)
plt.plot(X, Y)
plt.plot(X_new, Y_new, 'ro')
if (method == 'online'):
plt.title('Batch size ' + str(m) + ', rate ' + str(net.rate) + ', samples ' + str(sampling_count))
plt.ylim([-1.5, 1.5])
plt.show()
raw_input('press any key to exit')
Now I have some remarks to your current code:
Your sine function looks like this:
def sine_example():
net = Neuralnet([1, 6, 1])
for step in range(100000):
x = np.random.normal()
net.feedforward([x])
net.backpropagate([np.tanh(np.sin(x))])
net.feedforward([3])
print(net.outputs[-1])
I don't know why you use tanh in your target input. If you really want to use tanh of sine as target, you need to scale it to [-1..1], because tanh(sin(x)) returns values in range [-0.76..0.76].
The next thing is the range of your training set. You use x = np.random.normal() to generate the samples. Here is the distribution of such an input:
After it you want your network to predict the sine of 3, but the network has almost never seen this number during the training stage. I would use the uniform distribution in a wider range for sample generation instead.

Categories