Simple convolutional neural network - python

import numpy as np
from keras.datasets import mnist
import time
# Functions
def sigmoid(x):
return 1.0/(1.0 + np.exp(-x))
def sigmoid_derivative(x):
return sigmoid(x)*(1-sigmoid(x))
def relu(x):
return np.maximum(0,x)
def relu_derivative(x):
return np.greater(x, 0).astype(int)
def softmax(x):
exps = np.exp(x - x.max())
return exps / np.sum(exps)
# Import and Create Dataset
(x_train, y_train), (x_test, y_test) = mnist.load_data()
# Setup
np.random.seed(seed=12345)
alpha = 0.05
# Initialize Network Values
# Layers
input_layer = np.zeros(shape=(28,28))
convolutional_layer = np.zeros(shape=(10,24,24))
pooling_layer = np.zeros(shape=(10,12,12))
flattened_layer = np.reshape(pooling_layer,newshape=(1440,1))
dense_layer = np.zeros(shape=(100,1))
output_layer = np.zeros(shape=(10,1))
# Filters and Weights
convolution_filters = np.random.rand(10,5,5)
weights1 = np.random.rand(100,1440)
weights2 = np.random.rand(10,100)
# Bias
dense_layer_bias = np.ones(shape=(100,1))
output_layer_bias = np.ones(shape=(10,1))
convolution_bias = np.ones(shape=(10,5,5))
for epoch in range(1):
print(np.mean(weights1),np.mean(weights2),np.mean(convolution_filters))
for sample in range(20):
# Get Input Data
input = x_train[sample]
# Target Data
target = np.zeros((10,1))
target[y_train[sample]][0] = 1
# Feed Forward Input to Convolution Layer
i=j=k=0
for i in range(10):
for j in range(24):
for k in range(24):
minimatrix = input[j:j+5, k:k+5]
convolutional_layer[i][j][k] = np.sum(minimatrix * convolution_filters[i] + convolution_bias[i])
# Pooling Layer
i=j=k=0
for i in range(10):
for j in range(12):
for k in range(12):
minimatrix = convolutional_layer[i,j*2:j*2+2,k*2:k*2+2]
pooling_layer[i][j][k] = relu(minimatrix.max())
# Flattening Layer
flattened_layer = np.reshape(pooling_layer,newshape=(1440,1))
# Feed Forward - DENSE_LAYER
dense_layer = relu(np.dot(weights1,flattened_layer) + dense_layer_bias)
# Feed Forward - OUTPUT_LAYER
output_layer = softmax(np.dot(weights2,dense_layer) + output_layer_bias)
# Backpropogation - OUTPUT_LAYER
delta = output_layer - target
weights2gradient = np.dot(delta,dense_layer.T)
output_layer_bias_gradient = delta
# Backpropogation - DENSE_LAYER
delta = np.dot(weights2.T,delta) * relu_derivative(dense_layer)
weights1gradient = np.dot(delta,flattened_layer.T)
dense_layer_bias_gradient = delta
# Backpropogation - POOLING_LAYER
delta = np.reshape(np.dot(weights1.T,delta),newshape=(10,12,12)) * relu_derivative(pooling_layer) # find delta at pooling layer
# Backpropagation - TRANSPOSE FOR CALCULATIONS
delta = np.array([delta[i].T for i in range(len(delta))]) # Math says this has to happen
# Gradient For Backward Pass
pooling_backward_pass = np.zeros(shape=(10,24,24)) # matrix for passing adjusted cost
# BACKWARD POOLING GRADIENT PASS
i=j=k=0
for i in range(10):
for j in range(12):
for k in range(12):
minimatrix = convolutional_layer[i,j*2:j*2+2,k*2:k*2+2]
maxvalindex = np.argmax(minimatrix)
pooling_backward_pass[i, j*2+(maxvalindex // 2), k+(maxvalindex % 2)] += delta[i,j,k]
# Backpropogation - CONVOLUTION LAYER
convolution_gradient = np.zeros(shape=(10,5,5))
convolution_bias_gradient = np.zeros(shape=(10,5,5))
i=j=k=0
for i in range(10):
for j in range(24):
for k in range(24):
minimatrix = input[j:j+5, k:k+5]
convolution_gradient[i] += pooling_backward_pass[i,j,k] * minimatrix
convolution_bias_gradient[i] += pooling_backward_pass[i,j,k]
# Weight and Filter Adjustments
weights2 -= weights2gradient * alpha
weights1 -= weights1gradient * alpha
convolution_filters -= convolution_gradient * alpha
# Bias Adjustments
dense_layer_bias -= dense_layer_bias_gradient * alpha
output_layer_bias -= output_layer_bias_gradient * alpha
convolution_bias -= convolution_gradient * alpha
print(np.mean(weights1),np.mean(weights2),np.mean(convolution_filters))
I have been working on this code for a while now and I am (almost) certain the basic functionality should work, but I am getting no changes to the weights of the network. I am specifically trying to understand neural networks without the abstraction offered by the actual frameworks. Is there a Python scope issue that is keeping the weights from updating?

Related

Neural Network from scratch : matrix version with numpy (debugging)

Recently I've been trying to create a Neural Network from scratch in Python. I've chosen the matrix way using numpy.
My Neural Network has 1 input layer, 1 hidden layer with 10 nodes and 1 output layer with 1 node. The activating function is the same for all the layer and is the ReLU function.
The cost function is based on the Mean Squared Error.
However, at the first iteration, the correction on the weight / bias matrix has very big numbers. Meaninng that first iteration correction bias / weight matrix get only negative umbers so that at next iterations, prediction for every samples in dataset is "0" because of the ReLU function.
So the implementation doesn't seems to work completely right. I guess that maybe I'm wrong with the derivative of the cost function.
from os import sep
import numpy as np
import pandas as pd
#from sklearn.model_selection import train_test_split
from src.NeuralNetwork import NeuralNetwork
# read training dataset
# 11 categories => 11 inputs ; 1 output (wine quality)
red_wine_dataset = pd.read_csv('./dataset/winequality-red.csv', sep=";").to_numpy()
# get dimensions: n lines / m columns
n, m = red_wine_dataset.shape
# number of different wines in the dataset
# we split the dataset in two so we have n/2
NUMBER_OF_DATA = int(n/2)
# number of categories = number of inputs = m - 1 (because 1 column is the true output = y)
NUMBER_OF_NODES_INPUTS = m - 1
# number of nodes for hidden layers
NUMBER_OF_NODES_HIDDEN_LAYER = 10
# number of nodes for output layer
NUMBER_OF_NODES_OUTPUT_LAYER = 1
# number of iterations for the algorithm
EPOCH = 1000
# learning rate (eta)
LEARNING_RATE = 0.05
# split dataset into two : a training dataset / a testing dataset
training_dataset = red_wine_dataset[0:NUMBER_OF_DATA]
training_inputs = training_dataset[:,0:m - 1] # wine classifying categories (acidity, sugar, ph,...)
training_output = training_dataset[:,m - 1] # wine quality
#print(training_output)
testing_dataset = red_wine_dataset[NUMBER_OF_DATA:n]
testing_inputs = testing_dataset[0:m - 1] # wine classifying categories (acidity, sugar, ph,...)
testing_output = testing_dataset[m - 1] # wine quality
# architecture of the neural network
# number of nodes by layer
layers = np.array([NUMBER_OF_NODES_INPUTS, NUMBER_OF_NODES_HIDDEN_LAYER, NUMBER_OF_NODES_OUTPUT_LAYER])
if __name__ == "__main__":
NN = NeuralNetwork(dataset=training_dataset, layers=layers)
NN.gradient_descent(X=training_inputs, Y=training_output, epoch=EPOCH, learning_rate=LEARNING_RATE)
import numpy as np
import matplotlib.pyplot as plt
from src.utils import ReLU, ReLU_derivative, mean_squared_error
class NeuralNetwork:
def __init__(self, dataset, layers) -> None:
"""[summary]
NOTATION:
X = training input (A0)
Y = training output (y_true)
Y_hat = predicted output (y_pred) = = activated output associated with the last layer (that is the output layer)
Wi = weight matrix associated with i-th layer
Bi = bias matrix associated with i-th layer
Zi = (A_{i-1} \cdot Wi) + Bi = output matrix associated with i-th layer
Ai = f(Zi) = activated output associated with i-th layer where f is the activation function (ex: ReLU)
L = Loss function (ex: MSE)
Args:
architecture
"""
# parameters of the neural network
self.parameters = {}
# partial derivatives to update the parameters (weight, bias) of the neural network
self.derivatives = {}
# number of the last layer (the output layer)
# number of layers - 1 because numerotation begins at 0.
self.N = layers.size - 1
# number of samples in the dataset
self.m = dataset[:,0].size
#print(dataset[0].size)
# initialize neural network parameters
for i in range(1, self.N + 1):
self.parameters[f"W{str(i)}"] = np.random.uniform(size=(layers[i - 1], layers[i]))
self.parameters[f"B{str(i)}"] = np.random.uniform(size=(self.m, layers[i]))
self.parameters[f"Z{str(i)}"] = np.ones((self.m, layers[i]))
self.parameters[f"A{str(i)}"] = np.ones((self.m, layers[i]))
# initialize cost function value
self.parameters["C"] = 1
def forward_propagate(self, X):
# initial the neural network with the input dataset
self.parameters["A0"] = X
# forward propagate each subsequent layers
for i in range(1, self.N + 1):
# Z^i = (A^{i-1} \cdot W^i) + B^i
Zi = (self.parameters[f"A{str(i-1)}"] # self.parameters[f"W{str(i)}"]) + self.parameters[f"B{str(i)}"]
self.parameters[f"Z{str(i)}"] = Zi
# A^i = f(Z^i)
print(f"A{str(i-1)}", self.parameters[f"A{str(i-1)}"])
print(f"W{str(i)}", self.parameters[f"W{str(i)}"])
print(f"B{str(i)}", self.parameters[f"B{str(i)}"])
self.parameters[f"A{str(i)}"] = ReLU(Zi)
#print(ReLU(Zi))
# for key, value in self.parameters.items():
# if type(value) is int:
# print(key, value)
# else:
# print(key, value.shape)
# print("================================")
def backward_propagate(self, X, Y):
# compute derivatives of our loss function
# we go backward
# partial derivatives for the last layer
dL_dAN = np.sum(self.parameters[f"A{str(self.N)}"] - Y)
# Hadamar product: "*"
dL_dZN = dL_dAN * ReLU_derivative(self.parameters[f"Z{str(self.N)}"])
#print(dL_dZN.T.shape, self.parameters[f"A{str(self.N - 1)}"].shape)
dL_dWN = dL_dZN.T # self.parameters[f"A{str(self.N - 1)}"]
dL_dBN = dL_dZN
self.derivatives[f"dLdZ{str(self.N)}"] = dL_dZN
self.derivatives[f"dLdW{str(self.N)}"] = dL_dWN.T
self.derivatives[f"dLdB{str(self.N)}"] = dL_dBN
# partial derivatives for the subsequent layers
for i in range(self.N - 1, 0, -1):
# Hadamar product
# "*" is to multiply entries 1 by 1 => shape: (m, n) * (m, n) for any m, n
dL_dZi = ((self.derivatives[f"dLdZ{str(i + 1)}"]) # self.parameters[f"W{str(i + 1)}"].T) * ReLU_derivative(self.parameters[f"Z{str(i)}"])
dL_dWi = self.parameters[f"A{str(i - 1)}"].T # dL_dZi
dL_dBi = dL_dZi
self.derivatives[f"dLdZ{str(i)}"] = dL_dZi
self.derivatives[f"dLdW{str(i)}"] = dL_dWi
self.derivatives[f"dLdB{str(i)}"] = dL_dBi
def update_weights_and_bias(self, learning_rate):
for i in range(1, self.N + 1):
self.parameters[f"W{str(i)}"] -= learning_rate * self.derivatives[f"dLdW{str(i)}"]
self.parameters[f"B{str(i)}"] -= learning_rate * self.derivatives[f"dLdB{str(i)}"]
def gradient_descent(self, X, Y, epoch, learning_rate):
cost_history = []
for i in range(epoch):
if i == 2:
return
self.forward_propagate(X)
self.backward_propagate(X, Y)
self.update_weights_and_bias(learning_rate)
cost = mean_squared_error(Y=Y, Y_hat=self.parameters[f"A{str(self.N)}"])
cost_history.append(cost)
#print(cost)
# if (i % 10 == 0):
# print(f"iteration: {i}")
# predictions = np.argmax(self.parameters[f"A{str(self.N)}"], 0)
# print(predictions, Y)
# print(np.sum(predictions == Y) / Y.size)
plt.plot(range(epoch), cost_history)
plt.show()
import numpy as np
# useful activation functions for hidden layer 1 / output layer
"""
#param x: np.array
"""
def sigmoid(x):
return 1 / (1 - np.exp(x))
"""
#param x: np.array
"""
def ReLU(x):
return np.maximum(0, x)
def ReLU_derivative(x):
return x > 0
# loss function
"""
#param y_true: np.array
#param y_pred: np.array
J(y_true, y_pred) = 1/n \sum_{i=1}^{n} (y_true - y_pred)^2
"""
def mean_squared_error(Y, Y_hat):
#print("Y", Y)
#print("Y_hat", np.sum(Y_hat), Y_hat.shape)
#print("somme", sum((Y - Y_hat) ** 2))
return (1 / 2) * np.sum((Y - Y_hat) ** 2)

Scipy fails to minimize cost function

Currently I'm learning from Andrew Ng course on Coursera called "Machine Learning". In exercise 5, we built a model that can predict digits, trained by the MNIST dataset. This task was completed successfully in Matlab by me, but I wanted to migrate that code to Python, just to see how different things are and maybe continue to play around with the model.
I managed to implement the cost function and the back propagation algorithm correctly. I know that because I compared the metrics with my working model in Matlab and it emits the same numbers.
Now, because in the course we train the model using fmincg, I tried to do the same using Scipy fmin_cg
function.
My problem is, the cost function takes extra small steps and fails to converge.
Here is my code for the network:
import numpy as np
import utils
import scipy.optimize as op
class Network:
def __init__(self, layers):
self.layers = layers
self.weights = self.generate_params()
# Function for generating theta multidimensional matrix
def generate_params(self):
theta = []
epsilon = 0.12
for i in range(len(self.layers) - 1):
current_layer_units = self.layers[i]
next_layer_units = self.layers[i + 1]
theta_i = np.multiply(
np.random.rand(next_layer_units, current_layer_units + 1),
2 * epsilon - epsilon
)
# Appending the params to the theta matrix
theta.append(theta_i)
return theta
# Function to append bias row/column to matrix X
def append_bias(self, X, d):
m = X.shape[0]
n = 1 if len(X.shape) == 1 else X.shape[1]
if (d == 'column'):
ones = np.ones((m, n + 1))
ones[:, 1:] = X.reshape((m, n))
elif (d == 'row'):
ones = np.ones((m + 1, n))
ones[1:, :] = X.reshape((m, n))
return ones
# Function for computing the gradient for 1 training example
def back_prop(self, y, feed, theta):
activations = feed["activations"]
weighted_layers = feed["weighted_layers"]
delta_output = activations[-1] - y.reshape(len(y), 1)
current_delta = delta_output
# Initializing gradients
gradients = []
for i, theta_i in enumerate(theta):
gradients.append(np.zeros(theta_i.shape))
# Peforming delta calculations.
# Here, we continue to propagate the delta values backwards
# until we arrive to the second layer.
for i in reversed(range(len(theta))):
theta_i = theta[i]
if (i > 0):
i_weighted_inputs = self.append_bias(weighted_layers[i - 1], 'row')
t_theta_i = np.transpose(theta_i)
delta_i = np.multiply(np.dot(t_theta_i, current_delta), utils.sigmoidGradient(i_weighted_inputs))
delta_i = delta_i[1:]
gradients[i] = current_delta * np.transpose(activations[i])
# Setting current delta for the next layer
current_delta = delta_i
else:
gradients[i] = current_delta * np.transpose(activations[i])
return gradients
# Function for computing the cost and the derivatives
def compute_cost(self, theta, X, y, r12n = 0):
m = len(X)
num_labels = self.layers[-1]
costs = np.zeros(m)
# Initializing gradients
gradients = []
for i, theta_i in enumerate(theta):
gradients.append(np.zeros(theta_i.shape))
# Iterating over the training set
for i in range(m):
inputs = X[i]
observed = utils.create_output_vector(y[i], num_labels)
feed = self.feed_forward(inputs)
predicted = feed["activations"][-1]
total_cost = 0
for k, o in enumerate(observed):
if (o == 1):
total_cost += np.log(predicted[k])
else:
total_cost += np.log(1 - predicted[k])
cost = -1 * total_cost
# Storing the cost for the i-th training example
costs[i] = cost
# Calculating the gradient for this training example
# using back propagation algorithm
gradients_i = self.back_prop(observed, feed, theta)
for i, gradient in enumerate(gradients_i):
gradients[i] += gradient
# Calculating the avg regularization term for the cost
sum_of_theta = 0
for i, theta_i in enumerate(theta):
squared_theta = np.power(theta_i[:, 1:], 2)
sum_of_theta += np.sum(squared_theta)
r12n_avg = r12n * sum_of_theta / (2 * m)
total_cost = np.sum(costs) / m + r12n_avg
# Applying regularization terms to the gradients
for i, theta_i in enumerate(theta):
lambda_i = np.copy(theta_i)
lambda_i[:, 0] = 0
lambda_i = np.multiply((r12n / m), lambda_i)
# Adding the r12n matrix to the gradient
gradients[i] = gradients[i] / m + lambda_i
return total_cost, gradients
# Function for training the neural network using conjugate gradient algorithm
def train_cg(self, X, y, r12n = 0, iterations = 50):
weights = self.weights
def Cost(theta, X, y):
theta = utils.roll_theta(theta, self.layers)
cost, _ = self.compute_cost(theta, X, y, r12n)
print(cost);
return cost
def Gradient(theta, X, y):
theta = utils.roll_theta(theta, self.layers)
_, gradient = self.compute_cost(theta, X, y, r12n)
return utils.unroll_theta(gradient)
unrolled_theta = utils.unroll_theta(weights)
result = op.fmin_cg(f = Cost,
x0 = unrolled_theta,
args=(X, y),
fprime=Gradient,
maxiter = iterations)
self.weights = utils.roll_theta(result, self.layers)
# Function for feeding forward the network
def feed_forward(self, X):
# Useful variables
activations = []
weighted_layers = []
weights = self.weights
currentActivations = self.append_bias(X, 'row')
activations.append(currentActivations)
for i in range(len(self.layers) - 1):
layer_weights = weights[i]
weighted_inputs = np.dot(layer_weights, currentActivations)
# Storing the weighted inputs
weighted_layers.append(weighted_inputs)
activation_nodes = []
# If the next layer is not the output layer, we'd like to add a bias unit to it
# (Excluding the input and the output layer)
if (i < len(self.layers) - 2):
activation_nodes = self.append_bias(utils.sigmoid(weighted_inputs), 'row')
else:
activation_nodes = utils.sigmoid(weighted_inputs)
# Appending the layer of nodes to the activations array
activations.append(activation_nodes)
currentActivations = activation_nodes
data = {
"activations": activations,
"weighted_layers": weighted_layers
}
return data
def predict(self, X):
data = self.feed_forward(X)
output = data["activations"][-1]
# Finding the max index in the output layer
return np.argmax(output, axis=0)
Here is the invocation of the code:
import numpy as np
from network import Network
# %% Load data
X = np.genfromtxt('data/mnist_data.csv', delimiter=',')
y = np.genfromtxt('data/mnist_outputs.csv', delimiter=',').astype(int)
# %% Create network
num_labels = 10
input_layer = 400
hidden_layer = 25
output_layer = num_labels
layers = [input_layer, hidden_layer, output_layer]
# Create a new neural network
network = Network(layers)
# %% Train the network and save the weights
network.train_cg(X, y, r12n = 1, iterations = 20)
This is what the code emits after each iteration:
15.441233231650283
15.441116436313076
15.441192262452514
15.44122384651483
15.441231216030646
15.441232804294314
15.441233141284435
15.44123321255294
15.441233227614855
As you can see, the changes to the cost are very small.
I checked for the shapes of the vectors and gradient and they both seem fine, just like in my Matlab implementation. I'm not sure what I do wrong here.
If you guys could help me, that'd be great :)

Neural network can learn |sin(x)| for [0,pi] but not [0,2pi] or [0, 4pi]

My neural network can learn |sin(x)| for [0,pi], but not larger intervals than that. I tried changing the quantity and widths of hidden layers in various ways, but none of the changes leads to a good result.
I train the NN on thousands of random values from a uniform distribution in the chosen interval. using back propagation with gradient descent.
I am starting to think there is a fundamental problem in my network.
For the following examples I used a 1-10-10-1 layer structure:
[0, pi]:
[0, 2pi]:
[0, 4pi]:
Here is the code for the neural network:
import math
import numpy
import random
import copy
import matplotlib.pyplot as plt
def sigmoid(x):
return 1.0/(1+ numpy.exp(-x))
def sigmoid_derivative(x):
return x * (1.0 - x)
class NeuralNetwork:
def __init__(self, weight_dimensions, x=None, y=None):
self.weights = []
self.layers = [[]] * len(weight_dimensions)
self.weight_gradients = []
self.learning_rate = 1
self.layers[0] = x
for i in range(len(weight_dimensions) - 1):
self.weights.append(numpy.random.rand(weight_dimensions[i],weight_dimensions[i+1]) - 0.5)
self.y = y
def feed_forward(self):
# calculate an output using feed forward layer-by-layer
for i in range(len(self.layers) - 1):
self.layers[i + 1] = sigmoid(numpy.dot(self.layers[i], self.weights[i]))
def print_loss(self):
loss = numpy.square(self.layers[-1] - self.y).sum()
print(loss)
def get_weight_gradients(self):
return self.weight_gradients
def apply_weight_gradients(self):
for i in range(len(self.weight_gradients)):
self.weights[i] += self.weight_gradients[i] * self.learning_rate
if self.learning_rate > 0.001:
self.learning_rate -= 0.0001
def back_prop(self):
# find derivative of the loss function with respect to weights
self.weight_gradients = []
deltas = []
output_error = (self.y - self.layers[-1])
output_delta = output_error * sigmoid_derivative(self.layers[-1])
deltas.append(output_delta)
self.weight_gradients.append(self.layers[-2].T.dot(output_delta))
for i in range(len(self.weights) - 1):
i_error = deltas[i].dot(self.weights[-(i+1)].T)
i_delta = i_error * sigmoid_derivative(self.layers[-(i+2)])
self.weight_gradients.append(self.layers[-(i+3)].T.dot(i_delta))
deltas.append(copy.deepcopy(i_delta))
# Unreverse weight gradient list
self.weight_gradients = self.weight_gradients[::-1]
def get_output(self, inp):
self.layers[0] = inp
self.feed_forward()
return self.layers[-1]
def sin_test():
interval = numpy.random.uniform(0, 2*math.pi, int(1000*(2*math.pi)))
x_values = []
y_values = []
for i in range(len(interval)):
y_values.append([abs(math.sin(interval[i]))])
x_values.append([interval[i]])
x = numpy.array(x_values)
y = numpy.array(y_values)
nn = NeuralNetwork([1, 10, 10, 1], x, y)
for i in range(10000):
tmp_input = []
tmp_output = []
mini_batch_indexes = random.sample(range(0, len(x)), 10)
for j in mini_batch_indexes:
tmp_input.append(x[j])
tmp_output.append(y[j])
nn.layers[0] = numpy.array(tmp_input)
nn.y = numpy.array(tmp_output)
nn.feed_forward()
nn.back_prop()
nn.apply_weight_gradients()
nn.print_loss()
nn.layers[0] = numpy.array(numpy.array(x))
nn.y = numpy.array(numpy.array(y))
nn.feed_forward()
axis_1 = []
axis_2 = []
for i in range(len(nn.layers[-1])):
axis_1.append(nn.layers[0][i][0])
axis_2.append(nn.layers[-1][i][0])
true_axis_2 = []
for x in axis_1:
true_axis_2.append(abs(math.sin(x)))
axises = []
for i in range(len(axis_1)):
axises.append([axis_1[i], axis_2[i], true_axis_2[i]])
axises.sort(key=lambda x: x[0], reverse=False)
axis_1_new = []
axis_2_new = []
true_axis_2_new = []
for elem in axises:
axis_1_new.append(elem[0])
axis_2_new.append(elem[1])
true_axis_2_new.append(elem[2])
plt.plot(axis_1_new, axis_2_new, label="nn")
plt.plot(axis_1_new, true_axis_2_new, 'k--', label="sin(x)")
plt.grid()
plt.axis([0, 2*math.pi, -1, 2.5])
plt.show()
sin_test()
The main issue with your network seem to be that you apply the activation function to the final "layer" of your network. The final output of your network should be a linear combination without any sigmoid applied.
As a warning though, do not expect the model to generalize outside of the region included in the training data.
Here is an example in PyTorch:
import torch
import torch.nn as nn
import math
import numpy as np
import matplotlib.pyplot as plt
N = 1000
p = 2.5
x = 2 * p * math.pi * torch.rand(N, 1)
y = np.abs(np.sin(x))
with torch.no_grad():
plt.plot(x.numpy(), y.numpy(), '.')
plt.savefig("training_data.png")
inner = 20
model = nn.Sequential(
nn.Linear(1, inner, bias=True),
nn.Sigmoid(),
nn.Linear(inner, 1, bias=True)#,
#nn.Sigmoid()
)
loss_fn = nn.MSELoss()
learning_rate = 1e-3
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
for t in range(500000):
y_pred = model(x)
loss = loss_fn(y_pred, y)
if t % 1000 == 0:
print("MSE: {}".format(t), loss.item())
model.zero_grad()
loss.backward()
optimizer.step()
with torch.no_grad():
X = torch.arange(0, p * 2 * math.pi, step=0.01).reshape(-1, 1)
Y = model(X)
Y_TRUTH = np.abs(np.sin(X))
print(Y.shape)
print(Y_TRUTH.shape)
loss = loss_fn(Y, Y_TRUTH)
plt.clf()
plt.plot(X.numpy(), Y_TRUTH.numpy())
plt.plot(X.numpy(), Y.numpy())
plt.title("MSE: {}".format(loss.item()))
plt.savefig("output.png")
The output is available here: Image showing neural network prediction and ground truth. The yellow line is the predicted line by the neural network and the blue line is the ground truth.
First and foremost, you've chosen a topology suited for a different class of problems. A simple, fully-connected NN such as this is great with trivial classification (e.g. Boolean operators) or functions with at least two continuous derivatives. You've tried to apply it to a function that is simply one step beyond its capabilities.
Try your model on sin(x) and see how it performs at larger ranges. Try it on max(sin(x), 0). Do you see how the model has trouble with certain periodicity and irruptions? These are an emergent feature of the many linear equations struggling to predict the proper functional value: the linear combinations have trouble emulating non-linearities past a simple level.

Why is my LSTM in tensorflow learning so slowly and badly?

This program reads a text file RNNtext.txt, creates one-hot vector representation for all the data, trains the LSTM with the data and displays a bunch of sampled characters every now and then. However, even looking at the cost vs iterations graph shows that it's learning very very inefficiently. Honestly, the raw code (numpy) for the LSTM I have does a MUCH better job. It's not only faster but it produces mostly meaningful words. This produces gibberish only. Where is my mistake? I really am out of ideas and I can't seem to find where it is logically wrong.
import numpy as np
import random
import tensorflow as tf
import os
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
# Reading RNNtext.txt file
direc = os.path.dirname(os.path.realpath(__file__))
data = open(direc + "/RNNtext.txt", "r").read()
# Array of unique characters
chars = list(set(data))
num_hidden = 80
iterations = 1000
display_iteration = 100 # Sample when iteration % display_iteration == 0
sample_size = 250
batch_size = 120 # batch size or the number of time steps to unroll RNN
alpha = 0.01 # Learning rate
#Vocabulary and text file sizes
vocab_size = len(chars)
data_size = len(data)
# Bijection from a unique character to an index
char_to_ix = {}
# Bijection from an index to a unique character
ix_to_char = {}
for j in range(vocab_size):
char_to_ix[chars[j]] = j
ix_to_char[j] = chars[j]
# Transforming all characters to indices
data_ix = [char_to_ix[ch] for ch in data]
train_data = [] # This will contain one-hot vectors
for k in range(data_size):
# Representing each index/character by a one-hot vector
hot1 = np.zeros((vocab_size, 1))
hot1[data_ix[k]] = 1
train_data.append(hot1)
X = tf.placeholder(tf.float32, [None, vocab_size, 1]) #Number of examples, number of input, dimension of each input
target = tf.placeholder(tf.float32, [None, vocab_size])
cell = tf.contrib.rnn.LSTMCell(num_hidden,state_is_tuple=True)
output, _ = tf.nn.dynamic_rnn(cell, X, dtype = tf.float32)
output = tf.transpose(output, [1, 0, 2])
weight = tf.Variable(tf.random_normal([num_hidden, vocab_size]))
bias = tf.Variable(tf.constant(0.0, shape=[vocab_size]))
prediction = tf.matmul(output[-1], weight) + bias
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=prediction, labels=target))
optimizer = tf.train.ProximalGradientDescentOptimizer(alpha)
minimize = optimizer.minimize(cost)
init = tf.global_variables_initializer()
sess = tf.Session()
sess.run(init)
ARR = [i for i in range(vocab_size)] # for extracting index by probabilities in np.random.choice()
ITER = []
COST = []
p = 0 # p will be iterated by batch_size steps
for i in range(iterations):
if p + batch_size >= data_size:
p = 0
# sweeping through data one-hot vectors
inp, out = train_data[p:p+batch_size], train_data[p+1:p+batch_size+1]
out = np.reshape(out, [-1, vocab_size])
c = sess.run(cost, {X: inp, target: out}) # calculating cost for plotting later
COST.append(c)
ITER.append(i)
sess.run(minimize, {X: inp, target: out})
# displaying sample_size number of characters with random seed
# doesn't affect training
if i % display_iteration == 0:
seed = np.random.randint(0, vocab_size)
CHARS = []
for j in range(sample_size):
x = np.zeros((vocab_size, 1))
x[seed] = 1
x = [x]
pred = sess.run(prediction, {X: x})[0]
pred = np.exp(pred) / np.sum(np.exp(pred))
pred = pred.ravel()
seed = np.random.choice(ARR, 1, p = pred)[0]
ch = ix_to_char[seed]
CHARS.append(ch)
TXT = ''.join(CHARS)
print("-------------------------------------------------")
print(TXT)
print("Iteration: ", str(i))
p += batch_size
sess.close()
plt.plot(ITER, COST)
plt.show()
EDIT: Added numpy code for comparison
import numpy as np
import matplotlib.pyplot as plt
import os
plt.style.use('fivethirtyeight')
direc = os.path.dirname(os.path.realpath(__file__))
readFile = open(direc + "\RNNtext.txt", 'r')
data = readFile.read()
readFile.close()
chars = list(set(data))
data_size, vocab_size = len(data), len(chars)
print(chars)
print("Vocabulary size: " + str(vocab_size))
char_to_ix = {}
ix_to_char = {}
for j in range(len(chars)):
char_to_ix[chars[j]] = j
ix_to_char[j] = chars[j]
hidden_size = 80
batch_size = 120
alpha = 0.1
sample_size = 250
iterations = 1000
display_iteration = 100
Wxh = np.random.randn(hidden_size, vocab_size)*0.01 # input to hidden
Whh = np.random.randn(hidden_size, hidden_size)*0.01 # hidden to hidden
Why = np.random.randn(vocab_size, hidden_size)*0.01 # hidden to output
bh = np.zeros((hidden_size, 1)) # hidden bias
by = np.zeros((vocab_size, 1)) # output bias
def sample(hid, seed, weights, sample_size):
X = np.zeros((vocab_size, 1))
X[seed] = 1
CHARS = []
ARR = [i for i in range(vocab_size)]
for t in range(sample_size):
hid = np.tanh(np.dot(Wxh, X) + np.dot(Whh, hid) + bh)
y = np.dot(Why, hid) + by
prob = np.exp(y) / np.sum(np.exp(y))
prob = prob.ravel()
ix = np.random.choice(ARR, 1, p=prob)[0]
CHARS.append(ix_to_char[ix])
X = np.zeros((vocab_size, 1))
X[ix] = 1
TXT = ''.join(CHARS)
return TXT
LOSS = []
ITER = []
p = 0
mWxh, mWhh, mWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
mbh, mby = np.zeros_like(bh), np.zeros_like(by) # memory variables for Adagrad
smooth_loss = -np.log(1.0/vocab_size)*batch_size # loss at iteration 0
hprev = np.zeros((hidden_size,1))
for i in range(iterations): ## just time passing by
dWxh = np.zeros_like(Wxh)
dWhh = np.zeros_like(Whh)
dWhy = np.zeros_like(Why)
dbh = np.zeros_like(bh)
dby = np.zeros_like(by)
if p+batch_size >= len(data) or i == 0:
hprev = np.zeros((hidden_size,1))
p = 0
inputs = [char_to_ix[ch] for ch in data[p:p+batch_size]]
targets = [char_to_ix[ch] for ch in data[p+1:p+batch_size+1]]
HID = {}
X = {}
Y = {}
P = {}
HID[-1] = np.copy(hprev)
loss = 0
##======FORWARD======##
for t in range(len(inputs)):
X[t] = np.zeros((vocab_size,1))
X[t][inputs[t]] = 1
HID[t] = np.tanh(np.dot(Wxh, X[t]) + np.dot(Whh, HID[t-1]) + bh) # inp -> X
Y[t] = np.dot(Why, HID[t]) + by # tanh
P[t] = np.exp(Y[t]) / np.sum(np.exp(Y[t]))
loss += -np.log(P[t][targets[t]][0])
dhnext = np.zeros_like(HID[0])
##======BACKPROP======##
for t in reversed(range(len(inputs))):
dy = np.copy(P[t])
dy[targets[t]] -= 1
dh = (np.dot(Why.T, dy) + dhnext)*(1-HID[t]*HID[t])
dx = np.dot(Why.T, dy)*(1 - HID[t]**2)
dWhy += np.dot(dy, HID[t].T)
dWhh += np.dot(dh, HID[t-1].T)
dWxh += np.dot(dh, X[t].T)
dby += dy
dbh += dh
dhnext = np.dot(Whh.T, dh)
##=====================##
hprev = HID[-1]
smooth_loss = smooth_loss * 0.999 + loss * 0.001
for dparam in [dWxh, dWhh, dWhy, dbh, dby]:
np.clip(dparam, -1, 1, out=dparam) # clip to mitigate exploding gradients
for param, dparam, mem in zip([Wxh, Whh, Why, bh, by],
[dWxh, dWhh, dWhy, dbh, dby],
[mWxh, mWhh, mWhy, mbh, mby]):
mem += dparam * dparam
param += -alpha * dparam / np.sqrt(mem + 1e-8) # Adagrad
if i % display_iteration == 0:
print(str(i))
weights = [Wxh,Whh,Why,bh,by]
seed = inputs[np.random.randint(0,len(inputs))]
TXT = sample(HID[-1], seed, weights, sample_size)
print("-----------------------------------------------")
print(TXT)
print("-----------------------------------------------")
with open(direc + "\RNNout.txt", 'w') as writeFile:
writeFile.write(TXT)
ITER.append(i)
LOSS.append(loss)
p += batch_size
best_text = sample(HID[-1], inputs[0], weights, sample_size)
plt.plot(ITER, LOSS, linewidth = 1)
plt.show()
writeFile.close()
Well, doh... looks like you are not re-using the state! How is LSTM (state machine) supposed to work properly if you are not maintaining the state?
To me this looks like a red flag:
output, _ = tf.nn.dynamic_rnn(cell, X, dtype = tf.float32)
the second output from tf.nn.dynamic_rnn is the latest state after the given sequence has been processed. Looks like you are explicitly ignoring it and not re-feeding it into each following iteration of training in sess.run(...) (and hence your dynamic_rnn doesn't have the initial_state parameter).
I would highly recommend changing that part of your code before looking any further.
Also, I don't know what your data looks like, but your feeding and batching strategy needs to be such as to make sense out of this whole state-passing exercise. Otherwise, once again, it will just produce gibberish.
With the information provided, I would suggest these two initial steps to try to improve the model.
Increase the number of iterations, Recurrent Neural Networks work differently than other deep arhitectures and could need maybe an additional order of magnitude in iteration number, to settle.
Play with the seeds: from my experience in order to get meaningful sequences can depend on the quality of the used seeds.

Neural network backpropagation algorithm not working in Python

I am writing a neural network in Python, following the example here. It seems that the backpropagation algorithm isn't working, given that the neural network fails to produce the right value (within a margin of error) after being trained 10 thousand times. Specifically, I am training it to compute the sine function in the following example:
import numpy as np
class Neuralnet:
def __init__(self, neurons):
self.weights = []
self.inputs = []
self.outputs = []
self.errors = []
self.rate = .1
for layer in range(len(neurons)):
self.inputs.append(np.empty(neurons[layer]))
self.outputs.append(np.empty(neurons[layer]))
self.errors.append(np.empty(neurons[layer]))
for layer in range(len(neurons)-1):
self.weights.append(
np.random.normal(
scale=1/np.sqrt(neurons[layer]),
size=[neurons[layer], neurons[layer + 1]]
)
)
def feedforward(self, inputs):
self.inputs[0] = inputs
for layer in range(len(self.weights)):
self.outputs[layer] = np.tanh(self.inputs[layer])
self.inputs[layer + 1] = np.dot(self.weights[layer].T, self.outputs[layer])
self.outputs[-1] = np.tanh(self.inputs[-1])
def backpropagate(self, targets):
gradient = 1 - self.outputs[-1] * self.outputs[-1]
self.errors[-1] = gradient * (self.outputs[-1] - targets)
for layer in reversed(range(len(self.errors) - 1)):
gradient = 1 - self.outputs[layer] * self.outputs[layer]
self.errors[layer] = gradient * np.dot(self.weights[layer], self.errors[layer + 1])
for layer in range(len(self.weights)):
self.weights[layer] -= self.rate * np.outer(self.outputs[layer], self.errors[layer + 1])
def xor_example():
net = Neuralnet([2, 2, 1])
for step in range(100000):
net.feedforward([0, 0])
net.backpropagate([-1])
net.feedforward([0, 1])
net.backpropagate([1])
net.feedforward([1, 0])
net.backpropagate([1])
net.feedforward([1, 1])
net.backpropagate([-1])
net.feedforward([1, 1])
print(net.outputs[-1])
def identity_example():
net = Neuralnet([1, 3, 1])
for step in range(100000):
x = np.random.normal()
net.feedforward([x])
net.backpropagate([np.tanh(x)])
net.feedforward([-2])
print(net.outputs[-1])
def sine_example():
net = Neuralnet([1, 6, 1])
for step in range(100000):
x = np.random.normal()
net.feedforward([x])
net.backpropagate([np.tanh(np.sin(x))])
net.feedforward([3])
print(net.outputs[-1])
sine_example()
The output fails to be close to tanh(sin(3)) = 0.140190616. I suspected a mistake involving wrong indices or alignment, but Numpy isn't raising any errors like these. Any tips on where I went wrong?
EDIT: I forgot to add the bias neurons. Here is the updated code:
import numpy as np
class Neuralnet:
def __init__(self, neurons):
self.weights = []
self.outputs = []
self.inputs = []
self.errors = []
self.offsets = []
self.rate = .01
for layer in range(len(neurons)-1):
self.weights.append(
np.random.normal(
scale=1/np.sqrt(neurons[layer]),
size=[neurons[layer], neurons[layer + 1]]
)
)
self.outputs.append(np.empty(neurons[layer]))
self.inputs.append(np.empty(neurons[layer]))
self.errors.append(np.empty(neurons[layer]))
self.offsets.append(np.random.normal(scale=1/np.sqrt(neurons[layer]), size=neurons[layer + 1]))
self.inputs.append(np.empty(neurons[-1]))
self.errors.append(np.empty(neurons[-1]))
def feedforward(self, inputs):
self.inputs[0] = inputs
for layer in range(len(self.weights)):
self.outputs[layer] = np.tanh(self.inputs[layer])
self.inputs[layer + 1] = self.offsets[layer] + np.dot(self.weights[layer].T, self.outputs[layer])
def backpropagate(self, targets):
self.errors[-1] = self.inputs[-1] - targets
for layer in reversed(range(len(self.errors) - 1)):
gradient = 1 - self.outputs[layer] * self.outputs[layer]
self.errors[layer] = gradient * np.dot(self.weights[layer], self.errors[layer + 1])
for layer in range(len(self.weights)):
self.weights[layer] -= self.rate * np.outer(self.outputs[layer], self.errors[layer + 1])
self.offsets[layer] -= self.rate * self.errors[layer + 1]
def sine_example():
net = Neuralnet([1, 5, 1])
for step in range(10000):
x = np.random.uniform(-5, 5)
net.feedforward([x])
net.backpropagate([np.sin(x)])
net.feedforward([np.pi])
print(net.inputs[-1])
def xor_example():
net = Neuralnet([2, 2, 1])
for step in range(10000):
net.feedforward([0, 0])
net.backpropagate([-1])
net.feedforward([0, 1])
net.backpropagate([1])
net.feedforward([1, 0])
net.backpropagate([1])
net.feedforward([1, 1])
net.backpropagate([-1])
net.feedforward([1, 1])
print(net.outputs[-1])
def identity_example():
net = Neuralnet([1, 3, 1])
for step in range(10000):
x = np.random.normal()
net.feedforward([x])
net.backpropagate([x])
net.feedforward([-2])
print(net.outputs[-1])
identity_example()
I think you train the NN in the wrong way. You have a loop over 10000 iterations and feed a new sample in each cycle. The NN will never get trained in this case.
(the statement is wrong! See the update! )
What you need to do is to generate a large array of true samples Y = sin(X), give it to your network ONCE and iterate over the training set forwards and backwards, in order to minimize the cost function. To check the algorithm you may need to plot the cost function depending on the iteration number and make sure the cost goes down.
Another important point is the initialization of the weights. Your numbers are pretty large and the network will take a lot of time to converge, especially when using low rates. It's a good practice to generate the initial weights in some small range [-eps .. eps] uniformly.
In my code I implemented two different activation functions: sigmoid() and tanh(). You need to scale your inputs depending on the selected function: [0 .. 1] and [-1 .. 1] respectively.
Here are some images which show the cost function and the resulting predictions for sigmoid() and tanh() activation functions:
As you can see the sigmoid() activation gives a little bit better results, than the tanh().
Also I got much better predictions when using a network [1, 6, 1], compared to a bigger network with 4 layers [1, 6, 4, 1]. So the size of the NN is not always the crucial factor. Here is the prediction for the mentioned network with 4 layers:
Here is my code with some comments. I tried to use your notations where it was possible.
import numpy as np
import math
import matplotlib.pyplot as plt
class Neuralnet:
def __init__(self, neurons, activation):
self.weights = []
self.inputs = []
self.outputs = []
self.errors = []
self.rate = 0.5
self.activation = activation #sigmoid or tanh
self.neurons = neurons
self.L = len(self.neurons) #number of layers
eps = 0.12; # range for uniform distribution -eps..+eps
for layer in range(len(neurons)-1):
self.weights.append(np.random.uniform(-eps,eps,size=(neurons[layer+1], neurons[layer]+1)))
###################################################################################################
def train(self, X, Y, iter_count):
m = X.shape[0];
for layer in range(self.L):
self.inputs.append(np.empty([m, self.neurons[layer]]))
self.errors.append(np.empty([m, self.neurons[layer]]))
if (layer < self.L -1):
self.outputs.append(np.empty([m, self.neurons[layer]+1]))
else:
self.outputs.append(np.empty([m, self.neurons[layer]]))
#accumulate the cost function
J_history = np.zeros([iter_count, 1])
for i in range(iter_count):
self.feedforward(X)
J = self.cost(Y, self.outputs[self.L-1])
J_history[i, 0] = J
self.backpropagate(Y)
#plot the cost function to check the descent
plt.plot(J_history)
plt.show()
###################################################################################################
def cost(self, Y, H):
J = np.sum(np.sum(np.power((Y - H), 2), axis=0))/(2*m)
return J
###################################################################################################
def feedforward(self, X):
m = X.shape[0];
self.outputs[0] = np.concatenate( (np.ones([m, 1]), X), axis=1)
for i in range(1, self.L):
self.inputs[i] = np.dot( self.outputs[i-1], self.weights[i-1].T )
if (self.activation == 'sigmoid'):
output_temp = self.sigmoid(self.inputs[i])
elif (self.activation == 'tanh'):
output_temp = np.tanh(self.inputs[i])
if (i < self.L - 1):
self.outputs[i] = np.concatenate( (np.ones([m, 1]), output_temp), axis=1)
else:
self.outputs[i] = output_temp
###################################################################################################
def backpropagate(self, Y):
self.errors[self.L-1] = self.outputs[self.L-1] - Y
for i in range(self.L - 2, 0, -1):
if (self.activation == 'sigmoid'):
self.errors[i] = np.dot( self.errors[i+1], self.weights[i][:, 1:] ) * self.sigmoid_prime(self.inputs[i])
elif (self.activation == 'tanh'):
self.errors[i] = np.dot( self.errors[i+1], self.weights[i][:, 1:] ) * (1 - self.outputs[i][:, 1:]*self.outputs[i][:, 1:])
for i in range(0, self.L-1):
grad = np.dot(self.errors[i+1].T, self.outputs[i]) / m
self.weights[i] = self.weights[i] - self.rate*grad
###################################################################################################
def sigmoid(self, z):
s = 1.0/(1.0 + np.exp(-z))
return s
###################################################################################################
def sigmoid_prime(self, z):
s = self.sigmoid(z)*(1 - self.sigmoid(z))
return s
###################################################################################################
def predict(self, X, weights):
m = X.shape[0];
self.inputs = []
self.outputs = []
self.weights = weights
for layer in range(self.L):
self.inputs.append(np.empty([m, self.neurons[layer]]))
if (layer < self.L -1):
self.outputs.append(np.empty([m, self.neurons[layer]+1]))
else:
self.outputs.append(np.empty([m, self.neurons[layer]]))
self.feedforward(X)
return self.outputs[self.L-1]
###################################################################################################
# MAIN PART
activation1 = 'sigmoid' # the input should be scaled into [ 0..1]
activation2 = 'tanh' # the input should be scaled into [-1..1]
activation = activation1
net = Neuralnet([1, 6, 1], activation) # structure of the NN and its activation function
##########################################################################################
# TRAINING
m = 1000 #size of the training set
X = np.linspace(0, 4*math.pi, num = m).reshape(m, 1); # input training set
Y = np.sin(X) # target
kx = 0.1 # noise parameter
noise = (2.0*np.random.uniform(0, kx, m) - kx).reshape(m, 1)
Y = Y + noise # noisy target
# scaling of the target depending on the activation function
if (activation == 'sigmoid'):
Y_scaled = (Y/(1+kx) + 1)/2.0
elif (activation == 'tanh'):
Y_scaled = Y/(1+kx)
# number of the iteration for the training stage
iter_count = 20000
net.train(X, Y_scaled, iter_count) #training
# gained weights
trained_weights = net.weights
##########################################################################################
# PREDICTION
m_new = 40 #size of the prediction set
X_new = np.linspace(0, 4*math.pi, num = m_new).reshape(m_new, 1);
Y_new = net.predict(X_new, trained_weights) # prediction
#rescaling of the result
if (activation == 'sigmoid'):
Y_new = (2.0*Y_new - 1.0) * (1+kx)
elif (activation == 'tanh'):
Y_new = Y_new * (1+kx)
# visualization
plt.plot(X, Y)
plt.plot(X_new, Y_new, 'ro')
plt.show()
raw_input('press any key to exit')
UPDATE
I would like to take back the statement regarding the training method used in your code. The network can be indeed trained using only one sample per iteration. I got interesting results in online-training using both sigmoid and tanh activation functions:
Online-training using Sigmoid (cost function and prediction)
Online-training using Tanh (cost function and prediction)
As can be seen the choice of Sigmoid as activation function gives better performance. The cost function looks not that good as during the offline-training, but at least it tends to go down.
I plotted the cost function in your implementation, it looks pretty jerky as well:
May be it is a good idea to try your code with the sigmoid or even the ReLU function.
Here is the updated source code. To switch between online and offline training modes just change the method variable.
import numpy as np
import math
import matplotlib.pyplot as plt
class Neuralnet:
def __init__(self, neurons, activation):
self.weights = []
self.inputs = []
self.outputs = []
self.errors = []
self.rate = 0.2
self.activation = activation #sigmoid or tanh
self.neurons = neurons
self.L = len(self.neurons) #number of layers
eps = 0.12; #range for uniform distribution -eps..+eps
for layer in range(len(neurons)-1):
self.weights.append(np.random.uniform(-eps,eps,size=(neurons[layer+1], neurons[layer]+1)))
###################################################################################################
def train(self, X, Y, iter_count):
m = X.shape[0];
for layer in range(self.L):
self.inputs.append(np.empty([m, self.neurons[layer]]))
self.errors.append(np.empty([m, self.neurons[layer]]))
if (layer < self.L -1):
self.outputs.append(np.empty([m, self.neurons[layer]+1]))
else:
self.outputs.append(np.empty([m, self.neurons[layer]]))
#accumulate the cost function
J_history = np.zeros([iter_count, 1])
for i in range(iter_count):
self.feedforward(X)
J = self.cost(Y, self.outputs[self.L-1])
J_history[i, 0] = J
self.backpropagate(Y)
#plot the cost function to check the descent
#plt.plot(J_history)
#plt.show()
###################################################################################################
def cost(self, Y, H):
J = np.sum(np.sum(np.power((Y - H), 2), axis=0))/(2*m)
return J
###################################################################################################
def cost_online(self, min_x, max_x, iter_number):
h_arr = np.zeros([iter_number, 1])
y_arr = np.zeros([iter_number, 1])
for step in range(iter_number):
x = np.random.uniform(min_x, max_x, 1).reshape(1, 1)
self.feedforward(x)
h_arr[step, 0] = self.outputs[-1]
y_arr[step, 0] = np.sin(x)
J = np.sum(np.sum(np.power((y_arr - h_arr), 2), axis=0))/(2*iter_number)
return J
###################################################################################################
def feedforward(self, X):
m = X.shape[0];
self.outputs[0] = np.concatenate( (np.ones([m, 1]), X), axis=1)
for i in range(1, self.L):
self.inputs[i] = np.dot( self.outputs[i-1], self.weights[i-1].T )
if (self.activation == 'sigmoid'):
output_temp = self.sigmoid(self.inputs[i])
elif (self.activation == 'tanh'):
output_temp = np.tanh(self.inputs[i])
if (i < self.L - 1):
self.outputs[i] = np.concatenate( (np.ones([m, 1]), output_temp), axis=1)
else:
self.outputs[i] = output_temp
###################################################################################################
def backpropagate(self, Y):
self.errors[self.L-1] = self.outputs[self.L-1] - Y
for i in range(self.L - 2, 0, -1):
if (self.activation == 'sigmoid'):
self.errors[i] = np.dot( self.errors[i+1], self.weights[i][:, 1:] ) * self.sigmoid_prime(self.inputs[i])
elif (self.activation == 'tanh'):
self.errors[i] = np.dot( self.errors[i+1], self.weights[i][:, 1:] ) * (1 - self.outputs[i][:, 1:]*self.outputs[i][:, 1:])
for i in range(0, self.L-1):
grad = np.dot(self.errors[i+1].T, self.outputs[i]) / m
self.weights[i] = self.weights[i] - self.rate*grad
###################################################################################################
def sigmoid(self, z):
s = 1.0/(1.0 + np.exp(-z))
return s
###################################################################################################
def sigmoid_prime(self, z):
s = self.sigmoid(z)*(1 - self.sigmoid(z))
return s
###################################################################################################
def predict(self, X, weights):
m = X.shape[0];
self.inputs = []
self.outputs = []
self.weights = weights
for layer in range(self.L):
self.inputs.append(np.empty([m, self.neurons[layer]]))
if (layer < self.L -1):
self.outputs.append(np.empty([m, self.neurons[layer]+1]))
else:
self.outputs.append(np.empty([m, self.neurons[layer]]))
self.feedforward(X)
return self.outputs[self.L-1]
###################################################################################################
# MAIN PART
activation1 = 'sigmoid' #the input should be scaled into [0..1]
activation2 = 'tanh' #the input should be scaled into [-1..1]
activation = activation1
net = Neuralnet([1, 6, 1], activation) # structure of the NN and its activation function
method1 = 'online'
method2 = 'offline'
method = method1
kx = 0.1 #noise parameter
###################################################################################################
# TRAINING
if (method == 'offline'):
m = 1000 #size of the training set
X = np.linspace(0, 4*math.pi, num = m).reshape(m, 1); #input training set
Y = np.sin(X) #target
noise = (2.0*np.random.uniform(0, kx, m) - kx).reshape(m, 1)
Y = Y + noise #noisy target
#scaling of the target depending on the activation function
if (activation == 'sigmoid'):
Y_scaled = (Y/(1+kx) + 1)/2.0
elif (activation == 'tanh'):
Y_scaled = Y/(1+kx)
#number of the iteration for the training stage
iter_count = 20000
net.train(X, Y_scaled, iter_count) #training
elif (method == 'online'):
sampling_count = 100000 # number of samplings during the training stage
m = 1 #batch size
iter_count = sampling_count/m
for layer in range(net.L):
net.inputs.append(np.empty([m, net.neurons[layer]]))
net.errors.append(np.empty([m, net.neurons[layer]]))
if (layer < net.L -1):
net.outputs.append(np.empty([m, net.neurons[layer]+1]))
else:
net.outputs.append(np.empty([m, net.neurons[layer]]))
J_history = []
step_history = []
for i in range(iter_count):
X = np.random.uniform(0, 4*math.pi, m).reshape(m, 1)
Y = np.sin(X) #target
noise = (2.0*np.random.uniform(0, kx, m) - kx).reshape(m, 1)
Y = Y + noise #noisy target
#scaling of the target depending on the activation function
if (activation == 'sigmoid'):
Y_scaled = (Y/(1+kx) + 1)/2.0
elif (activation == 'tanh'):
Y_scaled = Y/(1+kx)
net.feedforward(X)
net.backpropagate(Y_scaled)
if (np.remainder(i, 1000) == 0):
J = net.cost_online(0, 4*math.pi, 1000)
J_history.append(J)
step_history.append(i)
plt.plot(step_history, J_history)
plt.title('Batch size ' + str(m) + ', rate ' + str(net.rate) + ', samples ' + str(sampling_count))
#plt.ylim([0, 0.1])
plt.show()
#gained weights
trained_weights = net.weights
##########################################################################################
# PREDICTION
m_new = 40 #size of the prediction set
X_new = np.linspace(0, 4*math.pi, num = m_new).reshape(m_new, 1);
Y_new = net.predict(X_new, trained_weights) #prediction
#rescaling of the result
if (activation == 'sigmoid'):
Y_new = (2.0*Y_new - 1.0) * (1+kx)
elif (activation == 'tanh'):
Y_new = Y_new * (1+kx)
#visualization
#fake sine curve to show the ideal signal
if (method == 'online'):
X = np.linspace(0, 4*math.pi, num = 100)
Y = np.sin(X)
plt.plot(X, Y)
plt.plot(X_new, Y_new, 'ro')
if (method == 'online'):
plt.title('Batch size ' + str(m) + ', rate ' + str(net.rate) + ', samples ' + str(sampling_count))
plt.ylim([-1.5, 1.5])
plt.show()
raw_input('press any key to exit')
Now I have some remarks to your current code:
Your sine function looks like this:
def sine_example():
net = Neuralnet([1, 6, 1])
for step in range(100000):
x = np.random.normal()
net.feedforward([x])
net.backpropagate([np.tanh(np.sin(x))])
net.feedforward([3])
print(net.outputs[-1])
I don't know why you use tanh in your target input. If you really want to use tanh of sine as target, you need to scale it to [-1..1], because tanh(sin(x)) returns values in range [-0.76..0.76].
The next thing is the range of your training set. You use x = np.random.normal() to generate the samples. Here is the distribution of such an input:
After it you want your network to predict the sine of 3, but the network has almost never seen this number during the training stage. I would use the uniform distribution in a wider range for sample generation instead.

Categories