Error in Backpropagation: Neural Network predicts same class - python

I am writing neural network code from scratch using NumPy, but even after training my network for many epochs, the predictions for each class are essentially random and stay the same irrespective of the input.
I have checked my understanding against Andrew Ng's Coursera ML course and a towardsdatascience.com post. I think I'm making some basic conceptual mistake which I cannot figure out.
Here is my code:
import numpy as np

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def dsigmoid(y):
    return y * (1 - y)

class NeuralNetwork:
    def __init__(self, shape):
        self.n_layers = len(shape)
        self.shape = shape
        self.weight = []
        self.bias = []
        i = 0
        while i < self.n_layers - 1:
            self.weight.append(np.random.normal(loc=0.0, scale=0.5,
                                                size=(self.shape[i + 1], self.shape[i])))
            self.bias.append(np.random.normal(loc=0.0, scale=0.3,
                                              size=(self.shape[i + 1], 1)))
            i += 1

    def predict(self, X):
        z = self.weight[0] @ X + self.bias[0]
        a = sigmoid(z)
        i = 1
        while i < self.n_layers - 1:
            z = self.weight[i] @ a + self.bias[i]
            a = sigmoid(z)
            i += 1
        return a

    def predictVerbose(self, X):
        layers = [X]
        z = self.weight[0] @ X + self.bias[0]
        a = sigmoid(z)
        layers.append(a)
        i = 1
        while i < self.n_layers - 1:
            z = self.weight[i] @ a + self.bias[i]
            a = sigmoid(z)
            layers.append(a)
            i += 1
        return layers

    def gradOne(self, X, y):
        layers = self.predictVerbose(X)
        h = layers[-1]
        delta_b = [(h - y) * dsigmoid(h)]
        delta_w = [delta_b[0] @ layers[-2].T]
        i = 1
        while i < self.n_layers - 1:
            buff = delta_b[-1]
            delta_b.append((self.weight[-i].T @ buff) * dsigmoid(layers[-(i + 1)]))
            delta_w.append(delta_b[-1] @ layers[-(i + 2)].T)
            i += 1
        return delta_b[::-1], delta_w[::-1]

    def grad(self, data, l_reg=0):
        # data: x1, x2, x3, ..., xm, y=(0, 1, 2, ...)
        m = len(data)
        delta_b = []
        delta_w = []
        i = 0
        while i < self.n_layers - 1:
            delta_b.append(np.zeros((self.shape[i + 1], 1)))
            delta_w.append(np.zeros((self.shape[i + 1], self.shape[i])))
            i += 1
        for row in data:
            X = np.array(row[:-1])[np.newaxis].T
            y = np.zeros((self.shape[-1], 1))
            # print(row)
            y[row[-1], 0] = 1
            buff1, buff2 = self.gradOne(X, y)
            i = 0
            while i < len(delta_b):
                delta_b[i] += buff1[i] / m
                delta_w[i] += buff2[i] / m
                i += 1
        return delta_b, delta_w

    def train(self, data, batch_size, epoch, alpha, l_reg=0):
        m = len(data)
        for i in range(epoch):
            j = 0
            while j < m:
                delta_b, delta_w = self.grad(data[i: (i + batch_size + 1)])
                i = 0
                while i < len(self.weight):
                    self.weight[i] -= alpha * delta_w[i]
                    self.bias[i] -= alpha * delta_b[i]
                    i += 1
                j += batch_size

if __name__ == "__main__":
    x = NeuralNetwork([2, 2, 2])
    # for y in x.gradOne(np.array([[1], [2], [3]]), np.array([[0], [1]])):
    #     print(y.shape)
    data = [
        [1, 1, 0],
        [0, 0, 0],
        [1, 0, 1],
        [0, 1, 1]
    ]
    x.train(data, 4, 1000, 0.1)
    print(x.predict(np.array([[1], [0]])))
    print(x.predict(np.array([[1], [1]])))
Please point out where I am going wrong.

Unfortunately I don't have enough reputation to comment on your post, but here's a link to a NumPy-only neural network that I've made (tested on blob data from sklearn and MNIST):
https://github.com/jaymody/backpropagation/blob/master/old/NeuralNetwork.py

Are you still interested in this problem? As I understand it, you are trying to build an XOR perceptron with direct and inverse outputs.
It looks like:
1. You need to change the expression
delta_b, delta_w = self.grad(data[i: (i + batch_size + 1)])
to
delta_b, delta_w = self.grad(data[::])
in the train function (a corrected train method is sketched below).
2. Some random initializations of the synaptic weights and biases need far more training cycles at alpha=0.1. Try playing with alpha (I set it up to 2) and the number of epochs (I tried up to 20000).
Also, your code does not work with 1-layer networks. I tried to train 1-layer AND and OR perceptrons and got very strange results (or maybe it just needs far more cycles). But in the 2-layer case it works fine.
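For concreteness, here is a minimal sketch of the train method with change 1 applied, i.e. full-batch updates via self.grad(data[::]), and with a separate index k so the epoch counter i is not reused inside the loop. This is only an illustration of the suggestion above, not a tested drop-in replacement:

def train(self, data, batch_size, epoch, alpha, l_reg=0):
    # batch_size is kept in the signature but unused in this full-batch variant.
    for _ in range(epoch):
        # Full-batch gradient, as suggested above; grad() already averages over len(data).
        delta_b, delta_w = self.grad(data[::])
        k = 0
        while k < len(self.weight):
            self.weight[k] -= alpha * delta_w[k]
            self.bias[k] -= alpha * delta_b[k]
            k += 1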

Related

Python-coded neural network does not learn properly

My network does not learn to recognize inputs separately; it either outputs the averaged result or becomes biased toward one particular output. What am I doing wrong?
import numpy as np

sigmoid = lambda x: 1 / (1 + np.exp(-x))
sigmoid_der = lambda x: sigmoid(x) * (1 - sigmoid(x))
ReLU = lambda x: np.maximum(0, x)
ReLU_der = lambda x: x > 0

class NeuralNetwork:
    def __init__(self, shape: tuple):
        self.layers = len(shape)  # The number of layers
        self.shape = shape  # The number of neurons in each layer
        self.weights = [
            np.array([np.random.rand(shape[l - 1]) for _ in range(shape[l])])
            for l in range(1, self.layers)
        ]  # A list of matrices of weights connecting neighbouring layers
        self.weighted_sums = [np.zeros(l) for l in shape]
        self.activations = [np.zeros(l) for l in shape]

    def inspect(self):
        print("=============NeuralNetwork===============")
        print(f"Shape: {self.shape}")
        print(f"Weights: {self.weights}")
        print(f"Activations: {self.activations}")

    def forward_prop(self, X):
        self.activations[0] = X
        for l in range(1, self.layers):
            self.weighted_sums[l] = self.weights[l - 1] @ self.activations[l - 1]
            self.activations[l] = sigmoid(self.weighted_sums[l])

    def backprop(self, X, Y):
        delta = [np.empty(self.shape[l]) for l in range(1, self.layers)]  # Here errors get stored
        delta[-1] = (Y - self.activations[-1]) * sigmoid_der(self.weighted_sums[-1])  # The output error
        for l in reversed(range(self.layers - 2)):  # The errors get backpropagated
            delta[l] = self.weights[l + 1].T @ delta[l + 1] * sigmoid_der(self.weighted_sums[l])
        for l in range(self.layers - 1):  # The weights get updated online
            for j in range(self.shape[l + 1]):
                self.weights[l][j] -= 0.1 * self.activations[l + 1][j] * delta[l][j]

nn = NeuralNetwork((2, 2, 1))
X = np.array([
    [1, 0],
    [0, 1],
    [1, 1],
    [0, 0]
])
Y = np.array([
    [1],
    [1],
    [0],
    [0]
])
# I train my network by randomly picking an example from my training sets
for _ in range(1000):
    i = np.random.randint(0, 4)
    nn.forward_prop(X[i])
    nn.backprop(X[i], Y[i])
for x in X:
    nn.forward_prop(x)
    print(nn.activations[-1])
The matrix math of backpropagation is quite tough. It is especially confusing that the lists of weight matrices and deltas (and the list of bias arrays, too) are one shorter than the number of layers in the network, which makes indexing tricky. Apparently, the problem was due to mis-indexing. Finally it works!
import numpy as np

sigmoid = lambda x: 1 / (1 + np.exp(-x))
sigmoid_der = lambda x: sigmoid(x) * (1 - sigmoid(x))
ReLU = lambda x: np.maximum(0, x)
ReLU_der = lambda x: x > 0

class NeuralNetwork:
    def __init__(self, shape: tuple):
        self.layers = len(shape)
        self.shape = shape
        self.weights = [
            np.array([2 * np.random.random(shape[l - 1]) - 1 for _ in range(shape[l])])
            for l in range(1, self.layers)
        ]
        self.biases = [np.zeros(l) for l in shape[1:]]
        self.weighted_sums = [None for l in shape]
        self.activations = [None for l in shape]
        self.deltas = [None for l in shape[1:]]

    def inspect(self):
        print("=============NeuralNetwork===============")
        print(f"Shape: {self.shape}")
        print(f"Weights: {self.weights}")
        print(f"Activations: {self.activations}")

    def forward_prop(self, X):
        self.activations[0] = X
        for l in range(1, self.layers):
            self.weighted_sums[l] = self.weights[l - 1] @ self.activations[l - 1] + self.biases[l - 1]
            self.activations[l] = sigmoid(self.weighted_sums[l])

    def backprop(self, X, Y, lr):
        self.deltas[-1] = (Y - self.activations[-1]) * sigmoid_der(self.weighted_sums[-1])
        for l in range(self.layers - 2, 0, -1):
            self.deltas[l - 1] = self.weights[l].T @ self.deltas[l] * sigmoid_der(self.weighted_sums[l])
        for l in range(self.layers - 1):
            for j in range(self.shape[l + 1]):
                self.weights[l][j] += lr * self.activations[l] * self.deltas[l][j]
            self.biases[l] += self.deltas[l]

    def train(self, X, Y, lr, epochs):
        for e in range(epochs):
            if not e % 1000: self.test(X)
            i = np.random.randint(len(X))
            self.forward_prop(X[i])
            self.backprop(X[i], Y[i], lr)

    def test(self, X):
        print()
        for x in X:
            self.forward_prop(x)
            print(x, self.activations[-1])

if __name__ == "__main__":
    nn = NeuralNetwork((2, 3, 2, 1))
    X = np.array([
        [1, 0],
        [0, 1],
        [1, 1],
        [0, 0]
    ])
    Y = np.array([
        [1],
        [1],
        [0],
        [0]
    ])
    nn.train(X, Y, 0.4, 20000)
    nn.test(X)

Neural network only learns the last pattern when given several patterns sequentially

This post is about the same issue, but no proper answer was given there. And since this problem seems to be widespread, I'll keep my code behind the scenes for now.
Following this source, I've written a network which does well when I give it a single training example with a target vector. Using gradient descent, I minimize the cost function so that the network produces the target vector when given the corresponding input vector. But this only works for one example!
The main goal of a neural network is to react differently to different inputs, and we should be able to train it to do so. I tried changing the network weights by an average of the delta-weights computed for each example, which failed: the training process gets stuck with the output vector holding the average of all the target vectors from the training set. I have no ideas left and have found no sources that explain this.
How do I train a neural network with a set of examples, not just with one input vector?
Update
For those wondering, I'll attach my code below. Try to run it and you will see that instead of outputting 0 1 it produces 0.5 0.5, which is the result of subtracting averaged delta-weights.
import numpy as np
from sympy import symbols, lambdify
from sympy.functions.elementary.exponential import exp
from time import sleep

x = symbols('x')
sigmoid = exp(x) / (1 + exp(x))
sigmoid_der = sigmoid.diff(x)
sigmoid = lambdify(x, sigmoid)
sigmoid_der = lambdify(x, sigmoid_der)

class Neuron:
    def __init__(self, amount_of_inputs: int, hidden = True):
        self.inputs = np.random.rand(amount_of_inputs) if hidden else np.array([1])
        self.bias = 0.0
        self._activation = 0.0
        self._wsum = 0.0

    @property
    def activation(self) -> float:
        return self._activation

    @property
    def wsum(self) -> float:
        return self._wsum

    def calculate(self, indata):
        wval = self.inputs * indata + self.bias
        self._wsum = wval.sum()
        self._activation = sigmoid(self._wsum)

class NeuralNetwork:
    def __init__(self, shape: tuple):
        self.shape = shape
        self.layers = len(self.shape)
        self.network = [None for _ in range(self.layers)]
        self.network[0] = tuple([Neuron(1, hidden = False) for _ in range(shape[0])])
        for L in range(1, self.layers):
            self.network[L] = tuple([Neuron(shape[L - 1]) for _ in range(shape[L])])
        self.network = tuple(self.network)
        y = [symbols(f'y[{i}]') for i in range(shape[self.layers - 1])]
        a = [symbols(f'a[{i}]') for i in range(shape[self.layers - 1])]
        self.cost_function = sum([(y[i] - a[i]) ** 2 / 2 for i in range(shape[self.layers - 1])])
        self.gradient = tuple([self.cost_function.diff(a[i]) for i in range(shape[self.layers - 1])])
        self.cost_function = lambdify((y, a), self.cost_function)
        self.gradient = lambdify((y, a), self.gradient)

    def getLayer(self, L):
        return np.array([self.network[L][i].activation for i in range(self.shape[L])])

    def getWeightedSum(self, L):
        return np.array([self.network[L][i].wsum for i in range(self.shape[L])])

    def getInputsMatrix(self, L):
        return np.array([self.network[L][i].inputs for i in range(self.shape[L])])

    def calculate(self, values):
        for i in range(self.shape[0]):
            self.network[0][i].calculate(values[i])
        for L in range(1, self.layers):
            indata = self.getLayer(L - 1)
            for j in range(self.shape[L]):
                self.network[L][j].calculate(indata)

    def get_result(self) -> tuple:
        return tuple([self.network[self.layers - 1][i].activation for i in range(self.shape[self.layers - 1])])

    def teach(self, targets, examples):
        if len(targets) != len(examples):
            raise TypeError("The amounts of target and input vectors do not coincide")
        activations = [None for _ in range(len(examples))]
        delta = activations.copy()
        cost_is_low_enough = False
        while not cost_is_low_enough:
            for x in range(len(examples)):
                self.calculate(examples[x])
                activations[x] = [self.getLayer(l) for l in range(self.layers)]
                delta[x] = [None for _ in range(self.layers - 1)]
                network_output = self.getLayer(self.layers - 1)
                output_weighted = self.getWeightedSum(self.layers - 1)
                gradient_vector = np.array(self.gradient(targets[x], network_output))
                delta[x][-1] = gradient_vector * sigmoid_der(output_weighted)
                for l in range(self.layers - 2, 0, -1):
                    weight_matrix = self.getInputsMatrix(l + 1).transpose()
                    output_weighted = self.getWeightedSum(l)
                    activation = self.getLayer(l)
                    for j in range(self.shape[l]):
                        delta[x][l - 1] = (weight_matrix @ delta[x][l]) * sigmoid_der(output_weighted) * activation
            dw = [None for _ in range(self.layers - 1)]
            for x in range(len(examples)):
                self.calculate(examples[x])
                for l in range(self.layers - 1):
                    dw[l] = np.empty(self.shape[l + 1])
                    for j in range(self.shape[l + 1]):
                        dw[l][j] = np.mean([delta[x][l][j] for x in range(len(examples))])
            for l in range(1, self.layers):
                for j in range(self.shape[l]):
                    for k in range(self.shape[l - 1]):
                        self.network[l][j].inputs[k] -= 0.1 * dw[l - 1][j]
            cost = 0
            for x in range(len(examples)):
                self.calculate(examples[x])
                network_output = np.array(self.get_result())
                incost = self.cost_function(targets[x], network_output)
                print(network_output, incost)
                cost += incost
                # sleep(0.05)
            cost /= len(examples)
            print()
            if cost < 0.001: cost_is_low_enough = True

network = NeuralNetwork((2, 4, 1))

examples = np.array([
    [1, 2],
    [3, 4],
])
targets = np.array([
    [0],
    [1]
])

network.teach(targets, examples)

values_1 = np.array([5, 10])
network.calculate(values_1)
result = network.get_result()
print(result)
'''
values_2 = np.array([3, 4])
network.calculate(values_2)
result = network.get_result()
print(result)
'''
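To illustrate the averaging question above with something self-contained: here is a minimal NumPy sketch of batch gradient descent on a single sigmoid unit, using toy OR data rather than the asker's network, where the quantity averaged over the batch is each example's weight gradient (its output delta times that example's input), not the bare delta:

import numpy as np

rng = np.random.default_rng(0)
sigmoid = lambda z: 1 / (1 + np.exp(-z))

# Toy, linearly separable data (OR), purely for illustration.
X = np.array([[0., 0.], [0., 1.], [1., 0.], [1., 1.]])
Y = np.array([[0.], [1.], [1.], [1.]])

W = rng.normal(0.0, 0.5, size=(1, 2))
b = np.zeros(1)

for _ in range(10000):
    grad_W = np.zeros_like(W)
    grad_b = np.zeros_like(b)
    for x, y in zip(X, Y):
        a = sigmoid(W @ x + b)                 # forward pass for one example
        delta = (a - y) * a * (1 - a)          # output delta for this example
        grad_W += np.outer(delta, x) / len(X)  # averaged gradient = delta times this input
        grad_b += delta / len(X)
    W -= 2.0 * grad_W                          # one update per pass over the whole set
    b -= 2.0 * grad_b

for x, y in zip(X, Y):
    print(x, y, sigmoid(W @ x + b))

Because each example's gradient keeps its own input attached, the outputs move toward the individual targets rather than toward their average.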

Gradient Descent returns NaN values for slope and error

I'm new to machine learning and am trying to implement gradient descent. The code I have looks like this and has been resulting in NaN values for all parameters:
from numpy import genfromtxt, array  # numpy functions used below

def compute_error_for_line_given_points(b, m, points):
    totalError = 0  # sum of squared errors
    for i in range(0, len(points)):
        x = points[i, 0]
        y = points[i, 1]
        totalError += (y - (m * x + b)) ** 2
    return totalError / float(len(points))

def step_gradient(b_current, m_current, points, learning_rate):
    # gradient descent
    b_gradient = 0
    m_gradient = 0
    N = float(len(points))
    for i in range(0, len(points)):
        x = points[i, 0]
        y = points[i, 1]
        b_gradient += -(2 / N) * (y - (m_current * x + b_current))
        m_gradient += -(2 / N) * x * (y - (m_current * x + b_current))
    new_b = b_current - (learning_rate * b_gradient)
    new_m = m_current - (learning_rate * m_gradient)
    return [new_b, new_m]

def gradient_descent_runner(points, starting_b, starting_m, learning_rate, num_iterations):
    b = starting_b
    m = starting_m
    for i in range(num_iterations):
        b, m = step_gradient(b, m, array(points), learning_rate)
    return [b, m]

def run():
    # Step 1: Collect the data
    points = genfromtxt("C:/Users/mishruti/Downloads/For Linear Regression.csv", delimiter=",")
    # Step 2: Define our hyperparameters
    learning_rate = 0.0000001  # how quickly the model converges
    # y = mx + b (slope formula)
    initial_b = 0  # initial y-intercept guess
    initial_m = 0  # initial slope guess
    num_iterations = 4
    print("Starting gradient descent at b = {0}, m = {1}, error = {2}".format(initial_b, initial_m, compute_error_for_line_given_points(initial_b, initial_m, points)))
    print("Running...")
    [b, m] = gradient_descent_runner(points, initial_b, initial_m, learning_rate, num_iterations)
    print("After {0} iterations b = {1}, m = {2}, error = {3}".format(num_iterations, b, m, compute_error_for_line_given_points(b, m, points)))

# main function
if __name__ == "__main__":
    run()
A sample from my data set is attached. Can someone please help me figure this out? Thanks!
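One thing worth checking before anything else (an assumption on my part, since the CSV itself is not shown): genfromtxt turns any non-numeric field, such as a header row, into nan, and a single nan in points makes every gradient nan from the first iteration onward. A quick sanity check and filter, using a hypothetical local copy of the file, might look like this:

import numpy as np

points = np.genfromtxt("For Linear Regression.csv", delimiter=",")  # hypothetical path
print("rows containing NaN:", np.isnan(points).any(axis=1).sum())

# Drop rows with NaN (for example, a header line) before running gradient descent.
points = points[~np.isnan(points).any(axis=1)]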

My perceptron can classify OR/AND but it cannot classify NOR/NAND

Here is my code in Python:
def error(y_desired, y):
    return y_desired != y

def step_func(weighted_sum, theta):
    return 1 if ((weighted_sum - theta) >= 0) else 0

def weight_adjustment(error, alpha, x_element):
    return error*alpha*x_element

def weighted_sum(w, x_epoch):
    weighted_sum = 0
    for i in range(len(w)):
        weighted_sum += w[i]*(x_epoch[i])
    return weighted_sum

def perceptron(x, y_desired, w, theta, alpha):
    cond = True
    epochs = 0
    while(cond == True):
        count = 0
        epochs += 1
        print(f'Epoch number - {epochs}')
        for epoch in range(len(x)):
            weighted_sums = round(weighted_sum(w, x[epoch]), 10)
            y = step_func(weighted_sums, theta)
            if error(y_desired[epoch], y):
                count += 1
                for weights in range(len(w)):
                    w[weights] = round(w[weights] + weight_adjustment(error = (y_desired[epoch] - y),
                                                                      alpha = alpha,
                                                                      x_element = x[epoch][weights]), 10)
            print(w)
        print('\n')
        if count == 0:
            cond = False
            print('Final Weights -')
            return w

x = [[0, 0], [0, 1], [1, 0], [1, 1]]
y_desired = [0, 0, 0, 1]
w = [0.3, -0.1]

perceptron(x, y_desired, w, theta = 0.2, alpha = 0.1)
OR/AND works fine, but when I go for [1, 0, 0, 0] it goes into an infinite loop. Conceptually there should be no difference between classifying AND/NAND/OR/NOR; all of them are linearly separable.
Did I miss something fundamental, or is the mistake in the weight training? Where did I make the mistake?
If possible, please share some study material too.
Firstly, you are rounding the weights, which is a bad habit to form.
Secondly, you demand zero error, while classification in general is probability-based.
I recommend https://stackabuse.com/creating-a-neural-network-from-scratch-in-python/, a series of 3 articles that does a decent job of explaining the basics.
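One more observation about the code as posted (separate from the points above): with a fixed threshold theta = 0.2 and only the weights being updated, the output for the input [0, 0] is always 0, because its weighted sum is 0 regardless of the weights, yet NAND and NOR need a 1 there, so the loop can never terminate for those targets. Treating the threshold as a trainable bias removes that limitation. A minimal standalone sketch, not a drop-in patch for the functions above:

def perceptron_with_bias(x, y_desired, w, bias, alpha):
    # Same update rule as above, but the bias (the negative threshold) is trained too,
    # so patterns that must fire on [0, 0], like NAND and NOR, become learnable.
    while True:
        errors = 0
        for xi, target in zip(x, y_desired):
            weighted = sum(wk * xk for wk, xk in zip(w, xi)) + bias
            y = 1 if weighted >= 0 else 0
            if y != target:
                errors += 1
                for k in range(len(w)):
                    w[k] += alpha * (target - y) * xi[k]
                bias += alpha * (target - y)
        if errors == 0:
            return w, bias

print(perceptron_with_bias([[0, 0], [0, 1], [1, 0], [1, 1]], [1, 0, 0, 0],
                           [0.3, -0.1], -0.2, 0.1))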

Prediction always 1 or 0

EDIT: squashing the inputs between 0 and 1 gives me about 0.5 output per neuron per data set.
It seems the output is always 1 for every set of inputs I feed forward after training. However, if I change the learning rate from positive to negative or vice versa, the output is always 0.
import math
import random  # needed for math.exp() and random.random() below

LN = -0.05

def Matrix(numI, numO):
    matrix = []
    for i in range(0, numO):
        matrix.append([])
        for c in range(0, numI):
            if c > numI:
                rw = random.random()
                matrix[i].append(rw)
            else:
                rw = random.random()
                matrix[i].append(rw)
    return matrix

class Neuralnetwork:
    def __init__(self, numI, numO):
        self.Output_layer = Output_layer(numI, numO)
        self.Feed_forward = self.Output_layer.Feed_forward

    def train(self, t_inputs, t_targets):
        for n in range(len(self.Output_layer.Neurons)):
            self.Output_layer.new_weight(t_inputs, t_targets, n)

class Output_layer:
    def __init__(self, numI, numO):
        self.Bias = 1
        self.Matrix = Matrix(numI, numO)
        self.Neurons = []
        for o in range(numO):
            self.Neurons.append(Neuron(self.Matrix, o))

    def Feed_forward(self, inputs):
        outputs = []
        for i in self.Neurons:
            outputs.append(i.Output(inputs, self.Bias))
        print(outputs)

    def new_weight(self, t_inputs, t_targets, a):
        for aw in range(len(self.Neurons[a].Weights)):
            totalsw = []
            totalsb = []
            for i in range(len(t_inputs)):
                pd_c_wrt_output = 2 * (self.Neurons[a].Output(t_inputs[i], self.Bias) - t_targets[i][a])
                pd_output_wrt_net = self.Neurons[a].Output(t_inputs[i], self.Bias) * (1 - self.Neurons[a].Output(t_inputs[i], self.Bias))
                pd_net_wrt_weight = t_inputs[aw][aw]
                pd_c_wrt_weight = pd_c_wrt_output * pd_output_wrt_net * pd_net_wrt_weight
                totalsw.append(pd_c_wrt_weight)

                pd_c_wrt_output = 2 * (self.Neurons[a].Output(t_inputs[i], self.Bias) - t_targets[i][a])
                pd_output_wrt_net = self.Neurons[a].Output(t_inputs[i], self.Bias) * (1 - self.Neurons[a].Output(t_inputs[i], self.Bias))
                pd_net_wrt_bias = 1
                pd_c_wrt_bias = pd_c_wrt_output * pd_output_wrt_net * pd_net_wrt_bias
                totalsb.append(pd_c_wrt_bias)
            pd_weight = sum(totalsw)
            pd_bias = sum(totalsb)
            self.Neurons[a].Weights[aw] -= LN * pd_weight
            self.Bias -= LN * pd_bias

class Neuron:
    def __init__(self, matrix, index_of_M):
        self.Weights = matrix[index_of_M]

    def Weighted_sum(self, weights, inputs, bias):
        ind = 0
        weightedI = []
        for i in weights:
            output = i * inputs[ind]
            weightedI.append(output)
            ind += 1
        list = sum(weightedI) + bias
        return list

    def Sigmoid(self, prediction):
        e = math.exp(-prediction)
        prediction = 1 / (1 + e)
        return round(prediction, 8)

    def Output(self, inputs, bias):
        output = self.Sigmoid(self.Weighted_sum(self.Weights, inputs, bias))
        return output

nn = Neuralnetwork(2, 2)
nn.Feed_forward([10, 20])

for i in range(100000):
    nn.train([[10, 20], [15, 30], [8, 16], [3, 9], [6, 18], [2, 6]],
             [[1, 0], [1, 0], [1, 0], [0, 1], [0, 1], [0, 1]])
My first neural network worked fine, but here I really can't find the bug.
I have tried different things, like putting new_weight in the Neuron class, different numbers of inputs and outputs, etc.
Try setting the weight values to random values; this will help break the symmetry. Also set the biases to 1.
You have two output classes, so I suggest using a loss function like mean squared error with a gradient descent optimizer.
Also set the learning rate to something like 0.001 or 0.01.
You can learn more here.
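A minimal sketch of the setup described in this answer, using standalone names rather than the asker's classes (so it is an illustration of the suggestions, not a drop-in fix): with raw inputs like [10, 20] and weights drawn from [0, 1), the weighted sums are on the order of 10 to 30 and the sigmoid saturates immediately, so the inputs are scaled into [0, 1] first, the weights start at random values centred on zero, the bias starts at 1, and the learning rate is small:

import random

t_inputs = [[10, 20], [15, 30], [8, 16], [3, 9], [6, 18], [2, 6]]
max_value = max(v for row in t_inputs for v in row)
scaled_inputs = [[v / max_value for v in row] for row in t_inputs]  # squash into [0, 1]

weights = [[random.uniform(-1, 1) for _ in range(2)] for _ in range(2)]  # 2 inputs -> 2 output neurons
bias = 1.0
learning_rate = 0.01

print(scaled_inputs)
print(weights, bias, learning_rate)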
