I am trying a simple implementation of a multi-layer perceptron (MLP) using pure NumPy. My previous implementation using RMSE and sigmoid activation at the output (single output) works perfectly with appropriate data. However, when I consider multi-output system (Due to one-hot encoding) with Cross-entropy loss function and softmax activation always fails.
I believe I am doing something wrong with my implementation for gradient calculation but unable to figure it out. So I am here for help.
For the current implementation, I use IRIS dataset for testing the model.
The data for IRIS is obtained as follows:
import numpy as np
from sklearn.datasets import load_iris
from sklearn.preprocessing import minmax_scale
def one_hot_encoder(y):
y_oh = np.zeros((len(y), np.max(y)+1))
for t in np.unique(y):
y_oh[y==t,t] = 1
return y_oh
data = load_iris().data
target = load_iris().target
data_scaled = minmax_scale(data)
target_oh = one_hot_encoder(target)
A Neural network class is defined with a simple 1-hidden layer network as follows:
class NeuralNetwork:
def __init__(self, x, y):
self.x = x
# hidden layer with 16 nodes
self.weights1= np.random.rand(self.x.shape[1],16)
self.bias1 = np.random.rand(16)
# output layer with 3 nodes (for 3 output - One-hot encoded)
self.weights2 = np.random.rand(16,3)
self.bias2 = np.random.rand(3)
self.y = y
self.pred = np.zeros(y.shape)
self.lr = 0.001
def feedforward(self):
self.layer1 = sigmoid(np.dot(self.x, self.weights1) + self.bias1)
self.layer2 = softmax(np.dot(self.layer1, self.weights2) + self.bias2)
# print(self.layer2.shape)
return self.layer2.clip(min=1e-8, max=None)
def backprop(self):
dloss = cross_entropy_derivative(self.pred, self.y) # 2*(self.y - self.pred)
d_weights2 = np.dot(self.layer1.T, dloss*softmax_derivative(self.pred))
d_bias2 = np.dot(np.ones([self.x.shape[0]]), dloss*softmax_derivative(self.pred))
d_weights1 = np.dot(self.x.T, np.dot(dloss*softmax_derivative(self.pred), self.weights2.T)*sigmoid_derivative(self.layer1))
d_bias1 = np.dot(np.ones([self.x.shape[0]]), np.dot(dloss*softmax_derivative(self.pred), self.weights2.T)*sigmoid_derivative(self.layer1))
self.weights1 += self.lr*d_weights1
self.weights2 += self.lr*d_weights2
self.bias1 += self.lr*d_bias1
self.bias2 += self.lr*d_bias2
def train(self, X, y):
self.x = X
self.y = y
self.pred = self.feedforward()
self.backprop()
def predict(self, X):
self.x = X
self.pred = self.feedforward()
return self.pred
def evaluate(self, y, pred):
self.y = y
self.pred = pred
# self.loss = np.sqrt(np.mean((self.pred-self.y)**2))
self.loss = cross_entropy(self.pred, self.y)
return self.loss
The activation functions and their derivatives are computed as follows (I feel there is something wrong here)
# Activation functions
def sigmoid(t):
return 1/(1+np.exp(-t))
# Derivative of sigmoid
def sigmoid_derivative(p):
return p * (1 - p)
# sofmax activation
def softmax(X):
exps = np.exp(X - np.max(X,axis=1).reshape(-1,1))
return exps / np.sum(exps,axis=1)[:,None]
# derivative of softmax
def softmax_derivative(pred):
return pred * (1 -(1 * pred).sum(axis=1)[:,None])
The cross-entropy loss function and its derivatives are as shown below:
def cross_entropy(X,y):
X = X.clip(min=1e-8,max=None)
# print('\n\nCE: ', (np.where(y==1,-np.log(X), 0)).sum(axis=1))
return (np.where(y==1,-np.log(X), 0)).sum(axis=1)
def cross_entropy_derivative(X,y):
X = X.clip(min=1e-8,max=None)
# print('\n\nCED: ', np.where(y==1,-1/X, 0))
return np.where(y==1,-1/X, 0)
The main function call:
NN = NeuralNetwork(data_scaled, target_oh)
for i in range(10000): # trains the NN 10,000 times
NN.train(data_scaled, target_oh)
loss.append(NN.evaluate(NN.y, NN.pred))
y_pred = NN.predict(data_scaled)
The output is approximately constant always predicting a single class. What am I doing wrong? Appreciate your help on the code or directions to look at. Thanks.
subtract the gradient and also derive the unactivated output instead of activated output. read this piece of code that i wrote for more info and watch Sebastian Lagues video about Neural Networks for help about this topic
P.S. The video is not in python, but it explains exactly what 3 years in college tries to explain.
Related
Would I use the weights and Biases that's been saved by the training model?
Below is the code. Do I just call in predict with the new test data?? .predict(X_test)
where X is the input array. I am trying to understand the math and how the testing work for neural networks instead of using TensorFlow or Pytorch.
Thanks!
class AdalineSGD:
def __init__(self,eta = 0.01, n_iter = 10, random_state = None,
shuffle = True):
self.eta = eta
self.n_iter = n_iter
self.random_state = random_state
self.w_initializaed = False
self.shuffle = shuffle
self.random_state = random_state
def fit(self,X,y):
self._initialize_weights(X.shape[1])
self.cost_ = []
for _ in range(self.n_iter):
if self.shuffle:
X,y = self._shuffle(X,y)
cost = []
for xi, target in zip(X,y):
cost.append(self._update_weights(xi,target))
avg_cost = sum(cost) / len(y)
self.cost_.append(avg_cost)
return self
def partial_fit(self,X,y):
if not self.w_initialized:
self._initialize_weights(X.shape[1])
if y.ravel().shape[0] > 1:
for xi,target in zip(X,y):
self._update_weights(xi, target)
else:
self._update_weights(X,y)
return self
def _shuffle(self,X,y):
r = self.rgen.permutation(len(y))
return(X[r], y[r])
def _initialize_weights(self,m):
self.rgen = np.random.RandomState(self.random_state)
self.w_ = self.rgen.normal(loc= 0.0, scale = 0.01,size = 1+m)
self.w_initialized = True
def _update_weights(self,xi,target):
output = self.activation(self.net_input(xi))
error = (target - output)
self.w_[1:] += self.eta * xi.dot(error)
self.w_[0] += self.eta * error
cost = 0.5 * error**2
return cost
def net_input(self,X):
return np.dot(X,self.w_[1:]) + self.w_[0]
def activation(self,X):
return X
def predict(self, X):
return np.where(self.net_input(X) >= 0, 1,0)
Would I use the weights and Biases that's been saved by the training model?
Yes. This happens when you call .fit()
Do I just call in predict with the new test data?? .predict(X_test)
Again, yes
I am trying to understand the math and how the testing work for neural networks
What happens when we train a neural network? We are finding the weights that best approximate a function that minimises the loss for our problem.
Once you have the right weights, which you do by fitting, you can just pass an input through those weights and get a corresponding output.
Below is the code which I tried writing for Logistic Regression:
class LogitReg:
def __init__(self, learning_rate=0.001, n_iters=1000):
self.lr = learning_rate
self.n_iters = n_iters
self.W = None # Weights matric
self.b = None # Bias
def fit(self, X, y):
n_samples, n_features = X.shape
self.W = np.zeros(n_features)
self.b = 0
# gradient descent
for i in range(self.n_iters):
linear_model = np.dot(X, self.W) + self.b # Y = W.X + b :: Predict y_hat with linear combination of weights and X, plus bias
y_predicted = self._sigmoid(linear_model) # Sigmoid function for Binary classification
# compute partial derivatives of weights and bias
dw = (1 / n_samples) * np.dot(X.T, (y_predicted - y)) # formula of partial derivative of weight with respect to loss
db = (1 / n_samples) * np.sum(y_predicted - y) # formula of partial derivative of bias
# update weights abd bias according to the formula
self.W -= self.lr * dw
self.b -= self.lr * db
def predict(self, X, thresh:float = 0.5):
'''
Predict a class based on a threshold
'''
linear_model = np.dot(X, self.W) + self.b
y_predicted = self._sigmoid(linear_model)
y_predicted_cls = [1 if i > thresh else 0 for i in y_predicted]
return np.array(y_predicted_cls)
def _sigmoid(self, x):
'''
Sigmoid activation function
'''
return 1 / (1 + np.exp(-x))
But when I try to plot the boundary, it gives very weird results while the sklearn implementation gives perfect results. could someone please point out the mistake or the reason why it must be happening?
from sklearn.datasets import make_blobs
from sklearn.linear_model import LogisticRegression
import numpy as np
from mlxtend.plotting import plot_decision_regions
X, y = make_blobs(n_samples=150, n_features=2, centers=2, cluster_std=1.05, random_state=2)
p = LogitReg(0.0001, 500)
p.fit(X, y)
p1 = LogisticRegression()
p1.fit(X, y)
plot_decision_regions(X=X, y=y,clf=p) # Perfect plot with scikit implementation
plot_decision_regions(X=X, y=y,clf=p1) # Weird boundary with custom implementation
I followed an article here: TowardsDataScience.
I wrote math equations about the network, everything made sense.
However, after writing the code, results are pretty strange, like it is predicting always same class...
I spent a lot of time on it, changed many things, but I still cannot understand what I did wrong.
Here is the code:
# coding: utf-8
from mnist import MNIST
import numpy as np
import math
import os
import pdb
DATASETS_PREFIX = '../Datasets/MNIST'
mndata = MNIST(DATASETS_PREFIX)
TRAINING_IMAGES, TRAINING_LABELS = mndata.load_training()
TESTING_IMAGES , TESTING_LABELS = mndata.load_testing()
### UTILS
def sigmoid(x):
return 1 / (1 + np.exp(-x))
def d_sigmoid(x):
return x.T * (1 - x)
#return np.dot(x.T, 1.0 - x)
def softmax(x):
e_x = np.exp(x - np.max(x))
return e_x / e_x.sum()
def d_softmax(x):
#This function has not yet been tested.
return x.T * (1 - x)
def tanh(x):
return np.tanh(x)
def d_tanh(x):
return 1 - x.T * x
def normalize(image):
return image / (255.0 * 0.99 + 0.01)
### !UTILS
class NeuralNetwork(object):
"""
This is a 3-layer neural network (1 hidden layer).
#_input : input layer
#_weights1: weights between input layer and hidden layer (matrix shape (input.shape[1], 4))
#_weights2: weights between hidden layer and output layer (matrix shape (4, 1))
#_y : output
#_output : computed output
#_alpha : learning rate
"""
def __init__(self, xshape, yshape):
self._neurones_nb = 20
self._input = None
self._weights1 = np.random.randn(xshape, self._neurones_nb)
self._weights2 = np.random.randn(self._neurones_nb, yshape)
self._y = np.mat(np.zeros(yshape))
self._output = np.mat(np.zeros(yshape))
self._alpha1 = 0.1
self._alpha2 = 0.1
self._function = sigmoid
self._derivative = d_sigmoid
self._epoch = 1
def Train(self, xs, ys):
for j in range(self._epoch):
for i in range(len(xs)):
self._input = normalize(np.mat(xs[i]))
self._y[0, ys[i]] = 1
self.feedforward()
self.backpropagation()
self._y[0, ys[i]] = 0
def Predict(self, image):
self._input = normalize(image)
out = self.feedforward()
return out
def feedforward(self):
self._layer1 = self._function(np.dot(self._input, self._weights1))
self._output = self._function(np.dot(self._layer1, self._weights2))
return self._output
def backpropagation(self):
d_weights2 = np.dot(
self._layer1.T,
2 * (self._y - self._output) * self._derivative(self._output)
)
d_weights1 = np.dot(
self._input.T,
np.dot(
2 * (self._y - self._output) * self._derivative(self._output),
self._weights2.T
) * self._derivative(self._layer1)
)
self._weights1 += self._alpha1 * d_weights1
self._weights2 += self._alpha2 * d_weights2
if __name__ == '__main__':
neural_network = NeuralNetwork(len(TRAINING_IMAGES[0]), 10)
print('* training neural network')
neural_network.Train(TRAINING_IMAGES, TRAINING_LABELS)
print('* testing neural network')
count = 0
for i in range(len(TESTING_IMAGES)):
image = np.mat(TESTING_IMAGES[i])
expected = TESTING_LABELS[i]
prediction = neural_network.Predict(image)
if i % 100 == 0: print(expected, prediction)
#print(f'* results: {count} / {len(TESTING_IMAGES)}')
Thank you for your help, really appreciated.
Julien
Well, I don't see any error in the implementation so considering your network, this could be improved by doing two things :
One epoch is not enough. Like not a all ! You need to pass over your data multiple times (a great minimum is 10 times, average might be around 100 epochs and this could go up to 5000 or more)
You network is a shallow network, e.g. really simple. To detect difficult things (like images), you could implement a CNN (Convolutional Neural Network) or first trying to deepen your network and complexify it
=> Try to add layers (3, 4, 5 etc..) and then add neurons to each layers (50, 60, ..) depending of the size of your input. You can still go up to 800, 900 or more.
My neural network can learn |sin(x)| for [0,pi], but not larger intervals than that. I tried changing the quantity and widths of hidden layers in various ways, but none of the changes leads to a good result.
I train the NN on thousands of random values from a uniform distribution in the chosen interval. using back propagation with gradient descent.
I am starting to think there is a fundamental problem in my network.
For the following examples I used a 1-10-10-1 layer structure:
[0, pi]:
[0, 2pi]:
[0, 4pi]:
Here is the code for the neural network:
import math
import numpy
import random
import copy
import matplotlib.pyplot as plt
def sigmoid(x):
return 1.0/(1+ numpy.exp(-x))
def sigmoid_derivative(x):
return x * (1.0 - x)
class NeuralNetwork:
def __init__(self, weight_dimensions, x=None, y=None):
self.weights = []
self.layers = [[]] * len(weight_dimensions)
self.weight_gradients = []
self.learning_rate = 1
self.layers[0] = x
for i in range(len(weight_dimensions) - 1):
self.weights.append(numpy.random.rand(weight_dimensions[i],weight_dimensions[i+1]) - 0.5)
self.y = y
def feed_forward(self):
# calculate an output using feed forward layer-by-layer
for i in range(len(self.layers) - 1):
self.layers[i + 1] = sigmoid(numpy.dot(self.layers[i], self.weights[i]))
def print_loss(self):
loss = numpy.square(self.layers[-1] - self.y).sum()
print(loss)
def get_weight_gradients(self):
return self.weight_gradients
def apply_weight_gradients(self):
for i in range(len(self.weight_gradients)):
self.weights[i] += self.weight_gradients[i] * self.learning_rate
if self.learning_rate > 0.001:
self.learning_rate -= 0.0001
def back_prop(self):
# find derivative of the loss function with respect to weights
self.weight_gradients = []
deltas = []
output_error = (self.y - self.layers[-1])
output_delta = output_error * sigmoid_derivative(self.layers[-1])
deltas.append(output_delta)
self.weight_gradients.append(self.layers[-2].T.dot(output_delta))
for i in range(len(self.weights) - 1):
i_error = deltas[i].dot(self.weights[-(i+1)].T)
i_delta = i_error * sigmoid_derivative(self.layers[-(i+2)])
self.weight_gradients.append(self.layers[-(i+3)].T.dot(i_delta))
deltas.append(copy.deepcopy(i_delta))
# Unreverse weight gradient list
self.weight_gradients = self.weight_gradients[::-1]
def get_output(self, inp):
self.layers[0] = inp
self.feed_forward()
return self.layers[-1]
def sin_test():
interval = numpy.random.uniform(0, 2*math.pi, int(1000*(2*math.pi)))
x_values = []
y_values = []
for i in range(len(interval)):
y_values.append([abs(math.sin(interval[i]))])
x_values.append([interval[i]])
x = numpy.array(x_values)
y = numpy.array(y_values)
nn = NeuralNetwork([1, 10, 10, 1], x, y)
for i in range(10000):
tmp_input = []
tmp_output = []
mini_batch_indexes = random.sample(range(0, len(x)), 10)
for j in mini_batch_indexes:
tmp_input.append(x[j])
tmp_output.append(y[j])
nn.layers[0] = numpy.array(tmp_input)
nn.y = numpy.array(tmp_output)
nn.feed_forward()
nn.back_prop()
nn.apply_weight_gradients()
nn.print_loss()
nn.layers[0] = numpy.array(numpy.array(x))
nn.y = numpy.array(numpy.array(y))
nn.feed_forward()
axis_1 = []
axis_2 = []
for i in range(len(nn.layers[-1])):
axis_1.append(nn.layers[0][i][0])
axis_2.append(nn.layers[-1][i][0])
true_axis_2 = []
for x in axis_1:
true_axis_2.append(abs(math.sin(x)))
axises = []
for i in range(len(axis_1)):
axises.append([axis_1[i], axis_2[i], true_axis_2[i]])
axises.sort(key=lambda x: x[0], reverse=False)
axis_1_new = []
axis_2_new = []
true_axis_2_new = []
for elem in axises:
axis_1_new.append(elem[0])
axis_2_new.append(elem[1])
true_axis_2_new.append(elem[2])
plt.plot(axis_1_new, axis_2_new, label="nn")
plt.plot(axis_1_new, true_axis_2_new, 'k--', label="sin(x)")
plt.grid()
plt.axis([0, 2*math.pi, -1, 2.5])
plt.show()
sin_test()
The main issue with your network seem to be that you apply the activation function to the final "layer" of your network. The final output of your network should be a linear combination without any sigmoid applied.
As a warning though, do not expect the model to generalize outside of the region included in the training data.
Here is an example in PyTorch:
import torch
import torch.nn as nn
import math
import numpy as np
import matplotlib.pyplot as plt
N = 1000
p = 2.5
x = 2 * p * math.pi * torch.rand(N, 1)
y = np.abs(np.sin(x))
with torch.no_grad():
plt.plot(x.numpy(), y.numpy(), '.')
plt.savefig("training_data.png")
inner = 20
model = nn.Sequential(
nn.Linear(1, inner, bias=True),
nn.Sigmoid(),
nn.Linear(inner, 1, bias=True)#,
#nn.Sigmoid()
)
loss_fn = nn.MSELoss()
learning_rate = 1e-3
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
for t in range(500000):
y_pred = model(x)
loss = loss_fn(y_pred, y)
if t % 1000 == 0:
print("MSE: {}".format(t), loss.item())
model.zero_grad()
loss.backward()
optimizer.step()
with torch.no_grad():
X = torch.arange(0, p * 2 * math.pi, step=0.01).reshape(-1, 1)
Y = model(X)
Y_TRUTH = np.abs(np.sin(X))
print(Y.shape)
print(Y_TRUTH.shape)
loss = loss_fn(Y, Y_TRUTH)
plt.clf()
plt.plot(X.numpy(), Y_TRUTH.numpy())
plt.plot(X.numpy(), Y.numpy())
plt.title("MSE: {}".format(loss.item()))
plt.savefig("output.png")
The output is available here: Image showing neural network prediction and ground truth. The yellow line is the predicted line by the neural network and the blue line is the ground truth.
First and foremost, you've chosen a topology suited for a different class of problems. A simple, fully-connected NN such as this is great with trivial classification (e.g. Boolean operators) or functions with at least two continuous derivatives. You've tried to apply it to a function that is simply one step beyond its capabilities.
Try your model on sin(x) and see how it performs at larger ranges. Try it on max(sin(x), 0). Do you see how the model has trouble with certain periodicity and irruptions? These are an emergent feature of the many linear equations struggling to predict the proper functional value: the linear combinations have trouble emulating non-linearities past a simple level.
I wanted to predict heart disease using backpropagation algorithm for neural networks. For this I used UCI heart disease data set linked here: processed cleveland. To do this, I used the cde found on the following blog: Build a flexible Neural Network with Backpropagation in Python and changed it little bit according to my own dataset. My code is as follows:
import numpy as np
import csv
reader = csv.reader(open("cleveland_data.csv"), delimiter=",")
x = list(reader)
result = np.array(x).astype("float")
X = result[:, :13]
y0 = result[:, 13]
y1 = np.array([y0])
y = y1.T
# scale units
X = X / np.amax(X, axis=0) # maximum of X array
class Neural_Network(object):
def __init__(self):
# parameters
self.inputSize = 13
self.outputSize = 1
self.hiddenSize = 13
# weights
self.W1 = np.random.randn(self.inputSize, self.hiddenSize)
self.W2 = np.random.randn(self.hiddenSize, self.outputSize)
def forward(self, X):
# forward propagation through our network
self.z = np.dot(X, self.W1)
self.z2 = self.sigmoid(self.z) # activation function
self.z3 = np.dot(self.z2, self.W2)
o = self.sigmoid(self.z3) # final activation function
return o
def sigmoid(self, s):
# activation function
return 1 / (1 + np.exp(-s))
def sigmoidPrime(self, s):
# derivative of sigmoid
return s * (1 - s)
def backward(self, X, y, o):
# backward propgate through the network
self.o_error = y - o # error in output
self.o_delta = self.o_error * self.sigmoidPrime(o) # applying derivative of sigmoid to error
self.z2_error = self.o_delta.dot(
self.W2.T) # z2 error: how much our hidden layer weights contributed to output error
self.z2_delta = self.z2_error * self.sigmoidPrime(self.z2) # applying derivative of sigmoid to z2 error
self.W1 += X.T.dot(self.z2_delta) # adjusting first set (input --> hidden) weights
self.W2 += self.z2.T.dot(self.o_delta) # adjusting second set (hidden --> output) weights
def train(self, X, y):
o = self.forward(X)
self.backward(X, y, o)
NN = Neural_Network()
for i in range(100): # trains the NN 100 times
print("Input: \n" + str(X))
print("Actual Output: \n" + str(y))
print("Predicted Output: \n" + str(NN.forward(X)))
print("Loss: \n" + str(np.mean(np.square(y - NN.forward(X))))) # mean sum squared loss
print("\n")
NN.train(X, y)
But when I run this code, my all predicted outputs become = 1 after few iterations and then stays the same for up to all 100 iterations. what is the problem in the code?
Few mistakes that I've noticed:
The output of your network is a sigmoid, i.e. a value between [0, 1] -- suits for predicting probabilities. But the target seems to be a value between [0, 4]. This explains the desire of the network to maximize the output to get as close as possible to large labels. But it can't go more than 1.0 and gets stuck.
You should either get rid of the final sigmoid or pre-process the label and scale it to [0, 1]. Both options will make it learn better.
You don't use the learning rate (effectively setting it to 1.0), which is probably a bit high, so it's possible for the NN to diverge. My experiments showed that 0.01 is a good learning rate, but you can play around with that.
Other than this, your backprop seems working right.