I'm trying to make my NN recognize "happy" and "sad" faces, hand-drawn on a 20x20 canvas. I'm using 400 input layer units, 100 hidden layer units, and 2 output units (happy/sad). The activation function for both the hidden and output layers is the sigmoid.
I'm training the neural network progressively: I manually draw a face, select whether it's happy or sad, and feed the training sample to the neural network. Also, as I draw, the program shows the NN output dynamically.
The problem is that as soon as I add a bunch of happy faces in a row, the output for "happy" rises to almost 1. But if I then start adding sad faces, the output flips to happy=0.0241... sad=0.9988...
Sometimes both outputs rise to almost one (happy = 0.99 / sad = 0.99).
It seems like it might be expected to work that way, but to test the neural network I started painting in the upper half of the canvas for HAPPY faces and in the lower half for SAD faces. The same thing happens (the output leans toward the latest samples), yet after adding around 50 training examples the outputs don't react at all to whether I paint in the lower or upper half.
I don't understand whether I'm building the NN correctly or feeding the data correctly (it's just a 400-element array with either 0.01 (white) or 0.99 (black)).
How can I prevent this behavior?
Neural Network class:
import scipy.special
import numpy


class NeuralNetwork:
    def __init__(self, inputnodes, hiddennodes, outputnodes, learningrate):
        # set number of nodes in each input, hidden, output layer
        self.inodes = inputnodes
        self.hnodes = hiddennodes
        self.onodes = outputnodes
        # learning rate
        self.lr = learningrate
        # activation function
        self.activation_function = lambda x: scipy.special.expit(x)
        # link weight matrices, wih and who
        self.wih = numpy.random.normal(0.0, pow(self.hnodes, -0.5), (self.hnodes, self.inodes))
        self.who = numpy.random.normal(0.0, pow(self.onodes, -0.5), (self.onodes, self.hnodes))

    # train the neural network
    def train(self, inputs_list, targets_list):
        # convert inputs list to 2d array
        inputs = numpy.array(inputs_list, ndmin=2).T
        targets = numpy.array(targets_list, ndmin=2).T
        # calculate signals into hidden layer
        hidden_inputs = numpy.dot(self.wih, inputs)
        # calculate the signals emerging from hidden layer
        hidden_outputs = self.activation_function(hidden_inputs)
        # calculate signals into final output layer
        final_inputs = numpy.dot(self.who, hidden_outputs)
        # calculate signals emerging from final output layer
        final_outputs = self.activation_function(final_inputs)
        # error is the (target - actual)
        output_errors = targets - final_outputs
        # hidden layer error is the output_errors, split by weights, recombined at hidden nodes
        hidden_errors = numpy.dot(self.who.T, output_errors)
        # update the weights for the links between the hidden and output layers
        self.who += self.lr * numpy.dot((output_errors * final_outputs * (1.0 - final_outputs)),
                                        numpy.transpose(hidden_outputs))
        # update the weights for the links between the input and hidden layers
        self.wih += self.lr * numpy.dot((hidden_errors * hidden_outputs * (1.0 - hidden_outputs)),
                                        numpy.transpose(inputs))

    def query(self, inputs_list):
        # convert inputs list to 2d array
        inputs = numpy.array(inputs_list, ndmin=2).T
        # calculate signals into hidden layer
        hidden_inputs = numpy.dot(self.wih, inputs)
        # calculate the signals emerging from the hidden layer
        hidden_outputs = self.activation_function(hidden_inputs)
        # calculate signals into final output layer
        final_inputs = numpy.dot(self.who, hidden_outputs)
        # calculate the signals emerging from final output layer
        final_outputs = self.activation_function(final_inputs)
        return final_outputs
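For reference, here is a minimal sketch of how this class is used (the sizes match my setup; the random array is just a stand-in for one scaled 20x20 drawing):

import numpy
from nn import NeuralNetwork

n = NeuralNetwork(400, 100, 2, 0.3)
sample = numpy.random.rand(400) * 0.98 + 0.01   # stand-in for one scaled drawing
n.train(sample, [1, 0])                         # target: happy
print(n.query(sample))                          # 2x1 array of sigmoid outputs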
Main code:
from tkinter import *
import numpy
# scipy.special for the sigmoid function expit()
import scipy.special
# library for plotting arrays
import matplotlib.pyplot
from nn import *

root = Tk()
frame = Frame(root)
w = Canvas(frame, width=400, height=400, background="green")
w.pack()

canvasRectangles = []
for i in range(20):
    ls = []
    for k in range(20):
        x = 20 * i
        y = 20 * k
        ls.append(w.create_rectangle(x, y, x + 20, y + 20, fill="white"))
    canvasRectangles.append(ls)

label = Label(frame, text='Number pressed: N/A')
label.pack()
hdnLayer1Label = Label(frame, text="Hidden Layer 1", justify=LEFT, wraplength=300)
hdnLayer1Label.pack()
outLayerLabel = Label(frame, text="Output Layer", justify=LEFT, wraplength=300)
outLayerLabel.pack()


def clearCanvas():
    for i in range(20):
        for k in range(20):
            w.itemconfig(canvasRectangles[i][k], fill="white")


def key(event):
    if event.char == 'r':   # '==', not 'is': identity checks on strings are unreliable
        clearCanvas()
    else:
        if event.char == '1':
            label.config(text="SMILE: Happy")
        if event.char == '2':
            label.config(text="SMILE: Sad")
        global number
        number = event.char


def initNN():
    input_nodes = 400
    hidden_nodes = 100
    output_nodes = 2
    learning_rate = 0.3
    global n
    n = NeuralNetwork(input_nodes, hidden_nodes, output_nodes, learning_rate)


def queryNN(rectangles):
    unscaledInputs = getRectangleValues(rectangles)
    arr = numpy.array(unscaledInputs)
    scaledInputs = scaleInputs(arr)
    res = n.query(scaledInputs)
    return res


def trainNN(rectangles, desiredOuts):
    inputs = getRectangleValues(rectangles)
    arr = numpy.array(inputs)
    # Scale the inputs
    scaledInputs = scaleInputs(arr)
    global n
    n.train(scaledInputs, desiredOuts)


def getRectangleValues(rectangles):
    rectvals = []
    for col in range(20):
        for row in range(20):
            if w.itemcget(rectangles[col][row], "fill") == "black":
                rectvals.append(0)
            else:
                rectvals.append(1)
    return rectvals


def tab(event):
    desiredOuts = [0, 0]
    if number == '1':
        desiredOuts[0] = 1
        print("desiredSmile= HAPPY")
    if number == '2':
        desiredOuts[1] = 1
        print("desiredSmile= SAD")
    print(desiredOuts)
    trainNN(canvasRectangles, desiredOuts)
    clearCanvas()


def draw(event):
    x = event.x
    y = event.y
    if 0 <= x <= 400 and 0 <= y <= 400:
        squarex = int(translate(x, 0, 400, 0, 20))
        squarey = int(translate(y, 0, 400, 0, 20))
        # both indices must stay below 20, otherwise canvasRectangles[20] raises IndexError
        if squarex != 20 and squarey != 20:
            w.itemconfig(canvasRectangles[squarex][squarey], fill="black")
            outLayerLabel.config(text="Output: " + repr(queryNN(canvasRectangles)))


def translate(value, leftMin, leftMax, rightMin, rightMax):
    # Figure out how 'wide' each range is
    leftSpan = leftMax - leftMin
    rightSpan = rightMax - rightMin
    # Convert the left range into a 0-1 range (float)
    valueScaled = float(value - leftMin) / float(leftSpan)
    # Convert the 0-1 range into a value in the right range.
    return rightMin + (valueScaled * rightSpan)


def scaleInputs(unscaledins):
    return (numpy.asfarray(unscaledins) / 1.0 * 0.99) + 0.01


initNN()
w.bind("<Button-1>", draw)
w.bind("<B1-Motion>", draw)
frame.bind("1", key)
frame.bind("2", key)
frame.bind("r", key)
frame.bind("<Tab>", tab)
frame.pack()
frame.focus_set()
# root.after(0, doWork)
root.mainloop()
If anyone else has a similar problem, I solved it by reducing the canvas resolution (from 20x20 to 7x7). I guess such a high resolution needs many more training samples.
I also made a training set and iterated through it several times (about 8 epochs did the trick for me).
The solution is simple: the more complex your inputs are, the more training you need.
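In code, the retraining that worked looks roughly like this (a sketch; train_set is assumed to be the list of (scaledInputs, desiredOuts) pairs collected from the canvas):

import random

def train_epochs(n, train_set, epochs=8):
    for _ in range(epochs):
        random.shuffle(train_set)  # break the order bias toward the latest samples
        for inputs, targets in train_set:
            n.train(inputs, targets)

Shuffling each epoch is what stops the outputs from simply leaning toward whichever class was fed last.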
Related
I am new to machine learning in general and to PyTorch, so I apologize if my terminology is incorrect. I am trying to understand the code that was used to train a temporally dependent VAE based on this paper. I am trying to follow the architecture of the model based on the answers here. The torchviz answer is not working for me, but torchview is. The issue is that it only gives me the architecture included in the forward function (i.e. the PreProcess and LSTM parts of the code), as shown in the image below. I have another function which is used to calculate the loss, and I would like to be able to generate a similar flow chart following the input and output dimensions for that part of the loss function (DBlock in the code below). Is this possible to visualize?
import numpy as np
import torch
import torch.nn as nn


class DBlock(nn.Module):
    """A basic building block for parameterizing a normal distribution.
    It corresponds to the D operation in the reference Appendix."""

    def __init__(self, input_size, hidden_size, output_size):
        super(DBlock, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(input_size, hidden_size)
        self.fc_mu = nn.Linear(hidden_size, output_size)
        self.fc_logsigma = nn.Linear(hidden_size, output_size)

    def forward(self, input):
        t = torch.tanh(self.fc1(input))
        t = t * torch.sigmoid(self.fc2(input))
        mu = self.fc_mu(t)
        logsigma = self.fc_logsigma(t)
        return mu, logsigma


class PreProcess(nn.Module):
    """The pre-process layer for the MNIST image."""

    def __init__(self, input_size, processed_x_size):
        super(PreProcess, self).__init__()
        self.input_size = input_size
        self.fc1 = nn.Linear(input_size, processed_x_size)
        self.fc2 = nn.Linear(processed_x_size, processed_x_size)

    def forward(self, input):
        t = torch.relu(self.fc1(input))
        t = torch.relu(self.fc2(t))
        return t


class Decoder(nn.Module):
    """The decoder layer converting state to observation.
    Because the observation is an MNIST image whose elements are values
    between 0 and 1, the outputs of this layer are the probabilities of
    elements being 1."""

    def __init__(self, z_size, hidden_size, x_size):
        super(Decoder, self).__init__()
        self.fc1 = nn.Linear(z_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, x_size)

    def forward(self, z):
        t = torch.tanh(self.fc1(z))
        t = torch.tanh(self.fc2(t))
        p = torch.sigmoid(self.fc3(t))
        return p


class TD_VAE(nn.Module):
    """The full TD_VAE model with jumpy prediction.

    First, let's go through some definitions that help in understanding
    what is going on in the following code.

    Belief: As the model is fed a sequence of observations, x_t, the
        model updates its belief state, b_t, through an LSTM network.
        It is a deterministic function of x_t. We call b_t the belief
        at time t instead of the belief state, because we call the
        hidden state z the state.
    State: The latent state variable, z.
    Observation: The observed variable, x. In this case, it represents
        binarized MNIST images.
    """

    def __init__(self, x_size, processed_x_size, b_size, z_size):
        super(TD_VAE, self).__init__()
        self.x_size = x_size
        self.processed_x_size = processed_x_size
        self.b_size = b_size
        self.z_size = z_size

        ## input pre-process layer
        self.process_x = PreProcess(self.x_size, self.processed_x_size)

        ## one-layer LSTM for aggregating belief states
        ## (one layer is used here; I am not sure how many layers
        ## are used in the original paper)
        self.lstm = nn.LSTM(input_size=self.processed_x_size,
                            hidden_size=self.b_size,
                            batch_first=True)

        ## A two-layer state model is used:
        ## belief to state (b to z)
        ## (this corresponds to the P_B distribution in the reference;
        ## weights are shared across time but not across layers.)
        self.l2_b_to_z = DBlock(b_size, 50, z_size)  # layer 2
        # TODO: input size is to clean, what does this mean?
        self.l1_b_to_z = DBlock(b_size + z_size, 50, z_size)  # layer 1

        ## Given belief and state at time t2, infer the state at time t1
        self.l2_infer_z = DBlock(b_size + 2*z_size, 50, z_size)  # layer 2
        self.l1_infer_z = DBlock(b_size + 2*z_size + z_size, 50, z_size)  # layer 1

        ## Given the state at time t1, model the state at time t2 through the state transition
        self.l2_transition_z = DBlock(2*z_size, 50, z_size)
        self.l1_transition_z = DBlock(2*z_size + z_size, 50, z_size)

        ## state to observation
        self.z_to_x = Decoder(2*z_size, 200, x_size)

    def forward(self, images):
        self.batch_size = images.size()[0]
        self.x = images
        ## pre-process image x
        self.processed_x = self.process_x(self.x)
        ## aggregate the belief b
        # TODO: are h_n and c_n used internally by pytorch?
        self.b, (h_n, c_n) = self.lstm(self.processed_x)

    def calculate_loss(self, t1, t2):
        """
        Calculate the jumpy TD-VAE loss, which corresponds to
        equations (6) and (8) in the reference.
        """
        ## Because the loss is based on variational inference, we need to
        ## draw samples from the variational distribution in order to estimate
        ## the loss function.

        ## sample a state at time t2 (note the reparameterization trick is used)
        ## z in layer 2
        t2_l2_z_mu, t2_l2_z_logsigma = self.l2_b_to_z(self.b[:, t2, :])
        t2_l2_z_epsilon = torch.randn_like(t2_l2_z_mu)
        t2_l2_z = t2_l2_z_mu + torch.exp(t2_l2_z_logsigma)*t2_l2_z_epsilon

        ## z in layer 1
        t2_l1_z_mu, t2_l1_z_logsigma = self.l1_b_to_z(
            torch.cat((self.b[:, t2, :], t2_l2_z), dim=-1))
        t2_l1_z_epsilon = torch.randn_like(t2_l1_z_mu)
        t2_l1_z = t2_l1_z_mu + torch.exp(t2_l1_z_logsigma)*t2_l1_z_epsilon

        ## concatenate z from layer 1 and layer 2
        t2_z = torch.cat((t2_l1_z, t2_l2_z), dim=-1)

        ## sample a state at time t1
        ## infer state at time t1 based on states at time t2
        t1_l2_qs_z_mu, t1_l2_qs_z_logsigma = self.l2_infer_z(
            torch.cat((self.b[:, t1, :], t2_z), dim=-1))
        t1_l2_qs_z_epsilon = torch.randn_like(t1_l2_qs_z_mu)
        t1_l2_qs_z = t1_l2_qs_z_mu + torch.exp(t1_l2_qs_z_logsigma)*t1_l2_qs_z_epsilon

        t1_l1_qs_z_mu, t1_l1_qs_z_logsigma = self.l1_infer_z(
            torch.cat((self.b[:, t1, :], t2_z, t1_l2_qs_z), dim=-1))
        t1_l1_qs_z_epsilon = torch.randn_like(t1_l1_qs_z_mu)
        t1_l1_qs_z = t1_l1_qs_z_mu + torch.exp(t1_l1_qs_z_logsigma)*t1_l1_qs_z_epsilon

        t1_qs_z = torch.cat((t1_l1_qs_z, t1_l2_qs_z), dim=-1)

        #### After sampling states z from the variational distribution, we can calculate
        #### the loss.

        ## state distribution at time t1 based on belief at time t1
        t1_l2_pb_z_mu, t1_l2_pb_z_logsigma = self.l2_b_to_z(self.b[:, t1, :])
        t1_l1_pb_z_mu, t1_l1_pb_z_logsigma = self.l1_b_to_z(
            torch.cat((self.b[:, t1, :], t1_l2_qs_z), dim=-1))

        ## state distribution at time t2 based on states at time t1 and the state transition
        t2_l2_t_z_mu, t2_l2_t_z_logsigma = self.l2_transition_z(t1_qs_z)
        t2_l1_t_z_mu, t2_l1_t_z_logsigma = self.l1_transition_z(
            torch.cat((t1_qs_z, t2_l2_z), dim=-1))

        ## observation distribution at time t2 based on state at time t2
        t2_x_prob = self.z_to_x(t2_z)

        #### start calculating the loss

        #### KL divergence between the z distribution at time t1 based on the variational
        #### distribution (inference model) and the z distribution at time t1 based on the
        #### belief. This divergence is between two normal distributions and can be
        #### calculated analytically.

        ## KL divergence between t1_l2_pb_z and t1_l2_qs_z
        loss = 0.5*torch.sum(((t1_l2_pb_z_mu - t1_l2_qs_z)/torch.exp(t1_l2_pb_z_logsigma))**2, -1) + \
            torch.sum(t1_l2_pb_z_logsigma, -1) - torch.sum(t1_l2_qs_z_logsigma, -1)

        ## KL divergence between t1_l1_pb_z and t1_l1_qs_z
        loss += 0.5*torch.sum(((t1_l1_pb_z_mu - t1_l1_qs_z)/torch.exp(t1_l1_pb_z_logsigma))**2, -1) + \
            torch.sum(t1_l1_pb_z_logsigma, -1) - torch.sum(t1_l1_qs_z_logsigma, -1)

        #### The following four terms estimate the KL divergence between
        #### the z distribution at time t2 based on the variational distribution
        #### (inference model) and the z distribution at time t2 based on the transition.
        #### In contrast to the above KL divergence for the z distribution at time t1,
        #### this KL divergence cannot be calculated analytically because
        #### the transition distribution depends on z_t1, which is sampled after z_t2.
        #### Therefore, the KL divergence is estimated using samples.

        ## state log probability at time t2 based on belief
        loss += torch.sum(-0.5*t2_l2_z_epsilon**2 - 0.5*t2_l2_z_epsilon.new_tensor(2*np.pi) - t2_l2_z_logsigma, dim=-1)
        loss += torch.sum(-0.5*t2_l1_z_epsilon**2 - 0.5*t2_l1_z_epsilon.new_tensor(2*np.pi) - t2_l1_z_logsigma, dim=-1)

        ## state log probability at time t2 based on transition
        loss += torch.sum(0.5*((t2_l2_z - t2_l2_t_z_mu)/torch.exp(t2_l2_t_z_logsigma))**2 + 0.5*t2_l2_z.new_tensor(2*np.pi) + t2_l2_t_z_logsigma, -1)
        loss += torch.sum(0.5*((t2_l1_z - t2_l1_t_z_mu)/torch.exp(t2_l1_t_z_logsigma))**2 + 0.5*t2_l1_z.new_tensor(2*np.pi) + t2_l1_t_z_logsigma, -1)

        ## observation probability at time t2
        loss += -torch.sum(self.x[:, t2, :]*torch.log(t2_x_prob) + (1 - self.x[:, t2, :])*torch.log(1 - t2_x_prob), -1)

        loss = torch.mean(loss)
        return loss
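If torchview's draw_graph will accept a bare submodule (it traces whatever module it is handed), one way to chart the loss path that never appears in forward() is to visualize DBlock standalone; a sketch under that assumption, with illustrative sizes rather than the paper's:

import torch
from torchview import draw_graph

# hypothetical sizes; substitute the b_size / z_size your TD_VAE actually uses
dblock = DBlock(input_size=50, hidden_size=50, output_size=8)
graph = draw_graph(dblock, input_size=(1, 50))
graph.visual_graph  # displays in a notebook; graph.visual_graph.render("dblock") writes a file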
Recently I've been trying to create a neural network from scratch in Python, using the matrix approach with numpy.
My neural network has 1 input layer, 1 hidden layer with 10 nodes, and 1 output layer with 1 node. The activation function is the same for every layer: ReLU.
The cost function is based on the mean squared error.
However, at the first iteration the correction to the weight/bias matrices contains very big numbers: the first correction fills the bias/weight matrices with only negative numbers, so that at the next iterations the prediction for every sample in the dataset is 0 because of the ReLU function.
So the implementation doesn't seem to work completely right. I guess that maybe I'm wrong about the derivative of the cost function.
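For reference, the MSE derivative with respect to the prediction matrix is element-wise, one entry per sample; a minimal sketch in the notation used below (the key point is that it is an array the same shape as Y_hat, not a summed scalar):

import numpy as np

def dMSE_dY_hat(Y_hat, Y):
    # for L = (1/2) * sum((Y - Y_hat)**2): dL/dY_hat = (Y_hat - Y),
    # kept per sample, NOT np.sum(Y_hat - Y) collapsed to one number
    return Y_hat - Y.reshape(Y_hat.shape)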
from os import sep
import numpy as np
import pandas as pd
# from sklearn.model_selection import train_test_split
from src.NeuralNetwork import NeuralNetwork

# read training dataset
# 11 categories => 11 inputs ; 1 output (wine quality)
red_wine_dataset = pd.read_csv('./dataset/winequality-red.csv', sep=";").to_numpy()

# get dimensions: n lines / m columns
n, m = red_wine_dataset.shape

# number of different wines in the dataset
# we split the dataset in two, so we have n/2
NUMBER_OF_DATA = int(n / 2)
# number of categories = number of inputs = m - 1 (because 1 column is the true output = y)
NUMBER_OF_NODES_INPUTS = m - 1
# number of nodes for hidden layers
NUMBER_OF_NODES_HIDDEN_LAYER = 10
# number of nodes for output layer
NUMBER_OF_NODES_OUTPUT_LAYER = 1
# number of iterations for the algorithm
EPOCH = 1000
# learning rate (eta)
LEARNING_RATE = 0.05

# split dataset into two: a training dataset / a testing dataset
training_dataset = red_wine_dataset[0:NUMBER_OF_DATA]
training_inputs = training_dataset[:, 0:m - 1]  # wine classifying categories (acidity, sugar, ph, ...)
training_output = training_dataset[:, m - 1]    # wine quality
# print(training_output)

testing_dataset = red_wine_dataset[NUMBER_OF_DATA:n]
testing_inputs = testing_dataset[:, 0:m - 1]  # wine classifying categories; note the column slice
testing_output = testing_dataset[:, m - 1]    # wine quality

# architecture of the neural network
# number of nodes by layer
layers = np.array([NUMBER_OF_NODES_INPUTS, NUMBER_OF_NODES_HIDDEN_LAYER, NUMBER_OF_NODES_OUTPUT_LAYER])

if __name__ == "__main__":
    NN = NeuralNetwork(dataset=training_dataset, layers=layers)
    NN.gradient_descent(X=training_inputs, Y=training_output, epoch=EPOCH, learning_rate=LEARNING_RATE)
import numpy as np
import matplotlib.pyplot as plt
from src.utils import ReLU, ReLU_derivative, mean_squared_error


class NeuralNetwork:
    def __init__(self, dataset, layers) -> None:
        """
        NOTATION:
        X = training input (A0)
        Y = training output (y_true)
        Y_hat = predicted output (y_pred) = activated output associated with the last layer (the output layer)
        Wi = weight matrix associated with the i-th layer
        Bi = bias matrix associated with the i-th layer
        Zi = (A_{i-1} \cdot Wi) + Bi = output matrix associated with the i-th layer
        Ai = f(Zi) = activated output associated with the i-th layer, where f is the activation function (ex: ReLU)
        L = loss function (ex: MSE)

        Args:
            dataset: the training dataset
            layers: the architecture (number of nodes per layer)
        """
        # parameters of the neural network
        self.parameters = {}
        # partial derivatives to update the parameters (weight, bias) of the neural network
        self.derivatives = {}
        # index of the last layer (the output layer):
        # number of layers - 1, because numbering begins at 0
        self.N = layers.size - 1
        # number of samples in the dataset
        self.m = dataset[:, 0].size
        # print(dataset[0].size)
        # initialize neural network parameters
        for i in range(1, self.N + 1):
            self.parameters[f"W{str(i)}"] = np.random.uniform(size=(layers[i - 1], layers[i]))
            self.parameters[f"B{str(i)}"] = np.random.uniform(size=(self.m, layers[i]))
            self.parameters[f"Z{str(i)}"] = np.ones((self.m, layers[i]))
            self.parameters[f"A{str(i)}"] = np.ones((self.m, layers[i]))
        # initialize cost function value
        self.parameters["C"] = 1

    def forward_propagate(self, X):
        # initialize the neural network with the input dataset
        self.parameters["A0"] = X
        # forward propagate through each subsequent layer
        for i in range(1, self.N + 1):
            # Z^i = (A^{i-1} \cdot W^i) + B^i
            Zi = (self.parameters[f"A{str(i-1)}"] @ self.parameters[f"W{str(i)}"]) + self.parameters[f"B{str(i)}"]
            self.parameters[f"Z{str(i)}"] = Zi
            # A^i = f(Z^i)
            print(f"A{str(i-1)}", self.parameters[f"A{str(i-1)}"])
            print(f"W{str(i)}", self.parameters[f"W{str(i)}"])
            print(f"B{str(i)}", self.parameters[f"B{str(i)}"])
            self.parameters[f"A{str(i)}"] = ReLU(Zi)
            # print(ReLU(Zi))
        # for key, value in self.parameters.items():
        #     if type(value) is int:
        #         print(key, value)
        #     else:
        #         print(key, value.shape)
        # print("================================")

    def backward_propagate(self, X, Y):
        # compute the derivatives of our loss function, going backward
        # partial derivatives for the last layer
        dL_dAN = np.sum(self.parameters[f"A{str(self.N)}"] - Y)
        # Hadamard product: "*"
        dL_dZN = dL_dAN * ReLU_derivative(self.parameters[f"Z{str(self.N)}"])
        # print(dL_dZN.T.shape, self.parameters[f"A{str(self.N - 1)}"].shape)
        dL_dWN = dL_dZN.T @ self.parameters[f"A{str(self.N - 1)}"]
        dL_dBN = dL_dZN
        self.derivatives[f"dLdZ{str(self.N)}"] = dL_dZN
        self.derivatives[f"dLdW{str(self.N)}"] = dL_dWN.T
        self.derivatives[f"dLdB{str(self.N)}"] = dL_dBN
        # partial derivatives for the preceding layers
        for i in range(self.N - 1, 0, -1):
            # Hadamard product:
            # "*" multiplies entries one by one => shape: (m, n) * (m, n) for any m, n
            dL_dZi = (self.derivatives[f"dLdZ{str(i + 1)}"] @ self.parameters[f"W{str(i + 1)}"].T) * ReLU_derivative(self.parameters[f"Z{str(i)}"])
            dL_dWi = self.parameters[f"A{str(i - 1)}"].T @ dL_dZi
            dL_dBi = dL_dZi
            self.derivatives[f"dLdZ{str(i)}"] = dL_dZi
            self.derivatives[f"dLdW{str(i)}"] = dL_dWi
            self.derivatives[f"dLdB{str(i)}"] = dL_dBi

    def update_weights_and_bias(self, learning_rate):
        for i in range(1, self.N + 1):
            self.parameters[f"W{str(i)}"] -= learning_rate * self.derivatives[f"dLdW{str(i)}"]
            self.parameters[f"B{str(i)}"] -= learning_rate * self.derivatives[f"dLdB{str(i)}"]

    def gradient_descent(self, X, Y, epoch, learning_rate):
        cost_history = []
        for i in range(epoch):
            if i == 2:  # debugging: stop after two iterations
                return
            self.forward_propagate(X)
            self.backward_propagate(X, Y)
            self.update_weights_and_bias(learning_rate)
            cost = mean_squared_error(Y=Y, Y_hat=self.parameters[f"A{str(self.N)}"])
            cost_history.append(cost)
            # print(cost)
            # if (i % 10 == 0):
            #     print(f"iteration: {i}")
            #     predictions = np.argmax(self.parameters[f"A{str(self.N)}"], 0)
            #     print(predictions, Y)
            #     print(np.sum(predictions == Y) / Y.size)
        plt.plot(range(epoch), cost_history)
        plt.show()
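Not part of the original post, but two common remedies for huge first-iteration corrections with ReLU are standardizing the inputs and using small, zero-centered initial weights instead of np.random.uniform (which is all-positive); a sketch:

import numpy as np

def standardize(X):
    # zero mean, unit variance per feature column
    return (X - X.mean(axis=0)) / (X.std(axis=0) + 1e-8)

def he_init(fan_in, fan_out):
    # He initialization: zero-centered with variance 2/fan_in, suited to ReLU layers
    return np.random.randn(fan_in, fan_out) * np.sqrt(2.0 / fan_in)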
import numpy as np


# useful activation functions for hidden layer 1 / output layer

def sigmoid(x):
    """:param x: np.array"""
    # logistic function
    return 1 / (1 + np.exp(-x))


def ReLU(x):
    """:param x: np.array"""
    return np.maximum(0, x)


def ReLU_derivative(x):
    return x > 0


# loss function

def mean_squared_error(Y, Y_hat):
    """
    :param Y: y_true, np.array
    :param Y_hat: y_pred, np.array
    J(y_true, y_pred) = 1/n \sum_{i=1}^{n} (y_true - y_pred)^2
    """
    # print("Y", Y)
    # print("Y_hat", np.sum(Y_hat), Y_hat.shape)
    # print("somme", sum((Y - Y_hat) ** 2))
    return (1 / 2) * np.sum((Y - Y_hat) ** 2)
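If the cost derivative is in doubt, a numerical gradient check settles it: perturb one weight at a time and compare the finite-difference slope against the backprop value. A minimal sketch (loss_fn is any zero-argument callable that runs the forward pass and returns the scalar cost):

import numpy as np

def numerical_gradient(loss_fn, W, eps=1e-5):
    # central-difference estimate of dL/dW, entry by entry
    grad = np.zeros_like(W)
    it = np.nditer(W, flags=['multi_index'])
    while not it.finished:
        idx = it.multi_index
        old = W[idx]
        W[idx] = old + eps
        loss_plus = loss_fn()
        W[idx] = old - eps
        loss_minus = loss_fn()
        W[idx] = old  # restore the original weight
        grad[idx] = (loss_plus - loss_minus) / (2 * eps)
        it.iternext()
    return grad

Comparing the result against the corresponding dLdW entries with np.allclose pinpoints exactly which derivative is wrong.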
I am writing an LSTM sequence classifier from scratch (no ML library).
I first tried a classical RNN, which I converted from a many-to-many model to a many-to-one model, with forward propagation looking like this:
def rnn_forward(inputs, rnnNet):
    fw_cache = []
    hidden_state = np.zeros((rnnNet.d[0], 1))
    for t in range(len(inputs)):
        hidden_state = cm.tanh(np.dot(rnnNet.p['U'], inputs[t]) + np.dot(rnnNet.p['V'], hidden_state) + rnnNet.p['b_h'])
        fw_cache.append(hidden_state.copy())
    outputs = cm.softmax(np.dot(rnnNet.p['W'], hidden_state) + rnnNet.p['b_o'], rnn=True)
    return outputs, fw_cache
I could rewrite my parameter dimensions accordingly, and this works as expected.
However, I struggle to do the same thing with an LSTM network. Below is the forward prop:
def lstm_forward(inputs, lstmNet):
    fw_cache = []
    # lstmNet.d[0] is the hidden_size
    h_prev = np.zeros((lstmNet.d[0], 1))
    C_prev = np.zeros((lstmNet.d[0], 1))
    for x in inputs:
        cache = {'C': C_prev, 'h': h_prev}
        # Concatenate input and hidden state
        cache['z'] = np.row_stack((cache['h'], x))
        # Calculate forget gate
        cache['f'] = cm.sigmoid(np.dot(lstmNet.p['W_f'], cache['z']) + lstmNet.p['b_f'])
        # Calculate input gate
        cache['i'] = cm.sigmoid(np.dot(lstmNet.p['W_i'], cache['z']) + lstmNet.p['b_i'])
        # Calculate candidate
        cache['g'] = cm.tanh(np.dot(lstmNet.p['W_g'], cache['z']) + lstmNet.p['b_g'])
        # Calculate memory state
        C_prev = cache['f'] * cache['C'] + cache['i'] * cache['g']
        # Calculate output gate
        cache['o'] = cm.sigmoid(np.dot(lstmNet.p['W_o'], cache['z']) + lstmNet.p['b_o'])
        # Calculate hidden state (from the updated cell state, h_t = o_t * tanh(C_t))
        h_prev = cache['o'] * cm.tanh(C_prev)
        # Calculate logits
        cache['v'] = np.dot(lstmNet.p['W_v'], h_prev) + lstmNet.p['b_v']
        fw_cache.append(copy.deepcopy(cache))
    # Calculate softmax
    outputs = cm.softmax(cache['v'], rnn=True)
    return outputs, fw_cache
My parameters are:
def init_params(lstmNet):
    hidden_size = lstmNet.d[0]
    vocab_size = lstmNet.d[1]
    z_size = lstmNet.d[2]
    output_size = lstmNet.d[3]
    # Weight matrix (forget gate)
    lstmNet.p['W_f'] = np.random.randn(hidden_size, z_size)
    # Bias for forget gate
    lstmNet.p['b_f'] = np.zeros((hidden_size, 1))
    # Weight matrix (input gate)
    lstmNet.p['W_i'] = np.random.randn(hidden_size, z_size)
    # Bias for input gate
    lstmNet.p['b_i'] = np.zeros((hidden_size, 1))
    # Weight matrix (candidate)
    lstmNet.p['W_g'] = np.random.randn(hidden_size, z_size)
    # Bias for candidate
    lstmNet.p['b_g'] = np.zeros((hidden_size, 1))
    # Weight matrix (output gate) !!! I expect this to change dimensions
    lstmNet.p['W_o'] = np.random.randn(hidden_size, z_size)
    lstmNet.p['b_o'] = np.zeros((hidden_size, 1))
    # Weight matrix relating the hidden state to the output !!! I expect this to change dimensions
    lstmNet.p['W_v'] = np.random.randn(vocab_size, hidden_size)
    lstmNet.p['b_v'] = np.zeros((vocab_size, 1))
Any help in going from this LSTM many-to-many model to a many-to-one model, with output only on the last cell/input, would be much appreciated.
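Not a full answer, but structurally the change mirrors the RNN case above: run the recurrence over the whole sequence and compute the logits once, from the final hidden state, so W_v can become (output_size, hidden_size) instead of (vocab_size, hidden_size). A sketch reusing the question's names (cm, lstmNet, copy, np as above):

def lstm_forward_many_to_one(inputs, lstmNet):
    fw_cache = []
    h_prev = np.zeros((lstmNet.d[0], 1))
    C_prev = np.zeros((lstmNet.d[0], 1))
    for x in inputs:
        cache = {'C': C_prev, 'h': h_prev}
        cache['z'] = np.row_stack((cache['h'], x))
        cache['f'] = cm.sigmoid(np.dot(lstmNet.p['W_f'], cache['z']) + lstmNet.p['b_f'])
        cache['i'] = cm.sigmoid(np.dot(lstmNet.p['W_i'], cache['z']) + lstmNet.p['b_i'])
        cache['g'] = cm.tanh(np.dot(lstmNet.p['W_g'], cache['z']) + lstmNet.p['b_g'])
        C_prev = cache['f'] * cache['C'] + cache['i'] * cache['g']
        cache['o'] = cm.sigmoid(np.dot(lstmNet.p['W_o'], cache['z']) + lstmNet.p['b_o'])
        h_prev = cache['o'] * cm.tanh(C_prev)
        fw_cache.append(copy.deepcopy(cache))
    # logits computed once, from the final hidden state only
    v = np.dot(lstmNet.p['W_v'], h_prev) + lstmNet.p['b_v']
    return cm.softmax(v, rnn=True), fw_cache

During BPTT, dW_v and db_v are then computed only at the last time step, and every earlier step receives gradient solely through the recurrent dh and dC terms carried backward.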
My data is 4123 rows of inputs and outputs of an XOR gate.
I want to write a neural network with three input layer neurons (the third one is the bias), a hidden layer, and an output layer.
Here's my implementation:
import numpy as np


class TwoLayerNetwork:
    def __init__(self, input_size, hidden_size, output_size):
        """
        input_size: the number of neurons in the input layer
        hidden_size: the number of neurons in the hidden layer
        output_size: the number of neurons in the output layer
        """
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.params = {}
        self.params['W1'] = 0.01 * np.random.randn(input_size, hidden_size)  # FxH
        self.params['b1'] = np.zeros((hidden_size, 1))  # Hx1
        self.params['W2'] = 0.01 * np.random.randn(hidden_size, output_size)  # HxO
        self.params['b2'] = np.zeros((output_size, 1))  # Ox1
        self.optimal_weights = []
        self.errors = {}

    def train(self, X, y, epochs):
        """
        X: input data matrix, NxF
        y: output vector, Nx1
        returns:
            the optimal set of parameters that best minimize the loss function
        """
        W1, b1 = self.params['W1'], self.params['b1']
        W2, b2 = self.params['W2'], self.params['b2']
        for iteration in range(epochs):
            forward_to_hidden = X.dot(W1)  # NxH
            activate_hidden = sigmoid(forward_to_hidden)  # NxH
            forward_to_output = activate_hidden.dot(W2)  # NxO
            output = sigmoid(forward_to_output)  # NxO
            self.errors[iteration] = np.mean(0.5 * (y**2 - output**2))
            output_error = y - output  # NxO
            output_layer_delta = output_error * sigmoidPrime(output)  # NxO
            hidden_layer_error = output_layer_delta.dot(W2.T)  # NxO . OxH = NxH
            hidden_layer_delta = hidden_layer_error * sigmoidPrime(activate_hidden)  # NxH
            W1_update = X.T.dot(hidden_layer_delta)  # FxN . NxH = FxH
            W2_update = activate_hidden.T.dot(output_layer_delta)  # HxN . NxO = HxO
            W1 += W1_update
            W2 += W2_update
        self.optimal_weights.append(W1)
        self.optimal_weights.append(W2)

    def predict(self, X):
        W1, W2 = self.optimal_weights[0], self.optimal_weights[1]
        forward = sigmoid(X.dot(W1))  # NxH
        forward = forward.dot(W2)  # NxO
        forward = sigmoid(forward)  # NxO
        return forward


def sigmoid(x):
    return 1 / (1 + np.exp(-x))


def sigmoidPrime(x):
    return sigmoid(x) * (1 - sigmoid(x))
I realize that's very vanilla, but that's intentional. I want to understand the most basic form of NN architecture first.
Now, my problem is that my error plot is confusing:
the neural network simply stops learning.
My second problem is that my weights blow up, reaching values around -10000, which causes overflow because of the exp in the sigmoid function.
My third problem is that my output vector only ever outputs 0.5 instead of 1 or 0.
import pandas as pd
import matplotlib.pyplot as plt

data = pd.read_csv('xor.csv').sample(frac=1)
X = data.iloc[:, [0, 1]]  # 1st and 2nd cols are the input
X = np.hstack((X, np.ones((data.shape[0], 1))))  # adding the bias 1's
y = data.iloc[:, 2][:, np.newaxis]  # 3rd col is the output

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

nn.train(X_train, y_train, 100)
plt.plot(range(100), [i for i in nn.errors.values()])
plt.show()
The link for the dataset
So, if I read your code correctly, your network is specified correctly but is missing a few key points in order to learn XOR by backpropagation.
The fun part is, your error specification is weird. For visualization I changed it into
self.errors[iteration] = np.mean(0.5 * (y - output)**2)
With the x-axis denoting epoch and the y-axis denoting error:
So what happens is that backpropagation hits a plateau, then rapidly blows up the weights. To slow down the blow-up of the weights and give the network time to re-evaluate its mistakes, you can add a so-called "learning rate" != 1. This addresses one of the pitfalls.
Another one is shown in the second figure: you hit oscillatory behaviour in the updates, and the program will never reach its optimum state. To address this, you can deliberately introduce an imperfection in the form of a "momentum".
Additionally, the initial conditions matter for the speed at which you converge, so you need enough epochs to overcome the local plateaux:
Last, but certainly not least, I did find an error in your specification, but all of the above still applies.
In your layer deltas you do sigmoidPrime(sigmoid(forward)), which is one call to sigmoid too many.
# initialization, before the epoch loop:
last_update = np.zeros((X.shape[1], W1.shape[1]))
last_update2 = np.zeros((W1.shape[1], W2.shape[1]))

# deltas taken on the pre-activations, not on the sigmoid outputs:
output_layer_delta = output_error * sigmoidPrime(forward_to_output)  # NxO
hidden_layer_delta = hidden_layer_error * sigmoidPrime(forward_to_hidden)  # NxH

# weight updates with a learning rate and momentum:
W1 += 0.001 * (W1_update + last_update * 0.5)
W2 += 0.001 * (W2_update + last_update2 * 0.5)
# W1 = 0.001*W1_update
# W2 = 0.001*W2_update
last_update = W1_update.copy()
last_update2 = W2_update.copy()
Did the final trick for me. Now please verify and appease this grumbling man who spent the better part of a night and day on figuring it out. ;)
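For completeness, a training call along those lines (the hidden size of 4 and the epoch count are my choice; the question never shows the instantiation):

nn = TwoLayerNetwork(input_size=3, hidden_size=4, output_size=1)
nn.train(X_train, y_train, 20000)   # enough epochs to get past the plateaux
plt.plot(range(20000), list(nn.errors.values()))
plt.show()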
For the past few days I have been debugging my NN, but I can't find the issue.
I've created a completely raw implementation of a multi-layer perceptron for identifying MNIST dataset images.
The network seems to learn, because after the training cycle the test data accuracy is above 94%. My problem is with the loss function: it starts increasing after a while, when test/val accuracy reaches ~76%.
Can someone please check my forward/backprop math and tell me if my loss function is properly implemented, or suggest what might be wrong?
NN structure:
input layer: 784 nodes (1 node per pixel)
hidden layer 1: 300 nodes
hidden layer 2: 75 nodes
output layer: 10 nodes
NN activation functions:
input layer -> hidden layer 1: ReLU
hidden layer 1 -> hidden layer 2: ReLU
hidden layer 2 -> output layer: Softmax
NN loss function:
Categorical Cross-Entropy
Full CLEAN code available here as a Jupyter Notebook.
Neural Network forward/backward pass:
def train(self, features, targets):
    n_records = features.shape[0]
    # placeholders for weight and bias change values
    delta_weights_i_h1 = np.zeros(self.weights_i_to_h1.shape)
    delta_weights_h1_h2 = np.zeros(self.weights_h1_to_h2.shape)
    delta_weights_h2_o = np.zeros(self.weights_h2_to_o.shape)
    delta_bias_i_h1 = np.zeros(self.bias_i_to_h1.shape)
    delta_bias_h1_h2 = np.zeros(self.bias_h1_to_h2.shape)
    delta_bias_h2_o = np.zeros(self.bias_h2_to_o.shape)
    for X, y in zip(features, targets):
        ### forward pass
        # input to hidden 1
        inputs_to_h1_layer = np.dot(X, self.weights_i_to_h1) + self.bias_i_to_h1
        inputs_to_h1_layer_activated = self.activation_ReLU(inputs_to_h1_layer)
        # hidden 1 to hidden 2
        h1_to_h2_layer = np.dot(inputs_to_h1_layer_activated, self.weights_h1_to_h2) + self.bias_h1_to_h2
        h1_to_h2_layer_activated = self.activation_ReLU(h1_to_h2_layer)
        # hidden 2 to output
        h2_to_output_layer = np.dot(h1_to_h2_layer_activated, self.weights_h2_to_o) + self.bias_h2_to_o
        h2_to_output_layer_activated = self.softmax(h2_to_output_layer)
        # output
        final_outputs = h2_to_output_layer_activated
        ### backpropagation
        # output to hidden 2
        error = y - final_outputs
        output_error_term = error.dot(self.dsoftmax(h2_to_output_layer_activated))
        h2_error = np.dot(output_error_term, self.weights_h2_to_o.T)
        h2_error_term = h2_error * self.activation_dReLU(h1_to_h2_layer_activated)
        # hidden 2 to hidden 1
        h1_error = np.dot(h2_error_term, self.weights_h1_to_h2.T)
        h1_error_term = h1_error * self.activation_dReLU(inputs_to_h1_layer_activated)
        # weight & bias step (input to hidden)
        delta_weights_i_h1 += h1_error_term * X[:, None]
        delta_bias_i_h1 = np.sum(h1_error_term, axis=0)
        # weight & bias step (hidden 1 to hidden 2)
        delta_weights_h1_h2 += h2_error_term * inputs_to_h1_layer_activated[:, None]
        delta_bias_h1_h2 = np.sum(h2_error_term, axis=0)
        # weight & bias step (hidden 2 to output)
        delta_weights_h2_o += output_error_term * h1_to_h2_layer_activated[:, None]
        delta_bias_h2_o = np.sum(output_error_term, axis=0)
    # update the weights and biases
    self.weights_i_to_h1 += self.lr * delta_weights_i_h1 / n_records
    self.weights_h1_to_h2 += self.lr * delta_weights_h1_h2 / n_records
    self.weights_h2_to_o += self.lr * delta_weights_h2_o / n_records
    self.bias_i_to_h1 += self.lr * delta_bias_i_h1 / n_records
    self.bias_h1_to_h2 += self.lr * delta_bias_h1_h2 / n_records
    self.bias_h2_to_o += self.lr * delta_bias_h2_o / n_records
Activation function implementation:
def activation_ReLU(self, x):
    return x * (x > 0)

def activation_dReLU(self, x):
    return 1. * (x > 0)

def softmax(self, x):
    z = x - np.max(x)
    return np.exp(z) / np.sum(np.exp(z))

def dsoftmax(self, x):
    # TODO: vectorise math
    vec_len = len(x)
    J = np.zeros((vec_len, vec_len))
    for i in range(vec_len):
        for j in range(vec_len):
            if i == j:
                J[i][j] = x[i] * (1 - x[j])
            else:
                J[i][j] = -x[i] * x[j]
    return J
Loss function implementation:
def categorical_cross_entropy(pred, target):
    return (1 / len(pred)) * -np.sum(target * np.log(pred))
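Worth noting: when softmax feeds categorical cross-entropy, the gradient of the loss with respect to the logits collapses analytically to pred - target (up to the 1/N scaling above), so the full dsoftmax Jacobian is never needed; a sketch:

def softmax_cce_dlogits(pred, target):
    # d(CCE)/d(logits) when pred = softmax(logits): the softmax Jacobian
    # contracted with the cross-entropy gradient reduces to (pred - target)
    return pred - target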
I managed to find the problem.
The neural network is large, so I couldn't fit everything into this question. But if you check my Jupyter Notebook, you can see the implementation of my Softmax activation function and how I use it in the train cycle.
The loss miscalculation was caused by the fact that my Softmax implementation only worked for ndarrays with dim == 1.
During the training step I only passed 1-dimensional ndarrays to the activation function, so the NN learned well, but my run() function returned wrong predictions because I fed it the whole test set at once rather than a single row at a time in a for loop. Because of that, it calculated Softmax "matrix-wise" rather than "row-wise".
This is very fast fix for it:
def softmax(self, x):
    # TODO: vectorise math to speed up computation
    if x.ndim == 1:
        z = x - np.max(x)
        return np.exp(z) / np.sum(np.exp(z))
    else:
        softmax_result = []
        for row in x:
            z = row - np.max(row)
            row_softmax_result = np.exp(z) / np.sum(np.exp(z))
            softmax_result.append(row_softmax_result)
        return np.array(softmax_result)
This code should still be vectorised to avoid the loop and the branch; as written it is ugly and wastes resources.
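For completeness, a vectorised version along those lines (same row-wise behaviour, no loop or branch; a sketch):

def softmax(self, x):
    # row-wise max subtraction for numerical stability; axis=-1 handles
    # both 1-D vectors and 2-D batches without branching
    z = x - np.max(x, axis=-1, keepdims=True)
    e = np.exp(z)
    return e / np.sum(e, axis=-1, keepdims=True)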