I am a total noob and this is first thing in ML im trying to do. I just want to run the code.
I know feedforward is correct and my errors should be correct, but I get incorrect results.
Please help
import numpy as np
inputs = np.array([
[[0],[0]],
[[1],[0]],
[[0],[1]],
[[1],[1]]
])
expected_output = np.array([
[[0]],
[[1]],
[[1]],
[[0]]
])
epochs = 1000
lr = 0.01
hidden_weights = np.array([
[0.2, 0.3],
[0.4, 0.5]
])
hidden_bias = np.array([[0.3], [0.6]])
output_weights = np.array([[0.6, 0.7]])
output_bias = np.array([[0.5]])
def sigmoid(z):
return 1/(1+np.exp(-z))
def sigmoid_derivative(z):
return np.multiply(sigmoid(z), sigmoid(1.0-z))
for _ in range(epochs):
for index, input in enumerate(inputs):
hidden_layer_activation = np.dot(hidden_weights, input)
hidden_layer_activation += hidden_bias
hidden_layer_output = sigmoid(hidden_layer_activation)
output_layer_activation = np.dot(output_weights, hidden_layer_output)
output_layer_activation += output_bias
predicted_output = sigmoid(output_layer_activation)
#Backpropagation
output_errors = expected_output[index] - predicted_output
hidden_errors = output_weights.T.dot(output_errors)
d_predicted_output = output_errors * sigmoid_derivative(predicted_output)
d_hidden_layer = hidden_errors * sigmoid_derivative(hidden_layer_output)
# I am almost certain the problem is in the next 2 linees
output_weights += d_predicted_output.dot(hidden_layer_output.T) * lr
hidden_weights += d_hidden_layer.dot(input.T) * lr
output_bias += np.sum(d_predicted_output,axis=0,keepdims=True) * lr
hidden_bias += np.sum(d_hidden_layer,axis=0,keepdims=True) * lr
# NOW THE TESTING,I pass 2 input neurons. One with value 0 and value 1
test = np.array([
[[0], [1]]
])
hidden_layer_activation = np.dot(hidden_weights, test[0])
hidden_layer_activation += hidden_bias
hidden_layer_output = sigmoid(hidden_layer_activation)
output_layer_activation = np.dot(output_weights, hidden_layer_output)
output_layer_activation += output_bias
predicted_output = sigmoid(output_layer_activation)
print(predicted_output) # I usually get somewhere around [[0.5]], and the ideal answer should be [[1]] since it is a XOR gate
Result: [[0.5]] for inputs 0 and 1
Wanted: [[1]] for input 0 and 1
That's all the code... thank you in advance
I am guessing problem is somewhere where I update the weight and bias. I was doing path for forward propagation and i got correct results.
The problem must be transposing and taking the dot product in the backpropogation step.
My code on XOR:
import numpy as np
def sigmoid(z):
return 1/(1+np.exp(-z))
def sigmoid_derivative(z):
return np.multiply(sigmoid(z), sigmoid(1.0-z))
def init_w(epsilon):
# Input nodes
theta1=2*np.random.random([2,3])*epsilon - epsilon
# Output nodes
theta2=2*np.random.random([1,3])*epsilon - epsilon
theta1,theta2=np.mat(theta1),np.mat(theta2)
return theta1,theta2
def fit(X, Y, theta1,theta2, predict=False, x=None):
grad1,grad2=np.mat(np.zeros(np.shape(theta1))),np.mat(np.zeros(np.shape(theta2)))
for i in range(len(X)):
x = x if predict else X[i]
y = Y[0,i]
# forward propagate
a = x
a1=np.mat(np.append(1, a)).T
z2=theta1*a1
a2=sigmoid(z2)
a2=np.mat(np.append(1, a2)).T
z3=theta2*a2
a3=sigmoid(z3)
if predict: return a3
# back propagate
delta3 = a3 - y.T
grad2 += delta3 * a2.T
delta2 = np.multiply(theta2.T*delta3, sigmoid_derivative(a2))
grad1 += (delta2[1:] * a1.T)
return grad1,grad2
def predict(x):
return fit(X, Y, theta1,theta2, True, x)
X = np.mat([[0,0],
[0,1],
[1,0],
[1,1]])
Y = np.mat([0,1,1,0])
epochs = 10000
alpha = 0.85
epsilon = 1
theta1,theta2 = init_w(epsilon)
for i in range(epochs):
g1,g2 = fit(X, Y, theta1,theta2)
theta1 -= alpha * g1
theta2 -= alpha * g2
for i in range(len(X)):
x = X[i]
guess = predict(x)
print(x, ":", guess)
Output:
[[0 0]] : [[ 0.00233143]]
[[0 1]] : [[ 0.99775431]]
[[1 0]] : [[ 0.9977526]]
[[1 1]] : [[ 0.00233134]]
Edit:
Your array format is too complex so I will suggest you to write down the shapes after each steps so that you can debug easily.
Update:
import numpy as np
#np.random.seed(0)
def sigmoid (x):
return 1/(1 + np.exp(-x))
def sigmoid_derivative(x):
return x * (1 - x)
#Input datasets
inputs = np.array([[0,0],[0,1],[1,0],[1,1]])
expected_output = np.array([[0],[1],[1],[0]])
epochs = 10000
lr = 0.1
inputLayerNeurons, hiddenLayerNeurons, outputLayerNeurons = 2,2,1
#Random weights and bias initialization
#hidden_weights = np.random.uniform(size=(inputLayerNeurons,hiddenLayerNeurons))
#hidden_bias =np.random.uniform(size=(1,hiddenLayerNeurons))
#output_weights = np.random.uniform(size=(hiddenLayerNeurons,outputLayerNeurons))
#output_bias = np.random.uniform(size=(1,outputLayerNeurons))
hidden_weights = np.array([
[0.2, 0.3],
[0.4, 0.5]
])
hidden_bias = np.array([[0.3, 0.6]])
output_weights = np.array([[0.6], [0.7]])
output_bias = np.array([[0.5]])
print("Initial hidden weights: ",end='')
print(*hidden_weights)
print("Initial hidden biases: ",end='')
print(*hidden_bias)
print("Initial output weights: ",end='')
print(*output_weights)
print("Initial output biases: ",end='')
print(*output_bias)
#Training algorithm
for _ in range(epochs):
#Forward Propagation
hidden_layer_activation = np.dot(inputs,hidden_weights)
hidden_layer_activation += hidden_bias
hidden_layer_output = sigmoid(hidden_layer_activation)
output_layer_activation = np.dot(hidden_layer_output,output_weights)
output_layer_activation += output_bias
predicted_output = sigmoid(output_layer_activation)
#Backpropagation
error = expected_output - predicted_output
d_predicted_output = error * sigmoid_derivative(predicted_output)
error_hidden_layer = d_predicted_output.dot(output_weights.T)
d_hidden_layer = error_hidden_layer * sigmoid_derivative(hidden_layer_output)
#Updating Weights and Biases
output_weights += hidden_layer_output.T.dot(d_predicted_output) * lr
output_bias += np.sum(d_predicted_output,axis=0,keepdims=True) * lr
hidden_weights += inputs.T.dot(d_hidden_layer) * lr
hidden_bias += np.sum(d_hidden_layer,axis=0,keepdims=True) * lr
print("Final hidden weights: ",end='')
print(*hidden_weights)
print("Final hidden bias: ",end='')
print(*hidden_bias)
print("Final output weights: ",end='')
print(*output_weights)
print("Final output bias: ",end='')
print(*output_bias)
print("\nOutput from neural network after 10,000 epochs: ",end='')
print(*predicted_output)
test = np.array([
[0, 1]
])
hidden_layer_activation = np.dot(test, hidden_weights)
hidden_layer_activation += hidden_bias
hidden_layer_output = sigmoid(hidden_layer_activation)
output_layer_activation = np.dot( hidden_layer_output, output_weights)
output_layer_activation += output_bias
predicted_output = sigmoid(output_layer_activation)
print(predicted_output)
Final hidden weights: [3.59882402 5.68799788] [3.60260363 5.70714658]
Final hidden bias: [-5.50709978 -2.3415549 ]
Final output weights: [-7.85976304] [7.26409199]
Final output bias: [-3.26766959]
Output from neural network after 10,000 epochs: [0.06525552] [0.93906737] [0.93899963] [0.06635071]
[[0.93907536]]
here is the result:
[[0.93907536]]
Related
Why does:
with torch.no_grad():
w = w - lr*w.grad
print(w)
results in:
tensor(0.9871)
and
with torch.no_grad():
w -= lr*w.grad
print(w)
results in:
tensor(0.9871, requires_grad=True)
Aren't both operations the same?
Here is some test code:
def test_stack():
np.random.seed(0)
n = 50
feat1 = np.random.randn(n, 1)
feat2 = np.random.randn(n, 1)
X = torch.tensor(feat1).view(-1, 1)
Y = torch.tensor(feat2).view(-1, 1)
w = torch.tensor(1.0, requires_grad=True)
epochs = 1
lr = 0.001
for epoch in range(epochs):
for i in range(len(X)):
y_pred = w*X[i]
loss = (y_pred - Y[i])**2
loss.backward()
with torch.no_grad():
#w = w - lr*w.grad # DOESN'T WORK!!!!
#print(w); return
w -= lr*w.grad
print(w); return
w.grad.zero_()
Remove the comments and you'll se the requires_grad disappearing. Could this be a bug?
I'm try to make a simple linear model to predict parameters of formula.
y = 3*x1 + x2 - 2*x3
Unfortunately, there are some problem when i try to compute loss.
def answer(x):
return 3 * x[:,0] + x[:,1] - 2 * x[:,2]
def loss_f(x):
y = answer(x)
y_hat = model(x)
loss = ((y - y_hat).pow(2)).sum() / x.size(0)
return loss
When i set batch_size = 3, the size of each result is different
x = torch.randn(3,3)
answer(x)
tensor([ 2.0201, -3.8354, 2.0059])
model(x)
tensor([[ 0.2085],
[-0.0670],
[-1.3635]], grad_fn=<ThAddmmBackward>)
answer(x.data).size()
torch.Size([3])
model(x.data).size()
torch.Size([3, 1])
I think the broadcast applied automatically.
loss = ((y - y_hat).pow(2)).sum() / x.size(0)
How can i make same size of two tensors? Thanks
This is my code
import torch
import torch.nn as nn
import torch.optim as optim
class model(nn.Module):
def __init__(self, input_size, output_size):
super(model, self).__init__()
self.linear = nn.Linear(input_size, output_size)
def forward(self, x):
y = self.linear(x)
return y
model = model(3,1)
optimizer = optim.SGD(model.parameters(), lr = 0.001, momentum=0.1)
print('Parameters : ')
for p in model.parameters():
print(p)
print('')
print('Optimizer : ')
print(optimizer)
def generate_data(batch_size):
x = torch.randn(batch_size, 3)
return x
def answer(x):
return 3 * x[:,0] + x[:,1] - 2 * x[:,2]
def loss_f(x):
y = answer(x)
y_hat = model(x)
loss = ((y - y_hat).pow(2)).sum() / x.size(0)
return loss
x = torch.randn(3,3)
print(x)
x = torch.FloatTensor(x)
batch_size = 3
epoch_n = 1000
iter_n = 100
for epoch in range(epoch_n):
avg_loss = 0
for i in range(iter_n):
x = torch.randn(batch_size, 3)
optimizer.zero_grad()
loss = loss_f(x.data)
loss.backward()
optimizer.step()
avg_loss += loss
avg_loss = avg_loss / iter_n
x_valid = torch.FloatTensor([[1,2,3]])
y_valid = answer(x_valid)
model.eval()
y_hat = model(x_valid)
model.train()
print(avg_loss, y_valid.data[0], y_hat.data[0])
if avg_loss < 0.001:
break
You can use Tensor.view
https://pytorch.org/docs/stable/tensors.html#torch.Tensor.view
So something like
answer(x.data).view(-1, 1)
should do the trick.
I have implemented a simple Neural Net with just a single sigmoid hidden layer, with the choice of a sigmoid or softmax output layer and squared error or cross entropy loss function, respectively. After much research on the softmax activation function, the cross entropy loss, and their derivatives (and with following this blog) I believe that my implementation seems correct.
When attempting to learn the simple XOR function, the NN with the sigmoid output learns to a very small loss very quickly when using single binary outputs of 0 and 1. However, when changing the labels to one-hot encodings of [1, 0] = 0 and [0, 1] = 1, the softmax implementation does not work. The loss consistently increases as the network's outputs converge to exactly [0, 1] for the two outputs on every input, yet the labels of the data set is perfectly balanced between [0, 1] and [1, 0].
My code is below, where the choice of using sigmoid or softmax at the output layer can be chosen by uncommenting the necessary two lines near the bottom of the code. I cannot figure out why the softmax implementation is not working.
import numpy as np
class MLP:
def __init__(self, numInputs, numHidden, numOutputs, activation):
self.numInputs = numInputs
self.numHidden = numHidden
self.numOutputs = numOutputs
self.activation = activation.upper()
self.IH_weights = np.random.rand(numInputs, numHidden) # Input -> Hidden
self.HO_weights = np.random.rand(numHidden, numOutputs) # Hidden -> Output
self.IH_bias = np.zeros((1, numHidden))
self.HO_bias = np.zeros((1, numOutputs))
# Gradients corresponding to weight matrices computed during backprop
self.IH_w_gradients = np.zeros_like(self.IH_weights)
self.HO_w_gradients = np.zeros_like(self.HO_weights)
# Gradients corresponding to biases computed during backprop
self.IH_b_gradients = np.zeros_like(self.IH_bias)
self.HO_b_gradients = np.zeros_like(self.HO_bias)
# Input, hidden and output layer neuron values
self.I = np.zeros(numInputs) # Inputs
self.L = np.zeros(numOutputs) # Labels
self.H = np.zeros(numHidden) # Hidden
self.O = np.zeros(numOutputs) # Output
# ##########################################################################
# ACIVATION FUNCTIONS
# ##########################################################################
def sigmoid(self, x, derivative=False):
if derivative:
return x * (1 - x)
return 1 / (1 + np.exp(-x))
def softmax(self, prediction, label=None, derivative=False):
if derivative:
return prediction - label
return np.exp(prediction) / np.sum(np.exp(prediction))
# ##########################################################################
# LOSS FUNCTIONS
# ##########################################################################
def squaredError(self, prediction, label, derivative=False):
if derivative:
return (-2 * prediction) + (2 * label)
return (prediction - label) ** 2
def crossEntropy(self, prediction, label, derivative=False):
if derivative:
return [-(y / x) for x, y in zip(prediction, label)] # NOT NEEDED ###############################
return - np.sum([y * np.log(x) for x, y in zip(prediction, label)])
# ##########################################################################
def forward(self, inputs):
self.I = np.array(inputs).reshape(1, self.numInputs) # [numInputs, ] -> [1, numInputs]
self.H = self.I.dot(self.IH_weights) + self.IH_bias
self.H = self.sigmoid(self.H)
self.O = self.H.dot(self.HO_weights) + self.HO_bias
if self.activation == 'SIGMOID':
self.O = self.sigmoid(self.O)
elif self.activation == 'SOFTMAX':
self.O = self.softmax(self.O) + 1e-10 # allows for log(0)
return self.O
def backward(self, labels):
self.L = np.array(labels).reshape(1, self.numOutputs) # [numOutputs, ] -> [1, numOutputs]
if self.activation == 'SIGMOID':
self.O_error = self.squaredError(self.O, self.L)
self.O_delta = self.squaredError(self.O, self.L, derivative=True) * self.sigmoid(self.O, derivative=True)
elif self.activation == 'SOFTMAX':
self.O_error = self.crossEntropy(self.O, self.L)
self.O_delta = self.softmax(self.O, self.L, derivative=True)
self.H_error = self.O_delta.dot(self.HO_weights.T)
self.H_delta = self.H_error * self.sigmoid(self.H, derivative=True)
self.IH_w_gradients += self.I.T.dot(self.H_delta)
self.HO_w_gradients += self.H.T.dot(self.O_delta)
self.IH_b_gradients += self.H_delta
self.HO_b_gradients += self.O_delta
return self.O_error
def updateWeights(self, learningRate):
self.IH_weights += learningRate * self.IH_w_gradients
self.HO_weights += learningRate * self.HO_w_gradients
self.IH_bias += learningRate * self.IH_b_gradients
self.HO_bias += learningRate * self.HO_b_gradients
self.IH_w_gradients = np.zeros_like(self.IH_weights)
self.HO_w_gradients = np.zeros_like(self.HO_weights)
self.IH_b_gradients = np.zeros_like(self.IH_bias)
self.HO_b_gradients = np.zeros_like(self.HO_bias)
sigmoidData = [
[[0, 0], 0],
[[0, 1], 1],
[[1, 0], 1],
[[1, 1], 0]
]
softmaxData = [
[[0, 0], [1, 0]],
[[0, 1], [0, 1]],
[[1, 0], [0, 1]],
[[1, 1], [1, 0]]
]
sigmoidMLP = MLP(2, 10, 1, 'SIGMOID')
softmaxMLP = MLP(2, 10, 2, 'SOFTMAX')
# SIGMOID #######################
# data = sigmoidData
# mlp = sigmoidMLP
# ###############################
# SOFTMAX #######################
data = softmaxData
mlp = softmaxMLP
# ###############################
numEpochs = 5000
for epoch in range(numEpochs):
losses = []
for i in range(len(data)):
print(mlp.forward(data[i][0])) # Print outputs
# mlp.forward(data[i][0]) # Don't print outputs
loss = mlp.backward(data[i][1])
losses.append(loss)
mlp.updateWeights(0.001)
# if epoch % 1000 == 0 or epoch == numEpochs - 1: # Print loss every 1000 epochs
print(np.mean(losses)) # Print loss every epoch
Contrary to all the information online, simply changing the derivative of the softmax cross entropy from prediction - label to label - prediction solved the problem. Perhaps I have something else backwards somewhere since every source I have come across has it as prediction - label.
I'm trying to adapt the example from http://cs231n.github.io/neural-networks-case-study/#together to make a neural network for a numeric target variable so it will be a neural network with regression. I surely do something wrong in the derivation part because my loss function in insanely growing.
Here is the code:
h = neurons # size of hidden layer
D = X[0].size
K = 1
W = 0.01 * np.random.randn(D,h)
b = np.zeros((1,h))
W2 = 0.01 * np.random.randn(h,K)
b2 = np.zeros((1,K))
# some hyperparameters
step_size = 1 #learning rate
reg = 0.001 # regularization strength
loss_vec = []
# gradient descent loop
num_examples = X.shape[0]
for i in xrange(1000):
# evaluate class scores, [N x K]
hidden_layer = np.maximum(0, np.dot(X, W) + b) # note, ReLU activation
scores = np.dot(hidden_layer, W2) + b2
loss = np.power(y - scores,2)
#if i % 50 == 0:
loss_vec.append(np.mean(np.abs(loss)))
print "iteration %d: loss %f" % (i, np.mean(np.abs(loss)))
# compute the gradient on scores
dscores = 2*(y-scores) # here I am not sure is correct
# backpropate the gradient to the parameters
# first backprop into parameters W2 and b2
dW2 = np.dot(hidden_layer.T, dscores)
db2 = np.sum(dscores, axis=0, keepdims=True)
# next backprop into hidden layer
dhidden = np.dot(dscores, W2.T)
# backprop the ReLU non-linearity
dhidden[hidden_layer <= 0] = 0
# finally into W,b
dW = np.dot(X.T, dhidden)
db = np.sum(dhidden, axis=0, keepdims=True)
# add regularization gradient contribution
dW2 += reg * W2
dW += reg * W
# perform a parameter update
W += -step_size * dW
b += -step_size * db
W2 += -step_size * dW2
b2 += -step_size * db2
Code output:
iteration 0: loss 5786.021888
iteration 1: loss 24248543152533318464172949461134213120.000000
iteration 2: loss 388137710832824223006297769344993376570435619092
I've noticed several important mistakes:
the learning rate is too big, no chance to learn anything. I used 0.0005, but it depends on the data, size of hidden layer, etc
the loss derivative dscores should be flipped: scores - y
the loss also ignores regularization (probably dropped for debugging purposes)
Complete code below:
import numpy as np
# Generate data: learn the sum x[0] + x[1]
np.random.seed(0)
N = 100
D = 2
X_test = np.zeros([N, D])
y = np.zeros([N, 1])
for i in range(N):
X_test[i, :] = np.random.random_integers(0, 4, size=2)
y[i] = X_test[i, 0] + X_test[i, 1]
# Network params
H = 10
W = 0.01 * np.random.randn(D, H)
b = np.zeros([1, H])
W2 = 0.01 * np.random.randn(H, 1)
b2 = np.zeros([1, 1])
# Hyper params
step_size = 0.0005
reg = 0.001
for i in xrange(100):
hidden_layer = np.maximum(0, np.dot(X_test, W) + b)
scores = np.dot(hidden_layer, W2) + b2
reg_loss = 0.5 * reg * np.sum(W * W) + 0.5 * reg * np.sum(W2 * W2)
loss = np.mean(np.power(y - scores, 2)) + reg_loss
print "iteration %d: loss %f" % (i, loss)
dscores = (scores - y)
dW2 = np.dot(hidden_layer.T, dscores)
db2 = np.sum(dscores, axis=0, keepdims=True)
dhidden = np.dot(dscores, W2.T)
dhidden[hidden_layer <= 0] = 0
dW = np.dot(X_test.T, dhidden)
db = np.sum(dhidden, axis=0, keepdims=True)
dW2 += reg * W2
dW += reg * W
W += -step_size * dW
b += -step_size * db
W2 += -step_size * dW2
b2 += -step_size * db2
# Test
X_test = np.array([[1, 0], [0, 1], [2, 3], [2, 2]]).reshape([-1, 2])
y_test = np.array([1, 1, 5, 4]).reshape([-1, 1])
hidden_layer = np.maximum(0, np.dot(X_test, W) + b)
scores = np.dot(hidden_layer, W2) + b2
print 'Average test error = %f' % np.mean((scores - y_test).T)
I implemented bias units for my neural network with gradient descent. But I'm not 100% sure If I've implemented it the right way. Would be glade if you can quickly look through my code. Only the parts with
if bias:
are important.
And my second question:
Shouldn't the derivate of the softmax function be 1-x, because x is the output of the softmax function?
I tried my net with 1-x but its performance was worse.
Every help is appreciated.
Thanks in advance.
import numpy as np
import pickle
import time
import math
class FeedForwardNetwork():
def __init__(self, input_dim, hidden_dim, output_dim, dropout=False, dropout_prop=0.5, bias=False):
np.random.seed(1)
self.input_layer = np.array([])
self.hidden_layer = np.array([])
self.output_layer = np.array([])
self.hidden_dim = hidden_dim
self.dropout = dropout
self.dropout_prop = dropout_prop
self.bias = bias
r_input_hidden = math.sqrt(6 / (input_dim + hidden_dim))
r_hidden_output = math.sqrt(6 / (hidden_dim + output_dim))
#self.weights_input_hidden = np.random.uniform(low=-r_input_hidden, high=r_input_hidden, size=(input_dim, hidden_dim))
#self.weights_hidden_output = np.random.uniform(low=-r_hidden_output, high=r_hidden_output, size=(hidden_dim, output_dim))
self.weights_input_hidden = np.random.uniform(low=-0.01, high=0.01, size=(input_dim, hidden_dim))
self.weights_hidden_output = np.random.uniform(low=-0.01, high=0.01, size=(hidden_dim, output_dim))
self.validation_data = np.array([])
self.validation_data_solution = np.array([])
self.velocities_input_hidden = np.zeros(self.weights_input_hidden.shape)
self.velocities_hidden_output = np.zeros(self.weights_hidden_output.shape)
if bias:
self.weights_bias_hidden = np.random.uniform(low=-0.01, high=0.01, size=((1, hidden_dim)))
self.weights_bias_output = np.random.uniform(low=-0.01, high=0.01, size=((1, output_dim)))
self.velocities_bias_hidden = np.zeros(self.weights_bias_hidden.shape)
self.velocities_bias_output = np.zeros(self.weights_bias_output.shape)
def _tanh(self, x, deriv=False):
#The derivate is: 1-np.tanh(x)**2; Because x is already the output of tanh(x) 1-x*x is the correct derivate.
if not deriv:
return np.tanh(x)
return 1-x*x
def _softmax(self, x, deriv=False):
if not deriv:
return np.exp(x) / np.sum(np.exp(x), axis=0)
return 1 - np.exp(x) / np.sum(np.exp(x), axis=0)
def set_training_data(self, training_data_input, training_data_target, validation_data_input=None, validation_data_target=None):
"""Splits the data up into training and validation data with a ratio of 0.85/0.15 if no validation data is given.
Sets the data for training."""
if len(training_data_input) != len(training_data_target):
raise ValueError(
'Number of training examples and'
' training targets does not match!'
)
if (validation_data_input is None) and (validation_data_target is None):
len_training_data = int((len(training_data_input)/100*85//1))
self.input_layer = training_data_input[:len_training_data]
self.output_layer = training_data_target[:len_training_data]
self.validation_data = training_data_input[len_training_data:]
self.validation_data_solution = training_data_target[len_training_data:]
else:
self.input_layer = training_data_input
self.output_layer = training_data_target
self.validation_data = validation_data_input
self.validation_data_solution = validation_data_target
def save(self, filename):
"""Saves the weights into a pickle file."""
with open(filename, "wb") as network_file:
pickle.dump(self.weights_input_hidden, network_file)
pickle.dump(self.weights_hidden_output, network_file)
def load(self, filename):
"""Loads network weights from a pickle file."""
with open(filename, "rb") as network_file:
weights_input_hidden = pickle.load(network_file)
weights_hidden_output = pickle.load(network_file)
if (
len(weights_input_hidden) != len(self.weights_input_hidden)
or len(weights_hidden_output) != len(self.weights_hidden_output)
):
raise ValueError(
'File contains weights that does not'
' match the current networks size!'
)
self.weights_input_hidden = weights_input_hidden
self.weights_hidden_output = weights_hidden_output
def measure_error(self, input_data, output_data):
return 1/2 * np.sum((output_data - self.forward_propagate(input_data))**2)
#return np.sum(np.nan_to_num(-output_data*np.log(self.forward_propagate(input_data))-(1-output_data)*np.log(1-self.forward_propagate(input_data))))
def forward_propagate(self, input_data, dropout=False):
"""Proceds the input data from input neurons up to output neurons and returns the output layer.
If dropout is True some of the neurons are randomly turned off."""
input_layer = input_data
self.hidden_layer = self._tanh(np.dot(input_layer, self.weights_input_hidden))
if self.bias:
self.hidden_layer += self.weights_bias_hidden
if dropout:
self.hidden_layer *= np.random.binomial([np.ones((len(input_data),self.hidden_dim))],1-self.dropout_prop)[0] * (1.0/(1-self.dropout_prop))
if self.bias:
return self._softmax((np.dot(self.hidden_layer, self.weights_hidden_output) + self.weights_bias_output).T).T
else:
return self._softmax(np.dot(self.hidden_layer, self.weights_hidden_output).T).T
#return self._softmax(output_layer.T).T
def back_propagate(self, input_data, output_data, alpha, beta, momentum):
"""Calculates the difference between target output and output and adjusts the weights to fit the target output better.
The parameter alpha is the learning rate.
Beta is the parameter for weight decay which penaltizes large weights."""
sample_count = len(input_data)
output_layer = self.forward_propagate(input_data, dropout=self.dropout)
output_layer_error = output_layer - output_data
output_layer_delta = output_layer_error * self._softmax(output_layer, deriv=True)
print("Error: ", np.mean(np.abs(output_layer_error)))
#How much did each hidden neuron contribute to the output error?
#Multiplys delta term with weights
hidden_layer_error = output_layer_delta.dot(self.weights_hidden_output.T)
#If the prediction is good, the second term will be small and the change will be small
#Ex: target: 1 -> Slope will be 1 so the second term will be big
hidden_layer_delta = hidden_layer_error * self._tanh(self.hidden_layer, deriv=True)
#The both lines return a matrix. A row stands for all weights connected to one neuron.
#E.g. [1, 2, 3] -> Weights to Neuron A
# [4, 5, 6] -> Weights to Neuron B
hidden_weights_gradient = input_data.T.dot(hidden_layer_delta)/sample_count
output_weights_gradient = self.hidden_layer.T.dot(output_layer_delta)/sample_count
velocities_input_hidden = self.velocities_input_hidden
velocities_hidden_output = self.velocities_hidden_output
self.velocities_input_hidden = velocities_input_hidden * momentum - alpha * hidden_weights_gradient
self.velocities_hidden_output = velocities_hidden_output * momentum - alpha * output_weights_gradient
#Includes momentum term and weight decay; The weight decay parameter is beta
#Weight decay penalizes large weights to prevent overfitting
self.weights_input_hidden += -velocities_input_hidden * momentum + (1 + momentum) * self.velocities_input_hidden
- alpha * beta * self.weights_input_hidden / sample_count
self.weights_hidden_output += -velocities_hidden_output * momentum + (1 + momentum) * self.velocities_hidden_output
- alpha * beta * self.weights_hidden_output / sample_count
if self.bias:
velocities_bias_hidden = self.velocities_bias_hidden
velocities_bias_output = self.velocities_bias_output
hidden_layer_delta = np.sum(hidden_layer_delta, axis=0)
output_layer_delta = np.sum(output_layer_delta, axis=0)
self.velocities_bias_hidden = velocities_bias_hidden * momentum - alpha * hidden_layer_delta
self.velocities_bias_output = velocities_bias_output * momentum - alpha * output_layer_delta
self.weights_bias_hidden += -velocities_bias_hidden * momentum + (1 + momentum) * self.velocities_bias_hidden
- alpha * beta * self.weights_bias_hidden / sample_count
self.weights_bias_output += -velocities_bias_output * momentum + (1 + momentum) * self.velocities_bias_output
- alpha * beta * self.weights_bias_output / sample_count
def batch_train(self, epochs, alpha, beta, momentum, patience=10):
"""Trains the network in batch mode that means the weights are updated after showing all training examples.
alpha is the learning rate and patience is the number of epochs that the validation error is allowed to increase before aborting.
Beta is the parameter for weight decay which penaltizes large weights."""
#The weight decay parameter is beta
validation_error = self.measure_error(self.validation_data, self.validation_data_solution)
for epoch in range(epochs):
self.back_propagate(self.input_layer, self.output_layer, alpha, beta, momentum)
validation_error_new = self.measure_error(self.validation_data, self.validation_data_solution)
if validation_error_new < validation_error:
validation_error = validation_error_new
else:
patience -= 1
if patience == 0:
print("Abort Training. Overfitting has started! Epoch: {0}. Error: {1}".format(epoch, validation_error_new))
return
print("Epoch: {0}, Validation Error: {1}".format(epoch, validation_error))
self.save("Network_Mnist.net")
def mini_batch_train(self, batch_size, epochs, alpha, beta, momentum, patience=10):
"""Trains the network in mini batch mode, that means the weights are updated after showing only a bunch of training examples.
alpha is the learning rate and patience is the number of epochs that the validation error is allowed to increase before aborting."""
validation_error = self.measure_error(self.validation_data, self.validation_data_solution)
sample_count = len(self.input_layer)
epoch_counter = 0
for epoch in range(0, epochs*batch_size, batch_size):
epoch_counter += 1
self.back_propagate(self.input_layer[epoch%sample_count:(epoch%sample_count)+batch_size],
self.output_layer[epoch%sample_count:(epoch%sample_count)+batch_size], alpha, beta, momentum)
validation_error_new = self.measure_error(self.validation_data, self.validation_data_solution)
if validation_error_new < validation_error:
validation_error = validation_error_new
patience = 20
else:
patience -= 1
if patience == 0:
print("Abort Training. Overfitting has started! Epoch: {0}. Error: {1}".format(epoch_counter, validation_error_new))
return
print("Epoch: {0}, Validation Error: {1}".format(epoch_counter, validation_error))
self.save("Network_Mnist.net")
if __name__ == "__main__":
#If the first row is a one the first output neuron should be on the second off
x = np.array([ [0, 0, 1, 1, 0],
[0, 1, 1, 1, 1],
[1, 0, 1, 1, 1],
[1, 1, 1, 1, 0],
[0, 1, 1, 1, 0],
[1, 1, 0, 0, 0],
[1, 1, 0, 0, 0],
[1, 0, 1, 0, 0] ])
y = np.array([ [0, 1],
[0, 1],
[1, 0],
[1, 0],
[0, 1],
[1, 0],
[1, 0],
[1, 0] ])
#x = np.array([ [0, 0, 1, 1] ])
#y = np.array([[0]]).T
a = FeedForwardNetwork(input_dim=5, hidden_dim=200, output_dim=2, bias=False)
a.set_training_data(x, y)
start = time.time()
a.batch_train(epochs=2000, alpha=0.05, beta=0.0001, momentum=0.99, patience=20)
print(time.time()-start)
In relation with the derivatives...
If you are using the tanh activation function, i.e. the derivative is: y' = 1 - y^2. The tanh is commonly used because it is zero-centered.
If you are using the logistic equation, then the derivative is: y' = y(1+y). The softmax has a similar derivative.
The nice thing is that all these can be expressed as functions of themselves, so you need to have a look at the def _softmax(self, x, deriv=False) function, to define it in a similar way than def _tanh(self, x, deriv=False).