I've implemented a neural network to predict the xor gate. It has 1 input layer with 2 nodes, 1 hidden layer with 2 nodes and 1 output layer with 1 node. No matter what I try to do my cost keeps on increasing. I've tried setting my learning rate to small values but that just makes the cost increase slowly. Please, any tips appreciated.
import numpy as np
train_data = np.array([[0,0],[0,1],[1,0],[1,1]]).T
labels = np.array([[0,1,1,0]])
def sigmoid(z,deriv = False):
sig = 1/(1+np.exp(-z))
if deriv == True:
return np.multiply(sig,1-sig)
return sig
w1 = np.random.randn(2,2)*0.01
b1 = np.zeros((2,1))
w2 = np.random.randn(1,2)*0.01
b2 = np.zeros((1,1))
iterations = 1000
lr = 0.1
for i in range(1000):
z1 = np.dot(w1,train_data) + b1
a1 = sigmoid(z1)
z2 = np.dot(w2,a1) + b2
al = sigmoid(z2) #forward_prop
cost = np.dot(labels,np.log(al).T) + np.dot(1-labels,np.log(1-al).T)
cost = cost*(-1/4)
cost = np.squeeze(cost)#calcost
dal = (-1/4) * (np.divide(labels,al) + np.divide(1-labels,1-al))
dz2 = np.multiply(dal,sigmoid(z2,deriv = True))
dw2 = np.dot(dz2,a1.T)
db2 = np.sum(dz2,axis=1,keepdims = True)
da1 = np.dot(w2.T,dz2)
dz1 = np.multiply(da1,sigmoid(z1,deriv = True))
dw1 = np.dot(dz1,train_data.T)
db1 = np.sum(dz1,axis=1,keepdims = True) #backprop
w1 = w1 - lr*dw1
w2 = w2 - lr*dw2
b1 = b1 - lr*db1
b2 = b2 - lr*db2 #update params
print(cost,'------',str(i))
The main mistake is in cross-entropy backprop (recommend these notes for checking). The correct formula is the following:
dal = -labels / al + (1 - labels) / (1 - al)
I have also simplified the code a little bit. Here's a complete working version:
import numpy as np
train_data = np.array([[0,0], [0,1], [1,0], [1,1]]).T
labels = np.array([0, 1, 1, 1])
def sigmoid(z):
return 1 / (1 + np.exp(-z))
w1 = np.random.randn(2,2) * 0.001
b1 = np.zeros((2,1))
w2 = np.random.randn(1,2) * 0.001
b2 = np.zeros((1,1))
lr = 0.1
for i in range(1000):
z1 = np.dot(w1, train_data) + b1
a1 = sigmoid(z1)
z2 = np.dot(w2, a1) + b2
a2 = sigmoid(z2)
cost = -np.mean(labels * np.log(a2) + (1 - labels) * np.log(1 - a2))
da2 = (a2 - labels) / (a2 * (1 - a2)) # version #1
# da2 = -labels / a2 + (1 - labels) / (1 - a2) # version #2
dz2 = np.multiply(da2, a2 * (1 - a2))
dw2 = np.dot(dz2, a1.T)
db2 = np.sum(dz2, axis=1, keepdims=True)
da1 = np.dot(w2.T, dz2)
dz1 = np.multiply(da1, a1 * (1 - a1))
dw1 = np.dot(dz1, train_data.T)
db1 = np.sum(dz1, axis=1, keepdims=True)
w1 = w1 - lr*dw1
w2 = w2 - lr*dw2
b1 = b1 - lr*db1
b2 = b2 - lr*db2
print i, cost
Related
I've written a simple neural network that can predict XOR gate function. I think I've used the math correctly, but the loss doesn't go down and remains near 0.6. Can anyone help me find the reason why?
import numpy as np
import matplotlib as plt
train_X = np.array([[0,0],[0,1],[1,0],[1,1]]).T
train_Y = np.array([[0,1,1,0]])
test_X = np.array([[0,0],[0,1],[1,0],[1,1]]).T
test_Y = np.array([[0,1,1,0]])
learning_rate = 0.1
S = 5
def sigmoid(z):
return 1/(1+np.exp(-z))
def sigmoid_derivative(z):
return sigmoid(z)*(1-sigmoid(z))
S0, S1, S2 = 2, 5, 1
m = 4
w1 = np.random.randn(S1, S0) * 0.01
b1 = np.zeros((S1, 1))
w2 = np.random.randn(S2, S1) * 0.01
b2 = np.zeros((S2, 1))
for i in range(1000000):
Z1 = np.dot(w1, train_X) + b1
A1 = sigmoid(Z1)
Z2 = np.dot(w2, A1) + b2
A2 = sigmoid(Z2)
J = np.sum(-train_Y * np.log(A2) + (train_Y-1) * np.log(1-A2)) / m
dZ2 = A2 - train_Y
dW2 = np.dot(dZ2, A1.T) / m
dB2 = np.sum(dZ2, axis = 1, keepdims = True) / m
dZ1 = np.dot(w2.T, dZ2) * sigmoid_derivative(Z1)
dW1 = np.dot(dZ1, train_X.T) / m
dB1 = np.sum(dZ1, axis = 1, keepdims = True) / m
w1 = w1 - dW1 * 0.03
w2 = w2 - dW2 * 0.03
b1 = b1 - dB1 * 0.03
b2 = b2 - dB2 * 0.03
print(J)
I think your dZ2 is not correct, as you do not multiply it with the derivative of sigmoid.
For the XOR problem, if you inspect the outputs the 1's are slightly higher than 0.5 and the 0's are slightly lower. I believe this is because the search has reached a plateau and therefore therefore progressing very slowly. I tried RMSProp which converged to almost 0 very fast. I also tried a pseudo second order algorithm, RProp, which converged almost immediately (I used iRProp-). I am showing the plot for RMSPprop below
Also, the final output of the network is now
[[1.67096234e-06 9.99999419e-01 9.99994158e-01 6.87836337e-06]]
Rounding which gets
array([[0., 1., 1., 0.]])
But, I would highly recommend to perform gradient checking to be sure that the analytical gradients match with the ones computed numerically. Also see Andrew Ng's coursera lecture on gradient checking.
I am adding the modified code to with the RMSProp implementation.
#!/usr/bin/python3
import numpy as np
import matplotlib.pyplot as plt
train_X = np.array([[0,0],[0,1],[1,0],[1,1]]).T
train_Y = np.array([[0,1,1,0]])
test_X = np.array([[0,0],[0,1],[1,0],[1,1]]).T
test_Y = np.array([[0,1,1,0]])
learning_rate = 0.1
S = 5
def sigmoid(z):
return 1/(1+np.exp(-z))
def sigmoid_derivative(z):
return sigmoid(z)*(1-sigmoid(z))
S0, S1, S2 = 2, 5, 1
m = 4
w1 = np.random.randn(S1, S0) * 0.01
b1 = np.zeros((S1, 1))
w2 = np.random.randn(S2, S1) * 0.01
b2 = np.zeros((S2, 1))
# RMSProp variables
dWsqsum1 = np.zeros_like (w1)
dWsqsum2 = np.zeros_like (w2)
dBsqsum1 = np.zeros_like (b1)
dBsqsum2 = np.zeros_like (b2)
alpha = 0.9
lr = 0.01
err_vec = list ();
for i in range(20000):
Z1 = np.dot(w1, train_X) + b1
A1 = sigmoid(Z1)
Z2 = np.dot(w2, A1) + b2
A2 = sigmoid(Z2)
J = np.sum(-train_Y * np.log(A2) + (train_Y-1) * np.log(1-A2)) / m
dZ2 = (A2 - train_Y) * sigmoid_derivative (Z2);
dW2 = np.dot(dZ2, A1.T) / m
dB2 = np.sum(dZ2, axis = 1, keepdims = True) / m
dZ1 = np.dot(w2.T, dZ2) * sigmoid_derivative(Z1)
dW1 = np.dot(dZ1, train_X.T) / m
dB1 = np.sum(dZ1, axis = 1, keepdims = True) / m
# RMSProp update
dWsqsum1 = alpha * dWsqsum1 + (1 - learning_rate) * np.square (dW1);
dWsqsum2 = alpha * dWsqsum2 + (1 - learning_rate) * np.square (dW2);
dBsqsum1 = alpha * dBsqsum1 + (1 - learning_rate) * np.square (dB1);
dBsqsum2 = alpha * dBsqsum2 + (1 - learning_rate) * np.square (dB2);
w1 = w1 - (lr * dW1 / (np.sqrt (dWsqsum1) + 10e-10));
w2 = w2 - (lr * dW2 / (np.sqrt (dWsqsum2) + 10e-10));
b1 = b1 - (lr * dB1 / (np.sqrt (dBsqsum1) + 10e-10));
b2 = b2 - (lr * dB2 / (np.sqrt (dBsqsum2) + 10e-10));
print(J)
err_vec.append (J);
Z1 = np.dot(w1, train_X) + b1
A1 = sigmoid(Z1)
Z2 = np.dot(w2, A1) + b2
A2 = sigmoid(Z2)
print ("\n", A2);
plt.plot (np.array (err_vec));
plt.show ();
I want to define a custom loss function in Keras with Tensorflow backend which uses only the predicted y values, regardless of the true ones. The graph compiles successfully, but at the start of the training it returns an exception: InvalidArgumentError (see above for traceback): Self-adjoint eigen decomposition was not successful. The input might not be valid. I have tried replacing my data with random dummy data, but it produces the same exception.
My full code of the loss definition can be found below. Why is the input to the
tf.self_adjoint_eig not valid?
def model_correlation_loss(representation_size, k_singular_values):
global batch_size
def keras_loss(y_true, y_pred):
global batch_size
regularization_constant_1 = regularization_constant_2 = 1e-4
epsilon = 1e-12
o1 = o2 = int(y_pred.shape[1] // 2)
h_1 = y_pred[:, 0:o1]
h_2 = y_pred[:, o1:o1+o2]
h_1 = tf.transpose(h_1)
h_2 = tf.transpose(h_2)
m = tf.shape(h_1)[1]
centered_h_1 = h_1 - tf.cast(tf.divide(1, m), tf.float32) * tf.matmul(h_1, tf.ones(shape=(m, m)))
centered_h_2 = h_2 - tf.cast(tf.divide(1, m), tf.float32) * tf.matmul(h_2, tf.ones(shape=(m, m)))
sigma_hat_12 = tf.cast(tf.divide(1, m - 1), tf.float32) * tf.matmul(centered_h_1, tf.transpose(centered_h_2))
sigma_hat_11 = tf.cast(tf.divide(1, m - 1), tf.float32) * tf.matmul(centered_h_1, tf.transpose(centered_h_1)) + regularization_constant_1 * tf.eye(num_rows=o1)
sigma_hat_22 = tf.cast(tf.divide(1, m - 1), tf.float32) * tf.matmul(centered_h_2, tf.transpose(centered_h_2)) + regularization_constant_2 * tf.eye(num_rows=o2)
w_1, v_1 = tf.self_adjoint_eig(sigma_hat_11)
w_2, v_2 = tf.self_adjoint_eig(sigma_hat_22)
zero = tf.constant(False, dtype=tf.bool)
idx_pos_entries_1 = tf.where(tf.equal(tf.greater(w_1, epsilon), True))
idx_pos_entries_1 = tf.reshape(idx_pos_entries_1, [-1, tf.shape(idx_pos_entries_1)[0]])[0]
w_1 = tf.gather(w_1, idx_pos_entries_1)
v_1 = tf.gather(v_1, idx_pos_entries_1)
idx_pos_entries_2 = tf.where(tf.equal(tf.greater(w_2, epsilon), True))
idx_pos_entries_2 = tf.reshape(idx_pos_entries_2, [-1, tf.shape(idx_pos_entries_2)[0]])[0]
w_2 = tf.gather(w_2, idx_pos_entries_2)
v_2 = tf.gather(v_2, idx_pos_entries_2)
sigma_hat_rootinvert_11 = tf.matmul(tf.matmul(v_1, tf.diag(tf.sqrt(w_1))), tf.transpose(v_1))
sigma_hat_rootinvert_22 = tf.matmul(tf.matmul(v_2, tf.diag(tf.sqrt(w_2))), tf.transpose(v_2))
t_matrix = tf.matmul(tf.matmul(sigma_hat_rootinvert_11, sigma_hat_12), sigma_hat_rootinvert_22)
if k_singular_values == representation_size: # use all
correlation = tf.sqrt(tf.trace(tf.matmul(K.transpose(t_matrix), t_matrix)))
return correlation
return keras_loss
Here's the tf code provided by Wang on his website for computing the loss function:
def CCA_loss(H1, H2, N, d1, d2, dim, rcov1, rcov2):
# Remove mean.
m1 = tf.reduce_mean(H1, axis=0, keep_dims=True)
H1 = tf.subtract(H1, m1)
m2 = tf.reduce_mean(H2, axis=0, keep_dims=True)
H2 = tf.subtract(H2, m2)
S11 = tf.matmul(tf.transpose(H1), H1) / (N-1) + rcov1 * tf.eye(d1)
S22 = tf.matmul(tf.transpose(H2), H2) / (N-1) + rcov2 * tf.eye(d2)
S12 = tf.matmul(tf.transpose(H1), H2) / (N-1)
E1, V1 = tf.self_adjoint_eig(S11)
E2, V2 = tf.self_adjoint_eig(S22)
# For numerical stability.
idx1 = tf.where(E1>eps_eig)[:,0]
E1 = tf.gather(E1, idx1)
V1 = tf.gather(V1, idx1, axis=1)
idx2 = tf.where(E2>eps_eig)[:,0]
E2 = tf.gather(E2, idx2)
V2 = tf.gather(V2, idx2, axis=1)
K11 = tf.matmul( tf.matmul(V1, tf.diag(tf.reciprocal(tf.sqrt(E1)))), tf.transpose(V1))
K22 = tf.matmul( tf.matmul(V2, tf.diag(tf.reciprocal(tf.sqrt(E2)))), tf.transpose(V2))
T = tf.matmul( tf.matmul(K11, S12), K22)
# Eigenvalues are sorted in increasing order.
E2, U = tf.self_adjoint_eig(tf.matmul(T, tf.transpose(T)))
return tf.reduce_sum(tf.sqrt(E2[-dim:]))
Following Andrew Traks's example, I want to implement a 3 layer neural network - 1 input, 1 hidden, 1 output - with a simple dropout, for binary classification.
If I include bias terms b1 and b2, then I would need to slightly modify Andrew's code as below.
X = np.array([ [0,0,1],[0,1,1],[1,0,1],[1,1,1] ])
y = np.array([[0,1,1,0]]).T
alpha,hidden_dim,dropout_percent = (0.5,4,0.2)
synapse_0 = 2*np.random.random((X.shape[1],hidden_dim)) - 1
synapse_1 = 2*np.random.random((hidden_dim,1)) - 1
b1 = np.zeros(hidden_dim)
b2 = np.zeros(1)
for j in range(60000):
# sigmoid activation function
layer_1 = (1/(1+np.exp(-(np.dot(X,synapse_0) + b1))))
# dropout
layer_1 *= np.random.binomial([np.ones((len(X),hidden_dim))],1-dropout_percent)[0] * (1.0/(1-dropout_percent))
layer_2 = 1/(1+np.exp(-(np.dot(layer_1,synapse_1) + b2)))
# sigmoid derivative = s(x)(1-s(x))
layer_2_delta = (layer_2 - y)*(layer_2*(1-layer_2))
layer_1_delta = layer_2_delta.dot(synapse_1.T) * (layer_1 * (1-layer_1))
synapse_1 -= (alpha * layer_1.T.dot(layer_2_delta))
synapse_0 -= (alpha * X.T.dot(layer_1_delta))
b1 -= alpha*layer_1_delta
b2 -= alpha*layer_2_delta
The problem is, of course, with the code above the dimensions of b1 dont match with the dimensions of layer_1_delta, similarly with b2 and layer_2_delta.
I don't understand how the delta is calculated to update b1 and b2 - according to Michael Nielsen's example, b1 and b2 should be updated by a delta which in my code I believe to be layer_1_delta and layer_2_delta respectively.
What am I doing wrong here? Have I messed up the dimensionality of the deltas or of the biases? I feel it is the latter, because if I remove the biases from this code it works fine. Thanks in advance
So first I would change X in bX to 0 and 1 to correspond to synapse_X, because this is where they belong and it makes it:
b1 -= alpha * 1.0 / m * np.sum(layer_2_delta)
b0 -= alpha * 1.0 / m * np.sum(layer_1_delta)
Where m is the number of examples in the training set. Also, the drop rate is stupidly high and actually hurts convergence. So in all considered the whole code:
import numpy as np
X = np.array([ [0,0,1],[0,1,1],[1,0,1],[1,1,1] ])
m = X.shape[0]
y = np.array([[0,1,1,0]]).T
alpha,hidden_dim,dropout_percent = (0.5,4,0.02)
synapse_0 = 2*np.random.random((X.shape[1],hidden_dim)) - 1
synapse_1 = 2*np.random.random((hidden_dim,1)) - 1
b0 = np.zeros(hidden_dim)
b1 = np.zeros(1)
for j in range(10000):
# sigmoid activation function
layer_1 = (1/(1+np.exp(-(np.dot(X,synapse_0) + b0))))
# dropout
layer_1 *= np.random.binomial([np.ones((len(X),hidden_dim))],1-dropout_percent)[0] * (1.0/(1-dropout_percent))
layer_2 = 1/(1+np.exp(-(np.dot(layer_1,synapse_1) + b1)))
# sigmoid derivative = s(x)(1-s(x))
layer_2_delta = (layer_2 - y)*(layer_2*(1-layer_2))
layer_1_delta = layer_2_delta.dot(synapse_1.T) * (layer_1 * (1-layer_1))
synapse_1 -= (alpha * layer_1.T.dot(layer_2_delta))
synapse_0 -= (alpha * X.T.dot(layer_1_delta))
b1 -= alpha * 1.0 / m * np.sum(layer_2_delta)
b0 -= alpha * 1.0 / m * np.sum(layer_1_delta)
print layer_2
I am slightly new to python and I am trying to convert some code.This is an approximation method. Which isn't important. In my oddev function I get returned
c2[1:modes+1] = v* 1j
ValueError: could not broadcast input array from shape (25) into shape (25,1)
When I do this Matlab I believe it automatically casts it, and will store the complex array. The function is a getting the coefficient from a partial sine transform to do this. At first I tried storing the random matrix which just an array using np.matlib method and this had the same shape but I believe I will lose the real values of the filter when I cast it. How do I store this?
import math
import numpy as np
def quickcontmin(datain):
n = np.shape(datain)[0]
m = math.floor(n / 2)
modes = math.floor(m / 2)
addl = 20
nn = 20 * n
chi = 10 ** -13
def evenhp(xv):
"Even high pass"
n1 = np.shape(xv)[0]
vx = np.array(xv[:-1])
vx = vx[::-1]
c1 = np.append(xv,vx)
c1 = np.fft.fft(c1)
c1[0:modes-1] = 0.0
c1[-1 - modes + 2:-1] = 0.0
evenl = np.real(np.fft.ifft(c1))
even = evenl[0:n1-1]
return even
def evenhpt(xv):
" Transpose of EvenHP"
n1 = np.shape(xv)[0]
xy = np.zeros((n1- 2, 1))
c1 = np.append(xv,xy)
c1 = np.fft.fft(c1)
c1[0:modes-1] = 0.0
c1[-1 - modes + 1:-1] = 0.0
evenl = np.real(np.fft.ifft(c1))
even = evenl[0:n1-1]
even[1:-2] = even[1:-2] + evenl[-1:-1:n1+1]
return even``
def evenlp(xv):
" Low pass cosine filter"
n1 = np.shape(xv)[0]
vx = np.array(xv[:-1])
vx = vx[::-1]
c1 = np.append(xv,vx)
c1 = np.fft.fft(c1)
c1[modes + 1:-1 - modes + 1] = 0.0
evenl = np.real(np.fft.ifft(c1))
even = evenl[0:n1-1]
return even
def oddev(xv):
"Evaluate the sine modes on the grid"
c2 = np.zeros((2 *n - 2, 1))*1j
v = np.array(xv[:])
v1 = v[:-1]
v1 = v[::-1]
c2[1:modes+1] = v* 1j
c2[-1 - modes + 1:-1] = -v1* 1j
evall = np.fft.ifft(c2) * math.sqrt(2 * n - 2)
eva = evall[0:n-1]
return eva
def oddevt(xv):
" Transpose the sine modes on the function OddEv"
c1 = np.array(xv[1:-2])
c1 = np.insert(c1,0.0,0)
c1 = np.append(c1,0.0)
c1 = np.append(c1,xv[-2:-1:2])
c1a = np.divide(np.fft.fft(c1),math.sqrt(2 * n - 2))
fcoef = np.imag(c1a[1:modes])
return fcoef
def eextnd(xv):
"Obtain cosine coefficients and evalue on the refined grid"
vx = np.array(xv[:-1])
vx = vx[::-1]
c1 = np.append(xv,vx)
c1 = np.fft.fft(c1)
cL = np.zeros((2*nn-2,1))
cL[0:modes-1] = c1[0:modes-1]
cL[-1 - modes + 1:-1] = c1[-1 - modes + 1:-1]
evenexL = np.multiply(np.fft.ifft(cL) , (nn - 1) / (n - 1))
evenex = evenexL[0:nn-1]
return evenex
def oextnd(xv):
"Evaluate sine coefficients on the refined grid"
c2 = np.zeros((2 * nn - 2, 1))
c2[0] = 0.0
c2[1:modes + 1] = np.multiply(xv[0:-1],1j)
c2[-1 - modes + 1:-1] = np.multiply(-xv[-1:-1:1],1j)
evall = np.real(np.multiply(np.fft.ifft(c2), math.sqrt(2 * n - 2) * (2 *nn - 2) / (2 * n - 2)))
oox = evall[0:nn-1]
return oox
dc = evenlp(datain)
#L in paper, number of vectors used to sample the columnspace
lll = round(4 * math.log(m )/ math.log(2)) + addl
lll = int(lll)
#The following should be straightforward from the psuedo-code
w=2 * np.random.rand(modes , lll) - 1
p=np.matlib.zeros(shape=(n,lll))
for j in range(lll):
p[:,j] = evenhp(oddev(w[:,j]))
q,r = np.linalg.qr(p , mode='reduced')
z = np.zeros(shape=(modes,lll))
for j in range(lll):
z[:,j]= oddevt(evenhpt(q[:,j]))
un,s,v = np.linalg.svd(z,full_matrices='False')
ds=np.diag(s)
aa=np.extract(np.diag(s)>(chi))
aa[-1] = aa
aa = int(aa)
s = 0 * s
for j in range(aa):
s[j,j] = 1.0 / ds(j)
#find the sine coefficents
b=un*s* v.T* q.T* evenhp(datain)
#Constructing the continuation
exs=oddev(b)
pexs = evenlp(exs)
dataCont=exs-pexs+dc
dataCont[n+1:2*n-2]=-exs[-2:-1:1]-pexs[-2:-1:1]+dc[-2:-1:1]
#Evaluate the continuation on the refined grid
dataRefined=eextnd(dc-exs)+oextnd(b)
return dataRefined, dataCont
n1 = 100
t = np.linspace(0,2*math.pi,n1)
y = np.sin(t)
data = quickcontmin(y)
dc1 = data[1]
dc1 = dc1[0:n1-1]`
Replacing c2[1:modes+1] = v* 1j by c2[1:modes+1, 0] = v* 1j should fix that specific error.
More consistent would be to replace:
v = np.array(xv[:])
v1 = v[:-1]
v1 = v[::-1]
by
v = xv
v1 = v[:-1]
v is already a column vector so you don't need to transform it into a 1d vector when you later need a column vector.
I am building a sketch of a neural network in Python 3.4 with numpy and matrices to learn a simple XOR.
My Notation is as follows:
a is the activity of a neuron
z is the input of a neuron
W is a weight matrix with size R^{#number of neurons in previous layer}x{#number of neurons in next layer}
B is a vector of bias values
After implementing a very simple network in python, everything works fine when training on only a single input vector. However, when training on all four training examples of XOR the error function shows a quite weird behaviour (see picture) and the output of the network is always roughly 0.5.
Changing the network size, the learning rate or the training epochs does not seem to help.
Cost J while only training on one training example
Cost J while training with all training examples
This is the code for the network:
import numpy as np
import time
import matplotlib.pyplot as plt
Js = []
start = time.time()
np.random.seed(2)
#Sigmoid
def activation(x, derivative = False):
if(derivative):
a = activation(x)
return a * (1 - a)
else:
return 1/(1+np.exp(-x))
def cost(output, target):
return (1/2) * np.sum((target - output)**2)
INPUTS = np.array([
[0, 1],
[1, 0],
[0, 0],
[1, 1],
])
TARGET = np.array([
[1],
[1],
[0],
[0],
])
"Hyper-Parameters"
# Layer Structure
LAYER = [2, 3, 1]
LEARNING_RATE = 0.1
ITERATIONS = int(1e3)
# Init Weights
W1 = np.random.rand(LAYER[0], LAYER[1])
W2 = np.random.rand(LAYER[1], LAYER[2])
# Init Biases
B1 = np.random.rand(LAYER[1], 1)
B2 = np.random.rand(LAYER[2], 1)
for i in range(0, ITERATIONS):
exampleIndex = i % len(INPUTS)
#exampleIndex = 2
"Forward Pass"
# Layer One Activity (Input layer)
A0 = np.transpose(INPUTS[exampleIndex:exampleIndex+1])
# Layer Two Activity (Hidden Layer)
Z1 = np.dot(np.transpose(W1), A0) + B1
A1 = activation(Z1)
# Layer Three Activity (Output Layer)
Z2 = np.dot(np.transpose(W2), A1) + B2
A2 = activation(Z2)
# Output
O = A2
# Cost J
# Target Vector T
T = np.transpose(TARGET[exampleIndex:exampleIndex+1])
J = cost(O, T)
Js.append(J)
print("J = {}".format(J))
print("I = {}, O = {}".format(A0, O))
"Backward Pass"
# Calculate Delta of output layer
D2 = (O - T) * activation(Z2, True)
# Calculate Delta of hidden layer
D1 = np.dot(W2, D2) * activation(Z1, True)
# Calculate Derivatives w.r.t. W2
DerW2 = np.dot(A1, np.transpose(D2))
# Calculate Derivatives w.r.t. W1
DerW1 = np.dot(A0, np.transpose(D1))
# Calculate Derivatives w.r.t. B2
DerB2 = D2
# Calculate Derivatives w.r.t. B1
DerB1 = D1
"Update Weights and Biases"
W1 -= LEARNING_RATE * DerW1
B1 -= LEARNING_RATE * DerB1
W2 -= LEARNING_RATE * DerW2
B2 -= LEARNING_RATE * DerB2
# Show prediction
print("Time elapsed {}s".format(time.time() - start))
plt.plot(Js)
plt.ylabel("Cost J")
plt.xlabel("Iterations")
plt.show()
What could be the reason for this strange behaviour in my implementation?
I think your cost function is jumping since you perform your weight updates after each sample. However, your network is training the correct behavior nonetheless:
479997
J = 4.7222501603409765e-05
I = [[1]
[0]], O = [[ 0.99028172]]
T = [[1]]
479998
J = 7.3205311398742e-05
I = [[0]
[0]], O = [[ 0.01210003]]
T = [[0]]
479999
J = 4.577485181547362e-05
I = [[1]
[1]], O = [[ 0.00956816]]
T = [[0]]
480000
J = 4.726257702199439e-05
I = [[0]
[1]], O = [[ 0.9902776]]
T = [[1]]
The cost function shows some interesting behavior: the training process reaches a point where jumps in the cost function will become quite small.
You can reproduce this with the code below (I have only made slight changes; note that I trained over much more epochs):
import numpy as np
import time
import matplotlib.pyplot as plt
Js = []
start = time.time()
np.random.seed(2)
#Sigmoid
def activation(x, derivative = False):
if(derivative):
a = activation(x)
return a * (1 - a)
else:
return 1/(1+np.exp(-x))
def cost(output, target):
return (1/2) * np.sum((target - output)**2)
INPUTS = np.array([[0, 1],[1, 0],[0, 0],[1, 1]])
TARGET = np.array([[1],[1],[0],[0]])
"Hyper-Parameters"
# Layer Structure
LAYER = [2, 3, 1]
LEARNING_RATE = 0.1
ITERATIONS = int(5e5)
# Init Weights
W1 = np.random.rand(LAYER[0], LAYER[1])
W2 = np.random.rand(LAYER[1], LAYER[2])
# Init Biases
B1 = np.random.rand(LAYER[1], 1)
B2 = np.random.rand(LAYER[2], 1)
for i in range(0, ITERATIONS):
exampleIndex = i % len(INPUTS)
# exampleIndex = 2
"Forward Pass"
# Layer One Activity (Input layer)
A0 = np.transpose(INPUTS[exampleIndex:exampleIndex+1])
# Layer Two Activity (Hidden Layer)
Z1 = np.dot(np.transpose(W1), A0) + B1
A1 = activation(Z1)
# Layer Three Activity (Output Layer)
Z2 = np.dot(np.transpose(W2), A1) + B2
A2 = activation(Z2)
# Output
O = A2
# Cost J
# Target Vector T
T = np.transpose(TARGET[exampleIndex:exampleIndex+1])
J = cost(O, T)
Js.append(J)
# print("J = {}".format(J))
# print("I = {}, O = {}".format(A0, O))
# print("T = {}".format(T))
if ((i+3) % 20000 == 0):
print(i)
print("J = {}".format(J))
print("I = {}, O = {}".format(A0, O))
print("T = {}".format(T))
if ((i+2) % 20000 == 0):
print(i)
print("J = {}".format(J))
print("I = {}, O = {}".format(A0, O))
print("T = {}".format(T))
if ((i+1) % 20000 == 0):
print(i)
print("J = {}".format(J))
print("I = {}, O = {}".format(A0, O))
print("T = {}".format(T))
if (i % 20000 == 0):
print(i)
print("J = {}".format(J))
print("I = {}, O = {}".format(A0, O))
print("T = {}".format(T))
"Backward Pass"
# Calculate Delta of output layer
D2 = (O - T) * activation(Z2, True)
# Calculate Delta of hidden layer
D1 = np.dot(W2, D2) * activation(Z1, True)
# Calculate Derivatives w.r.t. W2
DerW2 = np.dot(A1, np.transpose(D2))
# Calculate Derivatives w.r.t. W1
DerW1 = np.dot(A0, np.transpose(D1))
# Calculate Derivatives w.r.t. B2
DerB2 = D2
# Calculate Derivatives w.r.t. B1
DerB1 = D1
"Update Weights and Biases"
W1 -= LEARNING_RATE * DerW1
B1 -= LEARNING_RATE * DerB1
W2 -= LEARNING_RATE * DerW2
B2 -= LEARNING_RATE * DerB2
# Show prediction
print("Time elapsed {}s".format(time.time() - start))
plt.plot(Js)
plt.ylabel("Cost J")
plt.xlabel("Iterations")
plt.savefig('cost.pdf')
plt.show()
In order to reduce fluctuations in the cost function, one usually uses multiple data samples before performing an update (some averaged update), but I see that this is difficult in a set containing only four different training events.
So, to conclude this rather long answer: your cost function jumps because it is calculated for every single example and not for an average of multiple examples. However, the network output follows the distribution of the XOR function quite well, so you don't need to change it.