I've written a simple neural network that can predict XOR gate function. I think I've used the math correctly, but the loss doesn't go down and remains near 0.6. Can anyone help me find the reason why?
import numpy as np
import matplotlib as plt
# XOR truth table. After the transpose each column of train_X is one example
# and train_Y holds the matching target in the same column.
train_X = np.array([[0,0],[0,1],[1,0],[1,1]]).T
train_Y = np.array([[0,1,1,0]])
# test_X / test_Y duplicate the training data and are not read below.
test_X = np.array([[0,0],[0,1],[1,0],[1,1]]).T
test_Y = np.array([[0,1,1,0]])
# NOTE(review): learning_rate and S are never read below; the parameter
# updates use a literal step size of 0.03 instead.
learning_rate = 0.1
S = 5
def sigmoid(z):
    """Return the logistic function 1 / (1 + exp(-z)) of z."""
    return 1/(1+np.exp(-z))
def sigmoid_derivative(z):
    """Return sigmoid(z) * (1 - sigmoid(z)), the derivative of sigmoid at z."""
    return sigmoid(z)*(1-sigmoid(z))
# Layer widths: S0 inputs, S1 hidden units, S2 outputs. m = 4 matches the four
# training columns and is the divisor used for the loss and the gradients.
S0, S1, S2 = 2, 5, 1
m = 4
# Weights start as standard-normal draws scaled by 0.01; biases start at zero.
w1 = np.random.randn(S1, S0) * 0.01
b1 = np.zeros((S1, 1))
w2 = np.random.randn(S2, S1) * 0.01
b2 = np.zeros((S2, 1))
for i in range(1000000):
    # Forward pass: hidden activations A1, then the network output A2.
    Z1 = np.dot(w1, train_X) + b1
    A1 = sigmoid(Z1)
    Z2 = np.dot(w2, A1) + b2
    A2 = sigmoid(Z2)
    # (train_Y-1) * log(1-A2) equals -(1-train_Y) * log(1-A2), so J is the
    # binary cross-entropy averaged over the m examples.
    J = np.sum(-train_Y * np.log(A2) + (train_Y-1) * np.log(1-A2)) / m
    # Backward pass. A2 - train_Y is dJ/dZ2 for a sigmoid output combined with
    # the cross-entropy above; the 1/m factor is applied in dW2 / dB2.
    dZ2 = A2 - train_Y
    dW2 = np.dot(dZ2, A1.T) / m
    dB2 = np.sum(dZ2, axis = 1, keepdims = True) / m
    dZ1 = np.dot(w2.T, dZ2) * sigmoid_derivative(Z1)
    dW1 = np.dot(dZ1, train_X.T) / m
    dB1 = np.sum(dZ1, axis = 1, keepdims = True) / m
    # Plain gradient-descent step with a hard-coded rate of 0.03.
    w1 = w1 - dW1 * 0.03
    w2 = w2 - dW2 * 0.03
    b1 = b1 - dB1 * 0.03
    b2 = b2 - dB2 * 0.03
    # NOTE(review): indentation was lost in this listing; the print is assumed
    # to sit inside the loop (one line per iteration) - confirm.
    print(J)
I think your dZ2 is not correct, as you do not multiply it with the derivative of sigmoid.
For the XOR problem, if you inspect the outputs, the 1's are slightly higher than 0.5 and the 0's are slightly lower. I believe this is because the search has reached a plateau and is therefore progressing very slowly. I tried RMSProp, which converged to almost 0 very fast. I also tried a pseudo-second-order algorithm, RProp, which converged almost immediately (I used iRProp-). I am showing the plot for RMSProp below.
Also, the final output of the network is now
[[1.67096234e-06 9.99999419e-01 9.99994158e-01 6.87836337e-06]]
Rounding which gets
array([[0., 1., 1., 0.]])
But, I would highly recommend to perform gradient checking to be sure that the analytical gradients match with the ones computed numerically. Also see Andrew Ng's coursera lecture on gradient checking.
I am adding the modified code with the RMSProp implementation.
#!/usr/bin/python3
import numpy as np
import matplotlib.pyplot as plt
# XOR truth table. After the transpose each column of train_X is one example
# and train_Y holds the matching target in the same column.
train_X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]]).T
train_Y = np.array([[0, 1, 1, 0]])
test_X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]]).T
test_Y = np.array([[0, 1, 1, 0]])

# Retained from the original script; neither value is read by the training
# code (the RMSProp step size is the ``lr`` argument of ``train``).
learning_rate = 0.1
S = 5

# Layer widths (input, hidden, output) and the number of training examples.
S0, S1, S2 = 2, 5, 1
m = 4


def sigmoid(z):
    """Return the logistic function 1 / (1 + exp(-z)) of z."""
    return 1 / (1 + np.exp(-z))


def sigmoid_derivative(z):
    """Return sigmoid(z) * (1 - sigmoid(z)), the derivative of sigmoid at z."""
    s = sigmoid(z)
    return s * (1 - s)


def rmsprop_step(param, grad, sq_avg, lr=0.01, alpha=0.9, eps=10e-10):
    """Apply one RMSProp update and return ``(new_param, new_sq_avg)``.

    The running average of squared gradients is an exponential moving
    average: the previous average is weighted by ``alpha`` and the new
    squared gradient by ``1 - alpha``. The earlier version of this script
    weighted the new term by ``1 - learning_rate``, an unrelated constant,
    which inflated the average and shrank every step.

    ``eps`` keeps the division finite when the average is zero.
    """
    sq_avg = alpha * sq_avg + (1 - alpha) * np.square(grad)
    new_param = param - lr * grad / (np.sqrt(sq_avg) + eps)
    return new_param, sq_avg


def forward(w1, b1, w2, b2, X):
    """Run the two-layer sigmoid network on X and return (Z1, A1, Z2, A2)."""
    Z1 = np.dot(w1, X) + b1
    A1 = sigmoid(Z1)
    Z2 = np.dot(w2, A1) + b2
    A2 = sigmoid(Z2)
    return Z1, A1, Z2, A2


def train(iterations=20000, lr=0.01, alpha=0.9, verbose=True):
    """Train the XOR network with RMSProp.

    Args:
        iterations: number of full-batch updates.
        lr: RMSProp step size.
        alpha: decay of the squared-gradient moving average.
        verbose: print the loss after every update, as the original did.

    Returns:
        ``((w1, b1, w2, b2), err_vec)`` where ``err_vec`` is the list of
        cross-entropy losses, one per iteration.
    """
    w1 = np.random.randn(S1, S0) * 0.01
    b1 = np.zeros((S1, 1))
    w2 = np.random.randn(S2, S1) * 0.01
    b2 = np.zeros((S2, 1))

    # One squared-gradient accumulator per parameter tensor.
    sq_w1 = np.zeros_like(w1)
    sq_w2 = np.zeros_like(w2)
    sq_b1 = np.zeros_like(b1)
    sq_b2 = np.zeros_like(b2)

    err_vec = []
    for _ in range(iterations):
        Z1, A1, Z2, A2 = forward(w1, b1, w2, b2, train_X)
        # Binary cross-entropy averaged over the m examples.
        J = np.sum(-train_Y * np.log(A2) + (train_Y - 1) * np.log(1 - A2)) / m

        # NOTE(review): multiplying by sigmoid_derivative(Z2) gives the delta
        # of a squared-error loss; for the cross-entropy J reported above the
        # exact delta is A2 - train_Y. Kept as posted - confirm intent.
        dZ2 = (A2 - train_Y) * sigmoid_derivative(Z2)
        dW2 = np.dot(dZ2, A1.T) / m
        dB2 = np.sum(dZ2, axis=1, keepdims=True) / m
        dZ1 = np.dot(w2.T, dZ2) * sigmoid_derivative(Z1)
        dW1 = np.dot(dZ1, train_X.T) / m
        dB1 = np.sum(dZ1, axis=1, keepdims=True) / m

        w1, sq_w1 = rmsprop_step(w1, dW1, sq_w1, lr, alpha)
        w2, sq_w2 = rmsprop_step(w2, dW2, sq_w2, lr, alpha)
        b1, sq_b1 = rmsprop_step(b1, dB1, sq_b1, lr, alpha)
        b2, sq_b2 = rmsprop_step(b2, dB2, sq_b2, lr, alpha)

        if verbose:
            print(J)
        err_vec.append(J)
    return (w1, b1, w2, b2), err_vec


if __name__ == "__main__":
    params, err_vec = train()
    w1, b1, w2, b2 = params
    # Final predictions for the four XOR inputs, then the loss curve.
    Z1, A1, Z2, A2 = forward(w1, b1, w2, b2, train_X)
    print("\n", A2)
    plt.plot(np.array(err_vec))
    plt.show()
Related
import numpy as np
# Sample data: the three arrays are identical in the question as posted.
x = np.array([1, 2, 3, 4, 5, 6, 7])
f1 = np.array([1, 2, 3, 4, 5, 6, 7])
f2 = np.array([1, 2, 3, 4, 5, 6, 7])


def func(w1, w2, x, f1, f2):
    """Return the standard deviation of x / (w1 * f1 + w2 * f2).

    The weights are constrained to sum to 1: the ``w1`` argument is
    overwritten with ``1 - w2``, so only ``w2`` influences the result.
    """
    w1 = 1 - w2
    return np.std(x / (w1 * f1 + w2 * f2))
I need my code to minimize func(w1,w2,x,f1,f2) by changing w1 and w2, and then give me the resulting w1 and w2 values. w1 + w2 should be equal to 1.
Something like this might be what you need:
# The snippet calls scipy.optimize.minimize, so the submodule must be imported
# explicitly; the question's code only imports numpy.
import scipy.optimize

# Random positive integer data in [1, 10), seven values each.
x = np.random.randint(1, 10, 7)
f1 = np.random.randint(1, 10, 7)
f2 = np.random.randint(1, 10, 7)


def func(w, x, f1, f2):  # no need to pass w1 and w2 separately
    """Return std(x / (w1 * f1 + w2 * f2)) with w1 = w[0] and w2 = 1 - w[0].

    ``w`` is a length-1 sequence because the constraint w1 + w2 = 1 leaves a
    single free parameter for the optimizer.
    """
    return np.std(x / (w[0] * f1 + (1 - w[0]) * f2))


# Bounding w1 to [0, 1] keeps both weights non-negative.
res = scipy.optimize.minimize(func, x0=[0.5], args=(x, f1, f2), bounds=[(0, 1)])
w1 = res.x[0]
w2 = 1 - w1
print("Optimal weights are", w1, w2)
I want to define a custom loss function in Keras with Tensorflow backend which uses only the predicted y values, regardless of the true ones. The graph compiles successfully, but at the start of the training it returns an exception: InvalidArgumentError (see above for traceback): Self-adjoint eigen decomposition was not successful. The input might not be valid. I have tried replacing my data with random dummy data, but it produces the same exception.
My full code of the loss definition can be found below. Why is the input to the
tf.self_adjoint_eig not valid?
def model_correlation_loss(representation_size, k_singular_values):
    """Build and return a Keras loss closure computed from y_pred only.

    The returned ``keras_loss(y_true, y_pred)`` never reads ``y_true``. It
    splits the columns of ``y_pred`` into two halves, forms their regularized
    covariance matrices, eigendecomposes them and combines the results into a
    single scalar.

    NOTE(review): indentation was lost in this listing; the nesting below is
    reconstructed from the syntax - confirm against the original.
    """
    # NOTE(review): batch_size is declared global here and in keras_loss but is
    # not read anywhere in the visible code.
    global batch_size
    def keras_loss(y_true, y_pred):
        global batch_size
        regularization_constant_1 = regularization_constant_2 = 1e-4
        epsilon = 1e-12
        # Each view gets half of y_pred's second dimension.
        o1 = o2 = int(y_pred.shape[1] // 2)
        h_1 = y_pred[:, 0:o1]
        h_2 = y_pred[:, o1:o1+o2]
        # After the transpose, axis 1 indexes the rows of y_pred.
        h_1 = tf.transpose(h_1)
        h_2 = tf.transpose(h_2)
        # m is the number of rows of y_pred (presumably the batch size).
        m = tf.shape(h_1)[1]
        # h @ ones(m, m) puts each row's sum in every column, so this subtracts
        # the per-row mean.
        centered_h_1 = h_1 - tf.cast(tf.divide(1, m), tf.float32) * tf.matmul(h_1, tf.ones(shape=(m, m)))
        centered_h_2 = h_2 - tf.cast(tf.divide(1, m), tf.float32) * tf.matmul(h_2, tf.ones(shape=(m, m)))
        # Cross- and auto-covariance estimates with a 1 / (m - 1) factor; the
        # auto-covariances get a small multiple of the identity added.
        sigma_hat_12 = tf.cast(tf.divide(1, m - 1), tf.float32) * tf.matmul(centered_h_1, tf.transpose(centered_h_2))
        sigma_hat_11 = tf.cast(tf.divide(1, m - 1), tf.float32) * tf.matmul(centered_h_1, tf.transpose(centered_h_1)) + regularization_constant_1 * tf.eye(num_rows=o1)
        sigma_hat_22 = tf.cast(tf.divide(1, m - 1), tf.float32) * tf.matmul(centered_h_2, tf.transpose(centered_h_2)) + regularization_constant_2 * tf.eye(num_rows=o2)
        w_1, v_1 = tf.self_adjoint_eig(sigma_hat_11)
        w_2, v_2 = tf.self_adjoint_eig(sigma_hat_22)
        # NOTE(review): `zero` is never used.
        zero = tf.constant(False, dtype=tf.bool)
        # Keep only eigenvalues greater than epsilon.
        idx_pos_entries_1 = tf.where(tf.equal(tf.greater(w_1, epsilon), True))
        idx_pos_entries_1 = tf.reshape(idx_pos_entries_1, [-1, tf.shape(idx_pos_entries_1)[0]])[0]
        w_1 = tf.gather(w_1, idx_pos_entries_1)
        # NOTE(review): tf.gather is called without an axis argument here, so it
        # presumably selects along the first axis of v_1 - confirm that the
        # eigenvectors are laid out along that axis.
        v_1 = tf.gather(v_1, idx_pos_entries_1)
        idx_pos_entries_2 = tf.where(tf.equal(tf.greater(w_2, epsilon), True))
        idx_pos_entries_2 = tf.reshape(idx_pos_entries_2, [-1, tf.shape(idx_pos_entries_2)[0]])[0]
        w_2 = tf.gather(w_2, idx_pos_entries_2)
        v_2 = tf.gather(v_2, idx_pos_entries_2)
        # NOTE(review): the names say "rootinvert" but the diagonal holds
        # sqrt(w), not 1 / sqrt(w) - confirm which is intended.
        sigma_hat_rootinvert_11 = tf.matmul(tf.matmul(v_1, tf.diag(tf.sqrt(w_1))), tf.transpose(v_1))
        sigma_hat_rootinvert_22 = tf.matmul(tf.matmul(v_2, tf.diag(tf.sqrt(w_2))), tf.transpose(v_2))
        t_matrix = tf.matmul(tf.matmul(sigma_hat_rootinvert_11, sigma_hat_12), sigma_hat_rootinvert_22)
        # `correlation` is assigned only in this branch; no other case is
        # handled in the visible code.
        if k_singular_values == representation_size: # use all
            correlation = tf.sqrt(tf.trace(tf.matmul(K.transpose(t_matrix), t_matrix)))
        return correlation
    return keras_loss
Here's the tf code provided by Wang on his website for computing the loss function:
def CCA_loss(H1, H2, N, d1, d2, dim, rcov1, rcov2):
    """Return the sum of the square roots of the `dim` largest eigenvalues of T T^T.

    T is K11 @ S12 @ K22, where S11, S22 and S12 are covariance estimates of
    the mean-centered H1 and H2 and K11, K22 are built from the
    eigendecompositions of S11 and S22 using 1 / sqrt(eigenvalue).

    Args:
        H1, H2: the two views; presumably shaped (N, d1) and (N, d2), since
            tf.eye(d1) / tf.eye(d2) are added to H1^T H1 / H2^T H2 - TODO confirm.
        N: sample count used in the 1 / (N - 1) covariance factor.
        d1, d2: sizes of the identity matrices added to S11 and S22.
        dim: how many of the largest eigenvalues contribute to the result.
        rcov1, rcov2: multiples of the identity added to S11 and S22.

    NOTE(review): `eps_eig` is not defined in this block and must come from the
    enclosing scope.
    """
    # Remove mean.
    m1 = tf.reduce_mean(H1, axis=0, keep_dims=True)
    H1 = tf.subtract(H1, m1)
    m2 = tf.reduce_mean(H2, axis=0, keep_dims=True)
    H2 = tf.subtract(H2, m2)
    # Regularized auto-covariances and the cross-covariance.
    S11 = tf.matmul(tf.transpose(H1), H1) / (N-1) + rcov1 * tf.eye(d1)
    S22 = tf.matmul(tf.transpose(H2), H2) / (N-1) + rcov2 * tf.eye(d2)
    S12 = tf.matmul(tf.transpose(H1), H2) / (N-1)
    E1, V1 = tf.self_adjoint_eig(S11)
    E2, V2 = tf.self_adjoint_eig(S22)
    # For numerical stability.
    # Only eigenvalues above eps_eig are kept; the matching eigenvectors are
    # selected along axis 1 of V1 / V2.
    idx1 = tf.where(E1>eps_eig)[:,0]
    E1 = tf.gather(E1, idx1)
    V1 = tf.gather(V1, idx1, axis=1)
    idx2 = tf.where(E2>eps_eig)[:,0]
    E2 = tf.gather(E2, idx2)
    V2 = tf.gather(V2, idx2, axis=1)
    # V diag(1 / sqrt(E)) V^T for each view.
    K11 = tf.matmul( tf.matmul(V1, tf.diag(tf.reciprocal(tf.sqrt(E1)))), tf.transpose(V1))
    K22 = tf.matmul( tf.matmul(V2, tf.diag(tf.reciprocal(tf.sqrt(E2)))), tf.transpose(V2))
    T = tf.matmul( tf.matmul(K11, S12), K22)
    # Eigenvalues are sorted in increasing order.
    # E2 is reused here for the eigenvalues of T T^T; the last `dim` entries
    # are therefore the largest ones.
    E2, U = tf.self_adjoint_eig(tf.matmul(T, tf.transpose(T)))
    return tf.reduce_sum(tf.sqrt(E2[-dim:]))
Following Andrew Trask's example, I want to implement a 3-layer neural network - 1 input, 1 hidden, 1 output - with simple dropout, for binary classification.
If I include bias terms b1 and b2, then I would need to slightly modify Andrew's code as below.
# Four training rows with three input columns; y is the XOR of the first two
# columns, stored as a (4, 1) column.
X = np.array([ [0,0,1],[0,1,1],[1,0,1],[1,1,1] ])
y = np.array([[0,1,1,0]]).T
alpha,hidden_dim,dropout_percent = (0.5,4,0.2)
# Weights drawn uniformly from [-1, 1).
synapse_0 = 2*np.random.random((X.shape[1],hidden_dim)) - 1
synapse_1 = 2*np.random.random((hidden_dim,1)) - 1
# One bias per hidden unit and one for the output unit.
b1 = np.zeros(hidden_dim)
b2 = np.zeros(1)
for j in range(60000):
    # sigmoid activation function
    layer_1 = (1/(1+np.exp(-(np.dot(X,synapse_0) + b1))))
    # dropout
    # Each activation is kept with probability 1 - dropout_percent and the
    # survivors are rescaled by 1 / (1 - dropout_percent).
    layer_1 *= np.random.binomial([np.ones((len(X),hidden_dim))],1-dropout_percent)[0] * (1.0/(1-dropout_percent))
    layer_2 = 1/(1+np.exp(-(np.dot(layer_1,synapse_1) + b2)))
    # sigmoid derivative = s(x)(1-s(x))
    layer_2_delta = (layer_2 - y)*(layer_2*(1-layer_2))
    layer_1_delta = layer_2_delta.dot(synapse_1.T) * (layer_1 * (1-layer_1))
    synapse_1 -= (alpha * layer_1.T.dot(layer_2_delta))
    synapse_0 -= (alpha * X.T.dot(layer_1_delta))
    # NOTE(review): layer_1_delta is (4, hidden_dim) and layer_2_delta is (4, 1)
    # - one row per example - while b1 is (hidden_dim,) and b2 is (1,), so these
    # in-place updates cannot be stored back into the bias arrays.
    b1 -= alpha*layer_1_delta
    b2 -= alpha*layer_2_delta
The problem is, of course, that with the code above the dimensions of b1 don't match the dimensions of layer_1_delta, and similarly for b2 and layer_2_delta.
I don't understand how the delta is calculated to update b1 and b2 - according to Michael Nielsen's example, b1 and b2 should be updated by a delta which in my code I believe to be layer_1_delta and layer_2_delta respectively.
What am I doing wrong here? Have I messed up the dimensionality of the deltas or of the biases? I feel it is the latter, because if I remove the biases from this code it works fine. Thanks in advance
So first I would change X in bX to 0 and 1 to correspond to synapse_X, because this is where they belong and it makes it:
# Sum each unit's delta over the examples (axis 0) so every bias receives its
# own gradient; summing over all axes would give every hidden bias the same
# scalar update.
b1 -= alpha * 1.0 / m * np.sum(layer_2_delta, axis=0)
b0 -= alpha * 1.0 / m * np.sum(layer_1_delta, axis=0)
Where m is the number of examples in the training set. Also, the dropout rate is far too high and actually hurts convergence. So, all things considered, the whole code is:
import numpy as np
# Four training rows with three input columns; y is the XOR of the first two
# columns, stored as a (4, 1) column.
X = np.array([[0, 0, 1], [0, 1, 1], [1, 0, 1], [1, 1, 1]])
m = X.shape[0]
y = np.array([[0, 1, 1, 0]]).T
alpha, hidden_dim, dropout_percent = (0.5, 4, 0.02)


def train_xor_dropout(iterations=10000, seed=None):
    """Train the one-hidden-layer sigmoid network with dropout on (X, y).

    Args:
        iterations: number of full-batch gradient steps.
        seed: optional seed for the weight initialisation and dropout masks.

    Returns:
        A dict with the last forward-pass output (``layer_2``, dropout still
        applied, as in the original script) and the trained parameters
        ``synapse_0``, ``synapse_1``, ``b0`` and ``b1``.
    """
    rng = np.random.RandomState(seed)
    # Weights drawn uniformly from [-1, 1); biases start at zero.
    synapse_0 = 2 * rng.random_sample((X.shape[1], hidden_dim)) - 1
    synapse_1 = 2 * rng.random_sample((hidden_dim, 1)) - 1
    b0 = np.zeros(hidden_dim)
    b1 = np.zeros(1)
    keep = 1 - dropout_percent
    layer_2 = None
    for _ in range(iterations):
        # sigmoid activation function
        layer_1 = 1 / (1 + np.exp(-(np.dot(X, synapse_0) + b0)))
        # dropout: keep each activation with probability `keep`, rescale the rest
        layer_1 *= rng.binomial(1, keep, size=layer_1.shape) * (1.0 / keep)
        layer_2 = 1 / (1 + np.exp(-(np.dot(layer_1, synapse_1) + b1)))
        # sigmoid derivative = s(x)(1-s(x))
        layer_2_delta = (layer_2 - y) * (layer_2 * (1 - layer_2))
        layer_1_delta = layer_2_delta.dot(synapse_1.T) * (layer_1 * (1 - layer_1))
        synapse_1 -= alpha * layer_1.T.dot(layer_2_delta)
        synapse_0 -= alpha * X.T.dot(layer_1_delta)
        # Sum over the examples only (axis 0): each bias gets its own gradient.
        # Summing over every axis gave all hidden biases one shared update.
        b1 -= alpha * 1.0 / m * np.sum(layer_2_delta, axis=0)
        b0 -= alpha * 1.0 / m * np.sum(layer_1_delta, axis=0)
    return {
        "layer_2": layer_2,
        "synapse_0": synapse_0,
        "synapse_1": synapse_1,
        "b0": b0,
        "b1": b1,
    }


if __name__ == "__main__":
    print(train_xor_dropout()["layer_2"])
I've implemented a neural network to predict the xor gate. It has 1 input layer with 2 nodes, 1 hidden layer with 2 nodes and 1 output layer with 1 node. No matter what I try to do my cost keeps on increasing. I've tried setting my learning rate to small values but that just makes the cost increase slowly. Please, any tips appreciated.
import numpy as np
# XOR truth table. After the transpose each column of train_data is one
# example and labels holds the matching target in the same column.
train_data = np.array([[0,0],[0,1],[1,0],[1,1]]).T
labels = np.array([[0,1,1,0]])
def sigmoid(z,deriv = False):
    """Return sigmoid(z), or its derivative sig * (1 - sig) when deriv is True."""
    sig = 1/(1+np.exp(-z))
    if deriv == True:
        return np.multiply(sig,1-sig)
    return sig
# 2-2-1 network: weights are standard-normal draws scaled by 0.01, biases zero.
w1 = np.random.randn(2,2)*0.01
b1 = np.zeros((2,1))
w2 = np.random.randn(1,2)*0.01
b2 = np.zeros((1,1))
# NOTE(review): `iterations` is never read; the loop below uses a literal 1000.
iterations = 1000
lr = 0.1
for i in range(1000):
    z1 = np.dot(w1,train_data) + b1
    a1 = sigmoid(z1)
    z2 = np.dot(w2,a1) + b2
    al = sigmoid(z2) #forward_prop
    # Cross-entropy summed over the four examples, then scaled by -1/4.
    # NOTE(review): -1/4 is -0.25 only with true division (Python 3); under
    # Python 2 integer division it would be -1.
    cost = np.dot(labels,np.log(al).T) + np.dot(1-labels,np.log(1-al).T)
    cost = cost*(-1/4)
    cost = np.squeeze(cost)#calcost
    # NOTE(review): the derivative of the cost above with respect to al is
    # (-1/4) * (labels/al - (1-labels)/(1-al)); here the two terms are added,
    # so the second term has the wrong sign.
    dal = (-1/4) * (np.divide(labels,al) + np.divide(1-labels,1-al))
    dz2 = np.multiply(dal,sigmoid(z2,deriv = True))
    dw2 = np.dot(dz2,a1.T)
    db2 = np.sum(dz2,axis=1,keepdims = True)
    da1 = np.dot(w2.T,dz2)
    dz1 = np.multiply(da1,sigmoid(z1,deriv = True))
    dw1 = np.dot(dz1,train_data.T)
    db1 = np.sum(dz1,axis=1,keepdims = True) #backprop
    w1 = w1 - lr*dw1
    w2 = w2 - lr*dw2
    b1 = b1 - lr*db1
    b2 = b2 - lr*db2 #update params
    # NOTE(review): indentation was lost in this listing; the print is assumed
    # to sit inside the loop - confirm.
    print(cost,'------',str(i))
The main mistake is in cross-entropy backprop (recommend these notes for checking). The correct formula is the following:
dal = -labels / al + (1 - labels) / (1 - al)
I have also simplified the code a little bit. Here's a complete working version:
import numpy as np
# After the transpose each column of train_data is one example.
train_data = np.array([[0, 0], [0, 1], [1, 0], [1, 1]]).T
# NOTE(review): these targets are the OR truth table; XOR targets would be
# [0, 1, 1, 0] - confirm which is intended.
labels = np.array([0, 1, 1, 1])


def sigmoid(z):
    """Return the logistic function 1 / (1 + exp(-z)) of z."""
    return 1 / (1 + np.exp(-z))


# 2-2-1 network: weights are standard-normal draws scaled by 0.001, biases zero.
w1 = np.random.randn(2, 2) * 0.001
b1 = np.zeros((2, 1))
w2 = np.random.randn(1, 2) * 0.001
b2 = np.zeros((1, 1))

lr = 0.1
for i in range(1000):
    # Forward pass.
    z1 = np.dot(w1, train_data) + b1
    a1 = sigmoid(z1)
    z2 = np.dot(w2, a1) + b2
    a2 = sigmoid(z2)

    # Mean binary cross-entropy over the examples.
    cost = -np.mean(labels * np.log(a2) + (1 - labels) * np.log(1 - a2))

    # Backward pass. The two forms of da2 are algebraically identical.
    da2 = (a2 - labels) / (a2 * (1 - a2))  # version #1
    # da2 = -labels / a2 + (1 - labels) / (1 - a2)  # version #2
    dz2 = np.multiply(da2, a2 * (1 - a2))
    dw2 = np.dot(dz2, a1.T)
    db2 = np.sum(dz2, axis=1, keepdims=True)

    da1 = np.dot(w2.T, dz2)
    dz1 = np.multiply(da1, a1 * (1 - a1))
    dw1 = np.dot(dz1, train_data.T)
    db1 = np.sum(dz1, axis=1, keepdims=True)

    # Gradient-descent step.
    w1 = w1 - lr * dw1
    w2 = w2 - lr * dw2
    b1 = b1 - lr * db1
    b2 = b2 - lr * db2

    # print() call instead of the Python 2 print statement, matching the
    # question's code.
    print(i, cost)
I am slightly new to Python and I am trying to convert some code. This is an approximation method, which isn't important here. In my oddev function I get the following error:
c2[1:modes+1] = v* 1j
ValueError: could not broadcast input array from shape (25) into shape (25,1)
When I do this Matlab I believe it automatically casts it, and will store the complex array. The function is a getting the coefficient from a partial sine transform to do this. At first I tried storing the random matrix which just an array using np.matlib method and this had the same shape but I believe I will lose the real values of the filter when I cast it. How do I store this?
import math
import numpy as np
def quickcontmin(datain):
    """Compute a continuation of `datain` and its evaluation on a refined grid.

    Returns the tuple (dataRefined, dataCont). The inline comments refer to a
    paper and its pseudo-code; which method this implements cannot be told
    from this block alone.

    NOTE(review): this is a work-in-progress port (the question above reports
    a ValueError raised in `oddev`); several lines below are flagged because
    they do not look like valid NumPy usage. Indentation was lost in this
    listing and has been reconstructed from the syntax.
    """
    n = np.shape(datain)[0]
    m = math.floor(n / 2)
    modes = math.floor(m / 2)
    addl = 20
    # The refined grid uses 20 times as many points as the input.
    nn = 20 * n
    # Threshold applied to the singular values further down.
    chi = 10 ** -13
    def evenhp(xv):
        "Even high pass"
        n1 = np.shape(xv)[0]
        # Append the reversed input (minus its last sample), transform, zero
        # the low-index coefficients at both ends, transform back.
        vx = np.array(xv[:-1])
        vx = vx[::-1]
        c1 = np.append(xv,vx)
        c1 = np.fft.fft(c1)
        c1[0:modes-1] = 0.0
        c1[-1 - modes + 2:-1] = 0.0
        evenl = np.real(np.fft.ifft(c1))
        even = evenl[0:n1-1]
        return even
    def evenhpt(xv):
        " Transpose of EvenHP"
        n1 = np.shape(xv)[0]
        xy = np.zeros((n1- 2, 1))
        c1 = np.append(xv,xy)
        c1 = np.fft.fft(c1)
        c1[0:modes-1] = 0.0
        c1[-1 - modes + 1:-1] = 0.0
        evenl = np.real(np.fft.ifft(c1))
        even = evenl[0:n1-1]
        # NOTE(review): [-1:-1:n1+1] is a start:stop:step slice in Python, not
        # Matlab's start:step:stop - confirm the intended range.
        even[1:-2] = even[1:-2] + evenl[-1:-1:n1+1]
        # NOTE(review): the trailing backticks look like markup residue and are
        # a syntax error as written.
        return even``
    def evenlp(xv):
        " Low pass cosine filter"
        n1 = np.shape(xv)[0]
        vx = np.array(xv[:-1])
        vx = vx[::-1]
        c1 = np.append(xv,vx)
        c1 = np.fft.fft(c1)
        # Zero the middle of the spectrum, keeping the coefficients at both ends.
        c1[modes + 1:-1 - modes + 1] = 0.0
        evenl = np.real(np.fft.ifft(c1))
        even = evenl[0:n1-1]
        return even
    def oddev(xv):
        "Evaluate the sine modes on the grid"
        # c2 is a complex column of shape (2n - 2, 1).
        c2 = np.zeros((2 *n - 2, 1))*1j
        v = np.array(xv[:])
        # NOTE(review): the first assignment to v1 is immediately overwritten.
        v1 = v[:-1]
        v1 = v[::-1]
        # This is the line named in the reported ValueError: the slice of c2 is
        # a 2-D column while v is built from xv without a second axis.
        c2[1:modes+1] = v* 1j
        c2[-1 - modes + 1:-1] = -v1* 1j
        evall = np.fft.ifft(c2) * math.sqrt(2 * n - 2)
        eva = evall[0:n-1]
        return eva
    def oddevt(xv):
        " Transpose the sine modes on the function OddEv"
        c1 = np.array(xv[1:-2])
        # NOTE(review): np.insert takes (arr, index, values); here 0.0 is the
        # index and 0 the value - confirm the order.
        c1 = np.insert(c1,0.0,0)
        c1 = np.append(c1,0.0)
        c1 = np.append(c1,xv[-2:-1:2])
        c1a = np.divide(np.fft.fft(c1),math.sqrt(2 * n - 2))
        fcoef = np.imag(c1a[1:modes])
        return fcoef
    def eextnd(xv):
        "Obtain cosine coefficients and evalue on the refined grid"
        vx = np.array(xv[:-1])
        vx = vx[::-1]
        c1 = np.append(xv,vx)
        c1 = np.fft.fft(c1)
        # NOTE(review): cL is created by np.zeros with no dtype while c1 holds
        # the FFT output - confirm the imaginary parts survive the assignment.
        cL = np.zeros((2*nn-2,1))
        cL[0:modes-1] = c1[0:modes-1]
        cL[-1 - modes + 1:-1] = c1[-1 - modes + 1:-1]
        evenexL = np.multiply(np.fft.ifft(cL) , (nn - 1) / (n - 1))
        evenex = evenexL[0:nn-1]
        return evenex
    def oextnd(xv):
        "Evaluate sine coefficients on the refined grid"
        # NOTE(review): c2 is created by np.zeros with no dtype but is assigned
        # values multiplied by 1j below - confirm.
        c2 = np.zeros((2 * nn - 2, 1))
        c2[0] = 0.0
        c2[1:modes + 1] = np.multiply(xv[0:-1],1j)
        c2[-1 - modes + 1:-1] = np.multiply(-xv[-1:-1:1],1j)
        evall = np.real(np.multiply(np.fft.ifft(c2), math.sqrt(2 * n - 2) * (2 *nn - 2) / (2 * n - 2)))
        oox = evall[0:nn-1]
        return oox
    # Low-pass component of the input.
    dc = evenlp(datain)
    #L in paper, number of vectors used to sample the columnspace
    lll = round(4 * math.log(m )/ math.log(2)) + addl
    lll = int(lll)
    #The following should be straightforward from the pseudo-code
    # Random coefficients drawn uniformly from [-1, 1), one column per sample vector.
    w=2 * np.random.rand(modes , lll) - 1
    # NOTE(review): np.matlib is only available after `import numpy.matlib`,
    # which is not among the imports shown.
    p=np.matlib.zeros(shape=(n,lll))
    for j in range(lll):
        p[:,j] = evenhp(oddev(w[:,j]))
    q,r = np.linalg.qr(p , mode='reduced')
    z = np.zeros(shape=(modes,lll))
    for j in range(lll):
        z[:,j]= oddevt(evenhpt(q[:,j]))
    # NOTE(review): full_matrices is given the string 'False', which is truthy.
    un,s,v = np.linalg.svd(z,full_matrices='False')
    ds=np.diag(s)
    # NOTE(review): np.extract expects (condition, arr); only one argument is
    # passed here, and the next two lines treat `aa` as both array and scalar.
    aa=np.extract(np.diag(s)>(chi))
    aa[-1] = aa
    aa = int(aa)
    s = 0 * s
    for j in range(aa):
        # NOTE(review): ds(j) calls the array (Matlab-style indexing); NumPy
        # indexing uses square brackets.
        s[j,j] = 1.0 / ds(j)
    #find the sine coefficents
    # NOTE(review): `*` is element-wise for ndarrays; confirm whether matrix
    # products are intended here.
    b=un*s* v.T* q.T* evenhp(datain)
    #Constructing the continuation
    exs=oddev(b)
    pexs = evenlp(exs)
    dataCont=exs-pexs+dc
    dataCont[n+1:2*n-2]=-exs[-2:-1:1]-pexs[-2:-1:1]+dc[-2:-1:1]
    #Evaluate the continuation on the refined grid
    dataRefined=eextnd(dc-exs)+oextnd(b)
    return dataRefined, dataCont
# Driver: sample one period of sin(t) at n1 points and run quickcontmin on it.
n1 = 100
t = np.linspace(0,2*math.pi,n1)
y = np.sin(t)
data = quickcontmin(y)
# quickcontmin returns (dataRefined, dataCont); index 1 is dataCont.
dc1 = data[1]
# NOTE(review): the trailing backtick looks like markup residue and is a syntax
# error as written.
dc1 = dc1[0:n1-1]`
Replacing c2[1:modes+1] = v* 1j by c2[1:modes+1, 0] = v* 1j should fix that specific error.
More consistent would be to replace:
v = np.array(xv[:])
v1 = v[:-1]
v1 = v[::-1]
by
v = xv
v1 = v[:-1]
v is already a column vector so you don't need to transform it into a 1d vector when you later need a column vector.