I'm coding a neural network purely from scratch (absolutely no NumPy matrix math for the calculations). I already completed this easily using NumPy and matrix multiplication, but for some reason the purely-from-scratch version doesn't work. I'm getting poor results like these:
Predictions:
[array([[0.6043366]]), array([[0.69588648]]), array([[0.4990016]]), array([[0.00369652]])]
X = [[0,0],[1,0],[0,1],[1,1]]
Y = [0,1,1,0]
epochs = 100000
W1 = np.random.randn(1,1)
W2 = np.random.randn(1,1)
W3 = np.random.randn(1,1)
W4 = np.random.randn(1,1)
W5 = np.random.randn(1,1)
W6 = np.random.randn(1,1)
h1_bias = 0
h2_bias = 0
output_bias = 0
def sigmoid(inputs):
    return 1.0 / (1 + np.exp(-inputs))
lr = 0.01
for i in range(epochs):
    preds = []
    for idx, x in enumerate(X):
        label = Y[idx]
        i1 = x[0]
        i2 = x[1]

        h1 = W1*i1 + W2*i2 + h1_bias
        h2 = W3*i1 + W4*i2 + h2_bias
        h1 = sigmoid(h1)
        h2 = sigmoid(h2)
        output = W5*h1 + W6*h2 + output_bias
        output = sigmoid(output)
        preds.append(output)

        loss_gradient = -1*((label / output) - (1-label) / (1-output))
        output_activation_grad = loss_gradient * output * (1 - output)

        update_output_bias = output_activation_grad
        update_W5 = output_activation_grad * h1
        update_W6 = output_activation_grad * h2

        h1_grad = output_activation_grad * W5
        h2_grad = output_activation_grad * W6
        h1_activation_grad = h1_grad * h1 * (1 - h1)
        h2_activation_grad = h2_grad * h2 * (1 - h2)

        update_h1_bias = h1_activation_grad
        update_h2_bias = h2_activation_grad
        update_W1 = h1_activation_grad * i1
        update_W2 = h1_activation_grad * i2
        update_W3 = h2_activation_grad * i1
        update_W4 = h2_activation_grad * i2

        W1 -= lr*update_W1
        W2 -= lr*update_W2
        W3 -= lr*update_W3
        W4 -= lr*update_W4
        W5 -= lr*update_W5
        W6 -= lr*update_W6
        output_bias -= lr*update_output_bias
        h1_bias -= lr*update_h1_bias
        h2_bias -= lr*update_h2_bias

print("Predictions: {}".format(preds))
Is my from-scratch derivation correct? Why does this version not converge?
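One way to verify the derivation numerically is a finite-difference gradient check: nudge a single weight by a small epsilon and compare the resulting slope of the loss with the analytic gradient. Below is a minimal, self-contained sketch of that check for W5; the helper functions and values are illustrative, not part of the training code above.

import math, random

def forward(w, x):
    # w = [W1, W2, W3, W4, W5, W6, h1_bias, h2_bias, output_bias]
    s = lambda z: 1.0 / (1.0 + math.exp(-z))
    h1 = s(w[0]*x[0] + w[1]*x[1] + w[6])
    h2 = s(w[2]*x[0] + w[3]*x[1] + w[7])
    return h1, h2, s(w[4]*h1 + w[5]*h2 + w[8])

def bce(pred, label):
    # binary cross-entropy, the same loss as in the training loop
    return -(label*math.log(pred) + (1 - label)*math.log(1 - pred))

w = [random.uniform(-1, 1) for _ in range(6)] + [0.0, 0.0, 0.0]
x, label = [1, 0], 1

# analytic gradient for W5: dL/dW5 = (output - label) * h1 (same chain rule as above)
h1, h2, out = forward(w, x)
analytic = (out - label) * h1

# numerical gradient for W5 via a central difference
eps = 1e-6
w_plus = list(w);  w_plus[4] += eps
w_minus = list(w); w_minus[4] -= eps
numeric = (bce(forward(w_plus, x)[2], label) - bce(forward(w_minus, x)[2], label)) / (2*eps)

print(analytic, numeric)  # the two values should agree to several decimal places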
I need to find the optimal parameters of the model using Cauchy simulated annealing.
The dataset looks like this:

   Index    Height    Weight
0      1  65.78331  112.9925
1      2  71.51521  136.4873
2      3  69.39874  153.0269
3      4  68.21660  142.3354
4      5  67.78781  144.2971

The model is a simple linear fit, height = w0 + w1 * weight, with the mean squared error as the cost:
import math
import random
import numpy as np
from sklearn.metrics import mean_squared_error

def mse(w):
    height = dataset['Height']
    weight = dataset['Weight']
    predicts = list(map(lambda x: w[0] + w[1] * x, weight))
    return mean_squared_error(height, predicts)
Initial values:
Tmax = 13
Tmin = -9.5
T = Tmax
wop0 = 1
wop1 = 100
Eglob = random.random()
w0_ = random.uniform(0, 100)
w1_ = random.uniform(0, 5)
w0 = w0_
w1 = w1_
E = mse([w0, w1])
Main function:
iteration = 0
while T > Tmin:
    i = 1
    for i in range(1, 6000):
        w0 = w0 + T * np.random.standard_cauchy()
        w1 = w1 + T * np.random.standard_cauchy()
        E = mse([w0, w1])
        dE = E - mse([wop0, wop1])
        if dE < 0:
            wop0 = w0
            wop1 = w1
            Eglob = E
        else:
            a = random.random()
            # h = 1 / (1 + np.exp(dE / T))
            h = np.exp(-dE / T)
            if a < h:
                wop0 = w0
                wop1 = w1
                Eglob = E
        T = Tmax / math.log(1 + i)
        iteration += 1
    break
Result:
iteration: 5999
wop0: -706.4779870159789
wop1: 3.716864959467018
mse: 93084.16405699923
But if I use the basinhopping function:
from scipy.optimize import basinhopping
ret = basinhopping(mse, [0, 1], niter = 50)
print("global min: w = [%.4f, %.4f], mse(w0, w1) = %.4f" % (ret.x[0], ret.x[1], ret.fun))
Result:
global min: w = [57.5718, 0.0820], mse(w0, w1) = 2.7018
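As a sanity check on where the true optimum lies, the ordinary least-squares line for the same data can be computed in closed form and compared with both results (a small sketch, assuming dataset is the same DataFrame used inside mse):

# Closed-form least-squares fit of height = w0 + w1 * weight,
# to compare against the annealing / basinhopping results.
w1_ols, w0_ols = np.polyfit(dataset['Weight'], dataset['Height'], 1)
print("OLS: w = [%.4f, %.4f], mse(w0, w1) = %.4f" % (w0_ols, w1_ols, mse([w0_ols, w1_ols])))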
What am I doing wrong?
Thanks
I'm doing the deeplearning.ai course by Andrew Ng, and I wanted to try writing my own neural network to classify handwritten digits.
There's something wrong with my algorithm and I can't find the problem.
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline
def sigmoid(Z):
    return 1 / (1 + np.exp(-Z))

def sigmoid_derivative(Z):
    return sigmoid(Z) * (1 - sigmoid(Z))
train_data = pd.read_csv('train.csv')
X_train, y_train = train_data.drop(columns = ['label']).to_numpy(), train_data['label'].to_numpy().reshape(train_data.shape[0], 1)
X_train = X_train.T
y_train = y_train.T
def initialize_parameters(X, hidden_nodes):
    np.random.seed(3)
    parameters = {}
    output_layer = 10
    parameters['W1'] = np.random.randn(hidden_nodes, X.shape[0]) * np.sqrt(2/X.shape[0])
    parameters['b1'] = np.zeros((hidden_nodes, 1))
    parameters['W2'] = np.random.randn(output_layer, hidden_nodes) * np.sqrt(1/hidden_nodes)
    parameters['b2'] = np.zeros((output_layer, 1))
    return parameters
def forward_propagation(X, parameters):
    W1, b1, W2, b2 = parameters['W1'], parameters['b1'], parameters['W2'], parameters['b2']
    Z1 = np.dot(W1, X) + b1
    A1 = sigmoid(Z1)
    Z2 = np.dot(W2, A1) + b2
    A2 = sigmoid(Z2)
    forward_prop_cache = {
        'Z1': Z1,
        'A1': A1,
        'Z2': Z2,
        'A2': A2
    }
    return forward_prop_cache
def backpropagation(forward_prop_cache, parameters, X, Y):
    A1, A2, Z1, Z2 = forward_prop_cache['A1'], forward_prop_cache['A2'], forward_prop_cache['Z1'], forward_prop_cache['Z2']
    W2 = parameters['W2']
    m = X_train.shape[1]
    da2 = (A2 - Y) / (A2 * (1 - A2))
    dz2 = np.multiply(da2, sigmoid_derivative(Z2))
    dw2 = (1/m) * np.dot(dz2, A1.T)
    db2 = (1/m) * np.sum(dz2, keepdims=True, axis=1)
    da1 = np.dot(W2.T, dz2)
    dz1 = da1 * sigmoid_derivative(Z1)
    dw1 = (1/m) * np.dot(dz1, X.T)
    db1 = (1/m) * np.sum(dz1, keepdims=True, axis=1)
    derivatives = {
        'dw2': dw2,
        'db2': db2,
        'dw1': dw1,
        'db1': db1
    }
    return derivatives
def update_params(learning_rate, derivatives, parameters):
    dw2, db2, dw1, db1 = derivatives.values()
    W1 = parameters['W1']
    b1 = parameters['b1']
    W2 = parameters['W2']
    b2 = parameters['b2']
    W2 = W2 - learning_rate * dw2
    b2 = b2 - learning_rate * db2
    W1 = W1 - learning_rate * dw1
    b1 = b1 - learning_rate * db1
    return {'W1': W1,
            'b1': b1,
            'W2': W2,
            'b2': b2
            }
def cost_function(forward_prop_cache, Y):
    A2 = forward_prop_cache['A2']
    return -np.mean(Y * np.log(A2) + (1 - Y) * np.log(1 - A2))
def model(num_iterations, X, Y, learning_rate, hidden_nodes):
    cost = []
    parameters = initialize_parameters(X, hidden_nodes)
    for i in range(num_iterations):
        forward_prop_cache = forward_propagation(X, parameters)
        current_cost = cost_function(forward_prop_cache, Y)
        cost.append(current_cost)
        derivatives = backpropagation(forward_prop_cache, parameters, X, Y)
        parameters = update_params(learning_rate, derivatives, parameters)
    return cost, parameters, forward_prop_cache
num_iterations = 200
X = X_train[:,:1000]
Y = y_train[:,:1000]
learning_rate = 0.01
hidden_nodes = 200
cost, parameters, cache = model(num_iterations, X, Y, learning_rate, hidden_nodes)
Cost values and cost plot (screenshots omitted).
As you can see, there is something wrong with my algorithm. I would really appreciate some help. Thank you.
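To see whether the network is learning at all, the training accuracy can also be computed from the returned forward-pass cache; a small diagnostic sketch, where cache and Y are the objects defined above and the predicted digit is taken to be the index of the most active of the 10 output units:

# Predicted digit = row index of the most active output unit for each example.
predictions = np.argmax(cache['A2'], axis=0).reshape(1, -1)
accuracy = np.mean(predictions == Y)
print("training accuracy: %.3f" % accuracy)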
I have a relation that I wrote Python code to compute, but I am not sure whether the code is right. Could you please tell me whether it is correct, or advise me on how I can improve it? Thanks.
import matplotlib.pyplot as plt
import numpy as np
from scipy.special import comb
from scipy.constants import k
from numpy import arange
p12 = 1 # system initial state 12
w0 = 1 # system
wn = 0 # wb/w0 bath
U = 0.1
N = 50
n = (N/2)
a = 0.007
t = 1000  # Time
Z = 2**N * (np.cosh(U*wn/2))**N
q12 = []
f11 = []
def Jrange(start, stop, step):
    numelements = int((stop - start) / float(step))
    for i in range(numelements + 1):
        yield start + i * step

def trange(tstart, tstop, tstep):
    tnumelements = int((tstop - tstart) / float(tstep))
    for i in range(tnumelements + 1):
        yield tstart + i * tstep

for t in trange(tstart, tstop, tstep):
    roh2 = 0
    roh12 = 0
    roh1 = 0
    roh11 = 0
    for J in Jrange(0, stop, 1):
        Nj = comb(N, (n + J)) - comb(N, (n + J + 1))
        for m in arange(-J, J + 1):
            r1 = np.sqrt(J*(J + 1) - m*(m + 1))                       # r+
            r2 = np.sqrt(J*(J + 1) - m*(m - 1))                       # r-
            Omega1 = (w0 - wn) + (4*a*(m + (1/2))) / np.sqrt(N)       # Omega+
            gamma1 = np.sqrt((Omega1**2 / 4) + (4*a**2 * r1**2) / N)  # gamma+
            Omega2 = -(w0 - wn) - (4*a*(m - (1/2))) / np.sqrt(N)      # Omega-
            gamma2 = np.sqrt((Omega2**2 / 4) + (4*a**2 * r2**2) / N)  # gamma-
            A1 = np.cos(gamma1*t)
            B1 = np.sin(gamma1*t)
            A2 = np.cos(gamma2*t)
            B2 = np.sin(gamma2*t)
            C = np.exp(-m*U*wn)
            H12 = C * (A1 - 1j*B1*Omega1/(2*gamma1)) * (A2 + 1j*B2*Omega2/(2*gamma2))
            H2 = r2**2 * B2**2 * ((4*a**2) / (gamma2**2 * N))
            H1 = A1**2 + B1**2 * ((Omega1**2) / (4 * gamma1**2))
            H11 = C * ((p11*H1) + (p22*H2))
            roh11 = roh11 + H11
            roh12 = roh12 + H12
        roh2 = roh2 + roh12*Nj
        roh1 = roh1 + roh11*Nj
    Roh12 = roh2 * D * p12 * np.exp(1j*(w0 - wn)*t)
    Roh11 = roh1 * D
    q12.append(Roh12)
    f11.append(Roh11)
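matplotlib is imported above but never used; presumably the intent is to plot q12 and f11 against time. A small sketch under that assumption (tstart, tstop and tstep are the same, still-undefined, values used in the loop, and plotting |rho12| and Re(rho11) is only illustrative):

# Hypothetical plot of the accumulated results against the time grid.
times = list(trange(tstart, tstop, tstep))
plt.plot(times, [abs(v) for v in q12], label='|rho12|')
plt.plot(times, [np.real(v) for v in f11], label='Re(rho11)')
plt.xlabel('t')
plt.legend()
plt.show()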
I am trying to program a neural network and was trying to minimize the cost function using scipy.optimize.fmin_bfgs(). After attempting this I get the error "TypeError: cost() takes 3 positional arguments but 4 were given". Where are these four arguments coming from, and how can I rectify this?
The cost function is defined by:
def cost(param, X, y):
    Theta1 = np.reshape(param[0:106950:1], (75, 1426))
    Theta2 = np.reshape(param[106950:112650:1], (75, 76))
    Theta3 = np.reshape(param[112650::1], (1, 76))
    m = len(X)
    J = 0
    ## Forward propagation
    a1 = X
    z2 = np.dot(a1, np.transpose(Theta1))
    a2 = sigmoid(z2)
    a2 = np.concatenate((np.ones((len(a2), 1)), a2), axis=1)
    z3 = np.dot(a2, Theta2.T)
    a3 = sigmoid(z3)
    a3 = np.concatenate((np.ones((len(a3), 1)), a3), axis=1)
    z4 = np.dot(a3, Theta3.T)
    a4 = sigmoid(z4)
    h = a4
    ## Calculate cost
    J = np.sum(np.sum(np.multiply(-y, np.log(h)) - np.multiply((1-y), np.log(1-h)))) / (2*m)
    ## Regularization (bias columns excluded)
    theta1_reg = Theta1
    theta2_reg = Theta2
    theta3_reg = Theta3
    theta1_reg[:, 0] = 0
    theta2_reg[:, 0] = 0
    theta3_reg[:, 0] = 0
    Reg = (lamb/(2*m)) * (np.sum(np.sum(np.square(theta1_reg))) + np.sum(np.sum(np.square(theta2_reg))) + np.sum(np.sum(np.square(theta3_reg))))
    J = J + Reg
    return J
The gradient is then calculated with:
def grad(param, X, y):
    Theta1 = np.reshape(param[0:106950:1], (75, 1426))
    Theta2 = np.reshape(param[106950:112650:1], (75, 76))
    Theta3 = np.reshape(param[112650::1], (1, 76))
    Theta1_grad = np.zeros(Theta1.shape)
    Theta2_grad = np.zeros(Theta2.shape)
    Theta3_grad = np.zeros(Theta3.shape)
    m = len(X)
    ## Forward propagation
    a1 = X
    z2 = np.dot(a1, np.transpose(Theta1))
    a2 = sigmoid(z2)
    a2 = np.concatenate((np.ones((len(a2), 1)), a2), axis=1)
    z3 = np.dot(a2, Theta2.T)
    a3 = sigmoid(z3)
    a3 = np.concatenate((np.ones((len(a3), 1)), a3), axis=1)
    z4 = np.dot(a3, Theta3.T)
    a4 = sigmoid(z4)
    h = a4
    ## Backward propagation
    d4 = a4 - y
    d3 = np.multiply(np.dot(d4, Theta3[:, 1:]), sigmoidGradient(z3))
    d2 = np.multiply(np.dot(d3, Theta2[:, 1:]), sigmoidGradient(z2))  # i.e. sigmoid(z2) * (1 - sigmoid(z2))
    D1 = np.dot(d2.T, a1)
    D2 = np.dot(d3.T, a2)
    D3 = np.dot(d4.T, a3)
    ## Unregularized gradients
    Theta1_grad = (1/m) * D1
    Theta2_grad = (1/m) * D2
    Theta3_grad = (1/m) * D3
    ## Regularize gradients
    theta1_reg = Theta1
    theta2_reg = Theta2
    theta3_reg = Theta3
    theta1_reg[:, 0] = 0
    theta2_reg[:, 0] = 0
    theta3_reg[:, 0] = 0
    theta1_reg = (lamb/m) * theta1_reg
    theta2_reg = (lamb/m) * theta2_reg
    theta3_reg = (lamb/m) * theta3_reg
    Theta1_grad = Theta1_grad + theta1_reg
    Theta2_grad = Theta2_grad + theta2_reg
    Theta3_grad = Theta3_grad + theta3_reg
    ## Concatenate gradients
    grad = np.concatenate((Theta1_grad, Theta2_grad, Theta3_grad), axis=None)
    return grad
Other functions defined are
def sigmoid(z):
    sig = 1 / (1 + np.exp(-z))
    return sig

def randInitializeWeights(l_in, l_out):
    epsilon = 0.12
    W = np.random.rand(l_out, 1 + l_in) * 2 * epsilon - epsilon
    return W

def sigmoidGradient(z):
    g = np.multiply(sigmoid(z), (1 - sigmoid(z)))
    return g
As an example:
import numpy as np
import scipy.optimize
X = np.random.rand(479,1426)
y1 = np.zeros((frames,1))
y2 = np.ones((framesp,1))
y = np.concatenate((y1,y2),axis=0)
init_param = np.random.rand(112726,)
lamb = 0.5
scipy.optimize.fmin_bfgs(cost,fprime=grad,x0=init_param,args=(param,X,y))
Then the error appears.
Thanks for any help
The arguments passed into the cost functions are the parameters, followed by the extra arguments. The parameters are chosen by the minimization function, the extra arguments are passed through.
When calling fmin_bfgs, only pass the extra arguments as args, not the actual parameters to optimize:
scipy.optimize.fmin_bfgs(..., args=(X,y))
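For illustration, here is a minimal, self-contained example of how the optimizer threads the parameter vector and the extra args into the objective and gradient (a toy least-squares problem; all names are made up):

import numpy as np
from scipy.optimize import fmin_bfgs

# Toy objective: choose param to minimise ||X @ param - y||^2.
def toy_cost(param, X, y):
    return np.sum((X @ param - y) ** 2)

def toy_grad(param, X, y):
    return 2 * X.T @ (X @ param - y)

X = np.random.rand(20, 3)
y = np.random.rand(20)
p0 = np.zeros(3)

# fmin_bfgs calls toy_cost(p, X, y): p comes from the optimizer, (X, y) from args.
best = fmin_bfgs(toy_cost, x0=p0, fprime=toy_grad, args=(X, y))
print(best)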
How do I find the best population member (gStar) from the random population, and then print the weights of that best member? I want to optimize the best population using the Flower Pollination Algorithm (FPA). Sorry if this is basic, I'm a newbie.
#FP Parameters
num_train = len(X_trainNorm)
input_dim = windowSize
output_dim = 1
popSize = 5
#Model of JST
def buildModel(hidden_dim):
    np.random.seed(7)
    pop = []
    for i in range(popSize):
        W1 = np.random.randn(input_dim, hidden_dim) / np.sqrt(input_dim)
        b1 = np.zeros((1, hidden_dim))
        W2 = np.random.randn(hidden_dim, output_dim) / np.sqrt(hidden_dim)
        b2 = np.zeros((1, output_dim))
        pop.append((W1, W2, b1, b2))
    return pop

buildModel(10)
#Find the best population
def gStar(Weight1, Weight2, bias1, bias2):
    z1 = X_trainNorm.dot(Weight1) + bias1
    a1 = np.tanh(z1)
    z2 = a1.dot(Weight2) + bias2
    target = np.reshape(y_trainNorm, (-1, 1))
    error = abs(z2 - target)
    tot_error = sum(error)
    print(tot_error)
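One way to pick gStar from that population is to have the error function return its total error instead of only printing it, and then keep the member with the smallest value. A rough sketch under that assumption, where total_error is a hypothetical variant of gStar and pop, X_trainNorm and y_trainNorm come from the code above:

# Return (rather than print) the total absolute error of one candidate network.
def total_error(W1, W2, b1, b2):
    z1 = X_trainNorm.dot(W1) + b1
    a1 = np.tanh(z1)
    z2 = a1.dot(W2) + b2
    target = np.reshape(y_trainNorm, (-1, 1))
    return np.sum(np.abs(z2 - target))

# Evaluate every candidate and keep the one with the lowest error.
pop = buildModel(10)
errors = [total_error(W1, W2, b1, b2) for (W1, W2, b1, b2) in pop]
best_index = int(np.argmin(errors))
gStar_weights = pop[best_index]   # (W1, W2, b1, b2) of the best member
print("best member:", best_index, "error:", errors[best_index])
print(gStar_weights[0])           # W1 of the best member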