MNIST with NumPy only - cost does not approach 0 - Python

I'm doing the deeplearning.ai course by Andrew Ng, and I wanted to try writing my own neural network to classify handwritten digits.
There's something wrong with my algorithm and I can't find the problem.
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline

def sigmoid(Z):
    return 1 / (1 + np.exp(-Z))

def sigmoid_derivative(Z):
    return sigmoid(Z) * (1 - sigmoid(Z))

train_data = pd.read_csv('train.csv')
X_train, y_train = train_data.drop(columns=['label']).to_numpy(), train_data['label'].to_numpy().reshape(train_data.shape[0], 1)
X_train = X_train.T
y_train = y_train.T

def initialize_parameters(X, hidden_nodes):
    np.random.seed(3)
    parameters = {}
    output_layer = 10
    parameters['W1'] = np.random.randn(hidden_nodes, X.shape[0]) * np.sqrt(2 / X.shape[0])
    parameters['b1'] = np.zeros((hidden_nodes, 1))
    parameters['W2'] = np.random.randn(output_layer, hidden_nodes) * np.sqrt(1 / hidden_nodes)
    parameters['b2'] = np.zeros((output_layer, 1))
    return parameters

def forward_propagation(X, parameters):
    W1, b1, W2, b2 = parameters['W1'], parameters['b1'], parameters['W2'], parameters['b2']
    Z1 = np.dot(W1, X) + b1
    A1 = sigmoid(Z1)
    Z2 = np.dot(W2, A1) + b2
    A2 = sigmoid(Z2)
    forward_prop_cache = {
        'Z1': Z1,
        'A1': A1,
        'Z2': Z2,
        'A2': A2
    }
    return forward_prop_cache

def backpropagation(forward_prop_cache, parameters, X, Y):
    A1, A2, Z1, Z2 = forward_prop_cache['A1'], forward_prop_cache['A2'], forward_prop_cache['Z1'], forward_prop_cache['Z2']
    W2 = parameters['W2']
    m = X_train.shape[1]
    da2 = (A2 - Y) / (A2 * (1 - A2))
    dz2 = np.multiply(da2, sigmoid_derivative(Z2))
    dw2 = (1/m) * np.dot(dz2, A1.T)
    db2 = (1/m) * np.sum(dz2, keepdims=True, axis=1)
    da1 = np.dot(W2.T, dz2)
    dz1 = da1 * sigmoid_derivative(Z1)
    dw1 = (1/m) * np.dot(dz1, X.T)
    db1 = (1/m) * np.sum(dz1, keepdims=True, axis=1)
    derivatives = {
        'dw2': dw2,
        'db2': db2,
        'dw1': dw1,
        'db1': db1
    }
    return derivatives

def update_params(learning_rate, derivatives, parameters):
    dw2, db2, dw1, db1 = derivatives.values()
    W1 = parameters['W1']
    b1 = parameters['b1']
    W2 = parameters['W2']
    b2 = parameters['b2']
    W2 = W2 - learning_rate * dw2
    b2 = b2 - learning_rate * db2
    W1 = W1 - learning_rate * dw1
    b1 = b1 - learning_rate * db1
    return {'W1': W1,
            'b1': b1,
            'W2': W2,
            'b2': b2}

def cost_function(forward_prop_cache, Y):
    A2 = forward_prop_cache['A2']
    return -np.mean(Y * np.log(A2) + (1 - Y) * np.log(1 - A2))

def model(num_iterations, X, Y, learning_rate, hidden_nodes):
    cost = []
    parameters = initialize_parameters(X, hidden_nodes)
    for i in range(num_iterations):
        forward_prop_cache = forward_propagation(X, parameters)
        current_cost = cost_function(forward_prop_cache, Y)
        cost.append(current_cost)
        derivatives = backpropagation(forward_prop_cache, parameters, X, Y)
        parameters = update_params(learning_rate, derivatives, parameters)
    return cost, parameters, forward_prop_cache

num_iterations = 200
X = X_train[:, :1000]
Y = y_train[:, :1000]
learning_rate = 0.01
hidden_nodes = 200
cost, parameters, cache = model(num_iterations, X, Y, learning_rate, hidden_nodes)
Cost screenshot: [image omitted]
Cost plot: [image omitted]
As you can see, there is something wrong with my algorithm. I would really appreciate some help. Thank you.
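One thing worth checking (an observation from the shapes above, offered as a sketch rather than a confirmed fix): Y here is y_train[:, :1000], a (1, 1000) row of integer labels 0-9, while A2 has shape (10, 1000), so the cross-entropy in cost_function broadcasts raw labels against all ten output rows. A minimal sketch of one-hot encoding the labels so the shapes match:

import numpy as np

def one_hot(y, num_classes=10):
    # Hypothetical helper (not in the original post): turn a (1, m) row of
    # integer labels 0-9 into a (num_classes, m) matrix matching A2's shape.
    m = y.shape[1]
    Y = np.zeros((num_classes, m))
    Y[y.flatten(), np.arange(m)] = 1
    return Y

# Usage sketch:
# Y = one_hot(y_train[:, :1000])
# cost, parameters, cache = model(num_iterations, X, Y, learning_rate, hidden_nodes)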

Related

XOR Neural Network completely from scratch (No numpy)

I'm coding a neural network purely from scratch (absolutely no NumPy or anything else for the calculations). I completed this easily using NumPy and matrix multiplication, but for some reason the purely-from-scratch version doesn't work. I'm getting poor results like so:
Predictions:
[array([[0.6043366]]), array([[0.69588648]]), array([[0.4990016]]), array([[0.00369652]])]
import numpy as np  # required for np.random and np.exp below

X = [[0,0],[1,0],[0,1],[1,1]]
Y = [0,1,1,0]
epochs = 100000
W1 = np.random.randn(1,1)
W2 = np.random.randn(1,1)
W3 = np.random.randn(1,1)
W4 = np.random.randn(1,1)
W5 = np.random.randn(1,1)
W6 = np.random.randn(1,1)
h1_bias = 0
h2_bias = 0
output_bias = 0

def sigmoid(inputs):
    return 1.0 / (1 + np.exp(-inputs))

lr = 0.01
for i in range(epochs):
    preds = []
    for idx, x in enumerate(X):
        label = Y[idx]
        i1 = x[0]
        i2 = x[1]
        h1 = W1*i1 + W2*i2 + h1_bias
        h2 = W3*i1 + W4*i2 + h2_bias
        h1 = sigmoid(h1)
        h2 = sigmoid(h2)
        output = W5*h1 + W6*h2 + output_bias
        output = sigmoid(output)
        preds.append(output)
        loss_gradient = -1*((label / output) - (1-label) / (1-output))
        output_activation_grad = loss_gradient * output * (1 - output)
        update_output_bias = output_activation_grad
        update_W5 = output_activation_grad * h1
        update_W6 = output_activation_grad * h2
        h1_grad = output_activation_grad * W5
        h2_grad = output_activation_grad * W6
        h1_activation_grad = h1_grad * h1 * (1 - h1)
        h2_activation_grad = h2_grad * h2 * (1 - h2)
        update_h1_bias = h1_activation_grad
        update_h2_bias = h2_activation_grad
        update_W1 = h1_activation_grad * i1
        update_W2 = h1_activation_grad * i2
        update_W3 = h2_activation_grad * i1
        update_W4 = h2_activation_grad * i2
        W1 -= lr*update_W1
        W2 -= lr*update_W2
        W3 -= lr*update_W3
        W4 -= lr*update_W4
        W5 -= lr*update_W5
        W6 -= lr*update_W6
        output_bias -= lr*update_output_bias
        h1_bias -= lr*update_h1_bias
        h2_bias -= lr*update_h2_bias

print("Predictions: {}".format(preds))
Is my from-scratch derivation correct? Why is it not converging?
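One generic way to test a hand-derived gradient like this (a sketch under my own assumptions, not code from the post) is a finite-difference check: nudge one parameter, recompute the loss, and compare the slope with the analytic update for that parameter.

import numpy as np

def sigmoid(v):
    return 1.0 / (1 + np.exp(-v))

def loss(params, x, label):
    # Same 2-2-1 sigmoid architecture as the post, packed into one list.
    W1, W2, W3, W4, W5, W6, h1_bias, h2_bias, output_bias = params
    h1 = sigmoid(W1 * x[0] + W2 * x[1] + h1_bias)
    h2 = sigmoid(W3 * x[0] + W4 * x[1] + h2_bias)
    out = sigmoid(W5 * h1 + W6 * h2 + output_bias)
    return -(label * np.log(out) + (1 - label) * np.log(1 - out))

def numeric_grad(params, x, label, i, eps=1e-6):
    # Central difference with respect to parameter i.
    plus, minus = list(params), list(params)
    plus[i] += eps
    minus[i] -= eps
    return (loss(plus, x, label) - loss(minus, x, label)) / (2 * eps)

params = list(np.random.randn(9))
print(numeric_grad(params, [1, 0], 1, i=0))  # compare with the analytic update_W1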

Model Overfits after adding bias

This is an implementation of a 4-bit up counter. First, I implemented the model without the bias term, and it seemed to work correctly, but after adding the bias term the model overfits at a very early stage and the loss becomes zero. Even for unseen data, the model predicts the same output as for the training data. Below is the implementation. What is the problem?
import numpy as np
import matplotlib.pyplot as plt

# Batch training
# input & output
x = np.array([[0,0,1,0],[0,0,0,0],[0,0,0,1],[0,0,1,1],[0,1,0,0],[0,1,0,1],[0,1,1,0],[0,1,1,1],[1,0,0,0],[1,0,0,1]])  # 10*4
y = np.array([[0,0,1,1],[0,0,0,1],[0,0,1,0],[0,1,0,0],[0,1,0,1],[0,1,1,0],[0,1,1,1],[1,0,0,0],[1,0,0,1],[1,0,1,0]])  # 10*4

def sigmoid(x):
    return 1/(1+np.exp(-x))

def sigmoid_prime(x):
    return sigmoid(x)*(1-sigmoid(x))

Input_Size = 4
Output_Size = 4
Hidden_Layer_Neurons = 8
Learning_Rate = 0.01

weight1 = np.random.uniform(size=(Input_Size, Hidden_Layer_Neurons))  # 4*8
weight2 = np.random.uniform(size=(Hidden_Layer_Neurons, Output_Size))  # 8*4
loss = []
iteration = []
bias1 = np.random.uniform(size=(x.shape[0], Hidden_Layer_Neurons))
bias2 = np.random.uniform(size=(x.shape[0], Output_Size))

for i in range(30000):
    a1 = x  # 10*4
    z2 = np.dot(a1, weight1) + bias1  # 10*4 ** 4*8 = 10*8
    a2 = sigmoid(z2)  # 10*8
    z3 = np.dot(a2, weight2) + bias2  # 10*8 ** 8*4 = 10*4
    val = 0
    err1 = 0
    if i > 100:
        for j in range(10):
            for k in range(4):
                val += (y[j][k]-z3[j][k])*(y[j][k]-z3[j][k])
        val = val/(2*10)
        loss.append(val)
        iteration.append(i)
    del_out = (z3 - y)  # 10*4 - 10*4 = 10*4
    weight2 -= Learning_Rate*np.dot(a2.T, del_out)  # 8*10 ** 10*4 = 8*4
    bias2 -= Learning_Rate*del_out
    err = np.dot(del_out, weight2.T)*sigmoid_prime(z2)  # 10*4 ** 4*8 = 10*8 * 10*8 = 10*8
    weight1 -= Learning_Rate*np.dot(a1.T, err)  # 4*10 ** 10*8 = 4*8
    bias1 -= Learning_Rate*err

print(z3)
plt.plot(iteration, loss)
plt.show()

def model():
    q = np.array([[1,0,1,0],[1,0,1,1],[1,1,0,0],[1,1,0,1],[1,1,1,0],[1,0,0,0],[1,1,1,1],[0,0,1,1],[0,0,0,1],[0,0,1,0]])
    z = np.dot(q, weight1) + bias1
    act_hidden = sigmoid(z)
    output = np.dot(act_hidden, weight2) + bias2
    print(output)

model()
Why does adding the bias create a problem here, and when should a bias be added?
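A sketch of one thing worth checking (my reading of the code, not an official fix): the biases are created with shapes (10, hidden) and (10, output), i.e. one bias row per training example, so each example effectively gets its own learnable offset, and those same rows are reused for the test batch in model(). Sharing a single bias row and summing its gradient over the batch axis keeps the bias example-independent:

import numpy as np

Hidden_Layer_Neurons, Output_Size, Learning_Rate = 8, 4, 0.01

# Shared biases: a single row that broadcasts over the whole batch
# (a hypothetical change, not the post's code).
bias1 = np.random.uniform(size=(1, Hidden_Layer_Neurons))
bias2 = np.random.uniform(size=(1, Output_Size))

# Inside the training loop the bias gradients are then summed over the
# batch axis instead of being per-example matrices:
#     bias2 -= Learning_Rate * del_out.sum(axis=0, keepdims=True)
#     bias1 -= Learning_Rate * err.sum(axis=0, keepdims=True)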

One layer neural network giving approx 0.5 prediction to XOR problem every time

I tried to construct a neural network (with one hidden layer) to solve the XOR problem, but it gives me approximately 0.5 for every input. I guess there is a problem in the backpropagation. I have used the sigmoid function as the activation, and I have not used regularization. I have tried multiple learning rates and multiple iteration counts, but the cost is not decreasing as it should. The code is as follows:
#!/usr/bin/env python
# coding: utf-8

# In[1]:
import numpy as np
import matplotlib.pyplot as plt

# In[2]:
data = [[0,0], [0,1], [1,0], [1,1]]
X = np.array(data)
X = X.T
Y = [[1,0,0,1]]
Y = np.array(Y)
print(np.shape(X))
print(np.shape(Y))

# In[3]:
def set_nodes():
    print("Enter number of nodes of Input Layer: ")
    x_n = int(input())
    print("Enter number of nodes of Hidden Layer: ")
    h_n = int(input())
    print("Enter number of nodes of Output Layer: ")
    o_n = int(input())
    nodes = {"x_n":x_n, "h_n":h_n, "o_n":o_n}
    return(nodes)

# In[4]:
nodes = set_nodes()

# In[35]:
def set_param():
    x_n = nodes["x_n"]
    h_n = nodes["h_n"]
    o_n = nodes["o_n"]
    W1 = np.random.randn(h_n, x_n)*0.01
    b1 = np.zeros((h_n,1))
    W2 = np.random.randn(o_n, h_n)*0.01
    b2 = np.zeros((o_n,1))
    parameters = {"W1":W1, "b1":b1, "W2":W2, "b2":b2}
    return parameters

# In[36]:
parameters = set_param()
np.shape(parameters["b2"])

# In[37]:
def sigmoid(z):
    a_sig = 1/(1+np.exp(-z))
    return a_sig

# In[38]:
def hypotheses(X_para, parameters):
    z1 = np.dot(parameters["W1"], X_para)+parameters["b1"]
    a1 = sigmoid(z1)
    #print("A1 shape", np.shape(a1))
    z2 = np.dot(parameters["W2"], a1) + parameters["b2"]
    a2 = sigmoid(z2)
    cache = {"Z1":z1, "A1":a1, "Z2":z2, "A2":a2}
    return a2, cache

# In[39]:
A2, cache = hypotheses(X, parameters)
print(np.shape(A2))
print(A2)
print(np.shape(cache["A2"]))
print(1-Y[0])

# In[40]:
m = Y.shape[1]
def find_cost(A2, Y, cache):
    Loss = (np.multiply(Y, np.log(A2)))+(np.multiply((1-Y), np.log(1-A2)))
    cost = -1*(1/m)*np.sum(Loss)
    cost = float(np.squeeze(cost))
    return(cost)

# In[41]:
cost = find_cost(cache["A2"], Y, cache)
print(cost)

# In[42]:
def back_prop(cache, parameters):
    dZ2 = cache["A2"]-Y
    dW2 = (1/m)*(np.dot(dZ2, cache['A2'].T))
    db2 = (1/m)*np.sum(dZ2, axis=1, keepdims=True)
    dZ1 = np.dot(parameters["W2"].T, dZ2)*(1 - np.power(cache["A1"], 2))
    dW1 = (1/m)*np.dot(dZ1, X.T)
    db1 = (1/m)*np.sum(dZ1, axis=1, keepdims=True)
    grads = {"dZ2":dZ2, "dW2":dW2, "db2":db2, "dZ1":dZ1, "dW1":dW1, "db1":db1}
    return grads

# In[43]:
grads = back_prop(cache, parameters)
# print(grads["dZ1"])
print(np.shape(parameters["W2"]))
print(np.shape(np.power(cache["A1"], 2)))
print(np.shape(grads["dZ1"]))
print(np.shape(grads["dW1"]))
print(grads["dW1"])

# In[44]:
def update_parameters(grads, parameters, learning_rate=0.012):
    W1 = parameters["W1"]
    b1 = parameters["b1"]
    W2 = parameters["W2"]
    b2 = parameters["b2"]
    W2 = W2-learning_rate*grads["dW2"]
    b2 = b2-learning_rate*grads["db2"]
    W1 = W1-learning_rate*grads["dW1"]
    b1 = b1-learning_rate*grads["db1"]
    parameters = {"W2":W2, "b2":b2, "W1":W1, "b1":b1}
    return(parameters)

# In[45]:
parameters = update_parameters(grads, parameters)

# In[46]:
def nn_final(X, Y, parameters, iterations=1000):
    X_temp = X
    costl = []
    for i in range(iterations):
        a2, cache = hypotheses(X_temp, parameters)
        cost = find_cost(a2, Y, cache)
        costl.append(cost)
        grads = back_prop(cache, parameters)
        parameters = update_parameters(grads, parameters)
    return costl, parameters

# In[47]:
costl, parameters = nn_final(X, Y, parameters)
plt.plot(costl)
plt.show()

# In[48]:
print(costl[len(costl)-2]-costl[len(costl)-1])

# In[49]:
print("Enter First Input: ")
inp1 = int(input())
print("Enter Second Input: ")
inp2 = int(input())
ls = [inp1, inp2]
print(ls)
inp = np.array(ls)
inp = inp.T
inp = np.array(inp)
inp = np.reshape(inp, (2,1))
print("inp", np.shape(inp))
a2, cache = hypotheses(inp, parameters)
print("A2=", a2)

Scipy.fmin_bfgs passing too many arguments to function

I am trying to program a neural network and to minimize the cost function using scipy.optimize.fmin_bfgs(). After attempting this, I get the error "TypeError: cost() takes 3 positional arguments but 4 were given". Where are these four arguments coming from, and how can I fix this?
The cost function is defined by:
def cost(param, X, y):
    Theta1 = np.reshape(param[0:106950:1], (75, 1426))
    Theta2 = np.reshape(param[106950:112650:1], (75, 76))
    Theta3 = np.reshape(param[112650::1], (1, 76))
    m = len(X)
    J = 0
    a1 = X
    z2 = np.dot(a1, np.transpose(Theta1))
    a2 = sigmoid(z2)
    a2 = np.concatenate((np.ones((len(a2), 1)), a2), axis=1)
    z3 = np.dot(a2, Theta2.T)
    a3 = sigmoid(z3)
    a3 = np.concatenate((np.ones((len(a3), 1)), a3), axis=1)
    z4 = np.dot(a3, Theta3.T)
    a4 = sigmoid(z4)
    h = a4
    ## Calculate cost
    J = np.sum(np.sum(np.multiply(-y, np.log(h)) - np.multiply((1-y), np.log(1-h))))/(2*m)
    theta1_reg[:, 0] = 0
    theta2_reg[:, 0] = 0
    theta3_reg[:, 0] = 0
    Reg = (lamb/(2*m))*(np.sum(np.sum(np.square(theta1_reg)))+np.sum(np.sum(np.sqaure(theta2_reg)))+np.sum(np.sum(np.square(theta3_reg))))
    J = J + Reg
    return J
The gradient is then calculated with:
def grad(param, X, y):
    Theta1 = np.reshape(param[0:106950:1], (75, 1426))
    Theta2 = np.reshape(param[106950:112650:1], (75, 76))
    Theta3 = np.reshape(param[112650::1], (1, 76))
    Theta1_grad = np.zeros(Theta1.shape)
    Theta2_grad = np.zeros(Theta2.shape)
    Theta3_grad = np.zeros(Theta3.shape)
    m = len(X)
    ## Forward propagation
    a1 = X
    z2 = np.dot(a1, np.transpose(Theta1))
    a2 = sigmoid(z2)
    a2 = np.concatenate((np.ones((len(a2), 1)), a2), axis=1)
    z3 = np.dot(a2, Theta2.T)
    a3 = sigmoid(z3)
    a3 = np.concatenate((np.ones((len(a3), 1)), a3), axis=1)
    z4 = np.dot(a3, Theta3.T)
    a4 = sigmoid(z4)
    h = a4
    ## Backward propagation
    d4 = a4 - y
    d3 = np.multiply(np.dot(d4, Theta3[:, 1:]), sigmoidGradient(z3))
    d2 = np.multiply(np.dot(d3, Theta2[:, 1:]), sigmoidGradient(z2))  ## or sigmoid(z2) .* (1 - sigmoid(z2))
    D1 = np.dot(d2.T, a1)
    D2 = np.dot(d3.T, a2)
    D3 = np.dot(d4.T, a3)
    ## Unregularized gradients
    Theta1_grad = (1/m)*D1
    Theta2_grad = (1/m)*D2
    Theta3_grad = (1/m)*D3
    ## Regularize gradients
    theta1_reg = Theta1
    theta2_reg = Theta2
    theta3_reg = Theta3
    theta1_reg[:, 0] = 0
    theta2_reg[:, 0] = 0
    theta3_reg[:, 0] = 0
    theta1_reg = (lamb/m)*theta1_reg
    theta2_reg = (lamb/m)*theta2_reg
    theta3_reg = (lamb/m)*theta3_reg
    Theta1_grad = Theta1_grad + theta1_reg
    Theta2_grad = Theta2_grad + theta2_reg
    Theta3_grad = Theta3_grad + theta3_reg
    ## Concatenate gradients
    grad = np.concatenate((Theta1_grad, Theta2_grad, Theta3_grad), axis=None)
    return grad
Other functions defined are:
def sigmoid(z):
    sig = 1 / (1 + np.exp(z))
    return sig

def randInitializeWeights(l_in, l_out):
    epsilon = 0.12
    W = np.random.rand(l_out, 1+l_in)*2*epsilon - epsilon
    return W

def sigmoidGradient(z):
    g = np.multiply(sigmoid(z), (1-sigmoid(z)))
    return g
As an example:
import numpy as np
import scipy.optimize

X = np.random.rand(479, 1426)
y1 = np.zeros((frames, 1))
y2 = np.ones((framesp, 1))
y = np.concatenate((y1, y2), axis=0)
init_param = np.random.rand(112726,)
lamb = 0.5
scipy.optimize.fmin_bfgs(cost, fprime=grad, x0=init_param, args=(param, X, y))
Then the error appears. Thanks for any help.
The arguments passed into the cost function are the parameters to optimize, followed by the extra arguments. The parameters are chosen by the minimization routine; the extra arguments are passed through unchanged.
When calling fmin_bfgs, only pass the extra arguments as args, not the actual parameters to optimize:
scipy.optimize.fmin_bfgs(..., args=(X,y))
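As a minimal, self-contained illustration of that calling convention (a toy least-squares problem, not the poster's network): fmin_bfgs supplies the first argument itself and forwards args to both the cost and the gradient.

import numpy as np
from scipy.optimize import fmin_bfgs

def cost(param, X, y):
    # fmin_bfgs supplies `param`; X and y come through `args`.
    return 0.5 * np.sum((X.dot(param) - y) ** 2)

def grad(param, X, y):
    return X.T.dot(X.dot(param) - y)

X = np.random.rand(20, 3)
y = np.random.rand(20)
x0 = np.zeros(3)

best = fmin_bfgs(cost, x0, fprime=grad, args=(X, y))  # only the extra args go in args
print(best)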

How to find the best population (gStar)

How do I find the best population member (gStar) from a random population, and then print the weights of that best member? I want to optimize the best member using FPA. I'm sorry, I'm a newbie.
# FP parameters
num_train = len(X_trainNorm)
input_dim = windowSize
output_dim = 1
popSize = 5

# Model of JST
def buildModel(hidden_dim):
    np.random.seed(7)
    pop = []
    for i in range(popSize):
        W1 = np.random.randn(input_dim, hidden_dim) / np.sqrt(input_dim)
        b1 = np.zeros((1, hidden_dim))
        W2 = np.random.randn(hidden_dim, output_dim) / np.sqrt(hidden_dim)
        b2 = np.zeros((1, output_dim))
        pop.append((W1, W2, b1, b2))
    return pop

buildModel(10)

# Find the best population
def gStar(Weight1, Weight2, bias1, bias2):
    z1 = X_trainNorm.dot(Weight1) + bias1
    a1 = np.tanh(z1)
    z2 = a1.dot(Weight2) + bias2
    target = np.reshape(y_trainNorm, (-1, 1))
    error = 0
    error = abs(z2 - target)
    tot_error = sum(error)
    print(tot_error)
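A minimal sketch of one way to do this (my assumption about the intent, reusing the post's names X_trainNorm and y_trainNorm): score every member of the population with its total absolute error and keep the member with the smallest score as gStar.

import numpy as np

def total_error(weights, X, y):
    # Forward pass for one population member (W1, W2, b1, b2) and its
    # total absolute error against the targets.
    W1, W2, b1, b2 = weights
    a1 = np.tanh(X.dot(W1) + b1)
    z2 = a1.dot(W2) + b2
    return np.sum(np.abs(z2 - y.reshape(-1, 1)))

def find_gstar(pop, X, y):
    errors = [total_error(member, X, y) for member in pop]
    best_idx = int(np.argmin(errors))
    return pop[best_idx], errors[best_idx]

# Usage sketch (assumes X_trainNorm and y_trainNorm exist as in the post):
# pop = buildModel(10)
# gstar_weights, gstar_error = find_gstar(pop, X_trainNorm, y_trainNorm)
# print(gstar_error, gstar_weights)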
