NaN in regression neural network - python

I was trying to build a neural network in Python to solve a regression problem with inputs X (a, b) and output Y (c).
I am using leaky ReLU as the activation function for the hidden layer and a linear function for the output layer. After 3-4 iterations the network seems to blow up with extremely large/small numbers and ends in NaN.
The derivatives I used are below. Maybe someone can help me: is the problem in my math, or should I do more work to normalize X and Y before training?
dW2 = -2*(np.dot(dZ2,A1.transpose()))/m
db2 = -2*(np.sum(dZ2, axis = 1, keepdims = True))/m
drel = lrelu(Z1)
dZ1 = (np.dot(W2.transpose(),dZ2))*(drel)
dW1 = (np.dot(dZ1,X.transpose()))/m
db1 = (np.sum(dZ1, axis = 1, keepdims = True))/m
Where
Z1 = np.dot(W1,X)+b1
A1 = np.where(Z1 > 0, Z1, Z1 * 0.01)
Z2 = np.dot(W2,A1)+b2
A2 = Z2*1
cost = np.sum(np.square(Y-A2))/m
And the leaky ReLU derivative:
def lrelu(rel):
    alpha = 0.01
    drel = np.ones_like(rel)
    drel[rel < 0] = alpha
    return drel
Thanks

I have already solved the problem by preprocessing the data.
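For reference, a minimal preprocessing sketch (an assumption, since the exact preprocessing used is not shown): standardizing both X and Y to zero mean and unit variance keeps the activations and gradients in a numerically safe range.

import numpy as np

# X has shape (n_features, m) and Y has shape (1, m), as in the code above;
# standardize each row to zero mean and unit variance
X_norm = (X - X.mean(axis=1, keepdims=True)) / X.std(axis=1, keepdims=True)
Y_norm = (Y - Y.mean(axis=1, keepdims=True)) / Y.std(axis=1, keepdims=True)
# after training, map predictions back with A2 * Y.std(...) + Y.mean(...)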

Related

Neural network built from scratch in Python to classify digits is stuck at 11.35 percent accuracy. I am using the MNIST dataset

My neural network is stuck at 11.35 percent accuracy and I am unable to trace the error.
I am following this code, https://github.com/MLForNerds/DL_Projects/blob/main/mnist_ann.ipynb, which I found in a YouTube video.
Here is my code for the neural network (I have defined Xavier weight initialization in a module called nn):
"""1. 784 neurons in input layer
2. 128 neurons in hidden layer 1
3. 64 neurons in hidden layer 2
4. 10 neurons in output layer"""
def softmax(input):
    y = np.exp(input - input.max())
    activated = y / np.sum(y, axis=0)
    return activated

def softmax_grad(x):
    exps = np.exp(x - x.max())
    return exps / np.sum(exps, axis=0) * (1 - exps / np.sum(exps, axis=0))

def sigmoid(input):
    activated = 1 / (1 + np.exp(-input))
    return activated

def sigmoid_grad(input):
    grad = input * (1 - input)
    return grad
class DenseNN:
    def __init__(self, d0, d1, d2, d3):
        self.params = {'w1': nn.Xavier.initialize(d0, d1),
                       'w2': nn.Xavier.initialize(d1, d2),
                       'w3': nn.Xavier.initialize(d2, d3)}

    def forward(self, a0):
        params = self.params
        params['a0'] = a0
        params['z1'] = np.dot(params['w1'], params['a0'])
        params['a1'] = sigmoid(params['z1'])
        params['z2'] = np.dot(params['w2'], params['a1'])
        params['a2'] = sigmoid(params['z2'])
        params['z3'] = np.dot(params['w3'], params['a2'])
        params['a3'] = softmax(params['z3'])
        return params['a3']

    def backprop(self, y_true, y_pred):
        params = self.params
        w_change = {}
        error = softmax_grad(params['z3']) * ((y_pred - y_true) / y_true.shape[0])
        w_change['w3'] = np.outer(error, params['a2'])
        error = np.dot(params['w3'].T, error) * sigmoid_grad(params['a2'])
        w_change['w2'] = np.outer(error, params['a1'])
        error = np.dot(params['w2'].T, error) * sigmoid_grad(params['a1'])
        w_change['w1'] = np.outer(error, params['a0'])
        return w_change

    def update_weights(self, learning_rate, w_change):
        self.params['w1'] -= learning_rate * w_change['w1']
        self.params['w2'] -= learning_rate * w_change['w2']
        self.params['w3'] -= learning_rate * w_change['w3']

    def train(self, epochs, lr):
        for epoch in range(epochs):
            for i in range(60000):
                a0 = np.array([x_train[i]]).T
                o = np.array([y_train[i]]).T
                y_pred = self.forward(a0)
                w_change = self.backprop(o, y_pred)
                self.update_weights(lr, w_change)
                # print(self.compute_accuracy() * 100)
                # print(calc_mse(a3, o))
            print(self.compute_accuracy() * 100)

    def compute_accuracy(self):
        '''
        This function does a forward pass of x, then checks if the index
        of the maximum value in the output equals the index in the label
        y. Then it averages over the predictions to calculate the accuracy.
        '''
        predictions = []
        for i in range(10000):
            a0 = np.array([x_test[i]]).T
            o = np.array([y_test[i]]).T
            output = self.forward(a0)
            pred = np.argmax(output)
            predictions.append(pred == np.argmax(o))
        return np.mean(predictions)
Here is the code for loading the data:
# load dataset csv
train_data = pd.read_csv('../Datasets/MNIST/mnist_train.csv')
test_data = pd.read_csv('../Datasets/MNIST/mnist_test.csv')

# train data
x_train = train_data.drop('label', axis=1).to_numpy()
y_train = pd.get_dummies(train_data['label']).values

# test data
x_test = test_data.drop('label', axis=1).to_numpy()
y_test = pd.get_dummies(test_data['label']).values

# scale pixel values into (0.01, 1.0)
fac = 0.99 / 255
x_train = np.asfarray(x_train) * fac + 0.01
x_test = np.asfarray(x_test) * fac + 0.01

# train_labels = np.asfarray(train_data[:, :1])
# test_labels = np.asfarray(test_data[:, :1])

# printing dimensions
print(np.shape(x_train))  # (60000, 784)
print(np.shape(y_train))  # (60000, 10)
print(np.shape(x_test))   # (10000, 784)
print(np.shape(y_test))   # (10000, 10)
print(x_train)
Kindly help. I am a newbie in machine learning, so any help would be appreciated; I am unable to figure out where I am going wrong. Most of the code is almost identical to https://github.com/MLForNerds/DL_Projects/blob/main/mnist_ann.ipynb, but that notebook manages to get 60 percent accuracy.
EDIT
I found the mistake, thanks to Bartosz Mikulski: the problem was with how the weights were initialized in my Xavier weight-initialization algorithm. I changed the weight-initialization code to this:
self.params = {
    'w1': np.random.randn(d1, d0) * np.sqrt(1. / d1),
    'w2': np.random.randn(d2, d1) * np.sqrt(1. / d2),
    'w3': np.random.randn(d3, d2) * np.sqrt(1. / d3),
    'b1': np.random.randn(d1, 1) * np.sqrt(1. / d1),
    'b2': np.random.randn(d2, 1) * np.sqrt(1. / d2),
    'b3': np.random.randn(d3, 1) * np.sqrt(1. / d3),
}
Then I got this output:
[Screenshot: accuracy after changing the weights initialization]
After adding the bias parameters I got this output:
[Screenshot: accuracy after changing the weights initialization and adding bias]
The one problem that I can see is that you are using only weights but no biases. Biases are very important because they allow your model to shift the position of the decision plane (boundary) in the solution space; with weights alone you can only rotate it.
I guess that, basically, this is the best fit you can get without biases. A dense layer is basically a linear function, w*x + b, and you are missing the b. See the PyTorch documentation for an example: https://pytorch.org/docs/stable/generated/torch.nn.Linear.html#linear.
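As an illustration of the missing b, here is a sketch of how the forward pass might apply per-layer biases in this setup (an assumption on my part; the poster's updated code is not shown):

def forward(self, a0):
    params = self.params
    params['a0'] = a0
    # each layer now adds its bias column vector after the matrix product
    params['z1'] = np.dot(params['w1'], a0) + params['b1']
    params['a1'] = sigmoid(params['z1'])
    params['z2'] = np.dot(params['w2'], params['a1']) + params['b2']
    params['a2'] = sigmoid(params['z2'])
    params['z3'] = np.dot(params['w3'], params['a2']) + params['b3']
    params['a3'] = softmax(params['z3'])
    return params['a3']

Backprop then also needs bias gradients (for this single-example setup, the layer error itself), and update_weights must apply them.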
Also, can you show your Xavier initialization? In your case, even simple normally distributed values would be enough as initialization; no need to rush into more advanced topics.
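(A plain normal initialization of that kind might look like the following; the 0.01 scale is an illustrative choice, not from the original answer.)

self.params = {'w1': np.random.randn(d1, d0) * 0.01,
               'w2': np.random.randn(d2, d1) * 0.01,
               'w3': np.random.randn(d3, d2) * 0.01}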
I would also suggest you start from a smaller problem (for example, the Iris dataset) with no hidden layers (just a simple linear regression that learns by gradient descent). Then you can expand it by adding hidden layers, and then by trying harder problems with the code you already have.

Problem with implementation manual Linear Regression using Stochastic Gradient Descent

I am working with a real estate dataset of about 21 thousand rows; the training set has 15129 rows and there are 15 features. The task is to implement linear regression manually, using SGD, and to compare the feature weights with the weights that the sklearn linear regression model gives us. (All data is normalized using sklearn's StandardScaler.)
def gradient3(X, y):
    X = pd.DataFrame(X)
    y = pd.DataFrame(y)
    w1 = np.random.randn(len(X.axes[1]))
    w2 = np.random.randn(len(X.axes[1]))
    b = 0
    eps = 0.001
    alpha = 1
    counter = 1
    lmbda = 0.1
    while np.linalg.norm(w1 - w2) > eps:
        # choosing a random index
        rand_index = np.random.randint(len(X.axes[0]))
        X_tr = X.loc[rand_index].values
        y_tr = y.loc[rand_index].values
        # calculating the new w
        err = X_tr.dot(w1) + b - y_tr
        loss_w = 2 * err * X_tr + (lmbda * w1)
        loss_b = 2 * err
        w2 = w1.copy()
        w1 = w1 - alpha * loss_w
        b = b - alpha * loss_b
        # reducing alpha
        counter += 1
        alpha = 1 / counter
    return w1, b
I tried to implement SGD and expected to get a list of feature weights, w, and a bias value, b. The problem is that the program sometimes just goes into an infinite loop, and sometimes it produces completely chaotic weights; it depends on the learning rate parameter (alpha) and how fast it decreases. I don't quite understand what exactly the problem is. Maybe SGD just doesn't work with this dataset and I need mini-batches, maybe I missed something in the algorithm, maybe I'm implementing regularization incorrectly.
I would be very grateful if someone could write what is wrong with my implementation.
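For comparison, a minimal sketch of a more defensive SGD loop (my own illustration, not from the original post): a single stochastic gradient is too noisy for norm(w1 - w2) < eps to be a reliable stopping rule, so capping the iteration count and starting from a smaller learning rate avoids both the infinite loop and the exploding weights.

import numpy as np

def sgd(X, y, alpha0=0.01, lmbda=0.1, max_iter=100000):
    # X: (n_samples, n_features) array, y: (n_samples,) vector
    n, d = X.shape
    w = np.zeros(d)
    b = 0.0
    for t in range(1, max_iter + 1):
        i = np.random.randint(n)
        err = X[i].dot(w) + b - y[i]
        alpha = alpha0 / t  # decaying step size
        w -= alpha * (2 * err * X[i] + lmbda * w)
        b -= alpha * 2 * err
    return w, b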

Loss is NaN after a few epochs

I'm trying to predict a direction given by two angles (theta and phi). I defined a loss function as the angular distance between the predicted and the true direction, but I keep getting NaN values after a few epochs.
Given that the output activation is linear, my custom loss is:
import tensorflow as tf
import numpy as np
import tensorflow.keras.backend as K

def square_angular_distance(y_true, y_pred):
    the_pred = K.abs(y_pred[:, 0])
    phi_pred = (y_pred[:, 1]) % (2 * np.pi)
    the_true = y_true[:, 0]
    phi_true = y_true[:, 1]

    cos_phi_pred = K.cos(phi_pred)
    cos_phi_true = K.cos(phi_true)
    sin_phi_pred = K.sin(phi_pred)
    sin_phi_true = K.sin(phi_true)
    cos_the_pred = K.cos(the_pred)
    cos_the_true = K.cos(the_true)
    sin_the_pred = K.sin(the_pred)
    sin_the_true = K.sin(the_true)

    v_true = K.stack((sin_the_true * cos_phi_true, sin_the_true * sin_phi_true, cos_the_true), axis=1)
    v_pred = K.stack((sin_the_pred * cos_phi_pred, sin_the_pred * sin_phi_pred, cos_the_pred), axis=1)

    v_dot = K.batch_dot(v_true, v_pred)
    angle_dist = tf.math.acos(K.clip(v_dot, -1., 1.)) * 180. / np.pi
    return K.mean(K.square(angle_dist), axis=-1)
Here y_pred[:, 0] and y_pred[:, 1] are, respectively, the theta and phi angles of a unit vector (same for y_true).
I tried using regularizers, clipping the gradient, and adjusting the learning rate, and I also checked that the data contains no NaN/Inf values.
I also tried clipping the output values with a custom activation function on the output layer, but it didn't resolve the problem.
Any suggestions on what I am doing wrong?
The comment from @ATony resolved the problem: shortening the input domain of tf.math.acos prevented the loss from becoming NaN.
K.clip(v_dot, -.999, .999)
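This works because the derivative of acos(x) is -1/sqrt(1 - x^2), which diverges as x approaches ±1: a dot product that lands exactly on the boundary yields an infinite gradient (and hence NaN weights) even though the loss value itself is finite. A small illustrative check:

import tensorflow as tf

x = tf.Variable([0.999, 1.0])
with tf.GradientTape() as tape:
    y = tf.math.acos(x)
print(tape.gradient(y, x))  # finite at 0.999, -inf at 1.0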

Multiclass logistic regression - implementation question

This is my attempt at implementing multi-class logistic regression in Python, using softmax as the activation function and the MNIST digit dataset as the training and test sets.
import numpy as np

def softmax(z):
    return np.array([(np.exp(el) / np.sum(np.exp(el))) for el in z])

def cost(W, F, L):
    m = F.shape[0]  # get number of rows
    mul = np.dot(F, W)
    sm_mul_T = softmax(mul)
    return -(1 / m) * np.sum(L * np.log(sm_mul_T))

def gradient(W, F, L):
    m = F.shape[0]  # get number of rows
    mul = np.dot(F, W)
    sm_mul_T = softmax(mul)
    return -(1 / m) * np.dot(F.T, (L - sm_mul_T))

from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets("./datasets/MNIST_data/", one_hot=True)

W = np.zeros((785, 10))  # 784 features + 1 bias
for _ in range(10000):
    F, L = mnist.train.next_batch(100)
    F = np.insert(F, 0, values=1, axis=1)
    total_cost = cost(W, F, L)
    print("Total cost is {}".format(total_cost))
    gradients = gradient(W, F, L)
    W = W - (0.1 * gradients)

FU = mnist.test.images
FU = np.insert(FU, 0, values=1, axis=1)
LU = mnist.test.labels
mulU = np.dot(FU, W)
sm_mulU = softmax(mulU)

OK = 0
NOK = 0
for i in range(10000):
    a1 = np.argmax(sm_mulU[i])
    a2 = np.argmax(LU[i])
    if a1 == a2:
        OK = OK + 1
    else:
        NOK = NOK + 1

print("{} OK vs {} NOK".format(OK, NOK))
print("accur {}%".format((OK / (NOK + OK)) * 100))
What I wanted to do is basically implement it myself and try to get results similar to TensorFlow's. The problem is that the TensorFlow implementation ends up with 91% accuracy, while I can only get around 70%. Also, my model seems to diverge, and the cost starts to increase pretty fast.
Is my implementation wrong, or is it due to a more advanced algorithm inside the TensorFlow implementation?
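One likely contributor to the divergence (my own observation, not from the original thread): the softmax above exponentiates raw logits without subtracting the row maximum, so large logits can overflow np.exp and destabilize the cost. A numerically stable, vectorized sketch:

import numpy as np

def softmax(z):
    # z has shape (batch, classes); subtracting the row-wise max
    # before exponentiating avoids overflow without changing the result
    shifted = z - z.max(axis=1, keepdims=True)
    exps = np.exp(shifted)
    return exps / exps.sum(axis=1, keepdims=True)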

performing stochastic gradient descent on a neural network

I want to perform SGD on the following neural network:
Training set size = 200000
input layer size = 784
hidden layer size = 50
output layer size = 10
I have an algorithm that performs batch gradient descent. I guess that to perform SGD, the cost function should be modified to do its calculations on a single training example (an array of size 784), and theta should then be updated after each training example. Is that the correct way of implementing SGD? If yes, I am not able to get the following cost function (written for batch gradient descent) to work for a single training example. How can I make it run on a single training example? If no, what is the correct way to implement SGD on a neural network?
Python function to calculate the cost and the gradient of theta for batch gradient descent:
def cost(theta, X, y, lamb):
    # get theta1 and theta2 from the unrolled theta vector
    th1 = (theta[0:(hiddenLayerSize * (inputLayerSize + 1))].reshape((inputLayerSize + 1, hiddenLayerSize))).T
    th2 = (theta[(hiddenLayerSize * (inputLayerSize + 1)):].reshape((hiddenLayerSize + 1, outputLayerSize))).T

    # matrices to store the gradients of theta1 & theta2
    th1_grad = np.zeros(th1.shape)
    th2_grad = np.zeros(th2.shape)

    I = np.identity(outputLayerSize, int)
    Y = np.zeros((realTrainSetSize, outputLayerSize))
    # expand y[i] to the size of the output layer (one-hot)
    for i in range(0, realTrainSetSize):
        Y[i] = I[y[i]]

    # add a bias unit to each training example, then forward prop
    A1 = np.hstack([np.ones((realTrainSetSize, 1)), X])
    Z2 = A1 @ (th1.T)
    A2 = np.hstack([np.ones((len(Z2), 1)), sigmoid(Z2)])
    Z3 = A2 @ (th2.T)
    H = A3 = sigmoid(Z3)

    penalty = (lamb / (2 * trainSetSize)) * (sum(sum(np.delete(th1, 0, 1) ** 2)) + sum(sum(np.delete(th2, 0, 1) ** 2)))
    J = (1 / 2) * sum(sum(np.multiply(-Y, np.log(H)) - np.multiply((1 - Y), np.log(1 - H))))

    # backprop
    sigma3 = A3 - Y
    sigma2 = np.multiply(sigma3 @ th2, sigmoidGradient(np.hstack([np.ones((len(Z2), 1)), Z2])))
    sigma2 = np.delete(sigma2, 0, 1)
    delta_1 = sigma2.T @ A1  # getting dimension mismatch error here
    delta_2 = sigma3.T @ A2

    # calculation of the gradients of theta1 and theta2
    th1_grad = np.divide(delta_1, trainSetSize) + (lamb / trainSetSize) * (np.hstack([np.zeros((len(th1), 1)), np.delete(th1, 0, 1)]))
    th2_grad = np.divide(delta_2, trainSetSize) + (lamb / trainSetSize) * (np.hstack([np.zeros((len(th2), 1)), np.delete(th2, 0, 1)]))

    # unroll the gradients of theta1 and theta2
    theta_grad = np.concatenate(((th1_grad.T).ravel(), (th2_grad.T).ravel()))
    return (J, theta_grad)
I am getting a dimension mismatch error while calculating delta_1 and delta_2 when I call this function with a single training example, but it works fine when called with the entire training batch.
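One common cause of exactly this error (an assumption on my part, since the calling code is not shown): indexing a 2-D array with X[i] yields a 1-D array of shape (784,), so the hstack and transpose operations inside cost() no longer line up. Keeping the single example 2-D preserves all the matrix shapes:

# keep a single example as a (1, 784) matrix instead of a (784,) vector,
# and set realTrainSetSize = 1 for that call
x_i = X[i].reshape(1, -1)
y_i = y[i:i + 1]
J, theta_grad = cost(theta, x_i, y_i, lamb)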
