Related
I just started a ML course and I'm trying to run gradient descent in python. The below functions work fine, but as I move on to the bigger chunk where I do the actual learning, I just can't get the expected output and learn the right parameters, as you can tell from this decision boundary I plotted afterwards. And I'm trying to figure out why.
plotting the decision boundary
def sigmoid(z):
sigma = 1/(1+np.exp(-z))
return sigma
def compute_cost(X, y, w, b):
y_hat = sigmoid((X * np.expand_dims(w, axis=0)).sum(axis=1) + b)
total_cost = (-y * np.log(y_hat) - (1-y) * np.log(1-y_hat)).mean()
return total_cost
def compute_gradient(X, y, w, b):
z = w * X + b
yhat = sigmoid(z)
y1 = np.expand_dims(y, axis=1)
error = yhat - y1
db = error.mean()
dw_j1 = (X * error)
dw_j = np.mean(dw_j1,axis=0)
return dw_j, db
Before building this gradient descent function, I tested all the above with my training data & they all work and output the correct numbers. Really appreciate it if you can spot my mistakes.
Learning parameters with gradient descent
def gradient_descent(X, y, w, b, alpha, num_iters):
m = len(X)
J_history = []
wb_history = []
for i in range(num_iters):
cost = compute_cost(X, y, w, b)
dw_j, db = compute_gradient(X, y, w, b)
w = w - alpha * dw_j
b = b - alpha * db
wb_history.append((w,b))
J_history.append(cost)
if i % math.ceil(num_iters/10) == 0 or i == (num_iters-1):
print(f"Iteration {i:4}: Cost {float(J_history[-1]):8.2f}")
return w, b, J_history, wb_history
np.random.seed(1)
initial_w = 0.01 * (np.random.rand(2) - 0.5)
initial_b = -8
iterations = 10000
alpha = 0.001
w, b, J_history, _ = gradient_descent(X_train ,y_train, initial_w, initial_b, alpha, iterations)
I'm following the Andrew-Ng course on Machine Learning and I'm currently doing the week 5 exercise.
I've found myself stuck on the implementation of the backpropagation algorithm, due to the fact that the relative difference, compared to numerical gradient, is very high (order of 1e-1), but I can't find any error within my implementation, so I'm gently asking if someone could take a look at it and explain what I did wrong.
Forward propagation:
def forward_propagation(thetas, X, history=False):
activation_arr = []
a = X # X is the array of the first activation values
for k in range(0, len(thetas)):
a = add_intercept(a) # add the bias unit
a = sigmoid(a # thetas[k].T)
if history:
activation_arr.append(a)
return activation_arr if history else a
Backpropagation:
def gradient_nn(thetas, X, y, num_labels, reg_lambda=None):
n_examples = X.shape[0]
Y = np.zeros(( # creates a n_examples X num_labels matrix
n_examples, # n of examples
num_labels
))
for i in range(n_examples):
Y[i, y[i, 0]] = 1 # the index corresponding to the correct label for each row has value = 1
# add intercepted X to the activation array
activation_arr = [add_intercept(X)] + forward_propagation(thetas, X, history=True)
sigma = [activation_arr[-1] - Y] # sigma^L = a^L - y
delta = [sigma[-1].T # activation_arr[-2]] # find delta for the first row
thetas_grad = []
# Calculate sigma and delta
for idx in range(1, len(thetas)): # skip last iteration
sigma = [
(sigma[0] # thetas[-idx][:, 1:]) * partial_derivative(activation_arr[-1-idx])
] + sigma
delta = [
sigma[0].T # activation_arr[-2-idx]
] + delta
return [np.sum(d) / n_examples for d in thetas_grad]
Partial derivative:
def partial_derivative(a):
return a * (1 - a) # element wise multiplication
Numerical gradient:
def compute_numerical_gradient(cost_function, thetas):
# Unroll parameters
nn_params = unroll_thetas(thetas)
num_grad = np.zeros(nn_params.shape)
perturb = np.zeros(nn_params.shape)
shapes = [theta.shape for theta in thetas]
epsilon = 1e-4 # not the one of random initialization
for p in range(nn_params.shape[0]):
# Set perturbation vector
perturb[p] = epsilon
minus_theta = nn_params - perturb
plus_theta = nn_params + perturb
# --- Roll params back in order to use the cost function ---
minus_theta = roll_thetas(minus_theta, shapes)
plus_theta = roll_thetas(plus_theta, shapes)
# calculate the loss of the cost function
minus_loss = cost_function(minus_theta)
plus_loss = cost_function(plus_theta)
# Compute Numerical Gradient
num_grad[p] = (plus_loss - minus_loss) / (2 * epsilon)
perturb[p] = 0
num_grad = roll_thetas(num_grad, shapes)
return [np.sum(num_g) for num_g in num_grad]
Cost function:
def J_nn(num_labels, reg_lambda=None):
def non_reg_func(thetas, X, y):
n_examples = X.shape[0]
Y = np.zeros(( # creates a n_examples X num_labels matrix
n_examples, # n of examples
num_labels
))
for i in range(n_examples):
Y[i, y[i, 0]] = 1 # the index corresponding to the correct label for each row has value = 1
prediction = forward_propagation(thetas, X)
return np.sum(np.sum(-Y * np.log(prediction) - (1 - Y) * np.log(1 - prediction))) / n_examples
if reg_lambda is None:
func = non_reg_func
else: # regularization
def func(thetas, X, y):
cost = non_reg_func(thetas, X, y)
for theta in thetas: # regularize for every layer
theta = theta[1:] # remove bias unit
cost = cost + (reg_lambda / (2 * y.shape[0])) * np.sum(np.sum(theta[:, ] ** 2))
return cost
return func
Checking backpropagation with numerical gradient:
def check_nn_gradients(reg_lambda=None):
"""
Creates a small neural network (max 8 x 8 x 7 x 8) and checks that
the implementation of the backpropagation algorithm is good
"""
#n_examples, sizes = random.randint(5, 10), [random.randint(2, 8), random.randint(2, 8), random.randint(1, 8)]
n_examples, sizes = 5, [8, 8, 5, 4]
n_labels = sizes[-1] # Last size is equal to the number of labels
init_epsilon = 0.0001
thetas = random_init_thetas(sizes, init_epsilon)
X = np.array(
random_init_thetas([sizes[0]-1, n_examples], init_epsilon)
).squeeze() # We squeeze it because random_init_thetas returns a 3D array, but we want X to be 2D
y = np.array([random.randint(0, n_labels-1) for _ in X])
y = y[:, np.newaxis]
inner_cost = lambda _thetas: J_nn(n_labels, reg_lambda)(_thetas, X, y)
gradients = gradient_nn(thetas, X, y, n_labels, 0)
unrolled_gradients = unroll_thetas(gradients)
print(unrolled_gradients)
# finite difference method
grad_checking_epsilon = 1e-4
num_grad = compute_numerical_gradient(inner_cost, thetas)
unrolled_num_grad = unroll_thetas(num_grad)
print(unrolled_num_grad)
return diff = np.linalg.norm(unrolled_num_grad - unrolled_gradients) / np.linalg.norm(unrolled_num_grad + unrolled_gradients)
I am trying to code a NN with one neuron. I have one input (x) and bias (b) to solve a simple regression problem to detect x, b for the eq.: (my cost function is y=x)
y = 0.3 * x + 2.
The closest results I am getting is:
x = 0.38178107 (expected: ~0.3)
b = 1.10040842 (expected: ~1.0)
My question is why my results are far from the expected results? Am I falling into an over/underfitting problem or buffer overflow?
I took into consideration the relationship between the learning rate and the number of iterations.
I know my training data is small, but I am looping through each data entry 100 times. Also, I tried increasing the training to 100 entries and reduced the looping for each entry to 10 times, the results were much far something like x= ~3.067 and b=-3.098
Here are the steps I followed:
My training data is x: 1~10 & y:2.3~5.0. Training: [(1, 2.3), .., (10, 5.0)]
The derivatives used:
dE_dw = -(y-A)*x #gradient
new_w = w - lr * dE_dw
dE_db = -(y-A) #gradient
new_b = b - lr * dE_db
The Code:
import random as r
# function: calculate gradient for weight w for the x input or weight b for bias input
def calc_new_Weight(v, lr, grad):
# v is value of the weight
# lr is learning rate
# grad is gradient
new_v = v - lr * grad
return new_v
# linear cost function y=x
def costFunc(s): return s
def nn(x, y, w, b, lr):
s = x*w + 1*b
A = costFunc(s)
#Error: E = 0.5 (y - a) ** 2
#partial deriv E w/ respect to w
dE_dw = -1*(y-A)*x
w_new = calc_new_Weight(w, lr, grad = dE_dw)
# partial deriv E w/ respect to b
dE_db = -1*(y-A)
b_new = calc_new_Weight(b, lr, grad = dE_db)
return (w_new, b_new)
def main():
#random init weights w, b for the inputs x, b
w = r.random()
b = r.random()
for x, y in data:
# y = 0.3*x + 2
for i in range(1, 100):
#update w, b with the new weights
w, b = nn(x, y, w, b, lr=0.001)
print(w, b)
If you can help me understand this I really appreciate your time.
Thank you in advance
Your Gradients are correct but the Gradient Descent Algorithm in the code needs to be modified a little,
The Gradient Descent Algo goes like this: reference
t <- 0
max_iterations <- 1000
Initialize W/theta (Weights)
while t++ < max_iterations do
H = Forward_propogate(Inputs, W)
delta_W = Backward_propogation(H)
W -= n*delta_W
end
In python it looks like this:
w, b, lr, max_iterations = r.random(), r.random(), 0.001, 1000
for i in range(max_iterations):
dw, db = 0, 0
for x,y in data:
# return dw and db from nn and not updated w and b
dw += nn(w, b, x, y)
db += nn(w, b, x, y)
w = w - lr * dw
b = b - lr * db
And If you want to do stochastic gradient descent(i.e update w,b for all points in the data) the code would be:
w, b, lr, max_iterations = r.random(), 0, 0.001, 1000
for i in range(max_iterations):
dw, db = 0, 0
for x,y in data:
dw += nn(w, b, x, y)
db += nn(w, b, x, y)
w = w - lr * dw
b = b - lr * db
Implementing SGD with your example with 50 data points and 1000 iterations and initializing random w and b to be 0 we can consistently converge to expected values:
import random as r
import matplotlib.pyplot as plt
# function: calculate gradient for weight w for the x input or weight b for bias input
def calc_new_Weight(v, lr, grad):
# v is value of the weight
# lr is learning rate
# grad is gradient
new_v = v - lr * grad
return new_v
# linear cost function y=x
def costFunc(s): return s
def nn(x, y, w, b, lr):
s = x*w + 1*b
A = costFunc(s)
#Error: E = 0.5 (y - a) ** 2
#partial deriv E w/ respect to w
dE_dw = -1*(y-A)*x
#w_new = calc_new_Weight(w, lr, grad = dE_dw)
# partial deriv E w/ respect to b
dE_db = -1*(y-A)
#b_new = calc_new_Weight(b, lr, grad = dE_db)
return (dE_dw, dE_db)
def main():
#random init weights w, b for the inputs x, b
w = r.random()
b = 0
x = list(range(1,50))
y = [(0.3*i + 2) for i in x]
data = list(zip(x,y))
#r.shuffle(data)
for i in range(1,1000):
dw , db = 0, 0
for x, y in data:
# y = 0.3*x + 2
#update w, b with the new weights
d_w, d_b = nn(x, y, w, b, lr=0.001)
dw += d_w
db += d_b
w = w - (0.001*d_w)
b = b - (0.001*d_b)
return w, b
w, b = main()
I am comparing the Adam - Algorithm to SGD with Momentum. I realised that the convergence rate of Adam is way worse than the convergence rate of SGD with Momentum if applied to the Rosenbrock function. This finding is in contrast to this visualisation. You can read the underlying code here.
Too ensure that I did not have an implementation error I compared the results of my algorithm to the Pytorch implementation. Pytorch and my implementation return the same result.
Therefore either Pytorch and my implementation is incorrect or the implementation in the link is incorrect. If you check out the code from the link above you will find that the Bias correction step is missing. After adapting my code in the same way the results did not significantly improve.
So my question is why does it work in the linked scenario but not in my/Pytorch implementation? Even though all of the three should return the same result.
import numpy as np
import torch
# Rosenbrock function
class Rosenbrock:
a_f = 1.
b_f = 2.
# The minimum is at (a_f, a_f**2)
class Adam_para:
beta1 = 0.9 # 0.7 # modified because of github: https://gist.github.com/EmilienDupont/f97a3902f4f3a98f350500a3a00371db
beta2 = 0.999
eps = 1e-8
lr = 2e-2
iterations = 100
def f(x,y):
return ( Rosenbrock.a_f - x ) ** 2 + Rosenbrock.b_f * (y - x ** 2 ) ** 2
def grad_f(x,y):
grad_x = - 1. * 2 * (Rosenbrock.a_f - x) + Rosenbrock.b_f * (- 2 * x) * 2 * ( y - x ** 2 )
grad_y = Rosenbrock.b_f * ( 1. ) * 2 * (y - x ** 2)
return np.array([grad_x, grad_y])
def adam_inner(p: np.ndarray,t,exp_avg,exp_avg_sqr, lr):
# inner loop of adam algorithm
# p current point
# exp_avg first moment estimate
# exp_avg_sqr second moment estimate
# lr learning rate
# the following values are taken from the ADAM Paper
beta1 = Adam_para.beta1
beta2 = Adam_para.beta2
eps = Adam_para.eps
t = t+1
g = grad_f(*p)
exp_avg = beta1 * exp_avg + ( 1 - beta1 ) * g
exp_avg_sqr = beta2 * exp_avg_sqr + ( 1 - beta2 ) * np.square(g)
bias_corr_1 = 1 - beta1 ** t
bias_corr_2 = 1 - beta2 ** t
exp_avg_hat = exp_avg / bias_corr_1
exp_avg_sqr_hat = exp_avg_sqr / bias_corr_2
denom = np.sqrt(exp_avg_sqr_hat) + eps
p = p - lr * exp_avg_hat / denom
return {'p': p, 'first_mom': exp_avg, 'second_mom': exp_avg_sqr}
def adam(p, it, lr=0.001):
# it number of iterations
# m first moment estimate
# v second moment estimate
# init
m = 0
v = 0
p_list = [p]
for i in range(it):
tmp = adam_inner(p_list[-1],i,m,v,lr)
p_list.append(tmp['p'])
m = tmp['first_mom']
v = tmp['second_mom']
return np.asarray(p_list)
x0 = np.array([3.,3.])
t = adam(x0,Adam_para.iterations,Adam_para.lr)
x0_torch = torch.tensor(x0, requires_grad=True)
f_torch = f(x0_torch[0],x0_torch[1])
optimizer = torch.optim.Adam([x0_torch], lr = Adam_para.lr, betas=(Adam_para.beta1,Adam_para.beta2))
for i in range(Adam_para.iterations):
optimizer.zero_grad()
f_torch = f(x0_torch[0],x0_torch[1])
f_torch.backward()
optimizer.step()
print("pytorch result:", x0_torch)
print("my result:", t[-1])
I'm starting the ML journey and I'm having troubles with this coding exercise
here is my code
import numpy as np
import pandas as pd
import scipy.optimize as op
# Read the data and give it labels
data = pd.read_csv('ex2data2.txt', header=None, name['Test1', 'Test2', 'Accepted'])
# Separate the features to make it fit into the mapFeature function
X1 = data['Test1'].values.T
X2 = data['Test2'].values.T
# This function makes more features (degree)
def mapFeature(x1, x2):
degree = 6
out = np.ones((x1.shape[0], sum(range(degree + 2))))
curr_column = 1
for i in range(1, degree + 1):
for j in range(i+1):
out[:,curr_column] = np.power(x1, i-j) * np.power(x2, j)
curr_column += 1
return out
# Separate the data into training and target, also initialize theta
X = mapFeature(X1, X2)
y = np.matrix(data['Accepted'].values).T
m, n = X.shape
cols = X.shape[1]
theta = np.matrix(np.zeros(cols))
#Initialize the learningRate(sigma)
learningRate = 1
# Define the Sigmoid Function (Output between 0 and 1)
def sigmoid(z):
return 1 / (1 + np.exp(-z))
def cost(theta, X, y, learningRate):
# This is require to make the optimize function work
theta = theta.reshape(-1, 1)
error = sigmoid(X # theta)
first = np.multiply(-y, np.log(error))
second = np.multiply(1 - y, np.log(1 - error))
j = np.sum((first - second)) / m + (learningRate * np.sum(np.power(theta, 2)) / 2 * m)
return j
# Define the gradient of the cost function
def gradient(theta, X, y, learningRate):
# This is require to make the optimize function work
theta = theta.reshape(-1, 1)
error = sigmoid(X # theta)
grad = (X.T # (error - y)) / m + ((learningRate * theta) / m)
grad_no = (X.T # (error - y)) / m
grad[0] = grad_no[0]
return grad
Result = op.minimize(fun=cost, x0=theta, args=(X, y, learningRate), method='TNC', jac=gradient)
opt_theta = np.matrix(Result.x)
def predict(theta, X):
sigValue = sigmoid(X # theta.T)
p = sigValue >= 0.5
return p
p = predict(opt_theta, X)
print('Train Accuracy: {:f}'.format(np.mean(p == y) * 100))
So, when the learningRate = 1, the accuracy should be around 83,05% but I'm getting 80.5% and when the learningRate = 0, the accuracy should be 91.52% but I'm getting 87.28%
So the question is What am I doing wrong? Why my accuracy is below the problem default answer?
Hope someone can guide me in the right direction. Thanks!
P.D: Here is the dataset, maybe it can help
https://raw.githubusercontent.com/TheGirlWhiteWithBandages/Machine-Learning-Algorithms/master/Logistic%20Regression/ex2data2.txt
Hey guys I found a way to make it even better!
Here is the code
import numpy as np
import pandas as pd
import scipy.optimize as op
from sklearn.preprocessing import PolynomialFeatures
# Read the data and give it labels
data = pd.read_csv('ex2data2.txt', header=None, names=['Test1', 'Test2', 'Accepted'])
# Separate the data into training and target
X = (data.iloc[:, 0:2]).values
y = (data.iloc[:, 2:3]).values
# Modify the features to a certain degree (Polynomial)
poly = PolynomialFeatures(6)
m = y.size
XX = poly.fit_transform(data.iloc[:, 0:2].values)
# Initialize Theta
theta = np.zeros(XX.shape[1])
# Define the Sigmoid Function (Output between 0 and 1)
def sigmoid(z):
return(1 / (1 + np.exp(-z)))
# Define the Regularized cost function
def costFunctionReg(theta, reg, *args):
# This is require to make the optimize function work
h = sigmoid(XX # theta)
first = np.log(h).T # - y
second = np.log(1 - h).T # (1 - y)
J = (1 / m) * (first - second) + (reg / (2 * m)) * np.sum(np.square(theta[1:]))
return J
# Define the Regularized gradient function
def gradientReg(theta, reg, *args):
theta = theta.reshape(-1, 1)
h = sigmoid(XX # theta)
grad = (1 / m) * (XX.T # (h - y)) + (reg / m) * np.r_[[[0]], theta[1:]]
return grad.flatten()
# Define the predict Function
def predict(theta, X):
sigValue = sigmoid(X # theta.T)
p = sigValue >= 0.5
return p
# A loop to test between different values for sigma (reg parameter)
for i, Sigma in enumerate([0, 1, 100]):
# Optimize costFunctionReg
res2 = op.minimize(costFunctionReg, theta, args=(Sigma, XX, y), method=None, jac=gradientReg)
# Get the accuracy of the model
accuracy = 100 * sum(predict(res2.x, XX) == y.ravel()) / y.size
# Get the Error between different weights
error1 = costFunctionReg(res2.x, Sigma, XX, y)
# print the accuracy and error
print('Train accuracy {}% with Lambda = {}'.format(np.round(accuracy, decimals=4), Sigma))
print(error1)
Thanks for all your help!
try out this:
# import library
import pandas as pd
import numpy as np
dataset = pd.read_csv('ex2data2.csv',names = ['Test #1','Test #2','Accepted'])
# splitting to x and y variables for features and target variable
x = dataset.iloc[:,:-1].values
y = dataset.iloc[:,-1].values
print('x[0] ={}, y[0] ={}'.format(x[0],y[0]))
m, n = x.shape
print('#{} Number of training samples, #{} features per sample'.format(m,n))
# import library FeatureMapping
from sklearn.preprocessing import PolynomialFeatures
# We also add one column of ones to interpret theta 0 (x with power of 0 = 1) by
include_bias as True
pf = PolynomialFeatures(degree = 6, include_bias = True)
x_poly = pf.fit_transform(x)
pd.DataFrame(x_poly).head(5)
m,n = x_poly.shape
# define theta as zero
theta = np.zeros(n)
# define hyperparameter λ
lambda_ = 1
# reshape (-1,1) because we just have one feature in y column
y = y.reshape(-1,1)
def sigmoid(z):
return 1/(1+np.exp(-z))
def lr_hypothesis(x,theta):
return np.dot(x,theta)
def compute_cost(theta,x,y,lambda_):
theta = theta.reshape(n,1)
infunc1 = -y*(np.log(sigmoid(lr_hypothesis(x,theta)))) - ((1-y)*(np.log(1 - sigmoid(lr_hypothesis(x,theta)))))
infunc2 = (lambda_*np.sum(theta[1:]**2))/(2*m)
j = np.sum(infunc1)/m+ infunc2
return j
# gradient[0] correspond to gradient for theta(0)
# gradient[1:] correspond to gradient for theta(j) j>0
def compute_gradient(theta,x,y,lambda_):
gradient = np.zeros(n).reshape(n,)
theta = theta.reshape(n,1)
infunc1 = sigmoid(lr_hypothesis(x,theta))-y
gradient_in = np.dot(x.transpose(),infunc1)/m
gradient[0] = gradient_in[0,0] # theta(0)
gradient[1:] = gradient_in[1:,0]+(lambda_*theta[1:,]/m).reshape(n-1,) # theta(j) ; j>0
gradient = gradient.flatten()
return gradient
You can now test your cost and gradient without optimization. Th below code will optimize the model:
# hyperparameters
m,n = x_poly.shape
# define theta as zero
theta = np.zeros(n)
# define hyperparameter λ
lambda_array = [0, 1, 10, 100]
import scipy.optimize as opt
for i in range(0,len(lambda_array)):
# Train
print('======================================== Iteration {} ===================================='.format(i))
optimized = opt.minimize(fun = compute_cost, x0 = theta, args = (x_poly, y,lambda_array[i]),
method = 'TNC', jac = compute_gradient)
new_theta = optimized.x
# Prediction
y_pred_train = predictor(x_poly,new_theta)
cm_train = confusion_matrix(y,y_pred_train)
t_train,f_train,acc_train = acc(cm_train)
print('With lambda = {}, {} correct, {} wrong ==========> accuracy = {}%'
.format(lambda_array[i],t_train,f_train,acc_train*100))
Now you should see output like this :
=== Iteration 0 === With lambda = 0, 104 correct, 14 wrong ==========> accuracy = 88.13559322033898%
=== Iteration 1 === With lambda = 1, 98 correct, 20 wrong ==========> accuracy = 83.05084745762711%
=== Iteration 2 === With lambda = 10, 88 correct, 30 wrong ==========> accuracy = 74.57627118644068%
=== Iteration 3 === With lambda = 100, 72 correct, 46 wrong ==========> accuracy = 61.016949152542374%