Linear Regression parameters are not calculated correctly - python

My gradient descent does not give the correct values for the model coefficients.
Here is my code:
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
data = pd.read_csv("Advertising.csv")
tv_ad = np.array(data.loc[:, "TV"])
sales = np.array(data.loc[:, "sales"])
#The first 5 datapoints are these:
for i in range(5):
    print(f"(x^({i}), y^({i})) = ({tv_ad[i]}, {sales[i]})")
"""Output:
(x^(0), y^(0)) = (230.1, 22.1)
(x^(1), y^(1)) = (44.5, 10.4)
(x^(2), y^(2)) = (17.2, 9.3)
(x^(3), y^(3)) = (151.5, 18.5)
(x^(4), y^(4)) = (180.8, 12.9)"""
#This is the cost function:
def compute_cost_func(x, y, w, b):
    count = x.shape[0]
    cost_sum = 0
    for i in range(count):
        x_i, y_i = x[i], y[i]
        cost_sum += (w * x_i + b - y_i) ** 2
    j_wb = cost_sum / (2 * count)
    return j_wb
#This is the gradient calculator:
def compute_gradient(x, y, w, b):
    count = x.shape[0]
    dj_dw, dj_db = 0, 0
    for i in range(count):
        x_i, y_i = x[i], y[i]
        func_w = ((w * x_i + b) - y_i) * x_i
        func_b = (w * x_i + b) - y_i
        dj_dw += func_w
        dj_db += func_b
    dj_dw = dj_dw / count
    dj_db = dj_db / count
    return dj_dw, dj_db
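# (Side note, not part of my original script: a vectorized sketch of the same cost
#  and gradient, computing the same quantities without the Python loop; useful as a
#  cross-check of the two functions above.)
def compute_cost_vec(x, y, w, b):
    err = w * x + b - y
    return np.mean(err ** 2) / 2

def compute_gradient_vec(x, y, w, b):
    err = w * x + b - y
    return np.mean(err * x), np.mean(err)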
#This is the gradient descent function:
def gradient_descent(x, y, w_in, b_in, alpha, num_iter, cost_func, gradient_func):
    j_history = []
    p_history = []
    b = b_in
    w = w_in
    for i in range(num_iter):
        dj_dw, dj_db = gradient_func(x, y, w, b)
        w = w - alpha * dj_dw
        b = b - alpha * dj_db
        if i < 100000:
            j_history.append(cost_func(x, y, w, b))
            p_history.append([w, b])
        if i % math.ceil(num_iter / 10) == 0:
            print(f"Iteration {i:4}: Cost {j_history[-1]:0.2e} ",
                  f"dj_dw: {dj_dw: 0.3e}, dj_db: {dj_db: 0.3e} ",
                  f"w: {w: 0.3e}, b:{b: 0.5e}")
    return w, b, j_history, p_history
#When I run the code, at the end it gives w and b but they are not correct.
w_init = 0
b_init = 0
iterations = 10000
tmp_alpha = 1.0e-5
w_final, b_final, j_hist, p_hist = gradient_descent(tv_ad, sales, w_init, b_init, tmp_alpha, iterations,
                                                    compute_cost_func, compute_gradient)
print(f"(w,b) found by gradient descent: ({w_final:8.4f},{b_final:8.4f})")
"""
Output:
Iteration 0: Cost 6.22e+01 dj_dw: -2.411e+03, dj_db: -1.402e+01 w: 2.411e-02, b: 1.40225e-04
Iteration 1000: Cost 1.15e+01 dj_dw: 9.022e-03, dj_db: -1.777e+00 w: 8.316e-02, b: 1.82292e-02
Iteration 2000: Cost 1.15e+01 dj_dw: 8.999e-03, dj_db: -1.772e+00 w: 8.307e-02, b: 3.59728e-02
Iteration 3000: Cost 1.14e+01 dj_dw: 8.976e-03, dj_db: -1.768e+00 w: 8.298e-02, b: 5.36715e-02
Iteration 4000: Cost 1.14e+01 dj_dw: 8.954e-03, dj_db: -1.763e+00 w: 8.289e-02, b: 7.13254e-02
Iteration 5000: Cost 1.14e+01 dj_dw: 8.931e-03, dj_db: -1.759e+00 w: 8.280e-02, b: 8.89347e-02
Iteration 6000: Cost 1.13e+01 dj_dw: 8.909e-03, dj_db: -1.754e+00 w: 8.271e-02, b: 1.06499e-01
Iteration 7000: Cost 1.13e+01 dj_dw: 8.886e-03, dj_db: -1.750e+00 w: 8.262e-02, b: 1.24020e-01
Iteration 8000: Cost 1.13e+01 dj_dw: 8.864e-03, dj_db: -1.745e+00 w: 8.253e-02, b: 1.41496e-01
Iteration 9000: Cost 1.12e+01 dj_dw: 8.841e-03, dj_db: -1.741e+00 w: 8.244e-02, b: 1.58928e-01
(w,b) found by gradient descent: ( 0.0824, 0.1763)"""
The returned values are w = 0.0824 and b = 0.1763, and the scatter plot with this line looks like this:
[scatter plot with the gradient-descent line]
When I instead compute the model with statsmodels, the values are w = 0.0475 and b = 7.0326, and they look much more correct. The plot of that model is this:
[scatter plot with the statsmodels line]
I wonder what is wrong with my gradient descent function. I know my question is long, but I am really curious about it.
I tried it with a smaller dataset and it worked, but I don't know whether something is wrong with my gradient descent or whether it is about the dataset.
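For reference, the statsmodels fit I compared against was roughly along these lines (a minimal sketch, assuming the same CSV and column names):
import pandas as pd
import statsmodels.api as sm

data = pd.read_csv("Advertising.csv")
X = sm.add_constant(data["TV"])            # intercept column plus the TV spend
ols_model = sm.OLS(data["sales"], X).fit()
print(ols_model.params)                    # const ~ 7.03, TV ~ 0.0475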

Related

Unexpected value of cost function in Logistic regression

I have been trying to write Python code for logistic regression, but the results show an unexpectedly high value of the cost function. I have created random variables X and Y and added a noise term to Y that flips an element of Y based on the probability theta. This is my code:
import numpy as np
from scipy.stats import bernoulli
rg = np.random.default_rng(100)
def data_generate(n, m, theta):
    X_0 = np.ones((n, 1))
    X = np.random.normal(loc=0.0, scale=1.0, size=(n, m))
    X = np.concatenate((X_0, X), axis=1)
    beta = rg.random((m+1, 1))
    Y = np.zeros((n, 1))
    P = 1.0/(1.0 + np.exp(-np.dot(X, beta)))
    for i in range(len(P)):
        if P[i] >= 0.5:
            Y[i] = 1
        else:
            Y[i] = 0
    # Noise addition
    noise = bernoulli.rvs(size=(n, 1), p=theta)
    for j in range(len(noise)):
        if noise[i] == 1:
            Y[i] = int(not(Y[i]))
        else:
            pass
    return X, Y, beta
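# (Side note, not part of my original run: a quick shape check of one call.
#  With n=200 and m=3 the ones column makes X (200, 4), Y (200, 1), beta (4, 1).)
X_chk, Y_chk, beta_chk = data_generate(200, 3, 0.1)
print(X_chk.shape, Y_chk.shape, beta_chk.shape)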
def Gradient_Descent(X, Y, k, tollerence, learning_rate):
    n, m = np.shape(X)
    beta = rg.random((m, 1))
    costs = []
    initial_cost = 0.0
    for i in range(k):
        Y_pred = 1.0/(1.0 + np.exp(-np.dot(X, beta)))
        cost = np.mean(np.dot(Y.T, np.log(Y_pred)) + np.dot((1-Y).T, np.log(1-Y_pred)))
        if (abs(cost - initial_cost) <= tollerence):
            break
        else:
            beta = beta - learning_rate*(np.mean(np.dot(X.T, (Y_pred - Y))))
            initial_cost = cost
            costs.append(cost)
    return cost, beta, i
X = data_generate(200, 3, 0.1)[0]
Y = data_generate(200, 3, 0.1)[1]
Gradient_Descent(X, Y, 10000, 1e-6, 0.01)
# Output of the code:
(-154.7689765716959,
 array([[-0.02218003],
        [-0.1182535 ],
        [ 0.1169462 ],
        [ 0.58610747]]),
 14)
Please tell me what the problem with the code is.

Learning parameters with gradient descent

I just started an ML course and I'm trying to run gradient descent in Python. The functions below work fine, but as I move on to the bigger chunk where I do the actual learning, I just can't get the expected output or learn the right parameters, as you can tell from the decision boundary I plotted afterwards. I'm trying to figure out why.
[plot: the decision boundary]
def sigmoid(z):
    sigma = 1/(1+np.exp(-z))
    return sigma

def compute_cost(X, y, w, b):
    y_hat = sigmoid((X * np.expand_dims(w, axis=0)).sum(axis=1) + b)
    total_cost = (-y * np.log(y_hat) - (1-y) * np.log(1-y_hat)).mean()
    return total_cost

def compute_gradient(X, y, w, b):
    z = w * X + b
    yhat = sigmoid(z)
    y1 = np.expand_dims(y, axis=1)
    error = yhat - y1
    db = error.mean()
    dw_j1 = (X * error)
    dw_j = np.mean(dw_j1, axis=0)
    return dw_j, db
Before building the gradient descent function below, I tested all of the above with my training data and they all work and output the correct numbers. I'd really appreciate it if you can spot my mistakes.
Learning parameters with gradient descent
def gradient_descent(X, y, w, b, alpha, num_iters):
    m = len(X)
    J_history = []
    wb_history = []
    for i in range(num_iters):
        cost = compute_cost(X, y, w, b)
        dw_j, db = compute_gradient(X, y, w, b)
        w = w - alpha * dw_j
        b = b - alpha * db
        wb_history.append((w, b))
        J_history.append(cost)
        if i % math.ceil(num_iters/10) == 0 or i == (num_iters-1):
            print(f"Iteration {i:4}: Cost {float(J_history[-1]):8.2f}")
    return w, b, J_history, wb_history
np.random.seed(1)
initial_w = 0.01 * (np.random.rand(2) - 0.5)
initial_b = -8
iterations = 10000
alpha = 0.001
w, b, J_history, _ = gradient_descent(X_train, y_train, initial_w, initial_b, alpha, iterations)

A Simple Linear Regression with One Neuron Gives Incorrect Results

I am trying to code a NN with one neuron. I have one input (x) and a bias (b), and I want to solve a simple regression problem: find the weight and bias for the equation (my activation is just the identity, y = x)
y = 0.3 * x + 2.
The closest results I am getting are:
w = 0.38178107 (expected: ~0.3)
b = 1.10040842 (expected: ~2.0)
My question is: why are my results so far from the expected values? Am I falling into an over/underfitting problem or a buffer overflow?
I took into consideration the relationship between the learning rate and the number of iterations.
I know my training data is small, but I am looping through each data entry 100 times. I also tried increasing the training set to 100 entries and reducing the looping for each entry to 10 times; the results were even further off, something like w = ~3.067 and b = ~-3.098.
Here are the steps I followed:
My training data is x: 1~10 & y:2.3~5.0. Training: [(1, 2.3), .., (10, 5.0)]
The derivatives used:
dE_dw = -(y-A)*x #gradient
new_w = w - lr * dE_dw
dE_db = -(y-A) #gradient
new_b = b - lr * dE_db
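(As a concrete check of one update step with these formulas: starting from w = b = 0 with lr = 0.001 and the first training point (1, 2.3), A = 0*1 + 0 = 0, so dE_dw = -(2.3 - 0)*1 = -2.3 and dE_db = -2.3, giving new_w = 0.0023 and new_b = 0.0023.)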
The Code:
import random as r

# function: calculate gradient for weight w for the x input or weight b for bias input
def calc_new_Weight(v, lr, grad):
    # v is value of the weight
    # lr is learning rate
    # grad is gradient
    new_v = v - lr * grad
    return new_v

# linear cost function y=x
def costFunc(s): return s

def nn(x, y, w, b, lr):
    s = x*w + 1*b
    A = costFunc(s)
    # Error: E = 0.5 (y - a) ** 2
    # partial deriv E w/ respect to w
    dE_dw = -1*(y-A)*x
    w_new = calc_new_Weight(w, lr, grad=dE_dw)
    # partial deriv E w/ respect to b
    dE_db = -1*(y-A)
    b_new = calc_new_Weight(b, lr, grad=dE_db)
    return (w_new, b_new)

def main():
    # random init weights w, b for the inputs x, b
    w = r.random()
    b = r.random()
    for x, y in data:
        # y = 0.3*x + 2
        for i in range(1, 100):
            # update w, b with the new weights
            w, b = nn(x, y, w, b, lr=0.001)
    print(w, b)
If you can help me understand this I really appreciate your time.
Thank you in advance
Your gradients are correct, but the gradient descent algorithm in the code needs to be modified a little.
The gradient descent algorithm goes like this (reference):
t <- 0
max_iterations <- 1000
Initialize W/theta (weights)
while t++ < max_iterations do
    H = Forward_propogate(Inputs, W)
    delta_W = Backward_propogation(H)
    W -= n*delta_W
end
In Python it looks like this:
w, b, lr, max_iterations = r.random(), r.random(), 0.001, 1000
for i in range(max_iterations):
    dw, db = 0, 0
    for x, y in data:
        # return dE_dw and dE_db from nn instead of the updated w and b
        d_w, d_b = nn(x, y, w, b, lr)
        dw += d_w
        db += d_b
    w = w - lr * dw
    b = b - lr * db
And if you want to do stochastic gradient descent (i.e. update w and b at every point in the data), the code would be:
w, b, lr, max_iterations = r.random(), 0, 0.001, 1000
for i in range(max_iterations):
    for x, y in data:
        d_w, d_b = nn(x, y, w, b, lr)
        w = w - lr * d_w
        b = b - lr * d_b
Implementing SGD with your example, with 50 data points and 1000 iterations, and initializing w randomly and b to 0, we can consistently converge to the expected values:
import random as r
import matplotlib.pyplot as plt

# function: calculate gradient for weight w for the x input or weight b for bias input
def calc_new_Weight(v, lr, grad):
    # v is value of the weight
    # lr is learning rate
    # grad is gradient
    new_v = v - lr * grad
    return new_v

# linear cost function y=x
def costFunc(s): return s

def nn(x, y, w, b, lr):
    s = x*w + 1*b
    A = costFunc(s)
    # Error: E = 0.5 (y - a) ** 2
    # partial deriv E w/ respect to w
    dE_dw = -1*(y-A)*x
    # w_new = calc_new_Weight(w, lr, grad = dE_dw)
    # partial deriv E w/ respect to b
    dE_db = -1*(y-A)
    # b_new = calc_new_Weight(b, lr, grad = dE_db)
    return (dE_dw, dE_db)

def main():
    # random init weight w for the input x; bias b starts at 0
    w = r.random()
    b = 0
    x = list(range(1, 50))
    y = [(0.3*i + 2) for i in x]
    data = list(zip(x, y))
    # r.shuffle(data)
    for i in range(1, 1000):
        dw, db = 0, 0
        for x, y in data:
            # y = 0.3*x + 2
            # update w, b with the new weights
            d_w, d_b = nn(x, y, w, b, lr=0.001)
            dw += d_w
            db += d_b
            w = w - (0.001*d_w)
            b = b - (0.001*d_b)
    return w, b

w, b = main()
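# With these settings (50 points, lr=0.001, ~1000 passes over the data) the result
# should land close to the expected w = 0.3 and b = 2.
print(round(w, 3), round(b, 3))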

Gradient Descent returns NaN values for slope and error

I'm new to machine learning and am trying to implement gradient descent. The code I have looks like this and has been resulting in NaN values for all parameters:
from numpy import genfromtxt, array

def compute_error_for_line_given_points(b, m, points):
    totalError = 0  # sum of squared error formula
    for i in range(0, len(points)):
        x = points[i, 0]
        y = points[i, 1]
        totalError += (y - (m*x + b)) ** 2
    return totalError / float(len(points))
def step_gradient(b_current, m_current, points, learning_rate):
    # gradient descent
    b_gradient = 0
    m_gradient = 0
    N = float(len(points))
    for i in range(0, len(points)):
        x = points[i, 0]
        y = points[i, 1]
        b_gradient += -(2/N) * (y - (m_current * x + b_current))
        m_gradient += -(2/N) * x * (y - (m_current * x + b_current))
    new_b = b_current - (learning_rate * b_gradient)
    new_m = m_current - (learning_rate * m_gradient)
    return [new_b, new_m]
def gradient_descent_runner(points, starting_b, starting_m, learning_rate, num_iterations):
    b = starting_b
    m = starting_m
    for i in range(num_iterations):
        b, m = step_gradient(b, m, array(points), learning_rate)
    return [b, m]
def run():
    # Step 1: Collect the data
    points = genfromtxt("C:/Users/mishruti/Downloads/For Linear Regression.csv", delimiter=",")
    # Step 2: Define our hyperparameters
    learning_rate = 0.0000001  # how fast the model converges
    # y = mx + b (slope formula)
    initial_b = 0  # initial y-intercept guess
    initial_m = 0  # initial slope guess
    num_iterations = 4
    print("Starting gradient descent at b = {0}, m = {1}, error = {2}".format(initial_b, initial_m, compute_error_for_line_given_points(initial_b, initial_m, points)))
    print("Running...")
    [b, m] = gradient_descent_runner(points, initial_b, initial_m, learning_rate, num_iterations)
    print("After {0} iterations b = {1}, m = {2}, error = {3}".format(num_iterations, b, m, compute_error_for_line_given_points(b, m, points)))

# main function
if __name__ == "__main__":
    run()
A sample from my data set is attached. Can someone please help me figure this out? Thanks!

Scipy Minimization TNC Working, But Not CG

I'm trying to complete week 4 of the Machine Learning course on Coursera. The assignment uses the MNIST data for multi-class classification.
The dimensions are X (5000, 401), y (5000, 1), and theta (10, 401), all of which start off as arrays. X has 1's inserted as the first feature column.
My cost and gradient functions are below:
import numpy as np
import scipy.optimize as opt

def sigmoid(z):
    g = 1 / (1 + np.exp(-z))
    return g

def lrCostFunction(theta, X, y, my_lambda):
    m = float(len(X))
    theta = np.matrix(theta)
    X = np.matrix(X)
    y = np.matrix(y)
    # cost function:
    term1 = np.multiply(-y, np.log(sigmoid(X*theta.T)))
    term2 = np.multiply((1-y), np.log(1-sigmoid(X*theta.T)))
    reg = np.power(theta[:, 1:theta.shape[1]], 2)
    J = np.sum(term1-term2)/m + (my_lambda/(2.0*m) * np.sum(reg))
    return J

def gradient(theta, X, y, my_lambda):
    m = float(len(X))
    theta = np.matrix(theta)
    X = np.matrix(X)
    y = np.matrix(y)
    # gradient:
    error = sigmoid(X * theta.T) - y
    g = (X.T * error / (m)).T + ((my_lambda/m) * theta)
    g[0, 0] = np.sum(np.multiply(error, X[:, 0])) / m
    return g
Here is my One vs All classification function with the TNC optimization:
def oneVsAll(X, y, num_labels, my_lambda):
    m = float(X.shape[0])
    n = float(X.shape[1]) - 1
    all_theta = np.zeros((num_labels, n+1))
    for K in range(1, num_labels + 1):
        theta = np.zeros(n+1)
        y_logical = np.array([1 if j == K else 0 for j in y]).reshape(m, 1)
        opt_theta = opt.minimize(fun=lrCostFunction, x0=theta,
                                 args=(X, y_logical, my_lambda),
                                 method='TNC', jac=gradient).x
        all_theta[K-1, :] = opt_theta
    return all_theta
When I try to run CG, however, it returns this error at line 8: "shapes (1,401) and (1,401) not aligned: 401 (dim 1) != 1 (dim 0)":
def oneVsAll(X, y, num_labels, my_lambda):
    m = float(X.shape[0])
    n = float(X.shape[1]) - 1
    all_theta = np.zeros((num_labels, n+1))
    for K in range(1, num_labels + 1):
        theta = np.zeros(n+1)
        y_logical = np.array([1 if j == K else 0 for j in y]).reshape(m, 1)
        opt_theta = opt.fmin_cg(f=lrCostFunction, x0=theta,
                                fprime=gradient,
                                args=(X, y_logical, my_lambda))
        all_theta[K-1, :] = opt_theta
    return all_theta
I saw elsewhere that CG only likes 1-D vectors for y. If I try to flatten y or reduce its dimension, however, everything else breaks. Is it generally a bad idea to use np.matrix as opposed to using np.dot with arrays? I like being able to easily transpose with matrices.
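From what I read, fmin_cg keeps theta as a flat 1-D array and expects fprime to return an array of the same shape, so I assume a plain-array gradient would have to look roughly like this sketch (the names are mine, not what I currently have):
def gradient_1d(theta, X, y, my_lambda):
    # theta arrives from fmin_cg as a flat (401,) array
    m = X.shape[0]
    theta_col = theta.reshape(-1, 1)                  # column vector for the algebra
    error = sigmoid(X @ theta_col) - y                # shape (m, 1)
    g = (X.T @ error) / m + (my_lambda / m) * theta_col
    g[0, 0] -= (my_lambda / m) * theta_col[0, 0]      # leave the bias term unregularized
    return g.ravel()                                  # back to 1-D, matching x0's shape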
Any help would be greatly appreciated.
