Implementation of backpropagation in Python - python

I'm following the Andrew-Ng course on Machine Learning and I'm currently doing the week 5 exercise.
I've found myself stuck on the implementation of the backpropagation algorithm, due to the fact that the relative difference, compared to numerical gradient, is very high (order of 1e-1), but I can't find any error within my implementation, so I'm gently asking if someone could take a look at it and explain what I did wrong.
Forward propagation:
def forward_propagation(thetas, X, history=False):
activation_arr = []
a = X # X is the array of the first activation values
for k in range(0, len(thetas)):
a = add_intercept(a) # add the bias unit
a = sigmoid(a # thetas[k].T)
if history:
return activation_arr if history else a
def gradient_nn(thetas, X, y, num_labels, reg_lambda=None):
n_examples = X.shape[0]
Y = np.zeros(( # creates a n_examples X num_labels matrix
n_examples, # n of examples
for i in range(n_examples):
Y[i, y[i, 0]] = 1 # the index corresponding to the correct label for each row has value = 1
# add intercepted X to the activation array
activation_arr = [add_intercept(X)] + forward_propagation(thetas, X, history=True)
sigma = [activation_arr[-1] - Y] # sigma^L = a^L - y
delta = [sigma[-1].T # activation_arr[-2]] # find delta for the first row
thetas_grad = []
# Calculate sigma and delta
for idx in range(1, len(thetas)): # skip last iteration
sigma = [
(sigma[0] # thetas[-idx][:, 1:]) * partial_derivative(activation_arr[-1-idx])
] + sigma
delta = [
sigma[0].T # activation_arr[-2-idx]
] + delta
return [np.sum(d) / n_examples for d in thetas_grad]
Partial derivative:
def partial_derivative(a):
return a * (1 - a) # element wise multiplication
Numerical gradient:
def compute_numerical_gradient(cost_function, thetas):
# Unroll parameters
nn_params = unroll_thetas(thetas)
num_grad = np.zeros(nn_params.shape)
perturb = np.zeros(nn_params.shape)
shapes = [theta.shape for theta in thetas]
epsilon = 1e-4 # not the one of random initialization
for p in range(nn_params.shape[0]):
# Set perturbation vector
perturb[p] = epsilon
minus_theta = nn_params - perturb
plus_theta = nn_params + perturb
# --- Roll params back in order to use the cost function ---
minus_theta = roll_thetas(minus_theta, shapes)
plus_theta = roll_thetas(plus_theta, shapes)
# calculate the loss of the cost function
minus_loss = cost_function(minus_theta)
plus_loss = cost_function(plus_theta)
# Compute Numerical Gradient
num_grad[p] = (plus_loss - minus_loss) / (2 * epsilon)
perturb[p] = 0
num_grad = roll_thetas(num_grad, shapes)
return [np.sum(num_g) for num_g in num_grad]
Cost function:
def J_nn(num_labels, reg_lambda=None):
def non_reg_func(thetas, X, y):
n_examples = X.shape[0]
Y = np.zeros(( # creates a n_examples X num_labels matrix
n_examples, # n of examples
for i in range(n_examples):
Y[i, y[i, 0]] = 1 # the index corresponding to the correct label for each row has value = 1
prediction = forward_propagation(thetas, X)
return np.sum(np.sum(-Y * np.log(prediction) - (1 - Y) * np.log(1 - prediction))) / n_examples
if reg_lambda is None:
func = non_reg_func
else: # regularization
def func(thetas, X, y):
cost = non_reg_func(thetas, X, y)
for theta in thetas: # regularize for every layer
theta = theta[1:] # remove bias unit
cost = cost + (reg_lambda / (2 * y.shape[0])) * np.sum(np.sum(theta[:, ] ** 2))
return cost
return func
Checking backpropagation with numerical gradient:
def check_nn_gradients(reg_lambda=None):
Creates a small neural network (max 8 x 8 x 7 x 8) and checks that
the implementation of the backpropagation algorithm is good
#n_examples, sizes = random.randint(5, 10), [random.randint(2, 8), random.randint(2, 8), random.randint(1, 8)]
n_examples, sizes = 5, [8, 8, 5, 4]
n_labels = sizes[-1] # Last size is equal to the number of labels
init_epsilon = 0.0001
thetas = random_init_thetas(sizes, init_epsilon)
X = np.array(
random_init_thetas([sizes[0]-1, n_examples], init_epsilon)
).squeeze() # We squeeze it because random_init_thetas returns a 3D array, but we want X to be 2D
y = np.array([random.randint(0, n_labels-1) for _ in X])
y = y[:, np.newaxis]
inner_cost = lambda _thetas: J_nn(n_labels, reg_lambda)(_thetas, X, y)
gradients = gradient_nn(thetas, X, y, n_labels, 0)
unrolled_gradients = unroll_thetas(gradients)
# finite difference method
grad_checking_epsilon = 1e-4
num_grad = compute_numerical_gradient(inner_cost, thetas)
unrolled_num_grad = unroll_thetas(num_grad)
return diff = np.linalg.norm(unrolled_num_grad - unrolled_gradients) / np.linalg.norm(unrolled_num_grad + unrolled_gradients)


Unexpected value of cost function in Logistic regression

I been trying to write a python code for logistic regression but the results are showing very high value of cost function which is unexpected. I have created a random variable X and Y and added a noise term to Y which will flip the element of based on the probability theta. This is my code:
import numpy as np
from scipy.stats import bernoulli
rg = np.random.default_rng(100)
def data_generate(n, m, theta):
X_0 = np.ones((n, 1))
X = np.random.normal(loc=0.0, scale=1.0, size=(n, m))
X = np.concatenate((X_0, X), axis = 1)
beta = rg.random((m+1, 1))
Y = np.zeros((n, 1))
P = 1.0/(1.0 + np.exp(, beta)))
for i in range(len(P)):
if P[i] >= 0.5:
Y[i] = 1
Y[i] = 0
# Noise addition
noise = bernoulli.rvs(size=(n,1), p=theta)
for j in range(len(noise)):
if noise[i] == 1:
Y[i] = int(not(Y[i]))
return X, Y, beta
def Gradient_Descent(X, Y, k, tollerence, learning_rate):
n,m = np.shape(X)
beta = rg.random((m, 1))
costs = []
initial_cost = 0.0
for i in range(k):
Y_pred = 1.0/(1.0 + np.exp(, beta)))
cost = np.mean(, np.log(Y_pred)) +, np.log(1-Y_pred)))
if (abs(cost - initial_cost) <= tollerence):
beta = beta - learning_rate*(np.mean(, (Y_pred - Y))))
initial_cost = cost
return cost, beta, i
X = data_generate(200, 3, 0.1)[0]
Y = data_generate(200, 3, 0.1)[1]
Gradient_Descent(X, Y, 10000, 1e-6, 0.01)
# Output of code :
[-0.1182535 ],
[ 0.1169462 ],
[ 0.58610747]]),
Please tell what is the problem with the code.

Neural network back propagation regression, how to correctly learn the cos function?

After Lutz Lehmann's suggestion, I discovered that it was a problem of random weights and biases. I used np.ramdom.seed(2021) to specify the random seed number, and the error has not converged. But if I use np.ramdom.seed(10) as the random seed number,the 600th ephoch error will converge to a relatively small amount.
Galletti_Lance's suggestion is correct and should be replaced with a periodic activation function. I expanded the interval of the sin function, and the learning error did not converge.Sure enough, it is overfitting.
input_data = np.arange(0, np.pi * 4, 0.1) # input
correct_data = np.sin(input_data) # correct answer
input_data = (input_data - np.pi*2) / np.pi
np.random.seed(2021) Learning cos function, the 20000th epoch is as follows:
Epoch:0/20001 Error:0.2904405534384431
Epoch:200/20001 Error:0.2752981376571506
Epoch:400/20001 Error:0.27356300803051226
Epoch:600/20001 Error:0.27409878767315193
Epoch:800/20001 Error:0.2638216736165815
Epoch:1000/20001 Error:0.27196157366033213
Epoch:1200/20001 Error:0.2743520487664953
Epoch:1400/20001 Error:0.2589745966244678
Epoch:1600/20001 Error:0.2705289192984957
Epoch:1800/20001 Error:0.2689693217636388
Epoch:20000/20001 Error:0.2678723095120438
But if I use np.ramdom.seed(10) as the random seed number,the 600th ephoch error will converge to a relatively small amount.
Epoch:0/20001 Error:0.283958515549615
Epoch:200/20001 Error:0.260819823215878
Epoch:400/20001 Error:0.23267630899157743
Epoch:600/20001 Error:0.0022589485429890047
Epoch:800/20001 Error:0.0007425256677052262
Epoch:1000/20001 Error:0.0003946220094805989
Epoch:2800/20001 Error:0.00011495288247859594
Epoch:3000/20001 Error:9.989662843897715e-05
Epoch:20000/20001 Error:4.6146397913360866e-05
np.random.seed(10) Learning cos function, the 600th epoch is as follows:
I use neural network back propagation regression to learn the cos function. When I learn the sin function, it is normal. If it is changed to cos, it is abnormal. What is the problem?
correct_data = np.cos(input_data)
Related settings:
1.The activation function of the middle layer: sigmoid function
2.Excitation function of the output layer: identity function
3.Loss function: sum of squares error
4.Optimization algorithm: stochastic gradient descent method
5.Batch size: 1
My code is as follows:
import numpy as np
import matplotlib.pyplot as plt
# - Prepare to input and correct answer data -
input_data = np.arange(0, np.pi * 2, 0.1) # input
correct_data = np.cos(input_data) # correct answer
input_data = (input_data - np.pi) / np.pi # Converge the input to the range of -1.0-1.0
n_data = len(correct_data) # number of data
# - Each setting value -
n_in = 1 # The number of neurons in the input layer
n_mid = 3 # The number of neurons in the middle layer
n_out = 1 # The number of neurons in the output layer
wb_width = 0.01 # The spread of weights and biases
eta = 0.1 # learning coefficient
epoch = 2001
interval = 200 # Display progress interval practice
# -- middle layer --
class MiddleLayer:
def __init__(self, n_upper, n): # Initialize settings
self.w = wb_width * np.random.randn(n_upper, n) # weight (matrix)
self.b = wb_width * np.random.randn(n) # offset (vector)
def forward(self, x): # forward propagation
self.x = x
u =, self.w) + self.b
self.y = 1 / (1 + np.exp(-u)) # Sigmoid function
def backward(self, grad_y): # Backpropagation
delta = grad_y * (1 - self.y) * self.y # Differentiation of Sigmoid function
self.grad_w =, delta)
self.grad_b = np.sum(delta, axis=0)
self.grad_x =, self.w.T)
def update(self, eta): # update of weight and bias
self.w -= eta * self.grad_w
self.b -= eta * self.grad_b
# - Output layer -
class OutputLayer:
def __init__(self, n_upper, n): # Initialize settings
self.w = wb_width * np.random.randn(n_upper, n) # weight (matrix)
self.b = wb_width * np.random.randn(n) # offset (vector)
def forward(self, x): # forward propagation
self.x = x
u =, self.w) + self.b
self.y = u # Identity function
def backward(self, t): # Backpropagation
delta = self.y - t
self.grad_w =, delta)
self.grad_b = np.sum(delta, axis=0)
self.grad_x =, self.w.T)
def update(self, eta): # update of weight and bias
self.w -= eta * self.grad_w
self.b -= eta * self.grad_b
# - Initialization of each network layer -
middle_layer = MiddleLayer(n_in, n_mid)
output_layer = OutputLayer(n_mid, n_out)
# -- learn --
for i in range(epoch):
# Randomly scramble the index value
index_random = np.arange(n_data)
# Used for the display of results
total_error = 0
plot_x = []
plot_y = []
for idx in index_random:
x = input_data[idx:idx + 1] # input
t = correct_data[idx:idx + 1] # correct answer
# Forward spread
middle_layer.forward(x.reshape(1, 1)) # Convert the input to a matrix
# Backpropagation
output_layer.backward(t.reshape(1, 1)) # Convert the correct answer to a matrix
# Update of weights and biases
if i % interval == 0:
y = output_layer.y.reshape(-1) # Restore the matrix to a vector
# Error calculation
total_error += 1.0 / 2.0 * np.sum(np.square(y - t)) # Square sum error
# Output record
if i % interval == 0:
# Display the number of epochs and errors
print("Epoch:" + str(i) + "/" + str(epoch), "Error:" + str(total_error / n_data))
# Display the output with a graph
plt.plot(input_data, correct_data, linestyle="dashed")
plt.scatter(plot_x, plot_y, marker="+")
If increasing the number of epochs worked, the model needed more training.
But you may be overfitting... Notice that the cosine function is a periodic function, yet you are using only monotonic functions (sigmoid, and identity) to approximate it.
So while on the bounded interval of your data it may work:
It does not generalize well:
Code for the above plots:
import math as m
import numpy as np
import matplotlib.pyplot as plt
import sklearn.datasets as datasets
from tensorflow import keras
from tensorflow.keras import layers
t, _ = datasets.make_blobs(n_samples=7500, centers=[[0, 0]], cluster_std=1, random_state=0)
X = np.array(list(filter(lambda x : m.cos(4*x[0]) - x[1] < -.5 or m.cos(4*x[0]) - x[1] > .5, t)))
Y = np.array([1 if m.cos(4*x[0]) - x[1] >= 0 else LABEL for x in X])
model = keras.models.Sequential()
model.add(layers.Dense(8, input_dim=2, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(loss="binary_crossentropy"), Y, batch_size=500, epochs=3000)
# create a mesh to plot in
h = .02 # step size in the mesh
x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
meshData = np.c_[xx.ravel(), yy.ravel()]
fig, ax = plt.subplots()
Z = model.predict(meshData)
Z = Z.reshape(xx.shape)
ax.contourf(xx, yy, Z, alpha=.3,
# Plot also the training points
T = model.predict(X)
T = T.reshape(X[:,0].shape)
ax.scatter(X[:, 0], X[:, 1], color=colors[T].tolist(), s=10, alpha=0.9)
# add duplicate plotting code here to generate second plot
# predicting on data generated from a blob
# with a larger standard deviation

Why do the parameters (beta) blow up in LMS with Kernel Trick?

I am attempting to solve an LMS problem using kernel trick. The reference is taken from this note.
My problem is that the value of beta blows up. It reaches to the order of 10e17 on just about 5 or 6 iterations. One thing I noticed is that the signs entries in beta oscillate (positive to negative and vice versa) on consecutive update of beta in the loop.
Also, lowering the value of alpha (learning rate) does not help. I tried alpha = 0.01 but the value of beta still blows up.
Here's the code
import numpy as np
import pandas as pd
file = pd.read_csv("weatherHistory.csv")
#selecting only the required columns
useful_index = [3, 4, 5, 6, 7, 8, 10]
file = file.iloc[:, useful_index]
# getting training set and test set
file_randomized = file.sample(frac=1, random_state=1)
# useable data size
usable_dataset_size = 1200
# training set size
training_set_index = int(0.75 * usable_dataset_size)
# get rid of unnecessary data
file = file_randomized[:usable_dataset_size]
# make training and test set
training_set = file[:training_set_index]
test_set = file[training_set_index:]
# Select the columns
input_index = [0, 2, 3, 4, 5, 6]
X = training_set.iloc[:, input_index]
output_index = [1]
Y = training_set.iloc[:, output_index]
# Convert to numpy into suitable format
X = X.to_numpy()
n = X.shape[0]
d = X.shape[1]
Y = Y.to_numpy()
# This function calculates K(phi(x), phi(y))
def kernel_matrix(K, degree=1):
"""K is the matrix of dot product. This applies kernel function to the matrix"""
result = np.zeros(K.shape)
for i in range(0, degree+1):
result += np.power(K, i)
return result
# Main training function
def lms_with_kt(X, Y, alpha, degree = 1, num_iters = 1000):
"""X: nxd vector, Y: nx1 vector, beta: nx1 zero vector, alpha: number, degree: number"""
# normalize x
x_min = X.min(axis = 0, keepdims=True)
x_max = X.max(axis = 0, keepdims=True)
X = (X - x_min) / (x_max - x_min)
n = X.shape[0]
# add the column of 1 in the front
X = np.hstack((np.ones((n, 1)), X))
# make K_matrix (kernel matrix)
K = np.matmul(X, X.T)
K = kernel_matrix(K, degree)
# initialize beta
beta = np.zeros((n, 1))
# update beta
for i in range(num_iters):
beta += alpha * (Y - np.matmul(K, beta))
def predict(x):
"""x: 1xd matrix"""
x_norm = (x - x_min) / (x_max - x_min)
n_predict = x_norm.shape[0]
x_norm = np.hstack((np.ones((n_predict, 1)), x_norm))
K_for_prediction = np.matmul(X, x_norm.T)
K_for_prediction = kernel_matrix(K_for_prediction, degree)
return, K_for_prediction)
return predict
predictor = lms_with_kt(X, Y, 0.1, 2, 1000)
The link to the dataset is here.

Curve fitting with gradient descent

I wrote some code that performs gradient descent on a couple of data points.
For some reason the curve is not converging correctly, but I have no idea why that is. I always end up with an exploding tail.
Am I doing one of the computations wrong? Am I actually getting stuck in a local minimum or is it something else?
Here is my code:
import numpy as np
import matplotlib.pyplot as plt
def estimate(weights, x, order):
est = 0
for i in range(order):
est += weights[i] * x ** i
return est
def cost_function(x, y, weights, m):
cost = 0
for i in range(m-1):
cost += (((weights[i] * x ** i) - y) ** 2)
return (np.sum(cost ** 2) / ( 2 * m ))
def descent(A, b, iterations, descent_rate, order):
x = A.T[0]
y = b.reshape(4)
# features
ones = np.vstack(np.ones(len(A)))
x = np.vstack(A.T[0])
x2 = np.vstack(A.T[0] ** 2)
# Our feature matrix
features = np.concatenate((ones,x,x2), axis = 1).T
# Initialize our coefficients to zero
weights = np.zeros(order + 1)
m = len(y)
# gradient descent
for i in range(iterations):
est = estimate(weights, x, order).T
difference = est - y
weights = weights + (-descent_rate * (1/m) * np.matmul(difference, features.T)[0])
cost = cost_function(x, y, weights, m)
u = np.linspace(0,3,100)
plt.plot(u, (u ** 2) * weights[2] + u * weights[1] + weights[0], '-')
A = np.array(((0,1),
b = np.array((1,2,0,3), ndmin = 2 ).T
iterations = 150
descent_rate = 0.01
order = 2
descent(A, b, iterations, descent_rate, order)
I would like to avoid getting stuck in such a minimum. I have attempted setting the initial weights to random values but to no avail, sometimes it dips a bit more but then gives me the same behaviour again.
Here is the one of the plots that I am getting:
And here is the expected result obtained by a least squares solution:
Your estimate function should be
def estimate(weights, x, order):
est = 0
for i in range(order+1):
est += weights[i] * x ** i
return est
Better yet, since the order information is already present in the size of the weights vector, remove the redundancy with:
def estimate(weights, x):
est = 0
for i in range(len(weights)):
est += weights[i] * x ** i
return est
This is what I got when using your code and running 2000 iterations:

Regularized Logistic Regression in Python (Andrew ng Course)

I'm starting the ML journey and I'm having troubles with this coding exercise
here is my code
import numpy as np
import pandas as pd
import scipy.optimize as op
# Read the data and give it labels
data = pd.read_csv('ex2data2.txt', header=None, name['Test1', 'Test2', 'Accepted'])
# Separate the features to make it fit into the mapFeature function
X1 = data['Test1'].values.T
X2 = data['Test2'].values.T
# This function makes more features (degree)
def mapFeature(x1, x2):
degree = 6
out = np.ones((x1.shape[0], sum(range(degree + 2))))
curr_column = 1
for i in range(1, degree + 1):
for j in range(i+1):
out[:,curr_column] = np.power(x1, i-j) * np.power(x2, j)
curr_column += 1
return out
# Separate the data into training and target, also initialize theta
X = mapFeature(X1, X2)
y = np.matrix(data['Accepted'].values).T
m, n = X.shape
cols = X.shape[1]
theta = np.matrix(np.zeros(cols))
#Initialize the learningRate(sigma)
learningRate = 1
# Define the Sigmoid Function (Output between 0 and 1)
def sigmoid(z):
return 1 / (1 + np.exp(-z))
def cost(theta, X, y, learningRate):
# This is require to make the optimize function work
theta = theta.reshape(-1, 1)
error = sigmoid(X # theta)
first = np.multiply(-y, np.log(error))
second = np.multiply(1 - y, np.log(1 - error))
j = np.sum((first - second)) / m + (learningRate * np.sum(np.power(theta, 2)) / 2 * m)
return j
# Define the gradient of the cost function
def gradient(theta, X, y, learningRate):
# This is require to make the optimize function work
theta = theta.reshape(-1, 1)
error = sigmoid(X # theta)
grad = (X.T # (error - y)) / m + ((learningRate * theta) / m)
grad_no = (X.T # (error - y)) / m
grad[0] = grad_no[0]
return grad
Result = op.minimize(fun=cost, x0=theta, args=(X, y, learningRate), method='TNC', jac=gradient)
opt_theta = np.matrix(Result.x)
def predict(theta, X):
sigValue = sigmoid(X # theta.T)
p = sigValue >= 0.5
return p
p = predict(opt_theta, X)
print('Train Accuracy: {:f}'.format(np.mean(p == y) * 100))
So, when the learningRate = 1, the accuracy should be around 83,05% but I'm getting 80.5% and when the learningRate = 0, the accuracy should be 91.52% but I'm getting 87.28%
So the question is What am I doing wrong? Why my accuracy is below the problem default answer?
Hope someone can guide me in the right direction. Thanks!
P.D: Here is the dataset, maybe it can help
Hey guys I found a way to make it even better!
Here is the code
import numpy as np
import pandas as pd
import scipy.optimize as op
from sklearn.preprocessing import PolynomialFeatures
# Read the data and give it labels
data = pd.read_csv('ex2data2.txt', header=None, names=['Test1', 'Test2', 'Accepted'])
# Separate the data into training and target
X = (data.iloc[:, 0:2]).values
y = (data.iloc[:, 2:3]).values
# Modify the features to a certain degree (Polynomial)
poly = PolynomialFeatures(6)
m = y.size
XX = poly.fit_transform(data.iloc[:, 0:2].values)
# Initialize Theta
theta = np.zeros(XX.shape[1])
# Define the Sigmoid Function (Output between 0 and 1)
def sigmoid(z):
return(1 / (1 + np.exp(-z)))
# Define the Regularized cost function
def costFunctionReg(theta, reg, *args):
# This is require to make the optimize function work
h = sigmoid(XX # theta)
first = np.log(h).T # - y
second = np.log(1 - h).T # (1 - y)
J = (1 / m) * (first - second) + (reg / (2 * m)) * np.sum(np.square(theta[1:]))
return J
# Define the Regularized gradient function
def gradientReg(theta, reg, *args):
theta = theta.reshape(-1, 1)
h = sigmoid(XX # theta)
grad = (1 / m) * (XX.T # (h - y)) + (reg / m) * np.r_[[[0]], theta[1:]]
return grad.flatten()
# Define the predict Function
def predict(theta, X):
sigValue = sigmoid(X # theta.T)
p = sigValue >= 0.5
return p
# A loop to test between different values for sigma (reg parameter)
for i, Sigma in enumerate([0, 1, 100]):
# Optimize costFunctionReg
res2 = op.minimize(costFunctionReg, theta, args=(Sigma, XX, y), method=None, jac=gradientReg)
# Get the accuracy of the model
accuracy = 100 * sum(predict(res2.x, XX) == y.ravel()) / y.size
# Get the Error between different weights
error1 = costFunctionReg(res2.x, Sigma, XX, y)
# print the accuracy and error
print('Train accuracy {}% with Lambda = {}'.format(np.round(accuracy, decimals=4), Sigma))
Thanks for all your help!
try out this:
# import library
import pandas as pd
import numpy as np
dataset = pd.read_csv('ex2data2.csv',names = ['Test #1','Test #2','Accepted'])
# splitting to x and y variables for features and target variable
x = dataset.iloc[:,:-1].values
y = dataset.iloc[:,-1].values
print('x[0] ={}, y[0] ={}'.format(x[0],y[0]))
m, n = x.shape
print('#{} Number of training samples, #{} features per sample'.format(m,n))
# import library FeatureMapping
from sklearn.preprocessing import PolynomialFeatures
# We also add one column of ones to interpret theta 0 (x with power of 0 = 1) by
include_bias as True
pf = PolynomialFeatures(degree = 6, include_bias = True)
x_poly = pf.fit_transform(x)
m,n = x_poly.shape
# define theta as zero
theta = np.zeros(n)
# define hyperparameter λ
lambda_ = 1
# reshape (-1,1) because we just have one feature in y column
y = y.reshape(-1,1)
def sigmoid(z):
return 1/(1+np.exp(-z))
def lr_hypothesis(x,theta):
def compute_cost(theta,x,y,lambda_):
theta = theta.reshape(n,1)
infunc1 = -y*(np.log(sigmoid(lr_hypothesis(x,theta)))) - ((1-y)*(np.log(1 - sigmoid(lr_hypothesis(x,theta)))))
infunc2 = (lambda_*np.sum(theta[1:]**2))/(2*m)
j = np.sum(infunc1)/m+ infunc2
return j
# gradient[0] correspond to gradient for theta(0)
# gradient[1:] correspond to gradient for theta(j) j>0
def compute_gradient(theta,x,y,lambda_):
gradient = np.zeros(n).reshape(n,)
theta = theta.reshape(n,1)
infunc1 = sigmoid(lr_hypothesis(x,theta))-y
gradient_in =,infunc1)/m
gradient[0] = gradient_in[0,0] # theta(0)
gradient[1:] = gradient_in[1:,0]+(lambda_*theta[1:,]/m).reshape(n-1,) # theta(j) ; j>0
gradient = gradient.flatten()
return gradient
You can now test your cost and gradient without optimization. Th below code will optimize the model:
# hyperparameters
m,n = x_poly.shape
# define theta as zero
theta = np.zeros(n)
# define hyperparameter λ
lambda_array = [0, 1, 10, 100]
import scipy.optimize as opt
for i in range(0,len(lambda_array)):
# Train
print('======================================== Iteration {} ===================================='.format(i))
optimized = opt.minimize(fun = compute_cost, x0 = theta, args = (x_poly, y,lambda_array[i]),
method = 'TNC', jac = compute_gradient)
new_theta = optimized.x
# Prediction
y_pred_train = predictor(x_poly,new_theta)
cm_train = confusion_matrix(y,y_pred_train)
t_train,f_train,acc_train = acc(cm_train)
print('With lambda = {}, {} correct, {} wrong ==========> accuracy = {}%'
Now you should see output like this :
=== Iteration 0 === With lambda = 0, 104 correct, 14 wrong ==========> accuracy = 88.13559322033898%
=== Iteration 1 === With lambda = 1, 98 correct, 20 wrong ==========> accuracy = 83.05084745762711%
=== Iteration 2 === With lambda = 10, 88 correct, 30 wrong ==========> accuracy = 74.57627118644068%
=== Iteration 3 === With lambda = 100, 72 correct, 46 wrong ==========> accuracy = 61.016949152542374%
