I have implemented logistic regression from scratch; however, when I run the script the algorithm always predicts the wrong label.
I've tried changing the training output and test_output by switching all 1s to 0s and vice versa, but it still always predicts the wrong label.
I also noticed that if I change the "-" sign to "+" when updating the weights and the bias, the script predicts the label correctly.
What am I doing wrong?
This is the code I've written:
import numpy as np
EPOCHS = 1000
def sigmoid(z):
    """Return the logistic function 1 / (1 + exp(-z)), applied element-wise."""
    return 1 / (1 + np.exp(-z))
def cost(y_pred, training_outputs, m):
    """Return the binary cross-entropy of the predictions, divided by m.

    Args:
        y_pred: Predicted probabilities.
        training_outputs: Target labels, same shape as ``y_pred``.
        m: Divisor applied to the summed loss (normally the sample count).

    Returns:
        The summed cross-entropy divided by ``m``.
    """
    # Keep probabilities strictly inside (0, 1): log(0) would otherwise turn
    # the result into nan or inf when a prediction saturates.
    eps = 1e-15
    p = np.clip(y_pred, eps, 1 - eps)
    j = -np.sum(training_outputs * np.log(p) + (1 - training_outputs) * np.log(1 - p)) / m
    return j
if __name__ == "__main__":
    # NOTE(review): LEARNING_RATE is not defined anywhere in the visible
    # script; the value below is a placeholder -- tune it as needed.
    LEARNING_RATE = 0.1

    # Training input and output
    x = np.array([[1, 1, 1], [0, 0, 0], [1, 0, 1]])
    training_outputs = np.array([1, 0, 1])

    # Test input and output
    test_input = np.array([[0, 1, 1]])
    test_output = np.array([0])

    # Weights: one entry per input column
    w = np.array([0.3, 0.3, 0.3])
    # Bias
    b = 0
    # Number of training samples
    m = x.shape[0]

    # Training
    for iteration in range(EPOCHS):
        print("Iteration n.", iteration, end="\r")
        # Compute log odds
        z = np.dot(x, w) + b
        # Compute predicted probability
        y_pred = sigmoid(z)
        # Back propagation
        dz = y_pred - training_outputs
        # dz @ x sums each feature column weighted by the per-sample error;
        # np.dot(x, dz) would mix samples instead of features.
        dw = np.dot(dz, x) / m
        db = np.sum(dz) / m
        # Update weights and bias according to the gradient descent algorithm
        w = w - LEARNING_RATE * dw
        b = b - LEARNING_RATE * db

    print("Model trained. Proceeding with model evaluation...")

    # Test
    # Compute log odds
    z = np.dot(test_input, w) + b
    # Compute predicted probability
    y_pred = sigmoid(z)
    # Compute cost over the test samples. The result gets its own name so the
    # cost() function is not overwritten, and is averaged over the number of
    # test samples rather than the training-set size.
    test_cost = cost(y_pred, test_output, test_input.shape[0])
There was an incorrect assumption, as pointed out by @J_H:
>>> from sklearn.linear_model import LogisticRegression
>>> import numpy as np
>>> x = np.array([[1, 1, 1], [0, 0, 0], [1, 0, 1]])
>>> y = np.array([1, 0, 1])
>>> clf = LogisticRegression().fit(x, y)
>>> clf.predict([[0, 1, 1]])
scikit-learn appears to believe that test_output should be a 1 rather than a 0.
A few more recommendations:
m should be fine to remove (it's a constant, so it could be included in the LEARNING_RATE)
w should be sized according to the number of columns in x (i.e., x.shape[1])
dw = np.dot(x, dz) should be np.dot(dz, x)
Prediction in logistic regression depends on a threshold, usually 0.5
Taking this into account would look something like the following.
# Initialize weights (one per column of x) and bias
w, b = np.zeros(x.shape[1]), 0

for _ in range(EPOCHS):
    # Compute log odds
    z = np.dot(x, w) + b
    # Compute predicted probability
    y_pred = sigmoid(z)
    # Back propagation (gradients are summed, not averaged, over the samples)
    dz = y_pred - training_outputs
    dw = np.dot(dz, x)
    db = np.sum(dz)
    # Update
    w = w - LEARNING_RATE * dw
    b = b - LEARNING_RATE * db

# Test: a probability of at least 0.5 is read as the positive class
z = np.dot(test_input, w) + b
test_pred = sigmoid(z) >= 0.5
And a complete example on random train/test sets created with sklearn.datasets.make_classification could look like this—which usually gets within a few decimals of the scikit-learn implementation as well:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np
EPOCHS = 100
def sigmoid(z):
    """Map log odds ``z`` to probabilities via 1 / (1 + exp(-z))."""
    return 1 / (1 + np.exp(-z))
if __name__ == "__main__":
    # NOTE(review): LEARNING_RATE is not defined anywhere in the visible
    # script; the value below is a placeholder -- tune it for your data.
    LEARNING_RATE = 0.001

    X, y = make_classification(n_samples=1000, n_features=5)
    X_train, X_test, y_train, y_test = train_test_split(X, y)

    # Initialize `w` and `b`
    w, b = np.zeros(X.shape[1]), 0

    for _ in range(EPOCHS):
        z = np.dot(X_train, w) + b
        y_pred = sigmoid(z)
        dz = y_pred - y_train
        # Gradients are summed (not averaged) over the training samples.
        dw = np.dot(dz, X_train)
        db = np.sum(dz)
        w = w - LEARNING_RATE * dw
        b = b - LEARNING_RATE * db

    # Test: a probability of at least 0.5 is read as the positive class
    z = np.dot(X_test, w) + b
    test_pred = sigmoid(z) >= 0.5
    print(accuracy_score(y_test, test_pred))
I'm trying to compute my gradient for a multiclass classification model with logistic regression and it seems not to be working properly.
This is the data that I am using for this model.
import pandas as pd
from sklearn.preprocessing import normalize
# Directory holding the digit-recognizer CSV files
path = '/kaggle/input/digit-recognizer'

# Read the first 6000 rows. Column 0 is used as the target and the remaining
# columns as features; rows 0-4799 form the train split, the rest the test split.
data = pd.read_csv(path + '/train.csv', nrows=6000)

# Features, scaled by 1/255
x_train = data.iloc[:4800, 1:].values / 255
x_test = data.iloc[4800:, 1:].values / 255

# Targets as column vectors of shape (n_rows, 1)
y_train = np.expand_dims(data.iloc[:4800, 0].values, 1)
y_test = np.expand_dims(data.iloc[4800:, 0].values, 1)

assert len(x_train) == len(y_train)
assert len(x_test) == len(y_test)
x_train.shape, y_train.shape
((4800, 784), (4800, 1))
Here is the following code where I try to implement gradient descent:
def Sigmoid(z):
    """Return 1 / (1 + exp(-z)) for a scalar or, element-wise, an array."""
    # Local import, matching the function-scope imports used elsewhere in this
    # listing. np.exp saturates to inf for very negative z where the previous
    # math.e ** -z raised OverflowError on plain floats.
    import numpy as np
    return 1 / (1 + np.exp(-z))
def CostFunction(h, y, m):
    """Return the cross-entropy between ``h`` and ``y``, scaled by 1 / m.

    Args:
        h: Vector of predicted probabilities.
        y: Target vector of the same length as ``h``.
        m: Scaling divisor (the caller passes the number of samples).
    """
    import numpy as np
    # The dot products sum the per-class terms into a single scalar.
    j = -(1 / m) * (y @ np.log(h) + (1 - y) @ np.log(1 - h))
    return j
def GradientDescent(X, y, theta, n_classes):
    """Compute the cost and the parameter gradients for one pass over the data.

    Despite its name, this function does not update ``theta``; it returns the
    accumulated cost and the averaged gradients so the caller can apply the
    update step.

    Args:
        X: Indexable collection of input rows; a bias term of 1 is prepended
            to each row.
        y: Class index for each row of ``X``, used to build a one-hot target.
        theta: Pair ``[theta0, theta1]`` of weight matrices
            (input -> hidden, hidden -> output).
        n_classes: Width of the one-hot target vector.

    Returns:
        ``(j, [grad0, grad1])``: the accumulated cost and the gradients
        divided by the number of samples.
    """
    import numpy as np

    # Useful variables
    m = len(y)
    theta0, theta1 = [x.copy() for x in theta]
    grad0, grad1 = [np.zeros(x.shape) for x in [theta0, theta1]]
    y_vec = np.zeros((m, n_classes))
    j = 0
    for i in range(m):
        # One-hot encode the target of sample i
        y_vec[i, y[i]] = 1

        ### Forward propagation
        a0 = np.concatenate(([1], X[i]))
        a1 = np.concatenate(([1], Sigmoid(theta0 @ a0)))
        a2 = Sigmoid(theta1 @ a1)
        h = a2
        j += CostFunction(h, y_vec[i], m)

        ### Backpropagation
        delta2 = a2 - y_vec[i]
        delta1 = theta1.T @ delta2 * (a1 * (1 - a1))
        # delta1[0] belongs to the hidden bias unit, which has no incoming
        # weights, so it is dropped before the outer product.
        grad0 += np.expand_dims(delta1[1:], 1) @ np.expand_dims(a0, 0)
        grad1 += np.expand_dims(delta2, 1) @ np.expand_dims(a1, 0)

    grad0 = grad0 / m
    grad1 = grad1 / m
    return j, [grad0, grad1]
Now comes the training process.
### Create theta parameters
n_layers = 3
n_classes = 10
# Weight matrix dims (i, j) = (number of nodes, input shape + bias)
theta0 = np.random.uniform(0, 0.01, (24, x_train.shape[1] + 1))
theta1 = np.random.uniform(0, 0.01, (n_classes, len(theta0) + 1))
theta_params = [theta0, theta1]

### Train parameters
epochs = 200
alpha = 0.001  # learning rate
j, t = np.zeros(epochs), theta_params.copy()
for i in range(epochs):
    print("Iterarion: {}/{}".format(i + 1, epochs))
    j[i], g = GradientDescent(x_train, y_train, t, n_classes)
    # Gradient-descent step; the learning rate is `alpha` (the name `a` used
    # previously was never defined).
    t[0] = t[0] - alpha * g[0]
    t[1] = t[1] - alpha * g[1]
The cost starts from J=7.2583 and goes down to approximately J=3.5223, where it gets stuck.
Then, whenever I try to predict any of the samples from the training or test sets it outputs the same approximate probability for all classes.
def Predict(X, theta):
    """Return the index of the largest output unit for each row of ``X``.

    Args:
        X: Indexable collection of input rows; a bias term of 1 is prepended
            to each row.
        theta: Pair ``[theta0, theta1]`` of weight matrices
            (input -> hidden, hidden -> output).

    Returns:
        A float array of length ``len(X)`` holding the argmax of the output
        layer for each row.
    """
    import numpy as np

    # Useful variables
    m = len(X)
    theta0, theta1 = [x for x in theta]
    h = np.zeros(m)
    for i in range(m):
        ### Forward propagation
        a0 = np.concatenate(([1], X[i]))
        a1 = np.concatenate(([1], Sigmoid(theta0 @ a0)))
        a2 = Sigmoid(theta1 @ a1)
        h[i] = np.argmax(a2)
    return h
Predict(x_train[:1], t)
[0.20078521 0.19842413 0.20535222 0.1953332 0.19425315 0.19302124
0.20107485 0.19589331 0.19688894 0.19526526]
Note that I am printing the hypothesis probability of each node in the last layer from inside the Predict function.
Could anyone point me in the right direction by sharing some tips?
I have generated a balanced dataset of 4000 examples, 2000 for the negative class and 2000 for the positive one. Then, I built a neural net with a single hidden layer of 3 neurons with a ReLU activation function and an output layer with a sigmoid. The cost function is a standard cross-entropy function and I chose Adam as the optimizer. Using minibatches of 15 examples, after 1000 epochs the final accuracy is 96.37%, so I am assuming that the model is doing well on the test set. But when I want to display the decision boundary, this is what I get:
I cannot figure out whether the problem is a code error or the model just needs more training. This is the script I'm using:
# implement a neural network that finds a decision boundary under a
# constraint on the second hidden layer with tensorflow
import numpy as np
from sklearn.utils import shuffle
from sklearn.preprocessing import normalize
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tf_utils import random_mini_batches
import matplotlib.pyplot as plt
def generate_dataset():
    """Draw a 2-D, two-class Gaussian dataset and scatter-plot it.

    Returns:
        ``(X, Y)`` where ``X`` has shape (4000, 2) and ``Y`` has shape
        (4000, 1). The first 2000 rows are labelled 0 and the last 2000 rows
        are labelled 1.
    """
    # First 2000 samples (two clusters), plotted in blue and labelled 0
    d1_x = np.random.normal(5, 10, 1000)
    d1_y = np.random.normal(5, 2, 1000)
    d2_x = np.random.normal(40, 20, 1000)
    d2_y = np.random.normal(2, 1, 1000)
    # Last 2000 samples (one cluster), plotted in red and labelled 1
    d3_x = np.random.normal(60, 5, 2000)
    d3_y = np.random.normal(10, 1, 2000)

    plt.scatter(d1_x, d1_y, color='b')
    plt.scatter(d2_x, d2_y, color='b')
    plt.scatter(d3_x, d3_y, color='r')

    Y = np.zeros((4000, 1))
    d_x = np.concatenate([d1_x, d2_x, d3_x])
    d_y = np.concatenate([d1_y, d2_y, d3_y])
    # Turn both coordinates into column vectors and join them side by side
    d_x = d_x.reshape(d_x.shape[0], 1)
    d_y = d_y.reshape(d_y.shape[0], 1)
    X = np.concatenate([d_x, d_y], axis=1)
    Y[2000:] = 1
    return X, Y
# Define and train a TensorFlow model with one hidden ReLU layer (n1 units)
# and a single sigmoid output, then plot its predictions over a grid.
# Additional hidden layers (W2/W3) were commented out in the original
# experiment and are omitted here.
costs = []
print_cost = True
learning_rate = .0009
minibatch_size = 15
num_epochs = 1000

XX, YY = generate_dataset()
XX, YY = shuffle(XX, YY)
X_norm = normalize(XX)
X_train, X_test, y_train, y_test = train_test_split(
    X_norm, YY, test_size=0.2, random_state=42)
# From here on samples are stored column-wise, matching the [dim, None]
# placeholders below.
X_train = np.transpose(X_train)
y_train = np.transpose(y_train)
X_test = np.transpose(X_test)
y_test = np.transpose(y_test)

m = XX.shape[1]  # input dimension
n = YY.shape[1]  # output dimension
X = tf.placeholder(tf.float32, shape=[m, None], name='X')
y = tf.placeholder(tf.float32, shape=[n, None], name='y')

# Model parameters
n1 = 3  # output dimension of the first hidden layer
# NOTE(review): the initializer argument of W1 and W4 was lost from this
# listing. Without one, tf.get_variable falls back to Glorot-uniform
# initialization -- confirm against the original script.
W1 = tf.get_variable("W1", [n1, m])
b1 = tf.get_variable("b1", [n1, 1], initializer=tf.zeros_initializer)
W4 = tf.get_variable("W4", [n, n1])
b4 = tf.get_variable("b4", [n, 1], initializer=tf.zeros_initializer)

# Forward propagation
z1 = tf.add(tf.matmul(W1, X), b1)
a1 = tf.nn.relu(z1)
z4 = tf.add(tf.matmul(W4, a1), b4)
pred = tf.nn.sigmoid(z4)

# Cost function. log_loss takes the probability estimate given by the model
# (`pred`), not the net input z4.
cost = tf.reduce_mean(tf.losses.log_loss(labels=y, predictions=pred))

# ADAM optimizer
# NOTE(review): the right-hand side was lost from this listing; reconstructed
# from the comment above and the `learning_rate` setting -- confirm.
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)

# Metrics: a prediction counts as correct when it is within 0.5 of the label
correct_prediction = tf.less_equal(tf.abs(pred - y), 0.5)
accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))

init = tf.global_variables_initializer()

with tf.Session() as sess:
    # Variables must be initialised before the first training step.
    sess.run(init)
    seed = 1
    # Samples are columns, so the sample count is shape[1] (shape[0] is the
    # feature count).
    num_minibatches = int(X_train.shape[1] / minibatch_size)
    for epoch in range(num_epochs):
        epoch_cost = 0
        seed += 1
        minibatches = random_mini_batches(X_train, y_train, minibatch_size, seed)
        for minibatch in minibatches:
            (minibatch_X, minibatch_Y) = minibatch
            _, minibatch_cost = sess.run(
                [optimizer, cost],
                feed_dict={X: minibatch_X, y: minibatch_Y})
            # Average the minibatch costs over the epoch
            epoch_cost += minibatch_cost / num_minibatches
        # Print the cost every 100 epochs
        if print_cost and epoch % 100 == 0:
            print("Cost after epoch %i: %f" % (epoch, epoch_cost))
        if print_cost and epoch % minibatch_size == 0:
            cp, val_accuracy = sess.run(
                [correct_prediction, accuracy],
                feed_dict={X: X_test, y: y_test})

    cmap = plt.get_cmap('Paired')
    # Define region of interest by data limits
    xmin, xmax = XX[:, 0].min() - 1, XX[:, 0].max() + 1
    ymin, ymax = XX[:, 1].min() - 1, XX[:, 1].max() + 1
    steps = 100
    x_span = np.linspace(xmin, xmax, steps)
    y_span = np.linspace(ymin, ymax, steps)
    xx, yy = np.meshgrid(x_span, y_span)
    # Grid points as columns, shape (2, steps * steps), like the training data
    A = np.concatenate([[xx.ravel()], [yy.ravel()]], axis=0)
    A = normalize(A, axis=0)
    # Make predictions across region of interest; this needs the live session.
    # NOTE(review): the fetched tensor was lost from this listing; `pred` is
    # assumed -- confirm.
    predictions = sess.run(pred, feed_dict={X: A})
    # Plot decision boundary in region of interest
    z = predictions.reshape(xx.shape)
    plt.contourf(xx, yy, z, cmap=cmap, alpha=.5)
I am freshman & beginner.
I am studying machine learning with open tutorials.
I am having trouble implementing the gradient descent algorithm.
I have to complete the body of "for _ in range(max_iter):", but I don't know NumPy well, so I don't know what code I should add.
Could you please help me fill the blank?
I know this type of question is so rude... sorry but I need your help :(
Thank you in advance.
from sklearn import datasets
import numpy as np
from sklearn.metrics import accuracy_score
# 200 samples with two informative features, no redundant features and one
# cluster per class, generated with a fixed random_state.
X, y = datasets.make_classification(
    n_samples=200,
    n_features=2,
    n_informative=2,
    n_redundant=0,
    n_clusters_per_class=1,
    random_state=333,
)
def sigmoid(s):
    """Return the logistic function 1 / (1 + exp(-s)), applied element-wise."""
    return 1 / (1 + np.exp(-s))
def loss(y, h):
    """Return the mean binary cross-entropy between labels ``y`` and probabilities ``h``."""
    return (-y * np.log(h) - (1 - y) * np.log(1 - h)).mean()
def gradient(X, y, w):
    """Return the per-sample expression -(y * X) / (1 + exp(-y * (X @ w))).

    The result has one row per sample; it is not reduced over the samples.

    NOTE(review): for 0/1 labels the gradient of the mean cross-entropy is
    X.T @ (sigmoid(X @ w) - y) / n, which this expression does not equal --
    confirm the intended label encoding before relying on it.
    """
    return -(y * X) / (1 + np.exp(-y * np.dot(X, w)))
# Prepend a column of ones so that w[0] acts as the bias term
X_bias = np.append(np.ones((X.shape[0], 1)), X, axis=1)
# Labels as a column vector; note that the encoding is flipped (0 -> 1, 1 -> 0)
y = np.array([[1] if label == 0 else [0] for label in y])
# Random initial weights in [-1, 1). np.random is used because the `random`
# module is never imported in this script.
w = np.array([[np.random.uniform(-1, 1)] for _ in range(X.shape[1] + 1)])

max_iter = 100
learning_rate = 0.1
threshold = 0.5

for _ in range(max_iter):
    # fill in the blank -- what code should I add here?
    pass

probabilities = sigmoid(np.dot(X_bias, w))
predictions = [[1] if p > threshold else [0] for p in probabilities]
print("loss: %.2f, accuracy: %.2f" %
      (loss(y, probabilities), accuracy_score(y, predictions)))
Inside the for loop, we have to first compute the probabilities. Then find the gradients and then update the weights.
For computing probabilities, you can use the code below
probs = sigmoid(np.dot(X_bias, w))
np.dot is the NumPy command for matrix multiplication. Then we will calculate the loss and its gradients.
Now we will update the weights.
So the final code will be
from sklearn import datasets
import numpy as np
from sklearn.metrics import accuracy_score
# Synthetic binary classification problem: 200 samples, 2 features (both
# informative), one cluster per class, fixed random_state.
X, y = datasets.make_classification(
    n_samples=200,
    n_features=2,
    n_informative=2,
    n_redundant=0,
    n_clusters_per_class=1,
    random_state=333,
)
def sigmoid(s):
    """Squash ``s`` into (0, 1) with the logistic function, element-wise."""
    return 1 / (1 + np.exp(-s))
def loss(y, h):
    """Return the binary cross-entropy of probabilities ``h`` against labels ``y``, averaged over all entries."""
    return (-y * np.log(h) - (1 - y) * np.log(1 - h)).mean()
def gradient(X, y, w):
    """Return -(y * X) / (1 + exp(-y * (X @ w))), one row per sample.

    NOTE(review): this is not the gradient of the mean cross-entropy for 0/1
    labels, which is X.T @ (sigmoid(X @ w) - y) / n -- confirm the intended
    label encoding before using it in an update step.
    """
    return -(y * X) / (1 + np.exp(-y * np.dot(X, w)))
# Prepend a column of ones so that w[0] acts as the bias term
X_bias = np.append(np.ones((X.shape[0], 1)), X, axis=1)
# Labels as a column vector; note that the encoding is flipped (0 -> 1, 1 -> 0)
y = np.array([[1] if label == 0 else [0] for label in y])
# Random initial weights in [-1, 1), one per column of X_bias
w = np.array([[np.random.uniform(-1, 1)] for _ in range(X.shape[1] + 1)])

max_iter = 100
learning_rate = 0.1
threshold = 0.5

for _ in range(max_iter):
    # Current predicted probabilities
    probabilities = sigmoid(np.dot(X_bias, w))
    # Gradient of the mean cross-entropy `loss` with respect to w
    grad = np.dot(X_bias.T, probabilities - y) / len(y)
    # Gradient-descent update
    w = w - learning_rate * grad

probabilities = sigmoid(np.dot(X_bias, w))
predictions = [[1] if p > threshold else [0] for p in probabilities]
print("loss: %.2f, accuracy: %.2f" %
      (loss(y, probabilities), accuracy_score(y, predictions)))
Note: in the for loop there is no need to compute probs and the loss separately, as we only need the gradients to update the weights. I did so because it makes the code easier to understand.
I currently trained a logistic model for a decision boundary that looks like this:
using the following code that I got online:
# Bounding box of the data, padded by 0.5 on every side
x_min, x_max = xbatch[:, 0].min() - .5, xbatch[:, 0].max() + .5
y_min, y_max = xbatch[:, 1].min() - .5, xbatch[:, 1].max() + .5
h = 0.05
# Generate a grid of points with distance h between them
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
# One grid point per row, shape (n_points, 2). ravel() replaces the
# reshape(1, np.product(...)) form; np.product no longer exists in NumPy 2.
X = np.vstack((xx.ravel(), yy.ravel())).T
# Predict the function value for the whole grid
# NOTE(review): the first operand of each np.dot was lost from this listing;
# X and h1 are what the two-layer forward pass implies -- confirm.
z1 = np.dot(X, w1_pred) + b1_pred
h1 = 1 / (1 + np.exp(-z1))
z2 = np.dot(h1, w2_pred) + b2_pred
y_hat = 1 / (1 + np.exp(-z2))
pred = np.round(y_hat)
Z = pred.reshape(xx.shape)
# Plot the contour and training examples
plt.contourf(xx, yy, Z)
plt.scatter(xbatch[:, 0], xbatch[:, 1], c=ybatch, s=40, edgecolors="grey", alpha=0.9)
My question is this:
is there a way to plot the decision line without meshgrid or contour?
I would like to plot just the wavy sigmoid decision line on the graph, without the colours or contours, so it looks like this:
Using contour with levels=[0.5] on the sigmoid output should work.
A Synthetic training set:
# (mean, covariance, sample count, label) of each Gaussian cluster, drawn in
# this order: one cluster labelled 0 followed by three clusters labelled 1.
_clusters = [
    ([2.2, 2.2], [[0.1, 0], [0, 0.1]], 150, 0.0),
    ([1.4, 1.3], [[0.05, 0], [0, 0.3]], 50, 1.0),
    ([1.3, 2.9], [[0.05, 0], [0, 0.05]], 50, 1.0),
    ([2.5, 0.95], [[0.1, 0], [0, 0.1]], 50, 1.0),
]
train_X = np.concatenate(
    [np.random.multivariate_normal(mean, cov, count)
     for mean, cov, count, _ in _clusters],
    axis=0,
)
train_Y = np.concatenate(
    [np.full(count, label) for _, _, count, label in _clusters]
)
An example model:
# Two inputs -> two sigmoid hidden units -> one sigmoid output
x = tf.placeholder(tf.float32, [None, 2])
y = tf.placeholder(tf.float32, [None, 1])

# Input to hidden units
w_i_h = tf.Variable(tf.truncated_normal([2, 2], mean=0, stddev=0.1))
b_i_h = tf.Variable(tf.zeros([2]))
hidden = tf.sigmoid(tf.matmul(x, w_i_h) + b_i_h)

# Hidden to output
w_h_o = tf.Variable(tf.truncated_normal([2, 1], mean=0, stddev=0.1))
b_h_o = tf.Variable(tf.zeros([1]))
# NOTE: despite its name, `logits` holds sigmoid outputs in (0, 1)
logits = tf.sigmoid(tf.matmul(hidden, w_h_o) + b_h_o)

# Mean squared error between the sigmoid output and the label
cost = tf.reduce_mean(tf.square(logits - y))
optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(cost)

# A prediction is correct when it lies on the same side of 0.5 as the label
correct_prediction = tf.equal(tf.sign(logits - 0.5), tf.sign(y - 0.5))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

# Initialize all variables
init = tf.global_variables_initializer()

# Labels reshaped to a column to match the [None, 1] placeholder
train_feed = {x: train_X, y: np.reshape(train_Y, (train_Y.shape[0], 1))}

# Launch the graph
with tf.Session() as sess:
    # Variables must be initialised before the first training step.
    sess.run(init)
    for epoch in range(3000):
        _, c = sess.run([optimizer, cost], feed_dict=train_feed)
        if epoch % 1000 == 0:
            print('Epoch: %d' % (epoch + 1), 'cost = {:0.4f}'.format(c), end='\r')
    acc = sess.run([accuracy], feed_dict=train_feed)
    print('\n Accuracy:', acc)

    # Evaluate the model on a regular grid; this needs the live session
    xx, yy = np.mgrid[0:3.5:0.1, 0:3.5:0.1]
    grid = np.c_[xx.ravel(), yy.ravel()]
    pred_1 = sess.run([logits], feed_dict={x: grid})
The output:
# Arrange the grid predictions on the mesh, then draw only the 0.5 level set
# (the decision line) on top of the training points.
Z = np.reshape(np.array(pred_1), xx.shape)
plt.contour(xx, yy, Z, levels=[0.5], cmap='gray')
plt.scatter(
    train_X[:, 0],
    train_X[:, 1],
    s=20,
    c=train_Y,
    cmap='jet',
    vmin=0,
    vmax=1,
)