How to select features for logistic regression from scratch in python? - python

I have been trying to code logistic regression from scratch, which I have done, but I am using all the features in my breast cancer dataset, and I would like to select some features (specifically ones that I've found scikit-learn has selected for itself when I compare with it and use its feature selection on the data). However, I am not sure where to do this in my code, what I currently have is this:
X_train = ['texture_mean', 'smoothness_mean', 'compactness_mean', 'symmetry_mean', 'radius_se', 'symmetry_se'
'fractal_dimension_se', 'radius_worst', 'texture_worst', 'area_worst', 'smoothness_worst', 'compactness_worst']
X_test = ['texture_mean', 'smoothness_mean', 'compactness_mean', 'symmetry_mean', 'radius_se', 'symmetry_se'
'fractal_dimension_se', 'radius_worst', 'texture_worst', 'area_worst', 'smoothness_worst', 'compactness_worst']
def Sigmoid(z):
return 1/(1 + np.exp(-z))
def Hypothesis(theta, X):
return Sigmoid(X # theta)
def Cost_Function(X,Y,theta,m):
hi = Hypothesis(theta, X)
_y = Y.reshape(-1, 1)
J = 1/float(m) * np.sum(-_y * np.log(hi) - (1-_y) * np.log(1-hi))
return J
def Cost_Function_Derivative(X,Y,theta,m,alpha):
hi = Hypothesis(theta,X)
_y = Y.reshape(-1, 1)
J = alpha/float(m) * X.T # (hi - _y)
return J
def Gradient_Descent(X,Y,theta,m,alpha):
new_theta = theta - Cost_Function_Derivative(X,Y,theta,m,alpha)
return new_theta
def Accuracy(theta):
correct = 0
length = len(X_test)
prediction = (Hypothesis(theta, X_test) > 0.5)
_y = Y_test.reshape(-1, 1)
correct = prediction == _y
my_accuracy = (np.sum(correct) / length)*100
print ('LR Accuracy: ', my_accuracy, "%")
def Logistic_Regression(X,Y,alpha,theta,num_iters):
m = len(Y)
for x in range(num_iters):
new_theta = Gradient_Descent(X,Y,theta,m,alpha)
theta = new_theta
if x % 100 == 0:
print #('theta: ', theta)
print #('cost: ', Cost_Function(X,Y,theta,m))
Accuracy(theta)
ep = .012
initial_theta = np.random.rand(X_train.shape[1],1) * 2 * ep - ep
alpha = 0.5
iterations = 10000
Logistic_Regression(X_train,Y_train,alpha,initial_theta,iterations)
I was assuming that if I manually change what features X_train and X_test consist of this would work, but I get an error: AttributeError: 'list' object has no attribute 'shape' at the initial_theta line. Any help in the right direction would be appreciated.

the problem is that X_train is a list and shape only work for dataframes.
you could either:
-keep the list but use len(X_train) instead, OR
-change the X_train type to a pandas dataframe, pandas.DataFrame(X_train).shape[0]

Related

RuntimeWarning: overflow encountered in reduce return

I am testing this code using real data or a generated dataset from sklearn. In both cases, the code works without errors if the number of factors in the model is less than 6. With 7 factors, I get an error:
RuntimeWarning: overflow encountered in square return np.mean((y_true - y_pred)**2)
With more than 9 factors, I already get several errors and the predicted values become Nan:
RuntimeWarning: overflow encountered in reduce
return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
RuntimeWarning: overflow encountered in square
return np.mean((y_true - y_pred)**2)
It is obvious to me that the problem is in dimension, since the error occurs when the number of variables increases, but I do not know how to fix it. I'm already using np.float64 (as recommended for a similar question), but it doesn't help. I enclose the entire code (yes, it is far from perfect because I am relatively new to programming)
import numpy as np
import copy
from itertools import *
def MSE(y_true, y_pred):
return np.mean((y_true - y_pred)**2)
def weight_mult(c, weights):
r = copy.deepcopy(c)
for idx, num in enumerate(weights):
r[idx] = r[idx] * num
return r
def power_factors(X, deg):
X_power = []
for x in X:
x_power = x**deg
X_power.append(x_power)
return np.array(X_power)
def normalise_x(X):
X_norm = []
for x in X.T:
min_idx = np.argmin(x)
x -= x[min_idx]
max_idx = np.argmax(x)
x = x/x[max_idx]
X_norm.append(x)
return np.array(X_norm).T
def normalise_y(X):
min_idx = np.argmin(X)
X -= X[min_idx]
max_idx = np.argmax(X)
X = X/X[max_idx]
return X
class TakagiSugeno:
def __init__(self, cluster_n=2, lr=0.01, n_iters=1500):
self.lr = lr
self.n_iters = n_iters
self.weights = None
self.bias = None
self.weights_best = None
self.bias_best = None
self.combination_best = None
self.cluster_n = cluster_n
def fit(self, X, y, cluster_w):
power_degree = np.arange(self.cluster_n)
power_degree += 1
models_list = [[], [], [], []]
for combination in permutations(power_degree):
X_polynom = []
for c in combination:
X_power = power_factors(X, c)
X_polynom.append(X_power)
self.model_estimation(X_polynom, y, cluster_w)
y_pred = self.y_estimation(X_polynom, cluster_w)
mse = MSE(y, y_pred)
models_list[0].append(copy.deepcopy(self.weights))
models_list[1].append(copy.deepcopy(self.bias))
models_list[2].append(mse)
models_list[3].append(combination)
best_model = np.argmin(models_list[2])
self.weights_best = models_list[0][best_model]
self.bias_best = models_list[1][best_model]
self.combination_best = models_list[3][best_model]
def model_estimation(self, X_polynom, y, cluster_w):
n_samples, n_features = X_polynom[0].shape
self.weights = np.zeros((self.cluster_n, n_features))
self.bias = np.zeros(self.cluster_n)
for _ in range(self.n_iters):
y_predicted = np.zeros(n_samples)
for c in range(self.cluster_n):
# evaluate y
y_pred_cluster = np.dot(X_polynom[c], self.weights[c]) + self.bias[c]
weighted_y_pred = weight_mult(y_pred_cluster, cluster_w[c])
y_predicted += weighted_y_pred
for c in range(self.cluster_n):
# multiple grad count
dw = (2 / n_samples) * np.dot(weight_mult(X_polynom[c], cluster_w[c]).T, (y_predicted - y))
db = (2 / n_samples) * np.sum(weight_mult((y_predicted - y), cluster_w[c]))
# weights update
self.weights[c] -= self.lr * dw
self.bias[c] -= self.lr * db
def y_estimation(self, X_polynom, cluster_w):
y_predicted = np.zeros(len(X_polynom[0]))
for c in range(self.cluster_n):
# evaluate y
y_pred_cluster = np.dot(X_polynom[c], self.weights[c]) + self.bias[c]
weighted_y_pred = weight_mult(y_pred_cluster, cluster_w[c])
y_predicted += weighted_y_pred
return y_predicted
def predict(self, X, cluster_w):
y_predicted = np.zeros(len(X))
X_polynom = []
for c in self.combination_best:
X_power = power_factors(X, c)
X_polynom.append(X_power)
for c in range(self.cluster_n):
# evaluate y
y_pred_cluster = np.dot(X_polynom[c], self.weights_best[c]) + self.bias_best[c]
weighted_y_pred = weight_mult(y_pred_cluster, cluster_w[c])
y_predicted += weighted_y_pred
return y_predicted
if __name__ == '__main__':
import matplotlib.pyplot as plt
from sklearn import datasets
# Prepare data
X_numpy, y_numpy = datasets.make_regression(n_samples=100, n_features=10, noise=20, random_state=1)
# Normalisation
X_norm = np.array(normalise_x(X_numpy), dtype=np.float64)
y_norm = np.array(normalise_y(y_numpy), dtype=np.float64)
# Create y
y_sq = power_factors(y_norm, 2)
y = y_norm * 0.6 + y_sq * 0.4
# Create membership matrix
membership = np.zeros((len(X_norm), 2))
membership[:, 0] = 0.6
membership[:, 1] = 0.4
membership = np.array(membership, dtype=np.float64)
membership = membership.T
# training loop
model = TakagiSugeno(lr=1, n_iters=1000)
model.fit(X_norm, y, membership)
y_pred = model.predict(X_norm, membership)
I found the problem on my own. The code itself has no errors, but it uses gradient descent for optimization. I set the learning rate =1 and 2000 iterations. This, of course, is too much and retraining was taking place. The gradient continued to grow uncontrollably and I was getting huge values. The best solution is to set up a stop criterion.

How to find coefficients in multi-variable gradient descent?

I need to implement gradient descent on my own. My task is to create an arbitrary function, add noise to it, and then find the coefficients values for that function. So first, I created a function and created some random values:
# Preprocessing Input data
function = lambda x: x ** 2 +(x)+1
X=[]
Y=[]
for i in range(-100,100):
X.append(i)
Y.append(function(i) + random.randrange(-10,10)
then I normalized the values-
maxVal = np.max(np.hstack((X,Y)))
X = X/maxVal
Y = Y/maxVal
X= np.asarray(X)
Y= np.asarray(Y)
and this is my code for gradient descent, using derivative to find the coefficients
w1Arr = []
w2Arr = []
bArr = []
lossArr = []
for i in range(epochs):
Y_pred =w1*np.square(X)+w2*X+b
D_w1 = (-2/n) * sum( np.square(X) * (Y - Y_pred)) # Derivative for w1
D_w2 = (-2/n) * sum(X * (Y - Y_pred)) # Derivative for w2
D_b = (-2/n) * sum(Y - Y_pred) # Derivative for b
w1 = w1 - L * D_w1 # Update w1
w2 = w2 - L * D_w2 # Update w2
b = b - L * D_b # Update b
loss = sum((Y - Y_pred) * (Y - Y_pred)) #MSE
w1Arr.append(w1)
w2Arr.append(w2)
bArr.append(b)
lossArr.append(loss)
when I try to plot the results:
# Making predictions
Y_pred = w1*(np.square(X))+w2*X+b
#print(Y_pred)
plt.scatter(X, Y)
plt.plot(X, Y_pred) # predicted
plt.legend()
plt.show()
I see that the coefficients are pretty much the same,and just looks like a linear line-
I'm pretty much stuck, and don't know what is wrong with my code or how to fix it.
I looked online looking for solutions, but couldn't find any.
any help would be appreciated.
Found out the problem!
The normalization you applied just messed up the relation between the x and y, in particular you skewed the domain with respect of the codomain:
maxVal = np.max(np.hstack((X,Y)))
X = X/maxVal
Y = Y/maxVal
Just remove the normalization and you will find that you can learn the coefficients.
If you really want, you can normalize both the axis, but they have to be two proportional values:
X = X / np.max(X)
Y = Y / np.max(Y)

Linear Regression for multi variable not working as expected

When I use this code for Single variable Linear regression, the theta is being evaluated correctly but when on the multi variable it is giving weird output for theta.
I am trying to convert my octave code, that I wrote when I took Andrew Ng's course.
This is the main calling file:
m = data.shape[0]
a = np.array(data[0])
a.shape = (m,1)
b = np.array(data[1])
b.shape = (m, 1)
x = np.append(a, b, axis=1)
y = np.array(data[2])
lr = LR.LinearRegression()
[X, mu, sigma] = lr.featureNormalize(x)
z = np.ones((m, 1), dtype=float)
X = np.append(z, X, axis=1)
alpha = 0.01
num_iters = 400
theta = np.zeros(shape=(3,1))
[theta, J_history] = lr.gradientDescent(X, y, theta, alpha, num_iters)
print(theta)
And here are the contents of class :
class LinearRegression:
def featureNormalize(self, data):#this normalizes the features
data = np.array(data)
x_norm = data
mu = np.zeros(shape=(1, data.shape[1]))#creates mu vector filled with zeros
sigma = np.zeros(shape=(1, data.shape[1]))
for i in range(0, data.shape[1]):
mu[0, i] = np.mean(data[:, i])
sigma[0, i] = np.std(data[:, i])
for i in range(0, data.shape[1]):
x_norm[:, i] = np.subtract(x_norm[:, i], mu[0, i])
x_norm[:, i] = np.divide(x_norm[:, i], sigma[0, i])
return [x_norm, mu, sigma]
def gradientDescent(self, X, y, theta, alpha, num_iters):
m = y.shape[0]
J_history = np.zeros(shape=(num_iters, 1))
for i in range(0, num_iters):
predictions = X.dot(theta) # X is 47*3 theta is 3*1 predictions is 47*1
theta = np.subtract(theta , (alpha / m) * np.transpose((np.transpose(np.subtract(predictions ,y))).dot(X))) #1*97 into 97*3
J_history[i] = self.computeCost(X, y, theta)
return [theta, J_history]
def computeCost(self, X, y, theta):
warnings.filterwarnings('ignore')
m = X.shape[0]
J = 0
predictions = X.dot(theta)
sqrErrors = np.power(predictions - y, 2)
J = 1 / (2 * m) * np.sum(sqrErrors)
return J
I expected a theta that'll be a 3*1 matrix. According to Andrew's course my octave implementation was producing a theta
334302.063993
100087.116006
3673.548451
But in python implementation I am getting very weird output:
[[384596.12996714 317274.97693463 354878.64955708 223121.53576488
519238.43603216 288423.05420641 302849.01557052 191383.45903309
203886.92061274 233219.70871976 230814.42009498 333720.57288972
317370.18827964 673115.35724932 249953.82390212 432682.6678475
288423.05420641 192249.97844569 480863.45534211 576076.72380674
243221.70859887 245241.34318985 233604.4010228 249953.82390212
551937.2817908 240336.51632605 446723.93690857 451051.7253178
456822.10986344 288423.05420641 336509.59208678 163398.05571747
302849.01557052 557707.6...................... this goes on for long
The same code is working absolutely fine in Single Variable dataset. It is also working fine in the octave but seems like I am missing some point for 2+ hours now. Happy to get your help.
Try in gradientDescent the following second line of the for loop:
theta=theta-(alpha/m)*X.T.dot(X.dot(theta)-y)
Also, if you want to add a column of ones, it is easier to do like so:
np.c_[np.ones((m,1)),data]

Regularized Logistic Regression in Python (Andrew ng Course)

I'm starting the ML journey and I'm having troubles with this coding exercise
here is my code
import numpy as np
import pandas as pd
import scipy.optimize as op
# Read the data and give it labels
data = pd.read_csv('ex2data2.txt', header=None, name['Test1', 'Test2', 'Accepted'])
# Separate the features to make it fit into the mapFeature function
X1 = data['Test1'].values.T
X2 = data['Test2'].values.T
# This function makes more features (degree)
def mapFeature(x1, x2):
degree = 6
out = np.ones((x1.shape[0], sum(range(degree + 2))))
curr_column = 1
for i in range(1, degree + 1):
for j in range(i+1):
out[:,curr_column] = np.power(x1, i-j) * np.power(x2, j)
curr_column += 1
return out
# Separate the data into training and target, also initialize theta
X = mapFeature(X1, X2)
y = np.matrix(data['Accepted'].values).T
m, n = X.shape
cols = X.shape[1]
theta = np.matrix(np.zeros(cols))
#Initialize the learningRate(sigma)
learningRate = 1
# Define the Sigmoid Function (Output between 0 and 1)
def sigmoid(z):
return 1 / (1 + np.exp(-z))
def cost(theta, X, y, learningRate):
# This is require to make the optimize function work
theta = theta.reshape(-1, 1)
error = sigmoid(X # theta)
first = np.multiply(-y, np.log(error))
second = np.multiply(1 - y, np.log(1 - error))
j = np.sum((first - second)) / m + (learningRate * np.sum(np.power(theta, 2)) / 2 * m)
return j
# Define the gradient of the cost function
def gradient(theta, X, y, learningRate):
# This is require to make the optimize function work
theta = theta.reshape(-1, 1)
error = sigmoid(X # theta)
grad = (X.T # (error - y)) / m + ((learningRate * theta) / m)
grad_no = (X.T # (error - y)) / m
grad[0] = grad_no[0]
return grad
Result = op.minimize(fun=cost, x0=theta, args=(X, y, learningRate), method='TNC', jac=gradient)
opt_theta = np.matrix(Result.x)
def predict(theta, X):
sigValue = sigmoid(X # theta.T)
p = sigValue >= 0.5
return p
p = predict(opt_theta, X)
print('Train Accuracy: {:f}'.format(np.mean(p == y) * 100))
So, when the learningRate = 1, the accuracy should be around 83,05% but I'm getting 80.5% and when the learningRate = 0, the accuracy should be 91.52% but I'm getting 87.28%
So the question is What am I doing wrong? Why my accuracy is below the problem default answer?
Hope someone can guide me in the right direction. Thanks!
P.D: Here is the dataset, maybe it can help
https://raw.githubusercontent.com/TheGirlWhiteWithBandages/Machine-Learning-Algorithms/master/Logistic%20Regression/ex2data2.txt
Hey guys I found a way to make it even better!
Here is the code
import numpy as np
import pandas as pd
import scipy.optimize as op
from sklearn.preprocessing import PolynomialFeatures
# Read the data and give it labels
data = pd.read_csv('ex2data2.txt', header=None, names=['Test1', 'Test2', 'Accepted'])
# Separate the data into training and target
X = (data.iloc[:, 0:2]).values
y = (data.iloc[:, 2:3]).values
# Modify the features to a certain degree (Polynomial)
poly = PolynomialFeatures(6)
m = y.size
XX = poly.fit_transform(data.iloc[:, 0:2].values)
# Initialize Theta
theta = np.zeros(XX.shape[1])
# Define the Sigmoid Function (Output between 0 and 1)
def sigmoid(z):
return(1 / (1 + np.exp(-z)))
# Define the Regularized cost function
def costFunctionReg(theta, reg, *args):
# This is require to make the optimize function work
h = sigmoid(XX # theta)
first = np.log(h).T # - y
second = np.log(1 - h).T # (1 - y)
J = (1 / m) * (first - second) + (reg / (2 * m)) * np.sum(np.square(theta[1:]))
return J
# Define the Regularized gradient function
def gradientReg(theta, reg, *args):
theta = theta.reshape(-1, 1)
h = sigmoid(XX # theta)
grad = (1 / m) * (XX.T # (h - y)) + (reg / m) * np.r_[[[0]], theta[1:]]
return grad.flatten()
# Define the predict Function
def predict(theta, X):
sigValue = sigmoid(X # theta.T)
p = sigValue >= 0.5
return p
# A loop to test between different values for sigma (reg parameter)
for i, Sigma in enumerate([0, 1, 100]):
# Optimize costFunctionReg
res2 = op.minimize(costFunctionReg, theta, args=(Sigma, XX, y), method=None, jac=gradientReg)
# Get the accuracy of the model
accuracy = 100 * sum(predict(res2.x, XX) == y.ravel()) / y.size
# Get the Error between different weights
error1 = costFunctionReg(res2.x, Sigma, XX, y)
# print the accuracy and error
print('Train accuracy {}% with Lambda = {}'.format(np.round(accuracy, decimals=4), Sigma))
print(error1)
Thanks for all your help!
try out this:
# import library
import pandas as pd
import numpy as np
dataset = pd.read_csv('ex2data2.csv',names = ['Test #1','Test #2','Accepted'])
# splitting to x and y variables for features and target variable
x = dataset.iloc[:,:-1].values
y = dataset.iloc[:,-1].values
print('x[0] ={}, y[0] ={}'.format(x[0],y[0]))
m, n = x.shape
print('#{} Number of training samples, #{} features per sample'.format(m,n))
# import library FeatureMapping
from sklearn.preprocessing import PolynomialFeatures
# We also add one column of ones to interpret theta 0 (x with power of 0 = 1) by
include_bias as True
pf = PolynomialFeatures(degree = 6, include_bias = True)
x_poly = pf.fit_transform(x)
pd.DataFrame(x_poly).head(5)
m,n = x_poly.shape
# define theta as zero
theta = np.zeros(n)
# define hyperparameter λ
lambda_ = 1
# reshape (-1,1) because we just have one feature in y column
y = y.reshape(-1,1)
def sigmoid(z):
return 1/(1+np.exp(-z))
def lr_hypothesis(x,theta):
return np.dot(x,theta)
def compute_cost(theta,x,y,lambda_):
theta = theta.reshape(n,1)
infunc1 = -y*(np.log(sigmoid(lr_hypothesis(x,theta)))) - ((1-y)*(np.log(1 - sigmoid(lr_hypothesis(x,theta)))))
infunc2 = (lambda_*np.sum(theta[1:]**2))/(2*m)
j = np.sum(infunc1)/m+ infunc2
return j
# gradient[0] correspond to gradient for theta(0)
# gradient[1:] correspond to gradient for theta(j) j>0
def compute_gradient(theta,x,y,lambda_):
gradient = np.zeros(n).reshape(n,)
theta = theta.reshape(n,1)
infunc1 = sigmoid(lr_hypothesis(x,theta))-y
gradient_in = np.dot(x.transpose(),infunc1)/m
gradient[0] = gradient_in[0,0] # theta(0)
gradient[1:] = gradient_in[1:,0]+(lambda_*theta[1:,]/m).reshape(n-1,) # theta(j) ; j>0
gradient = gradient.flatten()
return gradient
You can now test your cost and gradient without optimization. Th below code will optimize the model:
# hyperparameters
m,n = x_poly.shape
# define theta as zero
theta = np.zeros(n)
# define hyperparameter λ
lambda_array = [0, 1, 10, 100]
import scipy.optimize as opt
for i in range(0,len(lambda_array)):
# Train
print('======================================== Iteration {} ===================================='.format(i))
optimized = opt.minimize(fun = compute_cost, x0 = theta, args = (x_poly, y,lambda_array[i]),
method = 'TNC', jac = compute_gradient)
new_theta = optimized.x
# Prediction
y_pred_train = predictor(x_poly,new_theta)
cm_train = confusion_matrix(y,y_pred_train)
t_train,f_train,acc_train = acc(cm_train)
print('With lambda = {}, {} correct, {} wrong ==========> accuracy = {}%'
.format(lambda_array[i],t_train,f_train,acc_train*100))
Now you should see output like this :
=== Iteration 0 === With lambda = 0, 104 correct, 14 wrong ==========> accuracy = 88.13559322033898%
=== Iteration 1 === With lambda = 1, 98 correct, 20 wrong ==========> accuracy = 83.05084745762711%
=== Iteration 2 === With lambda = 10, 88 correct, 30 wrong ==========> accuracy = 74.57627118644068%
=== Iteration 3 === With lambda = 100, 72 correct, 46 wrong ==========> accuracy = 61.016949152542374%

How to perform Numpy optimisation for this code?

I have the following code snippet:
def func1(self, X, y):
#X.shape = (455,13)
#y.shape = (455)
num_examples, num_features = np.shape(X)
self.weights = np.random.uniform(-1 / (2 * num_examples), 1 / (2 * num_examples), num_features)
while condition:
new_weights = np.zeros(num_features)
K = (np.dot(X, self.weights) - y)
for j in range(num_features):
summ = 0
for i in range(num_examples):
summ += K[i] * X[i][j]
new_weights[j] = self.weights[j] - ((self.alpha / num_examples) * summ)
self.weights = new_weights
This code works too slow. Is there any optimization, which I can do?
You can efficiently use np.einsum(). See a testing version below:
def func2(X, y):
num_examples, num_features = np.shape(X)
weights = np.random.uniform(-1./(2*num_examples), 1./(2*num_examples), num_features)
K = (np.dot(X, weights) - y)
return weights - alpha/num_examples*np.einsum('i,ij->j', K, X)
You can get new_weights directly using matrix-multiplication with np.dot like so -
new_weights = self.weights- ((self.alpha / num_examples) * np.dot(K[None],X))

Categories