I am testing this code using real data or a generated dataset from sklearn. In both cases, the code works without errors if the number of factors in the model is less than 6. With 7 factors, I get an error:
RuntimeWarning: overflow encountered in square return np.mean((y_true - y_pred)**2)
With more than 9 factors, I already get several errors and the predicted values become Nan:
RuntimeWarning: overflow encountered in reduce
return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
RuntimeWarning: overflow encountered in square
return np.mean((y_true - y_pred)**2)
It is obvious to me that the problem is in dimension, since the error occurs when the number of variables increases, but I do not know how to fix it. I'm already using np.float64 (as recommended for a similar question), but it doesn't help. I enclose the entire code (yes, it is far from perfect because I am relatively new to programming)
import numpy as np
import copy
from itertools import *
def MSE(y_true, y_pred):
return np.mean((y_true - y_pred)**2)
def weight_mult(c, weights):
r = copy.deepcopy(c)
for idx, num in enumerate(weights):
r[idx] = r[idx] * num
return r
def power_factors(X, deg):
X_power = []
for x in X:
x_power = x**deg
X_power.append(x_power)
return np.array(X_power)
def normalise_x(X):
X_norm = []
for x in X.T:
min_idx = np.argmin(x)
x -= x[min_idx]
max_idx = np.argmax(x)
x = x/x[max_idx]
X_norm.append(x)
return np.array(X_norm).T
def normalise_y(X):
min_idx = np.argmin(X)
X -= X[min_idx]
max_idx = np.argmax(X)
X = X/X[max_idx]
return X
class TakagiSugeno:
def __init__(self, cluster_n=2, lr=0.01, n_iters=1500):
self.lr = lr
self.n_iters = n_iters
self.weights = None
self.bias = None
self.weights_best = None
self.bias_best = None
self.combination_best = None
self.cluster_n = cluster_n
def fit(self, X, y, cluster_w):
power_degree = np.arange(self.cluster_n)
power_degree += 1
models_list = [[], [], [], []]
for combination in permutations(power_degree):
X_polynom = []
for c in combination:
X_power = power_factors(X, c)
X_polynom.append(X_power)
self.model_estimation(X_polynom, y, cluster_w)
y_pred = self.y_estimation(X_polynom, cluster_w)
mse = MSE(y, y_pred)
models_list[0].append(copy.deepcopy(self.weights))
models_list[1].append(copy.deepcopy(self.bias))
models_list[2].append(mse)
models_list[3].append(combination)
best_model = np.argmin(models_list[2])
self.weights_best = models_list[0][best_model]
self.bias_best = models_list[1][best_model]
self.combination_best = models_list[3][best_model]
def model_estimation(self, X_polynom, y, cluster_w):
n_samples, n_features = X_polynom[0].shape
self.weights = np.zeros((self.cluster_n, n_features))
self.bias = np.zeros(self.cluster_n)
for _ in range(self.n_iters):
y_predicted = np.zeros(n_samples)
for c in range(self.cluster_n):
# evaluate y
y_pred_cluster = np.dot(X_polynom[c], self.weights[c]) + self.bias[c]
weighted_y_pred = weight_mult(y_pred_cluster, cluster_w[c])
y_predicted += weighted_y_pred
for c in range(self.cluster_n):
# multiple grad count
dw = (2 / n_samples) * np.dot(weight_mult(X_polynom[c], cluster_w[c]).T, (y_predicted - y))
db = (2 / n_samples) * np.sum(weight_mult((y_predicted - y), cluster_w[c]))
# weights update
self.weights[c] -= self.lr * dw
self.bias[c] -= self.lr * db
def y_estimation(self, X_polynom, cluster_w):
y_predicted = np.zeros(len(X_polynom[0]))
for c in range(self.cluster_n):
# evaluate y
y_pred_cluster = np.dot(X_polynom[c], self.weights[c]) + self.bias[c]
weighted_y_pred = weight_mult(y_pred_cluster, cluster_w[c])
y_predicted += weighted_y_pred
return y_predicted
def predict(self, X, cluster_w):
y_predicted = np.zeros(len(X))
X_polynom = []
for c in self.combination_best:
X_power = power_factors(X, c)
X_polynom.append(X_power)
for c in range(self.cluster_n):
# evaluate y
y_pred_cluster = np.dot(X_polynom[c], self.weights_best[c]) + self.bias_best[c]
weighted_y_pred = weight_mult(y_pred_cluster, cluster_w[c])
y_predicted += weighted_y_pred
return y_predicted
if __name__ == '__main__':
import matplotlib.pyplot as plt
from sklearn import datasets
# Prepare data
X_numpy, y_numpy = datasets.make_regression(n_samples=100, n_features=10, noise=20, random_state=1)
# Normalisation
X_norm = np.array(normalise_x(X_numpy), dtype=np.float64)
y_norm = np.array(normalise_y(y_numpy), dtype=np.float64)
# Create y
y_sq = power_factors(y_norm, 2)
y = y_norm * 0.6 + y_sq * 0.4
# Create membership matrix
membership = np.zeros((len(X_norm), 2))
membership[:, 0] = 0.6
membership[:, 1] = 0.4
membership = np.array(membership, dtype=np.float64)
membership = membership.T
# training loop
model = TakagiSugeno(lr=1, n_iters=1000)
model.fit(X_norm, y, membership)
y_pred = model.predict(X_norm, membership)
I found the problem on my own. The code itself has no errors, but it uses gradient descent for optimization. I set the learning rate =1 and 2000 iterations. This, of course, is too much and retraining was taking place. The gradient continued to grow uncontrollably and I was getting huge values. The best solution is to set up a stop criterion.
Related
I been trying to write a python code for logistic regression but the results are showing very high value of cost function which is unexpected. I have created a random variable X and Y and added a noise term to Y which will flip the element of based on the probability theta. This is my code:
import numpy as np
from scipy.stats import bernoulli
rg = np.random.default_rng(100)
def data_generate(n, m, theta):
X_0 = np.ones((n, 1))
X = np.random.normal(loc=0.0, scale=1.0, size=(n, m))
X = np.concatenate((X_0, X), axis = 1)
beta = rg.random((m+1, 1))
Y = np.zeros((n, 1))
P = 1.0/(1.0 + np.exp(-np.dot(X, beta)))
for i in range(len(P)):
if P[i] >= 0.5:
Y[i] = 1
else:
Y[i] = 0
# Noise addition
noise = bernoulli.rvs(size=(n,1), p=theta)
for j in range(len(noise)):
if noise[i] == 1:
Y[i] = int(not(Y[i]))
else:
pass
return X, Y, beta
def Gradient_Descent(X, Y, k, tollerence, learning_rate):
n,m = np.shape(X)
beta = rg.random((m, 1))
costs = []
initial_cost = 0.0
for i in range(k):
Y_pred = 1.0/(1.0 + np.exp(-np.dot(X, beta)))
cost = np.mean(np.dot(Y.T, np.log(Y_pred)) + np.dot((1-Y).T, np.log(1-Y_pred)))
if (abs(cost - initial_cost) <= tollerence):
break
else:
beta = beta - learning_rate*(np.mean(np.dot(X.T, (Y_pred - Y))))
initial_cost = cost
costs.append(cost)
return cost, beta, i
X = data_generate(200, 3, 0.1)[0]
Y = data_generate(200, 3, 0.1)[1]
Gradient_Descent(X, Y, 10000, 1e-6, 0.01)
# Output of code :
(-154.7689765716959,
array([[-0.02218003],
[-0.1182535 ],
[ 0.1169462 ],
[ 0.58610747]]),
14)`
Please tell what is the problem with the code.
I'm following the Andrew-Ng course on Machine Learning and I'm currently doing the week 5 exercise.
I've found myself stuck on the implementation of the backpropagation algorithm, due to the fact that the relative difference, compared to numerical gradient, is very high (order of 1e-1), but I can't find any error within my implementation, so I'm gently asking if someone could take a look at it and explain what I did wrong.
Forward propagation:
def forward_propagation(thetas, X, history=False):
activation_arr = []
a = X # X is the array of the first activation values
for k in range(0, len(thetas)):
a = add_intercept(a) # add the bias unit
a = sigmoid(a # thetas[k].T)
if history:
activation_arr.append(a)
return activation_arr if history else a
Backpropagation:
def gradient_nn(thetas, X, y, num_labels, reg_lambda=None):
n_examples = X.shape[0]
Y = np.zeros(( # creates a n_examples X num_labels matrix
n_examples, # n of examples
num_labels
))
for i in range(n_examples):
Y[i, y[i, 0]] = 1 # the index corresponding to the correct label for each row has value = 1
# add intercepted X to the activation array
activation_arr = [add_intercept(X)] + forward_propagation(thetas, X, history=True)
sigma = [activation_arr[-1] - Y] # sigma^L = a^L - y
delta = [sigma[-1].T # activation_arr[-2]] # find delta for the first row
thetas_grad = []
# Calculate sigma and delta
for idx in range(1, len(thetas)): # skip last iteration
sigma = [
(sigma[0] # thetas[-idx][:, 1:]) * partial_derivative(activation_arr[-1-idx])
] + sigma
delta = [
sigma[0].T # activation_arr[-2-idx]
] + delta
return [np.sum(d) / n_examples for d in thetas_grad]
Partial derivative:
def partial_derivative(a):
return a * (1 - a) # element wise multiplication
Numerical gradient:
def compute_numerical_gradient(cost_function, thetas):
# Unroll parameters
nn_params = unroll_thetas(thetas)
num_grad = np.zeros(nn_params.shape)
perturb = np.zeros(nn_params.shape)
shapes = [theta.shape for theta in thetas]
epsilon = 1e-4 # not the one of random initialization
for p in range(nn_params.shape[0]):
# Set perturbation vector
perturb[p] = epsilon
minus_theta = nn_params - perturb
plus_theta = nn_params + perturb
# --- Roll params back in order to use the cost function ---
minus_theta = roll_thetas(minus_theta, shapes)
plus_theta = roll_thetas(plus_theta, shapes)
# calculate the loss of the cost function
minus_loss = cost_function(minus_theta)
plus_loss = cost_function(plus_theta)
# Compute Numerical Gradient
num_grad[p] = (plus_loss - minus_loss) / (2 * epsilon)
perturb[p] = 0
num_grad = roll_thetas(num_grad, shapes)
return [np.sum(num_g) for num_g in num_grad]
Cost function:
def J_nn(num_labels, reg_lambda=None):
def non_reg_func(thetas, X, y):
n_examples = X.shape[0]
Y = np.zeros(( # creates a n_examples X num_labels matrix
n_examples, # n of examples
num_labels
))
for i in range(n_examples):
Y[i, y[i, 0]] = 1 # the index corresponding to the correct label for each row has value = 1
prediction = forward_propagation(thetas, X)
return np.sum(np.sum(-Y * np.log(prediction) - (1 - Y) * np.log(1 - prediction))) / n_examples
if reg_lambda is None:
func = non_reg_func
else: # regularization
def func(thetas, X, y):
cost = non_reg_func(thetas, X, y)
for theta in thetas: # regularize for every layer
theta = theta[1:] # remove bias unit
cost = cost + (reg_lambda / (2 * y.shape[0])) * np.sum(np.sum(theta[:, ] ** 2))
return cost
return func
Checking backpropagation with numerical gradient:
def check_nn_gradients(reg_lambda=None):
"""
Creates a small neural network (max 8 x 8 x 7 x 8) and checks that
the implementation of the backpropagation algorithm is good
"""
#n_examples, sizes = random.randint(5, 10), [random.randint(2, 8), random.randint(2, 8), random.randint(1, 8)]
n_examples, sizes = 5, [8, 8, 5, 4]
n_labels = sizes[-1] # Last size is equal to the number of labels
init_epsilon = 0.0001
thetas = random_init_thetas(sizes, init_epsilon)
X = np.array(
random_init_thetas([sizes[0]-1, n_examples], init_epsilon)
).squeeze() # We squeeze it because random_init_thetas returns a 3D array, but we want X to be 2D
y = np.array([random.randint(0, n_labels-1) for _ in X])
y = y[:, np.newaxis]
inner_cost = lambda _thetas: J_nn(n_labels, reg_lambda)(_thetas, X, y)
gradients = gradient_nn(thetas, X, y, n_labels, 0)
unrolled_gradients = unroll_thetas(gradients)
print(unrolled_gradients)
# finite difference method
grad_checking_epsilon = 1e-4
num_grad = compute_numerical_gradient(inner_cost, thetas)
unrolled_num_grad = unroll_thetas(num_grad)
print(unrolled_num_grad)
return diff = np.linalg.norm(unrolled_num_grad - unrolled_gradients) / np.linalg.norm(unrolled_num_grad + unrolled_gradients)
I am writing Neural Network code from scratch using Numpy. But even after training my Network for many epochs, the predictions for each class is random and remains same irrespective of the input.
I have checked my concept according to Andrew Ng's Coursera ML course and towardsdatascience.com 's post. I think I'm making some very conceptual mistake which I cannot figure out.
Here is my code:
import numpy as np
def sigmoid(x):
return 1 / (1 + np.exp(-x))
def dsigmoid(y):
return y * (1 - y)
class NeuralNetwork:
def __init__(self, shape):
self.n_layers = len(shape)
self.shape = shape
self.weight = []
self.bias = []
i = 0
while i < self.n_layers - 1:
self.weight.append(np.random.normal(loc=0.0, scale=0.5,
size=(self.shape[i + 1], self.shape[i])))
self.bias.append(np.random.normal(loc=0.0, scale=0.3,
size=(self.shape[i + 1], 1)))
i += 1
def predict(self, X):
z = self.weight[0] # X + self.bias[0]
a = sigmoid(z)
i = 1
while i < self.n_layers - 1:
z = self.weight[i] # a + self.bias[i]
a = sigmoid(z)
i += 1
return a
def predictVerbose(self, X):
layers = [X]
z = self.weight[0] # X + self.bias[0]
a = sigmoid(z)
layers.append(a)
i = 1
while i < self.n_layers - 1:
z = self.weight[i] # a + self.bias[i]
a = sigmoid(z)
layers.append(a)
i += 1
return layers
def gradOne(self, X, y):
layers = self.predictVerbose(X)
h = layers[-1]
delta_b = [(h - y) * dsigmoid(h)]
delta_w = [delta_b[0] # layers[-2].T]
i = 1
while i < self.n_layers - 1:
buff = delta_b[-1]
delta_b.append((self.weight[-i].T # buff) * dsigmoid(layers[-(i + 1)]))
delta_w.append(delta_b[-1] # layers[-(i + 2)].T)
i += 1
return delta_b[::-1], delta_w[::-1]
def grad(self, data, l_reg=0):
#data: x1, x2, x3, ..., xm, y=(0, 1, 2,...)
m = len(data)
delta_b = []
delta_w = []
i = 0
while i < self.n_layers - 1:
delta_b.append(np.zeros((self.shape[i + 1], 1)))
delta_w.append(np.zeros((self.shape[i + 1], self.shape[i])))
i += 1
for row in data:
X = np.array(row[:-1])[np.newaxis].T
y = np.zeros((self.shape[-1], 1))
# print(row)
y[row[-1], 0] = 1
buff1, buff2 = self.gradOne(X, y)
i = 0
while i < len(delta_b):
delta_b[i] += buff1[i] / m
delta_w[i] += buff2[i] / m
i += 1
return delta_b, delta_w
def train(self, data, batch_size, epoch, alpha, l_reg=0):
m = len(data)
for i in range(epoch):
j = 0
while j < m:
delta_b, delta_w = self.grad(data[i: (i + batch_size + 1)])
i = 0
while i < len(self.weight):
self.weight[i] -= alpha * delta_w[i]
self.bias[i] -= alpha * delta_b[i]
i += 1
j += batch_size
if __name__ == "__main__":
x = NeuralNetwork([2, 2, 2])
# for y in x.gradOne(np.array([[1], [2], [3]]), np.array([[0], [1]])):
# print(y.shape)
data = [
[1, 1, 0],
[0, 0, 0],
[1, 0, 1],
[0, 1, 1]
]
x.train(data, 4, 1000, 0.1)
print(x.predict(np.array([[1], [0]])))
print(x.predict(np.array([[1], [1]])))
Please point out where I am going wrong.
Unfortunately I don't have enough reputation to comment on your post but here's a link to a numpy only neural network that I've made (tested on blob data from sklearn and mnist).
https://github.com/jaymody/backpropagation/blob/master/old/NeuralNetwork.py
Are you still interested in this problem? As I understood, you try to get the XOR-perceptron with direct and inverse outputs?
It looks like:
1. You need to change the expression
delta_b, delta_w = self.grad(data[i: (i + batch_size + 1)]) to
delta_b, delta_w = self.grad(data[::])
in the train function.
2. Some of random values, used for initialization of synaptic and biases weights, requires much more training cycles for alpha=0.1. Try to play with the alpha (I set it up to 2) and number of epochs (I tried up to 20000).
Also your code do not works with 1-layered networks. I tried to train 1-layered AND and OR perceptrons and I got very strange results (or maybe it requires even much more cycles). But in 2-layered cases it works fine.
I'm starting the ML journey and I'm having troubles with this coding exercise
here is my code
import numpy as np
import pandas as pd
import scipy.optimize as op
# Read the data and give it labels
data = pd.read_csv('ex2data2.txt', header=None, name['Test1', 'Test2', 'Accepted'])
# Separate the features to make it fit into the mapFeature function
X1 = data['Test1'].values.T
X2 = data['Test2'].values.T
# This function makes more features (degree)
def mapFeature(x1, x2):
degree = 6
out = np.ones((x1.shape[0], sum(range(degree + 2))))
curr_column = 1
for i in range(1, degree + 1):
for j in range(i+1):
out[:,curr_column] = np.power(x1, i-j) * np.power(x2, j)
curr_column += 1
return out
# Separate the data into training and target, also initialize theta
X = mapFeature(X1, X2)
y = np.matrix(data['Accepted'].values).T
m, n = X.shape
cols = X.shape[1]
theta = np.matrix(np.zeros(cols))
#Initialize the learningRate(sigma)
learningRate = 1
# Define the Sigmoid Function (Output between 0 and 1)
def sigmoid(z):
return 1 / (1 + np.exp(-z))
def cost(theta, X, y, learningRate):
# This is require to make the optimize function work
theta = theta.reshape(-1, 1)
error = sigmoid(X # theta)
first = np.multiply(-y, np.log(error))
second = np.multiply(1 - y, np.log(1 - error))
j = np.sum((first - second)) / m + (learningRate * np.sum(np.power(theta, 2)) / 2 * m)
return j
# Define the gradient of the cost function
def gradient(theta, X, y, learningRate):
# This is require to make the optimize function work
theta = theta.reshape(-1, 1)
error = sigmoid(X # theta)
grad = (X.T # (error - y)) / m + ((learningRate * theta) / m)
grad_no = (X.T # (error - y)) / m
grad[0] = grad_no[0]
return grad
Result = op.minimize(fun=cost, x0=theta, args=(X, y, learningRate), method='TNC', jac=gradient)
opt_theta = np.matrix(Result.x)
def predict(theta, X):
sigValue = sigmoid(X # theta.T)
p = sigValue >= 0.5
return p
p = predict(opt_theta, X)
print('Train Accuracy: {:f}'.format(np.mean(p == y) * 100))
So, when the learningRate = 1, the accuracy should be around 83,05% but I'm getting 80.5% and when the learningRate = 0, the accuracy should be 91.52% but I'm getting 87.28%
So the question is What am I doing wrong? Why my accuracy is below the problem default answer?
Hope someone can guide me in the right direction. Thanks!
P.D: Here is the dataset, maybe it can help
https://raw.githubusercontent.com/TheGirlWhiteWithBandages/Machine-Learning-Algorithms/master/Logistic%20Regression/ex2data2.txt
Hey guys I found a way to make it even better!
Here is the code
import numpy as np
import pandas as pd
import scipy.optimize as op
from sklearn.preprocessing import PolynomialFeatures
# Read the data and give it labels
data = pd.read_csv('ex2data2.txt', header=None, names=['Test1', 'Test2', 'Accepted'])
# Separate the data into training and target
X = (data.iloc[:, 0:2]).values
y = (data.iloc[:, 2:3]).values
# Modify the features to a certain degree (Polynomial)
poly = PolynomialFeatures(6)
m = y.size
XX = poly.fit_transform(data.iloc[:, 0:2].values)
# Initialize Theta
theta = np.zeros(XX.shape[1])
# Define the Sigmoid Function (Output between 0 and 1)
def sigmoid(z):
return(1 / (1 + np.exp(-z)))
# Define the Regularized cost function
def costFunctionReg(theta, reg, *args):
# This is require to make the optimize function work
h = sigmoid(XX # theta)
first = np.log(h).T # - y
second = np.log(1 - h).T # (1 - y)
J = (1 / m) * (first - second) + (reg / (2 * m)) * np.sum(np.square(theta[1:]))
return J
# Define the Regularized gradient function
def gradientReg(theta, reg, *args):
theta = theta.reshape(-1, 1)
h = sigmoid(XX # theta)
grad = (1 / m) * (XX.T # (h - y)) + (reg / m) * np.r_[[[0]], theta[1:]]
return grad.flatten()
# Define the predict Function
def predict(theta, X):
sigValue = sigmoid(X # theta.T)
p = sigValue >= 0.5
return p
# A loop to test between different values for sigma (reg parameter)
for i, Sigma in enumerate([0, 1, 100]):
# Optimize costFunctionReg
res2 = op.minimize(costFunctionReg, theta, args=(Sigma, XX, y), method=None, jac=gradientReg)
# Get the accuracy of the model
accuracy = 100 * sum(predict(res2.x, XX) == y.ravel()) / y.size
# Get the Error between different weights
error1 = costFunctionReg(res2.x, Sigma, XX, y)
# print the accuracy and error
print('Train accuracy {}% with Lambda = {}'.format(np.round(accuracy, decimals=4), Sigma))
print(error1)
Thanks for all your help!
try out this:
# import library
import pandas as pd
import numpy as np
dataset = pd.read_csv('ex2data2.csv',names = ['Test #1','Test #2','Accepted'])
# splitting to x and y variables for features and target variable
x = dataset.iloc[:,:-1].values
y = dataset.iloc[:,-1].values
print('x[0] ={}, y[0] ={}'.format(x[0],y[0]))
m, n = x.shape
print('#{} Number of training samples, #{} features per sample'.format(m,n))
# import library FeatureMapping
from sklearn.preprocessing import PolynomialFeatures
# We also add one column of ones to interpret theta 0 (x with power of 0 = 1) by
include_bias as True
pf = PolynomialFeatures(degree = 6, include_bias = True)
x_poly = pf.fit_transform(x)
pd.DataFrame(x_poly).head(5)
m,n = x_poly.shape
# define theta as zero
theta = np.zeros(n)
# define hyperparameter λ
lambda_ = 1
# reshape (-1,1) because we just have one feature in y column
y = y.reshape(-1,1)
def sigmoid(z):
return 1/(1+np.exp(-z))
def lr_hypothesis(x,theta):
return np.dot(x,theta)
def compute_cost(theta,x,y,lambda_):
theta = theta.reshape(n,1)
infunc1 = -y*(np.log(sigmoid(lr_hypothesis(x,theta)))) - ((1-y)*(np.log(1 - sigmoid(lr_hypothesis(x,theta)))))
infunc2 = (lambda_*np.sum(theta[1:]**2))/(2*m)
j = np.sum(infunc1)/m+ infunc2
return j
# gradient[0] correspond to gradient for theta(0)
# gradient[1:] correspond to gradient for theta(j) j>0
def compute_gradient(theta,x,y,lambda_):
gradient = np.zeros(n).reshape(n,)
theta = theta.reshape(n,1)
infunc1 = sigmoid(lr_hypothesis(x,theta))-y
gradient_in = np.dot(x.transpose(),infunc1)/m
gradient[0] = gradient_in[0,0] # theta(0)
gradient[1:] = gradient_in[1:,0]+(lambda_*theta[1:,]/m).reshape(n-1,) # theta(j) ; j>0
gradient = gradient.flatten()
return gradient
You can now test your cost and gradient without optimization. Th below code will optimize the model:
# hyperparameters
m,n = x_poly.shape
# define theta as zero
theta = np.zeros(n)
# define hyperparameter λ
lambda_array = [0, 1, 10, 100]
import scipy.optimize as opt
for i in range(0,len(lambda_array)):
# Train
print('======================================== Iteration {} ===================================='.format(i))
optimized = opt.minimize(fun = compute_cost, x0 = theta, args = (x_poly, y,lambda_array[i]),
method = 'TNC', jac = compute_gradient)
new_theta = optimized.x
# Prediction
y_pred_train = predictor(x_poly,new_theta)
cm_train = confusion_matrix(y,y_pred_train)
t_train,f_train,acc_train = acc(cm_train)
print('With lambda = {}, {} correct, {} wrong ==========> accuracy = {}%'
.format(lambda_array[i],t_train,f_train,acc_train*100))
Now you should see output like this :
=== Iteration 0 === With lambda = 0, 104 correct, 14 wrong ==========> accuracy = 88.13559322033898%
=== Iteration 1 === With lambda = 1, 98 correct, 20 wrong ==========> accuracy = 83.05084745762711%
=== Iteration 2 === With lambda = 10, 88 correct, 30 wrong ==========> accuracy = 74.57627118644068%
=== Iteration 3 === With lambda = 100, 72 correct, 46 wrong ==========> accuracy = 61.016949152542374%
I have been trying to code logistic regression from scratch, which I have done, but I am using all the features in my breast cancer dataset, and I would like to select some features (specifically ones that I've found scikit-learn has selected for itself when I compare with it and use its feature selection on the data). However, I am not sure where to do this in my code, what I currently have is this:
X_train = ['texture_mean', 'smoothness_mean', 'compactness_mean', 'symmetry_mean', 'radius_se', 'symmetry_se'
'fractal_dimension_se', 'radius_worst', 'texture_worst', 'area_worst', 'smoothness_worst', 'compactness_worst']
X_test = ['texture_mean', 'smoothness_mean', 'compactness_mean', 'symmetry_mean', 'radius_se', 'symmetry_se'
'fractal_dimension_se', 'radius_worst', 'texture_worst', 'area_worst', 'smoothness_worst', 'compactness_worst']
def Sigmoid(z):
return 1/(1 + np.exp(-z))
def Hypothesis(theta, X):
return Sigmoid(X # theta)
def Cost_Function(X,Y,theta,m):
hi = Hypothesis(theta, X)
_y = Y.reshape(-1, 1)
J = 1/float(m) * np.sum(-_y * np.log(hi) - (1-_y) * np.log(1-hi))
return J
def Cost_Function_Derivative(X,Y,theta,m,alpha):
hi = Hypothesis(theta,X)
_y = Y.reshape(-1, 1)
J = alpha/float(m) * X.T # (hi - _y)
return J
def Gradient_Descent(X,Y,theta,m,alpha):
new_theta = theta - Cost_Function_Derivative(X,Y,theta,m,alpha)
return new_theta
def Accuracy(theta):
correct = 0
length = len(X_test)
prediction = (Hypothesis(theta, X_test) > 0.5)
_y = Y_test.reshape(-1, 1)
correct = prediction == _y
my_accuracy = (np.sum(correct) / length)*100
print ('LR Accuracy: ', my_accuracy, "%")
def Logistic_Regression(X,Y,alpha,theta,num_iters):
m = len(Y)
for x in range(num_iters):
new_theta = Gradient_Descent(X,Y,theta,m,alpha)
theta = new_theta
if x % 100 == 0:
print #('theta: ', theta)
print #('cost: ', Cost_Function(X,Y,theta,m))
Accuracy(theta)
ep = .012
initial_theta = np.random.rand(X_train.shape[1],1) * 2 * ep - ep
alpha = 0.5
iterations = 10000
Logistic_Regression(X_train,Y_train,alpha,initial_theta,iterations)
I was assuming that if I manually change what features X_train and X_test consist of this would work, but I get an error: AttributeError: 'list' object has no attribute 'shape' at the initial_theta line. Any help in the right direction would be appreciated.
the problem is that X_train is a list and shape only work for dataframes.
you could either:
-keep the list but use len(X_train) instead, OR
-change the X_train type to a pandas dataframe, pandas.DataFrame(X_train).shape[0]