Linear regression(Python)-getting different results for different algorithm - python

I implemented gradient descent for linear regression in Python for a data read from csv file. below is the code:
import numpy as np
import scipy.optimize as op
from sklearn import preprocessing
import matplotlib.pyplot as plt
from matplotlib import style
def CostFunc(theta,x,y):
m,n = x.shape;
theta = theta.reshape((n,1));
#y = y.reshape((m,1));
#x=x.reshape((m,1))
J = 0.5*sum(np.square((X.dot(theta)-y)))[0]/m;
return J;
def gradientDescent(X, y, theta, alpha, num_iters):
m,n = X.shape;
initial_iter = num_iters
J = np.zeros((num_iters,1));
while num_iters > 0:
theta = theta - (alpha*np.sum(((X.dot(theta)-y)*X),axis=0)/m).reshape((n,1))
J[initial_iter-num_iters][0]=CostFunc(theta,X,y)
num_iters=num_iters-1
return theta,J;
data = np.loadtxt(open("ex1data2.txt","rb"),delimiter=",",skiprows=1)
nr,nc = data.shape
X=data[:,0:nc - 1]
X=preprocessing.scale(X)
X=np.insert(X,0,1,axis=1)
y= data[:,[nc - 1]]
m , n = X.shape;
#initial_theta = np.zeros(len(n));
initial_theta = np.zeros((n,1));
theta,J=gradientDescent(X,y,initial_theta,0.01,400)
J=J.reshape(400)
count=0
grp=np.zeros((400,2));
for value in J:
grp[count][0] = count+1
grp[count][1] = value
count = count+1
plt.plot(grp)
plt.legend(loc=4)
plt.xlabel('iter')
plt.ylabel('cost')
plt.show()
print(theta)
optimal_theta = Result.x;
I ran 400 iterations and got a decreasing plot of cost as followsiteration vs cost plot
Then I used one of the custom algorithms on the same data with the following code
import numpy as np
import scipy.optimize as op
from sklearn import preprocessing
def Sigmoid(z):
return 1/(1 + np.exp(-z));
def Gradient(theta,x,y):
m , n = x.shape
theta = theta.reshape((n,1));
#y = y.reshape((m,1))
#x=x.reshape((m,1))
#sigmoid_x_theta = Sigmoid(x.dot(theta));
grad = (np.sum(((X.dot(theta)-y)*X),axis=0)/m);
return grad.flatten();
def CostFunc(theta,x,y):
m,n = x.shape;
theta = theta.reshape((n,1));
#y = y.reshape((m,1));
#x=x.reshape((m,1))
J = 0.5*sum(np.square((X.dot(theta)-y)))[0]/m;
return J;
data = np.loadtxt(open("ex1data2.txt","rb"),delimiter=",",skiprows=1)
nr,nc = data.shape
X=data[:,0:nc - 1]
X=preprocessing.scale(X)
X=np.insert(X,0,1,axis=1)
y= data[:,[nc - 1]]
m , n = X.shape;
#initial_theta = np.zeros(len(n));
initial_theta = np.zeros((n,1));
Result = op.minimize(fun = CostFunc,
x0 = initial_theta,
args = (X, y),
method = 'TNC',
jac = Gradient);
print("====================")
print(Result)
#optimal_theta = Result.x;
The theta values of the first case were
[[ 333032.07465084] [ 100130.7408761 ] [ 3699.66611303]]
While from the second I get the following output
fun: 2066502781.7118049
jac: array([ -6.45345806e-11, 7.84261236e-10, -2.42370452e-10])
message: 'Local minimum reached (|pg| ~= 0)'
nfev: 27
nit: 13
status: 0
success: True
x: array([ 339119.45652174, 110248.92165868, -6226.22670554])
I am assuming x: array([ 339119.45652174, 110248.92165868, -6226.22670554]) to be the theta value.
Why am I getting the values different for 2 cases?

Related

Unexpected value of cost function in Logistic regression

I been trying to write a python code for logistic regression but the results are showing very high value of cost function which is unexpected. I have created a random variable X and Y and added a noise term to Y which will flip the element of based on the probability theta. This is my code:
import numpy as np
from scipy.stats import bernoulli
rg = np.random.default_rng(100)
def data_generate(n, m, theta):
X_0 = np.ones((n, 1))
X = np.random.normal(loc=0.0, scale=1.0, size=(n, m))
X = np.concatenate((X_0, X), axis = 1)
beta = rg.random((m+1, 1))
Y = np.zeros((n, 1))
P = 1.0/(1.0 + np.exp(-np.dot(X, beta)))
for i in range(len(P)):
if P[i] >= 0.5:
Y[i] = 1
else:
Y[i] = 0
# Noise addition
noise = bernoulli.rvs(size=(n,1), p=theta)
for j in range(len(noise)):
if noise[i] == 1:
Y[i] = int(not(Y[i]))
else:
pass
return X, Y, beta
def Gradient_Descent(X, Y, k, tollerence, learning_rate):
n,m = np.shape(X)
beta = rg.random((m, 1))
costs = []
initial_cost = 0.0
for i in range(k):
Y_pred = 1.0/(1.0 + np.exp(-np.dot(X, beta)))
cost = np.mean(np.dot(Y.T, np.log(Y_pred)) + np.dot((1-Y).T, np.log(1-Y_pred)))
if (abs(cost - initial_cost) <= tollerence):
break
else:
beta = beta - learning_rate*(np.mean(np.dot(X.T, (Y_pred - Y))))
initial_cost = cost
costs.append(cost)
return cost, beta, i
X = data_generate(200, 3, 0.1)[0]
Y = data_generate(200, 3, 0.1)[1]
Gradient_Descent(X, Y, 10000, 1e-6, 0.01)
# Output of code :
(-154.7689765716959,
array([[-0.02218003],
[-0.1182535 ],
[ 0.1169462 ],
[ 0.58610747]]),
14)`
Please tell what is the problem with the code.

Minimize function via scipy with the one dependent parameter

I need to optimize my function with 3 variables: r, poly, eps.
Parameter poly depends on the r parameter. r and eps are independent but have some boundaries.
How to write the constraints for the minimization function in the right way?
import numpy as np
from sklearn.metrics import mean_squared_error as mse
from rbf.pde.fd import weights
from scipy.optimize import minimize
from sklearn.neighbors import KDTree as KDT
nx = 101
nz = nx
dx = 1
dz = dx
x,z = np.arange(0, nx)*dx, np.arange(0, nz)*dz
X, Z = np.meshgrid(x, z)
grid = np.column_stack([X.ravel(), Z.ravel()])
nn = len(grid)
boundary_index = np.argwhere((grid[:,0]==x[0]) | (grid[:,0]==x[-1]) | (grid[:,1]==z[0]) | (grid[:,1]==z[-1])).ravel()
checking = np.array([True]*nn); checking[boundary_index]=False
FDM_coefs = [-4, 1, 1, 1, 1]
def weight_matrix2(x, p, r ,diffs,
phi='phs3',
order=None,
eps=1.0):
neighbors = KDT(p).query_radius(x, r)
coefficients = np.array([weights(x[i], p[neighbors[i]], diffs = diffs, phi=phi, order=order, eps=eps) for i in range(len(p))])
return neighbors, coefficients
def loss(args):
r, poly, Eps = args[0], args[1], args[2]
nb, coefs = weight_matrix2(
x=grid,
p=grid,
r=r,
diffs=[(2, 0), (0, 2)],
phi='phs4',
order=poly,
eps=Eps)
a = np.array([np.sort([i]) for i in coefs[checking].copy()])
a = np.squeeze(a)
error = np.empty_like(a)
error = [mse(a[i], FDM_coefs, squared=False) for i in range(len(a))]
return np.mean(error)
initial_guess = [1, 1, 10.**(-5)]
bnds = ((1,10),(1,10),(10.**(-10), 10.**(5)))
def constraint2(args): #Return possible poly values
r = args[0]
n = max(np.unique([len(i) for i in KDT(grid).query_radius(grid, r)[checking==True]]))
mp = (np.sqrt(8*n+1)-3)//2
return np.arange(1, mp+1)
#constraint 2 means which values poly variable can take == np.arange(1, mp+1)
con2 = {'type':'eq', 'fun':constraint2}
result = minimize(loss, initial_guess, method='SLSQP',options={'maxiter':100, 'disp':True},
bounds=bnds, constraints=con2)

General minimal residual method with right-preconditioner of SSOR

I am trying to implement the algorithm of GMRES with right-preconditioner P for solving the linear system Ax = b . The code is running without error; however, it pops into unprecise result for me because the error I have is very large. For the GMRES method (without preconditioning matrix - remove P in the algorithm), the error I get is around 1e^{-12} and it converges with the same matrix.
import numpy as np
from scipy import sparse
import matplotlib.pyplot as plt
from scipy.linalg import norm as norm
import scipy.sparse as sp
from scipy.sparse import diags
"""The program is to split the matrix into D-diagonal; L: strictly lower matrix; U strictly upper matrix
satisfying: A = D - L - U """
def splitMat(A):
n,m = A.shape
if (n == m):
diagval = np.diag(A)
D = diags(diagval,0).toarray()
L = (-1)*np.tril(A,-1)
U = (-1)*np.triu(A,1)
else:
print("A needs to be a square matrix")
return (L,D,U)
"""Preconditioned Matrix for symmetric successive over-relaxation (SSOR): """
def P_SSOR(A,w):
## Split up matrix A:
L,D,U = splitMat(A)
Comp1 = (D - w*U)
Comp2 = (D - w*L)
Comp1inv = np.linalg.inv(Comp1)
Comp2inv = np.linalg.inv(Comp2)
P = w*(2-w)*np.matmul(Comp1inv, np.matmul(D,Comp2inv))
return P
"""GMRES_SSOR using right preconditioning P:
A - matrix of linear system Ax = b
x0 - initial guess
tol - tolerance
maxit - maximum iteration """
def myGMRES_SSOR(A,x0, b, tol, maxit):
matrixSize = A.shape[0]
e = np.zeros((maxit+1,1))
rr = 1
rstart = 2
X = x0
w = 1.9 ## in ssor
P = P_SSOR(A,w) ### preconditioned matrix
### Starting the GMRES ####
for rs in range(0,rstart+1):
### first check the residual:
if rr<tol:
break
else:
r0 = (b-A.dot(x0))
rho = norm(r0)
e[0] = rho
H = np.zeros((maxit+1,maxit))
Qcol = np.zeros((matrixSize, maxit+1))
Qcol[:,0:1] = r0/rho
for k in range(1, maxit+1):
### Arnodi procedure ##
Qcol[:,k] =np.matmul(np.matmul(A,P), Qcol[:,k-1]) ### This step applies P here:
for j in range(0,k):
H[j,k-1] = np.dot(np.transpose(Qcol[:,k]),Qcol[:,j])
Qcol[:,k] = Qcol[:,k] - (np.dot(H[j,k-1], Qcol[:,j]))
H[k,k-1] =norm(Qcol[:,k])
Qcol[:,k] = Qcol[:,k]/H[k,k-1]
### QR decomposition step ###
n = k
Q = np.zeros((n+1, n))
R = np.zeros((n, n))
R[0, 0] = norm(H[0:n+2, 0])
Q[:, 0] = H[0:n+1, 0] / R[0,0]
for j in range (0, n+1):
t = H[0:n+1, j-1]
for i in range (0, j-1):
R[i, j-1] = np.dot(Q[:, i], t)
t = t - np.dot(R[i, j-1], Q[:, i])
R[j-1, j-1] = norm(t)
Q[:, j-1] = t / R[j-1, j-1]
g = np.dot(np.transpose(Q), e[0:k+1])
Y = np.dot(np.linalg.inv(R), g)
Res= e[0:n] - np.dot(H[0:n, 0:n], Y[0:n])
rr = norm(Res)
#### second check on the residual ###
if rr < tol:
break
#### Updating the solution with the preconditioned matrix ####
X = X + np.matmul(np.matmul(P,Qcol[:, 0:k]), Y) ### This steps applies P here:
return X
######
A = np.random.rand(100,100)
x = np.random.rand(100,1)
b = np.matmul(A,x)
x0 = np.zeros((100,1))
maxit = 100
tol = 0.00001
x = myGMRES_SSOR(A,x0,b,tol,maxit)
res = b - np.matmul(A,x)
print(norm(res))
print("Solution with gmres\n", np.matmul(A,x))
print("---------------------------------------")
print("b matrix:", b)
I hope anyone could help me figure out this!!!
I'm not sure where you got you "Symmetric_successive_over-relaxation" SSOR code from, but it appears to be wrong. You also seem to be assuming that A is symmetric matrix, but in your random test case it is not.
Following SSOR's Wikipedia entry, I replaced your P_SSOR function with
def P_SSOR(A,w):
L,D,U = splitMat(A)
P = 2/(2-w) * (1/w*D+L)*np.linalg.inv(D)*(1/w*D+L).T
return P
and your test matrix with
A = np.random.rand(100,100)
A = A + A.T
and your code works up to a 12 digit residual error.

Regularized Logistic Regression in Python (Andrew ng Course)

I'm starting the ML journey and I'm having troubles with this coding exercise
here is my code
import numpy as np
import pandas as pd
import scipy.optimize as op
# Read the data and give it labels
data = pd.read_csv('ex2data2.txt', header=None, name['Test1', 'Test2', 'Accepted'])
# Separate the features to make it fit into the mapFeature function
X1 = data['Test1'].values.T
X2 = data['Test2'].values.T
# This function makes more features (degree)
def mapFeature(x1, x2):
degree = 6
out = np.ones((x1.shape[0], sum(range(degree + 2))))
curr_column = 1
for i in range(1, degree + 1):
for j in range(i+1):
out[:,curr_column] = np.power(x1, i-j) * np.power(x2, j)
curr_column += 1
return out
# Separate the data into training and target, also initialize theta
X = mapFeature(X1, X2)
y = np.matrix(data['Accepted'].values).T
m, n = X.shape
cols = X.shape[1]
theta = np.matrix(np.zeros(cols))
#Initialize the learningRate(sigma)
learningRate = 1
# Define the Sigmoid Function (Output between 0 and 1)
def sigmoid(z):
return 1 / (1 + np.exp(-z))
def cost(theta, X, y, learningRate):
# This is require to make the optimize function work
theta = theta.reshape(-1, 1)
error = sigmoid(X # theta)
first = np.multiply(-y, np.log(error))
second = np.multiply(1 - y, np.log(1 - error))
j = np.sum((first - second)) / m + (learningRate * np.sum(np.power(theta, 2)) / 2 * m)
return j
# Define the gradient of the cost function
def gradient(theta, X, y, learningRate):
# This is require to make the optimize function work
theta = theta.reshape(-1, 1)
error = sigmoid(X # theta)
grad = (X.T # (error - y)) / m + ((learningRate * theta) / m)
grad_no = (X.T # (error - y)) / m
grad[0] = grad_no[0]
return grad
Result = op.minimize(fun=cost, x0=theta, args=(X, y, learningRate), method='TNC', jac=gradient)
opt_theta = np.matrix(Result.x)
def predict(theta, X):
sigValue = sigmoid(X # theta.T)
p = sigValue >= 0.5
return p
p = predict(opt_theta, X)
print('Train Accuracy: {:f}'.format(np.mean(p == y) * 100))
So, when the learningRate = 1, the accuracy should be around 83,05% but I'm getting 80.5% and when the learningRate = 0, the accuracy should be 91.52% but I'm getting 87.28%
So the question is What am I doing wrong? Why my accuracy is below the problem default answer?
Hope someone can guide me in the right direction. Thanks!
P.D: Here is the dataset, maybe it can help
https://raw.githubusercontent.com/TheGirlWhiteWithBandages/Machine-Learning-Algorithms/master/Logistic%20Regression/ex2data2.txt
Hey guys I found a way to make it even better!
Here is the code
import numpy as np
import pandas as pd
import scipy.optimize as op
from sklearn.preprocessing import PolynomialFeatures
# Read the data and give it labels
data = pd.read_csv('ex2data2.txt', header=None, names=['Test1', 'Test2', 'Accepted'])
# Separate the data into training and target
X = (data.iloc[:, 0:2]).values
y = (data.iloc[:, 2:3]).values
# Modify the features to a certain degree (Polynomial)
poly = PolynomialFeatures(6)
m = y.size
XX = poly.fit_transform(data.iloc[:, 0:2].values)
# Initialize Theta
theta = np.zeros(XX.shape[1])
# Define the Sigmoid Function (Output between 0 and 1)
def sigmoid(z):
return(1 / (1 + np.exp(-z)))
# Define the Regularized cost function
def costFunctionReg(theta, reg, *args):
# This is require to make the optimize function work
h = sigmoid(XX # theta)
first = np.log(h).T # - y
second = np.log(1 - h).T # (1 - y)
J = (1 / m) * (first - second) + (reg / (2 * m)) * np.sum(np.square(theta[1:]))
return J
# Define the Regularized gradient function
def gradientReg(theta, reg, *args):
theta = theta.reshape(-1, 1)
h = sigmoid(XX # theta)
grad = (1 / m) * (XX.T # (h - y)) + (reg / m) * np.r_[[[0]], theta[1:]]
return grad.flatten()
# Define the predict Function
def predict(theta, X):
sigValue = sigmoid(X # theta.T)
p = sigValue >= 0.5
return p
# A loop to test between different values for sigma (reg parameter)
for i, Sigma in enumerate([0, 1, 100]):
# Optimize costFunctionReg
res2 = op.minimize(costFunctionReg, theta, args=(Sigma, XX, y), method=None, jac=gradientReg)
# Get the accuracy of the model
accuracy = 100 * sum(predict(res2.x, XX) == y.ravel()) / y.size
# Get the Error between different weights
error1 = costFunctionReg(res2.x, Sigma, XX, y)
# print the accuracy and error
print('Train accuracy {}% with Lambda = {}'.format(np.round(accuracy, decimals=4), Sigma))
print(error1)
Thanks for all your help!
try out this:
# import library
import pandas as pd
import numpy as np
dataset = pd.read_csv('ex2data2.csv',names = ['Test #1','Test #2','Accepted'])
# splitting to x and y variables for features and target variable
x = dataset.iloc[:,:-1].values
y = dataset.iloc[:,-1].values
print('x[0] ={}, y[0] ={}'.format(x[0],y[0]))
m, n = x.shape
print('#{} Number of training samples, #{} features per sample'.format(m,n))
# import library FeatureMapping
from sklearn.preprocessing import PolynomialFeatures
# We also add one column of ones to interpret theta 0 (x with power of 0 = 1) by
include_bias as True
pf = PolynomialFeatures(degree = 6, include_bias = True)
x_poly = pf.fit_transform(x)
pd.DataFrame(x_poly).head(5)
m,n = x_poly.shape
# define theta as zero
theta = np.zeros(n)
# define hyperparameter λ
lambda_ = 1
# reshape (-1,1) because we just have one feature in y column
y = y.reshape(-1,1)
def sigmoid(z):
return 1/(1+np.exp(-z))
def lr_hypothesis(x,theta):
return np.dot(x,theta)
def compute_cost(theta,x,y,lambda_):
theta = theta.reshape(n,1)
infunc1 = -y*(np.log(sigmoid(lr_hypothesis(x,theta)))) - ((1-y)*(np.log(1 - sigmoid(lr_hypothesis(x,theta)))))
infunc2 = (lambda_*np.sum(theta[1:]**2))/(2*m)
j = np.sum(infunc1)/m+ infunc2
return j
# gradient[0] correspond to gradient for theta(0)
# gradient[1:] correspond to gradient for theta(j) j>0
def compute_gradient(theta,x,y,lambda_):
gradient = np.zeros(n).reshape(n,)
theta = theta.reshape(n,1)
infunc1 = sigmoid(lr_hypothesis(x,theta))-y
gradient_in = np.dot(x.transpose(),infunc1)/m
gradient[0] = gradient_in[0,0] # theta(0)
gradient[1:] = gradient_in[1:,0]+(lambda_*theta[1:,]/m).reshape(n-1,) # theta(j) ; j>0
gradient = gradient.flatten()
return gradient
You can now test your cost and gradient without optimization. Th below code will optimize the model:
# hyperparameters
m,n = x_poly.shape
# define theta as zero
theta = np.zeros(n)
# define hyperparameter λ
lambda_array = [0, 1, 10, 100]
import scipy.optimize as opt
for i in range(0,len(lambda_array)):
# Train
print('======================================== Iteration {} ===================================='.format(i))
optimized = opt.minimize(fun = compute_cost, x0 = theta, args = (x_poly, y,lambda_array[i]),
method = 'TNC', jac = compute_gradient)
new_theta = optimized.x
# Prediction
y_pred_train = predictor(x_poly,new_theta)
cm_train = confusion_matrix(y,y_pred_train)
t_train,f_train,acc_train = acc(cm_train)
print('With lambda = {}, {} correct, {} wrong ==========> accuracy = {}%'
.format(lambda_array[i],t_train,f_train,acc_train*100))
Now you should see output like this :
=== Iteration 0 === With lambda = 0, 104 correct, 14 wrong ==========> accuracy = 88.13559322033898%
=== Iteration 1 === With lambda = 1, 98 correct, 20 wrong ==========> accuracy = 83.05084745762711%
=== Iteration 2 === With lambda = 10, 88 correct, 30 wrong ==========> accuracy = 74.57627118644068%
=== Iteration 3 === With lambda = 100, 72 correct, 46 wrong ==========> accuracy = 61.016949152542374%

How can I fit an equation which is not a function

Given data points in the xy plane, I would like to use scipy.optimize.leastsq to find fit parameters for an ellipse (which cannot be written as a function of x and y). I tried setting the entire equation equal to zero, and then fitting this function, but the fit is failing to converge with error output
"The relative error between two consecutive iterates is at most 0.000000."
The code is shown below, as well as the output. The fitter clearly does not find any reasonable parameters. My question is whether or not this is a problem with scipy.optimize.leastsq, or whether the "trick" of setting the function equal to zero and instead fitting that is not valid.
from scipy.optimize import leastsq, curve_fit
import numpy as np
import matplotlib.pyplot as plt
def function(x,y,theta,smaj,smin):
xp = np.cos(theta)*x - np.sin(theta)*y
yp = np.sin(theta)*x + np.cos(theta)*y
z = ((xp)**2)/smaj**2 + ((yp)**2)/smin**2
return z
def g(x,y,smaj,smin):
return x*x/smaj**2 + y*y/smin**2
def window(array,alt,arange):
arr = [array[i] for i,a in enumerate(alt) if a > arange[0] and a < arange[1]]
return np.asarray(arr)
def fitter(p0,x,y,func,errfunc,err):
# the fitter function
out = leastsq(errfunc,p0,args=(x,y,func,err),full_output=1)
pfinal = out[0]
covar = out[1]
mydict = out[2]
mesg = out[3]
ier = out[4]
resids = mydict['fvec']
chisq = np.sum(resids**2)
degs_frdm = len(x)-len(pfinal)
reduced_chisq = chisq/degs_frdm
ls = [pfinal,covar,mydict,mesg,ier,resids,chisq,degs_frdm,reduced_chisq]
print('fitter status: ', ier, '-- aka -- ', mesg)
i = 0
if covar is not None:
if (ier == 1 or ier == 2 or ier == 3 or ier == 4):
for u in pfinal:
print ('Param', i+1, ': ',u, ' +/- ', np.sqrt(covar[i,i]))
i = i + 1
print ('reduced chisq',reduced_chisq)
else:
print('fitter failed')
return ls
def func(x,y,p):
x = x-p[3]
y = y-p[4]
xp = np.cos(p[0])*(x) - np.sin(p[0])*(y)
yp = np.sin(p[0])*(x) + np.cos(p[0])*(y)
z = ((xp)**2)/p[1]**2 + ((yp)**2)/p[2]**2 - 1
return z
def errfunc(p,x,y,func,err):
return (y-func(x,y,p))/err
t = np.linspace(0,2*np.pi,100)
xx = 5*np.cos(t); yy = np.sin(t)
p0 = [0,5,1,0,0]
sigma = np.ones(len(xx))
fit = fitter(p0,xx,yy,func,errfunc,sigma)
params = fit[0]
covariance = fit[1]
residuals = fit[5]
t = np.linspace(0,2*np.pi,100)
xx = 5*np.cos(t); yy = np.sin(t)
plt.plot(xx,yy,'bx',ms = 4)
xx = np.linspace(-10,10, 1000)
yy = np.linspace(-5, 5, 1000)
newx = []
newy = []
for x in xx:
for y in yy:
if 0.99 < func(x,y,params) < 1.01:
#if g(x,y,5,1) == 1:
newx.append(x)
newy.append(y)
plt.plot(newx,newy,'kx',ms = 1)
plt.show()
The blue crosses are the actual data, and the black line is the fitters guess at the parameters.

Categories