I have been trying to write Python code for logistic regression, but the results show an unexpectedly high value of the cost function. I created random variables X and Y and added a noise term to Y which flips elements of Y based on the probability theta. This is my code:
import numpy as np
from scipy.stats import bernoulli

rg = np.random.default_rng(100)

def data_generate(n, m, theta):
    X_0 = np.ones((n, 1))
    X = np.random.normal(loc=0.0, scale=1.0, size=(n, m))
    X = np.concatenate((X_0, X), axis=1)
    beta = rg.random((m+1, 1))
    Y = np.zeros((n, 1))
    P = 1.0/(1.0 + np.exp(-np.dot(X, beta)))
    for i in range(len(P)):
        if P[i] >= 0.5:
            Y[i] = 1
        else:
            Y[i] = 0
    # Noise addition
    noise = bernoulli.rvs(size=(n, 1), p=theta)
    for j in range(len(noise)):
        if noise[i] == 1:
            Y[i] = int(not(Y[i]))
        else:
            pass
    return X, Y, beta
def Gradient_Descent(X, Y, k, tollerence, learning_rate):
    n, m = np.shape(X)
    beta = rg.random((m, 1))
    costs = []
    initial_cost = 0.0
    for i in range(k):
        Y_pred = 1.0/(1.0 + np.exp(-np.dot(X, beta)))
        cost = np.mean(np.dot(Y.T, np.log(Y_pred)) + np.dot((1-Y).T, np.log(1-Y_pred)))
        if (abs(cost - initial_cost) <= tollerence):
            break
        else:
            beta = beta - learning_rate*(np.mean(np.dot(X.T, (Y_pred - Y))))
            initial_cost = cost
            costs.append(cost)
    return cost, beta, i
X = data_generate(200, 3, 0.1)[0]
Y = data_generate(200, 3, 0.1)[1]
Gradient_Descent(X, Y, 10000, 1e-6, 0.01)
# Output of code:
(-154.7689765716959,
 array([[-0.02218003],
        [-0.1182535 ],
        [ 0.1169462 ],
        [ 0.58610747]]),
 14)
Please tell me what the problem with the code is.
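For reference, a minimal vectorized sketch of the averaged logistic loss and its gradient (my own illustration with arbitrary names, not a fix to the code above); note that both are normalized by the number of samples:

import numpy as np

def logistic_cost_and_grad(X, Y, beta):
    # sigmoid predictions, shape (n, 1)
    Y_pred = 1.0 / (1.0 + np.exp(-X @ beta))
    n = X.shape[0]
    # averaged negative log-likelihood (a single scalar)
    cost = -np.sum(Y * np.log(Y_pred) + (1 - Y) * np.log(1 - Y_pred)) / n
    # gradient with respect to beta, shape (m + 1, 1)
    grad = X.T @ (Y_pred - Y) / n
    return cost, grad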
I'm trying to write a function to evaluate the probability mass function for the bivariate Poisson distribution.
This is easy when all of the parameters (x, y, theta1, theta2, theta0) are scalars, but tricky to scale up without loops to allow these parameters to be vectors. I need it to scale such that, for:
theta0 being a scalar - the "correlation parameter" in the equation
theta1 and theta2 having length l
x, y both having length n
the output array would have shape (l, n, n). For example, a slice [j, :, :] from the output array would look like:
I think I've figured out the first part (the constant, before the summation):
import numpy as np
from scipy.special import factorial

def constant(theta1, theta2, theta0, x, y):
    exponential_part = np.exp(-(theta1 + theta2 + theta0)).reshape(-1, 1, 1)
    x = np.tile(x, (len(x), 1)).transpose()
    y = np.tile(y, (len(y), 1))
    double_factorial = (np.power(np.array(theta1).reshape(-1, 1, 1), x)/factorial(x)) * \
                       (np.power(np.array(theta2).reshape(-1, 1, 1), y)/factorial(y))
    return exponential_part * double_factorial
But I'm struggling with the summation part. How can I vectorize a summation where the limits depend on variable arrays?
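For reference, the bivariate Poisson PMF being implemented here, reconstructed from the code in this question and the answer below, is:

$$
P(X = x, Y = y) = e^{-(\theta_1 + \theta_2 + \theta_0)}\,\frac{\theta_1^{x}}{x!}\,\frac{\theta_2^{y}}{y!}\sum_{i=0}^{\min(x,\,y)} \binom{x}{i}\binom{y}{i}\, i!\left(\frac{\theta_0}{\theta_1\theta_2}\right)^{i}
$$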
I think I have this figured out, based on the approach that @w-m suggests: calculate every possible summation term which could appear, based on the maximum x or y value that appears, and use a mask to get rid of the ones you don't want. Assuming your x and y terms go from 0 to N in consecutive order, this calculates up to three times more terms than are actually required, but that is offset by getting to use vectorization.
Reference implementation
I wrote this by first writing a pure-Python reference implementation, which just implements your problem using loops. With 4 nested loops, it's not exactly fast, but it's handy to have while testing the numpy version.
import numpy as np
from scipy.special import factorial, comb
import operator as op
from functools import reduce

def choose(n, r):
    # https://stackoverflow.com/a/4941932/530160
    r = min(r, n-r)
    numer = reduce(op.mul, range(n, n-r, -1), 1)
    denom = reduce(op.mul, range(1, r+1), 1)
    return numer // denom  # or / in Python 2

def reference_impl_constant(s_theta1, s_theta2, s_theta0, s_x, s_y):
    # Cast to float to prevent overflow
    s_theta1 = float(s_theta1)
    s_theta2 = float(s_theta2)
    s_theta0 = float(s_theta0)
    s_x = float(s_x)
    s_y = float(s_y)
    term1 = np.exp(-(s_theta1 + s_theta2 + s_theta0))
    term2 = (s_theta1 ** s_x / factorial(s_x))
    term3 = (s_theta2 ** s_y / factorial(s_y))
    assert term1 >= 0
    assert term2 >= 0
    assert term3 >= 0
    return term1 * term2 * term3

def reference_impl_constant_loop(theta1, theta2, theta0, x, y):
    theta_len = theta1.shape[0]
    xy_len = x.shape[0]
    constant_array = np.zeros((theta_len, xy_len, xy_len))
    for i in range(theta_len):
        for j in range(xy_len):
            for k in range(xy_len):
                s_theta1 = theta1[i]
                s_theta2 = theta2[i]
                s_theta0 = theta0
                s_x = x[j]
                s_y = y[k]
                constant_term = reference_impl_constant(s_theta1, s_theta2, s_theta0, s_x, s_y)
                assert constant_term >= 0
                constant_array[i, j, k] = constant_term
    return constant_array

def reference_impl_summation(s_theta1, s_theta2, s_theta0, s_x, s_y):
    sum_ = 0
    for i in range(min(s_x, s_y) + 1):
        sum_ += choose(s_x, i) * choose(s_y, i) * factorial(i) * ((s_theta0/s_theta1/s_theta2) ** i)
    assert sum_ >= 0
    return sum_

def reference_impl_summation_loop(theta1, theta2, theta0, x, y):
    theta_len = theta1.shape[0]
    xy_len = x.shape[0]
    summation_array = np.zeros((theta_len, xy_len, xy_len))
    for i in range(theta_len):
        for j in range(xy_len):
            for k in range(xy_len):
                s_theta1 = theta1[i]
                s_theta2 = theta2[i]
                s_theta0 = theta0
                s_x = x[j]
                s_y = y[k]
                summation_term = reference_impl_summation(s_theta1, s_theta2, s_theta0, s_x, s_y)
                assert summation_term >= 0
                summation_array[i, j, k] = summation_term
    return summation_array

def reference_impl(theta1, theta2, theta0, x, y):
    # all array inputs must be 1D
    assert len(theta1.shape) == 1
    assert len(theta2.shape) == 1
    assert len(x.shape) == 1
    assert len(y.shape) == 1
    # theta vectors must have same length
    theta_len = theta1.shape[0]
    assert theta2.shape[0] == theta_len
    # x and y must have same length
    xy_len = x.shape[0]
    assert y.shape[0] == xy_len
    # theta0 is scalar
    assert isinstance(theta0, (int, float))
    constant_array = np.zeros((theta_len, xy_len, xy_len))
    output = np.zeros((theta_len, xy_len, xy_len))
    constant_array = reference_impl_constant_loop(theta1, theta2, theta0, x, y)
    summation_array = reference_impl_summation_loop(theta1, theta2, theta0, x, y)
    output = constant_array * summation_array
    return output
Numpy implementation
I split the implementation of this across two functions.
The fast_constant() function calculates everything to the left of the summation symbol. The fast_summation() function calculates everything inside the summation symbol.
import numpy as np
from scipy.special import factorial, comb

def fast_summation(theta1, theta2, theta0, x, y):
    x = np.tile(x, (len(x), 1)).transpose()
    y = np.tile(y, (len(y), 1))
    sum_limit = np.minimum(x, y)
    max_sum_limit = np.max(sum_limit)
    i = np.arange(max_sum_limit + 1).reshape(-1, 1, 1)
    summation_mask = (i <= sum_limit)
    theta_ratio = (theta0 / (theta1 * theta2)).reshape(-1, 1, 1, 1)
    theta_to_power = np.power(theta_ratio, i)
    terms = comb(x, i) * comb(y, i) * factorial(i) * theta_to_power
    # mask out terms which aren't part of sum
    terms *= summation_mask
    # axis 0 is theta
    # axis 1 is i
    # axis 2 & 3 are x and y
    # so sum across axis 1
    terms = terms.sum(axis=1)
    return terms

def fast_constant(theta1, theta2, theta0, x, y):
    theta1 = theta1.astype('float64')
    theta2 = theta2.astype('float64')
    exponential_part = np.exp(-(theta1 + theta2 + theta0)).reshape(-1, 1, 1)
    # x and y must be 1D
    assert len(x.shape) == 1
    assert len(y.shape) == 1
    # x and y must have same shape
    assert x.shape == y.shape
    x_len, y_len = x.shape[0], y.shape[0]
    x = x.reshape((x_len, 1))
    y = y.reshape((1, y_len))
    double_factorial = (np.power(np.array(theta1).reshape(-1, 1, 1), x)/factorial(x)) * \
                       (np.power(np.array(theta2).reshape(-1, 1, 1), y)/factorial(y))
    return exponential_part * double_factorial

def fast_impl(theta1, theta2, theta0, x, y):
    return fast_summation(theta1, theta2, theta0, x, y) * fast_constant(theta1, theta2, theta0, x, y)
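A quick shape check (with made-up parameter values, just to confirm the output matches the requested (l, n, n) layout):

# Made-up example values: l = 3 theta pairs, n = 5 support points.
theta1 = np.array([1.0, 2.0, 3.0])
theta2 = np.array([1.5, 2.5, 3.5])
theta0 = 0.5
x = np.arange(5)
y = np.arange(5)

print(fast_impl(theta1, theta2, theta0, x, y).shape)  # expected: (3, 5, 5)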
Benchmarking
Assuming that X and Y range from 0 to 20, and that theta is centered somewhere inside that range, I get the result that the numpy version is roughly 280 times faster than the pure python reference.
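The exact benchmark script isn't reproduced here, but a rough sketch of how such a comparison could be run (the values below are illustrative assumptions, not the inputs I actually used) looks like this:

import numpy as np
import timeit

# Illustrative inputs only: x and y from 0 to 20, thetas somewhere inside that range.
x = np.arange(21)
y = np.arange(21)
theta1 = np.linspace(5.0, 15.0, 8)
theta2 = np.linspace(5.0, 15.0, 8)
theta0 = 1.0

t_ref = timeit.timeit(lambda: reference_impl(theta1, theta2, theta0, x, y), number=10)
t_fast = timeit.timeit(lambda: fast_impl(theta1, theta2, theta0, x, y), number=10)
print(f"reference: {t_ref:.3f}s, fast: {t_fast:.3f}s, speedup: {t_ref / t_fast:.0f}x")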
Numerical stability
I'm unsure how numerically stable this is. For example, when I center theta at 100, I get a floating-point overflow. Typically, when computing an expression which has lots of choose and factorial expressions inside it, you'll use some mathematical equivalent which results in smaller intermediate sums. In this case I have so little understanding of the math that I don't know how you'd do that.
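One general numerical trick that might help, though I haven't validated it against the vectorized code above, is to evaluate each term in log space with gammaln and combine them with logsumexp, which keeps the intermediate values small. A sketch for the scalar case only, assuming theta0 > 0:

import numpy as np
from scipy.special import gammaln, logsumexp

def log_pmf_scalar(theta1, theta2, theta0, x, y):
    # log of the constant part: -(t1+t2+t0) + x*log(t1) - log(x!) + y*log(t2) - log(y!)
    log_const = (-(theta1 + theta2 + theta0)
                 + x * np.log(theta1) - gammaln(x + 1)
                 + y * np.log(theta2) - gammaln(y + 1))
    i = np.arange(min(x, y) + 1)
    # log of choose(x, i) * choose(y, i) * i! * (t0 / (t1 * t2)) ** i
    log_terms = (gammaln(x + 1) + gammaln(y + 1)
                 - gammaln(i + 1) - gammaln(x - i + 1) - gammaln(y - i + 1)
                 + i * (np.log(theta0) - np.log(theta1) - np.log(theta2)))
    return log_const + logsumexp(log_terms)  # exp() of this gives the pmf value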
So I checked other questions and answers about this, but I didn't understand the reason. Could you please help me with this?
I'm learning linear regression, and I implemented the code for linear regression with two variables.
This implementation should predict the sum of two numbers, but it's giving an error.
Here is the implemented code:
import matplotlib.pyplot as plt
import numpy as np

x = np.array([[1,1],[1,2],[2,3],[3,4],[4,5],[5,6]])
y = np.array([2,3,5,7,9,11])

#hypothesis
def hypothesis(theta, x):
    return theta[0] + theta[1]*x[0] + theta[2]*x[1]

#cost function J(t0, t1, t2)
def cost(theta, x, y):
    m = x.shape[0]
    error = 0
    for i in range(m):
        d = x[i]
        hx = hypothesis(theta, d)
        error = error + (hx - y[i])**2
    return error

#differentiation of cost function
def diffGradient(theta, x, y):
    grad = np.zeros((3,))
    m = x.shape[0]
    for i in range(m):
        hx = hypothesis(theta, x)
        grad[0] = grad[0] + (hx - y[i])
        grad[1] = grad[1] + (hx - y[i])*x[0]
        grad[2] = grad[2] + (hx - y[i])*x[1]
    return 0

#gradient descent function
def gradientDescent(x, y, learning_rate = 0.001):
    theta = [-2.0,0.0,1.0]
    iter = 100
    error_list = []
    theta_list = []
    for i in range(iter):
        d = x[i]
        grad = diffGradient(theta, d, y)
        e = cost(theta, d, y)
        error_list.append(e)
        theta_list.append((theta[0],theta[1],theta[2]))
        #simultaneous update
        theta[0] = theta[0] - learning_rate*grad[0]
        theta[1] = theta[1] - learning_rate*grad[1]
        theta[2] = theta[2] - learning_rate*grad[2]
    return theta, theta_list, error_list

final_theta, theta_list, error_list = gradientDescent(x,y)
After the above line, I'm getting this error
---------------------------------------------------------------------------
IndexError Traceback (most recent call last)
<ipython-input-56-bda7687d0af9> in <module>
----> 1 final_theta, theta_list, error_list = gradientDescent(x,y)
<ipython-input-55-033133fbfbd5> in gradientDescent(x, y, learning_rate)
8 d = x[i]
9 grad = diffGradient(theta, d, y)
---> 10 e = cost(theta, d, y)
11 error_list.append(e)
12 theta_list.append((theta[0],theta[1],theta[2]))
<ipython-input-41-6a07f4b81c9c> in cost(theta, x, y)
5 for i in range(m):
6 d = x[i]
----> 7 hx = hypothesis(theta, d)
8 error = error + (hx - y[i])**2
9 return error
<ipython-input-27-43ce9d7c567b> in hypothesis(theta, x)
1 #hypothesis
2 def hypothesis(theta, x):
----> 3 return theta[0] + theta[1]*x[0] + theta[2]*x[1]
IndexError: invalid index to scalar variable.
I don't know what I'm doing wrong. Any help would be appreciated.
What's the x that you pass to gradientDescent(x, y)?
Look at the traceback and try to figure out which variable has the problem: you can't index a scalar (a number); it has to be a list or array. Trace the problem variable back up through your code.
In the traceback:
In the problem line you use x[0] and x[1]. What's x at this point?
In the calling function it's d, which is set with d = x[i]
In gradientDescent the passed variable is again called d, and set as d = x[i]
So you have 3 levels of indexing. Does the original x support that?
You have to understand the problem before you try a fix.
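To see that error in isolation, here is a minimal sketch (my own toy example, not your data) of what happens with three levels of indexing into a 2D array:

import numpy as np

x = np.array([[1, 1], [1, 2]])  # 2D array, like your x
row = x[0]      # first level: a 1D row, array([1, 1])
elem = row[0]   # second level: a scalar, 1
elem[0]         # third level: IndexError: invalid index to scalar variable.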
When I use this code for single-variable linear regression, theta is evaluated correctly, but on the multi-variable version it gives weird output for theta.
I am trying to convert my Octave code, which I wrote when I took Andrew Ng's course.
This is the main calling file:
m = data.shape[0]
a = np.array(data[0])
a.shape = (m,1)
b = np.array(data[1])
b.shape = (m, 1)
x = np.append(a, b, axis=1)
y = np.array(data[2])
lr = LR.LinearRegression()
[X, mu, sigma] = lr.featureNormalize(x)
z = np.ones((m, 1), dtype=float)
X = np.append(z, X, axis=1)
alpha = 0.01
num_iters = 400
theta = np.zeros(shape=(3,1))
[theta, J_history] = lr.gradientDescent(X, y, theta, alpha, num_iters)
print(theta)
And here are the contents of the class:
import warnings
import numpy as np

class LinearRegression:
    def featureNormalize(self, data):  # this normalizes the features
        data = np.array(data)
        x_norm = data
        mu = np.zeros(shape=(1, data.shape[1]))  # creates mu vector filled with zeros
        sigma = np.zeros(shape=(1, data.shape[1]))
        for i in range(0, data.shape[1]):
            mu[0, i] = np.mean(data[:, i])
            sigma[0, i] = np.std(data[:, i])
        for i in range(0, data.shape[1]):
            x_norm[:, i] = np.subtract(x_norm[:, i], mu[0, i])
            x_norm[:, i] = np.divide(x_norm[:, i], sigma[0, i])
        return [x_norm, mu, sigma]

    def gradientDescent(self, X, y, theta, alpha, num_iters):
        m = y.shape[0]
        J_history = np.zeros(shape=(num_iters, 1))
        for i in range(0, num_iters):
            predictions = X.dot(theta)  # X is 47*3, theta is 3*1, predictions is 47*1
            theta = np.subtract(theta, (alpha / m) * np.transpose((np.transpose(np.subtract(predictions, y))).dot(X)))  # 1*97 into 97*3
            J_history[i] = self.computeCost(X, y, theta)
        return [theta, J_history]

    def computeCost(self, X, y, theta):
        warnings.filterwarnings('ignore')
        m = X.shape[0]
        J = 0
        predictions = X.dot(theta)
        sqrErrors = np.power(predictions - y, 2)
        J = 1 / (2 * m) * np.sum(sqrErrors)
        return J
I expected a theta that would be a 3*1 matrix. According to Andrew's course, my Octave implementation was producing this theta:
334302.063993
100087.116006
3673.548451
But in the Python implementation I am getting very weird output:
[[384596.12996714 317274.97693463 354878.64955708 223121.53576488
519238.43603216 288423.05420641 302849.01557052 191383.45903309
203886.92061274 233219.70871976 230814.42009498 333720.57288972
317370.18827964 673115.35724932 249953.82390212 432682.6678475
288423.05420641 192249.97844569 480863.45534211 576076.72380674
243221.70859887 245241.34318985 233604.4010228 249953.82390212
551937.2817908 240336.51632605 446723.93690857 451051.7253178
456822.10986344 288423.05420641 336509.59208678 163398.05571747
302849.01557052 557707.6...................... this goes on for long
The same code works absolutely fine on the single-variable dataset. It also works fine in Octave, but it seems like I have been missing some point for 2+ hours now. Happy to get your help.
In gradientDescent, try the following as the second line of the for loop:
theta = theta - (alpha/m) * X.T.dot(X.dot(theta) - y)
Also, if you want to add a column of ones, it is easier to do it like so:
np.c_[np.ones((m,1)),data]
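For what it's worth, one likely cause of the huge theta (an educated guess, since the dataset isn't shown) is broadcasting: if y has shape (m,) while predictions has shape (m, 1), then predictions - y silently becomes an (m, m) matrix instead of an (m, 1) column. A minimal sketch:

import numpy as np

predictions = np.zeros((5, 1))  # column vector, shape (5, 1)
y = np.zeros(5)                 # 1D vector, shape (5,)
print((predictions - y).shape)  # (5, 5) -- broadcasting, not an elementwise difference
# Reshaping y to (m, 1), e.g. y = y.reshape(-1, 1), avoids this.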
I'm trying to implement baseline ALS subtraction in PyTorch so that I can run it on my GPU, but I am running into problems because torch.gesv gives a different result than scipy.sparse.linalg.spsolve. Here is my code for SciPy:
import numpy as np
from scipy import sparse
from scipy.sparse.linalg import spsolve

def baseline_als(y, lam, p, niter=10):
    L = len(y)
    D = sparse.diags([1,-2,1],[0,-1,-2], shape=(L,L-2))
    w = np.ones(L)
    for i in range(niter):
        W = sparse.spdiags(w, 0, L, L)
        Z = W + lam * D.dot(D.transpose())
        z = spsolve(Z, w*y)
        w = p * (y > z) + (1-p) * (y < z)
    return z
And here is my code for PyTorch:
import numpy as np
import torch

def baseline_als_pytorch(y, lam, p, niter=10):
    diag = torch.tensor(np.repeat(1, L))
    diag = torch.diag(diag, 0)
    diag_minus_one = torch.tensor(np.repeat(-2, L - 1))
    diag_minus_one = torch.diag(diag_minus_one, -1)
    diag_minus_two = torch.tensor(np.repeat(1, L - 2))
    diag_minus_two = torch.diag(diag_minus_two, -2)
    D = diag + diag_minus_one + diag_minus_two
    D = D[:, :L - 2].double()
    w = torch.tensor(np.repeat(1, L)).double()
    for i in range(10):
        W = diag.double()
        Z = W + lam * torch.mm(D, D.permute(1, 0))
        z = torch.gesv(w * y, Z)
        z = z[0].squeeze()
        w = p * (y > z).double() + (1 - p) * (y < z).double()
    return z
Sorry that the PyTorch code looks so bad; I'm just starting out with it.
I've confirmed that Z, w, and y are all the same going into both SciPy and PyTorch, and that z differs between them right after I try to solve the system of equations.
Thanks for the comment, here is an example:
I use 100000 for lam and 0.001 for p.
Using the dummy input: y = (5,5,5,5,5,10,10,5,5,5,10,10,10,5,5,5,5,5,5,5),
I get (3.68010263, 4.90344214, 6.12679489, 7.35022406, 8.57384278, 9.79774074, 11.02197199, 12.2465927 , 13.47164891, 14.69711435,15.92287813, 17.14873257, 18.37456982, 19.60038184, 20.82626043,22.05215157, 23.27805103, 24.50400438, 25.73010693, 26.95625922) from scipy and
(6.4938312 , 6.46912395, 6.44440175, 6.41963499, 6.39477958,6.36977727, 6.34455582, 6.31907933, 6.29334844, 6.26735058, 6.24106029, 6.21443939, 6.18748732, 6.16024137, 6.13277694,6.10515785, 6.07743658, 6.04965455, 6.02184242, 5.99402035) from pytorch.
This is with just one iteration of the loop. Scipy is correct, pytorch is not.
Interestingly, if I use a shorter dummy input (5,5,5,5,5,10,10,5,5,5), I get the same answer from both. My real input is 1011 dimensional.
Your PyTorch function is wrong (you never update W in the first line inside the for loop). Moreover, I get the result you say you got from PyTorch from SciPy too.
Scipy version
def baseline_als(y, lam=100000, p=1e-3, niter=1):
    L = len(y)
    D = sparse.diags([1,-2,1],[0,-1,-2], shape=(L,L-2))
    w = np.ones(L)
    for i in range(niter):
        W = sparse.spdiags(w, 0, L, L)
        Z = W + lam * D.dot(D.transpose())
        z = spsolve(Z, w*y)
        w = p * (y > z) + (1-p) * (y < z)
    return z
Equivalent in PyTorch:
def baseline_als_pytorch(y, lam=100000, p=1e-3, niter=1):
    L = len(y)
    D = torch.diag(torch.ones(L), 0) + torch.diag(-2 * torch.ones(L-1), -1) + torch.diag(torch.ones(L-2), -2)
    D = D[:, :L-2].double()
    w = torch.ones(L).double()
    for i in range(niter):
        W = torch.diag(w)
        Z = W + lam * torch.mm(D, D.permute(1, 0))
        z = torch.gesv(w * y, Z)
        z = z[0].squeeze()
        w = p * (y > z).double() + (1 - p) * (y < z).double()
    return z
When I feed them with y = np.array([5,5,5,5,5,10,10,5,5,5,10,10,10,5,5,5,5,5,5,5], dtype='float64'):
scipy:
array([6.4938312 , 6.46912395, 6.44440175, 6.41963499, 6.39477958,
6.36977727, 6.34455582, 6.31907933, 6.29334844, 6.26735058,
6.24106029, 6.21443939, 6.18748732, 6.16024137, 6.13277694,
6.10515785, 6.07743658, 6.04965455, 6.02184242, 5.99402035])
pytorch:
tensor([6.4938, 6.4691, 6.4444, 6.4196, 6.3948, 6.3698, 6.3446, 6.3191, 6.2933,
6.2674, 6.2411, 6.2144, 6.1875, 6.1602, 6.1328, 6.1052, 6.0774, 6.0497,
6.0218, 5.9940], dtype=torch.float64)
If I increase niter to 10:
scipy:
array([5.00202571, 5.00199038, 5.00195504, 5.00191963, 5.0018841 ,
5.00184837, 5.00181235, 5.00177598, 5.00173927, 5.00170221,
5.00166475, 5.00162685, 5.00158851, 5.00154979, 5.00151077,
5.00147155, 5.0014322 , 5.00139276, 5.00135329, 5.0013138 ])
pytorch:
tensor([5.0020, 5.0020, 5.0020, 5.0019, 5.0019, 5.0018, 5.0018, 5.0018, 5.0017,
5.0017, 5.0017, 5.0016, 5.0016, 5.0015, 5.0015, 5.0015, 5.0014, 5.0014,
5.0014, 5.0013], dtype=torch.float64)
And it checks out with the baseline ALS code you linked to in your question.
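One side note for anyone running this on a recent PyTorch: torch.gesv has since been removed; as far as I know, the replacement is torch.linalg.solve, which takes the matrix first and returns the solution directly, so the solve line would become something like:

# Rough equivalent on newer PyTorch versions (an assumption to verify against your installed version);
# torch.gesv took (B, A) and returned a tuple, torch.linalg.solve takes (A, B) and returns the solution.
z = torch.linalg.solve(Z, w * y)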