Something is wrong with my logistic regression code - python

def calc_log(values):
temp1 = np.exp(-values)
temp2 = 1+temp1
return 1/temp2
def logisticregression(x, y, a):
#a is the learning rate
m = len(y)
x = np.matrix(x)
x = np.c_[np.ones((x.shape[0], 1)), x]
# adds a row of ones at the start of the x matrix which represents x0 (which is multiplied by theta[0])
theta = np.matrix(np.zeros(x.shape[1])).T
#makes a list of 0s as starting theta values with the same number of features that the x matrix has
y = np.matrix(y)
while True:
calculation = x * theta #multiplies x and theta
hypo = calc_log(calculation) #f(z) = 1/(1+e^-z), plugs calculation into f(z)
difference = hypo - y.T #calculates the difference between the predicted y values and the real y values
vals = x.T * difference #multiplies the difference by each feature for every piece of training data
lasttheta = theta.copy()
theta -= (vals*(a/m)) #multiplies the matrix of theta values by a/m and negate it from the previous theta values
array = lasttheta - theta
array = [j for j in array if abs(float(j)) > 0.001] #checks if the difference between each theta value and the previous theta value is more than 0.001.
if not array:
break #Breaks from the loop if the difference between each theta value and its previous theta value is less than 0.001
return theta
X_vals = np.array([[3, 5, 6], [4, 2, 4], [8, 6, 2]])
Y_vals = np.array([1, 0, 1])
value = logisticregression(X_vals, Y_vals, 0.1)
the coefficients returned from the logisticregression function are incorrect however I'm not sure what the error is. Can anyone help?
Formula for gradient descent of logistic regression:

The problem seems to be confusing 'i' with 'j'. You can check it by setting m=1 and re-read the code.
import numpy as np
def calc_log(values):
temp1 = np.exp(-values)
temp2 = 1+temp1
return 1/temp2
def logisticregression(x, y, a):
#a is the learning rate
m = len(y)
# x = np.matrix(x) # x has been a 2d array
x = np.c_[np.ones((x.shape[0], 1)), x]
# adds a row of ones at the start of the x matrix which represents x0 (which is multiplied by theta[0])
theta = np.zeros(x.shape[1])
#makes a list of 0s as starting theta values with the same number of features that the x matrix has
# y = np.matrix(y) # y should be a vector here
while True:
calculation = theta # x.T #multiplies x and theta, or you can use np.dot(theta, x.T) for python < 3.5
hypo = calc_log(calculation) #f(z) = 1/(1+e^-z), plugs calculation into f(z)
difference = hypo - y.T #calculates the difference between the predicted y values and the real y values
vals = np.multiply(x, difference[:, np.newaxis]) #multiplies the difference by each feature for every piece of training data
lasttheta = theta.copy()
theta -= (vals.sum(0)*(a/m)) #multiplies the matrix of theta values by a/m and negate it from the previous theta values
array = lasttheta - theta
array = [j for j in array if abs(float(j)) > 0.001] #checks if the difference between each theta value and the previous theta value is more than 0.001.
loss = np.abs(hypo-y.T).sum()
print(f'loss: {loss}')
if not array:
break #Breaks from the loop if the difference between each theta value and its previous theta value is less than 0.001
return theta
X_vals = np.array([[3, 5, 6], [4, 2, 4], [8, 6, 2]])
Y_vals = np.array([1, 0, 1])
value = logisticregression(X_vals, Y_vals, 0.1)

Related

Implementation of backpropagation in Python

I'm following the Andrew-Ng course on Machine Learning and I'm currently doing the week 5 exercise.
I've found myself stuck on the implementation of the backpropagation algorithm, due to the fact that the relative difference, compared to numerical gradient, is very high (order of 1e-1), but I can't find any error within my implementation, so I'm gently asking if someone could take a look at it and explain what I did wrong.
Forward propagation:
def forward_propagation(thetas, X, history=False):
activation_arr = []
a = X # X is the array of the first activation values
for k in range(0, len(thetas)):
a = add_intercept(a) # add the bias unit
a = sigmoid(a # thetas[k].T)
if history:
activation_arr.append(a)
return activation_arr if history else a
Backpropagation:
def gradient_nn(thetas, X, y, num_labels, reg_lambda=None):
n_examples = X.shape[0]
Y = np.zeros(( # creates a n_examples X num_labels matrix
n_examples, # n of examples
num_labels
))
for i in range(n_examples):
Y[i, y[i, 0]] = 1 # the index corresponding to the correct label for each row has value = 1
# add intercepted X to the activation array
activation_arr = [add_intercept(X)] + forward_propagation(thetas, X, history=True)
sigma = [activation_arr[-1] - Y] # sigma^L = a^L - y
delta = [sigma[-1].T # activation_arr[-2]] # find delta for the first row
thetas_grad = []
# Calculate sigma and delta
for idx in range(1, len(thetas)): # skip last iteration
sigma = [
(sigma[0] # thetas[-idx][:, 1:]) * partial_derivative(activation_arr[-1-idx])
] + sigma
delta = [
sigma[0].T # activation_arr[-2-idx]
] + delta
return [np.sum(d) / n_examples for d in thetas_grad]
Partial derivative:
def partial_derivative(a):
return a * (1 - a) # element wise multiplication
Numerical gradient:
def compute_numerical_gradient(cost_function, thetas):
# Unroll parameters
nn_params = unroll_thetas(thetas)
num_grad = np.zeros(nn_params.shape)
perturb = np.zeros(nn_params.shape)
shapes = [theta.shape for theta in thetas]
epsilon = 1e-4 # not the one of random initialization
for p in range(nn_params.shape[0]):
# Set perturbation vector
perturb[p] = epsilon
minus_theta = nn_params - perturb
plus_theta = nn_params + perturb
# --- Roll params back in order to use the cost function ---
minus_theta = roll_thetas(minus_theta, shapes)
plus_theta = roll_thetas(plus_theta, shapes)
# calculate the loss of the cost function
minus_loss = cost_function(minus_theta)
plus_loss = cost_function(plus_theta)
# Compute Numerical Gradient
num_grad[p] = (plus_loss - minus_loss) / (2 * epsilon)
perturb[p] = 0
num_grad = roll_thetas(num_grad, shapes)
return [np.sum(num_g) for num_g in num_grad]
Cost function:
def J_nn(num_labels, reg_lambda=None):
def non_reg_func(thetas, X, y):
n_examples = X.shape[0]
Y = np.zeros(( # creates a n_examples X num_labels matrix
n_examples, # n of examples
num_labels
))
for i in range(n_examples):
Y[i, y[i, 0]] = 1 # the index corresponding to the correct label for each row has value = 1
prediction = forward_propagation(thetas, X)
return np.sum(np.sum(-Y * np.log(prediction) - (1 - Y) * np.log(1 - prediction))) / n_examples
if reg_lambda is None:
func = non_reg_func
else: # regularization
def func(thetas, X, y):
cost = non_reg_func(thetas, X, y)
for theta in thetas: # regularize for every layer
theta = theta[1:] # remove bias unit
cost = cost + (reg_lambda / (2 * y.shape[0])) * np.sum(np.sum(theta[:, ] ** 2))
return cost
return func
Checking backpropagation with numerical gradient:
def check_nn_gradients(reg_lambda=None):
"""
Creates a small neural network (max 8 x 8 x 7 x 8) and checks that
the implementation of the backpropagation algorithm is good
"""
#n_examples, sizes = random.randint(5, 10), [random.randint(2, 8), random.randint(2, 8), random.randint(1, 8)]
n_examples, sizes = 5, [8, 8, 5, 4]
n_labels = sizes[-1] # Last size is equal to the number of labels
init_epsilon = 0.0001
thetas = random_init_thetas(sizes, init_epsilon)
X = np.array(
random_init_thetas([sizes[0]-1, n_examples], init_epsilon)
).squeeze() # We squeeze it because random_init_thetas returns a 3D array, but we want X to be 2D
y = np.array([random.randint(0, n_labels-1) for _ in X])
y = y[:, np.newaxis]
inner_cost = lambda _thetas: J_nn(n_labels, reg_lambda)(_thetas, X, y)
gradients = gradient_nn(thetas, X, y, n_labels, 0)
unrolled_gradients = unroll_thetas(gradients)
print(unrolled_gradients)
# finite difference method
grad_checking_epsilon = 1e-4
num_grad = compute_numerical_gradient(inner_cost, thetas)
unrolled_num_grad = unroll_thetas(num_grad)
print(unrolled_num_grad)
return diff = np.linalg.norm(unrolled_num_grad - unrolled_gradients) / np.linalg.norm(unrolled_num_grad + unrolled_gradients)

Why do the parameters (beta) blow up in LMS with Kernel Trick?

I am attempting to solve an LMS problem using kernel trick. The reference is taken from this note.
My problem is that the value of beta blows up. It reaches to the order of 10e17 on just about 5 or 6 iterations. One thing I noticed is that the signs entries in beta oscillate (positive to negative and vice versa) on consecutive update of beta in the loop.
Also, lowering the value of alpha (learning rate) does not help. I tried alpha = 0.01 but the value of beta still blows up.
Here's the code
import numpy as np
import pandas as pd
file = pd.read_csv("weatherHistory.csv")
#selecting only the required columns
useful_index = [3, 4, 5, 6, 7, 8, 10]
file = file.iloc[:, useful_index]
# getting training set and test set
file_randomized = file.sample(frac=1, random_state=1)
# useable data size
usable_dataset_size = 1200
# training set size
training_set_index = int(0.75 * usable_dataset_size)
# get rid of unnecessary data
file = file_randomized[:usable_dataset_size]
# make training and test set
training_set = file[:training_set_index]
test_set = file[training_set_index:]
# Select the columns
input_index = [0, 2, 3, 4, 5, 6]
X = training_set.iloc[:, input_index]
output_index = [1]
Y = training_set.iloc[:, output_index]
# Convert to numpy into suitable format
X = X.to_numpy()
n = X.shape[0]
d = X.shape[1]
Y = Y.to_numpy()
# This function calculates K(phi(x), phi(y))
def kernel_matrix(K, degree=1):
"""K is the matrix of dot product. This applies kernel function to the matrix"""
result = np.zeros(K.shape)
for i in range(0, degree+1):
result += np.power(K, i)
return result
# Main training function
def lms_with_kt(X, Y, alpha, degree = 1, num_iters = 1000):
"""X: nxd vector, Y: nx1 vector, beta: nx1 zero vector, alpha: number, degree: number"""
# normalize x
x_min = X.min(axis = 0, keepdims=True)
x_max = X.max(axis = 0, keepdims=True)
X = (X - x_min) / (x_max - x_min)
n = X.shape[0]
# add the column of 1 in the front
X = np.hstack((np.ones((n, 1)), X))
# make K_matrix (kernel matrix)
K = np.matmul(X, X.T)
K = kernel_matrix(K, degree)
# initialize beta
beta = np.zeros((n, 1))
# update beta
for i in range(num_iters):
beta += alpha * (Y - np.matmul(K, beta))
print(beta)
def predict(x):
"""x: 1xd matrix"""
x_norm = (x - x_min) / (x_max - x_min)
n_predict = x_norm.shape[0]
x_norm = np.hstack((np.ones((n_predict, 1)), x_norm))
K_for_prediction = np.matmul(X, x_norm.T)
K_for_prediction = kernel_matrix(K_for_prediction, degree)
return np.dot(beta.T, K_for_prediction)
return predict
predictor = lms_with_kt(X, Y, 0.1, 2, 1000)
The link to the dataset is here.

Applying Gaussian filter to 1D data "by hands" using Numpy

I have a nonuniformly sampled data that I am trying to apply a Gaussian filter to. I am using python's numpy library to solve this. The data is of XY type, here is how it looks like:
[[ -0.96 390.63523024]
[ -1.085 390.68523024]
[ -1.21 390.44023023]
...
[-76.695 390.86023024]
[-77.105 392.51023024]
[-77.155 392.10023024]]
And here is a link to the whole *.npz file.
Here is my approach:
I start with defining a Gaussian function
Then I start scanning the data with a while loop along the X axis
Within each step of the loop:
I select a portion of data that is within two cutoff lengths
shift the X axis of the selected data portion to make it symmetrical around 0
calculate my Gaussian function at every point, multiply with corresponding Y values, sum and divide by number of elements
Move to next point
Here is how code looks like:
import numpy as np
import matplotlib.pyplot as plt
xy = np.load('1D_data.npz')['arr_0']
def g_func(xx, w=1.0):
a = 0.47 * w
return (1 / a) * np.exp((xx / a) ** 2 * (-np.pi))
x, y, x_, y_ = xy[:, 0], xy[:, 1], [], []
counter, xi, ww = 0, x[0], 1.0
while xi > np.amin(x):
curr_x = x[(x < xi) & (x >= xi - 2 * ww)]
g, ysel = [], []
for i, els in enumerate(curr_x):
xil = els - curr_x[0] + abs(curr_x[0] - curr_x[-1]) / 2
g.append(g_func(xil, ww))
ysel.append(y[counter + i])
y_.append(np.sum(np.multiply(g, ysel)) / len(g))
x_.append(xi)
counter += 1
xi = x[counter]
plt.plot(x, y, '-k')
plt.plot(x_, y_, '-r')
plt.show()
The output doesn't look right though. (See the fig below) Even if discarding the edges, the convolution is very noisy and the values do not seem to correspond to the data. What am I possibly doing wrong?
You made one mistake in your code:
Before multiplying g with y_sel, y_sel is not centered.
The reason why y_sel should be centered is because we want to add the relative differences weighted by the Gaussian to the entry at the center. If you multiply g with y_sel directly, not just the values of the neighboring entries within the window, but also the value of the center entry will be weighted by the Gaussian. This will definitely change the function values dramatically.
Below is my solution using numpy
def g_func(xx, w=1.0):
mean = np.mean(xx)
a = 0.47 * w
return (1 / a) * np.exp(((xx-mean) / a) ** 2 * (-np.pi))
def get_convolution(array,half_window_size):
array = np.concatenate((np.repeat(array[0],half_window_size),
array,
np.repeat(array[-1],half_window_size)))
window_inds = [list(range(ind-half_window_size,ind+half_window_size+1)) \
for ind in range(half_window_size,len(array)-half_window_size)]
return np.take(array,window_inds)
xy = np.load('1D_data.npz')['arr_0']
x, y = xy[:, 0], xy[:, 1]
half_window_size = 4
x_conv = np.apply_along_axis(g_func,axis=1,arr=get_convolution(x,half_window_size=half_window_size))
y_conv = get_convolution(y,half_window_size=half_window_size)
y_mean = np.mean(y_conv,axis=1)
y_centered = y_conv - y_mean[:,None]
smoothed = np.sum(x_conv*y_centered,axis=1) / (half_window_size*2) + y_mean
fig,ax = plt.subplots(figsize=(10,6))
ax.plot(x, y, '-k')
ax.plot(x, smoothed, '-r')
running the code, the output is
UPDATE
In order to unify w with half_window_size, here is one possibility, the idea is to let the standard deviation of the Gaussian to be 2*half_window_size
def g_func(xx):
std = len(xx)
mean = np.mean(xx)
return 1 / (std*np.sqrt(2*np.pi)) * np.exp(-1/2*((xx-mean)/std)**2)
def get_convolution(array,half_window_size):
array = np.concatenate((np.repeat(array[0],half_window_size),
array,
np.repeat(array[-1],half_window_size)))
window_inds = [list(range(ind-half_window_size,ind+half_window_size+1)) \
for ind in range(half_window_size,len(array)-half_window_size)]
return np.take(array,window_inds)
xy = np.load('1D_data.npz')['arr_0']
x, y = xy[:, 0], xy[:, 1]
half_window_size = 4
x_conv = np.apply_along_axis(g_func,axis=1,arr=get_convolution(x,half_window_size=half_window_size))
y_conv = get_convolution(y,half_window_size=half_window_size)
y_mean = np.mean(y_conv,axis=1)
y_centered = y_conv - y_mean[:,None]
smoothed = np.sum(x_conv*y_centered,axis=1) / (half_window_size*2) + y_mean
fig,ax = plt.subplots(figsize=(10,6))
ax.plot(x, y, '-k')
ax.plot(x, smoothed, '-r')

Changing value of matrix and assigning value

The code below based on http://www.johnwittenauer.net/machine-learning-exercises-in-python-part-1/ works
theta = np.matrix(np.array([0, 0]))
def computeCost(X, y, theta, iterations, alpha):
temp = np.matrix(np.zeros(theta.shape))
m = len(X)
theta_trans = theta.T
for j in range(iterations):
hyp = np.dot(X, theta_trans)-y
term = np.multiply(hyp, X[:,0])
temp[0,0] = theta[0,0] - ((alpha / len(X)) * np.sum(term))
term = np.multiply(hyp, X[:,1])
temp[0,1] = theta[0,1] - ((alpha / len(X)) * np.sum(term))
theta = temp
theta_trans = theta.T
return theta
However, when I use theta directly instead of temp e.g. theta[0,0] = theta[0,0] - ((alpha / len(X)) * np.sum(term))) and comment out the theta = temp I always get 0 and 0 for theta.
When I do a similar operation outside the function theta is changed. For instance,
theta = np.matrix(np.array([0,0]))
theta[0,0] = theta[0,0] - 1
print(theta)
theta shows as [-1 , 0].
Why is this type of assignment not working inside the function?
A probable explanation: it is a problem of types (int vs float).
The assignment
theta = np.matrix(np.array([0, 0]))
creates a matrix of integers. There is some implicit conversion to integers when you assign directly its coefficients:
>>> m = np.matrix(np.array([0, 0]))
>>> m
matrix([[0, 0]])
>>> m[0,0] = 0.5 # float
>>> m
matrix([[0, 0]]) # no effect, 0.5 converted to 0
>>> m[0,0] = 1 # int
>>> m
matrix([[1, 0]])
In contrast, the temp variable is a matrix of floats (because np.zeros creates an array of floats when dtype is not specified), so the assignment of floats works as expected.
So just declare theta directly as a matrix of floats and you should be fine.

Optimizing Python distance calculation while accounting for periodic boundary conditions

I have written a Python script to calculate the distance between two points in 3D space while accounting for periodic boundary conditions. The problem is that I need to do this calculation for many, many points and the calculation is quite slow. Here is my function.
def PBCdist(coord1,coord2,UC):
dx = coord1[0] - coord2[0]
if (abs(dx) > UC[0]*0.5):
dx = UC[0] - dx
dy = coord1[1] - coord2[1]
if (abs(dy) > UC[1]*0.5):
dy = UC[1] - dy
dz = coord1[2] - coord2[2]
if (abs(dz) > UC[2]*0.5):
dz = UC[2] - dz
dist = np.sqrt(dx**2 + dy**2 + dz**2)
return dist
I then call the function as so
for i, coord2 in enumerate(coordlist):
if (PBCdist(coord1,coord2,UC) < radius):
do something with i
Recently I read that I can greatly increase performance by using list comprehension. The following works for the non-PBC case, but not for the PBC case
coord_indices = [i for i, y in enumerate([np.sqrt(np.sum((coord2-coord1)**2)) for coord2 in coordlist]) if y < radius]
for i in coord_indices:
do something
Is there some way to do the equivalent of this for the PBC case? Is there an alternative that would work better?
You should write your distance() function in a way that you can vectorise the loop over the 5711 points. The following implementation accepts an array of points as either the x0 or x1 parameter:
def distance(x0, x1, dimensions):
delta = numpy.abs(x0 - x1)
delta = numpy.where(delta > 0.5 * dimensions, delta - dimensions, delta)
return numpy.sqrt((delta ** 2).sum(axis=-1))
Example:
>>> dimensions = numpy.array([3.0, 4.0, 5.0])
>>> points = numpy.array([[2.7, 1.5, 4.3], [1.2, 0.3, 4.2]])
>>> distance(points, [1.5, 2.0, 2.5], dimensions)
array([ 2.22036033, 2.42280829])
The result is the array of distances between the points passed as second parameter to distance() and each point in points.
import numpy as np
bounds = np.array([10, 10, 10])
a = np.array([[0, 3, 9], [1, 1, 1]])
b = np.array([[2, 9, 1], [5, 6, 7]])
min_dists = np.min(np.dstack(((a - b) % bounds, (b - a) % bounds)), axis = 2)
dists = np.sqrt(np.sum(min_dists ** 2, axis = 1))
Here a and b are lists of vectors you wish to calculate the distance between and bounds are the boundaries of the space (so here all three dimensions go from 0 to 10 and then wrap). It calculates the distances between a[0] and b[0], a[1] and b[1], and so on.
I'm sure numpy experts could do better, but this will probably be an order of magnitude faster than what you're doing, since most of the work is now done in C.
I have found that meshgrid is very useful for generating distances. For example:
import numpy as np
row_diff, col_diff = np.meshgrid(range(7), range(8))
radius_squared = (row_diff - x_coord)**2 + (col_diff - y_coord)**2
I now have an array (radius_squared) where every entry specifies the square of the distance from the array position [x_coord, y_coord].
To circularize the array, I can do the following:
row_diff, col_diff = np.meshgrid(range(7), range(8))
row_diff = np.abs(row_diff - x_coord)
row_circ_idx = np.where(row_diff > row_diff.shape[1] / 2)
row_diff[row_circ_idx] = (row_diff[row_circ_idx] -
2 * (row_circ_idx + x_coord) +
row_diff.shape[1])
row_diff = np.abs(row_diff)
col_diff = np.abs(col_diff - y_coord)
col_circ_idx = np.where(col_diff > col_diff.shape[0] / 2)
col_diff[row_circ_idx] = (row_diff[col_circ_idx] -
2 * (col_circ_idx + y_coord) +
col_diff.shape[0])
col_diff = np.abs(row_diff)
circular_radius_squared = (row_diff - x_coord)**2 + (col_diff - y_coord)**2
I now have all the array distances circularized with vector math.

Categories