I am attempting to solve an LMS problem using kernel trick. The reference is taken from this note.
My problem is that the value of beta blows up. It reaches to the order of 10e17 on just about 5 or 6 iterations. One thing I noticed is that the signs entries in beta oscillate (positive to negative and vice versa) on consecutive update of beta in the loop.
Also, lowering the value of alpha (learning rate) does not help. I tried alpha = 0.01 but the value of beta still blows up.
Here's the code
import numpy as np
import pandas as pd
file = pd.read_csv("weatherHistory.csv")
#selecting only the required columns
useful_index = [3, 4, 5, 6, 7, 8, 10]
file = file.iloc[:, useful_index]
# getting training set and test set
file_randomized = file.sample(frac=1, random_state=1)
# useable data size
usable_dataset_size = 1200
# training set size
training_set_index = int(0.75 * usable_dataset_size)
# get rid of unnecessary data
file = file_randomized[:usable_dataset_size]
# make training and test set
training_set = file[:training_set_index]
test_set = file[training_set_index:]
# Select the columns
input_index = [0, 2, 3, 4, 5, 6]
X = training_set.iloc[:, input_index]
output_index = [1]
Y = training_set.iloc[:, output_index]
# Convert to numpy into suitable format
X = X.to_numpy()
n = X.shape[0]
d = X.shape[1]
Y = Y.to_numpy()
# This function calculates K(phi(x), phi(y))
def kernel_matrix(K, degree=1):
"""K is the matrix of dot product. This applies kernel function to the matrix"""
result = np.zeros(K.shape)
for i in range(0, degree+1):
result += np.power(K, i)
return result
# Main training function
def lms_with_kt(X, Y, alpha, degree = 1, num_iters = 1000):
"""X: nxd vector, Y: nx1 vector, beta: nx1 zero vector, alpha: number, degree: number"""
# normalize x
x_min = X.min(axis = 0, keepdims=True)
x_max = X.max(axis = 0, keepdims=True)
X = (X - x_min) / (x_max - x_min)
n = X.shape[0]
# add the column of 1 in the front
X = np.hstack((np.ones((n, 1)), X))
# make K_matrix (kernel matrix)
K = np.matmul(X, X.T)
K = kernel_matrix(K, degree)
# initialize beta
beta = np.zeros((n, 1))
# update beta
for i in range(num_iters):
beta += alpha * (Y - np.matmul(K, beta))
print(beta)
def predict(x):
"""x: 1xd matrix"""
x_norm = (x - x_min) / (x_max - x_min)
n_predict = x_norm.shape[0]
x_norm = np.hstack((np.ones((n_predict, 1)), x_norm))
K_for_prediction = np.matmul(X, x_norm.T)
K_for_prediction = kernel_matrix(K_for_prediction, degree)
return np.dot(beta.T, K_for_prediction)
return predict
predictor = lms_with_kt(X, Y, 0.1, 2, 1000)
The link to the dataset is here.
Related
I'm following the Andrew-Ng course on Machine Learning and I'm currently doing the week 5 exercise.
I've found myself stuck on the implementation of the backpropagation algorithm, due to the fact that the relative difference, compared to numerical gradient, is very high (order of 1e-1), but I can't find any error within my implementation, so I'm gently asking if someone could take a look at it and explain what I did wrong.
Forward propagation:
def forward_propagation(thetas, X, history=False):
activation_arr = []
a = X # X is the array of the first activation values
for k in range(0, len(thetas)):
a = add_intercept(a) # add the bias unit
a = sigmoid(a # thetas[k].T)
if history:
activation_arr.append(a)
return activation_arr if history else a
Backpropagation:
def gradient_nn(thetas, X, y, num_labels, reg_lambda=None):
n_examples = X.shape[0]
Y = np.zeros(( # creates a n_examples X num_labels matrix
n_examples, # n of examples
num_labels
))
for i in range(n_examples):
Y[i, y[i, 0]] = 1 # the index corresponding to the correct label for each row has value = 1
# add intercepted X to the activation array
activation_arr = [add_intercept(X)] + forward_propagation(thetas, X, history=True)
sigma = [activation_arr[-1] - Y] # sigma^L = a^L - y
delta = [sigma[-1].T # activation_arr[-2]] # find delta for the first row
thetas_grad = []
# Calculate sigma and delta
for idx in range(1, len(thetas)): # skip last iteration
sigma = [
(sigma[0] # thetas[-idx][:, 1:]) * partial_derivative(activation_arr[-1-idx])
] + sigma
delta = [
sigma[0].T # activation_arr[-2-idx]
] + delta
return [np.sum(d) / n_examples for d in thetas_grad]
Partial derivative:
def partial_derivative(a):
return a * (1 - a) # element wise multiplication
Numerical gradient:
def compute_numerical_gradient(cost_function, thetas):
# Unroll parameters
nn_params = unroll_thetas(thetas)
num_grad = np.zeros(nn_params.shape)
perturb = np.zeros(nn_params.shape)
shapes = [theta.shape for theta in thetas]
epsilon = 1e-4 # not the one of random initialization
for p in range(nn_params.shape[0]):
# Set perturbation vector
perturb[p] = epsilon
minus_theta = nn_params - perturb
plus_theta = nn_params + perturb
# --- Roll params back in order to use the cost function ---
minus_theta = roll_thetas(minus_theta, shapes)
plus_theta = roll_thetas(plus_theta, shapes)
# calculate the loss of the cost function
minus_loss = cost_function(minus_theta)
plus_loss = cost_function(plus_theta)
# Compute Numerical Gradient
num_grad[p] = (plus_loss - minus_loss) / (2 * epsilon)
perturb[p] = 0
num_grad = roll_thetas(num_grad, shapes)
return [np.sum(num_g) for num_g in num_grad]
Cost function:
def J_nn(num_labels, reg_lambda=None):
def non_reg_func(thetas, X, y):
n_examples = X.shape[0]
Y = np.zeros(( # creates a n_examples X num_labels matrix
n_examples, # n of examples
num_labels
))
for i in range(n_examples):
Y[i, y[i, 0]] = 1 # the index corresponding to the correct label for each row has value = 1
prediction = forward_propagation(thetas, X)
return np.sum(np.sum(-Y * np.log(prediction) - (1 - Y) * np.log(1 - prediction))) / n_examples
if reg_lambda is None:
func = non_reg_func
else: # regularization
def func(thetas, X, y):
cost = non_reg_func(thetas, X, y)
for theta in thetas: # regularize for every layer
theta = theta[1:] # remove bias unit
cost = cost + (reg_lambda / (2 * y.shape[0])) * np.sum(np.sum(theta[:, ] ** 2))
return cost
return func
Checking backpropagation with numerical gradient:
def check_nn_gradients(reg_lambda=None):
"""
Creates a small neural network (max 8 x 8 x 7 x 8) and checks that
the implementation of the backpropagation algorithm is good
"""
#n_examples, sizes = random.randint(5, 10), [random.randint(2, 8), random.randint(2, 8), random.randint(1, 8)]
n_examples, sizes = 5, [8, 8, 5, 4]
n_labels = sizes[-1] # Last size is equal to the number of labels
init_epsilon = 0.0001
thetas = random_init_thetas(sizes, init_epsilon)
X = np.array(
random_init_thetas([sizes[0]-1, n_examples], init_epsilon)
).squeeze() # We squeeze it because random_init_thetas returns a 3D array, but we want X to be 2D
y = np.array([random.randint(0, n_labels-1) for _ in X])
y = y[:, np.newaxis]
inner_cost = lambda _thetas: J_nn(n_labels, reg_lambda)(_thetas, X, y)
gradients = gradient_nn(thetas, X, y, n_labels, 0)
unrolled_gradients = unroll_thetas(gradients)
print(unrolled_gradients)
# finite difference method
grad_checking_epsilon = 1e-4
num_grad = compute_numerical_gradient(inner_cost, thetas)
unrolled_num_grad = unroll_thetas(num_grad)
print(unrolled_num_grad)
return diff = np.linalg.norm(unrolled_num_grad - unrolled_gradients) / np.linalg.norm(unrolled_num_grad + unrolled_gradients)
def calc_log(values):
temp1 = np.exp(-values)
temp2 = 1+temp1
return 1/temp2
def logisticregression(x, y, a):
#a is the learning rate
m = len(y)
x = np.matrix(x)
x = np.c_[np.ones((x.shape[0], 1)), x]
# adds a row of ones at the start of the x matrix which represents x0 (which is multiplied by theta[0])
theta = np.matrix(np.zeros(x.shape[1])).T
#makes a list of 0s as starting theta values with the same number of features that the x matrix has
y = np.matrix(y)
while True:
calculation = x * theta #multiplies x and theta
hypo = calc_log(calculation) #f(z) = 1/(1+e^-z), plugs calculation into f(z)
difference = hypo - y.T #calculates the difference between the predicted y values and the real y values
vals = x.T * difference #multiplies the difference by each feature for every piece of training data
lasttheta = theta.copy()
theta -= (vals*(a/m)) #multiplies the matrix of theta values by a/m and negate it from the previous theta values
array = lasttheta - theta
array = [j for j in array if abs(float(j)) > 0.001] #checks if the difference between each theta value and the previous theta value is more than 0.001.
if not array:
break #Breaks from the loop if the difference between each theta value and its previous theta value is less than 0.001
return theta
X_vals = np.array([[3, 5, 6], [4, 2, 4], [8, 6, 2]])
Y_vals = np.array([1, 0, 1])
value = logisticregression(X_vals, Y_vals, 0.1)
the coefficients returned from the logisticregression function are incorrect however I'm not sure what the error is. Can anyone help?
Formula for gradient descent of logistic regression:
The problem seems to be confusing 'i' with 'j'. You can check it by setting m=1 and re-read the code.
import numpy as np
def calc_log(values):
temp1 = np.exp(-values)
temp2 = 1+temp1
return 1/temp2
def logisticregression(x, y, a):
#a is the learning rate
m = len(y)
# x = np.matrix(x) # x has been a 2d array
x = np.c_[np.ones((x.shape[0], 1)), x]
# adds a row of ones at the start of the x matrix which represents x0 (which is multiplied by theta[0])
theta = np.zeros(x.shape[1])
#makes a list of 0s as starting theta values with the same number of features that the x matrix has
# y = np.matrix(y) # y should be a vector here
while True:
calculation = theta # x.T #multiplies x and theta, or you can use np.dot(theta, x.T) for python < 3.5
hypo = calc_log(calculation) #f(z) = 1/(1+e^-z), plugs calculation into f(z)
difference = hypo - y.T #calculates the difference between the predicted y values and the real y values
vals = np.multiply(x, difference[:, np.newaxis]) #multiplies the difference by each feature for every piece of training data
lasttheta = theta.copy()
theta -= (vals.sum(0)*(a/m)) #multiplies the matrix of theta values by a/m and negate it from the previous theta values
array = lasttheta - theta
array = [j for j in array if abs(float(j)) > 0.001] #checks if the difference between each theta value and the previous theta value is more than 0.001.
loss = np.abs(hypo-y.T).sum()
print(f'loss: {loss}')
if not array:
break #Breaks from the loop if the difference between each theta value and its previous theta value is less than 0.001
return theta
X_vals = np.array([[3, 5, 6], [4, 2, 4], [8, 6, 2]])
Y_vals = np.array([1, 0, 1])
value = logisticregression(X_vals, Y_vals, 0.1)
I'm starting the ML journey and I'm having troubles with this coding exercise
here is my code
import numpy as np
import pandas as pd
import scipy.optimize as op
# Read the data and give it labels
data = pd.read_csv('ex2data2.txt', header=None, name['Test1', 'Test2', 'Accepted'])
# Separate the features to make it fit into the mapFeature function
X1 = data['Test1'].values.T
X2 = data['Test2'].values.T
# This function makes more features (degree)
def mapFeature(x1, x2):
degree = 6
out = np.ones((x1.shape[0], sum(range(degree + 2))))
curr_column = 1
for i in range(1, degree + 1):
for j in range(i+1):
out[:,curr_column] = np.power(x1, i-j) * np.power(x2, j)
curr_column += 1
return out
# Separate the data into training and target, also initialize theta
X = mapFeature(X1, X2)
y = np.matrix(data['Accepted'].values).T
m, n = X.shape
cols = X.shape[1]
theta = np.matrix(np.zeros(cols))
#Initialize the learningRate(sigma)
learningRate = 1
# Define the Sigmoid Function (Output between 0 and 1)
def sigmoid(z):
return 1 / (1 + np.exp(-z))
def cost(theta, X, y, learningRate):
# This is require to make the optimize function work
theta = theta.reshape(-1, 1)
error = sigmoid(X # theta)
first = np.multiply(-y, np.log(error))
second = np.multiply(1 - y, np.log(1 - error))
j = np.sum((first - second)) / m + (learningRate * np.sum(np.power(theta, 2)) / 2 * m)
return j
# Define the gradient of the cost function
def gradient(theta, X, y, learningRate):
# This is require to make the optimize function work
theta = theta.reshape(-1, 1)
error = sigmoid(X # theta)
grad = (X.T # (error - y)) / m + ((learningRate * theta) / m)
grad_no = (X.T # (error - y)) / m
grad[0] = grad_no[0]
return grad
Result = op.minimize(fun=cost, x0=theta, args=(X, y, learningRate), method='TNC', jac=gradient)
opt_theta = np.matrix(Result.x)
def predict(theta, X):
sigValue = sigmoid(X # theta.T)
p = sigValue >= 0.5
return p
p = predict(opt_theta, X)
print('Train Accuracy: {:f}'.format(np.mean(p == y) * 100))
So, when the learningRate = 1, the accuracy should be around 83,05% but I'm getting 80.5% and when the learningRate = 0, the accuracy should be 91.52% but I'm getting 87.28%
So the question is What am I doing wrong? Why my accuracy is below the problem default answer?
Hope someone can guide me in the right direction. Thanks!
P.D: Here is the dataset, maybe it can help
https://raw.githubusercontent.com/TheGirlWhiteWithBandages/Machine-Learning-Algorithms/master/Logistic%20Regression/ex2data2.txt
Hey guys I found a way to make it even better!
Here is the code
import numpy as np
import pandas as pd
import scipy.optimize as op
from sklearn.preprocessing import PolynomialFeatures
# Read the data and give it labels
data = pd.read_csv('ex2data2.txt', header=None, names=['Test1', 'Test2', 'Accepted'])
# Separate the data into training and target
X = (data.iloc[:, 0:2]).values
y = (data.iloc[:, 2:3]).values
# Modify the features to a certain degree (Polynomial)
poly = PolynomialFeatures(6)
m = y.size
XX = poly.fit_transform(data.iloc[:, 0:2].values)
# Initialize Theta
theta = np.zeros(XX.shape[1])
# Define the Sigmoid Function (Output between 0 and 1)
def sigmoid(z):
return(1 / (1 + np.exp(-z)))
# Define the Regularized cost function
def costFunctionReg(theta, reg, *args):
# This is require to make the optimize function work
h = sigmoid(XX # theta)
first = np.log(h).T # - y
second = np.log(1 - h).T # (1 - y)
J = (1 / m) * (first - second) + (reg / (2 * m)) * np.sum(np.square(theta[1:]))
return J
# Define the Regularized gradient function
def gradientReg(theta, reg, *args):
theta = theta.reshape(-1, 1)
h = sigmoid(XX # theta)
grad = (1 / m) * (XX.T # (h - y)) + (reg / m) * np.r_[[[0]], theta[1:]]
return grad.flatten()
# Define the predict Function
def predict(theta, X):
sigValue = sigmoid(X # theta.T)
p = sigValue >= 0.5
return p
# A loop to test between different values for sigma (reg parameter)
for i, Sigma in enumerate([0, 1, 100]):
# Optimize costFunctionReg
res2 = op.minimize(costFunctionReg, theta, args=(Sigma, XX, y), method=None, jac=gradientReg)
# Get the accuracy of the model
accuracy = 100 * sum(predict(res2.x, XX) == y.ravel()) / y.size
# Get the Error between different weights
error1 = costFunctionReg(res2.x, Sigma, XX, y)
# print the accuracy and error
print('Train accuracy {}% with Lambda = {}'.format(np.round(accuracy, decimals=4), Sigma))
print(error1)
Thanks for all your help!
try out this:
# import library
import pandas as pd
import numpy as np
dataset = pd.read_csv('ex2data2.csv',names = ['Test #1','Test #2','Accepted'])
# splitting to x and y variables for features and target variable
x = dataset.iloc[:,:-1].values
y = dataset.iloc[:,-1].values
print('x[0] ={}, y[0] ={}'.format(x[0],y[0]))
m, n = x.shape
print('#{} Number of training samples, #{} features per sample'.format(m,n))
# import library FeatureMapping
from sklearn.preprocessing import PolynomialFeatures
# We also add one column of ones to interpret theta 0 (x with power of 0 = 1) by
include_bias as True
pf = PolynomialFeatures(degree = 6, include_bias = True)
x_poly = pf.fit_transform(x)
pd.DataFrame(x_poly).head(5)
m,n = x_poly.shape
# define theta as zero
theta = np.zeros(n)
# define hyperparameter λ
lambda_ = 1
# reshape (-1,1) because we just have one feature in y column
y = y.reshape(-1,1)
def sigmoid(z):
return 1/(1+np.exp(-z))
def lr_hypothesis(x,theta):
return np.dot(x,theta)
def compute_cost(theta,x,y,lambda_):
theta = theta.reshape(n,1)
infunc1 = -y*(np.log(sigmoid(lr_hypothesis(x,theta)))) - ((1-y)*(np.log(1 - sigmoid(lr_hypothesis(x,theta)))))
infunc2 = (lambda_*np.sum(theta[1:]**2))/(2*m)
j = np.sum(infunc1)/m+ infunc2
return j
# gradient[0] correspond to gradient for theta(0)
# gradient[1:] correspond to gradient for theta(j) j>0
def compute_gradient(theta,x,y,lambda_):
gradient = np.zeros(n).reshape(n,)
theta = theta.reshape(n,1)
infunc1 = sigmoid(lr_hypothesis(x,theta))-y
gradient_in = np.dot(x.transpose(),infunc1)/m
gradient[0] = gradient_in[0,0] # theta(0)
gradient[1:] = gradient_in[1:,0]+(lambda_*theta[1:,]/m).reshape(n-1,) # theta(j) ; j>0
gradient = gradient.flatten()
return gradient
You can now test your cost and gradient without optimization. Th below code will optimize the model:
# hyperparameters
m,n = x_poly.shape
# define theta as zero
theta = np.zeros(n)
# define hyperparameter λ
lambda_array = [0, 1, 10, 100]
import scipy.optimize as opt
for i in range(0,len(lambda_array)):
# Train
print('======================================== Iteration {} ===================================='.format(i))
optimized = opt.minimize(fun = compute_cost, x0 = theta, args = (x_poly, y,lambda_array[i]),
method = 'TNC', jac = compute_gradient)
new_theta = optimized.x
# Prediction
y_pred_train = predictor(x_poly,new_theta)
cm_train = confusion_matrix(y,y_pred_train)
t_train,f_train,acc_train = acc(cm_train)
print('With lambda = {}, {} correct, {} wrong ==========> accuracy = {}%'
.format(lambda_array[i],t_train,f_train,acc_train*100))
Now you should see output like this :
=== Iteration 0 === With lambda = 0, 104 correct, 14 wrong ==========> accuracy = 88.13559322033898%
=== Iteration 1 === With lambda = 1, 98 correct, 20 wrong ==========> accuracy = 83.05084745762711%
=== Iteration 2 === With lambda = 10, 88 correct, 30 wrong ==========> accuracy = 74.57627118644068%
=== Iteration 3 === With lambda = 100, 72 correct, 46 wrong ==========> accuracy = 61.016949152542374%
I was trying to match the orthogonal polynomials in the following code in R:
X <- cbind(1, poly(x = x, degree = 9))
but in python.
To do this I implemented my own method for giving orthogonal polynomials:
def get_hermite_poly(x,degree):
#scipy.special.hermite()
N, = x.shape
##
X = np.zeros( (N,degree+1) )
for n in range(N):
for deg in range(degree+1):
X[n,deg] = hermite( n=deg, z=float(x[deg]) )
return X
though it does not seem to match it. Does someone know type of orthogonal polynomial it uses? I tried search in the documentation but didn't say.
To give some context I am trying to implement the following R code in python (https://stats.stackexchange.com/questions/313265/issue-with-convergence-with-sgd-with-function-approximation-using-polynomial-lin/315185#comment602020_315185):
set.seed(1234)
N <- 10
x <- seq(from = 0, to = 1, length = N)
mu <- sin(2 * pi * x * 4)
y <- mu
plot(x,y)
X <- cbind(1, poly(x = x, degree = 9))
# X <- sapply(0:9, function(i) x^i)
w <- rnorm(10)
learning_rate <- function(t) .1 / t^(.6)
n_samp <- 2
for(t in 1:100000) {
mu_hat <- X %*% w
idx <- sample(1:N, n_samp)
X_batch <- X[idx,]
y_batch <- y[idx]
score_vec <- t(X_batch) %*% (y_batch - X_batch %*% w)
change <- score_vec * learning_rate(t)
w <- w + change
}
plot(mu_hat, ylim = c(-1, 1))
lines(mu)
fit_exact <- predict(lm(y ~ X - 1))
lines(fit_exact, col = 'red')
abs(w - coef(lm(y ~ X - 1)))
because it seems to be the only one that works with gradient descent with linear regression with polynomial features.
I feel that any orthogonal polynomial (or at least orthonormal) should work and give a hessian with condition number 1 but I can't seem to make it work in python. Related question: How does one use Hermite polynomials with Stochastic Gradient Descent (SGD)?
poly uses QR factorization, as described in some detail in this answer.
I think that what you really seem to be looking for is how to replicate the output of R's poly using python.
Here I have written a function to do that based on R's implementation. I have also added some comments so that you can see the what the equivalent statements in R look like:
import numpy as np
def poly(x, degree):
xbar = np.mean(x)
x = x - xbar
# R: outer(x, 0L:degree, "^")
X = x[:, None] ** np.arange(0, degree+1)
#R: qr(X)$qr
q, r = np.linalg.qr(X)
#R: r * (row(r) == col(r))
z = np.diag((np.diagonal(r)))
# R: Z = qr.qy(QR, z)
Zq, Zr = np.linalg.qr(q)
Z = np.matmul(Zq, z)
# R: colSums(Z^2)
norm1 = (Z**2).sum(0)
#R: (colSums(x * Z^2)/norm2 + xbar)[1L:degree]
alpha = ((x[:, None] * (Z**2)).sum(0) / norm1 +xbar)[0:degree]
# R: c(1, norm2)
norm2 = np.append(1, norm1)
# R: Z/rep(sqrt(norm1), each = length(x))
Z = Z / np.reshape(np.repeat(norm1**(1/2.0), repeats = x.size), (-1, x.size), order='F')
#R: Z[, -1]
Z = np.delete(Z, 0, axis=1)
return [Z, alpha, norm2];
Checking that this works:
x = np.arange(10) + 1
degree = 9
poly(x, degree)
The first row of the returned matrix is
[-0.49543369, 0.52223297, -0.45342519, 0.33658092, -0.21483446,
0.11677484, -0.05269379, 0.01869894, -0.00453516],
compared to the same operation in R
poly(1:10, 9)
# [1] -0.495433694 0.522232968 -0.453425193 0.336580916 -0.214834462
# [6] 0.116774842 -0.052693786 0.018698940 -0.004535159
I'm trying to use the code that I found to implement the LeCun Local Contrast Normalization but I get incorrect result. The code is in Python and uses the theano library.
def lecun_lcn(input, img_shape, kernel_shape, threshold=1e-4):
"""
Yann LeCun's local contrast normalization
Orginal code in Theano by: Guillaume Desjardins
"""
input = input.reshape(input.shape[0], 1, img_shape[0], img_shape[1])
X = T.matrix(dtype=theano.config.floatX)
X = X.reshape(input.shape)
filter_shape = (1, 1, kernel_shape, kernel_shape)
filters = gaussian_filter(kernel_shape).reshape(filter_shape)
convout = conv.conv2d(input=X,
filters=filters,
image_shape=(input.shape[0], 1, img_shape[0], img_shape[1]),
filter_shape=filter_shape,
border_mode='full')
# For each pixel, remove mean of 9x9 neighborhood
mid = int(np.floor(kernel_shape / 2.))
centered_X = X - convout[:, :, mid:-mid, mid:-mid]
# Scale down norm of 9x9 patch if norm is bigger than 1
sum_sqr_XX = conv.conv2d(input=centered_X ** 2,
filters=filters,
image_shape=(input.shape[0], 1, img_shape[0], img_shape[1]),
filter_shape=filter_shape,
border_mode='full')
denom = T.sqrt(sum_sqr_XX[:, :, mid:-mid, mid:-mid])
per_img_mean = denom.mean(axis=[1, 2])
divisor = T.largest(per_img_mean.dimshuffle(0, 'x', 'x', 1), denom)
divisor = T.maximum(divisor, threshold)
new_X = centered_X / divisor
new_X = new_X.dimshuffle(0, 2, 3, 1)
new_X = new_X.flatten(ndim=3)
f = theano.function([X], new_X)
return f(input)
Here is the testing code:
x_img_origin = plt.imread("..//data//Lenna.png")
x_img = plt.imread("..//data//Lenna.png")
x_img_real_result = plt.imread("..//data//Lenna_Processed.png")
x_img = x_img.reshape(1, x_img.shape[0], x_img.shape[1], x_img.shape[2])
for d in range(3):
x_img[:, :, :, d] = tools.lecun_lcn(x_img[:, :, :, d], (x_img.shape[1], x_img.shape[2]), 9)
x_img = x_img[0]
pylab.subplot(1, 3, 1); pylab.axis('off'); pylab.imshow(x_img_origin)
pylab.gray()
pylab.subplot(1, 3, 2); pylab.axis('off'); pylab.imshow(x_img)
pylab.subplot(1, 3, 3); pylab.axis('off'); pylab.imshow(x_img_real_result)
pylab.show()
Here is the result:
(left to right: origin, my result, the expected result)
Could someone tell me what I did wrong with the code?
Here is how I implemented local contrast normalization as reported in Jarrett et al (http://yann.lecun.com/exdb/publis/pdf/jarrett-iccv-09.pdf). You can use it as a separate layer.
I tested it on the code from the LeNet tutorial of theano in which I applied LCN to the input and to each convolutional layer which yields slightly better results.
You can find the full code here:
https://github.com/jostosh/theano_utils/blob/master/lcn.py
class LecunLCN(object):
def __init__(self, X, image_shape, threshold=1e-4, radius=9, use_divisor=True):
"""
Allocate an LCN.
:type X: theano.tensor.dtensor4
:param X: symbolic image tensor, of shape image_shape
:type image_shape: tuple or list of length 4
:param image_shape: (batch size, num input feature maps,
image height, image width)
:type threshold: double
:param threshold: the threshold will be used to avoid division by zeros
:type radius: int
:param radius: determines size of Gaussian filter patch (default 9x9)
:type use_divisor: Boolean
:param use_divisor: whether or not to apply divisive normalization
"""
# Get Gaussian filter
filter_shape = (1, image_shape[1], radius, radius)
self.filters = theano.shared(self.gaussian_filter(filter_shape), borrow=True)
# Compute the Guassian weighted average by means of convolution
convout = conv.conv2d(
input=X,
filters=self.filters,
image_shape=image_shape,
filter_shape=filter_shape,
border_mode='full'
)
# Subtractive step
mid = int(numpy.floor(filter_shape[2] / 2.))
# Make filter dimension broadcastable and subtract
centered_X = X - T.addbroadcast(convout[:, :, mid:-mid, mid:-mid], 1)
# Boolean marks whether or not to perform divisive step
if use_divisor:
# Note that the local variances can be computed by using the centered_X
# tensor. If we convolve this with the mean filter, that should give us
# the variance at each point. We simply take the square root to get our
# denominator
# Compute variances
sum_sqr_XX = conv.conv2d(
input=T.sqr(centered_X),
filters=self.filters,
image_shape=image_shape,
filter_shape=filter_shape,
border_mode='full'
)
# Take square root to get local standard deviation
denom = T.sqrt(sum_sqr_XX[:,:,mid:-mid,mid:-mid])
per_img_mean = denom.mean(axis=[2,3])
divisor = T.largest(per_img_mean.dimshuffle(0, 1, 'x', 'x'), denom)
# Divisise step
new_X = centered_X / T.maximum(T.addbroadcast(divisor, 1), threshold)
else:
new_X = centered_X
self.output = new_X
def gaussian_filter(self, kernel_shape):
x = numpy.zeros(kernel_shape, dtype=theano.config.floatX)
def gauss(x, y, sigma=2.0):
Z = 2 * numpy.pi * sigma ** 2
return 1. / Z * numpy.exp(-(x ** 2 + y ** 2) / (2. * sigma ** 2))
mid = numpy.floor(kernel_shape[-1] / 2.)
for kernel_idx in xrange(0, kernel_shape[1]):
for i in xrange(0, kernel_shape[2]):
for j in xrange(0, kernel_shape[3]):
x[0, kernel_idx, i, j] = gauss(i - mid, j - mid)
return x / numpy.sum(x)
I think these two lines may have some mistakes on the matrix axes:
per_img_mean = denom.mean(axis=[1, 2])
divisor = T.largest(per_img_mean.dimshuffle(0, 'x', 'x', 1), denom)
and it should be rewritten as:
per_img_mean = denom.mean(axis=[2, 3])
divisor = T.largest(per_img_mean.dimshuffle(0, 1, 'x', 'x'), denom)