I am building a sketch of a neural network in Python 3.4 with numpy and matrices to learn a simple XOR.
My notation is as follows:
a is the activity of a neuron
z is the input of a neuron
W is a weight matrix of size (number of neurons in the previous layer) × (number of neurons in the next layer)
B is a vector of bias values
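With this notation, each layer's forward computation is z = Wᵀ·a_prev + B followed by a = σ(z), where σ is the sigmoid; this is exactly what the forward pass in the code below does.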
After implementing a very simple network in Python, everything works fine when training on only a single input vector. However, when training on all four XOR examples, the error function shows quite weird behaviour (see the plots below) and the output of the network is always roughly 0.5.
Changing the network size, the learning rate, or the number of training epochs does not seem to help.
[Plot: cost J while training on a single example]
[Plot: cost J while training on all four examples]
This is the code for the network:
import numpy as np
import time
import matplotlib.pyplot as plt

Js = []
start = time.time()
np.random.seed(2)

# Sigmoid
def activation(x, derivative=False):
    if derivative:
        a = activation(x)
        return a * (1 - a)
    else:
        return 1 / (1 + np.exp(-x))

def cost(output, target):
    return (1/2) * np.sum((target - output)**2)

INPUTS = np.array([
    [0, 1],
    [1, 0],
    [0, 0],
    [1, 1],
])

TARGET = np.array([
    [1],
    [1],
    [0],
    [0],
])

# Hyper-parameters
# Layer structure
LAYER = [2, 3, 1]
LEARNING_RATE = 0.1
ITERATIONS = int(1e3)

# Init weights
W1 = np.random.rand(LAYER[0], LAYER[1])
W2 = np.random.rand(LAYER[1], LAYER[2])
# Init biases
B1 = np.random.rand(LAYER[1], 1)
B2 = np.random.rand(LAYER[2], 1)

for i in range(0, ITERATIONS):
    exampleIndex = i % len(INPUTS)
    # exampleIndex = 2

    # Forward pass
    # Layer one activity (input layer)
    A0 = np.transpose(INPUTS[exampleIndex:exampleIndex+1])
    # Layer two activity (hidden layer)
    Z1 = np.dot(np.transpose(W1), A0) + B1
    A1 = activation(Z1)
    # Layer three activity (output layer)
    Z2 = np.dot(np.transpose(W2), A1) + B2
    A2 = activation(Z2)
    # Output
    O = A2

    # Cost J, target vector T
    T = np.transpose(TARGET[exampleIndex:exampleIndex+1])
    J = cost(O, T)
    Js.append(J)
    print("J = {}".format(J))
    print("I = {}, O = {}".format(A0, O))

    # Backward pass
    # Delta of output layer
    D2 = (O - T) * activation(Z2, True)
    # Delta of hidden layer
    D1 = np.dot(W2, D2) * activation(Z1, True)
    # Derivatives w.r.t. W2
    DerW2 = np.dot(A1, np.transpose(D2))
    # Derivatives w.r.t. W1
    DerW1 = np.dot(A0, np.transpose(D1))
    # Derivatives w.r.t. B2
    DerB2 = D2
    # Derivatives w.r.t. B1
    DerB1 = D1

    # Update weights and biases
    W1 -= LEARNING_RATE * DerW1
    B1 -= LEARNING_RATE * DerB1
    W2 -= LEARNING_RATE * DerW2
    B2 -= LEARNING_RATE * DerB2

# Show prediction
print("Time elapsed {}s".format(time.time() - start))
plt.plot(Js)
plt.ylabel("Cost J")
plt.xlabel("Iterations")
plt.show()
What could be the reason for this strange behaviour in my implementation?
I think your cost function is jumping because you perform a weight update after every single sample. Nevertheless, your network does learn the correct behavior:
479997
J = 4.7222501603409765e-05
I = [[1]
[0]], O = [[ 0.99028172]]
T = [[1]]
479998
J = 7.3205311398742e-05
I = [[0]
[0]], O = [[ 0.01210003]]
T = [[0]]
479999
J = 4.577485181547362e-05
I = [[1]
[1]], O = [[ 0.00956816]]
T = [[0]]
480000
J = 4.726257702199439e-05
I = [[0]
[1]], O = [[ 0.9902776]]
T = [[1]]
The cost function shows some interesting behavior: the training process reaches a point where the jumps in the cost function become quite small.
You can reproduce this with the code below (I have only made slight changes; note that I trained for many more epochs):
import numpy as np
import time
import matplotlib.pyplot as plt

Js = []
start = time.time()
np.random.seed(2)

# Sigmoid
def activation(x, derivative=False):
    if derivative:
        a = activation(x)
        return a * (1 - a)
    else:
        return 1 / (1 + np.exp(-x))

def cost(output, target):
    return (1/2) * np.sum((target - output)**2)

INPUTS = np.array([[0, 1], [1, 0], [0, 0], [1, 1]])
TARGET = np.array([[1], [1], [0], [0]])

# Hyper-parameters
# Layer structure
LAYER = [2, 3, 1]
LEARNING_RATE = 0.1
ITERATIONS = int(5e5)

# Init weights
W1 = np.random.rand(LAYER[0], LAYER[1])
W2 = np.random.rand(LAYER[1], LAYER[2])
# Init biases
B1 = np.random.rand(LAYER[1], 1)
B2 = np.random.rand(LAYER[2], 1)

for i in range(0, ITERATIONS):
    exampleIndex = i % len(INPUTS)
    # exampleIndex = 2

    # Forward pass
    # Layer one activity (input layer)
    A0 = np.transpose(INPUTS[exampleIndex:exampleIndex+1])
    # Layer two activity (hidden layer)
    Z1 = np.dot(np.transpose(W1), A0) + B1
    A1 = activation(Z1)
    # Layer three activity (output layer)
    Z2 = np.dot(np.transpose(W2), A1) + B2
    A2 = activation(Z2)
    # Output
    O = A2

    # Cost J, target vector T
    T = np.transpose(TARGET[exampleIndex:exampleIndex+1])
    J = cost(O, T)
    Js.append(J)

    # Print four consecutive iterations around every 20000th step,
    # so that all four training examples show up
    if i % 20000 in (19997, 19998, 19999, 0):
        print(i)
        print("J = {}".format(J))
        print("I = {}, O = {}".format(A0, O))
        print("T = {}".format(T))

    # Backward pass
    # Delta of output layer
    D2 = (O - T) * activation(Z2, True)
    # Delta of hidden layer
    D1 = np.dot(W2, D2) * activation(Z1, True)
    # Derivatives w.r.t. W2
    DerW2 = np.dot(A1, np.transpose(D2))
    # Derivatives w.r.t. W1
    DerW1 = np.dot(A0, np.transpose(D1))
    # Derivatives w.r.t. B2
    DerB2 = D2
    # Derivatives w.r.t. B1
    DerB1 = D1

    # Update weights and biases
    W1 -= LEARNING_RATE * DerW1
    B1 -= LEARNING_RATE * DerB1
    W2 -= LEARNING_RATE * DerW2
    B2 -= LEARNING_RATE * DerB2

# Show prediction
print("Time elapsed {}s".format(time.time() - start))
plt.plot(Js)
plt.ylabel("Cost J")
plt.xlabel("Iterations")
plt.savefig('cost.pdf')
plt.show()
To reduce the fluctuations in the cost function, one usually uses multiple data samples per update (an averaged, or "batch", update), although I admit this is of limited benefit in a set containing only four distinct training examples; see the sketch after the conclusion below.
So, to conclude this rather long answer: your cost function jumps because it is calculated for every single example and not for an average over multiple examples. However, the network output follows the distribution of the XOR function quite well, so you don't need to change it.
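For illustration, here is a minimal full-batch variant of your training loop. This is only a sketch reusing the names defined in the code above (activation, INPUTS, TARGET, W1/W2, B1/B2, LEARNING_RATE, ITERATIONS): it accumulates the per-example gradients and applies their mean once per pass over the data, which smooths the cost curve.

for epoch in range(ITERATIONS):
    # accumulate gradients over all four examples
    gW1, gB1 = np.zeros_like(W1), np.zeros_like(B1)
    gW2, gB2 = np.zeros_like(W2), np.zeros_like(B2)
    for exampleIndex in range(len(INPUTS)):
        # forward pass, identical to the per-sample version
        A0 = np.transpose(INPUTS[exampleIndex:exampleIndex+1])
        Z1 = np.dot(np.transpose(W1), A0) + B1
        A1 = activation(Z1)
        Z2 = np.dot(np.transpose(W2), A1) + B2
        A2 = activation(Z2)
        T = np.transpose(TARGET[exampleIndex:exampleIndex+1])
        # backward pass: sum the per-example derivatives
        D2 = (A2 - T) * activation(Z2, True)
        D1 = np.dot(W2, D2) * activation(Z1, True)
        gW2 += np.dot(A1, np.transpose(D2))
        gW1 += np.dot(A0, np.transpose(D1))
        gB2 += D2
        gB1 += D1
    # one averaged update per pass over the data
    W1 -= LEARNING_RATE * gW1 / len(INPUTS)
    B1 -= LEARNING_RATE * gB1 / len(INPUTS)
    W2 -= LEARNING_RATE * gW2 / len(INPUTS)
    B2 -= LEARNING_RATE * gB2 / len(INPUTS)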
Related
I'm following the Andrew Ng course on Machine Learning and I'm currently doing the week 5 exercise.
I've found myself stuck on the implementation of the backpropagation algorithm: the relative difference, compared to the numerical gradient, is very high (on the order of 1e-1), but I can't find any error in my implementation, so I would be grateful if someone could take a look at it and explain what I did wrong.
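(By relative difference I mean the usual gradient-checking metric ‖grad_num − grad_backprop‖ / ‖grad_num + grad_backprop‖, computed at the end of check_nn_gradients below.)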
Forward propagation:
def forward_propagation(thetas, X, history=False):
    activation_arr = []
    a = X  # X is the array of the first activation values
    for k in range(0, len(thetas)):
        a = add_intercept(a)  # add the bias unit
        a = sigmoid(a @ thetas[k].T)
        if history:
            activation_arr.append(a)
    return activation_arr if history else a
Backpropagation:
def gradient_nn(thetas, X, y, num_labels, reg_lambda=None):
    n_examples = X.shape[0]
    Y = np.zeros((  # creates an n_examples x num_labels matrix
        n_examples,  # n of examples
        num_labels
    ))
    for i in range(n_examples):
        Y[i, y[i, 0]] = 1  # the index corresponding to the correct label of each row has value 1

    # add intercepted X to the activation array
    activation_arr = [add_intercept(X)] + forward_propagation(thetas, X, history=True)

    sigma = [activation_arr[-1] - Y]  # sigma^L = a^L - y
    delta = [sigma[-1].T @ activation_arr[-2]]  # find delta for the first row
    thetas_grad = []

    # Calculate sigma and delta
    for idx in range(1, len(thetas)):  # skip last iteration
        sigma = [
            (sigma[0] @ thetas[-idx][:, 1:]) * partial_derivative(activation_arr[-1-idx])
        ] + sigma
        delta = [
            sigma[0].T @ activation_arr[-2-idx]
        ] + delta

    return [np.sum(d) / n_examples for d in thetas_grad]
Partial derivative:
def partial_derivative(a):
    return a * (1 - a)  # element-wise multiplication
Numerical gradient:
def compute_numerical_gradient(cost_function, thetas):
    # Unroll parameters
    nn_params = unroll_thetas(thetas)
    num_grad = np.zeros(nn_params.shape)
    perturb = np.zeros(nn_params.shape)
    shapes = [theta.shape for theta in thetas]
    epsilon = 1e-4  # not the epsilon of the random initialization

    for p in range(nn_params.shape[0]):
        # Set perturbation vector
        perturb[p] = epsilon
        minus_theta = nn_params - perturb
        plus_theta = nn_params + perturb

        # --- Roll params back in order to use the cost function ---
        minus_theta = roll_thetas(minus_theta, shapes)
        plus_theta = roll_thetas(plus_theta, shapes)

        # calculate the loss of the cost function
        minus_loss = cost_function(minus_theta)
        plus_loss = cost_function(plus_theta)

        # Compute the numerical gradient (central difference)
        num_grad[p] = (plus_loss - minus_loss) / (2 * epsilon)
        perturb[p] = 0

    num_grad = roll_thetas(num_grad, shapes)
    return [np.sum(num_g) for num_g in num_grad]
Cost function:
def J_nn(num_labels, reg_lambda=None):
    def non_reg_func(thetas, X, y):
        n_examples = X.shape[0]
        Y = np.zeros((  # creates an n_examples x num_labels matrix
            n_examples,  # n of examples
            num_labels
        ))
        for i in range(n_examples):
            Y[i, y[i, 0]] = 1  # the index corresponding to the correct label of each row has value 1

        prediction = forward_propagation(thetas, X)
        return np.sum(np.sum(-Y * np.log(prediction) - (1 - Y) * np.log(1 - prediction))) / n_examples

    if reg_lambda is None:
        func = non_reg_func
    else:  # regularization
        def func(thetas, X, y):
            cost = non_reg_func(thetas, X, y)
            for theta in thetas:  # regularize every layer
                theta = theta[1:]  # remove bias unit
                cost = cost + (reg_lambda / (2 * y.shape[0])) * np.sum(np.sum(theta[:, ] ** 2))
            return cost
    return func
Checking backpropagation with numerical gradient:
def check_nn_gradients(reg_lambda=None):
    """
    Creates a small neural network (max 8 x 8 x 7 x 8) and checks that
    the implementation of the backpropagation algorithm is good
    """
    # n_examples, sizes = random.randint(5, 10), [random.randint(2, 8), random.randint(2, 8), random.randint(1, 8)]
    n_examples, sizes = 5, [8, 8, 5, 4]
    n_labels = sizes[-1]  # the last size is equal to the number of labels
    init_epsilon = 0.0001

    thetas = random_init_thetas(sizes, init_epsilon)
    X = np.array(
        random_init_thetas([sizes[0]-1, n_examples], init_epsilon)
    ).squeeze()  # squeeze because random_init_thetas returns a 3D array, but we want X to be 2D
    y = np.array([random.randint(0, n_labels-1) for _ in X])
    y = y[:, np.newaxis]

    inner_cost = lambda _thetas: J_nn(n_labels, reg_lambda)(_thetas, X, y)

    gradients = gradient_nn(thetas, X, y, n_labels, 0)
    unrolled_gradients = unroll_thetas(gradients)
    print(unrolled_gradients)

    # finite difference method
    grad_checking_epsilon = 1e-4
    num_grad = compute_numerical_gradient(inner_cost, thetas)
    unrolled_num_grad = unroll_thetas(num_grad)
    print(unrolled_num_grad)

    diff = np.linalg.norm(unrolled_num_grad - unrolled_gradients) / np.linalg.norm(unrolled_num_grad + unrolled_gradients)
    return diff
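(For comparison: in the course material a diff below roughly 1e-9 is considered a sign that backpropagation is implemented correctly, whereas mine is on the order of 1e-1.)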
This post is about the same issue, but no proper answer was given there. And since this problem seems to be widespread, I'll keep my code behind the scenes.
Following this source, I've written a network which does well when I give it a single training example with a target vector. Using gradient descent I minimize the cost function so that the network provides the target vector when given the corresponding input vector. But this only works for one example!
The main goal of a neural network is to react differently to different inputs, and we should be able to train it to do so. I tried changing the network weights by an average of the delta-weights computed for each example, which failed: the training process gets stuck, with the output vector holding averages of all the target vectors from the training set. I have no ideas left, and I have found no sources that explain this.
How do I train a neural network with a set of examples, not with just one input vector?
Update
For those wondering, I'll attach my code below. Try running it and you will see that instead of outputting 0 1 it produces 0.5 0.5, which is the result of subtracting averaged delta-weights.
import numpy as np
from sympy import symbols, lambdify
from sympy.functions.elementary.exponential import exp
from time import sleep

x = symbols('x')

sigmoid = exp(x) / (1 + exp(x))
sigmoid_der = sigmoid.diff(x)
sigmoid = lambdify(x, sigmoid)
sigmoid_der = lambdify(x, sigmoid_der)

class Neuron:
    def __init__(self, amount_of_inputs: int, hidden = True):
        self.inputs = np.random.rand(amount_of_inputs) if hidden else np.array([1])
        self.bias = 0.0
        self._activation = 0.0
        self._wsum = 0.0

    @property
    def activation(self) -> float:
        return self._activation

    @property
    def wsum(self) -> float:
        return self._wsum

    def calculate(self, indata):
        wval = self.inputs * indata + self.bias
        self._wsum = wval.sum()
        self._activation = sigmoid(self._wsum)

class NeuralNetwork:
    def __init__(self, shape: tuple):
        self.shape = shape
        self.layers = len(self.shape)
        self.network = [None for _ in range(self.layers)]
        self.network[0] = tuple([Neuron(1, hidden = False) for _ in range(shape[0])])
        for L in range(1, self.layers):
            self.network[L] = tuple([Neuron(shape[L - 1]) for _ in range(shape[L])])
        self.network = tuple(self.network)

        y = [symbols(f'y[{i}]') for i in range(shape[self.layers - 1])]
        a = [symbols(f'a[{i}]') for i in range(shape[self.layers - 1])]
        self.cost_function = sum([(y[i] - a[i]) ** 2 / 2 for i in range(shape[self.layers - 1])])
        self.gradient = tuple([self.cost_function.diff(a[i]) for i in range(shape[self.layers - 1])])
        self.cost_function = lambdify((y, a), self.cost_function)
        self.gradient = lambdify((y, a), self.gradient)

    def getLayer(self, L):
        return np.array([self.network[L][i].activation for i in range(self.shape[L])])

    def getWeightedSum(self, L):
        return np.array([self.network[L][i].wsum for i in range(self.shape[L])])

    def getInputsMatrix(self, L):
        return np.array([self.network[L][i].inputs for i in range(self.shape[L])])

    def calculate(self, values):
        for i in range(self.shape[0]):
            self.network[0][i].calculate(values[i])
        for L in range(1, self.layers):
            indata = self.getLayer(L - 1)
            for j in range(self.shape[L]):
                self.network[L][j].calculate(indata)

    def get_result(self) -> tuple:
        return tuple([self.network[self.layers - 1][i].activation for i in range(self.shape[self.layers - 1])])

    def teach(self, targets, examples):
        if len(targets) != len(examples):
            raise TypeError("The amounts of target and input vectors do not coincide")

        activations = [None for _ in range(len(examples))]
        delta = activations.copy()

        cost_is_low_enough = False
        while not cost_is_low_enough:
            # compute per-example deltas via backpropagation
            for x in range(len(examples)):
                self.calculate(examples[x])
                activations[x] = [self.getLayer(l) for l in range(self.layers)]
                delta[x] = [None for _ in range(self.layers - 1)]
                network_output = self.getLayer(self.layers - 1)
                output_weighted = self.getWeightedSum(self.layers - 1)
                gradient_vector = np.array(self.gradient(targets[x], network_output))
                delta[x][-1] = gradient_vector * sigmoid_der(output_weighted)
                for l in range(self.layers - 2, 0, -1):
                    weight_matrix = self.getInputsMatrix(l + 1).transpose()
                    output_weighted = self.getWeightedSum(l)
                    activation = self.getLayer(l)
                    for j in range(self.shape[l]):
                        delta[x][l - 1] = (weight_matrix @ delta[x][l]) * sigmoid_der(output_weighted) * activation

            # average the delta-weights over all examples
            dw = [None for _ in range(self.layers - 1)]
            for x in range(len(examples)):
                self.calculate(examples[x])
            for l in range(self.layers - 1):
                dw[l] = np.empty(self.shape[l + 1])
                for j in range(self.shape[l + 1]):
                    dw[l][j] = np.mean([delta[x][l][j] for x in range(len(examples))])

            # apply the averaged update
            for l in range(1, self.layers):
                for j in range(self.shape[l]):
                    for k in range(self.shape[l - 1]):
                        self.network[l][j].inputs[k] -= 0.1 * dw[l - 1][j]

            # evaluate the mean cost over the training set
            cost = 0
            for x in range(len(examples)):
                self.calculate(examples[x])
                network_output = np.array(self.get_result())
                incost = self.cost_function(targets[x], network_output)
                print(network_output, incost)
                cost += incost
                # sleep(0.05)
            cost /= len(examples)
            print()
            if cost < 0.001: cost_is_low_enough = True

network = NeuralNetwork((2, 4, 1))

examples = np.array([
    [1, 2],
    [3, 4],
])
targets = np.array([
    [0],
    [1]
])

network.teach(targets, examples)

values_1 = np.array([5, 10])
network.calculate(values_1)
result = network.get_result()
print(result)
'''
values_2 = np.array([3, 4])
network.calculate(values_2)
result = network.get_result()
print(result)
'''
How can I visually display this gradient descent algorithm (e.g., as a graph)?
import numpy
import matplotlib.pyplot as plt

def sigmoid(sop):
    return 1.0 / (1 + numpy.exp(-1 * sop))

def error(predicted, target):
    return numpy.power(predicted - target, 2)

def error_predicted_deriv(predicted, target):
    return 2 * (predicted - target)

def activation_sop_deriv(sop):
    return sigmoid(sop) * (1.0 - sigmoid(sop))

def sop_w_deriv(x):
    return x

def update_w(w, grad, learning_rate):
    return w - learning_rate * grad

x = 0.1
target = 0.3
learning_rate = 0.01
w = numpy.random.rand()
print("Initial W : ", w)

iterations = 10000
for k in range(iterations):
    # Forward Pass
    y = w * x
    predicted = sigmoid(y)
    err = error(predicted, target)

    # Backward Pass
    g1 = error_predicted_deriv(predicted, target)
    g2 = activation_sop_deriv(predicted)
    g3 = sop_w_deriv(x)
    grad = g3 * g2 * g1
    # print(predicted)

    w = update_w(w, grad, learning_rate)
I tried making a very simple plot with matplotlib but couldn't get the line to actually display (the figure initialised properly, but the line didn't appear).
Here's what I did:
plt.plot(iterations, predicted)
plt.ylabel("Prediction")
plt.xlabel("Iteration Number")
plt.show()
I tried doing a search but none of the resources I found applied to this particular format of gradient descent.
Both iterations and predicted are scalar values in your code, which is why no line chart can be generated. You need to store their values in two arrays in order to plot them:
K = 10000
iterations = numpy.arange(K)
predicted = numpy.zeros(K)
for k in range(K):
    # Forward Pass
    y = w * x
    predicted[k] = sigmoid(y)
    err = error(predicted[k], target)

    # Backward Pass
    g1 = error_predicted_deriv(predicted[k], target)
    g2 = activation_sop_deriv(predicted[k])
    g3 = sop_w_deriv(x)
    grad = g3 * g2 * g1
    # print(predicted[k])

    w = update_w(w, grad, learning_rate)
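With both arrays filled, your original plotting calls then draw the curve as expected:

plt.plot(iterations, predicted)
plt.ylabel("Prediction")
plt.xlabel("Iteration Number")
plt.show()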
I want to define a custom loss function in Keras with the TensorFlow backend which uses only the predicted y values, regardless of the true ones. The graph compiles successfully, but at the start of training it raises the following exception: InvalidArgumentError (see above for traceback): Self-adjoint eigen decomposition was not successful. The input might not be valid. I have tried replacing my data with random dummy data, but it produces the same exception.
The full code of my loss definition can be found below. Why is the input to tf.self_adjoint_eig not valid?
def model_correlation_loss(representation_size, k_singular_values):
    global batch_size
    def keras_loss(y_true, y_pred):
        global batch_size
        regularization_constant_1 = regularization_constant_2 = 1e-4
        epsilon = 1e-12

        o1 = o2 = int(y_pred.shape[1] // 2)

        h_1 = y_pred[:, 0:o1]
        h_2 = y_pred[:, o1:o1+o2]

        h_1 = tf.transpose(h_1)
        h_2 = tf.transpose(h_2)

        m = tf.shape(h_1)[1]

        centered_h_1 = h_1 - tf.cast(tf.divide(1, m), tf.float32) * tf.matmul(h_1, tf.ones(shape=(m, m)))
        centered_h_2 = h_2 - tf.cast(tf.divide(1, m), tf.float32) * tf.matmul(h_2, tf.ones(shape=(m, m)))

        sigma_hat_12 = tf.cast(tf.divide(1, m - 1), tf.float32) * tf.matmul(centered_h_1, tf.transpose(centered_h_2))
        sigma_hat_11 = tf.cast(tf.divide(1, m - 1), tf.float32) * tf.matmul(centered_h_1, tf.transpose(centered_h_1)) + regularization_constant_1 * tf.eye(num_rows=o1)
        sigma_hat_22 = tf.cast(tf.divide(1, m - 1), tf.float32) * tf.matmul(centered_h_2, tf.transpose(centered_h_2)) + regularization_constant_2 * tf.eye(num_rows=o2)

        w_1, v_1 = tf.self_adjoint_eig(sigma_hat_11)
        w_2, v_2 = tf.self_adjoint_eig(sigma_hat_22)

        zero = tf.constant(False, dtype=tf.bool)

        idx_pos_entries_1 = tf.where(tf.equal(tf.greater(w_1, epsilon), True))
        idx_pos_entries_1 = tf.reshape(idx_pos_entries_1, [-1, tf.shape(idx_pos_entries_1)[0]])[0]
        w_1 = tf.gather(w_1, idx_pos_entries_1)
        v_1 = tf.gather(v_1, idx_pos_entries_1)

        idx_pos_entries_2 = tf.where(tf.equal(tf.greater(w_2, epsilon), True))
        idx_pos_entries_2 = tf.reshape(idx_pos_entries_2, [-1, tf.shape(idx_pos_entries_2)[0]])[0]
        w_2 = tf.gather(w_2, idx_pos_entries_2)
        v_2 = tf.gather(v_2, idx_pos_entries_2)

        sigma_hat_rootinvert_11 = tf.matmul(tf.matmul(v_1, tf.diag(tf.sqrt(w_1))), tf.transpose(v_1))
        sigma_hat_rootinvert_22 = tf.matmul(tf.matmul(v_2, tf.diag(tf.sqrt(w_2))), tf.transpose(v_2))

        t_matrix = tf.matmul(tf.matmul(sigma_hat_rootinvert_11, sigma_hat_12), sigma_hat_rootinvert_22)

        if k_singular_values == representation_size:  # use all
            correlation = tf.sqrt(tf.trace(tf.matmul(K.transpose(t_matrix), t_matrix)))

        return correlation
    return keras_loss
Here's the tf code provided by Wang on his website for computing the loss function:
def CCA_loss(H1, H2, N, d1, d2, dim, rcov1, rcov2):
    # Remove mean.
    m1 = tf.reduce_mean(H1, axis=0, keep_dims=True)
    H1 = tf.subtract(H1, m1)
    m2 = tf.reduce_mean(H2, axis=0, keep_dims=True)
    H2 = tf.subtract(H2, m2)

    S11 = tf.matmul(tf.transpose(H1), H1) / (N-1) + rcov1 * tf.eye(d1)
    S22 = tf.matmul(tf.transpose(H2), H2) / (N-1) + rcov2 * tf.eye(d2)
    S12 = tf.matmul(tf.transpose(H1), H2) / (N-1)

    E1, V1 = tf.self_adjoint_eig(S11)
    E2, V2 = tf.self_adjoint_eig(S22)

    # For numerical stability.
    idx1 = tf.where(E1 > eps_eig)[:, 0]
    E1 = tf.gather(E1, idx1)
    V1 = tf.gather(V1, idx1, axis=1)
    idx2 = tf.where(E2 > eps_eig)[:, 0]
    E2 = tf.gather(E2, idx2)
    V2 = tf.gather(V2, idx2, axis=1)

    K11 = tf.matmul(tf.matmul(V1, tf.diag(tf.reciprocal(tf.sqrt(E1)))), tf.transpose(V1))
    K22 = tf.matmul(tf.matmul(V2, tf.diag(tf.reciprocal(tf.sqrt(E2)))), tf.transpose(V2))

    T = tf.matmul(tf.matmul(K11, S12), K22)

    # Eigenvalues are sorted in increasing order.
    E2, U = tf.self_adjoint_eig(tf.matmul(T, tf.transpose(T)))
    return tf.reduce_sum(tf.sqrt(E2[-dim:]))
In the logistic regression code listed below, I saw the following snippet. What throws me off is the expression:
probs[range(num_examples), y].
Can someone tell me what dimensions this expression has? My guess is that it's an N*K by N*K matrix, but I am not sure. Thanks.
import numpy as np
import matplotlib.pyplot as plt

np.random.seed(0)
N = 100  # number of points per class
D = 2    # dimensionality
K = 3    # number of classes
X = np.zeros((N*K, D))
y = np.zeros(N*K, dtype='uint8')
for j in xrange(K):
    ix = range(N*j, N*(j+1))
    r = np.linspace(0.0, 1, N)  # radius
    t = np.linspace(j*4, (j+1)*4, N) + np.random.randn(N)*0.2  # theta
    X[ix] = np.c_[r*np.sin(t), r*np.cos(t)]
    y[ix] = j

# Train a linear classifier

# initialize parameters randomly
W = 0.01 * np.random.randn(D, K)
b = np.zeros((1, K))

# some hyperparameters
step_size = 1e-0
reg = 1e-3  # regularization strength

# gradient descent loop
num_examples = X.shape[0]
for i in xrange(200):
    # evaluate class scores, [N x K]
    scores = np.dot(X, W) + b

    # compute the class probabilities
    exp_scores = np.exp(scores)
    probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)  # [N x K]

    # compute the loss: average cross-entropy loss and regularization
    corect_logprobs = -np.log(probs[range(num_examples), y])
    data_loss = np.sum(corect_logprobs)/num_examples
    reg_loss = 0.5*reg*np.sum(W*W)
    loss = data_loss + reg_loss
    if i % 10 == 0:
        print "iteration %d: loss %f" % (i, loss)
probs[range(num_examples), y] is a 1D slice (NumPy integer "fancy" indexing), not a matrix: it picks probs[i, y[i]] for every row i, so the result is a vector of length num_examples = N*K, where:
range(num_examples) supplies the row indices, one per sample
y is a 1D vector of class labels, also of length N*K, supplying the column index for each row
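A tiny self-contained example (with made-up numbers) to illustrate:

import numpy as np

probs = np.array([[0.7, 0.2, 0.1],
                  [0.1, 0.8, 0.1],
                  [0.3, 0.3, 0.4]])
y = np.array([0, 1, 2])  # correct class index per row

picked = probs[range(len(y)), y]  # picks probs[i, y[i]] for each i
print(picked)        # [0.7 0.8 0.4]
print(picked.shape)  # (3,) -- a 1D vector, one entry per example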