I am trying to create an efficient loss function for the following problem:
The loss is a sum of MAE values calculated for each range between the red lines. The blue line is the ground truth, the orange line is a prediction, and the red dots mark the indices where the value of the ground truth changes from one value to another, closing the current value range. Input values are within the [0,1] range. The number of value ranges varies; it can be anywhere between 2 and 12.
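For example (numbers made up just to illustrate): if y_true = [0.2, 0.2, 0.2, 0.7, 0.7], there are two value ranges (indices 0-2 and 3-4); with y_pred = [0.3, 0.2, 0.1, 0.6, 0.8] the loss would be MAE(y_true[0:3], y_pred[0:3]) + MAE(y_true[3:5], y_pred[3:5]) = 0.0667 + 0.1 ≈ 0.167.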
Previously, I wrote a version with TF map_fn, but it was VERY slow:
def rwmae_old(y_true, y_pred):
    y_pred = tf.convert_to_tensor(y_pred)
    y_true = tf.cast(y_true, y_pred.dtype)
    # prepare array
    yt_tmp = tf.concat(
        [tf.ones([len(y_true), 1], dtype=y_pred.dtype) * tf.cast(len(y_true), dtype=y_true.dtype), y_true], axis=-1)
    yt_tmp = tf.concat([yt_tmp, tf.ones([len(y_true), 1]) * tf.cast(len(y_true), dtype=y_true.dtype)], axis=-1)
    # find where there is a change of values between consecutive indices
    ranges = tf.transpose(tf.where(yt_tmp[:, :-1] != yt_tmp[:, 1:]))
    ranges_cols = tf.concat(
        [[0], tf.transpose(tf.where(ranges[1][1:] == 0))[0] + 1, [tf.cast(len(ranges[1]), dtype=y_true.dtype)]], axis=0)
    ranges_rows = tf.range(len(y_true))
    losses = tf.map_fn(
        # loop through every row in the array
        lambda ii:
        tf.reduce_mean(
            tf.map_fn(
                # loop through every range within the example and calculate the loss
                lambda jj:
                tf.reduce_mean(
                    tf.abs(
                        y_true[ii][ranges[1][ranges_cols[ii] + jj]: ranges[1][ranges_cols[ii] + jj + 1]] -
                        y_pred[ii][ranges[1][ranges_cols[ii] + jj]: ranges[1][ranges_cols[ii] + jj + 1]]
                    ),
                ),
                tf.range(ranges_cols[ii + 1] - ranges_cols[ii] - 1),
                fn_output_signature=y_pred.dtype
            )
        ),
        ranges_rows,
        fn_output_signature=y_pred.dtype
    )
    return losses
Today, I wrote a naive version that simply goes through every example in the batch, checks whether the value changes between consecutive indices, and, if so, calculates the MAE for the current range:
def rwmae(y_true, y_pred):
    (batch_size, length) = y_pred.shape
    losses = tf.zeros(batch_size, dtype=y_pred.dtype)
    for ii in range(batch_size):
        # reset loss for the current row
        loss = tf.constant(0, dtype=y_pred.dtype)
        # set current range start index to 0
        ris = 0
        for jj in range(length - 1):
            if y_true[ii][jj] != y_true[ii][jj + 1]:
                # we found a point of change, calculate the loss in the current range and ...
                loss = tf.add(loss, tf.reduce_mean(tf.abs(y_true[ii][ris: jj + 1] - y_pred[ii][ris: jj + 1])))
                # ... update the new range starting point
                ris = jj + 1
        if ris != length - 1:
            # we need to calculate the loss for the rest of the vector
            loss = tf.add(loss, tf.reduce_mean(tf.abs(y_true[ii][ris: length] - y_pred[ii][ris: length])))
        # replace loss in the proper row
        losses = tf.tensor_scatter_nd_update(losses, [[ii]], [loss])
    return losses
Do you think there is any way to improve its efficiency? Or maybe you think there is a better loss function for the problem?
You can try something like this:
import numpy as np
import tensorflow as tf
def rwmae(y_true, y_pred):
    (batch_size, length) = tf.shape(y_pred)
    losses = tf.zeros(batch_size, dtype=y_pred.dtype)
    for ii in tf.range(batch_size):
        ris = 0
        # indices of the last element of every constant-value range
        # (cast length to int64 so it matches the int64 output of tf.where)
        indices = tf.concat([tf.where(y_true[ii][:-1] != y_true[ii][1:])[:, 0], [tf.cast(length, tf.int64) - 1]], axis=0)
        ragged_indices = tf.ragged.range(tf.concat([[ris], indices[:-1] + 1], axis=0), indices + 1)
        loss = tf.reduce_sum(tf.reduce_mean(tf.abs(tf.gather(y_true[ii], ragged_indices) - tf.gather(y_pred[ii], ragged_indices)), axis=-1, keepdims=True))
        losses = tf.tensor_scatter_nd_update(losses, [[ii]], [tf.math.divide_no_nan(loss, tf.cast(tf.shape(indices)[0], dtype=tf.float32))])
    return losses
data = np.load('/content/data.npy', allow_pickle=True)
y_pred = data[0:2][0]
y_true = data[0:2][1]
print(rwmae(y_true, y_pred), y_true.shape)
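If you don't have the data at hand, you can sanity-check it on a made-up toy batch (two rows of length 6, with 2 and 3 constant-value ranges respectively):
y_true = tf.constant([[0.2, 0.2, 0.2, 0.7, 0.7, 0.7],
                      [0.1, 0.1, 0.5, 0.5, 0.9, 0.9]], dtype=tf.float32)
y_pred = tf.constant([[0.3, 0.2, 0.2, 0.6, 0.7, 0.8],
                      [0.1, 0.2, 0.5, 0.4, 0.8, 0.9]], dtype=tf.float32)
print(rwmae(y_true, y_pred))  # one value per example: the mean of its per-range MAEs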
def svm_loss_naive(W, X, y):
    """
    SVM loss function, naive implementation calculating loss for each sample
    using loops.
    Inputs:
    - X: A numpy array of shape (n, m) containing data (samples).
    - y: A numpy array of shape (m, ) containing labels
    - W: A numpy array of shape (p, n) containing weights.
    """
    # Compute the loss
    num_classes = W.shape[0]   # classes weights are in row-wise fashion
    num_samples = X.shape[1]   # samples of unknown images are in column-wise fashion
    loss = 0.0
    delta = 1  # SVM parameter
    for i in range(num_samples):
        scores = np.dot(W, X[:, i])
        correct_class_score = scores[y[i]]
        for j in range(num_classes):
            if j == y[i]:
                continue
            margin = max(0, scores[j] - correct_class_score + delta)
            loss = loss + margin
    # Average loss
    loss = loss / num_samples
    return loss
According to my understanding of the Python code:
we first calculate the score for the 1st category by multiplying the weights of the 1st row with the 1st sample column,
then we fetch the correct_class_score of the i-th sample, whose index we have stored in array y,
then we iterate over the number of classes (let it be 3). The thing I didn't understand is:
what is j == y[i] doing?
I mean to say, when will j be equal to y[i], as j ranges from 0 to 2 and y[i] is just the index of the correct_class_score for the i-th sample?
The rest of the code I understood. Thanks in advance.
This is the SVM loss (data loss) function definition: L_i = \sum_{j \ne y_i} \max(0, s_j - s_{y_i} + \Delta), where s_j = (W x_i)_j are the class scores for sample i.
In the inner summation, the index j equal to y[i] is explicitly excluded, so the margin of the correct class never contributes to the loss; that is exactly what the if j == y[i]: continue check in the code does.
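To make this concrete, here is a tiny made-up example with one sample, 3 classes and correct class y[i] = 0:
import numpy as np
scores = np.array([3.0, 1.0, 2.5])   # scores = np.dot(W, X[:, i])
y_i = 0
delta = 1
# j == y_i is skipped, so only the two wrong classes contribute margins:
loss_i = sum(max(0, scores[j] - scores[y_i] + delta) for j in range(3) if j != y_i)
print(loss_i)   # max(0, 1 - 3 + 1) + max(0, 2.5 - 3 + 1) = 0 + 0.5 = 0.5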
I am new to PyTorch and I would like to implement linear regression partly with PyTorch and partly on my own. I want to use squared features for my regression:
import torch

# init
x = torch.tensor([1,2,3,4,5])
y = torch.tensor([[1],[4],[9],[16],[25]])
w = torch.tensor([[0.5], [0.5], [0.5]], requires_grad=True)
iterations = 30
alpha = 0.01

def forward(X):
    # feature transformation [1, x, x^2]
    psi = torch.tensor([[1.0, x[0], x[0]**2]])
    for i in range(1, len(X)):
        psi = torch.cat((psi, torch.tensor([[1.0, x[i], x[i]**2]])), 0)
    return torch.matmul(psi, w)

def loss(y, y_hat):
    return ((y-y_hat)**2).mean()

for i in range(iterations):
    y_hat = forward(x)
    l = loss(y, y_hat)
    l.backward()
    with torch.no_grad():
        w -= alpha * w.grad
        w.grad.zero_()
    if i%10 == 0:
        print(f'Iteration {i}: The weight is:\n{w.detach().numpy()}\nThe loss is:{l}\n')
When I execute my code, the regression doesn't learn the correct features and the loss increases permanently. The output is the following:
Iteration 0: The weight is:
[[0.57 ]
[0.81 ]
[1.898]]
The loss is:25.450000762939453
Iteration 10: The weight is:
[[ 5529.5835]
[22452.398 ]
[97326.12 ]]
The loss is:210414632960.0
Iteration 20: The weight is:
[[5.0884394e+08]
[2.0662339e+09]
[8.9567642e+09]]
The loss is:1.7820802835250162e+21
Does somebody know why my model is not learning?
UPDATE
Is there a reason why it performs so poorly? I thought it was because of the low number of training data points, but it does not perform well even with 10 data points:
You should normalize your data. Also, since you're trying to fit x -> ax² + bx + c, c is essentially the bias. It would be wiser to remove it from the training data (I'm referring to psi here) and use a separate parameter for the bias.
What could be done:
normalize your input data and targets with mean and standard deviation.
separate the parameters into w (a two-component weight tensor) and b (the bias).
you don't need to construct psi on every inference since x is identical.
you can build psi with torch.stack([torch.ones_like(x), x, x**2], 1), but here we won't need the ones, as we've essentially detached the bias from the weight tensor.
Here's how it would look:
x = torch.tensor([1,2,3,4,5]).float()
psi = torch.stack([x, x**2], 1).float()
psi = (psi - psi.mean(0)) / psi.std(0)

y = torch.tensor([[1],[4],[9],[16],[25]]).float()
y = (y - y.mean(0)) / y.std(0)

w = torch.tensor([[0.5], [0.5]], requires_grad=True)
b = torch.tensor([0.5], requires_grad=True)

iterations = 30
alpha = 0.02

def loss(y, y_hat):
    return ((y-y_hat)**2).mean()

for i in range(iterations):
    y_hat = torch.matmul(psi, w) + b
    l = loss(y, y_hat)
    l.backward()
    with torch.no_grad():
        w -= alpha * w.grad
        b -= alpha * b.grad
        w.grad.zero_()
        b.grad.zero_()
    if i%10 == 0:
        print(f'Iteration {i}: The weight is:\n{w.detach().numpy()}\nThe loss is:{l}\n')
And the results:
Iteration 0: The weight is:
[[0.49954653]
[0.5004535 ]]
The loss is:0.25755801796913147
Iteration 10: The weight is:
[[0.49503425]
[0.5049657 ]]
The loss is:0.07994867861270905
Iteration 20: The weight is:
[[0.49056274]
[0.50943726]]
The loss is:0.028329044580459595
Currently I'm taking Andrew Ng's course on Coursera called "Machine Learning". In exercise 5, we built a model that can predict digits, trained on the MNIST dataset. I completed this task successfully in Matlab, but I wanted to migrate that code to Python, just to see how different things are and maybe continue to play around with the model.
I managed to implement the cost function and the backpropagation algorithm correctly. I know that because I compared the metrics with my working model in Matlab and it emits the same numbers.
Now, because in the course we train the model using fmincg, I tried to do the same using Scipy's fmin_cg function.
My problem is that the optimizer takes extremely small steps and fails to converge.
Here is my code for the network:
import numpy as np
import utils
import scipy.optimize as op


class Network:
    def __init__(self, layers):
        self.layers = layers
        self.weights = self.generate_params()

    # Function for generating theta multidimensional matrix
    def generate_params(self):
        theta = []
        epsilon = 0.12
        for i in range(len(self.layers) - 1):
            current_layer_units = self.layers[i]
            next_layer_units = self.layers[i + 1]
            theta_i = np.multiply(
                np.random.rand(next_layer_units, current_layer_units + 1),
                2 * epsilon - epsilon
            )
            # Appending the params to the theta matrix
            theta.append(theta_i)
        return theta

    # Function to append bias row/column to matrix X
    def append_bias(self, X, d):
        m = X.shape[0]
        n = 1 if len(X.shape) == 1 else X.shape[1]
        if (d == 'column'):
            ones = np.ones((m, n + 1))
            ones[:, 1:] = X.reshape((m, n))
        elif (d == 'row'):
            ones = np.ones((m + 1, n))
            ones[1:, :] = X.reshape((m, n))
        return ones

    # Function for computing the gradient for 1 training example
    def back_prop(self, y, feed, theta):
        activations = feed["activations"]
        weighted_layers = feed["weighted_layers"]
        delta_output = activations[-1] - y.reshape(len(y), 1)
        current_delta = delta_output
        # Initializing gradients
        gradients = []
        for i, theta_i in enumerate(theta):
            gradients.append(np.zeros(theta_i.shape))
        # Performing delta calculations.
        # Here, we continue to propagate the delta values backwards
        # until we arrive to the second layer.
        for i in reversed(range(len(theta))):
            theta_i = theta[i]
            if (i > 0):
                i_weighted_inputs = self.append_bias(weighted_layers[i - 1], 'row')
                t_theta_i = np.transpose(theta_i)
                delta_i = np.multiply(np.dot(t_theta_i, current_delta), utils.sigmoidGradient(i_weighted_inputs))
                delta_i = delta_i[1:]
                gradients[i] = current_delta * np.transpose(activations[i])
                # Setting current delta for the next layer
                current_delta = delta_i
            else:
                gradients[i] = current_delta * np.transpose(activations[i])
        return gradients

    # Function for computing the cost and the derivatives
    def compute_cost(self, theta, X, y, r12n=0):
        m = len(X)
        num_labels = self.layers[-1]
        costs = np.zeros(m)
        # Initializing gradients
        gradients = []
        for i, theta_i in enumerate(theta):
            gradients.append(np.zeros(theta_i.shape))
        # Iterating over the training set
        for i in range(m):
            inputs = X[i]
            observed = utils.create_output_vector(y[i], num_labels)
            feed = self.feed_forward(inputs)
            predicted = feed["activations"][-1]
            total_cost = 0
            for k, o in enumerate(observed):
                if (o == 1):
                    total_cost += np.log(predicted[k])
                else:
                    total_cost += np.log(1 - predicted[k])
            cost = -1 * total_cost
            # Storing the cost for the i-th training example
            costs[i] = cost
            # Calculating the gradient for this training example
            # using back propagation algorithm
            gradients_i = self.back_prop(observed, feed, theta)
            for i, gradient in enumerate(gradients_i):
                gradients[i] += gradient
        # Calculating the avg regularization term for the cost
        sum_of_theta = 0
        for i, theta_i in enumerate(theta):
            squared_theta = np.power(theta_i[:, 1:], 2)
            sum_of_theta += np.sum(squared_theta)
        r12n_avg = r12n * sum_of_theta / (2 * m)
        total_cost = np.sum(costs) / m + r12n_avg
        # Applying regularization terms to the gradients
        for i, theta_i in enumerate(theta):
            lambda_i = np.copy(theta_i)
            lambda_i[:, 0] = 0
            lambda_i = np.multiply((r12n / m), lambda_i)
            # Adding the r12n matrix to the gradient
            gradients[i] = gradients[i] / m + lambda_i
        return total_cost, gradients

    # Function for training the neural network using conjugate gradient algorithm
    def train_cg(self, X, y, r12n=0, iterations=50):
        weights = self.weights

        def Cost(theta, X, y):
            theta = utils.roll_theta(theta, self.layers)
            cost, _ = self.compute_cost(theta, X, y, r12n)
            print(cost)
            return cost

        def Gradient(theta, X, y):
            theta = utils.roll_theta(theta, self.layers)
            _, gradient = self.compute_cost(theta, X, y, r12n)
            return utils.unroll_theta(gradient)

        unrolled_theta = utils.unroll_theta(weights)
        result = op.fmin_cg(f=Cost,
                            x0=unrolled_theta,
                            args=(X, y),
                            fprime=Gradient,
                            maxiter=iterations)
        self.weights = utils.roll_theta(result, self.layers)

    # Function for feeding forward the network
    def feed_forward(self, X):
        # Useful variables
        activations = []
        weighted_layers = []
        weights = self.weights
        currentActivations = self.append_bias(X, 'row')
        activations.append(currentActivations)
        for i in range(len(self.layers) - 1):
            layer_weights = weights[i]
            weighted_inputs = np.dot(layer_weights, currentActivations)
            # Storing the weighted inputs
            weighted_layers.append(weighted_inputs)
            activation_nodes = []
            # If the next layer is not the output layer, we'd like to add a bias unit to it
            # (Excluding the input and the output layer)
            if (i < len(self.layers) - 2):
                activation_nodes = self.append_bias(utils.sigmoid(weighted_inputs), 'row')
            else:
                activation_nodes = utils.sigmoid(weighted_inputs)
            # Appending the layer of nodes to the activations array
            activations.append(activation_nodes)
            currentActivations = activation_nodes
        data = {
            "activations": activations,
            "weighted_layers": weighted_layers
        }
        return data

    def predict(self, X):
        data = self.feed_forward(X)
        output = data["activations"][-1]
        # Finding the max index in the output layer
        return np.argmax(output, axis=0)
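The utils module just contains small helpers; roughly, they look like this (a sketch of what they do, not the exact code):
import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def sigmoidGradient(z):
    s = sigmoid(z)
    return s * (1 - s)

def create_output_vector(label, num_labels):
    # One-hot encode a label into a vector of length num_labels
    # (the exact index convention may differ in my real code).
    out = np.zeros(num_labels)
    out[label % num_labels] = 1
    return out

def unroll_theta(theta):
    # Flatten a list of per-layer weight matrices into a single 1-D vector.
    return np.concatenate([t.ravel() for t in theta])

def roll_theta(flat, layers):
    # Reshape a flat vector back into per-layer weight matrices of shape
    # (next_layer_units, current_layer_units + 1).
    theta, start = [], 0
    for i in range(len(layers) - 1):
        rows, cols = layers[i + 1], layers[i] + 1
        end = start + rows * cols
        theta.append(np.reshape(flat[start:end], (rows, cols)))
        start = end
    return theta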
Here is the invocation of the code:
import numpy as np
from network import Network
# %% Load data
X = np.genfromtxt('data/mnist_data.csv', delimiter=',')
y = np.genfromtxt('data/mnist_outputs.csv', delimiter=',').astype(int)
# %% Create network
num_labels = 10
input_layer = 400
hidden_layer = 25
output_layer = num_labels
layers = [input_layer, hidden_layer, output_layer]
# Create a new neural network
network = Network(layers)
# %% Train the network and save the weights
network.train_cg(X, y, r12n = 1, iterations = 20)
This is what the code emits after each iteration:
15.441233231650283
15.441116436313076
15.441192262452514
15.44122384651483
15.441231216030646
15.441232804294314
15.441233141284435
15.44123321255294
15.441233227614855
As you can see, the changes to the cost are very small.
I checked the shapes of the vectors and the gradient and they both seem fine, just like in my Matlab implementation. I'm not sure what I'm doing wrong here.
If you guys could help me, that'd be great :)
After answering this question, I ran into some interesting but confusing findings in tensorflow 2.0. The gradients of the logits look incorrect to me. Let's say we have logits and labels here.
logits = tf.Variable([[0.8, 0.1, 0.1]], dtype=tf.float32)
labels = tf.constant([[1, 0, 0]], dtype=tf.float32)

with tf.GradientTape(persistent=True) as tape:
    loss = tf.reduce_sum(tf.keras.losses.categorical_crossentropy(labels, logits,
                                                                   from_logits=False))
grads = tape.gradient(loss, logits)
print(grads)
Since logits is already a probability distribution, I set from_logits=False in the loss function.
I thought tensorflow would use loss = -\sum_i p_i \log(q_i) to calculate the loss, and if we differentiate with respect to q_i, the derivative is -p_i/q_i. So the expected grads should be [-1.25, 0, 0]. However, tensorflow returns [-0.25, 1, 1].
After reading the source code of tf.categorical_crossentropy, I found that even though we set from_logits=False, it still normalizes the probabilities. That changes the final gradient expression. Specifically, the gradient becomes -p_i/q_i + p_i/sum_j(q_j). If p_i = 1 and sum_j(q_j) = 1, the final gradient gains an extra one. That's why the gradient is -0.25; however, I haven't figured out why the last two gradients are 1.
To verify that all gradients are increased by 1/sum_j(q_j), I made up logits that are not a probability distribution and still set from_logits=False.
logits = tf.Variable([[0.5, 0.1, 0.1]], dtype=tf.float32)
labels = tf.constant([[1, 0, 0]], dtype=tf.float32)

with tf.GradientTape(persistent=True) as tape:
    loss = tf.reduce_sum(tf.keras.losses.categorical_crossentropy(labels, logits,
                                                                   from_logits=False))
grads = tape.gradient(loss, logits)
print(grads)
The grads returned by tensorflow are [-0.57142866, 1.4285713, 1.4285713], which I thought should be [-2, 0, 0].
This shows that all gradients are increased by 1/(0.5+0.1+0.1). For p_i == 1, the gradient being increased by 1/(0.5+0.1+0.1) makes sense to me. But I don't understand why, when p_i == 0, the gradient is still increased by 1/(0.5+0.1+0.1).
Update
Thanks to @OverLordGoldDragon's kind reminder. After normalizing the probs, the correct gradient formula should be -p_i/q_i + 1/sum_j(q_j). So the behaviors in the question are expected.
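As a quick sanity check of that formula: with the normalization, loss = -sum_i p_i * log(q_i / sum_j(q_j)), so d(loss)/d(q_k) = -p_k/q_k + (sum_i p_i)/sum_j(q_j) = -p_k/q_k + 1/sum_j(q_j) for one-hot p. Plugging in p = [1, 0, 0] and q = [0.5, 0.1, 0.1] gives [-1/0.5 + 1/0.7, 1/0.7, 1/0.7] ≈ [-0.571, 1.429, 1.429], which matches the grads returned above.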
Categorical crossentropy is tricky, particularly w.r.t. one-hot encodings; the problem arises from presuming that some predictions are "tossed out" in computing the loss or gradient, based on how the loss is computed:
loss = f(labels * preds) = f([1, 0, 0] * preds)
Why are the gradients incorrect? The above may suggest that preds[1:] doesn't matter, but note that this isn't actually preds - it's pred_norm, which involves every element of preds. To get a better idea of what's happening, the Numpy backend is helpful; assuming from_logits=False:
losses = []
for label, pred in zip(labels, preds):
    pred_norm = pred / pred.sum(axis=-1, keepdims=True)
    losses.append(np.sum(label * -np.log(pred_norm), axis=-1, keepdims=False))
A more complete explanation of the above is here. Below is my derivation of the gradient formula, with examples comparing its Numpy implementation against tf.GradientTape results. To skip the meaty details, scroll to "Main idea".
Formula + Derivation: proof of correctness at the bottom.
"""
grad = -y * sum(p_zeros) / (p_one * sum(pred)) + p_mask / sum(pred)
p_mask = abs(y - 1)
p_zeros = p_mask * pred
y = label: 1D array of length N, one-hot
p = prediction: 1D array of length N, float32 from 0 to 1
p_norm = normalized predictions
p_mask = prediction masks (see below)
"""
What's happening? Begin with a simple example to understand what tf.GradientTape is doing:
w = tf.Variable([0.5, 0.1, 0.1])
with tf.GradientTape(persistent=True) as tape:
    f1 = w[0] + w[1]  # f = function
    f2 = w[0] / w[1]
    f3 = w[0] / (w[0] + w[1] + w[2])

print(tape.gradient(f1, w))  # [1. 1. 0.]
print(tape.gradient(f2, w))  # [10. -50. 0.]
print(tape.gradient(f3, w))  # [0.40816 -1.02040 -1.02040]
Let w = [w1, w2, w3]. Then:
"""
grad = [df1/dw1, df1/dw2, df1/dw3]
grad1 = [d(w1 + w2)/w1, d(w1 + w2)/w2, d(w1 + w2)/w3] = [1, 1, 0]
grad2 = [d(w1 / w2)/w1, d(w1 / w2)/w2, d(w1 + w2)/w3] = [1/w2, -w1/w2^2, 0] = [10, -50, 0]
grad3 = [(w1 + w2)/K, - w2/K, -w3/K] = [0.40816 -1.02040 -1.02040] -- K = (w1 + w2 + w3)^2
"""
In other words, tf.GradientTape treats each element of the input tensor it's differentiating against as a variable. With this in mind, it suffices to implement categorical crossentropy via elementary tf functions, derive its gradient by hand, and see whether they agree. That's what I've done in the code at the bottom, with the loss better explained in the answer linked above.
Formula explanation:
f3 above is the most insightful, as it's effectively pred_norm; all we need now is to add a natural log and handle two separate cases: grads for y == 1, and for y == 0; with a handy tool like Wolfram Alpha, the derivatives can be computed in a flash. Adding more variables to the denominator, we can see the following pattern:
d(loss)/d(p_one)     = -sum(p_zeros) / (p_one * sum(pred))
d(loss)/d(p_non_one) = 1 / sum(pred)
where p_one is the pred element where label == 1, p_non_one is any other pred element, and p_zeros is all pred elements except p_one. The code at the bottom is simply an implementation of exactly this, using compact syntax.
Explanation example:
Suppose label = [1, 0, 0]; pred = [.5, .1, .1]. Below is numpy_gradient, step-by-step:
p_mask == [0, 1, 1] # effectively `label` "inverted", to exclude `p_one`
p_one == .5 # pred where `label` == 1
## grad_zeros
p_mask / np.sum(pred) == [0, 1, 1] / (.5 + .1 + .1) = [0, 1/.7, 1/.7]
## grad_one
p_one * np.sum(pred) == .5 * (.5 + .1 + .1) = .5 * .7 = .35
p_mask * pred == [0, 1, 1] * [.5, .1, .1] = [0, .1, .1]
np.sum(p_mask * pred) == .2
label * np.sum(p_mask * pred) == .2 * [1, 0, 0] = [.2, 0, 0]
label * np.sum(p_mask * pred) / (p_one * np.sum(pred))
== [.2, 0, 0] / .35 = [0.57142854, 0, 0]
Per above, we can see that the gradient is effectively divided into two computations: grad_one, and grad_zeros.
Main idea: understandably, that's a lot of detail, so here's the main idea: every element of label and pred affects grad, and loss is computed using pred_norm, not pred, and the normalization step is backpropagated. We can run a little visual to confirm this:
import numpy as np
import matplotlib.pyplot as plt

labels = tf.constant([[1, 0, 0]], dtype=tf.float32)
grads = []
for i in np.linspace(0, 1, 100):
    logits = tf.Variable([[0.5, 0.1, i]], dtype=tf.float32)
    with tf.GradientTape(persistent=True) as tape:
        loss = tf.keras.losses.categorical_crossentropy(
            labels, logits, from_logits=False)
    grads.append(tape.gradient(loss, logits))

grads = np.vstack(grads)
plt.plot(grads)
Even though only logits[2] is varied, grads[1] varies exactly the same. The explanation's clear from grad_zeros above, but more intuitively, categorical crossentropy doesn't care "how wrong" the zero-label predictions are individually, only collectively - because it only semi-directly computes loss from pred[0] (i.e. pred[0] / sum(pred)), which is normalized by all other pred. So whether pred[1] == .9 and pred[2] == .2 or vice versa, p_norm is exactly the same.
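A quick numeric check of that last claim (numbers made up): swapping the two zero-label predictions leaves the normalized loss unchanged:
import numpy as np
label = np.array([1., 0., 0.])
for pred in (np.array([.5, .9, .2]), np.array([.5, .2, .9])):
    pred_norm = pred / pred.sum()
    print(np.sum(label * -np.log(pred_norm)))   # same value for both orderings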
Closing note: derived formulas are intended for a 1D case for simplicity, and may not work for N-dimensional labels and preds tensors, but can be easily generalized.
Numpy vs. tf.GradientTape:
def numpy_gradient(label, pred):
    p_mask = np.abs(label - 1)
    p_one = pred[np.where(label == 1)[0][0]]
    return p_mask / np.sum(pred) \
           - label * np.sum(p_mask * pred) / (p_one * np.sum(pred))

def gtape_gradient(label, pred):
    pred = tf.Variable(pred)
    label = tf.Variable(label)
    with tf.GradientTape() as tape:
        loss = - tf.math.log(tf.reduce_sum(label * pred) / tf.reduce_sum(pred))
    return tape.gradient(loss, pred).numpy()


label = np.array([1., 0., 0.])
pred = np.array([0.5, 0.1, 0.1])

print(numpy_gradient(label, pred))
print(gtape_gradient(label, pred))
# [-0.57142854  1.4285713   1.4285713 ] <-- 100% agreement
# [-0.57142866  1.4285713   1.4285713 ] <-- 100% agreement
Given the simple OR gate problem:
or_input = np.array([[0,0], [0,1], [1,0], [1,1]])
or_output = np.array([[0,1,1,1]]).T
If we train a simple single-layered perceptron (without backpropagation), we could do something like this:
import numpy as np
np.random.seed(0)

def sigmoid(x): # Returns values that sums to one.
    return 1 / (1 + np.exp(-x))

def cost(predicted, truth):
    return (truth - predicted)**2

or_input = np.array([[0,0], [0,1], [1,0], [1,1]])
or_output = np.array([[0,1,1,1]]).T

# Define the shape of the weight vector.
num_data, input_dim = or_input.shape
# Define the shape of the output vector.
output_dim = len(or_output.T)

num_epochs = 50 # No. of times to iterate.
learning_rate = 0.03 # How large a step to take per iteration.

# Lets standardize and call our inputs X and outputs Y
X = or_input
Y = or_output
W = np.random.random((input_dim, output_dim))

for _ in range(num_epochs):
    layer0 = X
    # Forward propagation.
    # Inside the perceptron, Step 2.
    layer1 = sigmoid(np.dot(X, W))
    # How much did we miss in the predictions?
    cost_error = cost(layer1, Y)
    # update weights
    W += - learning_rate * np.dot(layer0.T, cost_error)

# Expected output.
print(Y.tolist())
# On the training data
print([[int(prediction > 0.5)] for prediction in layer1])
[out]:
[[0], [1], [1], [1]]
[[0], [1], [1], [1]]
With backpropagation, to compute d(cost)/d(X), are the following steps correct?
compute the layer1 error by multiplying the cost error and the derivatives of the cost
then compute the layer1 delta by multiplying the layer 1 error and the derivatives of the sigmoid
then do a dot product between the inputs and the layer1 delta to get the differential, i.e. d(cost)/d(X)
Then the d(cost)/d(X) is multiplied with the negative of the learning rate to perform gradient descent.
num_epochs = 0 # No. of times to iterate.
learning_rate = 0.03 # How large a step to take per iteration.

# Lets standardize and call our inputs X and outputs Y
X = or_input
Y = or_output
W = np.random.random((input_dim, output_dim))

for _ in range(num_epochs):
    layer0 = X
    # Forward propagation.
    # Inside the perceptron, Step 2.
    layer1 = sigmoid(np.dot(X, W))
    # How much did we miss in the predictions?
    cost_error = cost(layer1, Y)
    # Back propagation.
    # multiply how much we missed from the gradient/slope of the cost for our prediction.
    layer1_error = cost_error * cost_derivative(cost_error)
    # multiply how much we missed by the gradient/slope of the sigmoid at the values in layer1
    layer1_delta = layer1_error * sigmoid_derivative(layer1)
    # update weights
    W += - learning_rate * np.dot(layer0.T, layer1_delta)
In that case, should the implementation look like the code below, with cost_derivative and sigmoid_derivative?
import numpy as np
np.random.seed(0)

def sigmoid(x): # Returns values that sums to one.
    return 1 / (1 + np.exp(-x))

def sigmoid_derivative(sx):
    # See https://math.stackexchange.com/a/1225116
    return sx * (1 - sx)

def cost(predicted, truth):
    return (truth - predicted)**2

def cost_derivative(y):
    # If the cost is:
    # cost = y - y_hat
    # What's the derivative of d(cost)/d(y)
    # d(cost)/d(y) = 1
    return 2*y

or_input = np.array([[0,0], [0,1], [1,0], [1,1]])
or_output = np.array([[0,1,1,1]]).T

# Define the shape of the weight vector.
num_data, input_dim = or_input.shape
# Define the shape of the output vector.
output_dim = len(or_output.T)

num_epochs = 5 # No. of times to iterate.
learning_rate = 0.03 # How large a step to take per iteration.

# Lets standardize and call our inputs X and outputs Y
X = or_input
Y = or_output
W = np.random.random((input_dim, output_dim))

for _ in range(num_epochs):
    layer0 = X
    # Forward propagation.
    # Inside the perceptron, Step 2.
    layer1 = sigmoid(np.dot(X, W))
    # How much did we miss in the predictions?
    cost_error = cost(layer1, Y)
    # Back propagation.
    # multiply how much we missed from the gradient/slope of the cost for our prediction.
    layer1_error = cost_error * cost_derivative(cost_error)
    # multiply how much we missed by the gradient/slope of the sigmoid at the values in layer1
    layer1_delta = layer1_error * sigmoid_derivative(layer1)
    # update weights
    W += - learning_rate * np.dot(layer0.T, layer1_delta)

# Expected output.
print(Y.tolist())
# On the training data
print([[int(prediction > 0.5)] for prediction in layer1])
[out]:
[[0], [1], [1], [1]]
[[0], [1], [1], [1]]
BTW, given the random seed, even without updating W via gradient descent (i.e. without actually training the perceptron), the prediction can still be right:
import numpy as np
np.random.seed(0)
# Lets standardize and call our inputs X and outputs Y
X = or_input
Y = or_output
W = np.random.random((input_dim, output_dim))
# On the training data
predictions = sigmoid(np.dot(X, W))
[[int(prediction > 0.5)] for prediction in predictions]
You are almost correct. In your implementation, you define the cost as the square of the error, which has the unfortunate consequence of being always positive. As a result, if you plot mean(cost_error), it rises slowly at each iteration, and your weights slowly decrease.
In your particular case, any weights > 0 will make it work: if you run your implementation with enough epochs, your weights will turn negative and your network won't work anymore.
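A quick check of that claim with some made-up positive weights - sigmoid(0) = 0.5 is not > 0.5, so [0, 0] maps to 0, while any positive pre-activation maps to 1:
import numpy as np
X = np.array([[0,0], [0,1], [1,0], [1,1]])
for W in (np.array([[0.01], [2.0]]), np.array([[5.0], [0.3]])):
    out = 1 / (1 + np.exp(-np.dot(X, W)))
    print([[int(o > 0.5)] for o in out.ravel()])   # [[0], [1], [1], [1]] both times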
You can just remove the square in your cost function:
def cost(predicted, truth):
    return (truth - predicted)
Now, to update your weights, you need to evaluate the gradient at the "position" of your error. So what you need is:
d_predicted = output_errors * sigmoid_derivative(predicted_output)
Next, we update the weights:
W += np.dot(X.T, d_predicted) * learning_rate
Full code with error display:
import numpy as np
import matplotlib.pyplot as plt
np.random.seed(0)

def sigmoid(x): # Returns values that sums to one.
    return 1 / (1 + np.exp(-x))

def sigmoid_derivative(sx):
    # See https://math.stackexchange.com/a/1225116
    return sx * (1 - sx)

def cost(predicted, truth):
    return (truth - predicted)

or_input = np.array([[0,0], [0,1], [1,0], [1,1]])
or_output = np.array([[0,1,1,1]]).T

# Define the shape of the weight vector.
num_data, input_dim = or_input.shape
# Define the shape of the output vector.
output_dim = len(or_output.T)

num_epochs = 50 # No. of times to iterate.
learning_rate = 0.1 # How large a step to take per iteration.

# Lets standardize and call our inputs X and outputs Y
X = or_input
Y = or_output
W = np.random.random((input_dim, output_dim))
# W = [[-1],[1]] # you can try to set bad weights to see the training process

error_list = []

for _ in range(num_epochs):
    layer0 = X
    # Forward propagation.
    layer1 = sigmoid(np.dot(X, W))
    # How much did we miss in the predictions?
    cost_error = cost(layer1, Y)
    error_list.append(np.mean(cost_error)) # save the loss to plot later
    # Back propagation.
    # eval the gradient at the predicted output (layer1), as described above:
    d_predicted = cost_error * sigmoid_derivative(layer1)
    # update weights
    W = W + np.dot(X.T, d_predicted) * learning_rate

# Expected output.
print(Y.tolist())
# On the training data
print([[int(prediction > 0.5)] for prediction in layer1])

# plot error curve:
plt.plot(range(num_epochs), error_list, '+b')
plt.xlabel('Epoch')
plt.ylabel('mean error')
I also added a line to set the initial weights manually, so you can see how the network learns.