Visualizing the attention map of a multihead attention in ViT

Visualizing the attention map of a multihead attention in ViT - python

I'm trying to visualize the attention map of mit Visual Transformer architecture in keras/tensorflow. For this I was able to implement the ViT model the following way:
def model():
input_layer = layers.Input(shape=input_shape)
#image_patches = create_patches(input_layer)
#print(input_layer.shape)
image_patches = Patches(patch_size)(input_layer)
#print(image_patches.shape)
encoded_patches = PatchEncoder(num_patch, projection_dim)(image_patches)
#print(encoded_patches.shape)
#for i in range(transformer_blocks):
x1 = layers.LayerNormalization()(encoded_patches)
x1 = layers.MultiHeadAttention(num_heads=num_heads, key_dim=projection_dim, name='MHA_1')(x1, x1)
x = layers.Add()([x1, encoded_patches])
x2 = layers.LayerNormalization()(x)
x2 = mlp_head(x2, transformer_units)
encoded_patches = layers.Add()([x2, x])
x = layers.LayerNormalization()(encoded_patches)
x = layers.Flatten()(x)
x = layers.Dropout(0.5)(x)
x = layers.Dense(2)(x)
model = tf.keras.Model(inputs=input_layer, outputs=x)
print(model.summary())
return model
I'm now trying to visualize the attention map based on an input image and my model output. For this I first try to predict the outcome and reshape the weights:
def attention_map(model, image):
size = model.input_shape[1]
grid_size = int(np.sqrt(model.layers[4].output_shape[-2] - 1))
# Prepare the input
X = preprocess_inputs(cv2.resize(image, (size, size)))#[np.newaxis, :] # type: ignore
# Get the attention weights from each transformer.
outputs = [
l.output[1] for l in model.layers if isinstance(l, layers.MultiHeadAttention)
]
weights = np.array(
tf.keras.models.Model(inputs=model.inputs, outputs=outputs).predict(X_test)
)
print(weights.shape)
num_layers = weights.shape[0]
num_heads = weights.shape[1]
reshaped = weights.reshape(
(num_layers, num_heads, grid_size ** 2 + 1, grid_size ** 2 + 1)
)
# From Appendix D.6 in the paper ...
# Average the attention weights across all heads.
reshaped = reshaped.mean(axis=1)
# From Section 3 in https://arxiv.org/pdf/2005.00928.pdf ...
# To account for residual connections, we add an identity matrix to the
# attention matrix and re-normalize the weights.
reshaped = reshaped + np.eye(reshaped.shape[1])
reshaped = reshaped / reshaped.sum(axis=(1, 2))[:, np.newaxis, np.newaxis]
# Recursively multiply the weight matrices
v = reshaped[-1]
for n in range(1, len(reshaped)):
v = np.matmul(v, reshaped[-1 - n])
# Attention from the output token to the input space.
mask = v[0, 1:].reshape(grid_size, grid_size)
mask = cv2.resize(mask / mask.max(), (image.shape[1], image.shape[0]))[
..., np.newaxis
]
return (mask * image).astype("uint8")
However my problem is now to reshape my weight matrix getting in mismatch. Can someone give me a hint on why this is occuring? A hint based on the output dimension given by
weights = np.array(
tf.keras.models.Model(inputs=model.inputs, outputs=outputs).predict(X_test)
)
would also help.

Related

Keras tensor operation dimensionality problem when implementing inductive bias

I am trying to implement the following inductive bias (the following is for a batch size/sample size of one, the first array dimension):
A function g takes five variables/features 100 times (second array dimension in X below). I then find the average of these hundred function evaluations and feed it into a second function f. Using the Keras-only API, I tried implementing this inductive bias in the following way:
def call(self, x):
y_i = self.g(x)[:,:, 0]
y = keras.backend.sum(y_i, axis=1, keepdims=True) / y_i.shape[1]
z = self.f(y)
return z[:, 0]
Sadly, I get the following error.
NotImplementedError: Unable to build a Value from a 'tuple' instance
which is a Keras problem when dimensionality does not match. If I write instead:
def call(self, x):
y_i = self.g(x)[:,:, 0]
return y_i
skipping f entirely, the whole model runs fine. I therefore suspect that my summing y = keras.backend.sum(y_i, axis=1, keepdims=True) / y_i.shape[1] is wrong.
For reproduction, here is the entire runnable code. You can skip import os and os.environ... if you are not on a Mac with PlaidML and AMD.
Code
# %%
import os
os.environ["KERAS_BACKEND"] = "plaidml.keras.backend"
import numpy as np
import keras
def mlp2(size_in, size_out):
hidden = 128
inputs = keras.Input(shape=(size_in,))
x = keras.layers.Dense(hidden, name='layer1', activation='relu')(inputs)
x = keras.layers.Dense(hidden, name='layer2', activation='relu')(x)
x = keras.layers.Dense(hidden, name='layer3', activation='relu')(x)
outputs = keras.layers.Dense(size_out, name='layer4', activation='relu')(x)
m = keras.Model(inputs, outputs)
return m
class SumNet(keras.models.Model):
def __init__(self):
super(SumNet, self).__init__()
########################################################
# The same inductive bias as above!
self.g = mlp2(5, 1)
self.f = mlp2(1, 1)
def call(self, x):
y_i = self.g(x)[:,:, 0]
y = keras.backend.sum(y_i, axis=1, keepdims=True) / y_i.shape[1]
z = self.f(y)
return z[:,]
# %%
###### np.random.seed(0)
factoring = SumNet()
# check if there is an argument for a maximum learning rate, set to default 10^-3
# check difference of epochs vs total steps in pytorch "scheduler" object
optimizer = keras.optimizers.Adam(lr=1e-3)
factoring.compile(optimizer, loss=keras.losses.mean_squared_error)
# %%
N = 100000
Nt = 100
X = 6 * np.random.rand(N, Nt, 5) - 3
y_i = X[..., 0] ** 2 + 6 * np.cos(2 * X[..., 2])
y = np.sum(y_i, axis=1) / y_i.shape[1]
z = y ** 2
X.shape, y.shape
# create array along first dim of X
f_dim = np.arange(len(X))
training_indeces = np.random.choice(f_dim, int(.8*f_dim.shape[0]), replace=False)
# include_idx = set(training_indeces) #Set is more efficient, but doesn't reorder your elements if that is desireable
mask = np.array([(i in training_indeces) for i in np.arange(len(X))])
Xtrain = X[mask]
ztrain = z[mask]
Xtest = X[~mask]
ztest = z[~mask]
# %%
factoring.fit(Xtrain, ztrain, batch_size=64, epochs=3, validation_split=.05)
results = factoring.evaluate(Xtest, ztest, batch_size=64)
print("test loss, test acc:", results)

IndexError: Dimension out of range - PyTorch dimension expected to be in range of [-1, 0], but got 1

Despite already numerous answers on this very topic, failing to see in the example below (extract from https://gist.github.com/lirnli/c16ef186c75588e705d9864fb816a13c on Variational Recurrent Networks) which input and output dimensions trigger the error.
Having tried to change dimensions in torch.cat and also suppress the call to squeeze(), the error persists,
<ipython-input-51-cdc928891ad7> in generate(self, hidden, temperature)
56 x_sample = x = x_out.div(temperature).exp().multinomial(1).squeeze()
57 x = self.phi_x(x)
---> 58 tc = torch.cat([x,z], dim=1)
59
60 hidden_next = self.rnn(tc,hidden)
IndexError: Dimension out of range (expected to be in range of [-1, 0], but got 1)
Thus how to shape the dimensions in x and z in tc = torch.cat([x,z], dim=1)?
Note the code as follows,
import torch
from torch import nn, optim
from torch.autograd import Variable
class VRNNCell(nn.Module):
def __init__(self):
super(VRNNCell,self).__init__()
self.phi_x = nn.Sequential(nn.Embedding(128,64), nn.Linear(64,64), nn.ELU())
self.encoder = nn.Linear(128,64*2) # output hyperparameters
self.phi_z = nn.Sequential(nn.Linear(64,64), nn.ELU())
self.decoder = nn.Linear(128,128) # logits
self.prior = nn.Linear(64,64*2) # output hyperparameters
self.rnn = nn.GRUCell(128,64)
def forward(self, x, hidden):
x = self.phi_x(x)
# 1. h => z
z_prior = self.prior(hidden)
# 2. x + h => z
z_infer = self.encoder(torch.cat([x,hidden], dim=1))
# sampling
z = Variable(torch.randn(x.size(0),64))*z_infer[:,64:].exp()+z_infer[:,:64]
z = self.phi_z(z)
# 3. h + z => x
x_out = self.decoder(torch.cat([hidden, z], dim=1))
# 4. x + z => h
hidden_next = self.rnn(torch.cat([x,z], dim=1),hidden)
return x_out, hidden_next, z_prior, z_infer
def calculate_loss(self, x, hidden):
x_out, hidden_next, z_prior, z_infer = self.forward(x, hidden)
# 1. logistic regression loss
loss1 = nn.functional.cross_entropy(x_out, x)
# 2. KL Divergence between Multivariate Gaussian
mu_infer, log_sigma_infer = z_infer[:,:64], z_infer[:,64:]
mu_prior, log_sigma_prior = z_prior[:,:64], z_prior[:,64:]
loss2 = (2*(log_sigma_infer-log_sigma_prior)).exp() \
+ ((mu_infer-mu_prior)/log_sigma_prior.exp())**2 \
- 2*(log_sigma_infer-log_sigma_prior) - 1
loss2 = 0.5*loss2.sum(dim=1).mean()
return loss1, loss2, hidden_next
def generate(self, hidden=None, temperature=None):
if hidden is None:
hidden=Variable(torch.zeros(1,64))
if temperature is None:
temperature = 0.8
# 1. h => z
z_prior = self.prior(hidden)
# sampling
z = Variable(torch.randn(z_prior.size(0),64))*z_prior[:,64:].exp()+z_prior[:,:64]
z = self.phi_z(z)
# 2. h + z => x
x_out = self.decoder(torch.cat([hidden, z], dim=1))
# sampling
x_sample = x = x_out.div(temperature).exp().multinomial(1).squeeze()
x = self.phi_x(x)
# 3. x + z => h
# hidden_next = self.rnn(torch.cat([x,z], dim=1),hidden)
tc = torch.cat([x,z], dim=1)
hidden_next = self.rnn(tc,hidden)
return x_sample, hidden_next
def generate_text(self, hidden=None,temperature=None, n=100):
res = []
hidden = None
for _ in range(n):
x_sample, hidden = self.generate(hidden,temperature)
res.append(chr(x_sample.data[0]))
return "".join(res)
# Test
net = VRNNCell()
x = Variable(torch.LongTensor([12,13,14]))
hidden = Variable(torch.rand(3,64))
output, hidden_next, z_infer, z_prior = net(x, hidden)
loss1, loss2, _ = net.calculate_loss(x, hidden)
loss1, loss2
hidden = Variable(torch.zeros(1,64))
net.generate_text()

The error
IndexError: Dimension out of range (expected to be in range of [-1, 0], but got 1)
means that you're trying to access an index that doesn't exist in the tensor. For instance, the following code would cause the same IndexError you're experiencing.
# sample input tensors
In [210]: x = torch.arange(4)
In [211]: z = torch.arange(6)
# trying to concatenate along the second dimension
# but the tensors have only one dimension (i.e., `0`).
In [212]: torch.cat([x, z], dim=1)
So, one way to overcome this is to promote the tensors to higher dimensions before concatenation, if that is what you need.
# promoting tensors to 2D before concatenation
In [216]: torch.cat([x[None, :], z[None, :]], dim=1)
Out[216]: tensor([[0, 1, 2, 3, 0, 1, 2, 3, 4, 5]])
Thus, in your case, you've to analyze and understand what shape you need for x so that it can be concatenated with z along dimension 1 and then the tc passed as input to self.rnn() along with hidden.
As far as I can see, x[None, :] , z[None, :] should work.
Debugging for successful training
The code you posted has been written for PyTorch v0.4.1. A lot has changed in the PyTorch Python API since then, but the code was not updated.
Below are the changes you need to make the code run and train successfully. Copy the below functions and paste it at appropriate places in your code.
def generate(self, hidden=None, temperature=None):
if hidden is None:
hidden=Variable(torch.zeros(1,64))
if temperature is None:
temperature = 0.8
# 1. h => z
z_prior = self.prior(hidden)
# sampling
z = Variable(torch.randn(z_prior.size(0),64))*z_prior[:,64:].exp()+z_prior[:,:64]
z = self.phi_z(z)
# 2. h + z => x
x_out = self.decoder(torch.cat([hidden, z], dim=1))
# sampling
x_sample = x = x_out.div(temperature).exp().multinomial(1).squeeze()
x = self.phi_x(x)
# 3. x + z => h
x = x[None, ...] # changed here
xz = torch.cat([x,z], dim=1) # changed here
hidden_next = self.rnn(xz,hidden) # changed here
return x_sample, hidden_next
def generate_text(self, hidden=None,temperature=None, n=100):
res = []
hidden = None
for _ in range(n):
x_sample, hidden = self.generate(hidden,temperature)
res.append(chr(x_sample.data)) # changed here
return "".join(res)
for epoch in range(max_epoch):
batch = next(g)
loss_seq = 0
loss1_seq, loss2_seq = 0, 0
optimizer.zero_grad()
for x in batch:
loss1, loss2, hidden = net.calculate_loss(Variable(x),hidden)
loss1_seq += loss1.data # changed here
loss2_seq += loss2.data # changed here
loss_seq = loss_seq + loss1+loss2
loss_seq.backward()
optimizer.step()
hidden.detach_()
if epoch%100==0:
print('>> epoch {}, loss {:12.4f}, decoder loss {:12.4f}, latent loss {:12.4f}'.format(epoch, loss_seq.data, loss1_seq, loss2_seq)) # changed here
print(net.generate_text())
print()
Note: After these changes, the training loop at my end proceeds without any errors on PyTorch v1.7.1. Have a look at the comments with # changed here to understand the changes.

Siamese network assigns same label to every couple of images

I'm working on a siamese network to identify if the input images are the same picture or not.
The problem is that the network keep assigning the label 0 (rather than 0 and 1) resulting in 0.5 accuracy every time but I don't know why.
Here's the model:
def Model():
input_dim = (200, 200, 1)
img_a = Input(shape = input_dim)
img_b = Input(shape = input_dim)
base_net = build_base_network(input_dim)
features_a = base_net(img_a)
features_b = base_net(img_b)
distance = Lambda(euclidean_distance, output_shape = eucl_dist_output_shape)([features_a, features_b])
model = Model(inputs=[img_a, img_b], outputs=distance)
return model
And here's the distance
def contrastive_loss(y_true, y_pred):
margin = 1
return K.mean(y_true * K.square(y_pred) + (1 - y_true) * K.square(K.maximum(margin - y_pred, 0)))
#Optimizer
rms = RMSprop()
#Distance
def euclidean_distance(vects):
x, y = vects
return K.sqrt(K.sum(K.square(x - y), axis=1, keepdims=True))
def eucl_dist_output_shape(shapes):
shape1, shape2 = shapes
return (shape1[0], 1)
Training and X/y shapes
#Pair_equal --> every element is a tuple of numpy arrays representing same images
#Pair_diff --> every element is a tuple of numpy arrays representing different images
#y_equal --> for every Pair equal's element, contains 0
#y_diff --> for every Pair equal's element, contains 1
if len(Pair_equal) > len(Pair_diff):
Pair_equal = Pair_equal[0:len(Pair_diff)]
y_equal = y_equal[0:len(y_diff)]
elif len(Pair_equal) < len(Pair_diff):
Pair_diff = Pair_diff[0:len(Pair_equal)]
y_diff = y_diff[0:len(y_equal)]
Pair_equal = np.array(Pair_equal)
Pair_diff = np.array(Pair_diff)
y_equal = np.array(y_equal)
y_diff = np.array(y_diff)
X = np.concatenate([Pair_equal, Pair_diff], axis=0)
y = np.concatenate([y_equal, y_diff], axis=0)
y = y.reshape(-1, 1)
#index shuffling
indices = np.arange(X.shape[0])
np.random.shuffle(indices)
X = X[indices]
y = y[indices]
#X shape: (32, 2, 200, 200, 1)
#y shape: (32, 1)
return [X[:,0,...], X[:,1,...]], y
Thank you in advance for the help.

Scipy fails to minimize cost function

Currently I'm learning from Andrew Ng course on Coursera called "Machine Learning". In exercise 5, we built a model that can predict digits, trained by the MNIST dataset. This task was completed successfully in Matlab by me, but I wanted to migrate that code to Python, just to see how different things are and maybe continue to play around with the model.
I managed to implement the cost function and the back propagation algorithm correctly. I know that because I compared the metrics with my working model in Matlab and it emits the same numbers.
Now, because in the course we train the model using fmincg, I tried to do the same using Scipy fmin_cg
function.
My problem is, the cost function takes extra small steps and fails to converge.
Here is my code for the network:
import numpy as np
import utils
import scipy.optimize as op
class Network:
def __init__(self, layers):
self.layers = layers
self.weights = self.generate_params()
# Function for generating theta multidimensional matrix
def generate_params(self):
theta = []
epsilon = 0.12
for i in range(len(self.layers) - 1):
current_layer_units = self.layers[i]
next_layer_units = self.layers[i + 1]
theta_i = np.multiply(
np.random.rand(next_layer_units, current_layer_units + 1),
2 * epsilon - epsilon
)
# Appending the params to the theta matrix
theta.append(theta_i)
return theta
# Function to append bias row/column to matrix X
def append_bias(self, X, d):
m = X.shape[0]
n = 1 if len(X.shape) == 1 else X.shape[1]
if (d == 'column'):
ones = np.ones((m, n + 1))
ones[:, 1:] = X.reshape((m, n))
elif (d == 'row'):
ones = np.ones((m + 1, n))
ones[1:, :] = X.reshape((m, n))
return ones
# Function for computing the gradient for 1 training example
def back_prop(self, y, feed, theta):
activations = feed["activations"]
weighted_layers = feed["weighted_layers"]
delta_output = activations[-1] - y.reshape(len(y), 1)
current_delta = delta_output
# Initializing gradients
gradients = []
for i, theta_i in enumerate(theta):
gradients.append(np.zeros(theta_i.shape))
# Peforming delta calculations.
# Here, we continue to propagate the delta values backwards
# until we arrive to the second layer.
for i in reversed(range(len(theta))):
theta_i = theta[i]
if (i > 0):
i_weighted_inputs = self.append_bias(weighted_layers[i - 1], 'row')
t_theta_i = np.transpose(theta_i)
delta_i = np.multiply(np.dot(t_theta_i, current_delta), utils.sigmoidGradient(i_weighted_inputs))
delta_i = delta_i[1:]
gradients[i] = current_delta * np.transpose(activations[i])
# Setting current delta for the next layer
current_delta = delta_i
else:
gradients[i] = current_delta * np.transpose(activations[i])
return gradients
# Function for computing the cost and the derivatives
def compute_cost(self, theta, X, y, r12n = 0):
m = len(X)
num_labels = self.layers[-1]
costs = np.zeros(m)
# Initializing gradients
gradients = []
for i, theta_i in enumerate(theta):
gradients.append(np.zeros(theta_i.shape))
# Iterating over the training set
for i in range(m):
inputs = X[i]
observed = utils.create_output_vector(y[i], num_labels)
feed = self.feed_forward(inputs)
predicted = feed["activations"][-1]
total_cost = 0
for k, o in enumerate(observed):
if (o == 1):
total_cost += np.log(predicted[k])
else:
total_cost += np.log(1 - predicted[k])
cost = -1 * total_cost
# Storing the cost for the i-th training example
costs[i] = cost
# Calculating the gradient for this training example
# using back propagation algorithm
gradients_i = self.back_prop(observed, feed, theta)
for i, gradient in enumerate(gradients_i):
gradients[i] += gradient
# Calculating the avg regularization term for the cost
sum_of_theta = 0
for i, theta_i in enumerate(theta):
squared_theta = np.power(theta_i[:, 1:], 2)
sum_of_theta += np.sum(squared_theta)
r12n_avg = r12n * sum_of_theta / (2 * m)
total_cost = np.sum(costs) / m + r12n_avg
# Applying regularization terms to the gradients
for i, theta_i in enumerate(theta):
lambda_i = np.copy(theta_i)
lambda_i[:, 0] = 0
lambda_i = np.multiply((r12n / m), lambda_i)
# Adding the r12n matrix to the gradient
gradients[i] = gradients[i] / m + lambda_i
return total_cost, gradients
# Function for training the neural network using conjugate gradient algorithm
def train_cg(self, X, y, r12n = 0, iterations = 50):
weights = self.weights
def Cost(theta, X, y):
theta = utils.roll_theta(theta, self.layers)
cost, _ = self.compute_cost(theta, X, y, r12n)
print(cost);
return cost
def Gradient(theta, X, y):
theta = utils.roll_theta(theta, self.layers)
_, gradient = self.compute_cost(theta, X, y, r12n)
return utils.unroll_theta(gradient)
unrolled_theta = utils.unroll_theta(weights)
result = op.fmin_cg(f = Cost,
x0 = unrolled_theta,
args=(X, y),
fprime=Gradient,
maxiter = iterations)
self.weights = utils.roll_theta(result, self.layers)
# Function for feeding forward the network
def feed_forward(self, X):
# Useful variables
activations = []
weighted_layers = []
weights = self.weights
currentActivations = self.append_bias(X, 'row')
activations.append(currentActivations)
for i in range(len(self.layers) - 1):
layer_weights = weights[i]
weighted_inputs = np.dot(layer_weights, currentActivations)
# Storing the weighted inputs
weighted_layers.append(weighted_inputs)
activation_nodes = []
# If the next layer is not the output layer, we'd like to add a bias unit to it
# (Excluding the input and the output layer)
if (i < len(self.layers) - 2):
activation_nodes = self.append_bias(utils.sigmoid(weighted_inputs), 'row')
else:
activation_nodes = utils.sigmoid(weighted_inputs)
# Appending the layer of nodes to the activations array
activations.append(activation_nodes)
currentActivations = activation_nodes
data = {
"activations": activations,
"weighted_layers": weighted_layers
}
return data
def predict(self, X):
data = self.feed_forward(X)
output = data["activations"][-1]
# Finding the max index in the output layer
return np.argmax(output, axis=0)
Here is the invocation of the code:
import numpy as np
from network import Network
# %% Load data
X = np.genfromtxt('data/mnist_data.csv', delimiter=',')
y = np.genfromtxt('data/mnist_outputs.csv', delimiter=',').astype(int)
# %% Create network
num_labels = 10
input_layer = 400
hidden_layer = 25
output_layer = num_labels
layers = [input_layer, hidden_layer, output_layer]
# Create a new neural network
network = Network(layers)
# %% Train the network and save the weights
network.train_cg(X, y, r12n = 1, iterations = 20)
This is what the code emits after each iteration:
15.441233231650283
15.441116436313076
15.441192262452514
15.44122384651483
15.441231216030646
15.441232804294314
15.441233141284435
15.44123321255294
15.441233227614855
As you can see, the changes to the cost are very small.
I checked for the shapes of the vectors and gradient and they both seem fine, just like in my Matlab implementation. I'm not sure what I do wrong here.
If you guys could help me, that'd be great :)

Correct backpropagation in simple perceptron

Given the simple OR gate problem:
or_input = np.array([[0,0], [0,1], [1,0], [1,1]])
or_output = np.array([[0,1,1,1]]).T
If we train a simple single-layered perceptron (without backpropagation), we could do something like this:
import numpy as np
np.random.seed(0)
def sigmoid(x): # Returns values that sums to one.
return 1 / (1 + np.exp(-x))
def cost(predicted, truth):
return (truth - predicted)**2
or_input = np.array([[0,0], [0,1], [1,0], [1,1]])
or_output = np.array([[0,1,1,1]]).T
# Define the shape of the weight vector.
num_data, input_dim = or_input.shape
# Define the shape of the output vector.
output_dim = len(or_output.T)
num_epochs = 50 # No. of times to iterate.
learning_rate = 0.03 # How large a step to take per iteration.
# Lets standardize and call our inputs X and outputs Y
X = or_input
Y = or_output
W = np.random.random((input_dim, output_dim))
for _ in range(num_epochs):
layer0 = X
# Forward propagation.
# Inside the perceptron, Step 2.
layer1 = sigmoid(np.dot(X, W))
# How much did we miss in the predictions?
cost_error = cost(layer1, Y)
# update weights
W += - learning_rate * np.dot(layer0.T, cost_error)
# Expected output.
print(Y.tolist())
# On the training data
print([[int(prediction > 0.5)] for prediction in layer1])
[out]:
[[0], [1], [1], [1]]
[[0], [1], [1], [1]]
With backpropagation, to compute the d(cost)/d(X), are the follow steps correct?
compute the layer1 error by multiplying the cost error and the derivatives of the cost
then compute the layer1 delta by multiplying the layer 1 error and the derivatives of the sigmoid
then do a dot product between the inputs and the layer1 delta to get the differential of the i.e. d(cost)/d(X)
Then the d(cost)/d(X) is multiplied with the negative of the learning rate to perform gradient descent.
num_epochs = 0 # No. of times to iterate.
learning_rate = 0.03 # How large a step to take per iteration.
# Lets standardize and call our inputs X and outputs Y
X = or_input
Y = or_output
W = np.random.random((input_dim, output_dim))
for _ in range(num_epochs):
layer0 = X
# Forward propagation.
# Inside the perceptron, Step 2.
layer1 = sigmoid(np.dot(X, W))
# How much did we miss in the predictions?
cost_error = cost(layer1, Y)
# Back propagation.
# multiply how much we missed from the gradient/slope of the cost for our prediction.
layer1_error = cost_error * cost_derivative(cost_error)
# multiply how much we missed by the gradient/slope of the sigmoid at the values in layer1
layer1_delta = layer1_error * sigmoid_derivative(layer1)
# update weights
W += - learning_rate * np.dot(layer0.T, layer1_delta)
In that case, should the implementation look like this below with the cost_derivative and sigmoid_derivative?
import numpy as np
np.random.seed(0)
def sigmoid(x): # Returns values that sums to one.
return 1 / (1 + np.exp(-x))
def sigmoid_derivative(sx):
# See https://math.stackexchange.com/a/1225116
return sx * (1 - sx)
def cost(predicted, truth):
return (truth - predicted)**2
def cost_derivative(y):
# If the cost is:
# cost = y - y_hat
# What's the derivative of d(cost)/d(y)
# d(cost)/d(y) = 1
return 2*y
or_input = np.array([[0,0], [0,1], [1,0], [1,1]])
or_output = np.array([[0,1,1,1]]).T
# Define the shape of the weight vector.
num_data, input_dim = or_input.shape
# Define the shape of the output vector.
output_dim = len(or_output.T)
num_epochs = 5 # No. of times to iterate.
learning_rate = 0.03 # How large a step to take per iteration.
# Lets standardize and call our inputs X and outputs Y
X = or_input
Y = or_output
W = np.random.random((input_dim, output_dim))
for _ in range(num_epochs):
layer0 = X
# Forward propagation.
# Inside the perceptron, Step 2.
layer1 = sigmoid(np.dot(X, W))
# How much did we miss in the predictions?
cost_error = cost(layer1, Y)
# Back propagation.
# multiply how much we missed from the gradient/slope of the cost for our prediction.
layer1_error = cost_error * cost_derivative(cost_error)
# multiply how much we missed by the gradient/slope of the sigmoid at the values in layer1
layer1_delta = layer1_error * sigmoid_derivative(layer1)
# update weights
W += - learning_rate * np.dot(layer0.T, layer1_delta)
# Expected output.
print(Y.tolist())
# On the training data
print([[int(prediction > 0.5)] for prediction in layer1])
[out]:
[[0], [1], [1], [1]]
[[0], [1], [1], [1]]
BTW, given the random input seeds, even without the W and gradient descent or perceptron, the prediction can be still right:
import numpy as np
np.random.seed(0)
# Lets standardize and call our inputs X and outputs Y
X = or_input
Y = or_output
W = np.random.random((input_dim, output_dim))
# On the training data
predictions = sigmoid(np.dot(X, W))
[[int(prediction > 0.5)] for prediction in predictions]

You are almost correct. In your implementation, you define the cost as the square of the error, which as the unfortunate consequence of being always positive. As a result, if you plot the mean(cost_error), it is raising slowly at each iteration, and your weights are slowly decreasing.
In your particular case, you can have any weights >0 to make it work : if you try your implementation with enough epochs, your weights will turn negative and your network won't work anymore.
You can just remove the square in your cost function :
def cost(predicted, truth):
return (truth - predicted)
Now to update your weights, you need to evaluate the gradient at the "position" of your error. So what your need is :
d_predicted = output_errors * sigmoid_derivative(predicted_output)
Next, we update the weights :
W += np.dot(X.T, d_predicted) * learning_rate
Full code with error display :
import numpy as np
import matplotlib.pyplot as plt
np.random.seed(0)
def sigmoid(x): # Returns values that sums to one.
return 1 / (1 + np.exp(-x))
def sigmoid_derivative(sx):
# See https://math.stackexchange.com/a/1225116
return sx * (1 - sx)
def cost(predicted, truth):
return (truth - predicted)
or_input = np.array([[0,0], [0,1], [1,0], [1,1]])
or_output = np.array([[0,1,1,1]]).T
# Define the shape of the weight vector.
num_data, input_dim = or_input.shape
# Define the shape of the output vector.
output_dim = len(or_output.T)
num_epochs = 50 # No. of times to iterate.
learning_rate = 0.1 # How large a step to take per iteration.
# Lets standardize and call our inputs X and outputs Y
X = or_input
Y = or_output
W = np.random.random((input_dim, output_dim))
# W = [[-1],[1]] # you can try to set bad weights to see the training process
error_list = []
for _ in range(num_epochs):
layer0 = X
# Forward propagation.
layer1 = sigmoid(np.dot(X, W))
# How much did we miss in the predictions?
cost_error = cost(layer1, Y)
error_list.append(np.mean(cost_error)) # save the loss to plot later
# Back propagation.
# eval the gradient :
d_predicted = cost_error * sigmoid_derivative(cost_error)
# update weights
W = W + np.dot(X.T, d_predicted) * learning_rate
# Expected output.
print(Y.tolist())
# On the training data
print([[int(prediction > 0.5)] for prediction in layer1])
# plot error curve :
plt.plot(range(num_epochs), loss_list, '+b')
plt.xlabel('Epoch')
plt.ylabel('mean error')
I also added a line to set the initial weights manually, to see how the network is learning

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

Visualizing the attention map of a multihead attention in ViT - python

Related

Keras tensor operation dimensionality problem when implementing inductive bias

IndexError: Dimension out of range - PyTorch dimension expected to be in range of [-1, 0], but got 1

Siamese network assigns same label to every couple of images

Scipy fails to minimize cost function

Correct backpropagation in simple perceptron

Categories

Resources