Batch-wise beam search in pytorch - python

I'm trying to implement a beam search decoding strategy in a text generation model. This is the function that I am using to decode the output probabilities.
def beam_search_decoder(data, k):
    """Return the ``k`` best index sequences found by beam search.

    Args:
        data: Iterable of 1-D probability tensors, one per decoding step
            (each step is passed to ``torch.log``).
        k: Beam width, i.e. how many candidates survive each step.

    Returns:
        A list of at most ``k`` ``[sequence, score]`` pairs ordered from best
        to worst. ``sequence`` is a list of indices and ``score`` is the
        accumulated negative log-probability (lower is better).
    """
    sequences = [[[], 0.0]]
    # walk over each step in sequence
    for row in data:
        # One log call per step instead of one per (beam, index) pair.
        neg_log_row = -torch.log(row)
        all_candidates = []
        # expand every surviving sequence with every possible next index
        for seq, score in sequences:
            for j in range(len(row)):
                all_candidates.append([seq + [j], score + neg_log_row[j]])
        # sort candidates by score and keep only the k best
        ordered = sorted(all_candidates, key=lambda tup: tup[1])
        sequences = ordered[:k]
    return sequences
As you can see, this function is implemented with a batch size of 1 in mind. Adding another loop over the batch would make the algorithm O(n^4), and it is already slow as it is. Is there any way to improve the speed of this function? My model output usually has size (32, 150, 9907), which follows the format (batch_size, max_len, vocab_size).

Below is my implementation, which may be a little bit faster than the for loop implementation.
import torch
def beam_search_decoder(post, k):
    """Beam Search Decoder

    Parameters:
        post(Tensor) - the posterior of network.
        k(int) - beam size of decoder.

    Outputs:
        indices(Tensor) - a beam of index sequence.
        log_prob(Tensor) - a beam of log likelihood of sequence.

    Shape:
        post: (batch_size, seq_length, vocab_size).
        indices: (batch_size, beam_size, seq_length).
        log_prob: (batch_size, beam_size).

    Examples:
        >>> post = torch.softmax(torch.randn([32, 20, 1000]), -1)
        >>> indices, log_prob = beam_search_decoder(post, 3)
    """
    batch_size, seq_length, vocab_size = post.shape
    log_post = post.log()
    log_prob, indices = log_post[:, 0, :].topk(k, sorted=True)
    indices = indices.unsqueeze(-1)
    for i in range(1, seq_length):
        # (batch, k, 1) + (batch, 1, vocab) broadcasts to (batch, k, vocab),
        # so no explicit repeat of the step posterior is needed.
        scores = log_prob.unsqueeze(-1) + log_post[:, i, :].unsqueeze(1)
        log_prob, flat_index = scores.reshape(batch_size, -1).topk(k, sorted=True)
        # The top-k runs over the flattened (beam, token) axis: split each
        # flat position back into the beam it extends and the token it adds.
        beam_index = torch.div(flat_index, vocab_size, rounding_mode="floor")
        token_index = flat_index % vocab_size
        # Re-order the stored histories so each survivor keeps the prefix of
        # the beam it was expanded from.
        history = indices.gather(
            1, beam_index.unsqueeze(-1).expand(-1, -1, indices.size(-1))
        )
        indices = torch.cat([history, token_index.unsqueeze(-1)], dim=-1)
    return indices, log_prob

You can use this library
It implements Beam Search, Greedy Search and sampling for PyTorch sequence models.
The following snippet implements a Transformer seq2seq model and uses it to generate predictions.
#pip install pytorch-beam-search
from pytorch_beam_search import seq2seq

# Create vocabularies
# Tokenize the way you need
source = [list("abcdefghijkl"), list("mnopqrstwxyz")]
target = [list("ABCDEFGHIJKL"), list("MNOPQRSTWXYZ")]
# An Index object represents a mapping from the vocabulary
# to integers (indices) to feed into the models
source_index = seq2seq.Index(source)
target_index = seq2seq.Index(target)

# Create tensors
X = source_index.text2tensor(source)
Y = target_index.text2tensor(target)
# X.shape == (n_source_examples, len_source_examples) == (2, 11)
# Y.shape == (n_target_examples, len_target_examples) == (2, 12)

# Create and train the model
model = seq2seq.Transformer(source_index, target_index)    # just a PyTorch model
model.fit(X, Y, epochs = 100)    # basic method included

# Generate new predictions
new_source = [list("new first in"), list("new second in")]
new_target = [list("new first out"), list("new second out")]
X_new = source_index.text2tensor(new_source)
Y_new = target_index.text2tensor(new_target)
loss, error_rate = model.evaluate(X_new, Y_new)    # basic method included
predictions, log_probabilities = seq2seq.beam_search(model, X_new)
# Map each predicted index tensor back to text
output = [target_index.tensor2text(p) for p in predictions]


ValueError: `decode_predictions` expects a batch of predictions (i.e. a 2D array of shape (samples, 1000)). Found array with shape: (1, 26)

I am using a model trained by myself to translate braille digits into plain text. As you can see this is a classification problem with 26 classes, one for each letter in the alphabet.
This is the dataset that I used to train my model:
This is how I am generating my training and validation set:
# Create one destination folder per letter: ./images/a ... ./images/z
alpha = 'a'
for i in range(0, 26):
    # makedirs with exist_ok also creates ./images itself and lets the
    # script be re-run without failing on folders that already exist.
    os.makedirs('./images/' + alpha, exist_ok=True)
    alpha = chr(ord(alpha) + 1)

rootdir = "C:\\Users\\ffernandez\\Downloads\\capstoneProject\\Braille Dataset\\Braille Dataset\\"
# The first character of each file name selects the destination folder
# (presumably the letter the image depicts - verify against the dataset).
for file in os.listdir(rootdir):
    letter = file[0]
    copyfile(rootdir+file, './images/' + letter + '/' + file)
The resulting folder looks like this:
folder structure
And this is how I create the train and validation split:
datagen = ImageDataGenerator(rotation_range=20,
train_generator = datagen.flow_from_directory('./images/',
val_generator = datagen.flow_from_directory('./images/',
Finally this is the code corresponding to the design, compilation and training of the model:
# Training callbacks: checkpoint to 'BrailleNet.h5' (save_best_only), learning-rate
# reduction on plateau (patience 8) and early stopping (patience 15).
model_ckpt = ModelCheckpoint('BrailleNet.h5',save_best_only=True)
reduce_lr = ReduceLROnPlateau(patience=8,verbose=0)
early_stop = EarlyStopping(patience=15,verbose=1)
# Network input: 28x28 images with 3 channels.
entry = L.Input(shape=(28,28,3))
# Feature extractor: three separable convolutions (64, 128, 256 filters);
# the first two are each followed by 2x2 max pooling.
x = L.SeparableConv2D(64,(3,3),activation='relu')(entry)
x = L.MaxPooling2D((2,2))(x)
x = L.SeparableConv2D(128,(3,3),activation='relu')(x)
x = L.MaxPooling2D((2,2))(x)
x = L.SeparableConv2D(256,(2,2),activation='relu')(x)
# Collapse the spatial dimensions into one value per filter.
x = L.GlobalMaxPooling2D()(x)
# Classifier head: two dense layers with LeakyReLU (the second one L2-regularised).
x = L.Dense(256)(x)
x = L.LeakyReLU()(x)
x = L.Dense(64,kernel_regularizer=l2(2e-4))(x)
x = L.LeakyReLU()(x)
# 26-way softmax output.
x = L.Dense(26,activation='softmax')(x)
model = Model(entry,x)
history = model.fit_generator(train_generator,validation_data=val_generator,epochs=666,
Then this is the code for testing an image of the letter 'a' in braille, which has the same size as the images in the training and validation sets (28x28):
# Load one test image and turn it into a batch of size 1.
img_path = "./test/a1.JPG10whs.jpg"
img = plt.imread(img_path)
img_array = tf.keras.utils.img_to_array(img)
img_batch = np.expand_dims(img_array, axis=0)  # add the leading batch axis
# NOTE(review): ResNet50 preprocessing is applied here; confirm that the
# training pipeline used the same preprocessing.
img_preprocessed = tf.keras.applications.resnet50.preprocess_input(img_batch)
prediction = model.predict(img_preprocessed)
# NOTE: decode_predictions expects predictions of shape (samples, 1000); this
# model produces shape (1, 26), which is what raises the ValueError quoted below.
print(tf.keras.applications.imagenet_utils.decode_predictions(prediction, top=3)[0])
Just when I execute that last line of code this error appears:
ValueError: decode_predictions expects a batch of predictions (i.e. a 2D array of shape (samples, 1000)). Found array with shape: (1, 26)
A similar question I found here on stackoverflow (ValueError: `decode_predictions` expects a batch of predictions (i.e. a 2D array of shape (samples, 1000)). Found array with shape: (1, 7)).
I've seen that using "decode_predictions" only makes sense if your model outputs the ImageNet classes (1000-dimensional) but if I can't use "decode_predictions" I don't know how to get my predictions.
My desired output would be like:
prediction = model.predict(img_preprocessed)
output: 'a'
Any hint or suggestion on how to solve this issue is highly appreciated.
If we take a look at what the prediction object actually is, we can see that it has 26 values. These values are the probabilities that the model predicts for each letter:
So we need a way to map the prediction value to the respective letter.
A simple way to do this could be to create a list of all the 26 possible letters and look up the position of the max value in the prediction array. Example:
#Create prediction labels from a-z
labels = [chr(ord('a') + i) for i in range(26)]
#Search the max value in prediction
# prediction has shape (1, 26): take the single sample and look up the label
# at the position of its highest probability.
# NOTE(review): assumes the model's class indices follow alphabetical order
# a..z - confirm against train_generator.class_indices.
predicted_letter = labels[np.argmax(prediction[0])]
print(predicted_letter)
The output should be the character with the highest probability:

How to write a custom loss function in Keras/Tensorflow that uses loops/iterations with reference numpy code

I saw this question: "Implementing custom loss function in keras with condition". I need to do the same thing, but with code that seems to need loops.
I have a custom numpy function which calculates the mean Euclid distance from the mean vector. I wrote this based on the paper
import numpy as np
def mean_euclid_distance_from_mean_vector(n_vectors):
    """Return the mean squared distance of each vector from the mean of the others.

    For every row ``v`` of ``n_vectors`` the mean of all *other* rows is
    computed and subtracted from ``v``. The squared lengths of these distance
    vectors are summed and divided by the total number of elements.

    Args:
        n_vectors: 2-D NumPy array with one vector per row.

    Returns:
        The resulting scalar. With fewer than two rows there is no "rest"
        to average over, so the result is NaN.
    """
    dists = []
    for (i, v) in enumerate(n_vectors):
        # every vector except the i-th one
        n_vectors_rest = n_vectors[np.arange(len(n_vectors)) != i]
        print("rest of vectors: ")
        # calculate mean vector
        mean_rest = n_vectors_rest.mean(axis=0)
        print("mean rest vector")
        dist = v - mean_rest
        print("dist vector")
        # collect the distance vector; without this the matrix below is empty
        dists.append(dist)
    # dists is now a matrix of distance vectors (distance from the mean vector)
    dists = np.array(dists)
    print("distance vector matrix")
    # here we matmult each vector
    # sum them up
    # and divide by the total number of elements
    result = np.sum([np.matmul(d, d) for d in dists]) / dists.size
    return result
features = np.array([
c = mean_euclid_distance_from_mean_vector(features)
I need this function however to work inside tensorflow with Keras. So a custom lambda
However, I'm not sure how to implement the above in Keras/Tensorflow since it has loops, and the way the paper talked about calculating the m_i seems to require loops like the way I implemented the above.
For reference, the PyTorch version of this code is here:
Given a feature map like:
features = np.array([
[1, 2, 3, 4],
[2, 4, 4, 3],
[3, 2, 1, 4],
], dtype=np.float64)
reflecting a batch_size of
batch_size = features.shape[0]
k = features.shape[1]
One has that implementing the above Formulas in Tensorflow could be expressed (prototyped) by:
dim = (batch_size, features.shape[1])
def zero(i):
arr = np.ones(dim)
arr[i] = 0
return arr
mapper = [zero(i) for i in range(batch_size)]
elems = (features, mapper)
m = (1 / (batch_size - 1)) * tf.map_fn(lambda x: tf.math.reduce_sum(x[0] * x[1], axis=0), elems, dtype=tf.float64)
pairs = tf.map_fn(lambda x: tf.concat(x, axis=0) , tf.stack([features, m], 1), dtype=tf.float64)
compactness_loss = (1 / (batch_size * k)) * tf.map_fn(lambda x: tf.math.reduce_euclidean_norm(x), pairs, dtype=tf.float64)
with tf.Session() as sess:
print("loss value output is: ", compactness_loss.eval())
Which yields:
loss value output is: [0.64549722 0.79056942 0.64549722]
However a single measure is required for the batch, therefore it is necessary to reduce it; by the summation of all values.
The wanted Compactness Loss function à la Tensorflow is:
def compactness_loss(actual, features):
    """Compactness loss: mean squared distance of each sample from the mean of the others.

    Args:
        actual: Unused; present only so the function has the
            ``(y_true, y_pred)`` shape of a Keras loss.
        features: Feature maps of the batch; flattened to one row per sample.

    Returns:
        A scalar tensor: the summed squared distances divided by
        ``batch_size * k``.

    Relies on the module-level ``batch_size``.
    """
    features = Flatten()(features)
    # NOTE(review): k is hard-coded to 7 * 7 * 512 and must match the
    # flattened feature size - confirm against the model.
    k = 7 * 7 * 512
    # m[i] is the mean of every row except row i. Summing all rows once and
    # subtracting row i gives the same value as masking out row i and
    # summing, without building batch_size masks of shape (batch_size, k).
    column_sum = tf.math.reduce_sum(features, axis=0, keepdims=True)
    m = (1 / (batch_size - 1)) * (column_sum - features)
    dists = features - m
    sqrd_dists = tf.pow(dists, 2)
    red_dists = tf.math.reduce_sum(sqrd_dists, axis=1)
    compact_loss = (1 / (batch_size * k)) * tf.math.reduce_sum(red_dists)
    return compact_loss
Of course the Flatten() could be moved back into the model for convenience and the k could be derived directly from the feature map; this answers your question. You may just have some trouble finding out what the expected values for the model are - feature maps from VGG16 (or any other architecture) trained against ImageNet, for instance?
The paper says:
In our formulation (shown in Figure 2 (e)), starting froma pre-trained deep model, we freeze initial features (gs) and learn (gl) and (hc). Based on the output of the classification sub-network (hc), two losses compactness loss and descriptiveness loss are evaluated. These two losses, introduced in the subsequent sections, are used to assess the quality of the learned deep feature. We use the provided one-class dataset to calculate the compactness loss. An external multi-class reference dataset is used to evaluate the descriptiveness loss.As shown in Figure 3, weights of gl and hc are learned in the proposed method through back-propagation from the composite loss. Once training is converged, system shown in setup in Figure 2(d) is used to perform classification where the resulting model is used as the pre-trained model.
then looking at the "Framework" backbone here plus:
AlexNet Binary and VGG16 Binary (Baseline). A binary CNN is trained by having ImageNet samples and one-class image samples as the two classes using AlexNet andVGG16 architectures, respectively. Testing is performed using k-nearest neighbor, One-class SVM [43], Isolation Forest [3]and Gaussian Mixture Model [3] classifiers.
This makes me wonder whether it would not be reasonable to add the suggested dense layers to both the Secondary and the Reference Networks, ending in a single-class output (Sigmoid) or even a binary-class output (using Softmax), and to use mean_squared_error as the so-called Compactness Loss and binary_cross_entropy as the Descriptiveness Loss.

Solved: How to combine tf.gradients with tf.data.Dataset and Keras models

I'm trying to build a workflow that uses batches and an iterator. For performance reasons, I am really trying to avoid using the placeholder->feed_dict loop workflow.
The process I'm trying to implement involves grad-cam (which requires the gradient of the loss with respect to the final convolutional layer of a CNN) as an intermediate step, and ideally I'd like to be able to try it out on several Keras pre-trained models, including non-sequential ones like ResNet.
Most implementations of grad-cam that I've found rely on hand-crafting the CNN of interest in tensorflow. I found one implementation that is made for Keras models, and following that example, I get
def safe_norm(x):
    """Divide ``x`` by its root-mean-square value.

    The 1e-8 added under the square root keeps the division finite when
    ``x`` is all zeros.
    """
    return x / tf.sqrt(tf.reduce_mean(x ** 2) + 1e-8)
vgg_ = VGG19()
dataset =
it = dataset.make_one_shot_iterator()
files, batch = it.get_next()
conv5_4 = vgg_.layers[-6]
h_k, w_k, c_k = conv5_4.output.shape[1:]
vgg_model = Model(inputs=vgg_.input, outputs=vgg_.output)
conv_model = Model(inputs=vgg_.input, outputs=conv5_4.output)
probs = vgg_model(batch)
predicted_class = tf.argmax(probs, axis=-1)
layer_name = 'block5_conv4'
target_layer = lambda x: target_category_loss(x, predicted_class, n_categories)
x = Lambda(target_layer)(vgg_model.outputs[0])
model = Model(inputs=vgg_model.inputs[0], outputs=x)
loss = K.sum(model.output, axis=-1)
conv_output = [l for l in model.layers if is layer_name][0].output
grads = Lambda(safe_norm)(K.gradients(loss, [conv_output])[0])
gradient_function = K.function([model.input], [conv_output, grads])
output, grads_val = gradient_function([batch])
weights = tf.reduce_mean(grads_val, axis = (1, 2))
cam = tf.ones([batch_size, h_k, w_k], dtype = tf.float32)
cam += tf.reduce_sum(output * tf.reshape(weights, [-1, 1, 1, weights.shape[-1]]), axis=-1)
cam = tf.squeeze(tf.image.resize_images(images=tf.expand_dims(cam, axis=-1), size=(224, 224)))
cam = tf.maximum(cam, 0)
heatmap = cam / tf.reshape(tf.reduce_max(cam, axis=[1, 2]), shape=[-1, 1, 1])
The problem is that gradient_function([batch]) returns a numpy array whose value is determined by the first batch, so that heatmap doesn't change with subsequent evaluations.
I've tried replacing K.function with a Model in various ways, but nothing seems to work. I usually end up either with an error suggesting that grads evaluates to None or that one model or another is expecting a feed_dict and not receiving one.
Is this code salvageable? Is there a better way to do this besides looping through the data several times (once to get all the grad-cams and then again once I have them) or using placeholders and feed_dicts?
def safe_norm(x):
    """Divide ``x`` by its root-mean-square value.

    The 1e-8 added under the square root keeps the division finite when
    ``x`` is all zeros.
    """
    return x / tf.sqrt(tf.reduce_mean(x ** 2) + 1e-8)
vgg_ = VGG19()
dataset =
it = dataset.make_one_shot_iterator()
files, batch = it.get_next()
conv5_4 = vgg_.layers[-6]
h_k, w_k, c_k = conv5_4.output.shape[1:]
vgg_model = Model(inputs=vgg_.input, outputs=vgg_.output)
conv_model = Model(inputs=vgg_.input, outputs=conv5_4.output)
probs = vgg_model(batch)
predicted_class = tf.argmax(probs, axis=-1)
layer_name = 'block5_conv4'
target_layer = lambda x: target_category_loss(x, predicted_class, n_categories)
x = Lambda(target_layer)(vgg_model.outputs[0])
model = Model(inputs=vgg_model.inputs[0], outputs=x)
loss = K.sum(model.output, axis=-1)
conv_output = [l for l in model.layers if is layer_name][0].output
grads = Lambda(safe_norm)(K.gradients(loss, [conv_output])[0])
gradient_function = K.function([model.input], [conv_output, grads])
output, grads_val = gradient_function([batch])
weights = tf.reduce_mean(grads_val, axis = (1, 2))
cam = tf.ones([batch_size, h_k, w_k], dtype = tf.float32)
cam += tf.reduce_sum(output * tf.reshape(weights, [-1, 1, 1, weights.shape[-1]]), axis=-1)
cam = tf.squeeze(tf.image.resize_images(images=tf.expand_dims(cam, axis=-1), size=(224, 224)))
cam = tf.maximum(cam, 0)
heatmap = cam / tf.reshape(tf.reduce_max(cam, axis=[1, 2]), shape=[-1, 1, 1])
# other operations on heatmap and batch ...
# ...
output_function = K.function(model.input, [node1, ..., nodeN])
for batch in range(n_batches):
outputs1, ... , outputsN = output_function(batch)
Gives me the desired outputs for each batch.
Yes, K.function returns numpy arrays because it evaluates the symbolic computation in your graph. What I think you should do is to keep everything symbolic up to K.function, and after getting the gradients, perform all computations of the Grad-CAM weights and final saliency map using numpy.
Then you can iterate on your dataset, evaluate gradient_function on a new batch of data, and compute the saliency map.
If you want to keep everything symbolic, then you should not use K.function to produce the gradient function, but use the symbolic gradient (the output of K.gradient, without lambda) and convolutional feature maps (conv_output) and perform the saliency map computation on top of that, and then build a function (using K.function) that takes the model input, and outputs the saliency map.
Hope the explanation is enough.

Predicting image using triplet loss

I'm new to NN.
I built a NN for image understanding using a triplet loss method.
And I think that I'm missing some basic knowledge about how to use this method for predicting an image tag.
After I have my model built, how should I predict a sample image?
Because my model input is a triplet - what the triplet should be constructed from?
As for the theory, I think that I should somehow get the embedding for the test image and then use KNN with k=1 to find the nearest embedding. But I am clueless about how to do that in practice.
My code is running and generating the model:
import numpy as np
import random
import os
import imageio
import matplotlib.pyplot as plt
import pandas as pd
from time import time
import tensorflow as tf
from PIL import Image
from keras.models import Model
from keras.layers import Input, Lambda, concatenate
from keras.optimizers import Adam
from keras import backend as K
from keras.layers import Conv2D, PReLU, Flatten, Dense
ALPHA = 0.2 # Triplet Loss Parameter
def get_triplets(features):
    """Build one (anchor, positive, negative) triplet per sample.

    Args:
        features: 2-D array-like with one row per sample. The first column
            holds the image array and the last column holds its tag.

    Returns:
        ``np.ndarray`` stacking one ``[anchor, positive, negative]`` entry
        per sample, each image reshaped to carry a leading axis of size 1.

    Raises:
        IndexError: If a sample has no other sample with the same tag, or
            no sample with a different tag, to draw from.
    """
    df_features = pd.DataFrame(features)
    images = df_features.iloc[:, 0]
    tags = df_features.iloc[:, -1]
    all_indexes = set(df_features.index)

    def as_batch(image):
        # Prepend a batch axis: (H, W, C) -> (1, H, W, C).
        return image.reshape(-1, *image.shape)

    triplets = []
    for index, row in df_features.iterrows():
        same_tag = df_features.loc[tags == row.iloc[-1]]
        # Positive candidates share the anchor's tag; negatives are all others.
        same_tag_indexes = list(set(same_tag.index) - {index})
        diff_tag_indexes = list(all_indexes - set(same_tag_indexes) - {index})
        anchor = as_batch(row.iloc[0])
        pos = as_batch(images.iloc[random.choice(same_tag_indexes)])
        neg = as_batch(images.iloc[random.choice(diff_tag_indexes)])
        triplets.append([anchor, pos, neg])
    return np.array(triplets)
def triplet_loss(x):
    """Triplet margin loss over concatenated embeddings.

    Args:
        x: Tensor whose axis 1 holds the anchor, positive and negative
            embeddings back to back; it is split into three equal parts.

    Returns:
        The batch mean of ``max(pos_dist - neg_dist + ALPHA, 0)``, where the
        distances are squared Euclidean distances to the anchor.
    """
    anchor, positive, negative = tf.split(x, 3, axis=1)
    pos_dist = tf.reduce_sum(tf.square(tf.subtract(anchor, positive)), 1)
    neg_dist = tf.reduce_sum(tf.square(tf.subtract(anchor, negative)), 1)
    # Positive must be closer to the anchor than the negative by at least ALPHA.
    basic_loss = tf.add(tf.subtract(pos_dist, neg_dist), ALPHA)
    loss = tf.reduce_mean(tf.maximum(basic_loss, 0.0), 0)
    return loss
# When fitting the model (i.e., when calling model.fit()), use as an input [anchor_example,
# positive_example, negative_example] in that order and as an output zero.
# The reason to use the output as zero is that you are trying to minimize the
# triplet loss as much as possible and the minimum value of the loss is zero.
def create_embedding_network(input_shape):
    """Build the shared network that maps one image to its embedding.

    Args:
        input_shape: Shape of a single input image (without the batch axis).

    Returns:
        A Keras ``Model`` producing a 10-dimensional output per image.
    """
    # Kept under its own name so the shape argument is not shadowed.
    inputs = Input(input_shape)
    x = Conv2D(32, (3, 3))(inputs)
    x = PReLU()(x)
    x = Conv2D(64, (3, 3))(x)
    x = PReLU()(x)
    x = Flatten()(x)
    # NOTE(review): a softmax output forces every embedding to sum to 1;
    # confirm this is intended for a distance-based embedding.
    x = Dense(10, activation='softmax')(x)
    model = Model(inputs=inputs, outputs=x)
    return model
anchor_embedding = None
# Builds an embedding for each example (i.e., positive, negative, anchor)
# Then calculates the triplet loss between their embedding.
# Then applies identity loss on the triplet loss value to minimize it on training.
def build_model(input_shape):
    """Build and compile the triplet model around a shared embedding network.

    Args:
        input_shape: Shape of a single input image (without the batch axis).

    Returns:
        A compiled Keras ``Model`` taking ``[anchor, positive, negative]``
        inputs and outputting the triplet loss value.

    Side effects:
        Stores the anchor embedding tensor in the module-level
        ``anchor_embedding``.
    """
    global anchor_embedding
    # Standardizing the input shape order
    positive_example = Input(shape=input_shape)
    negative_example = Input(shape=input_shape)
    anchor_example = Input(shape=input_shape)
    # Create Common network to share the weights along different examples (+/-/Anchor)
    embedding_network = create_embedding_network(input_shape)
    positive_embedding = embedding_network(positive_example)
    negative_embedding = embedding_network(negative_example)
    anchor_embedding = embedding_network(anchor_example)
    # loss = merge([anchor_embedding, positive_embedding, negative_embedding],
    #              mode=triplet_loss, output_shape=(1,))
    merged_output = concatenate([anchor_embedding, positive_embedding, negative_embedding])
    loss = Lambda(triplet_loss, (1,))(merged_output)
    # The model's output IS the loss value, so it is trained against zeros
    # with mean absolute error.
    model = Model(inputs=[anchor_example, positive_example, negative_example],
                  outputs=loss)
    model.compile(loss='mean_absolute_error', optimizer=Adam())
    return model
#start_time = time()
numOfPhotosPerTag = 10
#Change this line to your own drive path
baseDir = "C:/Intelligent systems/DNN/images/"
imagesHashtags = ["beer", "bigcity"]
imagesDir = [baseDir + str(x) for x in imagesHashtags]
images = ["/" + str(x) + ".jpg" for x in range(1, numOfPhotosPerTag + 1)]
allImages = []
for x in imagesDir:
allImages += [x + loc for loc in images]
imageio.imread(allImages[0], pilmode="RGB").shape
data = []
for x in allImages:
image = imageio.imread(x, pilmode="RGB")
tag = x.split('/')[-2]
data.append((image, tag))
data = np.array(data)
triplets = get_triplets(data)
model = build_model((256, 256, 3)), y=np.zeros(len(triplets)), batch_size=1)
for i in range(len(data)):[0]), y=[0], batch_size=1, verbose=10)
If you've trained your embedding_network properly, you now don't need to use triplets any more.
Basically, the whole point of the triplet-loss concept is to learn an embedding that is compatible with a pre-defined metric (usually just the Euclidean distance for instance), and then use this embedding for simple KNN classification as you mentioned.
So take your labeled data and pass all the points through the embedding_network.
You now have a set of points in a (low-dimensional?) space, in which "close points" are of the same class. Again, this depends on the data, how successful the training was, etc.
The natural thing to do then is to pass your test point through the same embedding_network, and compare its distances to the labeled points in the embedding space.
KNN is then a viable solution for classification, but the real point is that your data has been transformed very-non-linearly into a "comfortable" space in which many classical and simple methods will work more easily; clustering, classification, you name it.
Hope that helps, and good luck!
If you use the name= to tag the "normal" half of the model, you can extract the layers you need. We use the following code for this:
def triplet2normal(model, keep_str='pos', out='score'):
    """ take a triplet model, keep half of the model

    Args:
        model: Triplet Keras model whose inputs and layers were tagged via ``name=``.
        keep_str: Substring identifying the inputs/layers of the half to keep.
        out: Substring identifying the output layer of that half.

    Returns:
        A ``Model`` from the inputs whose name contains ``keep_str`` to the
        first layer whose name contains both ``keep_str`` and ``out``.

    Raises:
        StopIteration: If no layer name contains both substrings.
    """
    new_out_layer_name = next(layer.name for layer in model.layers
                              if keep_str in layer.name and out in layer.name)
    model_half = Model(inputs=[i for i in model.input if keep_str in i.name],
                       outputs=model.get_layer(new_out_layer_name).output)
    return model_half
Where the model is any triplet model - the example below is for recommendation on e.g. the movielens set:
# Input placeholders: one id per example for the positive item, the negative
# item and the user. The names carry the 'pos'/'neg' tags.
positive_item_input = Input((1,), name='pos_item_input')
negative_item_input = Input((1,), name='neg_item_input')
user_input = Input((1,), name='pos_neg_user_input')
# Embedding layers for the items and for users
# (a single item embedding layer is shared by the positive and negative item)
item_embedding_layer = Embedding(num_items, latent_dim, name='pos_neg_item_embedding', input_length=1)
user_embedding_layer = Embedding(num_users, latent_dim, name='pos_neg_user_embedding', input_length=1)
# Flatten the embedding layers
positive_item_embedding = Flatten(name='pos_item_embedded')(item_embedding_layer(positive_item_input))
negative_item_embedding = Flatten(name='neg_item_embedded')(item_embedding_layer(negative_item_input))
user_embedding = Flatten(name='pos_neg_user_embedded')(user_embedding_layer(user_input))
# Dot product - Matrix factorization
# (score of an item for a user = dot product of their embeddings)
positive_scores = Dot(axes=1, name='positive_scores')([user_embedding, positive_item_embedding])
negative_scores = Dot(axes=1, name='negative_scores')([user_embedding, negative_item_embedding])
# Compare scores: sigmoid(negative - positive) is small when the positive
# item scores higher than the negative one.
delta_scores_1 = Subtract(name='delta_scores')([negative_scores, positive_scores])
loss = Activation('sigmoid')(delta_scores_1)
# Define model
model = Model(
inputs=[user_input, positive_item_input, negative_item_input],

Trying to write my own Neural Network in Python

Last semester I took an online machine learning course from Stanford taught by Professor Ng. I thought it was pretty informative. To brush up on and better understand neural networks, I tried to write my own in Python. Here it is:
import numpy
class NN:
def __init__(self, sl):
#sl = number of units (not counting bias unit) in layer l = sl
self.layers = len(sl)
#Create weights
self.weights = []
for idx in range(1, self.layers):
self.cost = []
def update(self, input):
if input.shape[1] !=[0]:
raise ValueError, 'The first layer must have a node for every feature'
self.z = []
self.a = []
#Input activations. I'm expecting inputs as numpy matrix (Examples x Featrues)
self.a.append(numpy.hstack((numpy.ones((input.shape[0], 1)), input)))#Set inputs ai + bias unit
#Hidden activations
for weight in self.weights:
self.a.append(numpy.hstack((numpy.ones((self.z[-1].shape[0], 1)), numpy.tanh(self.z[-1])))) #tanh is a fancy sigmoid
#Output activation
self.a[-1] = self.z[-1] #Not logistic regression thus no sigmoid function
del self.z[-1]
def backPropagate(self, targets, lamda):
m = float(targets.shape[0]) #m is number of examples
#Calculate cost
Cost = -1/m*sum(numpy.power(self.a[-1] - targets, 2))
for weight in self.weights:
Cost = Cost + lamda/(2*m)*numpy.power(weight[1:, :], 2).sum()
#Calculate error for each layer
delta = []
delta.append(self.a[-1] - targets)
for idx in range(1, self.layers-1): #No delta for the input layer because it is the input
weight = self.weights[-idx][1:, :] #Ignore bias unit
dsigmoid = numpy.multiply(self.a[-(idx+1)][:,1:], 1-self.a[-(idx+1)][:,1:]) #dsigmoid is a(l).*(1-a(l))
delta.append(numpy.multiply(delta[-1]*weight.T, dsigmoid)) #Ignore Regularization
Delta = []
for idx in range(self.layers-1):
self.weight_gradient = []
for idx in range(len(Delta)):
self.weight_gradient.append(numpy.nan_to_num(1/m*Delta[idx] + numpy.vstack((numpy.zeros((1, self.weights[idx].shape[1])), lamda/m*self.weights[idx][1:, :]))))
def train(self, input, targets, alpha, lamda, iterations = 1000):
#alpha: learning rate
#lamda: regularization term
for i in range(iterations):
self.backPropagate(targets, lamda)
self.weights = [self.weights[idx] - alpha*self.weight_gradient[idx] for idx in range(len(self.weights))]
def predict(self, input):
return self.a[-1]
But it doesn't work =(. Inspecting the cost vs. iteration I can see a blip in the cost and the prediction for A is all the same. Can someone help me understand why my neural network is not converging?
Sorry about the amount of code (maybe someone will find it useful).
Instead of using random data I've got some structured data from the UCI Machine Learning Repository. The particular data set is the burned area of forest fires, in the northeast region of Portugal, using meteorological and other data: I modified the data so that days and months were numbers:
data = numpy.loadtxt(open('FF-data.csv', 'rb'), delimiter = ',', skiprows = 1)
features = data[:,0:11]
targets = numpy.matrix(data[:,12]).T
nfeatures = (features-features.mean(axis=0))/features.std(axis=0)
n = NN([11, 10, 1]) #The class takes the list of how many nodes in each layer
n.train(nfeatures, targets, 0.003, 0.0)
import matplotlib.pyplot
matplotlib.pyplot.title('Cost vs. Iteration')
matplotlib.pyplot.scatter(n.predict(nfeatures), targets)
matplotlib.pyplot.title('Data vs. Predicted')
matplotlib.pyplot.savefig('Report.png', format = 'png')
Why does the cost bottom out around 4000 and why does the Data Vs. Predicted not have any trend? You can see the graphs here:
(Sorry, I don't have enough rep to add comments, so I'll just keep posting answers instead.)
Yes, it does seem strange. If, however, after training you generate a new matrix B:
B = numpy.random.rand(5, 4)/5
Targets = B*X
print n.predict(B)
print B*X
it will work fine (most of the time - sometimes it will still give the average(Targets) as the answer).
Note: I switched from using 100 features to using just 4 in my example.
Also, I don't think that running 5000 iterations on 50 elements of the data set will do you any good. You should generally try to use as much training data as you can - and here you can use as much as you want, but you use even fewer examples than you have features.
This is fun, I'll think about it some more :) I was using your network for a more simple example - as Input I provided two numbers, and expected their sum as Output. It worked more or less okay.
The neural network was unable to train on the Forest Fire data for a few reasons.
First the numpy.tanh() sigmoid function is not behaving as expected. The code should be changed from:
self.a.append(numpy.hstack((numpy.ones((self.z[-1].shape[0], 1)),numpy.tanh(self.z[-1])))) #tanh is a fancy sigmoid
self.a.append(numpy.hstack((numpy.ones((self.z[-1].shape[0], 1)), 1/(1+numpy.exp(-self.z[-1])))))
Second numpy and matplotlib are not playing nice. The numpy matrices seem to be plotted backwards. This can be fixed by using matrix.tolist(). Code changed from:
matplotlib.pyplot.scatter(n.predict(nfeatures), targets)
matplotlib.pyplot.scatter(n.predict(nfeatures).tolist(), targets.tolist())
Finally the number of nodes should be approximately 10% of the example size. Instead of 10 it is better to use 50 nodes.
The working neural network code is posted below with a new function autoparam which tries to find the best learning rate and regularization constant. You can see the graphs for the Forest Fire cost vs iteration and data vs predicted here:
Thanks for reading! I hope my neural network can help people.
import numpy
class NN:
def __init__(self, sl):
#sl = number of units (not counting bias unit) in layer l = sl
self.layers = len(sl)
#Create weights
self.weights = []
for idx in range(1, self.layers):
self.cost = []
def update(self, input):
if input.shape[1] !=[0]:
raise ValueError, 'The first layer must have a node for every feature'
self.z = []
self.a = []
#Input activations. Expected inputs as numpy matrix (Examples x Featrues)
self.a.append(numpy.hstack((numpy.ones((input.shape[0], 1)), input)))#Set inputs ai + bias unit
#Hidden activations
for weight in self.weights:
self.a.append(numpy.hstack((numpy.ones((self.z[-1].shape[0], 1)), 1/(1+numpy.exp(-self.z[-1]))))) #sigmoid
#Output activation
self.a[-1] = self.z[-1] #Not logistic regression thus no sigmoid function
del self.z[-1]
def backPropagate(self, targets, lamda):
m = float(targets.shape[0]) #m is number of examples
#Calculate cost
Cost = -1/m*sum(numpy.power(self.a[-1] - targets, 2))
for weight in self.weights:
Cost = Cost + lamda/(2*m)*numpy.power(weight[1:, :], 2).sum()
#Calculate error for each layer
delta = []
delta.append(self.a[-1] - targets)
for idx in range(1, self.layers-1): #No delta for the input layer because it is the input
weight = self.weights[-idx][1:, :] #Ignore bias unit
dsigmoid = numpy.multiply(self.a[-(idx+1)][:,1:], 1-self.a[-(idx+1)][:,1:]) #dsigmoid is a(l).*(1-a(l))
delta.append(numpy.multiply(delta[-1]*weight.T, dsigmoid)) #Ignore Regularization
Delta = []
for idx in range(self.layers-1):
self.weight_gradient = []
for idx in range(len(Delta)):
self.weight_gradient.append(numpy.nan_to_num(1/m*Delta[idx] + numpy.vstack((numpy.zeros((1, self.weights[idx].shape[1])), lamda/m*self.weights[idx][1:, :]))))
def train(self, input, targets, alpha, lamda, iterations = 1000):
#alpha: learning rate
#lamda: regularization term
for i in range(iterations):
self.backPropagate(targets, lamda)
self.weights = [self.weights[idx] - alpha*self.weight_gradient[idx] for idx in range(len(self.weights))]
def autoparam(self, data, alpha = [0.001, 0.003, 0.01, 0.03, 0.1, 0.3], lamda = [0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1, 3, 10]):
#data: numpy matrix with targets in last column
#alpha: learning rate
#lamda: regularization term
#Create training, cross validation, and test sets
while 1:
numpy.seterr(invalid = 'raise')
numpy.random.shuffle(data) #Shuffle data
training_set = data[0:data.shape[0]/10*6, 0:-1]
self.ntraining_set = (training_set-training_set.mean(axis=0))/training_set.std(axis=0)
self.training_tgt = numpy.matrix(data[0:data.shape[0]/10*6, -1]).T
cv_set = data[data.shape[0]/10*6:data.shape[0]/10*8, 0:-1]
self.ncv_set = (cv_set-cv_set.mean(axis=0))/cv_set.std(axis=0)
self.cv_tgt = numpy.matrix(data[data.shape[0]/10*6:data.shape[0]/10*8, -1]).T
test_set = data[data.shape[0]/10*8:, 0:-1]
self.ntest_set = (test_set-test_set.mean(axis=0))/test_set.std(axis=0)
self.test_tgt = numpy.matrix(data[data.shape[0]/10*8:, -1]).T
except FloatingPointError:
numpy.seterr(invalid = 'warn')
cost = 999999
for i in alpha:
for j in lamda:
self.train(self.ntraining_set, self.training_tgt, i, j, 2000)
current_cost = 1/float(cv_set.shape[0])*sum(numpy.square(self.predict(self.ncv_set) - self.cv_tgt)).tolist()[0][0]
print current_cost
if current_cost < cost:
cost = current_cost
self.learning_rate = i
self.regularization = j
def predict(self, input):
return self.a[-1]
Loading data, Plotting, etc...
data = numpy.loadtxt(open('FF-data.csv', 'rb'), delimiter = ',', skiprows = 1)#Load
features = data[:,0:11]
nfeatures = (features-features.mean(axis=0))/features.std(axis=0)
targets = numpy.matrix(data[:, 12]).T
n = NN([11, 50, 1])
n.train(nfeatures, targets, 0.07, 0.0, 2000)
import matplotlib.pyplot
matplotlib.pyplot.title('Cost vs. Iteration')
matplotlib.pyplot.scatter(n.predict(nfeatures).tolist(), targets.tolist())
matplotlib.pyplot.plot(targets.tolist(), targets.tolist(), c = 'r')
matplotlib.pyplot.title('Data vs. Predicted')
matplotlib.pyplot.savefig('Report.png', format = 'png')
I think that your bias should be subtracted somewhere from the weighted inputs (or set to -1). From what I see in your code, the neurons add all the inputs, including the bias (which is set to +1).
