I am going through this link to understand Multi-channel CNN Model for Text Classification.
The code is based on this tutorial.
I have understood most of it; however, I can't understand how Keras determines the output shapes of certain layers.
Here is the code:
The code defines a model with three input channels for processing 4-grams, 6-grams, and 8-grams of movie review text.
#Skipped keras imports
# load a clean dataset
def load_dataset(filename):
    return load(open(filename, 'rb'))
# fit a tokenizer
def create_tokenizer(lines):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer
# calculate the maximum document length
def max_length(lines):
    return max([len(s.split()) for s in lines])
# encode a list of lines
def encode_text(tokenizer, lines, length):
    # integer encode
    encoded = tokenizer.texts_to_sequences(lines)
    # pad encoded sequences
    padded = pad_sequences(encoded, maxlen=length, padding='post')
    return padded
# define the model
def define_model(length, vocab_size):
    # channel 1
    inputs1 = Input(shape=(length,))
    embedding1 = Embedding(vocab_size, 100)(inputs1)
    conv1 = Conv1D(filters=32, kernel_size=4, activation='relu')(embedding1)
    drop1 = Dropout(0.5)(conv1)
    pool1 = MaxPooling1D(pool_size=2)(drop1)
    flat1 = Flatten()(pool1)
    # channel 2
    inputs2 = Input(shape=(length,))
    embedding2 = Embedding(vocab_size, 100)(inputs2)
    conv2 = Conv1D(filters=32, kernel_size=6, activation='relu')(embedding2)
    drop2 = Dropout(0.5)(conv2)
    pool2 = MaxPooling1D(pool_size=2)(drop2)
    flat2 = Flatten()(pool2)
    # channel 3
    inputs3 = Input(shape=(length,))
    embedding3 = Embedding(vocab_size, 100)(inputs3)
    conv3 = Conv1D(filters=32, kernel_size=8, activation='relu')(embedding3)
    drop3 = Dropout(0.5)(conv3)
    pool3 = MaxPooling1D(pool_size=2)(drop3)
    flat3 = Flatten()(pool3)
    # merge
    merged = concatenate([flat1, flat2, flat3])
    # interpretation
    dense1 = Dense(10, activation='relu')(merged)
    outputs = Dense(1, activation='sigmoid')(dense1)
    model = Model(inputs=[inputs1, inputs2, inputs3], outputs=outputs)
    # compile
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    # summarize
    print(model.summary())
    plot_model(model, show_shapes=True, to_file='multichannel.png')
    return model
# load training dataset
trainLines, trainLabels = load_dataset('train.pkl')
# create tokenizer
tokenizer = create_tokenizer(trainLines)
# calculate max document length
length = max_length(trainLines)
# calculate vocabulary size
vocab_size = len(tokenizer.word_index) + 1
print('Max document length: %d' % length)
print('Vocabulary size: %d' % vocab_size)
# encode data
trainX = encode_text(tokenizer, trainLines, length)
print(trainX.shape)
# define model
model = define_model(length, vocab_size)
# fit model
model.fit([trainX,trainX,trainX], array(trainLabels), epochs=10, batch_size=16)
# save the model
model.save('model.h5')
Running the code:
Running the example first prints a summary of the prepared training dataset.
Max document length: 1380
Vocabulary size: 44277
(1800, 1380)
____________________________________________________________________________________________________
Layer (type) Output Shape Param # Connected to
====================================================================================================
input_1 (InputLayer) (None, 1380) 0
____________________________________________________________________________________________________
input_2 (InputLayer) (None, 1380) 0
____________________________________________________________________________________________________
input_3 (InputLayer) (None, 1380) 0
____________________________________________________________________________________________________
embedding_1 (Embedding) (None, 1380, 100) 4427700 input_1[0][0]
____________________________________________________________________________________________________
embedding_2 (Embedding) (None, 1380, 100) 4427700 input_2[0][0]
____________________________________________________________________________________________________
embedding_3 (Embedding) (None, 1380, 100) 4427700 input_3[0][0]
____________________________________________________________________________________________________
conv1d_1 (Conv1D) (None, 1377, 32) 12832 embedding_1[0][0]
____________________________________________________________________________________________________
conv1d_2 (Conv1D) (None, 1375, 32) 19232 embedding_2[0][0]
____________________________________________________________________________________________________
conv1d_3 (Conv1D) (None, 1373, 32) 25632 embedding_3[0][0]
____________________________________________________________________________________________________
dropout_1 (Dropout) (None, 1377, 32) 0 conv1d_1[0][0]
____________________________________________________________________________________________________
dropout_2 (Dropout) (None, 1375, 32) 0 conv1d_2[0][0]
____________________________________________________________________________________________________
dropout_3 (Dropout) (None, 1373, 32) 0 conv1d_3[0][0]
____________________________________________________________________________________________________
max_pooling1d_1 (MaxPooling1D) (None, 688, 32) 0 dropout_1[0][0]
____________________________________________________________________________________________________
max_pooling1d_2 (MaxPooling1D) (None, 687, 32) 0 dropout_2[0][0]
____________________________________________________________________________________________________
max_pooling1d_3 (MaxPooling1D) (None, 686, 32) 0 dropout_3[0][0]
____________________________________________________________________________________________________
flatten_1 (Flatten) (None, 22016) 0 max_pooling1d_1[0][0]
____________________________________________________________________________________________________
flatten_2 (Flatten) (None, 21984) 0 max_pooling1d_2[0][0]
____________________________________________________________________________________________________
flatten_3 (Flatten) (None, 21952) 0 max_pooling1d_3[0][0]
____________________________________________________________________________________________________
concatenate_1 (Concatenate) (None, 65952) 0 flatten_1[0][0]
flatten_2[0][0]
flatten_3[0][0]
____________________________________________________________________________________________________
dense_1 (Dense) (None, 10) 659530 concatenate_1[0][0]
____________________________________________________________________________________________________
dense_2 (Dense) (None, 1) 11 dense_1[0][0]
====================================================================================================
Total params: 14,000,337
Trainable params: 14,000,337
Non-trainable params: 0
____________________________________________________________________________________________________
And the end of the training log:
Epoch 6/10
1800/1800 [==============================] - 30s - loss: 9.9093e-04 - acc: 1.0000
Epoch 7/10
1800/1800 [==============================] - 29s - loss: 5.1899e-04 - acc: 1.0000
Epoch 8/10
1800/1800 [==============================] - 28s - loss: 3.7958e-04 - acc: 1.0000
Epoch 9/10
1800/1800 [==============================] - 29s - loss: 3.0534e-04 - acc: 1.0000
Epoch 10/10
1800/1800 [==============================] - 29s - loss: 2.6234e-04 - acc: 1.0000
My interpretation of the layers and output shapes is as follows:
Please help me understand whether it is correct, as I am lost in the multiple dimensions.
input_1 (InputLayer) (None, 1380) ---> 1380 is the total number of features (that is, 1380 input neurons) per data point. 1800 is the total number of documents or data points.
embedding_1 (Embedding) (None, 1380, 100) 4427700 ----> The Embedding layer has 1380 features (words), and each feature is a vector of dimension 100.
How is the number of parameters here 4427700?
conv1d_1 (Conv1D) (None, 1377, 32) 12832 ------> Conv1D has kernel_size=4. Is it a 1x4 filter that is used 32 times? Then how did the dimension become (None, 1377, 32) with 12832 parameters?
max_pooling1d_1 (MaxPooling1D) (None, 688, 32): with MaxPooling1D(pool_size=2), how did the dimension become (None, 688, 32)?
flatten_1 (Flatten) (None, 22016): is this just the multiplication of 688 and 32?
Does every epoch train all 1800 data points at once?
Please let me know how the output dimensions are calculated. Any reference or help would be appreciated.
Please see the answers below:
input_1 (InputLayer) (None, 1380) ---> 1380 is the total number of features (that is, 1380 input neurons) per data point. 1800 is the total number of documents or data points.
Yes. model.fit([trainX,trainX,trainX], array(trainLabels), epochs=10, batch_size=16) says that you want the network to train 10 times (for 10 epochs) on the whole training dataset, in batches of size 16.
This means that after every 16 data points, the backpropagation algorithm runs and the weights are updated. This happens roughly 1800/16 ≈ 113 times (the last batch is smaller), and one such full pass over the data is called an epoch.
1380 is the number of neurons in the first layer.
embedding_1 (Embedding) (None, 1380, 100) | 4427700 ----> The Embedding layer has 1380 features (words), and each feature is a vector of dimension 100.
1380 is the size of the input (the number of neurons in the previous layer) and 100 is the size (length) of the embedding vector.
The number of parameters here is vocabulary_size * 100, as for each word in the vocabulary you need to train 100 parameters. The Embedding layer is in fact a matrix built from vocabulary_size vectors of size 100, where each row is the vector representation of one word from the vocabulary.
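As a quick sanity check, here is the same arithmetic in plain Python (the numbers are taken from the summary above; this is only an illustration, not part of the original code):
vocab_size = 44277       # len(tokenizer.word_index) + 1, printed as "Vocabulary size"
embedding_dim = 100      # second argument of Embedding(vocab_size, 100)
print(vocab_size * embedding_dim)   # 4427700 trainable parameters, matching embedding_1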
conv1d_1 (Conv1D) (None, 1377, 32) | 12832 ------> Conv1D has kernel_size=4. Is it a 1x4 filter that is used 32 times? Then how did the dimension become (None, 1377, 32) with 12832 parameters?
1380 becomes 1377 because of the kernel size. Imagine the following input (of size 10, to simplify) with a kernel of size 4:
0123456789 #input
KKKK456789
0KKKK56789
01KKKK6789
012KKKK789
0123KKKK89
01234KKKK9
012345KKKK
Look, the kernel can't move any further to the right, so for an input of size 10 and a kernel of size 4 the output size is 7.
In general, for an input of size n and a kernel of size k, the output size is n - k + 1, so for n=1380 and k=4 the result is 1377.
The number of parameters is 12832 because it equals output_channels * (input_channels * kernel_size + 1). In your case that is 32 * (100 * 4 + 1).
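The same computation as a small illustrative snippet (the numbers are those of conv1d_1 in the summary above, not part of the original code):
n = 1380            # input sequence length
k = 4               # kernel_size of conv1d_1
filters = 32        # number of filters
in_channels = 100   # embedding dimension feeding the convolution
print(n - k + 1)                        # 1377, the output length
print(filters * (in_channels * k + 1))  # 12832 parameters; the +1 is each filter's bias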
max_pooling1d_1 (MaxPooling1D) (None, 688, 32): with MaxPooling1D(pool_size=2), how did the dimension become (None, 688, 32)?
The max pooling takes every two consecutive values and replaces them with their maximum, so you end up with floor(original_size / pool_size) values; here floor(1377 / 2) = 688.
flatten_1 (Flatten) (None, 22016): is this just the multiplication of 688 and 32?
Yes, this is just the multiplication of 688 and 32. It is because the flatten operation does the following:
1234
5678 -> 123456789012
9012
so it takes all values from all dimensions and puts them into a one-dimensional vector.
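Putting the pooling and flatten shapes together for channel 1 (again just an illustrative check with the numbers from the summary, not part of the original code):
conv_length, pool_size, filters = 1377, 2, 32
pooled_length = conv_length // pool_size        # floor(1377 / 2) = 688
print(pooled_length, pooled_length * filters)   # 688 22016, matching max_pooling1d_1 and flatten_1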
Does every epoch train all 1800 data points at once?
No. It takes them in batches of 16, as pointed out in the first answer. Each epoch goes through all 1800 data points, in a random order, in batches of 16 data points. An epoch simply means one full pass over the training data, after which we start reading the data again.
Edit:
I will clarify how the 1D convolutional layers are applied to the output of the embedding layers.
You should interpret the output of the Embedding layer as a sequence of width 1380 with 100 channels.
It is similar to 2D images: an RGB image has three channels at the input, so its shape is (width, height, 3). When you apply a convolutional layer built of 32 filters (the filter size is irrelevant here), the convolution operation is applied simultaneously to all channels, and the output shape will be (new_width, new_height, 32). Notice that the output depth is the same as the number of filters.
Back to your example: treat the output shape of the embedding layer as (width, channels). The 1D convolutional layer with 32 filters and kernel size 4 is then applied to a sequence of width 1380 and depth 100. As a result, you get an output of shape (1377, 32).
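If you want Keras to confirm these shapes for you, a minimal single-channel sketch (assuming the same Keras imports that were skipped in the question) would be:
from keras.layers import Input, Embedding, Conv1D, MaxPooling1D, Flatten
from keras.models import Model

length, vocab_size = 1380, 44277
inputs = Input(shape=(length,))                               # (None, 1380)
x = Embedding(vocab_size, 100)(inputs)                        # (None, 1380, 100)
x = Conv1D(filters=32, kernel_size=4, activation='relu')(x)   # (None, 1377, 32)
x = MaxPooling1D(pool_size=2)(x)                              # (None, 688, 32)
x = Flatten()(x)                                              # (None, 22016)
Model(inputs, x).summary()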
Related
I am currently using code from https://keras.io/examples/vision/handwriting_recognition/ which is a tutorial on text recognition. I am using a local dataset to test the model, and during my experiments I have encountered something that made me wonder:
1.) Is it normal for a loss value to start at a higher value than the previous loss? If not, what could be the cause, and how can I prevent it?
2.) Is a val_loss of 1 good enough for bi-LSTM networks? If not, how can I reduce the loss?
Here is the snippet of two consecutive epochs.
1520/1520 [==============================] - 735s 484ms/step - loss: 2.5462 - val_loss: 2.7302
Epoch 12/100
443/1520 [=======>......................] - ETA: 8:18 - loss: 3.9221
Below is the summary of the current model
Layer (type) Output Shape Param # Connected to
==================================================================================================
image (InputLayer) [(None, 128, 32, 1) 0 []
]
Conv1 (Conv2D) (None, 128, 32, 32) 320 ['image[0][0]']
batchnorm1 (BatchNormalization (None, 128, 32, 32) 128 ['Conv1[0][0]']
)
pool1 (MaxPooling2D) (None, 64, 16, 32) 0 ['batchnorm1[0][0]']
Conv2 (Conv2D) (None, 64, 16, 64) 18496 ['pool1[0][0]']
Conv3 (Conv2D) (None, 64, 16, 64) 36928 ['Conv2[0][0]']
batchnorm2 (BatchNormalization (None, 64, 16, 64) 256 ['Conv3[0][0]']
)
pool2 (MaxPooling2D) (None, 32, 8, 64) 0 ['batchnorm2[0][0]']
reshape (Reshape) (None, 32, 512) 0 ['pool2[0][0]']
dense1 (Dense) (None, 32, 64) 32832 ['reshape[0][0]']
dropout_3 (Dropout) (None, 32, 64) 0 ['dense1[0][0]']
bidirectional_9 (Bidirectional (None, 32, 256) 197632 ['dropout_3[0][0]']
)
bidirectional_10 (Bidirectiona (None, 32, 256) 394240 ['bidirectional_9[0][0]']
l)
bidirectional_11 (Bidirectiona (None, 32, 128) 164352 ['bidirectional_10[0][0]']
l)
label (InputLayer) [(None, None)] 0 []
dense2 (Dense) (None, 32, 85) 10965 ['bidirectional_11[0][0]']
ctc_loss (CTCLayer) (None, 32, 85) 0 ['label[0][0]',
'dense2[0][0]']
==================================================================================================
Total params: 856,149
Trainable params: 855,957
Non-trainable params: 192
__________________________________________________________________________________________________
optimizer = Adam
batch_size = 64
total_dataset = 100,000+
activation = relu
To answer the first query:
Yes, it is common for a loss value to start higher than it was at the end of the previous epoch. During each epoch, your model is trained on different batches of data, and the displayed loss is accumulated or averaged (depending on your loss function) over these batches. At the end of the epoch, you observe the loss over the entire dataset. At the start of the next epoch, you observe the loss over only the first batch the model is training on.
Your dataset (ideally) follows a general pattern that you want your model to learn. A single batch will likely contain only a sub-pattern of that general pattern. At the end of an epoch, given that your model has just been exposed to the entire dataset, it will be better optimized to predict the general pattern of your data than any particular sub-pattern. Therefore, the loss on a batch containing such a sub-pattern can be higher.
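As a toy illustration of that bookkeeping (the per-batch losses below are made up, purely to show why the first value printed in a new epoch can sit above the previous epoch's average):
batch_losses = [3.1, 2.9, 2.6, 2.4, 1.8]            # hypothetical losses over one epoch's batches
epoch_loss = sum(batch_losses) / len(batch_losses)  # 2.56, what is reported at the end of the epoch
first_batch_next_epoch = 3.0                        # a hard batch at the start of the next epoch
print(first_batch_next_epoch > epoch_loss)          # True: the displayed loss "jumps" upward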
For the second question:
It's hard to say if a certain numerical value of loss will be good or bad for a network, since your validation loss will depend on many factors. These include what loss function you are using, how many data points were used to compute the loss, and so on. The numerical value of your loss should not matter as long as your model meets the performance criteria you define in your evaluation metric.
So I need to be able to use predict on my model to test a benchmark, but it doesn't work. When I use predict on the same validation data my model uses while training, I only get an accuracy of 0.529. When using model.evaluate I get 0.85. It doesn't make sense; other threads talk about np.argmax or forgetting normalization, but I've tried it all.
PS: I use transfer learning so it gets trained twice and some layers get frozen but that shouldn't influence this.
Model: "model_7"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
input_17 (InputLayer) [(None, 150, 150, 3)] 0
_________________________________________________________________
sequential_8 (Sequential) (None, 150, 150, 3) 0
_________________________________________________________________
normalization_7 (Normalizati (None, 150, 150, 3) 7
_________________________________________________________________
xception (Functional) (None, 5, 5, 2048) 20861480
_________________________________________________________________
global_average_pooling2d_7 ( (None, 2048) 0
_________________________________________________________________
dropout_7 (Dropout) (None, 2048) 0
_________________________________________________________________
dense_7 (Dense) (None, 1) 2049
=================================================================
Total params: 20,863,536
Trainable params: 20,809,001
Non-trainable params: 54,535
model.compile(
optimizer=keras.optimizers.Adam(1e-5), # Low learning rate
loss=keras.losses.BinaryCrossentropy(from_logits=True),
metrics=['accuracy', keras.metrics.BinaryAccuracy(),f1_m,precision_m, recall_m],
)
print(model.evaluate(x_test,y_test))
OUTPUT: 38/38 [==============================] - 2s 45ms/step - loss: 0.3570 - binary_accuracy: 0.8550 - f1_m: 0.8509 - precision_m: 0.8786 - recall_m: 0.8326
[0.3569563925266266, 0.8550000190734863, 0.8509402871131897, 0.8786242604255676, 0.8326380848884583]
better = norm_layer(x_test)
# print(better[0])
y_pred = model.predict(better)
print(y_pred)
y_pred[y_pred <= 0.5] = 0
y_pred[y_pred > 0.5] = 1
print(y_pred)
print(sum(1 for x,y in zip(y_pred,y_test) if x == y) / len(y_pred))
OUTPUT: [[0.42335328]
[0.3409149 ]
[0.45328587]
...
[0.38108858]
[0.44630498]
[0.76832736]]
[[0.]
[0.]
[0.]
...
[0.]
[0.]
[1.]]
0.5291666666666667
Normalization code:
# Pre-trained Xception weights require that the input be normalized
# from (0, 255) to the range (-1., +1.); the normalization layer
# does the following: outputs = (inputs - mean) / sqrt(var)
norm_layer = keras.layers.experimental.preprocessing.Normalization()
mean = np.array([127.5] * 3)
var = mean ** 2
# Scale inputs to [-1, +1]
x = norm_layer(x)
norm_layer.set_weights([mean, var])
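As a quick arithmetic check (not part of the original code), plugging the boundary pixel values into outputs = (inputs - mean) / sqrt(var) with these weights shows the intended [-1, +1] range:
mean, var = 127.5, 127.5 ** 2
for pixel in (0.0, 127.5, 255.0):
    print((pixel - mean) / var ** 0.5)   # -1.0, 0.0, 1.0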
For the full code, see the following link (the file "Xception detection"):
https://console.paperspace.com/tees0czt0/notebook/rj824evpaot143v?file=Xception%20detection.ipynb
I am trying to build a multilayer perceptron model for multiclass text classification, and I am already adding dropout regularization.
My model:
model = Sequential([
    Dropout(rate=0.2, input_shape=features),
    Dense(units=64, activation='relu'),
    Dropout(rate=0.2),
    Dense(units=64, activation='relu'),
    Dropout(rate=0.2),
    Dense(units=16, activation='softmax')])
My model.summary():
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
dropout_1 (Dropout) (None, 20000) 0
_________________________________________________________________
dense_1 (Dense) (None, 64) 1280064
_________________________________________________________________
dropout_2 (Dropout) (None, 64) 0
_________________________________________________________________
dense_2 (Dense) (None, 64) 4160
_________________________________________________________________
dropout_3 (Dropout) (None, 64) 0
_________________________________________________________________
dense_3 (Dense) (None, 16) 1040
=================================================================
Total params: 1,285,264
Trainable params: 1,285,264
Non-trainable params: 0
_________________________________________________________________
None
Train on 6940 samples, validate on 1735 samples
I am getting:
Epoch 16/1000
- 4s - loss: 0.4926 - acc: 0.8719 - val_loss: 1.2640 - val_acc: 0.6640
Validation accuracy: 0.6639769498140736, loss: 1.2639631692545559
The validation accuracy is ~20% less than the training accuracy, and the validation loss is much higher than the training loss.
I am already using dropout regularization, and using epochs = 1000, batch size = 512 and early stopping on val_loss.
Any suggestions?
I have this network:
Tensor("input_1:0", shape=(?, 5, 1), dtype=float32)
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
input_1 (InputLayer) (None, 5, 1) 0
_________________________________________________________________
bidirectional_1 (Bidirection (None, 5, 64) 2176
_________________________________________________________________
activation_1 (Activation) (None, 5, 64) 0
_________________________________________________________________
bidirectional_2 (Bidirection (None, 5, 128) 16512
_________________________________________________________________
activation_2 (Activation) (None, 5, 128) 0
_________________________________________________________________
bidirectional_3 (Bidirection (None, 1024) 656384
_________________________________________________________________
activation_3 (Activation) (None, 1024) 0
_________________________________________________________________
dense_1 (Dense) (None, 1) 1025
_________________________________________________________________
p_re_lu_1 (PReLU) (None, 1) 1
=================================================================
Total params: 676,098
Trainable params: 676,098
Non-trainable params: 0
_________________________________________________________________
None
Train on 27496 samples, validate on 6875 samples
I compile and fit it with:
model.compile(loss='mse',optimizer=Adamx,metrics=['accuracy'])
model.fit(x_train,y_train,batch_size=100,epochs=10,validation_data=(x_test,y_test),verbose=2)
When I run it and also evaluate it on unseen data, it returns 0.0 accuracy with very low loss. I can't figure out what the problem is.
Epoch 10/10
- 29s - loss: 1.6972e-04 - acc: 0.0000e+00 - val_loss: 1.7280e-04 - val_acc: 0.0000e+00
What you are getting is expected. Your model is working correctly; it is your metric of measure that is inappropriate. The aim of the optimizer is to minimize the loss, not to increase accuracy.
Since you are using PReLU as the activation function of your last layer, you always get float outputs from the network. Comparing these float outputs with the actual labels to measure accuracy is not the right option. Since the outputs and labels are continuous random variables, the probability of any exact value is zero. Therefore, even if the model predicts values very close to the true label, the accuracy will still be zero unless the model predicts exactly the same value as the true label, which is improbable.
For example, if y_true is 1.0 and the model predicts 0.99999, this prediction still does not count toward the accuracy of the model, since 1.0 != 0.99999.
Update
The choice of metric function depends on the type of problem. Keras also provides functionality for implementing custom metrics.
Assuming the problem in question is a regression and two values are considered equal if their difference is less than 0.01, a custom metric can be defined as:
import keras.backend as K
import tensorflow as tf

accepted_diff = 0.01

def linear_regression_equality(y_true, y_pred):
    diff = K.abs(y_true - y_pred)
    return K.mean(K.cast(diff < accepted_diff, tf.float32))
Now you can use this metric for your model:
model.compile(loss='mse',optimizer=Adamx,metrics=[linear_regression_equality])
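To sanity-check the metric outside of training, a small sketch (the sample tensors below are made up purely for illustration):
y_true = K.constant([[1.00], [0.50], [2.00]])
y_pred = K.constant([[1.005], [0.70], [1.999]])
print(K.eval(linear_regression_equality(y_true, y_pred)))  # two of three within 0.01 -> ~0.667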
I have a convolutional network (CNN) as follows. I would like to add a visualization of every activation layer's output, as in the tutorial I am following.
There are several layers in the CNN that do the required task; I only want to probe the output of each layer.
def get_model():
    input_shape = (IMG_MODE, img_rows, img_cols)
    model = Sequential()
    model.add(ZeroPadding2D(padding=(1,1), input_shape=input_shape))
    model.add(Conv2D(32, (3, 3), padding='valid'))
    model.add(LeakyReLU(alpha=0.01))
    model.add(MaxPooling2D(pool_size=pool_size2))
    ....
    model.add(Dense(nb_classes))
    model.add(Activation('softmax'))
    print(model.summary())
    return model
The code output:
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
zero_padding2d_1 (ZeroPaddin (None, 1, 114, 94) 0
_________________________________________________________________
conv2d_1 (Conv2D) (None, 32, 112, 92) 320
_________________________________________________________________
leaky_re_lu_1 (LeakyReLU) (None, 32, 112, 92) 0
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 32, 56, 46) 0
_________________________________________________________________
....
_________________________________________________________________
dense_1 (Dense) (None, 1024) 8258560
_________________________________________________________________
leaky_re_lu_4 (LeakyReLU) (None, 1024) 0
_________________________________________________________________
dropout_1 (Dropout) (None, 1024) 0
_________________________________________________________________
dense_2 (Dense) (None, 40) 41000
_________________________________________________________________
activation_1 (Activation) (None, 40) 0
=================================================================
Total params: 8,392,232
Trainable params: 8,392,232
Non-trainable params: 0
_________________________________________________________________
None
Train on 320 samples, validate on 80 samples
Epoch 1/20
- 18s - loss: 3.7036 - acc: 0.0187 - val_loss: 3.6824 - val_acc: 0.0250
Epoch 2/20
- 17s - loss: 3.6903 - acc: 0.0250 - val_loss: 3.6786 - val_acc: 0.0250
...
Epoch 20/20
- 17s - loss: 0.2067 - acc: 0.9312 - val_loss: 0.9892 - val_acc: 0.7625
Test score: 0.9891735315322876
Test accuracy: 0.7625
I tried to use the following code to do my task:
import numpy as np
import matplotlib.pyplot as plt
from keras import models

# collect the symbolic outputs of the first 8 layers and build a probing model
layer_outputs = [layer.output for layer in model.layers[:8]]
activation_model = models.Model(inputs=model.input, outputs=layer_outputs)
activations = activation_model.predict(img_tensor)

# the first element corresponds to the first probed layer
first_layer_activation = activations[0]
plt.matshow(first_layer_activation[0, :, :, 7], cmap='viridis')

layer_names = []
for layer in model.layers[:8]:
    layer_names.append(layer.name)

images_per_row = 16
for layer_name, layer_activation in zip(layer_names, activations):
    n_features = layer_activation.shape[-1]
    size = layer_activation.shape[1]
    n_cols = n_features // images_per_row
    display_grid = np.zeros((size * n_cols, images_per_row * size))
    for col in range(n_cols):
        for row in range(images_per_row):
            channel_image = layer_activation[0, :, :, col * images_per_row + row]
            channel_image -= channel_image.mean()
            channel_image /= channel_image.std()
            channel_image *= 64
            channel_image += 128
            channel_image = np.clip(channel_image, 0, 255).astype('uint8')
            display_grid[col * size : (col + 1) * size,
                         row * size : (row + 1) * size] = channel_image
    scale = 1. / size
    plt.figure(figsize=(scale * display_grid.shape[1],
                        scale * display_grid.shape[0]))
    plt.title(layer_name)
    plt.grid(False)
    plt.imshow(display_grid, aspect='auto', cmap='viridis')
I describe a general approach for getting the outputs of the convolutional layers of any model and visualizing them. I will use the Keras that ships with TensorFlow; the code for other implementations of Keras may differ slightly.
First, a function is required to get the output of a convolutional layer of a model:
import math
import tensorflow as tf

def getConvOutput(model, index=-1):
    # index=-1 means last convolutional layer
    layers = model.layers
    return [layer.output for layer in layers if type(layer) is tf.keras.layers.Conv2D][index]
This will be a four-dimensional tensor (batch_size, height, width, number_of_channels).
Next, we need a function which constructs a grid for a single data element (a single image) from the output of a convolutional layer. It builds a nearly square image which is a grid of the activation maps.
def mapsToGrid(output):
    numMaps = int(output.shape[-1])
    # calculate the number of rows and columns which we want to have
    numColumns = math.ceil(numMaps**0.5)
    numRows = math.ceil(numMaps/numColumns)
    # the end of the grid may be filled with zeros
    zerosNum = numRows*numColumns - numMaps
    zerosShape = [int(i) for i in output.shape]
    zerosShape[-1] = zerosNum
    zeros = tf.zeros(
        zerosShape,
        dtype=tf.float32,
        name=None)
    # extend the activation maps with zeros
    concated = tf.concat([output, zeros], -1)
    height, width, depth = [s for s in concated.shape]
    # unstack the activation maps and construct the grid
    mapStack = tf.unstack(concated, axis=2)
    rowStacks = [tf.concat(mapStack[i:i+numColumns], axis=1) for i in range(0, numColumns*numRows, numColumns)]
    result = tf.concat(rowStacks, axis=0)
    return result
Once you have these functions you can get the grid as follows
activation_map_grid_tensor = mapsToGrid(getConvOutput(model)[0])
Index 0 is required because mapsToGrid works with the activation maps of a single image, so we pick the first element of the batch.
Now you can evaluate the tensor and show it with e.g. cv2.imshow().
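One possible way to do that (a sketch assuming TensorFlow 2.x with eager execution, a functional model named model, and a hypothetical preprocessed batch img_tensor of shape (1, height, width, channels)):
import matplotlib.pyplot as plt
import tensorflow as tf

# build a sub-model that outputs the chosen convolutional layer's activations
grid_model = tf.keras.Model(inputs=model.input, outputs=getConvOutput(model))
conv_output = grid_model(img_tensor)      # shape (1, height, width, channels)
grid = mapsToGrid(conv_output[0])         # 2-D grid of activation maps for the first image
plt.imshow(grid.numpy(), cmap='viridis')
plt.show()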
This approach is taken from https://github.com/cyberneuron/RT-CNN-Vis which is a platform for CNN visualization. One may also find it easier to get the code from there directly.