I have already trained a network and I have saved it in the form of mynetwork.model. I want to apply gradcam using my own model and not VGG16 or ResNet etc.
apply_gradcam.py
# import the necessary packages
from Grad_CAM.gradcam import GradCAM
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.applications import VGG16
from tensorflow.keras.preprocessing.image import img_to_array
from tensorflow.keras.preprocessing.image import load_img
from tensorflow.keras.applications import imagenet_utils
from tensorflow.keras.models import load_model
import numpy as np
import argparse
import imutils
import cv2
# construct the argument parser and parse the arguments
ap = argparse.ArgumentParser()
ap.add_argument("-i", "--image", required=True,
help="path to the input image")
ap.add_argument("-m", "--model", type=str, default="vgg",
#choices=("vgg", "resnet"),
help="model to be used")
args = vars(ap.parse_args())
# initialize the model to be VGG16
Model = VGG16
# check to see if we are using ResNet
if args["model"] == "resnet":
Model = ResNet50
# load the pre-trained CNN from disk
print("[INFO] loading model...")
model = Model(weights="imagenet")
# load the original image from disk (in OpenCV format) and then
# resize the image to its target dimensions
orig = cv2.imread(args["image"])
resized = cv2.resize(orig, (224, 224))
# load the input image from disk (in Keras/TensorFlow format) and
# preprocess it
image = load_img(args["image"], target_size=(224, 224))
image = img_to_array(image)
image = np.expand_dims(image, axis=0)
image = imagenet_utils.preprocess_input(image)
# use the network to make predictions on the input image and find
# the class label index with the largest corresponding probability
preds = model.predict(image)
i = np.argmax(preds[0])
# decode the ImageNet predictions to obtain the human-readable label
decoded = imagenet_utils.decode_predictions(preds)
(imagenetID, label, prob) = decoded[0][0]
label = "{}: {:.2f}%".format(label, prob * 100)
print("[INFO] {}".format(label))
# initialize our gradient class activation map and build the heatmap
cam = GradCAM(model, i)
heatmap = cam.compute_heatmap(image)
# resize the resulting heatmap to the original input image dimensions
# and then overlay heatmap on top of the image
heatmap = cv2.resize(heatmap, (orig.shape[1], orig.shape[0]))
(heatmap, output) = cam.overlay_heatmap(heatmap, orig, alpha=0.5)
cv2.rectangle(output, (0, 0), (340, 40), (0, 0, 0), -1)
cv2.putText(output, label, (10, 25), cv2.FONT_HERSHEY_SIMPLEX,
0.8, (255, 255, 255), 2)
# display the original image and resulting heatmap and output image
# to our screen
output = np.vstack([orig, heatmap, output])
output = imutils.resize(output, height=700)
cv2.imshow("Output", output)
cv2.waitKey(0)
gradcam.py
from tensorflow.keras.models import Model
import tensorflow as tf
import numpy as np
import cv2
class GradCAM:
def __init__(self, model, classIdx, layerName=None):
# store the model, the class index used to measure the class
# activation map, and the layer to be used when visualizing
# the class activation map
self.model = model
self.classIdx = classIdx
self.layerName = layerName
# if the layer name is None, attempt to automatically find
# the target output layer
if self.layerName is None:
self.layerName = self.find_target_layer()
def find_target_layer(self):
# attempt to find the final convolutional layer in the network
# by looping over the layers of the network in reverse order
for layer in reversed(self.model.layers):
# check to see if the layer has a 4D output
if len(layer.output_shape) == 4:
return layer.name
# otherwise, we could not find a 4D layer so the GradCAM
# algorithm cannot be applied
raise ValueError("Could not find 4D layer. Cannot apply GradCAM.")
def compute_heatmap(self, image, eps=1e-8):
# construct our gradient model by supplying (1) the inputs
# to our pre-trained model, (2) the output of the (presumably)
# final 4D layer in the network, and (3) the output of the
# softmax activations from the model
gradModel = Model(
inputs=[self.model.inputs],
outputs=[self.model.get_layer(self.layerName).output,
self.model.output])
# record operations for automatic differentiation
with tf.GradientTape() as tape:
# cast the image tensor to a float-32 data type, pass the
# image through the gradient model, and grab the loss
# associated with the specific class index
inputs = tf.cast(image, tf.float32)
(convOutputs, predictions) = gradModel(inputs)
loss = predictions[:, self.classIdx]
# use automatic differentiation to compute the gradients
grads = tape.gradient(loss, convOutputs)
# compute the guided gradients
castConvOutputs = tf.cast(convOutputs > 0, "float32")
castGrads = tf.cast(grads > 0, "float32")
guidedGrads = castConvOutputs * castGrads * grads
# the convolution and guided gradients have a batch dimension
# (which we don't need) so let's grab the volume itself and
# discard the batch
convOutputs = convOutputs[0]
guidedGrads = guidedGrads[0]
# compute the average of the gradient values, and using them
# as weights, compute the ponderation of the filters with
# respect to the weights
weights = tf.reduce_mean(guidedGrads, axis=(0, 1))
cam = tf.reduce_sum(tf.multiply(weights, convOutputs), axis=-1)
# grab the spatial dimensions of the input image and resize
# the output class activation map to match the input image
# dimensions
(w, h) = (image.shape[2], image.shape[1])
heatmap = cv2.resize(cam.numpy(), (w, h))
# normalize the heatmap such that all values lie in the range
# [0, 1], scale the resulting values to the range [0, 255],
# and then convert to an unsigned 8-bit integer
numer = heatmap - np.min(heatmap)
denom = (heatmap.max() - heatmap.min()) + eps
heatmap = numer / denom
heatmap = (heatmap * 255).astype("uint8")
# return the resulting heatmap to the calling function
return heatmap
def overlay_heatmap(self, heatmap, image, alpha=0.5,
colormap=cv2.COLORMAP_VIRIDIS):
# apply the supplied color map to the heatmap and then
# overlay the heatmap on the input image
heatmap = cv2.applyColorMap(heatmap, colormap)
output = cv2.addWeighted(image, alpha, heatmap, 1 - alpha, 0)
# return a 2-tuple of the color mapped heatmap and the output,
# overlaid image
return (heatmap, output)
As you can see in apply_gradcam.py, the VGG16 or ResNet pretrained models are used. I want to perform gradcam by using my own trained model. For this reason I commented these lines:
# initialize the model to be VGG16
Model = VGG16
# check to see if we are using ResNet
if args["model"] == "resnet":
Model = ResNet50
# load the pre-trained CNN from disk
print("[INFO] loading model...")
model = Model(weights="imagenet")
and I used
model = load_model(args["model"])
in order to use my own model. Then I executed:
python apply_gradcam.py --image /home/antonis/IM0001.jpeg --model /home/antonis/mynetwork.model
However, I get the following error:
ValueError: `decode_predictions` expects a batch of predictions (i.e.
a 2D array of shape (samples, 1000)). Found array with shape: (1, 3)
which is expected as the model outputs the ImageNet classes (1000-dimensional) while my model returns predictions over 2 classes.
I wonder how to fix this and apply gradcam using my own model.
One thing I don't get is if you've your own classifier (2) why then use imagenet_utils.decode_predictions? I'm not sure if my following answer will satisfy you or not. But here are some pointer.
DataSet
import tensorflow as tf
import numpy as np
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.cifar10.load_data()
# train set / data
x_train = x_train.astype('float32') / 255
# train set / target
y_train = tf.keras.utils.to_categorical(y_train , num_classes=10)
# validation set / data
x_test = x_test.astype('float32') / 255
# validation set / target
y_test = tf.keras.utils.to_categorical(y_test, num_classes=10)
print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)
# (50000, 32, 32, 3) (50000, 10)
# (10000, 32, 32, 3) (10000, 10
Model
input = tf.keras.Input(shape=(32,32,3))
efnet = tf.keras.applications.EfficientNetB0(weights='imagenet',
include_top = False,
input_tensor = input)
# Now that we apply global max pooling.
gap = tf.keras.layers.GlobalMaxPooling2D()(efnet.output)
# Finally, we add a classification layer.
output = tf.keras.layers.Dense(10, activation='softmax')(gap)
# bind all
func_model = tf.keras.Model(efnet.input, output)
Compile and Run
func_model.compile(
loss = tf.keras.losses.CategoricalCrossentropy(),
metrics = tf.keras.metrics.CategoricalAccuracy(),
optimizer = tf.keras.optimizers.Adam())
# fit
func_model.fit(x_train, y_train, batch_size=128, epochs=15, verbose = 2)
Epoch 14/15
391/391 - 13s - loss: 0.1479 - categorical_accuracy: 0.9491
Epoch 15/15
391/391 - 13s - loss: 0.1505 - categorical_accuracy: 0.9481
Grad CAM
Same as your set up.
from tensorflow.keras.models import Model
import tensorflow as tf
import numpy as np
import cv2
class GradCAM:
def __init__(self, model, classIdx, layerName=None):
# store the model, the class index used to measure the class
# activation map, and the layer to be used when visualizing
# the class activation map
self.model = model
self.classIdx = classIdx
self.layerName = layerName
# if the layer name is None, attempt to automatically find
# the target output layer
if self.layerName is None:
self.layerName = self.find_target_layer()
def find_target_layer(self):
# attempt to find the final convolutional layer in the network
# by looping over the layers of the network in reverse order
for layer in reversed(self.model.layers):
# check to see if the layer has a 4D output
if len(layer.output_shape) == 4:
return layer.name
# otherwise, we could not find a 4D layer so the GradCAM
# algorithm cannot be applied
raise ValueError("Could not find 4D layer. Cannot apply GradCAM.")
def compute_heatmap(self, image, eps=1e-8):
# construct our gradient model by supplying (1) the inputs
# to our pre-trained model, (2) the output of the (presumably)
# final 4D layer in the network, and (3) the output of the
# softmax activations from the model
gradModel = Model(
inputs=[self.model.inputs],
outputs=[self.model.get_layer(self.layerName).output, self.model.output])
# record operations for automatic differentiation
with tf.GradientTape() as tape:
# cast the image tensor to a float-32 data type, pass the
# image through the gradient model, and grab the loss
# associated with the specific class index
inputs = tf.cast(image, tf.float32)
(convOutputs, predictions) = gradModel(inputs)
loss = predictions[:, tf.argmax(predictions[0])]
# use automatic differentiation to compute the gradients
grads = tape.gradient(loss, convOutputs)
# compute the guided gradients
castConvOutputs = tf.cast(convOutputs > 0, "float32")
castGrads = tf.cast(grads > 0, "float32")
guidedGrads = castConvOutputs * castGrads * grads
# the convolution and guided gradients have a batch dimension
# (which we don't need) so let's grab the volume itself and
# discard the batch
convOutputs = convOutputs[0]
guidedGrads = guidedGrads[0]
# compute the average of the gradient values, and using them
# as weights, compute the ponderation of the filters with
# respect to the weights
weights = tf.reduce_mean(guidedGrads, axis=(0, 1))
cam = tf.reduce_sum(tf.multiply(weights, convOutputs), axis=-1)
# grab the spatial dimensions of the input image and resize
# the output class activation map to match the input image
# dimensions
(w, h) = (image.shape[2], image.shape[1])
heatmap = cv2.resize(cam.numpy(), (w, h))
# normalize the heatmap such that all values lie in the range
# [0, 1], scale the resulting values to the range [0, 255],
# and then convert to an unsigned 8-bit integer
numer = heatmap - np.min(heatmap)
denom = (heatmap.max() - heatmap.min()) + eps
heatmap = numer / denom
heatmap = (heatmap * 255).astype("uint8")
# return the resulting heatmap to the calling function
return heatmap
def overlay_heatmap(self, heatmap, image, alpha=0.5,
colormap=cv2.COLORMAP_VIRIDIS):
# apply the supplied color map to the heatmap and then
# overlay the heatmap on the input image
heatmap = cv2.applyColorMap(heatmap, colormap)
output = cv2.addWeighted(image, alpha, heatmap, 1 - alpha, 0)
# return a 2-tuple of the color mapped heatmap and the output,
# overlaid image
return (heatmap, output)
Prediction
image = cv2.imread('/content/dog.jpg')
image = cv2.resize(image, (32, 32))
image = image.astype('float32') / 255
image = np.expand_dims(image, axis=0)
preds = func_model.predict(image)
i = np.argmax(preds[0])
To get the layer's name of the model
for idx in range(len(func_model.layers)):
print(func_model.get_layer(index = idx).name)
# we picked `block5c_project_con` layer
Passing to GradCAM class
icam = GradCAM(func_model, i, 'block5c_project_conv')
heatmap = icam.compute_heatmap(image)
heatmap = cv2.resize(heatmap, (32, 32))
image = cv2.imread('/content/dog.jpg')
image = cv2.resize(image, (32, 32))
print(heatmap.shape, image.shape)
(heatmap, output) = icam.overlay_heatmap(heatmap, image, alpha=0.5)
Visualization
fig, ax = plt.subplots(1, 3)
ax[0].imshow(heatmap)
ax[1].imshow(image)
ax[2].imshow(output)
Ref. Grad-CAM class activation visualization
Related
I am using a ResNet50 as base model to predict multiple label in an image and sum up the respective values of the labels.
reading the data:
#read the data
data_path = '/content/drive/MyDrive/Notifyer-dataset/dataset'
def load_dataset(folder):
X = [] # create an empty list to store the images
y = [] # create an empty list to store the labels
# get a list of all the files in the folder
filenames = os.listdir(folder)
# iterate over the files
for filename in filenames:
# get the label from the filename
label = filename.split('_')[0]
# open the image file and convert it to a NumPy array
image = Image.open(os.path.join(folder, filename))
image = image.resize((200, 200)) # resize the image to 200x200
image = image.convert('RGB') # convert the image to RGB
image = np.array(image) / 255 # normalize the pixel values
image = image.reshape(-1, 200, 200, 3) # reshape to (batch_size, height, width, channels)
# append the image and label to the list
X.append(image)
y.append(label)
# convert the lists to NumPy arrays
X = np.array(X)
y = np.array(y)
#preprocessing
X = X.reshape(-1, 200, 200, 3) # reshape arrays to 200x200 images with 1 channel
X = X / 255.0 # normalize pixel values
#one hot encoding
num_classes = len(np.unique(y))
y = to_categorical(y, num_classes)
return X, y,num_classes
X, y, num_classes = load_dataset(data_path)
building the model:
def build_r_cnn_model(num_classes):
"""
Build a region-based CNN model.
Parameters:
num_classes (int): number of classes to classify
Returns:
Model: the R-CNN model
"""
# load the ResNet50 model pre-trained on ImageNet
base_model = ResNet50(weights='imagenet', include_top=False, input_shape=(200, 200, 3))
# freeze the base model layers
for layer in base_model.layers:
layer.trainable = False
# add a global average pooling layer
x = base_model.output
x = tf.keras.layers.GlobalAveragePooling2D()(x)
# add a fully-connected layer
x = tf.keras.layers.Dense(1024, activation='relu')(x)
# add a dropout layer
x = tf.keras.layers.Dropout(0.5)(x)
# add a classification layer
predictions = tf.keras.layers.Dense(num_classes, activation='softmax')(x)
#build the model
model = Model(inputs=base_model.input, outputs=predictions)
return model
compiling the model:
# build and compile the model
model = build_r_cnn_model(num_classes)
model.compile(loss='binary_crossentropy', optimizer='sgd', metrics=['accuracy'])
training the model:
#train
history = model.fit(X_train, y_train, epochs=10, batch_size=128, validation_data=(X_val, y_val))
function to sum up all label values in the image:
#function to calculate total sum of value of predicted labels
def predict_total_sum(model, image):
y_pred = model.predict(image) # classify the image
# define a lookup table to map class indices to values
value_lookup = {
0: 1, # class 0 corresponds to value 1
1: 2, # class 1 corresponds to value 2
}
total_sum = 0
for prediction in y_pred:
# get the class index with the highest predicted probability
class_index = np.argmax(prediction)
print(class_index)
# add the value of the detected denomination to the total sum
total_sum += value_lookup[class_index]
return total_sum
It gives value 1 or 2 for every image for each model compilation which means it is only predicting only one label even if the image has multiple objects of both the labels.
My dataset is small and every image in it contains object of one of the label, do I need to diversify my dataset to make the model identify both labels in an image or is there something wrong with the model architecture? I have also tried to build a CNN model from scratch but it is giving the same result...
I think the output of model.predict has shape [1, num_of_classes] (you can verify it by printing it's shape once). Hence when you are looping on y_pred then you basically iterate only once and add one of the class index to the total_sum. Even if the shape was [num_of_classes], then also I think that this is not how you should try multi-class classification. Would prefer you to read more about how multiclass classification is done.
You can take help from this link: https://www.kaggle.com/code/prateek0x/multiclass-image-classification-using-keras
Please help me to understand why predict doesn't work correct, when train-test accuracy is 0.97.
Is it from data, or network should be changed?
Input data are 32500 (5 gesture with 6500 images) RGB images with 640*480 pixels.
dataset
Images loaded and resized IMG_WIDTH = 100, IMG_HEIGHT = 77.
Here's the function which load, resized images and return np.array.
def load_data(data_dir):
"""
Load image data from directory `data_dir`.
Assume `data_dir` has one directory named after each category, numbered
0 through NUM_CATEGORIES - 1. Inside each category directory will be some
number of image files.
Return tuple `(images, labels)`. `images` should be a list of all
of the images in the data directory, where each image is formatted as a
numpy ndarray with dimensions IMG_WIDTH x IMG_HEIGHT x 3. `labels` should
be a list of integer labels, representing the categories for each of the
corresponding `images`.
"""
images = []
labels = []
for dir in range(0, NUM_CATEGORIES):
# get path for each gesture
d = os.path.join(data_dir, f"{str(dir)}")
# os.listdir(d) return the list of all names of images in that folder
for image_path in os.listdir(d):
# get the full path of specific image
full_path = os.path.join(data_dir, f"{str(dir)}", image_path)
# Returns an image that is loaded from the specified file
image = cv2.imread(full_path)
# get dimension for each image
dim = (100, 77)
# resized the image
image_resized = cv2.resize(image, dim, interpolation = cv2.INTER_AREA)
# add image and their directory name to images and labels list
images.append(image_resized)
labels.append(dir)
return images, labels
Here's my model.
def get_model():
"""
Returns a compiled convolutional neural network model. Assume that the
`input_shape` of the first layer is `(IMG_WIDTH=100, IMG_HEIGHT=77, 3)`.
The output layer should have `NUM_GESTURE = 5` units, one for each category.
"""
# Create a convolutional neural network
model = tf.keras.models.Sequential(
[
# Convolutional layer. Learn 32 filters using a 3x3 kernel
tf.keras.layers.Conv2D(
32, (5, 5), activation='relu', input_shape=(IMG_WIDTH, IMG_HEIGHT, 3)
),
# Max-pooling layer, using 2x2 pool size
tf.keras.layers.MaxPool2D(pool_size=(2, 2)),
tf.keras.layers.Conv2D(
64, (3, 3), activation='relu', input_shape=(IMG_WIDTH, IMG_HEIGHT, 3)
),
# Max-pooling layer, using 2x2 pool size
tf.keras.layers.MaxPool2D(pool_size=(2, 2)),
tf.keras.layers.Conv2D(
64, (3, 3), activation='relu', input_shape=((IMG_WIDTH), (IMG_HEIGHT), 3)
),
tf.keras.layers.MaxPool2D(pool_size=(2, 2)),
tf.keras.layers.Conv2D(
128, (3, 3), activation='relu', input_shape=((IMG_WIDTH), (IMG_HEIGHT), 3)
),
tf.keras.layers.MaxPool2D(pool_size=(2, 2)),
tf.keras.layers.Flatten(),
# Add a hidden layer with dropout
tf.keras.layers.Dense(128, activation='relu'),
tf.keras.layers.Dropout(0.3),
# Add an output layer with output units for all 5 gestures
tf.keras.layers.Dense(5, activation='softmax')
])
# Train neural network
model.compile(
optimizer='adam',
loss="categorical_crossentropy",
metrics=["accuracy"]
)
return model
labels = tf.keras.utils.to_categorical(labels)
x_train, x_test, y_train, y_test = train_test_split(
np.array(images), np.array(labels), test_size=0.4)
model = get_model()
model.fit(x_train, y_train, batch_size=64, epochs=10)
model.evaluate(x_test, y_test, verbose=2)
The result is 0.97.
fit result
From video I save the image and want to real time predict the hand gesture.
GESTURE = {0:"ok", 1:"down", 2:"up", 3:"palm", 4:"l"}
video = cv2.VideoCapture(0)
while True:
# Capture the video frame
ret, img = video.read()
# Display the resulting frame
# to flip the video with 180 degree
image = cv2.flip(img, 1)
# save image for prediction
image = cv2.imwrite('Frame'+str(0)+'.jpg', image)
image_addr = "Frame0.jpg"
image = cv2.imread(image_addr)
dim = (100,77)
image = tf.keras.preprocessing.image.load_img(image_addr, target_size=dim)
# Converts a PIL Image instance to a Numpy array. Return a 3D Numpy array.
input_arr = tf.keras.preprocessing.image.img_to_array(image)
# Convert single image to a batch.
input_arr = np.array([input_arr])
input_arr = input_arr.astype('float32')/255
# Generates output predictions for the input samples. Return Numpy array(s) of predictions.
predictions = model.predict(input_arr)
# Return the index_array of the maximum values along an axis.
pre_class = np.argmax(predictions, axis=-1)
# for writing in the video
text = GESTURE[pre_class[0]]
font = cv2.FONT_HERSHEY_SIMPLEX
image = cv2.flip(img, 1)
cv2.putText(image,
text,
(50, 50),
font, 2,
(0, 0, 0),
2,
cv2.LINE_4)
cv2.imshow('video', image)
# the 'q' button is set as the
# quitting button you may use any
# desired button of your choice
k = cv2.waitKey(1)
if k == ord('q'):
break
video.release()
cv2.destroyAllWindows()
github link
I am no expert, but typically when you perform well on the training data and the testing data "The result is 0.97", but perform poorly on new end-user data, it is because there is a data-mismatch (although it could possibly be overfitting).
As in, the data you trained and tested on is so different (pixel values, probability distribution of pixels, or unseen differences that are noticable to the model) that the model could not generalize to it and performed badly.
It is good practice to use the same data you would use in production/final-product as a test set. Andrew Ng uses this dataset split(this is applicable if you have enough data):
From the training data:
Training Set
Train-Dev (same as Validation, I think) Set
From the end-product data:
Development Set
Test Set
You can check this post for more information regarding why: https://cs230.stanford.edu/blog/split/
Your preprocess in training step mismatch with preprocess in predict step:
input_arr = input_arr.astype('float32')/255
I would like to know how to get the output of each layer of a pre-trained CNN Keras model. What I am working on is to get the intermediate outputs of each layer in the model associated with a specific image I am providing to the model. Here is what I did:
model = load_model('model.h5')
img = Image.open('img.jpg')
img_array = np.array (img)
img_array = img_array/255
img_array = img_array.reshape(-1,512,512,1)
pred = model.predict(img_array)
I am just so confused about what to do next to print the output of each layer in such a case!
The outputs from layers could be collected by following the steps below:
from keras import backend as K
model = load_model('model.h5')
inp = model.input # input placeholder
out = [layer.output for layer in model.layers] # all layer outputs
get_outputs = K.function([inp, K.learning_phase()], out)
img = load_img('img.jpg')
x = img_to_array(img)
x = x.reshape((1,) + x.shape)
x /= 255.
layer_outs = get_outputs([x, 1.])
print(layer_outs)
The intermediate representation of the input image img.jpg could be replicated by running the following code snippet:
from tensorflow.keras.preprocessing.image import img_to_array, load_img
model = load_model('model.h5')
# Define a new Model that will take an image as input, and will output
# intermediate representations for all layers except the first layer.
layer_outputs = [layer.output for layer in model.layers[1:]]
visual_model = tf.keras.models.Model(inputs = model.input, outputs = layer_outputs)
# Read your image
img = load_img('img.jpg')
x = img_to_array(img)
x = x.reshape((1,) + x.shape) # add one extra dimension to the front
x /= 255. # rescale by 1/255.
# run your image through the network; make a prediction
feature_maps = visual_model.predict(x)
# Plotting intermediate representations for your image
# Collect the names of each layer except the first one for plotting
layer_names = [layer.name for layer in model.layers[1:]]
# Plotting intermediate representation images layer by layer
for layer_name, feature_map in zip(layer_names, feature_maps):
if len(feature_map.shape) == 4: # skip fully connected layers
# number of features in an individual feature map
n_features = feature_map.shape[-1]
# The feature map is in shape of (1, size, size, n_features)
size = feature_map.shape[1]
# Tile our feature images in matrix `display_grid
display_grid = np.zeros((size, size * n_features))
# Fill out the matrix by looping over all the feature images of your image
for i in range(n_features):
# Postprocess each feature of the layer to make it pleasible to your eyes
x = feature_map[0, :, :, i]
x -= x.mean()
x /= x.std()
x *= 64
x += 128
x = np.clip(x, 0, 255).astype('uint8')
# We'll tile each filter into this big horizontal grid
display_grid[:, i * size : (i + 1) * size] = x
# Display the grid
scale = 20. / n_features
plt.figure(figsize=(scale * n_features, scale))
plt.title(layer_name)
plt.grid(False)
plt.imshow(display_grid, aspect='auto', cmap='viridis')
I get an error using gradient visualization with transfer learning in TF 2.0. The gradient visualization works on a model that does not use transfer learning.
When I run my code I get the error:
assert str(id(x)) in tensor_dict, 'Could not compute output ' + str(x)
AssertionError: Could not compute output Tensor("block5_conv3/Identity:0", shape=(None, 14, 14, 512), dtype=float32)
When I run the code below it errors. I think there's an issue with the naming conventions or connecting inputs and outputs from the base model, vgg16, to the layers I'm adding. Really appreciate your help!
"""
Broken example when grad_model is created.
"""
!pip uninstall tensorflow
!pip install tensorflow==2.0.0
import cv2
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers
import matplotlib.pyplot as plt
IMAGE_PATH = '/content/cat.3.jpg'
LAYER_NAME = 'block5_conv3'
model_layer = 'vgg16'
CAT_CLASS_INDEX = 281
imsize = (224,224,3)
img = tf.keras.preprocessing.image.load_img(IMAGE_PATH, target_size=(224, 224))
plt.figure()
plt.imshow(img)
img = tf.io.read_file(IMAGE_PATH)
img = tf.image.decode_jpeg(img)
img = tf.cast(img, dtype=tf.float32)
# img = tf.keras.preprocessing.image.img_to_array(img)
img = tf.image.resize(img, (224,224))
img = tf.reshape(img, (1, 224,224,3))
input = layers.Input(shape=(imsize[0], imsize[1], imsize[2]))
base_model = tf.keras.applications.VGG16(include_top=False, weights='imagenet',
input_shape=(imsize[0], imsize[1], imsize[2]))
# base_model.trainable = False
flat = layers.Flatten()
dropped = layers.Dropout(0.5)
global_average_layer = tf.keras.layers.GlobalAveragePooling2D()
fc1 = layers.Dense(16, activation='relu', name='dense_1')
fc2 = layers.Dense(16, activation='relu', name='dense_2')
fc3 = layers.Dense(128, activation='relu', name='dense_3')
prediction = layers.Dense(2, activation='softmax', name='output')
for layr in base_model.layers:
if ('block5' in layr.name):
layr.trainable = True
else:
layr.trainable = False
x = base_model(input)
x = global_average_layer(x)
x = fc1(x)
x = fc2(x)
x = prediction(x)
model = tf.keras.models.Model(inputs = input, outputs = x)
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
loss='binary_crossentropy',
metrics=['accuracy'])
This portion of the code is where the error lies. I'm not sure what is the correct way to label inputs and outputs.
# Create a graph that outputs target convolution and output
grad_model = tf.keras.models.Model(inputs = [model.input, model.get_layer(model_layer).input],
outputs=[model.get_layer(model_layer).get_layer(LAYER_NAME).output,
model.output])
print(model.get_layer(model_layer).get_layer(LAYER_NAME).output)
# Get the score for target class
# Get the score for target class
with tf.GradientTape() as tape:
conv_outputs, predictions = grad_model(img)
loss = predictions[:, 1]
The section below is for plotting a heatmap of gradcam.
print('Prediction shape:', predictions.get_shape())
# Extract filters and gradients
output = conv_outputs[0]
grads = tape.gradient(loss, conv_outputs)[0]
# Apply guided backpropagation
gate_f = tf.cast(output > 0, 'float32')
gate_r = tf.cast(grads > 0, 'float32')
guided_grads = gate_f * gate_r * grads
# Average gradients spatially
weights = tf.reduce_mean(guided_grads, axis=(0, 1))
# Build a ponderated map of filters according to gradients importance
cam = np.ones(output.shape[0:2], dtype=np.float32)
for index, w in enumerate(weights):
cam += w * output[:, :, index]
# Heatmap visualization
cam = cv2.resize(cam.numpy(), (224, 224))
cam = np.maximum(cam, 0)
heatmap = (cam - cam.min()) / (cam.max() - cam.min())
cam = cv2.applyColorMap(np.uint8(255 * heatmap), cv2.COLORMAP_JET)
output_image = cv2.addWeighted(cv2.cvtColor(img.astype('uint8'), cv2.COLOR_RGB2BGR), 0.5, cam, 1, 0)
plt.figure()
plt.imshow(output_image)
plt.show()
I also asked this to the tensorflow team on github at https://github.com/tensorflow/tensorflow/issues/37680.
I figured it out. If you set up the model extending the vgg16 base model with your own layers, rather than inserting the base model into a new model like a layer, then it works.
First set up the model and be sure to declare the input_tensor.
inp = layers.Input(shape=(imsize[0], imsize[1], imsize[2]))
base_model = tf.keras.applications.VGG16(include_top=False, weights='imagenet', input_tensor=inp,
input_shape=(imsize[0], imsize[1], imsize[2]))
This way we don't have to include a line like x=base_model(inp) to show what input we want to put in. That's already included in tf.keras.applications.VGG16(...).
Instead of putting this vgg16 base model inside another model, it's easier to do gradcam by adding layers to the base model itself. I grab the output of the last layer of VGG16 (with the top removed), which is the pooling layer.
block5_pool = base_model.get_layer('block5_pool')
x = global_average_layer(block5_pool.output)
x = fc1(x)
x = prediction(x)
model = tf.keras.models.Model(inputs = inp, outputs = x)
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
loss='binary_crossentropy',
metrics=['accuracy'])
Now, I grab the layer for visualization, LAYER_NAME='block5_conv3'.
# Create a graph that outputs target convolution and output
grad_model = tf.keras.models.Model(inputs = [model.input],
outputs=[model.output, model.get_layer(LAYER_NAME).output])
print(model.get_layer(LAYER_NAME).output)
# Get the score for target class
# Get the score for target class
with tf.GradientTape() as tape:
predictions, conv_outputs = grad_model(img)
loss = predictions[:, 1]
print('Prediction shape:', predictions.get_shape())
# Extract filters and gradients
output = conv_outputs[0]
grads = tape.gradient(loss, conv_outputs)[0]
We (I plus a number of team members developing a project) found a similar problem with a code implementing Grad-CAM that we found in a tutorial.
That code didn't work with a model consisting of the base model of VGG19 plus a few extra layers added on top of it. The problem was that the VGG19 base model was inserted as a "layer" inside our model, and apparently the GradCAM code didn't know how to deal with it - we were getting a "Graph disconnected..." error. Then after some debugging (carried out by another team member, not me) we managed to modify the original code to make it work for this kind of model that contains another model inside it. The idea is to add the inner model as an extra argument of the class GradCAM. Since this may be helpful to others I am including the modified code below (we also renamed the GradCAM class as My_GradCAM).
class My_GradCAM:
def __init__(self, model, classIdx, inner_model=None, layerName=None):
self.model = model
self.classIdx = classIdx
self.inner_model = inner_model
if self.inner_model == None:
self.inner_model = model
self.layerName = layerName
[...]
gradModel = tensorflow.keras.models.Model(inputs=[self.inner_model.inputs],
outputs=[self.inner_model.get_layer(self.layerName).output,
self.inner_model.output])
Then the class can be instantiated by adding the inner model as the extra argument, e.g.:
cam = My_GradCAM(model, None, inner_model=model.get_layer("vgg19"), layerName="block5_pool")
I hope this helps.
Edit: Credit to Mirtha Lucas for doing the debugging and finding the solution.
After a lot of struggle, I condense the way to draw the heat map when you are using transfer learning. Here is the keras official tutorial
The issue I encounter is that when I'm trying to draw the heat map
from my model, the densenet can be only seen as functional layer in my
model. So the make_gradcam_heatmap can not figure out the layer that
inside functional layer. As the 5th layer shows.
Therefore, to simulate the Keras official document, I need to only use the densenet as the model for visualization. Here is the step
Only Take out the model from your model
dense_model = dense_model.get_layer('densenet121')
Copy the weight from dense model to your new initiated model
inputs = tf.keras.Input(shape=(224, 224, 3))
model = model_builder(weights="imagenet", include_top=True, input_tensor=inputs)
for layer, dense_layer in zip(model.layers[1:], dense_model.layers[1:]):
layer.set_weights(dense_layer.get_weights())
relu = model.get_layer('relu')
x = tf.keras.layers.GlobalAveragePooling2D()(relu.output)
outputs = tf.keras.layers.Dense(5)(x)
model = tf.keras.models.Model(inputs = inputs, outputs = outputs)
Draw the heat map
preprocess_input = keras.applications.densenet.preprocess_input
img_array = preprocess_input(get_img_array(img_path, size=(224, 224)))
heatmap = make_gradcam_heatmap(img_array, model, 'bn')
plt.matshow(heatmap)
plt.show()
get_img_array, make_gradcam_heatmap and save_and_display_gradcam are kept in still. Follow the keras tutorial then you are good to go.
I have confused ideas on how to calculate the validation loss in my model. I've a problem of classifying the pixels of the image.
So, after training, in validation statement, i can get my prediction image with this code:
output_image = sess.run(network,feed_dict={net_input:input_image})
But, if i run:
cost , output_image = sess.run([loss,network],feed_dict={net_input: input_image, net_output: output_image})
I get this exception: Cannot feed value of shape (1, 480, 480, 3) for Tensor 'Placeholder_1:0', which has shape '(?, ?, ?, 2)'
I don't understand why, the shape of input and output images are the same (1, 480, 480, 3).
What is the way to get the cost of validation image?
EDIT:
This is my code:
net_input = tf.placeholder(tf.float32,shape=[None,None,None,3])
net_output = tf.placeholder(tf.float32,shape=[None,None,None,num_classes])
network, _ = model_builder.build_model(args.model, net_input=net_input, num_classes=num_classes, crop_width=args.crop_width, crop_height=args.crop_height)
sess.run(tf.global_variables_initializer())
loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits=network, labels=net_output))
opt = tf.train.RMSPropOptimizer(learning_rate=0.0001, decay=0.995).minimize(loss, var_list=[var for var in tf.trainable_variables()])
saver=tf.train.Saver(max_to_keep=1000)
sess.run(tf.global_variables_initializer())
# Set random seed to make sure models are validated on the same validation images.
# So you can compare the results of different models more intuitively.
val_indices=random.sample(range(0,len(val_input_names)),num_vals)
# Do the training here
for epoch in range(args.epoch_start_i, args.num_epochs):
current_losses = []
# Equivalent to shuffling
id_list = np.random.permutation( len(train_input_names) )
num_iters = int(np.floor(len(id_list) / args.batch_size))
#for i in range(num_iters):
description_train = '[i]Train Epoch {:>2}/{}'.format(epoch + 1, args.num_epochs)
for i in tqdm(range(num_iters), desc=description_train, unit='batch'):
input_image_batch = []
output_image_batch = []
# Collect a batch of images
for j in range(args.batch_size):
index = i*args.batch_size + j
id = id_list[index]
input_image = utils.load_image(train_input_names[id])
output_image = utils.load_image(train_output_names[id])
with tf.device('/cpu:0'):
input_image, output_image = data_augmentation(input_image, output_image)
# Prep the data. Make sure the labels are in one-hot format
input_image = np.float32(input_image) / 255.0
output_image = np.float32(helpers.one_hot_it(label=output_image, label_values=label_values))
input_image_batch.append(np.expand_dims(input_image, axis=0))
output_image_batch.append(np.expand_dims(output_image, axis=0))
if args.batch_size == 1:
input_image_batch = input_image_batch[0]
output_image_batch = output_image_batch[0]
else:
input_image_batch = np.squeeze(np.stack(input_image_batch, axis=1))
output_image_batch = np.squeeze(np.stack(output_image_batch, axis=1))
# Do the training
_,current=sess.run([opt,loss],feed_dict={net_input:input_image_batch,net_output:output_image_batch})
# Do the validation on a small set of validation images
description_val = '[i]Validazione {:>2}/{}'.format(epoch + 1, args.num_epochs)
loss_val = [];
for ind in tqdm(val_indices, total=len(val_indices), desc=description_val, unit='img'):
input_image = np.expand_dims(np.float32(utils.load_image( val_input_names[ind] )[:args.crop_height, :args.crop_width]),axis=0)/255.0
output_image = np.expand_dims(np.float32(utils.load_image(val_output_names[ind])[:args.crop_height, :args.crop_width]), axis=0) / 255.0
#Do the validation
output_image =sess.run(network,feed_dict{net_input:input_image})
Looks like the number of classes (num_classes) is two in your case. So output_image you are feeding to sess.run() as net_output should have only two channels. But in your case, you have three channels and that's why you are getting this error.
Use helpers.one_hot_it() for getting a binary mask of your output image. You will have to expand dimension using np.expand_dim() to make it a batch of one image since the network accepts one batch at a time, not one image at a time.
You can make use of the following code snippet to get validation loss:
# Do the validation on a small set of validation images
description_val = '[i]Validazione {:>2}/{}'.format(epoch + 1, args.num_epochs)
loss_val = [];
for ind in tqdm(val_indices, total=len(val_indices), desc=description_val, unit='img'):
input_image = np.expand_dims(np.float32(utils.load_image( val_input_names[ind] )[:args.crop_height, :args.crop_width]),axis=0)/255.0
output_image = utils.load_image(val_output_names[ind])[:args.crop_height, :args.crop_width]
output_one_hot = helpers.one_hot_it(output_image, label_values)
#Do the validation
output_image, loss_value = sess.run([network, loss], feed_dict={net_input: input_image, net_output: np.expand_dims(output_one_hot, axis=0)})
The problem lies in net_output = tf.placeholder(tf.float32,shape=[None,None,None,num_classes]). If this is the known classification of the image then it should be of the like (if those are already logits):
net_output = tf.placeholder(tf.float32,shape=[None, num_classes])
To train your network you would do:
_, eval_loss = sess.run([opt, loss], feed_dict={net_input: input_image_batch, net_output:output_image_batch}})`
because you want to run the optimizer you have opt and because you want to monitor the loss you do loss.
To get the classification of an image you would do (after training):
classified_images = sess.run([network], feed_dict={net_input: input_image_batch})