Input image size of Faster-RCNN model in Pytorch - python

I'm trying to implement a Faster R-CNN model with PyTorch.
In the model's structure, the first element is the transform.
from torchvision.models.detection import fasterrcnn_resnet50_fpn
model = fasterrcnn_resnet50_fpn(pretrained=True)
Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
Resize(min_size=(800,), max_size=1333, mode='bilinear')
When images pass through Resize(), they come out as (800, w) or (h, 1333), in (height, width) order, depending on the aspect ratio of the image.
for i in range(2):
_, image, target = testset.__getitem__(i)
img = image.unsqueeze(0)
output, _ = model.transform(img)
Before Transform : torch.Size([512, 640])
After Transform : [(800, 1000)]
Before Transform : torch.Size([315, 640])
After Transform : [(656, 1333)]
My question is: how are those resized outputs computed, and why is this method used? I can't find this information in the paper, and I can't understand the source code of the transform in fasterrcnn_resnet50_fpn.
Sorry for my English

GeneralizedRCNN data transform:
performs the data transformation on the inputs to feed into the model
min_size: minimum size of the image to be rescaled before feeding it to the backbone.
max_size: maximum size of the image to be rescaled before feeding it to the backbone
I also couldn't find out why the defaults were chosen as min 800 and max 1333; I didn't find anything in the research paper either.
but as the 1st layer is a Conv layer, the input to the network is fixed size, I apply many other augmentations such as mirror, random cropping etc, inspired by SSD based networks. Hence I would prefer to do all augmentation in a separate place once instead of twice.
I would assume the model should work the best during validation using images with shapes and other properties as close as possible to the training data.
though you can experiment with custom min_size and max_size...
# Build a Faster R-CNN with custom resize bounds and run one training-mode
# forward pass on random data.
import torch
from torchvision.models.detection import fasterrcnn_resnet50_fpn
# Absolute form of the original relative import; not used directly below.
from torchvision.models.detection.transform import GeneralizedRCNNTransform  # noqa: F401

min_size = 900  # changed from default (800)
max_size = 1433  # changed from default (1333)
image_mean = [0.485, 0.456, 0.406]
image_std = [0.229, 0.224, 0.225]
# The transform settings must be passed by keyword: positional arguments
# after `pretrained=True` are a SyntaxError.
model = fasterrcnn_resnet50_fpn(
    pretrained=True,
    min_size=min_size,
    max_size=max_size,
    image_mean=image_mean,
    image_std=image_std,
)
# batch of 4 images, 11 boxes per image
images, boxes = torch.rand(4, 3, 600, 1200), torch.rand(4, 11, 4)
# Make every random box a valid (x1, y1, x2, y2) with x2 >= x1 and y2 >= y1.
boxes[:, :, 2:4] = boxes[:, :, 0:2] + boxes[:, :, 2:4]
labels = torch.randint(1, 91, (4, 11))
images = list(images)
# One target dict per image; without the append the model gets no targets.
targets = []
for image_boxes, image_labels in zip(boxes, labels):
    targets.append({'boxes': image_boxes, 'labels': image_labels})
output = model(images, targets)
or you can completely write your transforms
from torchvision.models.detection import fasterrcnn_resnet50_fpn
from torchvision.transforms import transforms as T
# The constructor is `fasterrcnn_resnet50_fpn` (FPN), not `..._rpn`.
model = fasterrcnn_resnet50_fpn()
# Placeholder: fill the list with your own transforms (check
# torchvision.transforms for more).
# NOTE(review): the detection model appears to call its transform with
# (images, targets); confirm a plain Compose is compatible before relying
# on this.
model.transform = T.Compose([])
Hope this helps.


Getting different results after converting a model to from pytorch to ONNX

I'm converting a GoogLeNet model from PyTorch to ONNX using the following code:
# Export the PyTorch model to an ONNX file, tracing it with `input_batch`.
# Axis 0 of both the input and the output is declared dynamic so the
# exported graph accepts any batch size.
torch.onnx.export(model, # model being run
input_batch, # model input (or a tuple for multiple inputs)
"google-net-onnx-test.onnx", # where to save the model (can be a file or file-like object)
export_params=True, # store the trained parameter weights inside the model file
opset_version=10, # the ONNX opset version to export the model to
do_constant_folding=True, # whether to execute constant folding for optimization
input_names = ['input'], # the model's input names (used as the feed key at inference time)
output_names = ['output'], # the model's output names
dynamic_axes={'input' : {0 : 'batch_size'}, # variable length axes
'output' : {0 : 'batch_size'}})
When I run the model on pytorch for this image:
I get the right results:
Samoyed 0.9378381967544556
Pomeranian 0.00828344002366066
Great Pyrenees 0.005603068508207798
Arctic fox 0.005527767818421125
white wolf 0.004741032607853413
But when I do it with ONNX I get this:
The pre- and post-processing code is different for each case, but I think it should be equivalent.
This is the complete code in Pytorch:
import torch
from PIL import Image
from torchvision import transforms

# Load the pretrained GoogLeNet and switch it to inference mode.
model = torch.hub.load('pytorch/vision:v0.10.0', 'googlenet', pretrained=True)
model.eval()

# NOTE(review): the image path was lost from the original post; this assumes
# the same image the ONNX script uses -- confirm.
input_image = Image.open("/content/dog.jpg")
# Standard ImageNet preprocessing: resize, center crop, scale to [0, 1],
# then normalize per channel by mean and std.
preprocess = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])
input_tensor = preprocess(input_image)
input_batch = input_tensor.unsqueeze(0)  # create a mini-batch as expected by the model

# move the input and model to GPU for speed if available
if torch.cuda.is_available():
    input_batch = input_batch.to('cuda')
    model.to('cuda')

with torch.no_grad():
    output = model(input_batch)
# Tensor of shape 1000, with confidence scores over Imagenet's 1000 classes
# The output has unnormalized scores. To get probabilities, you can run a softmax on it.
probabilities = torch.nn.functional.softmax(output[0], dim=0)

# Read the categories
with open("imagenet_classes.txt", "r") as f:
    categories = [s.strip() for s in f.readlines()]
# Show top categories per image
top5_prob, top5_catid = torch.topk(probabilities, 5)
for i in range(top5_prob.size(0)):
    print(categories[top5_catid[i]], top5_prob[i].item())
And this the code for ONNX
from PIL import Image
import imageio
import onnxruntime as ort
import numpy as np
import matplotlib.pyplot as plt
import numpy as np
from collections import namedtuple
import os
import time
def get_image(path):
    """Using path to image, return the RGB load image."""
    # pilmode='RGB' asks imageio to return a 3-channel RGB array.
    img = imageio.imread(path, pilmode='RGB')
    return img
# Pre-processing function for ImageNet models using numpy
def preprocess(img):
    """
    Preprocessing required on the images for inference with mxnet gluon
    The function takes loaded image and returns processed tensor
    """
    # Resize to 224x224 and work in float32.
    img = np.array(Image.fromarray(img).resize((224, 224))).astype(np.float32)
    # Subtract a per-channel constant (0-255 scale).
    img[:, :, 0] -= 123.68
    img[:, :, 1] -= 116.779
    img[:, :, 2] -= 103.939
    # Reverse the channel order (first and last channels are swapped).
    img[:, :, [0, 1, 2]] = img[:, :, [2, 1, 0]]
    # HWC -> CHW, then add a leading batch axis.
    img = img.transpose((2, 0, 1))
    img = np.expand_dims(img, axis=0)
    return img
def predict(path):
    """Run the ONNX model on the image at `path`.

    Returns a dict mapping the five highest-scoring labels to their raw
    output scores.
    """
    img_batch = preprocess(get_image(path))
    # The feed key must match the input name given at export time.
    outputs = ort_session.run(
        None,
        {"input": img_batch.astype(np.float32)},
    )
    # Class indices ordered from highest to lowest score.
    a = np.argsort(-outputs[0].flatten())
    results = {}
    for i in a[0:5]:
        # NOTE(review): the loop body was lost from the original post; this
        # follows the ONNX Model Zoo example -- confirm.
        results[labels[i]] = float(outputs[0][0][i])
    return results
ort_session = ort.InferenceSession("/content/google-net-onnx-test.onnx")
with open('synset.txt', 'r') as f:
labels = [l.rstrip() for l in f]
image_path = "/content/dog.jpg"
I took the PyTorch code from this tutorial,
and the ONNX code from the ONNX Model Zoo on GitHub.
Following the comments from @jhso, I looked at the normalisation step:
mean=[0.485, 0.456, 0.406]
It seems to me that it is equivalent to:
img[:, :, 0] -= 123.68
img[:, :, 1] -= 116.779
img[:, :, 2] -= 103.939
constant = 256
a,b,c = 123.68/constant, 116.779/constant, 103.939/constant
print (f'{a:.3f} {b:.3f} {c:.3f}')
0.483 0.456 0.406
Regarding the std part, I'm not sure where it happens, or whether it is equivalent to:
img[:,:,[0,1,2]] = img[:,:,[2,1,0]]
img = img.transpose((2, 0, 1))
Also I ran the code again today and got a closer result:
Your preprocessing is wrong. Note that you have a center crop (less important) and a std deviation normalisation step you're not using. You're also seemingly converting from BGR which isn't required when using PIL (it's more of an opencv thing) - happy to be corrected if I'm wrong as I'm going from memory.
preprocess = transforms.Compose([
transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
Your preprocessing stage should look something (ymmv) like this:
# Pre-processing function for ImageNet models using numpy
def preprocess(img):
    """Preprocess a loaded RGB image for an ImageNet model.

    Resizes to 256x256, center-crops to 224x224, scales to [0, 1],
    normalizes each channel by mean and std, and returns a float32 array
    of shape (1, 3, 224, 224).
    """
    img = np.array(Image.fromarray(img).resize((256, 256))).astype(np.float32)
    # center crop
    rm_pad = (256 - 224) // 2
    img = img[rm_pad:-rm_pad, rm_pad:-rm_pad]
    # normalize to 0-1
    img /= 255.
    # normalize by mean + std; float32 constants keep the result float32
    # instead of promoting it to float64
    mean = np.array([0.485, 0.456, 0.406], dtype=np.float32)
    std = np.array([0.229, 0.224, 0.225], dtype=np.float32)
    img = (img - mean) / std
    # No channel swap here: the original author did not think it was needed.
    # HWC -> CHW, then add the batch axis.
    img = img.transpose((2, 0, 1))
    img = np.expand_dims(img, axis=0)
    return img

ImageNet classification challenge: Achieving top-5 error of 0.99472 on test set using VGG11

I recently took an imagenet pre-trained VGG11 network and made predictions on the imagenet test dataset. Upon submitting this file to the evaluation server, I received an email with following text:
Error: 0.99607 (top-5) 0.99898 (top-1)
Per-class error (classes 1-1000):
1 1
1 1
1 1
Does this mean that my top-5 accuracy is 1-0.99607=0.393%? If so then the score is too low.
Could you please point out where I could be going wrong? Here is the code for reference.
P.S.: I have checked that the images are loaded and predicted upon in alphabetical order.
vgg11 = models.vgg11(pretrained=True)"cuda"))
normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225])
test_loader ="test_dataset",
batch_size=32, shuffle=False)
fp = open("predictions.txt", "w")
for a, b in tqdm(test_loader):
preds = vgg11(a.cuda())
_, preds = torch.topk(preds, k=5, dim=1)
preds = preds.cpu().detach().numpy()
for i in range(len(preds)):
fp.write(" ".join(str(j) for j in preds[i])+"\n")
Based on your code, I believe the error is right because of the lack of normalization. I don't have the environment to test on the ImageNet test set, so I made a small example with 4 random cat images from the internet. (Link: image1, image2, image3, image4).
The code test as below:
import torch
from torchvision import models
import numpy as np
import cv2
import os

# Pretrained VGG11 in eval mode, so its dropout layers do not add
# randomness to the predictions.
vgg11 = models.vgg11(pretrained=True)
vgg11.eval()

# ImageNet per-channel statistics used for normalization.
mean = torch.tensor([0.485, 0.456, 0.406])
std = torch.tensor([0.229, 0.224, 0.225])


def read_image(image_path, size=224):
    """Load an image and return a normalized (1, 3, size, size) tensor.

    Raises:
        FileNotFoundError: if OpenCV cannot read `image_path`.
    """
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread returns None instead of raising on a bad path.
        raise FileNotFoundError(f"Could not read image: {image_path}")
    image = cv2.resize(image, (size, size))
    # OpenCV loads BGR; convert to RGB.
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB).astype(np.float32)
    # HWC -> CHW, add batch axis, scale to [0, 1].
    image = torch.tensor(image).permute(2, 0, 1).unsqueeze(0) / 255.
    image = (image - mean[None, :, None, None]) / std[None, :, None, None]
    return image


from_path = './../test_image/'
cat_name = ['cat1', 'cat2', 'cat3', 'cat4']
# Stack the preprocessed images into one (N, 3, 224, 224) batch.
images = torch.cat(
    [read_image(os.path.join(from_path, f'{name}.png')) for name in cat_name],
    0,
)
with torch.no_grad():
    preds = vgg11(images.float()).detach().cpu().numpy()
# Index of the highest-scoring class per image.
result = np.argmax(preds, axis=1)
Without normalization, the result is ['Egyptian cat', 'sock', 'Komodo dragon', 'doormat'] ([285, 806, 48, 539]).
With normalization, the result is ['tabby cat', 'tabby cat', 'leopard', 'Egyptian cat'] ([281 281 288 285]).

Single image evaluation for pytorch resnet model on opencv frame and png file

I have a video in .mp4 format: eval.mp4. I also have a fine-tuned pytorch resnet nn with which I would like to perform inference on single frames that are read from the video or single png files that are saved to disk
My pre-trained nn successfully uses .png files that I load from disk and then perform the training/validation transforms. But during inference, rather than writing each frame of the eval.mp4 video to disk as .png files solely for the purpose of inferring on every frame, I would like to simply transform each captured frame into the correct format that can be evaluated by the network.
My dataset classes / dataloaders look like:
# create total dataset, no transforms
class MouseDataset(Dataset):
def __init__(self, csv_file, root_dir, transform=None):
csv_file (string): Path to the csv file with annotations.
root_dir (string): Directory with all the images.
transform (callable, optional): Optional transform to be applied
on a sample.
self.mouse_frame = pd.read_csv(csv_file)
self.root_dir = root_dir
self.transform = transform
def __len__(self):
return len(self.mouse_frame)
def __getitem__(self, idx):
if torch.is_tensor(idx):
idx = idx.tolist()
# img_name is root_dir+file_name
img_name = os.path.join(self.root_dir,
self.mouse_frame.iloc[idx, 0])
image =
coordinates = self.mouse_frame.iloc[idx, 1:]
coordinates = np.array([coordinates])
if self.transform:
image = self.transform(image)
return (image, coordinates)
# break total dataset into subsets for different transforms
class DatasetSubset(Dataset):
def __init__(self, dataset, transform=None):
self.dataset = dataset
self.transform = transform
def __len__(self):
return len(self.dataset)
def __getitem__(self, index):
# get image
image = self.dataset[index][0]
# transform for input into nn
if self.transform:
image = image.convert('RGB')
image = self.transform(image)
image =
#image = torch.unsqueeze(image, 0)
# get coordinates
coordinates = self.dataset[index][1]
# transform for input into nn
coordinates = coordinates.astype('float').reshape(-1, 2)
coordinates = torch.from_numpy(coordinates)
coordinates =
return (image, coordinates)
# create training / val split
train_split = 0.8
train_count = int(train_split * len(total_dataset))
val_count = int(len(total_dataset) - train_count)
train_subset, val_subset =, [train_count, val_count])
# create training / val datasets
train_dataset = DatasetSubset(train_subset, transform = data_transforms['train'])
val_dataset = DatasetSubset(val_subset, transform = data_transforms['val'])
# create train / val dataloaders
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, num_workers=num_workers, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, num_workers=num_workers)
dataloaders_dict = {}
dataloaders_dict['train'] = train_dataloader
dataloaders_dict['val'] = val_dataloader
My training vs. validation transforms (which are identical for testing purposes):
# Data augmentation and normalization for training
# Just normalization for validation
# required dimensions of input image
input_image_width = 224
input_image_height = 224
# mean and std of RGB pixel intensities
# ImageNet mean [0.485, 0.456, 0.406]
# ImageNet standard deviation [0.229, 0.224, 0.225]
model_mean = [0.485, 0.456, 0.406]
model_std = [0.229, 0.224, 0.225]
data_transforms = {
'train': transforms.Compose([
transforms.Resize((input_image_height, input_image_width)),
transforms.Normalize(model_mean, model_std)
'val': transforms.Compose([
transforms.Resize((input_image_height, input_image_width)),
transforms.Normalize(model_mean, model_std)
What I've tried to do is read each frame from an OpenCV VideoCapture object, convert it to PIL using this answer, and then infer, but the result I'm getting is very different from simply reading the frame, saving it as a .png, and then inferring on the .png.
The code that I am testing:
# Standard imports
import cv2
import numpy as np
import torch
import torchvision
from torchvision import models, transforms
from PIL import Image
# load best model for evaluation
model_ft = torch.load(BEST_PATH)
# Data augmentation and normalization for training
# Just normalization for validation
# required dimensions of input image
input_image_width = 224
input_image_height = 224
# mean and std of RGB pixel intensities
# ImageNet mean [0.485, 0.456, 0.406]
# ImageNet standard deviation [0.229, 0.224, 0.225]
model_mean = [0.485, 0.456, 0.406]
model_std = [0.229, 0.224, 0.225]
data_transforms = {
'train': transforms.Compose([
transforms.Resize((input_image_height, input_image_width)),
transforms.Normalize(model_mean, model_std)
'val': transforms.Compose([
transforms.Resize((input_image_height, input_image_width)),
transforms.Normalize(model_mean, model_std)
# Read image
cap = cv2.VideoCapture('eval.mp4')
total_frames = cap.get(7)
cap.set(1, 6840)
ret, frame =
cv2.imwrite('eval_6840.png', frame)
png_file = 'eval_6840.png'
# eval png
png_image =
png_image = png_image.convert('RGB')
png_image = data_transforms['val'](png_image)
png_image =
png_image = torch.unsqueeze(png_image, 0)
output = model_ft(png_image)
# eval frame
vid_image = Image.fromarray(frame)
vid_image = vid_image.convert('RGB')
vid_image = data_transforms['val'](vid_image)
vid_image =
vid_image = torch.unsqueeze(vid_image, 0)
output = model_ft(vid_image)
This returns:
torch.Size([1, 3, 224, 224])
tensor([[ 0.0229, -0.0990]], grad_fn=<AddmmBackward0>)
torch.Size([1, 3, 224, 224])
tensor([[ 0.0797, -0.2219]], grad_fn=<AddmmBackward0>)
My questions are:
(1) Why is the opencv frame evaluation different from the png file evaluation? All of the transformations appear to be identical (including the RGB conversion per the comments).
(2) How can I make the frame evaluation identical to the png evaluation given that both images are captured from the exact same segment of the video?
Here's a nice fun fact about OpenCV: it works in BGR space rather than RGB.
This might be the reason why you have different results processing png images (read via PIL.Image) vs processing video frames (read via opencv).
Posting this answer here in case it helps anyone.
The issue is that png_image = Image.open(png_file) creates an object of this type: PIL.PngImagePlugin.PngImageFile.
A vidcapture frame, however, creates an object of type: numpy.ndarray. And the conversion step: vid_image = Image.fromarray(frame) creates an object of type: PIL.Image.Image
I tried converting PIL.Image.Image object to a PIL.PngImagePlugin.PngImageFile and vice versa to make them comparable, but it does not seem possible using the PIL method convert. Others seem to have had this issue as well.
So the solution was to convert back and forth between numpy.ndarray types and PIL image types to make use of the transforms functionality in the PIL image library on which pytorch relies. Probably not the most efficient method, but end result is identical input objects and model predictions.
For reference:
# Read image
cap = cv2.VideoCapture('eval.mp4')
total_frames = cap.get(7)
cap.set(1, 6840)
ret, frame =
cv2.imwrite('eval_6840.png', frame)
png_file = 'eval_6840.png'
# eval png
png_image =
png_array = np.array(png_image)
png_image = Image.fromarray(png_array)
png_image = data_transforms['val'](png_image)
png_image =
png_image = torch.unsqueeze(png_image, 0)
png_image =
output = model_ft(png_image)
# eval frame
vid_array = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
vid_image = Image.fromarray(vid_array)
vid_image = data_transforms['val'](vid_image)
vid_image =
vid_image = torch.unsqueeze(vid_image, 0)
vid_image =
output = model_ft(vid_image)
tensor([[ 0.0229, -0.0990]], grad_fn=<AddmmBackward0>)
tensor([[ 0.0229, -0.0990]], grad_fn=<AddmmBackward0>)

stack expects each tensor to be equal size pytorch

I'm trying to train a CNN for image classification. All the images are the same size, but a tensor size mismatch error happened. Basically, the error says some images are rotated.
The exact error message is here:
RuntimeError: stack expects each tensor to be equal size, but got [3, 200, 266] at entry 0 and [3, 266, 200] at entry 4
I wrote a simple program to check if any of the images are rotated using opencv.
for folder in folders:
for f in glob(folder+"/*.jpg"):
img = cv2.imread(f)
if img.shape[1] != 200:
If there's a mismatch in size, it should print the file name. Unfortunately nothing was printed, which means all the images are exactly the same size. Please note that I don't want to resize the images; rotating them is acceptable, but resizing is not.
The code I use to read the data for training is below
train_transforms = transforms.Compose([
test_transforms = transforms.Compose([
train_data = datasets.ImageFolder(datadir,
test_data = datasets.ImageFolder(datadir,
num_train = len(train_data)
indices = list(range(num_train))
split = int(np.floor(valid_size * num_train))
from import SubsetRandomSampler
train_idx, test_idx = indices[split:], indices[:split]
train_sampler = SubsetRandomSampler(train_idx)
test_sampler = SubsetRandomSampler(test_idx)
trainloader =,
sampler=train_sampler, batch_size=5)
testloader =,
sampler=test_sampler, batch_size=1)

How to implement Grad-CAM on a trained network

I have already trained a network and I have saved it in the form of mynetwork.model. I want to apply gradcam using my own model and not VGG16 or ResNet etc.
# import the necessary packages
from Grad_CAM.gradcam import GradCAM
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.applications import VGG16
from tensorflow.keras.preprocessing.image import img_to_array
from tensorflow.keras.preprocessing.image import load_img
from tensorflow.keras.applications import imagenet_utils
from tensorflow.keras.models import load_model
import numpy as np
import argparse
import imutils
import cv2
# construct the argument parser and parse the arguments
ap = argparse.ArgumentParser()
ap.add_argument("-i", "--image", required=True,
help="path to the input image")
ap.add_argument("-m", "--model", type=str, default="vgg",
#choices=("vgg", "resnet"),
help="model to be used")
args = vars(ap.parse_args())
# initialize the model to be VGG16
Model = VGG16
# check to see if we are using ResNet
if args["model"] == "resnet":
Model = ResNet50
# load the pre-trained CNN from disk
print("[INFO] loading model...")
model = Model(weights="imagenet")
# load the original image from disk (in OpenCV format) and then
# resize the image to its target dimensions
orig = cv2.imread(args["image"])
resized = cv2.resize(orig, (224, 224))
# load the input image from disk (in Keras/TensorFlow format) and
# preprocess it
image = load_img(args["image"], target_size=(224, 224))
image = img_to_array(image)
image = np.expand_dims(image, axis=0)
image = imagenet_utils.preprocess_input(image)
# use the network to make predictions on the input image and find
# the class label index with the largest corresponding probability
preds = model.predict(image)
i = np.argmax(preds[0])
# decode the ImageNet predictions to obtain the human-readable label
decoded = imagenet_utils.decode_predictions(preds)
(imagenetID, label, prob) = decoded[0][0]
label = "{}: {:.2f}%".format(label, prob * 100)
print("[INFO] {}".format(label))
# initialize our gradient class activation map and build the heatmap
cam = GradCAM(model, i)
heatmap = cam.compute_heatmap(image)
# resize the resulting heatmap to the original input image dimensions
# and then overlay heatmap on top of the image
heatmap = cv2.resize(heatmap, (orig.shape[1], orig.shape[0]))
(heatmap, output) = cam.overlay_heatmap(heatmap, orig, alpha=0.5)
cv2.rectangle(output, (0, 0), (340, 40), (0, 0, 0), -1)
cv2.putText(output, label, (10, 25), cv2.FONT_HERSHEY_SIMPLEX,
0.8, (255, 255, 255), 2)
# display the original image and resulting heatmap and output image
# to our screen
output = np.vstack([orig, heatmap, output])
output = imutils.resize(output, height=700)
cv2.imshow("Output", output)
from tensorflow.keras.models import Model
import tensorflow as tf
import numpy as np
import cv2
class GradCAM:
    """Compute and overlay a Grad-CAM heatmap for a Keras model."""

    def __init__(self, model, classIdx, layerName=None):
        # store the model, the class index used to measure the class
        # activation map, and the layer to be used when visualizing
        # the class activation map
        self.model = model
        self.classIdx = classIdx
        self.layerName = layerName
        # if the layer name is None, attempt to automatically find
        # the target output layer
        if self.layerName is None:
            self.layerName = self.find_target_layer()

    def find_target_layer(self):
        """Return the name of the last layer that has a 4D output.

        Raises:
            ValueError: if no layer has a 4D output.
        """
        # attempt to find the final convolutional layer in the network
        # by looping over the layers of the network in reverse order
        for layer in reversed(self.model.layers):
            # check to see if the layer has a 4D output
            if len(layer.output_shape) == 4:
                return layer.name
        # otherwise, we could not find a 4D layer so the GradCAM
        # algorithm cannot be applied
        raise ValueError("Could not find 4D layer. Cannot apply GradCAM.")

    def compute_heatmap(self, image, eps=1e-8):
        """Return a uint8 heatmap for `image`, sized like the input image.

        `eps` avoids division by zero when the heatmap is constant.
        """
        # construct our gradient model by supplying (1) the inputs
        # to our pre-trained model, (2) the output of the (presumably)
        # final 4D layer in the network, and (3) the output of the
        # softmax activations from the model
        gradModel = Model(
            inputs=[self.model.inputs],
            outputs=[self.model.get_layer(self.layerName).output,
                     self.model.output])
        # record operations for automatic differentiation
        with tf.GradientTape() as tape:
            # cast the image tensor to a float-32 data type, pass the
            # image through the gradient model, and grab the loss
            # associated with the specific class index
            inputs = tf.cast(image, tf.float32)
            (convOutputs, predictions) = gradModel(inputs)
            loss = predictions[:, self.classIdx]
        # use automatic differentiation to compute the gradients
        grads = tape.gradient(loss, convOutputs)
        # compute the guided gradients
        castConvOutputs = tf.cast(convOutputs > 0, "float32")
        castGrads = tf.cast(grads > 0, "float32")
        guidedGrads = castConvOutputs * castGrads * grads
        # the convolution and guided gradients have a batch dimension
        # (which we don't need) so let's grab the volume itself and
        # discard the batch
        convOutputs = convOutputs[0]
        guidedGrads = guidedGrads[0]
        # compute the average of the gradient values, and using them
        # as weights, compute the ponderation of the filters with
        # respect to the weights
        weights = tf.reduce_mean(guidedGrads, axis=(0, 1))
        cam = tf.reduce_sum(tf.multiply(weights, convOutputs), axis=-1)
        # grab the spatial dimensions of the input image and resize
        # the output class activation map to match the input image
        # dimensions
        (w, h) = (image.shape[2], image.shape[1])
        heatmap = cv2.resize(cam.numpy(), (w, h))
        # normalize the heatmap such that all values lie in the range
        # [0, 1], scale the resulting values to the range [0, 255],
        # and then convert to an unsigned 8-bit integer
        numer = heatmap - np.min(heatmap)
        denom = (heatmap.max() - heatmap.min()) + eps
        heatmap = numer / denom
        heatmap = (heatmap * 255).astype("uint8")
        # return the resulting heatmap to the calling function
        return heatmap

    def overlay_heatmap(self, heatmap, image, alpha=0.5,
                        colormap=cv2.COLORMAP_VIRIDIS):
        """Color-map `heatmap` and blend it with `image`.

        Returns a 2-tuple (color-mapped heatmap, blended output image).
        """
        # apply the supplied color map to the heatmap and then
        # overlay the heatmap on the input image
        heatmap = cv2.applyColorMap(heatmap, colormap)
        output = cv2.addWeighted(image, alpha, heatmap, 1 - alpha, 0)
        # return a 2-tuple of the color mapped heatmap and the output,
        # overlaid image
        return (heatmap, output)
As you can see in the script above, the VGG16 or ResNet pretrained models are used. I want to perform Grad-CAM using my own trained model. For this reason I commented out these lines:
# initialize the model to be VGG16
Model = VGG16
# check to see if we are using ResNet
if args["model"] == "resnet":
Model = ResNet50
# load the pre-trained CNN from disk
print("[INFO] loading model...")
model = Model(weights="imagenet")
and I used
model = load_model(args["model"])
in order to use my own model. Then I executed:
python --image /home/antonis/IM0001.jpeg --model /home/antonis/mynetwork.model
However, I get the following error:
ValueError: `decode_predictions` expects a batch of predictions (i.e.
a 2D array of shape (samples, 1000)). Found array with shape: (1, 3)
which is expected as the model outputs the ImageNet classes (1000-dimensional) while my model returns predictions over 2 classes.
I wonder how to fix this and apply gradcam using my own model.
One thing I don't get: if you have your own classifier (2 classes), why use imagenet_utils.decode_predictions at all? I'm not sure whether the following answer will satisfy you, but here are some pointers.
import tensorflow as tf
import numpy as np
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.cifar10.load_data()
# train set / data
x_train = x_train.astype('float32') / 255
# train set / target
y_train = tf.keras.utils.to_categorical(y_train , num_classes=10)
# validation set / data
x_test = x_test.astype('float32') / 255
# validation set / target
y_test = tf.keras.utils.to_categorical(y_test, num_classes=10)
print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)
# (50000, 32, 32, 3) (50000, 10)
# (10000, 32, 32, 3) (10000, 10
input = tf.keras.Input(shape=(32,32,3))
efnet = tf.keras.applications.EfficientNetB0(weights='imagenet',
include_top = False,
input_tensor = input)
# Now that we apply global max pooling.
gap = tf.keras.layers.GlobalMaxPooling2D()(efnet.output)
# Finally, we add a classification layer.
output = tf.keras.layers.Dense(10, activation='softmax')(gap)
# bind all
func_model = tf.keras.Model(efnet.input, output)
Compile and Run
# compile
func_model.compile(
    loss=tf.keras.losses.CategoricalCrossentropy(),
    metrics=tf.keras.metrics.CategoricalAccuracy(),
    optimizer=tf.keras.optimizers.Adam())
# fit
func_model.fit(x_train, y_train, batch_size=128, epochs=15, verbose=2)
Epoch 14/15
391/391 - 13s - loss: 0.1479 - categorical_accuracy: 0.9491
Epoch 15/15
391/391 - 13s - loss: 0.1505 - categorical_accuracy: 0.9481
Grad CAM
Same as your set up.
from tensorflow.keras.models import Model
import tensorflow as tf
import numpy as np
import cv2
class GradCAM:
    """Compute and overlay a Grad-CAM heatmap for a Keras model."""

    def __init__(self, model, classIdx, layerName=None):
        # store the model, the class index used to measure the class
        # activation map, and the layer to be used when visualizing
        # the class activation map
        self.model = model
        self.classIdx = classIdx
        self.layerName = layerName
        # if the layer name is None, attempt to automatically find
        # the target output layer
        if self.layerName is None:
            self.layerName = self.find_target_layer()

    def find_target_layer(self):
        """Return the name of the last layer that has a 4D output.

        Raises:
            ValueError: if no layer has a 4D output.
        """
        # attempt to find the final convolutional layer in the network
        # by looping over the layers of the network in reverse order
        for layer in reversed(self.model.layers):
            # check to see if the layer has a 4D output
            if len(layer.output_shape) == 4:
                return layer.name
        # otherwise, we could not find a 4D layer so the GradCAM
        # algorithm cannot be applied
        raise ValueError("Could not find 4D layer. Cannot apply GradCAM.")

    def compute_heatmap(self, image, eps=1e-8):
        """Return a uint8 heatmap for `image`, sized like the input image.

        The heatmap is computed for the model's top predicted class, not
        for `self.classIdx`. `eps` avoids division by zero when the
        heatmap is constant.
        """
        # construct our gradient model by supplying (1) the inputs
        # to our pre-trained model, (2) the output of the (presumably)
        # final 4D layer in the network, and (3) the output of the
        # softmax activations from the model
        gradModel = Model(
            inputs=[self.model.inputs],
            outputs=[self.model.get_layer(self.layerName).output, self.model.output])
        # record operations for automatic differentiation
        with tf.GradientTape() as tape:
            # cast the image tensor to a float-32 data type, pass the
            # image through the gradient model, and grab the loss
            # associated with the top predicted class of the first sample
            inputs = tf.cast(image, tf.float32)
            (convOutputs, predictions) = gradModel(inputs)
            loss = predictions[:, tf.argmax(predictions[0])]
        # use automatic differentiation to compute the gradients
        grads = tape.gradient(loss, convOutputs)
        # compute the guided gradients
        castConvOutputs = tf.cast(convOutputs > 0, "float32")
        castGrads = tf.cast(grads > 0, "float32")
        guidedGrads = castConvOutputs * castGrads * grads
        # the convolution and guided gradients have a batch dimension
        # (which we don't need) so let's grab the volume itself and
        # discard the batch
        convOutputs = convOutputs[0]
        guidedGrads = guidedGrads[0]
        # compute the average of the gradient values, and using them
        # as weights, compute the ponderation of the filters with
        # respect to the weights
        weights = tf.reduce_mean(guidedGrads, axis=(0, 1))
        cam = tf.reduce_sum(tf.multiply(weights, convOutputs), axis=-1)
        # grab the spatial dimensions of the input image and resize
        # the output class activation map to match the input image
        # dimensions
        (w, h) = (image.shape[2], image.shape[1])
        heatmap = cv2.resize(cam.numpy(), (w, h))
        # normalize the heatmap such that all values lie in the range
        # [0, 1], scale the resulting values to the range [0, 255],
        # and then convert to an unsigned 8-bit integer
        numer = heatmap - np.min(heatmap)
        denom = (heatmap.max() - heatmap.min()) + eps
        heatmap = numer / denom
        heatmap = (heatmap * 255).astype("uint8")
        # return the resulting heatmap to the calling function
        return heatmap

    def overlay_heatmap(self, heatmap, image, alpha=0.5,
                        colormap=cv2.COLORMAP_VIRIDIS):
        """Color-map `heatmap` and blend it with `image`.

        Returns a 2-tuple (color-mapped heatmap, blended output image).
        """
        # apply the supplied color map to the heatmap and then
        # overlay the heatmap on the input image
        heatmap = cv2.applyColorMap(heatmap, colormap)
        output = cv2.addWeighted(image, alpha, heatmap, 1 - alpha, 0)
        # return a 2-tuple of the color mapped heatmap and the output,
        # overlaid image
        return (heatmap, output)
image = cv2.imread('/content/dog.jpg')
image = cv2.resize(image, (32, 32))
image = image.astype('float32') / 255
image = np.expand_dims(image, axis=0)
preds = func_model.predict(image)
i = np.argmax(preds[0])
To get the layer's name of the model
for idx in range(len(func_model.layers)):
print(func_model.get_layer(index = idx).name)
# we picked `block5c_project_con` layer
Passing to GradCAM class
icam = GradCAM(func_model, i, 'block5c_project_conv')
heatmap = icam.compute_heatmap(image)
heatmap = cv2.resize(heatmap, (32, 32))
image = cv2.imread('/content/dog.jpg')
image = cv2.resize(image, (32, 32))
print(heatmap.shape, image.shape)
(heatmap, output) = icam.overlay_heatmap(heatmap, image, alpha=0.5)
fig, ax = plt.subplots(1, 3)
Ref. Grad-CAM class activation visualization
