How to get the class name of a prediction?
The code (from https://tensorflow-object-detection-api-tutorial.readthedocs.io/en/latest/auto_examples/object_detection_camera.html#sphx-glr-auto-examples-object-detection-camera-py):
import os
DATA_DIR = os.path.join(os.getcwd(), 'data')
MODELS_DIR = os.path.join(DATA_DIR, 'models')
for dir in [DATA_DIR, MODELS_DIR]:
    if not os.path.exists(dir):
        os.mkdir(dir)
import tarfile
import urllib.request
# Download and extract model
MODEL_DATE = '20200711'
MODEL_NAME = 'ssd_resnet101_v1_fpn_640x640_coco17_tpu-8'
MODEL_TAR_FILENAME = MODEL_NAME + '.tar.gz'
MODELS_DOWNLOAD_BASE = 'http://download.tensorflow.org/models/object_detection/tf2/'
MODEL_DOWNLOAD_LINK = MODELS_DOWNLOAD_BASE + MODEL_DATE + '/' + MODEL_TAR_FILENAME
PATH_TO_MODEL_TAR = os.path.join(MODELS_DIR, MODEL_TAR_FILENAME)
PATH_TO_CKPT = os.path.join(MODELS_DIR, os.path.join(MODEL_NAME, 'checkpoint/'))
PATH_TO_CFG = os.path.join(MODELS_DIR, os.path.join(MODEL_NAME, 'pipeline.config'))
if not os.path.exists(PATH_TO_CKPT):
    print('Downloading model. This may take a while... ', end='')
    urllib.request.urlretrieve(MODEL_DOWNLOAD_LINK, PATH_TO_MODEL_TAR)
    tar_file = tarfile.open(PATH_TO_MODEL_TAR)
    tar_file.extractall(MODELS_DIR)
    tar_file.close()
    os.remove(PATH_TO_MODEL_TAR)
    print('Done')
# Download labels file
LABEL_FILENAME = 'mscoco_label_map.pbtxt'
LABELS_DOWNLOAD_BASE = \
    'https://raw.githubusercontent.com/tensorflow/models/master/research/object_detection/data/'
PATH_TO_LABELS = os.path.join(MODELS_DIR, os.path.join(MODEL_NAME, LABEL_FILENAME))
if not os.path.exists(PATH_TO_LABELS):
    print('Downloading label file... ', end='')
    urllib.request.urlretrieve(LABELS_DOWNLOAD_BASE + LABEL_FILENAME, PATH_TO_LABELS)
    print('Done')
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' # Suppress TensorFlow logging
import tensorflow as tf
from object_detection.utils import label_map_util
from object_detection.utils import config_util
from object_detection.utils import visualization_utils as viz_utils
from object_detection.builders import model_builder
tf.get_logger().setLevel('ERROR') # Suppress TensorFlow logging (2)
# Enable GPU dynamic memory allocation
gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)
# Load pipeline config and build a detection model
configs = config_util.get_configs_from_pipeline_file(PATH_TO_CFG)
model_config = configs['model']
detection_model = model_builder.build(model_config=model_config, is_training=False)
# Restore checkpoint
ckpt = tf.compat.v2.train.Checkpoint(model=detection_model)
ckpt.restore(os.path.join(PATH_TO_CKPT, 'ckpt-0')).expect_partial()
def get_model_detection_function(model):

    @tf.function
    def detect_fn(image):
        """Detect objects in image."""
        image, shapes = model.preprocess(image)
        prediction_dict = model.predict(image, shapes)
        detections = model.postprocess(prediction_dict, shapes)
        return detections, prediction_dict, tf.reshape(shapes, [-1])

    return detect_fn
detect_fn = get_model_detection_function(detection_model)
category_index = label_map_util.create_category_index_from_labelmap(PATH_TO_LABELS,
                                                                    use_display_name=True)
import cv2
cap = cv2.VideoCapture(0)
import numpy as np
while True:
    # Read frame from camera
    ret, image_np = cap.read()

    # Expand dimensions since the model expects images to have shape: [1, None, None, 3]
    image_np_expanded = np.expand_dims(image_np, axis=0)

    # Things to try:
    # Flip horizontally
    # image_np = np.fliplr(image_np).copy()

    # Convert image to grayscale
    # image_np = np.tile(
    #     np.mean(image_np, 2, keepdims=True), (1, 1, 3)).astype(np.uint8)

    input_tensor = tf.convert_to_tensor(np.expand_dims(image_np, 0), dtype=tf.float32)
    detections, predictions_dict, shapes = detect_fn(input_tensor)

    label_id_offset = 1
    image_np_with_detections = image_np.copy()

    viz_utils.visualize_boxes_and_labels_on_image_array(
        image_np_with_detections,
        detections['detection_boxes'][0].numpy(),
        (detections['detection_classes'][0].numpy() + label_id_offset).astype(int),
        detections['detection_scores'][0].numpy(),
        category_index,
        use_normalized_coordinates=True,
        max_boxes_to_draw=200,
        min_score_thresh=.30,
        agnostic_mode=False)

    # Display output
    cv2.imshow('object detection', cv2.resize(image_np_with_detections, (800, 600)))

    if cv2.waitKey(25) & 0xFF == ord('q'):
        break

cap.release()
cv2.destroyAllWindows()
Actually the code is working fine, but I want to get the class name so I can trigger some action, for example:

if variable_name_class == 'cat':
    {action 1}
elif variable_name_class == 'dog':
    {action 2}

Maybe it would look like this:
while True:
    # Read frame from camera
    ret, image_np = cap.read()

    # Expand dimensions since the model expects images to have shape: [1, None, None, 3]
    image_np_expanded = np.expand_dims(image_np, axis=0)

    # Things to try:
    # Flip horizontally
    # image_np = np.fliplr(image_np).copy()

    # Convert image to grayscale
    # image_np = np.tile(
    #     np.mean(image_np, 2, keepdims=True), (1, 1, 3)).astype(np.uint8)

    input_tensor = tf.convert_to_tensor(np.expand_dims(image_np, 0), dtype=tf.float32)
    detections, predictions_dict, shapes = detect_fn(input_tensor)

    label_id_offset = 1
    image_np_with_detections = image_np.copy()

    viz_utils.visualize_boxes_and_labels_on_image_array(
        image_np_with_detections,
        detections['detection_boxes'][0].numpy(),
        (detections['detection_classes'][0].numpy() + label_id_offset).astype(int),
        detections['detection_scores'][0].numpy(),
        category_index,
        use_normalized_coordinates=True,
        max_boxes_to_draw=200,
        min_score_thresh=.30,
        agnostic_mode=False)

    # Display output
    cv2.imshow('object detection', cv2.resize(image_np_with_detections, (800, 600)))

    if variable_name_class == 'cat':
        {action 1}
    elif variable_name_class == 'dog':
        {action 2}

    if cv2.waitKey(25) & 0xFF == ord('q'):
        break

cap.release()
cv2.destroyAllWindows()
I don't know what 'variable_name_class' should be.
Does anyone know how to get the class names, i.e. where 'variable_name_class' should come from? Please help me.
Copy and paste the whole code from this link into your Jupyter notebook cell:
https://github.com/tensorflow/models/blob/master/research/object_detection/utils/visualization_utils.py
You should copy all of those ~1500 lines of code.
After copying the code into the notebook cell, you need to change one part of it. Our aim is to return the detected label along with the image array from the visualize_boxes_and_labels_on_image_array() function, so that function should look like the code below:
def visualize_boxes_and_labels_on_image_array(
        image,
        boxes,
        classes,
        scores,
        category_index,
        instance_masks=None,
        instance_boundaries=None,
        keypoints=None,
        keypoint_scores=None,
        keypoint_edges=None,
        track_ids=None,
        use_normalized_coordinates=False,
        max_boxes_to_draw=20,
        min_score_thresh=.5,
        agnostic_mode=False,
        line_thickness=4,
        mask_alpha=.4,
        groundtruth_box_visualization_color='black',
        skip_boxes=False,
        skip_scores=False,
        skip_labels=False,
        skip_track_ids=False):
    """Overlay labeled boxes on an image with formatted scores and label names.

    This function groups boxes that correspond to the same location
    and creates a display string for each detection and overlays these
    on the image. Note that this function modifies the image in place, and returns
    that same image.

    Args:
        image: uint8 numpy array with shape (img_height, img_width, 3)
        boxes: a numpy array of shape [N, 4]
        classes: a numpy array of shape [N]. Note that class indices are 1-based,
            and match the keys in the label map.
        scores: a numpy array of shape [N] or None. If scores=None, then
            this function assumes that the boxes to be plotted are groundtruth
            boxes and plot all boxes as black with no classes or scores.
        category_index: a dict containing category dictionaries (each holding
            category index `id` and category name `name`) keyed by category indices.
        instance_masks: a uint8 numpy array of shape [N, image_height, image_width],
            can be None.
        instance_boundaries: a numpy array of shape [N, image_height, image_width]
            with values ranging between 0 and 1, can be None.
        keypoints: a numpy array of shape [N, num_keypoints, 2], can
            be None.
        keypoint_scores: a numpy array of shape [N, num_keypoints], can be None.
        keypoint_edges: A list of tuples with keypoint indices that specify which
            keypoints should be connected by an edge, e.g. [(0, 1), (2, 4)] draws
            edges from keypoint 0 to 1 and from keypoint 2 to 4.
        track_ids: a numpy array of shape [N] with unique track ids. If provided,
            color-coding of boxes will be determined by these ids, and not the class
            indices.
        use_normalized_coordinates: whether boxes is to be interpreted as
            normalized coordinates or not.
        max_boxes_to_draw: maximum number of boxes to visualize. If None, draw
            all boxes.
        min_score_thresh: minimum score threshold for a box or keypoint to be
            visualized.
        agnostic_mode: boolean (default: False) controlling whether to evaluate in
            class-agnostic mode or not. This mode will display scores but ignore
            classes.
        line_thickness: integer (default: 4) controlling line width of the boxes.
        mask_alpha: transparency value between 0 and 1 (default: 0.4).
        groundtruth_box_visualization_color: box color for visualizing groundtruth
            boxes
        skip_boxes: whether to skip the drawing of bounding boxes.
        skip_scores: whether to skip score when drawing a single detection
        skip_labels: whether to skip label when drawing a single detection
        skip_track_ids: whether to skip track id when drawing a single detection

    Returns:
        A tuple (final_label, image): the label of the last box drawn above the
        threshold, and a uint8 numpy array with shape (img_height, img_width, 3)
        with overlaid boxes.
    """
    # Create a display string (and color) for every box location, group any boxes
    # that correspond to the same location.
    box_to_display_str_map = collections.defaultdict(list)
    box_to_color_map = collections.defaultdict(str)
    box_to_instance_masks_map = {}
    box_to_instance_boundaries_map = {}
    box_to_keypoints_map = collections.defaultdict(list)
    box_to_keypoint_scores_map = collections.defaultdict(list)
    box_to_track_ids_map = {}
    final_label = 'N/A'  # default, in case no box passes the score threshold
    if not max_boxes_to_draw:
        max_boxes_to_draw = boxes.shape[0]
    for i in range(boxes.shape[0]):
        if max_boxes_to_draw == len(box_to_color_map):
            break
        if scores is None or scores[i] > min_score_thresh:
            box = tuple(boxes[i].tolist())
            if instance_masks is not None:
                box_to_instance_masks_map[box] = instance_masks[i]
            if instance_boundaries is not None:
                box_to_instance_boundaries_map[box] = instance_boundaries[i]
            if keypoints is not None:
                box_to_keypoints_map[box].extend(keypoints[i])
            if keypoint_scores is not None:
                box_to_keypoint_scores_map[box].extend(keypoint_scores[i])
            if track_ids is not None:
                box_to_track_ids_map[box] = track_ids[i]
            if scores is None:
                box_to_color_map[box] = groundtruth_box_visualization_color
            else:
                display_str = ''
                if not skip_labels:
                    if not agnostic_mode:
                        if classes[i] in six.viewkeys(category_index):
                            class_name = category_index[classes[i]]['name']
                        else:
                            class_name = 'N/A'
                        display_str = str(class_name)
                        final_label = display_str
                if not skip_scores:
                    if not display_str:
                        display_str = '{}%'.format(round(100*scores[i]))
                        final_label = display_str
                    else:
                        display_str = '{}: {}%'.format(display_str, round(100*scores[i]))
                if not skip_track_ids and track_ids is not None:
                    if not display_str:
                        display_str = 'ID {}'.format(track_ids[i])
                        final_label = track_ids[i]
                    else:
                        display_str = '{}: ID {}'.format(display_str, track_ids[i])
                box_to_display_str_map[box].append(display_str)
                if agnostic_mode:
                    box_to_color_map[box] = 'DarkOrange'
                elif track_ids is not None:
                    prime_multipler = _get_multiplier_for_color_randomness()
                    box_to_color_map[box] = STANDARD_COLORS[
                        (prime_multipler * track_ids[i]) % len(STANDARD_COLORS)]
                else:
                    box_to_color_map[box] = STANDARD_COLORS[
                        classes[i] % len(STANDARD_COLORS)]

    # Draw all boxes onto image.
    for box, color in box_to_color_map.items():
        ymin, xmin, ymax, xmax = box
        if instance_masks is not None:
            draw_mask_on_image_array(
                image,
                box_to_instance_masks_map[box],
                color=color,
                alpha=mask_alpha
            )
        if instance_boundaries is not None:
            draw_mask_on_image_array(
                image,
                box_to_instance_boundaries_map[box],
                color='red',
                alpha=1.0
            )
        draw_bounding_box_on_image_array(
            image,
            ymin,
            xmin,
            ymax,
            xmax,
            color=color,
            thickness=0 if skip_boxes else line_thickness,
            display_str_list=box_to_display_str_map[box],
            use_normalized_coordinates=use_normalized_coordinates)
        if keypoints is not None:
            keypoint_scores_for_box = None
            if box_to_keypoint_scores_map:
                keypoint_scores_for_box = box_to_keypoint_scores_map[box]
            draw_keypoints_on_image_array(
                image,
                box_to_keypoints_map[box],
                keypoint_scores_for_box,
                min_score_thresh=min_score_thresh,
                color=color,
                radius=line_thickness / 2,
                use_normalized_coordinates=use_normalized_coordinates,
                keypoint_edges=keypoint_edges,
                keypoint_edge_color=color,
                keypoint_edge_width=line_thickness // 2)

    return final_label, image
Here, I have just created a variable called final_label, which holds the class name of the detected bounding box (the last box drawn above the threshold).
When calling this function, use code like this:
label, _ = visualize_boxes_and_labels_on_image_array(
    image_np_with_detections,  # change this according to your image array name
    detections['detection_boxes'],
    detections['detection_classes'] + label_id_offset,
    detections['detection_scores'],
    category_index,
    use_normalized_coordinates=True,
    max_boxes_to_draw=8,
    min_score_thresh=.50,
    agnostic_mode=False)

print(label)
It will give the exact label detected.
Though it's been some time since this question was posted, if someone still wants to get the class names without changing the code (as suggested above), they can add these two lines after running the visualization on the image (threshold here is your chosen score threshold):
classes = [cls for cls in detections['detection_classes'][detections['detection_scores'] > threshold]]
classes = [category_index.get(cls)['name'] for cls in classes]
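To connect this back to the original question: a minimal sketch of using those class names to trigger actions inside the detection loop, assuming the TF2 setup from the question (the threshold value and the print actions are placeholders):

threshold = 0.5  # assumed score threshold
label_id_offset = 1

# pull numpy arrays for the single image in the batch
scores = detections['detection_scores'][0].numpy()
class_ids = (detections['detection_classes'][0].numpy() + label_id_offset).astype(int)

# map ids of confident detections to their display names
detected_names = [category_index[cid]['name']
                  for cid in class_ids[scores > threshold]
                  if cid in category_index]

for name in detected_names:
    if name == 'cat':
        print('cat detected')  # action 1 goes here
    elif name == 'dog':
        print('dog detected')  # action 2 goes here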
Related
Getting this error:

  line 50, in <module>
    faceA = preprocess_face(rgbA[boxesA[0][1]:boxesA[0][3], boxesA[0][0]:boxesA[0][2]])
TypeError: only integer scalar arrays can be converted to a scalar index
import os
import argparse
import cv2
from deepface import DeepFace
import numpy as np

# construct the argument parse and parse the arguments
ap = argparse.ArgumentParser()
ap.add_argument("-f", "--first", required=True,
                help="first input image")
ap.add_argument("-d", "--directory", required=True,
                help="directory of images to compare")
args = vars(ap.parse_args())

# load the first input image
imageA = cv2.imread(args["first"])
rgbA = cv2.cvtColor(imageA, cv2.COLOR_BGR2RGB)

# detect the face in the first image
boxesA = DeepFace.detectFace(rgbA)

# make sure there is a face in the first image
if len(boxesA) == 0:
    print("No face detected in the first image")
    exit()

def preprocess_face(face, size=(96, 96)):
    # extract the face ROI and resize it to the desired size
    face = cv2.resize(face, size)
    # compute the scaling factor for the images
    factor_0 = size[0] / face.shape[0]
    factor_1 = size[1] / face.shape[1]
    factor = np.min([factor_0, factor_1])
    # stretch the face ROI to the desired size
    face = cv2.resize(face, None, fx=factor, fy=factor)
    # convert the face ROI to grayscale
    gray = cv2.cvtColor(face, cv2.COLOR_BGR2GRAY)
    # normalize the grayscale image
    gray = gray / 255.0
    # return the preprocessed face
    return [gray]

boxesA = boxesA.astype(int)

# extract the face encoding for the first image
faceA = preprocess_face(rgbA[boxesA[0][1]:boxesA[0][3], boxesA[0][0]:boxesA[0][2]])
encodingA = DeepFace.detectFace(faceA, boxesA)[0]

# initialize a dictionary to store the image names and scores
scores = {}

# loop over the images in the directory
for image_name in os.listdir(args["directory"]):
    # load the image
    imageB = cv2.imread(os.path.join(args["directory"], image_name))
    rgbB = cv2.cvtColor(imageB, cv2.COLOR_BGR2RGB)
    # detect the face in the image
    boxesB = DeepFace.detectFace(rgbB, enforce_detection=False)
    # make sure there is a face in the image
    if len(boxesB) == 0:
        continue
    boxesB = boxesB.astype(int)
    # extract the face encoding for the image
    facesB = []
    for i in range(len(boxesB)):
        faceB = preprocess_face(rgbB[boxesB[i][1]:boxesB[i][3], boxesB[i][0]:boxesB[i][2]])
        facesB.extend(faceB)
    encodingB = DeepFace.detectFace(facesB, boxesB)[0]
    # compare the face encodings
    score = DeepFace.verifyFace(encodingA, encodingB)
    similarity_percentage = score * 100
    # store the image name and score in the dictionary
    scores[image_name] = score

# sort the scores in descending order
sorted_scores = sorted(scores.items(), key=lambda x: x[1], reverse=True)

# display the top 10 scores
for i in range(10):
    image_name, score = sorted_scores[i]
    print("{}: {}".format(image_name, score))
Usage of the script is: python Image_Comparison_Deepface.py -f C:/folder/image.png -d C:/testpics
Script was created with ChatGPT.
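For what it's worth, the TypeError above is consistent with boxesA not actually holding bounding-box coordinates: in older deepface versions, DeepFace.detectFace returns the detected face as an image array rather than boxes, so boxesA[0][1] is itself an array, and NumPy refuses to use an array as a slice index. A quick (hypothetical) check you could add before the failing line:

print(type(boxesA), getattr(boxesA, 'shape', None))
# if this prints a shape like (224, 224, 3), boxesA is a face image rather than
# box coordinates, and rgbA[boxesA[0][1]:boxesA[0][3], ...] will raise
# "TypeError: only integer scalar arrays can be converted to a scalar index"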
I am trying to develop a script which will detect pixelation in live TV from an external camera. To test my script, I have been using a short snippet of live TV which has two instances of pixelation.
See Google Drive below for video:
https://drive.google.com/file/d/1f339HJSWKhyPr1y5sf9tWW4vcXgBOVbz/view?usp=sharing
Currently I am able to filter out most of the noise in the video and detect the pixelation. However, I am also detecting the white text (given the intensity of the text, it gets picked up by the kernel I am applying).
See the code below:
import cv2
import numpy as np

cap = cv2.VideoCapture("./hgtv_short.ts")

while True:
    success, image = cap.read()
    gray = cv2.cvtColor(src=image, code=cv2.COLOR_BGR2GRAY)
    sharpen_kernel = np.array([[.4, .4], [-2.25, -2.25], [.4, .4]])
    sharpen = cv2.filter2D(src=gray, ddepth=-1, kernel=sharpen_kernel)
    sharpe = sharpen + 128
    canny = cv2.Canny(image=sharpe, threshold1=245, threshold2=255, edges=1, apertureSize=3, L2gradient=True)
    white = np.where(canny != [0])
    coordinates = zip(white[1], white[0])
    for p in coordinates:
        cv2.circle(canny, p, 30, (200, 0, 0), 2)
    cv2.imshow('image', image)
    cv2.imshow('edges', canny)
    cv2.waitKey(1)
What I would like to do is apply a threshold and findContours to the given coordinates to see if text is in the region; then I can discern between actual pixelation and text (a rough sketch of this idea is below the note).
NOTE:
If anyone has any other ideas on finding pixelation I am open to suggestions.
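For reference, a rough sketch of that threshold + findContours idea (the ROI size, brightness threshold, and contour heuristics here are assumptions, not tested values):

def region_has_text(gray, x, y, half=30):
    # crop a window around the flagged coordinate
    h, w = gray.shape
    roi = gray[max(0, y - half):min(h, y + half), max(0, x - half):min(w, x + half)]
    # white text is bright, so threshold it out
    _, thresh = cv2.threshold(roi, 200, 255, cv2.THRESH_BINARY)
    contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    # many small blobs in a small window hints at text rather than macro-blocking
    small = [c for c in contours if cv2.contourArea(c) < 100]
    return len(small) >= 3

# inside the loop, skip coordinates that look like text:
# for p in coordinates:
#     if not region_has_text(gray, p[0], p[1]):
#         cv2.circle(canny, p, 30, (200, 0, 0), 2)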
UPDATE
Here is a screenshot from the video showing the type of pixelation I am looking for in this video (macro-blocking, to be specific).
Image
Edges
From the above Images you can see that I am detecting the macro-blocking, but also the white text. I would like to be able to discern between text and actual macro-blocking.
SECOND UPDATE
After more trial and error, I found that it would be best to use some sort of reference model to help predict when an image is showing macro-blocking, pixelation, artifacts, etc.
I have decided to use the HOG (Histogram of Oriented Gradients) descriptor to create my feature vector. I have created two functions: one loops through the GOOD images and the other the BAD images:
def pos_train_set(self):
    print("Starting to Gather Positive Photos")
    for pos_file in glob.iglob(os.path.join(self.base_path, "Bad_Images", "*.jpg")):
        pos_img = cv2.imread(pos_file, 1)
        pos_img = cv2.resize(pos_img, self.winSize, interpolation=cv2.CV_32F)
        pos_des = self.hog.compute(pos_img)
        pos_des = cv2.normalize(pos_des, None)
        self.labels.append(1)
        self.training_data.append(pos_des)
    print("Gathered Positive Photos")

def neg_train_set(self):
    print("Starting to Gather Negative Photos")
    for neg_file in glob.iglob(os.path.join(self.base_path, "Good_Images", "*.jpg")):
        neg_img = cv2.imread(neg_file, 1)
        neg_img = cv2.resize(neg_img, self.winSize, interpolation=cv2.CV_32F)
        neg_des = self.hog.compute(neg_img)
        neg_des = cv2.normalize(neg_des, None)
        self.labels.append(0)
        self.training_data.append(neg_des)
    print("Gathered Negative Photos")
I then train my model using the SVM (Support Vector Machine) classification algorithm.
def train_set(self):
    print("Starting to Convert")
    td = np.float32(self.training_data)
    lab = np.array(self.labels)
    print("Converted List")
    print("Starting Shuffle")
    rand = np.random.RandomState(10)
    shuffle = rand.permutation(len(td))
    td = td[shuffle]
    lab = lab[shuffle]
    print("Shuffled List")
    print("Starting SVM")
    svm = cv2.ml.SVM_create()
    svm.setType(cv2.ml.SVM_C_SVC)
    # Exponential Chi2 kernel, similar to the RBF kernel:
    # K(xi, xj) = e^(-gamma * chi2(xi, xj)), chi2(xi, xj) = (xi - xj)^2 / (xi + xj), gamma > 0.
    svm.setKernel(cv2.ml.SVM_CHI2)
    svm.setTermCriteria((cv2.TERM_CRITERIA_MAX_ITER, 100, 1e-6))
    svm.setGamma(5.383)
    svm.setC(2.67)
    print("Starting Training")
    svm.train(td, cv2.ml.ROW_SAMPLE, lab)
    print("Saving to .yml")
    svm.save(os.path.join(self.base_path, "svm_model.yml"))
I then use that SVM model to try to predict whether an image is a 1 (bad image) or a 0 (good image), with the help of the kernel and edge detection I used in my first attempt:
def predict(self):
    svm = cv2.ml.SVM_load("./svm_model.yml")
    for file in self.files:
        os.mkdir(os.path.join(self.base_path, "1_Frames", os.path.basename(file)))
        print(f"Starting predict on {file}")
        cap = cv2.VideoCapture(file)
        while cap.isOpened():
            success, image = cap.read(1)
            if success:
                img = cv2.resize(image, self.winSize, interpolation=cv2.CV_32F)
                test_data = self.hog.compute(img)
                test_data = cv2.normalize(test_data, None)
                test_data = np.float32(test_data)
                test_data = np.transpose(test_data)
                if not np.any(test_data):
                    print("Invalid Dimension")
                    success, image = cap.read(1)
                    print(f"New Frame {success}")
                else:
                    response = svm.predict(test_data)[1]
                    if response == 1:
                        gray = cv2.cvtColor(src=image, code=cv2.COLOR_BGR2GRAY)
                        sharpen_kernel = np.array([[.4, .4], [-2.25, -2.25], [.4, .4]])
                        sharpen = cv2.filter2D(src=gray, ddepth=-1, kernel=sharpen_kernel)
                        sharpe = sharpen + 128
                        canny = cv2.Canny(image=sharpe, threshold1=245, threshold2=255, edges=1, apertureSize=3, L2gradient=True)
                        white = np.where(canny != [0])
                        if not len(white[0]) == 0:
                            cv2.imwrite(os.path.join(self.base_path, '1_Frames', os.path.basename(file), f'found_{self.x}.jpg'), image)
                            success, image = cap.read(1)
                            self.x += 1
                        else:
                            success, image = cap.read(1)
                            pass
                    else:
                        cv2.imwrite(os.path.join(self.base_path, '0_Frames', f'found_{self.y}.jpg'), image)
                        success, image = cap.read(1)
                        self.y += 1
            else:
                break
        cap.release()
    cv2.destroyAllWindows()
This method seems to work well, but I am still open to any further ideas or suggestions. I posted this new update in the hope that it may assist someone else looking for ways to detect issues in images.
Using the TensorFlow Object Detection API, I ran into an issue with low detection_scores.
I don't understand how to improve the detection_scores, and when working with the lower detection_scores I get IndexError: list index out of range.
I need a suggestion on how to remove the error.
image_path = "C:/Users/Documents/pdf2txt/invoice.jpg"

def run_inference_for_single_image(image, graph):
    with graph.as_default():
        with tf.Session() as sess:
            # Get handles to input and output tensors
            ops = tf.get_default_graph().get_operations()
            all_tensor_names = {output.name for op in ops for output in op.outputs}
            tensor_dict = {}
            for key in [
                    'num_detections', 'detection_boxes', 'detection_scores',
                    'detection_classes', 'detection_masks'
            ]:
                tensor_name = key + ':0'
                if tensor_name in all_tensor_names:
                    tensor_dict[key] = tf.get_default_graph().get_tensor_by_name(
                        tensor_name)
            if 'detection_masks' in tensor_dict:
                # The following processing is only for single image
                detection_boxes = tf.squeeze(tensor_dict['detection_boxes'], [0])
                detection_masks = tf.squeeze(tensor_dict['detection_masks'], [0])
                # Reframe is required to translate mask from box coordinates to image coordinates and fit the image size.
                real_num_detection = tf.cast(tensor_dict['num_detections'][0], tf.int32)
                detection_boxes = tf.slice(detection_boxes, [0, 0], [real_num_detection, -1])
                detection_masks = tf.slice(detection_masks, [0, 0, 0], [real_num_detection, -1, -1])
                detection_masks_reframed = utils_ops.reframe_box_masks_to_image_masks(
                    detection_masks, detection_boxes, image.shape[0], image.shape[1])
                detection_masks_reframed = tf.cast(
                    tf.greater(detection_masks_reframed, 0.5), tf.uint8)
                # Follow the convention by adding back the batch dimension
                tensor_dict['detection_masks'] = tf.expand_dims(
                    detection_masks_reframed, 0)
            image_tensor = tf.get_default_graph().get_tensor_by_name('image_tensor:0')

            # Run inference
            output_dict = sess.run(tensor_dict,
                                   feed_dict={image_tensor: np.expand_dims(image, 0)})

            # all outputs are float32 numpy arrays, so convert types as appropriate
            output_dict['num_detections'] = int(output_dict['num_detections'][0])
            output_dict['detection_classes'] = output_dict[
                'detection_classes'][0].astype(np.uint8)
            output_dict['detection_boxes'] = output_dict['detection_boxes'][0]
            output_dict['detection_scores'] = output_dict['detection_scores'][0]
            print(output_dict['detection_scores'])
            if 'detection_masks' in output_dict:
                output_dict['detection_masks'] = output_dict['detection_masks'][0]
    return output_dict

for image_path in TEST_IMAGE_PATHS:
    image = Image.open(image_path)
    image_np = load_image_into_numpy_array(image)
    image_np_expanded = np.expand_dims(image_np, axis=0)
    output_dict = run_inference_for_single_image(image_np, detection_graph)
    outImage = Image.fromarray(image_np)
    firstResult = output_dict['detection_boxes'][0]
    firstArray = []
    score = output_dict['detection_scores'][0]
    print(score)
    # if score > float(0.85):
    for coords in firstResult:
        realCoord = coords * 1024
        firstArray.append(realCoord)
        firstImage = image.crop((firstArray[1], firstArray[0], firstArray[3], firstArray[2]))
    outputClass = output_dict['detection_classes'][0]
    parameter = CLASSES[outputClass - 1]
    coordText = str(firstArray[1]) + " " + str(firstArray[0]) + " " + str(firstArray[3]) + " " + str(firstArray[2]) + " " + parameter + 'xout1.tif'
    coordsFile.write(coordText + "\n")
    firstImage.save(r'C:/Users/neerajjha/Documents/pdf2txt/object_detection/Results/' + parameter + 'xout1.tif')
    print(coordsFile)
OUTPUT:
Traceback (most recent call last):
  File "c:/Users/Documents/pdf2txt/server_detection.py", line 260, in <module>
    firstImage = image.crop((firstArray[1], firstArray[0], firstArray[3], firstArray[2]))
IndexError: list index out of range
Please Suggest !!
I think the issue is in this piece of code:
for coords in firstResult:
    realCoord = coords * 1024
    firstArray.append(realCoord)
    firstImage = image.crop((firstArray[1], firstArray[0], firstArray[3], firstArray[2]))
firstResult should contain the 4 coordinates of the bounding box that the model detects. Can you try moving the last line out of the for loop, so that all 4 values are added to firstArray before you use it in the image.crop function?
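In other words, something like this (a sketch of the suggested fix):

firstArray = []
for coords in firstResult:
    firstArray.append(coords * 1024)

# crop only after all four coordinates have been collected
firstImage = image.crop((firstArray[1], firstArray[0], firstArray[3], firstArray[2]))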
TensorFlow version: 1.14
Python version: 3.6.9
My purpose is to build an object detection system with classification. I used the Object Detection API, and I want to feed its output bounding boxes to other neural networks (there are 6 different objects to detect, and then I want to classify these objects with Keras neural networks by the objects' features).
When I use the Object Detection API alone, it's OK, but if I want to use model.predict() the script crashes. From what I've read, there's a problem with graphs and sessions.
I'm pretty new to all this stuff, so I want to ask: is it possible to use multiple models simultaneously?
I've read about creating two sessions and graphs, but the input of the Object Detection model is live video from the webcam and I don't want to lose script performance. I tried starting a session with each frame, but it's very slow.
Also, maybe upgrading the script to TensorFlow 2.0 will be helpful?
EDIT:
I want to detect fruits and pass them to another Keras model which will predict their state. Detecting fruits works well, but I cannot use the additional Keras model, because of the following error:
Tensor Tensor("dense_3/Sigmoid:0", shape=(?, 1), dtype=float32) is not an element of this graph.
Code provided:
import numpy as np
import os
import six.moves.urllib as urllib
import sys
import tarfile
import tensorflow as tf
import zipfile

from collections import defaultdict
from io import StringIO
from matplotlib import pyplot as plt
from PIL import Image
from keras import models
from keras.preprocessing import image
import cv2

if 'cap' in globals():
    cap.release()
cap = cv2.VideoCapture(0)

sys.path.append("..")
graph = tf.get_default_graph()

from utils import label_map_util
from utils import visualization_utils as vis_util

def limit(value, max_val, min_val):
    if(value > max_val):
        value = max_val
    elif(value < min_val):
        value = min_val
    return value

# What model to download.
MODEL_NAME = 'inference_graph'
MODEL_FILE = MODEL_NAME + '.tar.gz'
DOWNLOAD_BASE = 'http://download.tensorflow.org/models/object_detection/'

# Path to frozen detection graph. This is the actual model that is used for the object detection.
PATH_TO_CKPT = MODEL_NAME + '/frozen_inference_graph.pb'

# List of the strings that is used to add correct label for each box.
PATH_TO_LABELS = 'training/labelmap.pbtxt'

NUM_CLASSES = 6

detection_graph = tf.Graph()
with detection_graph.as_default():
    od_graph_def = tf.GraphDef()
    with tf.gfile.GFile(PATH_TO_CKPT, 'rb') as fid:
        serialized_graph = fid.read()
        od_graph_def.ParseFromString(serialized_graph)
        tf.import_graph_def(od_graph_def, name='')

label_map = label_map_util.load_labelmap(PATH_TO_LABELS)
categories = label_map_util.convert_label_map_to_categories(label_map, max_num_classes=NUM_CLASSES, use_display_name=True)
category_index = label_map_util.create_category_index(categories)

def load_image_into_numpy_array_updated(image):
    return np.array(image).astype(np.uint8)

# PATH_TO_TEST_IMAGES_DIR = 'test_images'
# TEST_IMAGE_PATHS = [ os.path.join(PATH_TO_TEST_IMAGES_DIR, 'image{}.jpg'.format(i)) for i in range(1, 3) ]

# Size, in inches, of the output images.
IMAGE_SIZE = (12, 8)

# Loading a keras model
model = models.load_model('new_banana.h5')

with detection_graph.as_default():
    with tf.Session(graph=detection_graph) as sess:
        while True:
            ret, image_np = cap.read()
            # Expand dimensions since the model expects images to have shape: [1, None, None, 3]
            image_np_expanded = np.expand_dims(image_np, axis=0)
            image_tensor = detection_graph.get_tensor_by_name('image_tensor:0')
            # Each box represents a part of the image where a particular object was detected.
            boxes = detection_graph.get_tensor_by_name('detection_boxes:0')
            # Each score represent how level of confidence for each of the objects.
            # Score is shown on the result image, together with the class label.
            scores = detection_graph.get_tensor_by_name('detection_scores:0')
            classes = detection_graph.get_tensor_by_name('detection_classes:0')
            num_detections = detection_graph.get_tensor_by_name('num_detections:0')
            # Actual detection.
            (boxes, scores, classes, num_detections) = sess.run(
                [boxes, scores, classes, num_detections],
                feed_dict={image_tensor: image_np_expanded})

            image_np_copy = image_np.copy()

            # Visualization of the results of a detection.
            vis_util.visualize_boxes_and_labels_on_image_array(
                image_np,
                np.squeeze(boxes),
                np.squeeze(classes).astype(np.int32),
                np.squeeze(scores),
                category_index,
                use_normalized_coordinates=True,
                line_thickness=8,
                min_score_thresh=0.7)

            # Code what are used to get thresholded bounding boxes from image
            # enlarge them about compenser value, limitates them
            # print them and send them to another script
            # 0 - apple, 2 - banana, 3 - orange, 4 - pear, 5 - pepper, 6 - tomato
            min_score_thresh = 0.7
            bboxes = boxes[scores > min_score_thresh]
            bclasses = classes[scores > min_score_thresh]
            image_np_new = cv2.resize(image_np_copy, (800, 600))
            im_width, im_height = (800, 600)

            if bclasses.size > 0:
                final_box = []
                cropped_images = []
                compenser = 30
                if(bclasses[0] == 2):  # if any of detected classes stands for 'banana'
                    for box in bboxes:
                        ymin, xmin, ymax, xmax = box
                        ymin0 = int(im_height * ymin) - compenser
                        ymax0 = int(im_height * ymax) + compenser
                        xmin0 = int(im_width * xmin) - compenser
                        xmax0 = int(im_width * xmax) + compenser
                        ymin1 = limit(ymin0, im_height, 0)
                        ymax1 = limit(ymax0, im_height, 0)
                        xmax1 = limit(xmax0, im_width, 0)
                        xmin1 = limit(xmin0, im_width, 0)
                        image_cropped = image_np_new[ymin1:ymax1, xmin1:xmax1]
                        height, width, _ = image_cropped.shape
                        if width > height:
                            image_cropped = cv2.resize(image_cropped, (200, 150))
                            image_cropped = cv2.rotate(image_cropped, cv2.ROTATE_90_CLOCKWISE)
                        else:
                            image_cropped = cv2.resize(image_cropped, (150, 200))
                        image_cropped = load_image_into_numpy_array_updated(image_cropped)
                        image_cropped = image_cropped.reshape((1,) + image_cropped.shape)
                        image_cropped = image_cropped / 255
                        cropped_images.append(image_cropped)
                    if (len(cropped_images) > 0):
                        for image in cropped_images:
                            print(image.shape)
                            # input tensor 200, 150, 3
                            classes = model.predict_classes(image, batch_size=10)
                            print(classes)

            cv2.imshow('object detection', image_np)
            if cv2.waitKey(25) & 0xFF == ord('q'):
                cv2.destroyAllWindows()
                cap.release()
                break
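A common workaround for the "is not an element of this graph" error in TF 1.x is to capture the graph the Keras model was loaded into and re-enter it around every predict call. A minimal sketch (not a confirmed fix for this exact script):

from keras import models
import tensorflow as tf

# load the Keras model first and remember the graph it was built in
model = models.load_model('new_banana.h5')
keras_graph = tf.get_default_graph()

# later, inside the detection loop, switch back to that graph to predict
def classify_crop(image_cropped):
    with keras_graph.as_default():
        return model.predict_classes(image_cropped, batch_size=10)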
I have set up Google's DeepLab V3 demo on my local system, and it runs successfully after making some minor changes. It looks like this:
# -*- coding: utf-8 -*-
# DeepLab Demo
# This demo will demonstrate the steps to run the deeplab semantic segmentation model on sample input images.

import os
from io import BytesIO
import tarfile
import tempfile
from six.moves import urllib

from matplotlib import gridspec
from matplotlib import pyplot as plt
import numpy as np
from PIL import Image
import tensorflow as tf

class DeepLabModel(object):
    """Class to load deeplab model and run inference."""

    INPUT_TENSOR_NAME = 'ImageTensor:0'
    OUTPUT_TENSOR_NAME = 'SemanticPredictions:0'
    INPUT_SIZE = 513
    FROZEN_GRAPH_NAME = 'frozen_inference_graph'

    def __init__(self, tarball_path):
        """Creates and loads pretrained deeplab model."""
        self.graph = tf.Graph()
        graph_def = None
        # Extract frozen graph from tar archive.
        tar_file = tarfile.open(tarball_path)
        for tar_info in tar_file.getmembers():
            if self.FROZEN_GRAPH_NAME in os.path.basename(tar_info.name):
                file_handle = tar_file.extractfile(tar_info)
                graph_def = tf.GraphDef.FromString(file_handle.read())
                break
        tar_file.close()
        if graph_def is None:
            raise RuntimeError('Cannot find inference graph in tar archive.')
        with self.graph.as_default():
            tf.import_graph_def(graph_def, name='')
        self.sess = tf.Session(graph=self.graph)

    def run(self, image):
        """Runs inference on a single image.

        Args:
            image: A PIL.Image object, raw input image.

        Returns:
            resized_image: RGB image resized from original input image.
            seg_map: Segmentation map of `resized_image`.
        """
        width, height = image.size
        resize_ratio = 1.0 * self.INPUT_SIZE / max(width, height)
        target_size = (int(resize_ratio * width), int(resize_ratio * height))
        resized_image = image.convert('RGB').resize(target_size, Image.ANTIALIAS)
        batch_seg_map = self.sess.run(
            self.OUTPUT_TENSOR_NAME,
            feed_dict={self.INPUT_TENSOR_NAME: [np.asarray(resized_image)]})
        seg_map = batch_seg_map[0]
        return resized_image, seg_map

def create_pascal_label_colormap():
    """Creates a label colormap used in PASCAL VOC segmentation benchmark.

    Returns:
        A Colormap for visualizing segmentation results.
    """
    colormap = np.zeros((256, 3), dtype=int)
    ind = np.arange(256, dtype=int)
    for shift in reversed(range(8)):
        for channel in range(3):
            colormap[:, channel] |= ((ind >> channel) & 1) << shift
        ind >>= 3
    return colormap

def label_to_color_image(label):
    """Adds color defined by the dataset colormap to the label.

    Args:
        label: A 2D array with integer type, storing the segmentation label.

    Returns:
        result: A 2D array with floating type. The element of the array
            is the color indexed by the corresponding element in the input label
            to the PASCAL color map.

    Raises:
        ValueError: If label is not of rank 2 or its value is larger than color
            map maximum entry.
    """
    if label.ndim != 2:
        raise ValueError('Expect 2-D input label')
    colormap = create_pascal_label_colormap()
    if np.max(label) >= len(colormap):
        raise ValueError('label value too large.')
    return colormap[label]

def vis_segmentation(image, seg_map):
    """Visualizes input image, segmentation map and overlay view."""
    plt.figure(figsize=(15, 5))
    grid_spec = gridspec.GridSpec(1, 4, width_ratios=[6, 6, 6, 1])

    plt.subplot(grid_spec[0])
    plt.imshow(image)
    plt.axis('off')
    plt.title('input image')

    plt.subplot(grid_spec[1])
    seg_image = label_to_color_image(seg_map).astype(np.uint8)
    plt.imshow(seg_image)
    plt.axis('off')
    plt.title('segmentation map')

    plt.subplot(grid_spec[2])
    plt.imshow(image)
    plt.imshow(seg_image, alpha=0.7)
    plt.axis('off')
    plt.title('segmentation overlay')

    unique_labels = np.unique(seg_map)
    ax = plt.subplot(grid_spec[3])
    plt.imshow(
        FULL_COLOR_MAP[unique_labels].astype(np.uint8), interpolation='nearest')
    ax.yaxis.tick_right()
    plt.yticks(range(len(unique_labels)), LABEL_NAMES[unique_labels])
    plt.xticks([], [])
    ax.tick_params(width=0.0)
    plt.grid('off')
    plt.show()

LABEL_NAMES = np.asarray([
    'background', 'aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus',
    'car', 'cat', 'chair', 'cow', 'diningtable', 'dog', 'horse', 'motorbike',
    'person', 'pottedplant', 'sheep', 'sofa', 'train', 'tv'
])

FULL_LABEL_MAP = np.arange(len(LABEL_NAMES)).reshape(len(LABEL_NAMES), 1)
FULL_COLOR_MAP = label_to_color_image(FULL_LABEL_MAP)

#@title Select and download models {display-mode: "form"}
MODEL_NAME = 'mobilenetv2_coco_voctrainaug'  #@param ['mobilenetv2_coco_voctrainaug', 'mobilenetv2_coco_voctrainval', 'xception_coco_voctrainaug', 'xception_coco_voctrainval']

_DOWNLOAD_URL_PREFIX = 'http://download.tensorflow.org/models/'
_MODEL_URLS = {
    'mobilenetv2_coco_voctrainaug':
        'deeplabv3_mnv2_pascal_train_aug_2018_01_29.tar.gz',
    'mobilenetv2_coco_voctrainval':
        'deeplabv3_mnv2_pascal_trainval_2018_01_29.tar.gz',
    'xception_coco_voctrainaug':
        'deeplabv3_pascal_train_aug_2018_01_04.tar.gz',
    'xception_coco_voctrainval':
        'deeplabv3_pascal_trainval_2018_01_04.tar.gz',
}
_TARBALL_NAME = 'deeplab_model.tar.gz'

model_dir = tempfile.mkdtemp()
tf.gfile.MakeDirs(model_dir)

download_path = os.path.join(model_dir, _TARBALL_NAME)
print('downloading model, this might take a while...')
urllib.request.urlretrieve(_DOWNLOAD_URL_PREFIX + _MODEL_URLS[MODEL_NAME],
                           download_path)
print('download completed! loading DeepLab model...')

MODEL = DeepLabModel(download_path)
print('model loaded successfully!')

# """## Run on sample images
#
# Select one of sample images (leave `IMAGE_URL` empty) or feed any internet image
# url for inference.
#
# Note that we are using single scale inference in the demo for fast computation,
# so the results may slightly differ from the visualizations in
# [README](https://github.com/tensorflow/models/blob/master/research/deeplab/README.md),
# which uses multi-scale and left-right flipped inputs.
# """

#@title Run on sample images {display-mode: "form"}
SAMPLE_IMAGE = 'image1.jpg'  #@param ['image1', 'image2', 'image3']
IMAGE_URL = 'https://raw.githubusercontent.com/tensorflow/models/master/research/deeplab/g3doc/img/image1.jpg'  #@param {type:"string"}

_SAMPLE_URL = ('https://github.com/tensorflow/models/blob/master/research/'
               'deeplab/g3doc/img/%s.jpg?raw=true')

def run_visualization(url):
    """Inferences DeepLab model and visualizes result."""
    try:
        # f = urllib.request.urlopen(url)
        # jpeg_str = f.read()
        # original_im = Image.open(BytesIO(jpeg_str))
        original_im = Image.open("human.jpg")
    except IOError:
        print('Cannot retrieve image. Please check url: ' + url)
        return
    print('running deeplab on image %s...' % url)
    resized_im, seg_map = MODEL.run(original_im)
    vis_segmentation(resized_im, seg_map)

image_url = SAMPLE_IMAGE
run_visualization(SAMPLE_IMAGE)
I have used various images with this model and it's working. Here's an example output:
Now I need to extract the mask as a separate image. How can I achieve that?
Thanks in advance!
The seg_map holds the segmented image:
resized_im, seg_map = MODEL.run(original_im)
It's an image array you can display with matplotlib. You can convert it into a numpy array using np.array(seg_map), or use it whatever way you like.
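For completeness, a minimal sketch of saving the mask as its own image, using the demo's own label_to_color_image helper (class index 15 is 'person' in the demo's LABEL_NAMES; adjust for your target class):

import numpy as np
from PIL import Image

resized_im, seg_map = MODEL.run(original_im)

# color-coded mask, using the demo's own colormap helper
color_mask = label_to_color_image(seg_map).astype(np.uint8)
Image.fromarray(color_mask).save('mask.png')

# or a binary mask for a single class, e.g. 'person'
person_mask = (seg_map == 15).astype(np.uint8) * 255
Image.fromarray(person_mask).save('person_mask.png')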