Scalar array error in facial similarity python script created in ChatGPT - python

Getting error: line 50, in
faceA = preprocess_face(rgbA[boxesA[0][1]:boxesA[0][3], boxesA[0][0]:boxesA[0][2]])
TypeError: only integer scalar arrays can be converted to a scalar index
import os
import argparse
import cv2
from deepface import DeepFace
import numpy as np
# construct the argument parse and parse the arguments
ap = argparse.ArgumentParser()
ap.add_argument("-f", "--first", required=True,
help="first input image")
ap.add_argument("-d", "--directory", required=True,
help="directory of images to compare")
args = vars(ap.parse_args())
# load the first input image
imageA = cv2.imread(args["first"])
rgbA = cv2.cvtColor(imageA, cv2.COLOR_BGR2RGB)
# detect the face in the first image
boxesA = DeepFace.detectFace(rgbA)
# make sure there is a face in the first image
if len(boxesA) == 0:
print("No face detected in the first image")
exit()
def preprocess_face(face, size=(96, 96)):
# extract the face ROI and resize it to the desired size
face = cv2.resize(face, size)
# compute the scaling factor for the images
factor_0 = size[0] / face.shape[0]
factor_1 = size[1] / face.shape[1]
factor = np.min([factor_0, factor_1])
# stretch the face ROI to the desired size
face = cv2.resize(face, None, fx=factor, fy=factor)
# convert the face ROI to grayscale
gray = cv2.cvtColor(face, cv2.COLOR_BGR2GRAY)
# normalize the grayscale image
gray = gray / 255.0
# return the preprocessed face
return [gray]
boxesA = boxesA.astype(int)
# extract the face encoding for the first image
faceA = preprocess_face(rgbA[boxesA[0][1]:boxesA[0][3], boxesA[0][0]:boxesA[0][2]])
encodingA = DeepFace.detectFace(faceA, boxesA)[0]
# initialize a dictionary to store the image names and scores
scores = {}
# loop over the images in the directory
for image_name in os.listdir(args["directory"]):
# load the image
imageB = cv2.imread(os.path.join(args["directory"], image_name))
rgbB = cv2.cvtColor(imageB, cv2.COLOR_BGR2RGB)
# detect the face in the image
boxesB = DeepFace.detectFace(rgbB, enforce_detection=False)
# make sure there is a face in the image
if len(boxesB) == 0:
continue
boxesB = boxesB.astype(int)
# extract the face encoding for the image
facesB = []
for i in range(len(boxesB)):
faceB = preprocess_face(rgbB[boxesB[i][1]:boxesB[i][3], boxesB[i][0]:boxesB[i][2]])
facesB.extend(faceB)
encodingB = DeepFace.detectFace(facesB, boxesB)[0]
# compare the face encodings
score = DeepFace.verifyFace(encodingA, encodingB)
similarity_percentage = score * 100
# store the image name and score in the dictionary
scores[image_name] = score
# sort the scores in descending order
sorted_scores = sorted(scores.items(), key=lambda x: x[1], reverse=True)
# display the top 10 scores
for i in range(10):
image_name, score = sorted_scores[i]
print("{}: {}".format(image_name, score))
Usage of the script is: python Image_Comparison_Deepface.py -f C:/folder/image.png -d C:/testpics
Script was created with ChatGPT.

Related

Perform Macro-Blocking detection within given x, y coordinates using Python OpenCV

I am trying to develop a script which will detect pixelation from LiveTV from an external camera. To test my script I have been using a short snippet of LiveTV which has two instances of pixelation.
See Google Drive below for video:
https://drive.google.com/file/d/1f339HJSWKhyPr1y5sf9tWW4vcXgBOVbz/view?usp=sharing
Currently I am able to filter out most of the noise in the video, and detect the pixelation. However, I am also detecting the white text (given the intensity of the text it gets picked up by the kernel I am applying).
See the code below:
import cv2
import numpy as np
cap = cv2.VideoCapture("./hgtv_short.ts")
while True:
success, image = cap.read()
gray = cv2.cvtColor(src=image, code=cv2.COLOR_BGR2GRAY)
sharpen_kernel = np.array([[.4, .4], [-2.25, -2.25], [.4, .4]])
sharpen = cv2.filter2D(src=gray, ddepth=-1, kernel=sharpen_kernel)
sharpe = sharpen + 128
canny = cv2.Canny(image=sharpe, threshold1=245, threshold2=255, edges=1, apertureSize=3, L2gradient=True)
white = np.where(canny != [0])
coordinates = zip(white[1], white[0])
for p in coordinates:
cv2.circle(canny, p, 30, (200, 0, 0), 2)
cv2.imshow('image', image)
cv2.imshow('edges', canny)
cv2.waitKey(1)
What I would like to do is apply a threshold and findContours to the given coordinates to see if text is in the region. Then I can discern between actual pixelation and text.
NOTE:
If anyone has any other ideas on finding pixelation I am open to suggestions.
UPDATE
Here is a screenshot from the video showing the type of pixelation I am looking for in this video (macro-blocking) to be specific.
Image
Edges
From the above Images you can see that I am detecting the macro-blocking, but also the white text. I would like to be able to discern between text and actual macro-blocking.
SECOND UPDATE
After more trial and error, I found that it will be best to use some sort of reference model to help predict when an image is showing macro-blocking, pixelation, artifacts, etc...
I have decided to use the hog(Histogram of Oriented Gradients) descriptor to create my feature vector. I have created to functions, one loops through the GOOD images and the other the BAD images:
def pos_train_set(self):
print("Starting to Gather Positive Photos")
for pos_file in glob.iglob(os.path.join(self.base_path, "Bad_Images", "*.jpg")):
pos_img = cv2.imread(pos_file, 1)
pos_img = cv2.resize(pos_img, self.winSize, interpolation=cv2.CV_32F)
pos_des = self.hog.compute(pos_img)
pos_des = cv2.normalize(pos_des, None)
self.labels.append(1)
self.training_data.append(pos_des)
print("Gathered Positive Photos")
def neg_train_set(self):
print("Starting to Gather Negative Photos")
for neg_file in glob.iglob(os.path.join(self.base_path, "Good_Images", "*.jpg")):
neg_img = cv2.imread(neg_file, 1)
neg_img = cv2.resize(neg_img, self.winSize, interpolation=cv2.CV_32F)
neg_des = self.hog.compute(neg_img)
neg_des = cv2.normalize(neg_des, None)
self.labels.append(0)
self.training_data.append(neg_des)
print("Gathered Negative Photos")
I then train my model using the SVM(Support Vector Machines) classification algorithm.
def train_set(self):
print("Starting to Convert")
td = np.float32(self.training_data)
lab = np.array(self.labels)
print("Converted List")
print("Starting Shuffle")
rand = np.random.RandomState(10)
shuffle = rand.permutation(len(td))
td = td[shuffle]
lab = lab[shuffle]
print("Shuffled List")
print("Starting SVM")
svm = cv2.ml.SVM_create()
svm.setType(cv2.ml.SVM_C_SVC)
# Exponential Chi2 kernel, similar to the RBF kernel: K(xi,xj)=e−γχ2(xi,xj),χ2(xi,xj)=(xi−xj)2/(xi+xj),γ>0.
svm.setKernel(cv2.ml.SVM_CHI2)
svm.setTermCriteria((cv2.TERM_CRITERIA_MAX_ITER, 100, 1e-6))
svm.setGamma(5.383)
svm.setC(2.67)
print("Starting Training")
svm.train(td, cv2.ml.ROW_SAMPLE, lab)
print("Saving to .yml")
svm.save(os.path.join(self.base_path, "svm_model.yml"))
I then use that SVM model to try and predict if an image is a 1 (Bad Image) or a 0 (Good Image). With the help of the kernel and edge detection I used in my first attempt:
def predict(self):
svm = cv2.ml.SVM_load("./svm_model.yml")
for file in self.files:
os.mkdir(os.path.join(self.base_path, "1_Frames", os.path.basename(file)))
print(f"Starting predict on {file}")
cap = cv2.VideoCapture(file)
while cap.isOpened():
success, image = cap.read(1)
if success:
img = cv2.resize(image, self.winSize, interpolation=cv2.CV_32F)
test_data = self.hog.compute(img)
test_data = cv2.normalize(test_data, None)
test_data = np.float32(test_data)
test_data = np.transpose(test_data)
if not np.any(test_data):
print("Invalid Dimension")
success, image = cap.read(1)
print(f"New Frame {success}")
else:
response = svm.predict(test_data)[1]
if response == 1:
gray = cv2.cvtColor(src=image, code=cv2.COLOR_BGR2GRAY)
sharpen_kernel = np.array([[.4, .4], [-2.25, -2.25], [.4, .4]])
sharpen = cv2.filter2D(src=gray, ddepth=-1, kernel=sharpen_kernel)
sharpe = sharpen + 128
canny = cv2.Canny(image=sharpe, threshold1=245, threshold2=255, edges=1, apertureSize=3, L2gradient=True)
white = np.where(canny != [0])
if not len(white[0]) == 0:
cv2.imwrite(os.path.join(self.base_path, '1_Frames', os.path.basename(file), f'found_{self.x}.jpg'), image)
success, image = cap.read(1)
self.x += 1
else:
success, image = cap.read(1)
pass
else:
cv2.imwrite(os.path.join(self.base_path, '0_Frames', f'found_{self.y}.jpg'), image)
success, image = cap.read(1)
self.y += 1
else:
break
cap.release()
cv2.destroyAllWindows()
This method seems to work well, but I am still open to any further ideas of suggestions. I posted this new update in hopes it may assist someone else looking for suggestions on how to detect issues in images.

How to get the classes name on object detection tensorflow

How to get the classes name of prediction?
the code (from https://tensorflow-object-detection-api-tutorial.readthedocs.io/en/latest/auto_examples/object_detection_camera.html#sphx-glr-auto-examples-object-detection-camera-py)
import os
DATA_DIR = os.path.join(os.getcwd(), 'data')
MODELS_DIR = os.path.join(DATA_DIR, 'models')
for dir in [DATA_DIR, MODELS_DIR]:
if not os.path.exists(dir):
os.mkdir(dir)
import tarfile
import urllib.request
# Download and extract model
MODEL_DATE = '20200711'
MODEL_NAME = 'ssd_resnet101_v1_fpn_640x640_coco17_tpu-8'
MODEL_TAR_FILENAME = MODEL_NAME + '.tar.gz'
MODELS_DOWNLOAD_BASE = 'http://download.tensorflow.org/models/object_detection/tf2/'
MODEL_DOWNLOAD_LINK = MODELS_DOWNLOAD_BASE + MODEL_DATE + '/' + MODEL_TAR_FILENAME
PATH_TO_MODEL_TAR = os.path.join(MODELS_DIR, MODEL_TAR_FILENAME)
PATH_TO_CKPT = os.path.join(MODELS_DIR, os.path.join(MODEL_NAME, 'checkpoint/'))
PATH_TO_CFG = os.path.join(MODELS_DIR, os.path.join(MODEL_NAME, 'pipeline.config'))
if not os.path.exists(PATH_TO_CKPT):
print('Downloading model. This may take a while... ', end='')
urllib.request.urlretrieve(MODEL_DOWNLOAD_LINK, PATH_TO_MODEL_TAR)
tar_file = tarfile.open(PATH_TO_MODEL_TAR)
tar_file.extractall(MODELS_DIR)
tar_file.close()
os.remove(PATH_TO_MODEL_TAR)
print('Done')
# Download labels file
LABEL_FILENAME = 'mscoco_label_map.pbtxt'
LABELS_DOWNLOAD_BASE = \
'https://raw.githubusercontent.com/tensorflow/models/master/research/object_detection/data/'
PATH_TO_LABELS = os.path.join(MODELS_DIR, os.path.join(MODEL_NAME, LABEL_FILENAME))
if not os.path.exists(PATH_TO_LABELS):
print('Downloading label file... ', end='')
urllib.request.urlretrieve(LABELS_DOWNLOAD_BASE + LABEL_FILENAME, PATH_TO_LABELS)
print('Done')
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' # Suppress TensorFlow logging
import tensorflow as tf
from object_detection.utils import label_map_util
from object_detection.utils import config_util
from object_detection.utils import visualization_utils as viz_utils
from object_detection.builders import model_builder
tf.get_logger().setLevel('ERROR') # Suppress TensorFlow logging (2)
# Enable GPU dynamic memory allocation
gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
tf.config.experimental.set_memory_growth(gpu, True)
# Load pipeline config and build a detection model
configs = config_util.get_configs_from_pipeline_file(PATH_TO_CFG)
model_config = configs['model']
detection_model = model_builder.build(model_config=model_config, is_training=False)
# Restore checkpoint
ckpt = tf.compat.v2.train.Checkpoint(model=detection_model)
ckpt.restore(os.path.join(PATH_TO_CKPT, 'ckpt-0')).expect_partial()
# Load pipeline config and build a detection model
configs = config_util.get_configs_from_pipeline_file(PATH_TO_CFG)
model_config = configs['model']
detection_model = model_builder.build(model_config=model_config, is_training=False)
# Restore checkpoint
ckpt = tf.compat.v2.train.Checkpoint(model=detection_model)
ckpt.restore(os.path.join(PATH_TO_CKPT, 'ckpt-0')).expect_partial()
def get_model_detection_function(model):
#tf.function
def detect_fn(image):
"""Detect objects in image."""
image, shapes = model.preprocess(image)
prediction_dict = model.predict(image, shapes)
detections = model.postprocess(prediction_dict, shapes)
return detections, prediction_dict, tf.reshape(shapes, [-1])
return detect_fn
detect_fn = get_model_detection_function(detection_model)
category_index = label_map_util.create_category_index_from_labelmap(PATH_TO_LABELS,
use_display_name=True)
import cv2
cap = cv2.VideoCapture(0)
import numpy as np
while True:
# Read frame from camera
ret, image_np = cap.read()
# Expand dimensions since the model expects images to have shape: [1, None, None, 3]
image_np_expanded = np.expand_dims(image_np, axis=0)
# Things to try:
# Flip horizontally
# image_np = np.fliplr(image_np).copy()
# Convert image to grayscale
# image_np = np.tile(
# np.mean(image_np, 2, keepdims=True), (1, 1, 3)).astype(np.uint8)
input_tensor = tf.convert_to_tensor(np.expand_dims(image_np, 0), dtype=tf.float32)
detections, predictions_dict, shapes = detect_fn(input_tensor)
label_id_offset = 1
image_np_with_detections = image_np.copy()
viz_utils.visualize_boxes_and_labels_on_image_array(
image_np_with_detections,
detections['detection_boxes'][0].numpy(),
(detections['detection_classes'][0].numpy() + label_id_offset).astype(int),
detections['detection_scores'][0].numpy(),
category_index,
use_normalized_coordinates=True,
max_boxes_to_draw=200,
min_score_thresh=.30,
agnostic_mode=False)
# Display output
cv2.imshow('object detection', cv2.resize(image_np_with_detections, (800, 600)))
if cv2.waitKey(25) & 0xFF == ord('q'):
break
cap.release()
cv2.destroyAllWindows()
actually the code is working fine but i want to get the class name to make some action..
for example :
if variable_name_class == 'cat':
{action 1}
elif variable_name_class == 'dog':
{action 2}
maybe to be like this
while True:
# Read frame from camera
ret, image_np = cap.read()
# Expand dimensions since the model expects images to have shape: [1, None, None, 3]
image_np_expanded = np.expand_dims(image_np, axis=0)
# Things to try:
# Flip horizontally
# image_np = np.fliplr(image_np).copy()
# Convert image to grayscale
# image_np = np.tile(
# np.mean(image_np, 2, keepdims=True), (1, 1, 3)).astype(np.uint8)
input_tensor = tf.convert_to_tensor(np.expand_dims(image_np, 0), dtype=tf.float32)
detections, predictions_dict, shapes = detect_fn(input_tensor)
label_id_offset = 1
image_np_with_detections = image_np.copy()
viz_utils.visualize_boxes_and_labels_on_image_array(
image_np_with_detections,
detections['detection_boxes'][0].numpy(),
(detections['detection_classes'][0].numpy() + label_id_offset).astype(int),
detections['detection_scores'][0].numpy(),
category_index,
use_normalized_coordinates=True,
max_boxes_to_draw=200,
min_score_thresh=.30,
agnostic_mode=False)
# Display output
cv2.imshow('object detection', cv2.resize(image_np_with_detections, (800, 600)))
if variable_name_class == 'cat':
{action 1}
elif variable_name_class == 'dog':
{action 2}
if cv2.waitKey(25) & 0xFF == ord('q'):
break
cap.release()
cv2.destroyAllWindows()
I don't know the 'variable_name_class' ..
anyone know how to get the classes name? or the 'variable_name_class' please help me..
Copy and paste the whole code from this link to your jupyter notebook cell:
https://github.com/tensorflow/models/blob/master/research/object_detection/utils/visualization_utils.py
You should copy all those 1500 lines of code.
After copying all the code to jupyter notebook cell, you need to change some part of the code. Our aim is to return the detected label along with image array inside the visualize_boxes_and_labels_on_image_array() function
So, part of the code for visualize_boxes_and_labels_on_image_array() function should look like below code:
Update the code for visualize_boxes_and_labels_on_image_array() function which should look like below.
def visualize_boxes_and_labels_on_image_array(
image,
boxes,
classes,
scores,
category_index,
instance_masks=None,
instance_boundaries=None,
keypoints=None,
keypoint_scores=None,
keypoint_edges=None,
track_ids=None,
use_normalized_coordinates=False,
max_boxes_to_draw=20,
min_score_thresh=.5,
agnostic_mode=False,
line_thickness=4,
mask_alpha=.4,
groundtruth_box_visualization_color='black',
skip_boxes=False,
skip_scores=False,
skip_labels=False,
skip_track_ids=False):
"""Overlay labeled boxes on an image with formatted scores and label names.
This function groups boxes that correspond to the same location
and creates a display string for each detection and overlays these
on the image. Note that this function modifies the image in place, and returns
that same image.
Args:
image: uint8 numpy array with shape (img_height, img_width, 3)
boxes: a numpy array of shape [N, 4]
classes: a numpy array of shape [N]. Note that class indices are 1-based,
and match the keys in the label map.
scores: a numpy array of shape [N] or None. If scores=None, then
this function assumes that the boxes to be plotted are groundtruth
boxes and plot all boxes as black with no classes or scores.
category_index: a dict containing category dictionaries (each holding
category index `id` and category name `name`) keyed by category indices.
instance_masks: a uint8 numpy array of shape [N, image_height, image_width],
can be None.
instance_boundaries: a numpy array of shape [N, image_height, image_width]
with values ranging between 0 and 1, can be None.
keypoints: a numpy array of shape [N, num_keypoints, 2], can
be None.
keypoint_scores: a numpy array of shape [N, num_keypoints], can be None.
keypoint_edges: A list of tuples with keypoint indices that specify which
keypoints should be connected by an edge, e.g. [(0, 1), (2, 4)] draws
edges from keypoint 0 to 1 and from keypoint 2 to 4.
track_ids: a numpy array of shape [N] with unique track ids. If provided,
color-coding of boxes will be determined by these ids, and not the class
indices.
use_normalized_coordinates: whether boxes is to be interpreted as
normalized coordinates or not.
max_boxes_to_draw: maximum number of boxes to visualize. If None, draw
all boxes.
min_score_thresh: minimum score threshold for a box or keypoint to be
visualized.
agnostic_mode: boolean (default: False) controlling whether to evaluate in
class-agnostic mode or not. This mode will display scores but ignore
classes.
line_thickness: integer (default: 4) controlling line width of the boxes.
mask_alpha: transparency value between 0 and 1 (default: 0.4).
groundtruth_box_visualization_color: box color for visualizing groundtruth
boxes
skip_boxes: whether to skip the drawing of bounding boxes.
skip_scores: whether to skip score when drawing a single detection
skip_labels: whether to skip label when drawing a single detection
skip_track_ids: whether to skip track id when drawing a single detection
Returns:
uint8 numpy array with shape (img_height, img_width, 3) with overlaid boxes.
"""
# Create a display string (and color) for every box location, group any boxes
# that correspond to the same location.
box_to_display_str_map = collections.defaultdict(list)
box_to_color_map = collections.defaultdict(str)
box_to_instance_masks_map = {}
box_to_instance_boundaries_map = {}
box_to_keypoints_map = collections.defaultdict(list)
box_to_keypoint_scores_map = collections.defaultdict(list)
box_to_track_ids_map = {}
if not max_boxes_to_draw:
max_boxes_to_draw = boxes.shape[0]
for i in range(boxes.shape[0]):
if max_boxes_to_draw == len(box_to_color_map):
break
if scores is None or scores[i] > min_score_thresh:
box = tuple(boxes[i].tolist())
if instance_masks is not None:
box_to_instance_masks_map[box] = instance_masks[i]
if instance_boundaries is not None:
box_to_instance_boundaries_map[box] = instance_boundaries[i]
if keypoints is not None:
box_to_keypoints_map[box].extend(keypoints[i])
if keypoint_scores is not None:
box_to_keypoint_scores_map[box].extend(keypoint_scores[i])
if track_ids is not None:
box_to_track_ids_map[box] = track_ids[i]
if scores is None:
box_to_color_map[box] = groundtruth_box_visualization_color
else:
display_str = ''
if not skip_labels:
if not agnostic_mode:
if classes[i] in six.viewkeys(category_index):
class_name = category_index[classes[i]]['name']
else:
class_name = 'N/A'
display_str = str(class_name)
final_label = display_str
if not skip_scores:
if not display_str:
display_str = '{}%'.format(round(100*scores[i]))
final_label = display_str
else:
display_str = '{}: {}%'.format(display_str, round(100*scores[i]))
if not skip_track_ids and track_ids is not None:
if not display_str:
display_str = 'ID {}'.format(track_ids[i])
final_label = track_ids[i]
else:
display_str = '{}: ID {}'.format(display_str, track_ids[i])
box_to_display_str_map[box].append(display_str)
if agnostic_mode:
box_to_color_map[box] = 'DarkOrange'
elif track_ids is not None:
prime_multipler = _get_multiplier_for_color_randomness()
box_to_color_map[box] = STANDARD_COLORS[
(prime_multipler * track_ids[i]) % len(STANDARD_COLORS)]
else:
box_to_color_map[box] = STANDARD_COLORS[
classes[i] % len(STANDARD_COLORS)]
# Draw all boxes onto image.
for box, color in box_to_color_map.items():
ymin, xmin, ymax, xmax = box
if instance_masks is not None:
draw_mask_on_image_array(
image,
box_to_instance_masks_map[box],
color=color,
alpha=mask_alpha
)
if instance_boundaries is not None:
draw_mask_on_image_array(
image,
box_to_instance_boundaries_map[box],
color='red',
alpha=1.0
)
draw_bounding_box_on_image_array(
image,
ymin,
xmin,
ymax,
xmax,
color=color,
thickness=0 if skip_boxes else line_thickness,
display_str_list=box_to_display_str_map[box],
use_normalized_coordinates=use_normalized_coordinates)
if keypoints is not None:
keypoint_scores_for_box = None
if box_to_keypoint_scores_map:
keypoint_scores_for_box = box_to_keypoint_scores_map[box]
draw_keypoints_on_image_array(
image,
box_to_keypoints_map[box],
keypoint_scores_for_box,
min_score_thresh=min_score_thresh,
color=color,
radius=line_thickness / 2,
use_normalized_coordinates=use_normalized_coordinates,
keypoint_edges=keypoint_edges,
keypoint_edge_color=color,
keypoint_edge_width=line_thickness // 2)
return final_label, image
Here, I have just created a variable called final_label which is the class name of the detected bounding box.
While calling this function, you have to use this code:
label, _ = visualize_boxes_and_labels_on_image_array(
image_np_with_detections, #change this according to your image array name
detections['detection_boxes'],
detections['detection_classes']+label_id_offset,
detections['detection_scores'],
category_index,
use_normalized_coordinates=True,
max_boxes_to_draw=8,
min_score_thresh=.50,
agnostic_mode=False)
print(label)
It will give the exact label detected.
Though, it's been some time, since this question was posted, if someone still want's to get the class-names without changing the code (as suggested above), one can post these two lines after getting visualization on the image.
classes = [cls for cls in detections['detection_classes'][detections['detection_scores'] > threshold]]
classes = [category_index.get(cls)['name'] for cls in classes]

Realtime yolov5 detection with Desktop screen as input

I have a script that grabs an application's screenshot and displays it. it works quite nicely on my machine like a video with around 60FPS.
import os
os.getcwd()
from PIL import ImageGrab
import numpy as np
import cv2
import pyautogui
import win32gui
import time
from mss import mss
from PIL import Image
import tempfile
os.system('calc')
sct = mss()
xx=1
tstart = time.time()
while xx<10000:
hwnd = win32gui.FindWindow(None, 'Calculator')
left_x, top_y, right_x, bottom_y = win32gui.GetWindowRect(hwnd)
#screen = np.array(ImageGrab.grab( bbox = (left_x, top_y, right_x, bottom_y ) ) )
bbox = {'top': top_y, 'left': left_x, 'width': right_x-left_x, 'height':bottom_y-top_y }
screen = sct.grab(bbox)
scr = np.array(screen)
cv2.imshow('window', scr)
if cv2.waitKey(25) & 0xFF == ord('q'):
cv2.destroyAllWindows()
break
xx+=1
cv2.destroyAllWindows()
tend = time.time()
print(xx/(tend-tstart))
print((tend-tstart))
os.system('taskkill /f /im calculator.exe')
I would like to run yolov5's detect.py on this scr image without having to save to disk all the time. I'd also like to show the images with bounding boxes and have their coordinates saved somewhere.
My python level is not good enough, I tried importing detect and adding arguments, but it doesn't seem like it accepts any function parameter, only command line arguments.
Perhaps I should adapt this line, or use opencv?
parser.add_argument('--source', type=str, default='data/images', help='source') # file/folder, 0 for webcam
Any idea? thanks (this is the detect.py file for yolov5)
import argparse
import time
from pathlib import Path
import cv2
import torch
import torch.backends.cudnn as cudnn
from numpy import random
from models.experimental import attempt_load
from utils.datasets import LoadStreams, LoadImages
from utils.general import check_img_size, non_max_suppression, apply_classifier, scale_coords, xyxy2xywh, \
strip_optimizer, set_logging, increment_path
from utils.plots import plot_one_box
from utils.torch_utils import select_device, load_classifier, time_synchronized
def detect(save_img=False):
source, weights, view_img, save_txt, imgsz = opt.source, opt.weights, opt.view_img, opt.save_txt, opt.img_size
webcam = source.isnumeric() or source.endswith('.txt') or source.lower().startswith(
('rtsp://', 'rtmp://', 'http://'))
# Directories
save_dir = Path(increment_path(Path(opt.project) / opt.name, exist_ok=opt.exist_ok)) # increment run
(save_dir / 'labels' if save_txt else save_dir).mkdir(parents=True, exist_ok=True) # make dir
# Initialize
set_logging()
device = select_device(opt.device)
half = device.type != 'cpu' # half precision only supported on CUDA
# Load model
model = attempt_load(weights, map_location=device) # load FP32 model
imgsz = check_img_size(imgsz, s=model.stride.max()) # check img_size
if half:
model.half() # to FP16
# Second-stage classifier
classify = False
if classify:
modelc = load_classifier(name='resnet101', n=2) # initialize
modelc.load_state_dict(torch.load('weights/resnet101.pt', map_location=device)['model']).to(device).eval()
# Set Dataloader
vid_path, vid_writer = None, None
if webcam:
view_img = True
cudnn.benchmark = True # set True to speed up constant image size inference
dataset = LoadStreams(source, img_size=imgsz)
else:
save_img = True
dataset = LoadImages(source, img_size=imgsz)
# Get names and colors
names = model.module.names if hasattr(model, 'module') else model.names
colors = [[random.randint(0, 255) for _ in range(3)] for _ in names]
# Run inference
t0 = time.time()
img = torch.zeros((1, 3, imgsz, imgsz), device=device) # init img
_ = model(img.half() if half else img) if device.type != 'cpu' else None # run once
for path, img, im0s, vid_cap in dataset:
img = torch.from_numpy(img).to(device)
img = img.half() if half else img.float() # uint8 to fp16/32
img /= 255.0 # 0 - 255 to 0.0 - 1.0
if img.ndimension() == 3:
img = img.unsqueeze(0)
# Inference
t1 = time_synchronized()
pred = model(img, augment=opt.augment)[0]
# Apply NMS
pred = non_max_suppression(pred, opt.conf_thres, opt.iou_thres, classes=opt.classes, agnostic=opt.agnostic_nms)
t2 = time_synchronized()
# Apply Classifier
if classify:
pred = apply_classifier(pred, modelc, img, im0s)
# Process detections
for i, det in enumerate(pred): # detections per image
if webcam: # batch_size >= 1
p, s, im0 = Path(path[i]), '%g: ' % i, im0s[i].copy()
else:
p, s, im0 = Path(path), '', im0s
save_path = str(save_dir / p.name)
txt_path = str(save_dir / 'labels' / p.stem) + ('_%g' % dataset.frame if dataset.mode == 'video' else '')
s += '%gx%g ' % img.shape[2:] # print string
gn = torch.tensor(im0.shape)[[1, 0, 1, 0]] # normalization gain whwh
if len(det):
# Rescale boxes from img_size to im0 size
det[:, :4] = scale_coords(img.shape[2:], det[:, :4], im0.shape).round()
# Print results
for c in det[:, -1].unique():
n = (det[:, -1] == c).sum() # detections per class
s += '%g %ss, ' % (n, names[int(c)]) # add to string
# Write results
for *xyxy, conf, cls in reversed(det):
if save_txt: # Write to file
xywh = (xyxy2xywh(torch.tensor(xyxy).view(1, 4)) / gn).view(-1).tolist() # normalized xywh
line = (cls, *xywh, conf) if opt.save_conf else (cls, *xywh) # label format
with open(txt_path + '.txt', 'a') as f:
f.write(('%g ' * len(line)).rstrip() % line + '\n')
if save_img or view_img: # Add bbox to image
label = '%s %.2f' % (names[int(cls)], conf)
plot_one_box(xyxy, im0, label=label, color=colors[int(cls)], line_thickness=3)
# Print time (inference + NMS)
print('%sDone. (%.3fs)' % (s, t2 - t1))
# Stream results
if view_img:
cv2.imshow(str(p), im0)
if cv2.waitKey(1) == ord('q'): # q to quit
raise StopIteration
# Save results (image with detections)
if save_img:
if dataset.mode == 'images':
cv2.imwrite(save_path, im0)
else:
if vid_path != save_path: # new video
vid_path = save_path
if isinstance(vid_writer, cv2.VideoWriter):
vid_writer.release() # release previous video writer
fourcc = 'mp4v' # output video codec
fps = vid_cap.get(cv2.CAP_PROP_FPS)
w = int(vid_cap.get(cv2.CAP_PROP_FRAME_WIDTH))
h = int(vid_cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
vid_writer = cv2.VideoWriter(save_path, cv2.VideoWriter_fourcc(*fourcc), fps, (w, h))
vid_writer.write(im0)
if save_txt or save_img:
s = f"\n{len(list(save_dir.glob('labels/*.txt')))} labels saved to {save_dir / 'labels'}" if save_txt else ''
print(f"Results saved to {save_dir}{s}")
print('Done. (%.3fs)' % (time.time() - t0))
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--weights', nargs='+', type=str, default='yolov5s.pt', help='model.pt path(s)')
parser.add_argument('--source', type=str, default='data/images', help='source') # file/folder, 0 for webcam
parser.add_argument('--img-size', type=int, default=640, help='inference size (pixels)')
parser.add_argument('--conf-thres', type=float, default=0.25, help='object confidence threshold')
parser.add_argument('--iou-thres', type=float, default=0.45, help='IOU threshold for NMS')
parser.add_argument('--device', default='', help='cuda device, i.e. 0 or 0,1,2,3 or cpu')
parser.add_argument('--view-img', action='store_true', help='display results')
parser.add_argument('--save-txt', action='store_true', help='save results to *.txt')
parser.add_argument('--save-conf', action='store_true', help='save confidences in --save-txt labels')
parser.add_argument('--classes', nargs='+', type=int, help='filter by class: --class 0, or --class 0 2 3')
parser.add_argument('--agnostic-nms', action='store_true', help='class-agnostic NMS')
parser.add_argument('--augment', action='store_true', help='augmented inference')
parser.add_argument('--update', action='store_true', help='update all models')
parser.add_argument('--project', default='runs/detect', help='save results to project/name')
parser.add_argument('--name', default='exp', help='save results to project/name')
parser.add_argument('--exist-ok', action='store_true', help='existing project/name ok, do not increment')
opt = parser.parse_args()
print(opt)
with torch.no_grad():
if opt.update: # update all models (to fix SourceChangeWarning)
for opt.weights in ['yolov5s.pt', 'yolov5m.pt', 'yolov5l.pt', 'yolov5x.pt']:
detect()
strip_optimizer(opt.weights)
else:
detect()
EDIT I already have weights saved somewhere and am able to run detect on images that are saved on disc, just would like to skip this step to keep those FPS.
The Yolov5 repo is here
For standalone inference in 3rd party projects or repos importing your model into the python workspace with PyTorch Hub is the recommended method. See YOLOv5 PyTorch Hub tutorial here, specifically the section on loading custom models.
https://github.com/ultralytics/yolov5#tutorials
Custom Models
This example loads a custom 20-class VOC-trained YOLOv5s model 'yolov5s_voc_best.pt' with PyTorch Hub.
import torch
model = torch.hub.load('ultralytics/yolov5', 'custom', path_or_model='yolov5s_voc_best.pt')
model = model.autoshape() # for PIL/cv2/np inputs and NMS
Then once the model is loaded:
from PIL import Image
# Images
img1 = Image.open('zidane.jpg')
img2 = Image.open('bus.jpg')
imgs = [img1, img2] # batched list of images
# Inference
result = model(imgs, size=640) # includes NMS
result.print()
import cv2
import torch
from mss import mss
import numpy as np
model = torch.hub.load("/yolov5", 'custom', path="yolov5/best.pt", source='local')
sct = mss()
while 1:
w, h = 1920, 1080
monitor = {'top': 0, 'left': 0, 'width': w, 'height': h}
img = Image.frombytes('RGB', (w, h), sct.grab(monitor).rgb)
screen = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
# set the model use the screen
result = model(screen, size=640)
cv2.imshow('Screen', result.render()[0])
if cv2.waitKey(25) & 0xFF == ord('q'):
cv2.destroyAllWindows()
break
im a noob in programming
and using desktop screen to run inference can be found in yolov5's github page
https://github.com/ultralytics/yolov5/issues/36
import cv2
import numpy
import torch
from mss import mss
from PIL import ImageGrab
im = numpy.array(ImageGrab.grab(bbox=(0,0,1920,1080)))
model = torch.hub.load('ultralytics/yolov5', 'yolov5s')
model.conf = 0.6
image = r'D:\i\test\yolov5-master(original)\yolov5-master\data\images\zidane.jpg'
results = model(im)
results.print()
results.show()
print(results.pandas().xyxy[0])
i have found mss().grab() have an rgb order issue, so use PIL instead

Unable to extract a word out of an image

I've written a script in python in combination with pytesseract to extract a word out of an image. There is only a single word TOOLS available in that image and that is what I'm after. Currently my below script is giving me wrong output which is WIS. What Can I do to get the text?
Link to that image
This is my script:
import requests, io, pytesseract
from PIL import Image
response = requests.get('http://facweb.cs.depaul.edu/sgrais/images/Type/Tools.jpg')
img = Image.open(io.BytesIO(response.content))
img = img.resize([100,100], Image.ANTIALIAS)
img = img.convert('L')
img = img.point(lambda x: 0 if x < 170 else 255)
imagetext = pytesseract.image_to_string(img)
print(imagetext)
# img.show()
This is the status of the modified image when I run the above script:
The output I'm having:
WIS
Expected output:
TOOLS
The key is matching image transformation to the tesseract abilities. Your main problem is that the font is not a usual one. All you need is
from PIL import Image, ImageEnhance, ImageFilter
response = requests.get('http://facweb.cs.depaul.edu/sgrais/images/Type/Tools.jpg')
img = Image.open(io.BytesIO(response.content))
# remove texture
enhancer = ImageEnhance.Color(img)
img = enhancer.enhance(0) # decolorize
img = img.point(lambda x: 0 if x < 250 else 255) # set threshold
img = img.resize([300, 100], Image.LANCZOS) # resize to remove noise
img = img.point(lambda x: 0 if x < 250 else 255) # get rid of remains of noise
# adjust font weight
img = img.filter(ImageFilter.MaxFilter(11)) # lighten the font ;)
imagetext = pytesseract.image_to_string(img)
print(imagetext)
And voila,
TOOLS
are recognized.
The key issue with your implementation lies here:
img = img.resize([100,100], Image.ANTIALIAS)
img = img.point(lambda x: 0 if x < 170 else 255)
You could try different sizes and different threshold:
import requests, io, pytesseract
from PIL import Image
from PIL import ImageFilter
response = requests.get('http://facweb.cs.depaul.edu/sgrais/images/Type/Tools.jpg')
img = Image.open(io.BytesIO(response.content))
filters = [
# ('nearest', Image.NEAREST),
('box', Image.BOX),
# ('bilinear', Image.BILINEAR),
# ('hamming', Image.HAMMING),
# ('bicubic', Image.BICUBIC),
('lanczos', Image.LANCZOS),
]
subtle_filters = [
# 'BLUR',
# 'CONTOUR',
'DETAIL',
'EDGE_ENHANCE',
'EDGE_ENHANCE_MORE',
# 'EMBOSS',
'FIND_EDGES',
'SHARPEN',
'SMOOTH',
'SMOOTH_MORE',
]
for name, filt in filters:
for subtle_filter_name in subtle_filters:
for s in range(220, 250, 10):
for threshold in range(250, 253, 1):
img_temp = img.copy()
img_temp.thumbnail([s,s], filt)
img_temp = img_temp.convert('L')
img_temp = img_temp.point(lambda x: 0 if x < threshold else 255)
img_temp = img_temp.filter(getattr(ImageFilter, subtle_filter_name))
imagetext = pytesseract.image_to_string(img_temp)
print(s, threshold, name, subtle_filter_name, imagetext)
with open('thumb%s_%s_%s_%s.jpg' % (s, threshold, name, subtle_filter_name), 'wb') as g:
img_temp.save(g)
and see what works for you.
I would suggest you resize your image while keeping the original ratio. You could also try some alternative to img_temp.convert('L')
Best so far: TWls and T0018
You can try to manipulate the image manually and see if you can find some edit that can provide a better output (for instance http://gimpchat.com/viewtopic.php?f=8&t=1193)
By knowing in advance the font you could probably achieve a better result too.

Why am I getting this error in facenet?

I am trying to run facematch(facenet) on my Virtual Machine (Google Cloud Platform). At first, things were running smoothly and it was embedding the points of the faces, but then out of the blue, my code stopped working.
The first code, you can see the imports are there
For the second code, you can see the imports are there.
This is the ls commands, so you can see that all the directories/modules are there and see the errors I'm getting
Anyone can share some insight on what I'm doing wrong?
Face_match_demo code:
import tensorflow as tf
import numpy as np
import facenet
from align import detect_face
import cv2
import argparse
parser = argparse.ArgumentParser()
parser.add_argument("--img1", type = str, required=True)
parser.add_argument("--img2", type = str, required=True)
args = parser.parse_args()
# some constants kept as default from facenet
minsize = 20
threshold = [0.6, 0.7, 0.7]
factor = 0.709
margin = 44
input_image_size = 160
sess = tf.Session()
# read pnet, rnet, onet models from align directory and files are det1.npy, det2.npy, det3.npy
pnet, rnet, onet = detect_face.create_mtcnn(sess, 'align')
# read 20170512-110547 model file downloaded from https://drive.google.com/file/d/0B5MzpY9kBtDVZ2RpVDYwWmxoSUk
facenet.load_model("20170512-110547/20170512-110547.pb")
# Get input and output tensors
images_placeholder = tf.get_default_graph().get_tensor_by_name("input:0")
embeddings = tf.get_default_graph().get_tensor_by_name("embeddings:0")
phase_train_placeholder = tf.get_default_graph().get_tensor_by_name("phase_train:0")
embedding_size = embeddings.get_shape()[1]
def getFace(img):
faces = []
img_size = np.asarray(img.shape)[0:2]
bounding_boxes, _ = detect_face.detect_face(img, minsize, pnet, rnet, onet, threshold, factor)
if not len(bounding_boxes) == 0:
for face in bounding_boxes:
if face[4] > 0.50:
det = np.squeeze(face[0:4])
bb = np.zeros(4, dtype=np.int32)
bb[0] = np.maximum(det[0] - margin / 2, 0)
bb[1] = np.maximum(det[1] - margin / 2, 0)
bb[2] = np.minimum(det[2] + margin / 2, img_size[1])
bb[3] = np.minimum(det[3] + margin / 2, img_size[0])
cropped = img[bb[1]:bb[3], bb[0]:bb[2], :]
resized = cv2.resize(cropped, (input_image_size,input_image_size),interpolation=cv2.INTER_CUBIC)
prewhitened = facenet.prewhiten(resized)
faces.append({'face':resized,'rect':[bb[0],bb[1],bb[2],bb[3]],'embedding':getEmbedding(prewhitened)})
return faces
def getEmbedding(resized):
reshaped = resized.reshape(-1,input_image_size,input_image_size,3)
feed_dict = {images_placeholder: reshaped, phase_train_placeholder: False}
embedding = sess.run(embeddings, feed_dict=feed_dict)
return embedding
def compare2face(img1,img2):
face1 = getFace(img1)
face2 = getFace(img2)
if face1 and face2:
# calculate Euclidean distance
dist = np.sqrt(np.sum(np.square(np.subtract(face1[0]['embedding'], face2[0]['embedding']))))
return dist
return -1
img1 = cv2.imread(args.img1)
img2 = cv2.imread(args.img2)
distance = compare2face(img1, img2)
threshold = 1.10 # set yourself to meet your requirement
print("distance = "+str(distance))
face_embeddings_demo code:
import tensorflow as tf
from align import detect_face
import facenet
import cv2
import imutils
import numpy as np
import argparse
parser = argparse.ArgumentParser()
parser.add_argument("--img", type = str, required=True)
args = parser.parse_args()
# some constants kept as default from facenet
minsize = 20
threshold = [0.6, 0.7, 0.7]
factor = 0.709
margin = 44
input_image_size = 160
sess = tf.Session()
# read pnet, rnet, onet models from align directory and files are det1.npy, det2.npy, det3.npy
pnet, rnet, onet = detect_face.create_mtcnn(sess, 'align')
# read 20170512-110547 model file downloaded from https://drive.google.com/file/d/0B5MzpY9kBtDVZ2RpVDYwWmxoSUk
facenet.load_model("20170512-110547/20170512-110547.pb")
# Get input and output tensors
images_placeholder = tf.get_default_graph().get_tensor_by_name("input:0")
embeddings = tf.get_default_graph().get_tensor_by_name("embeddings:0")
phase_train_placeholder = tf.get_default_graph().get_tensor_by_name("phase_train:0")
embedding_size = embeddings.get_shape()[1]
def getFace(img):
faces = []
img_size = np.asarray(img.shape)[0:2]
bounding_boxes, points = detect_face.detect_face(img, minsize, pnet, rnet, onet, threshold, factor)
if not len(bounding_boxes) == 0:
for face in bounding_boxes:
if face[4] > 0.50:
det = np.squeeze(face[0:4])
bb = np.zeros(4, dtype=np.int32)
bb[0] = np.maximum(det[0] - margin / 2, 0)
bb[1] = np.maximum(det[1] - margin / 2, 0)
bb[2] = np.minimum(det[2] + margin / 2, img_size[1])
bb[3] = np.minimum(det[3] + margin / 2, img_size[0])
cropped = img[bb[1]:bb[3], bb[0]:bb[2], :]
resized = cv2.resize(cropped, (input_image_size,input_image_size),interpolation=cv2.INTER_CUBIC)
prewhitened = facenet.prewhiten(resized)
faces.append({'face':resized,'rect':[bb[0],bb[1],bb[2],bb[3]],'embedding':getEmbedding(prewhitened)})
return faces
def getEmbedding(resized):
reshaped = resized.reshape(-1,input_image_size,input_image_size,3)
feed_dict = {images_placeholder: reshaped, phase_train_placeholder: False}
# print(feed_dict)
embedding = sess.run(embeddings, feed_dict=feed_dict)
return embedding
img = cv2.imread(args.img)
img = imutils.resize(img,width=1000)
faces = getFace(img)
for face in faces:
print("Embeddings = "+str(face['embedding']))
cv2.waitKey(0)
cv2.destroyAllWindows()
You have to have the __init__.py in the package directory to be recognized as a package. It can be an empty file, but it has to be present. You don't have this in the align directory.
From the documentation:
The __init__.py files are required to make Python treat the directories as containing packages
From your comment, the error
usage: face_match_demo.py [-h] --img1 IMG1 --img2 IMG2 face_match_demo.py: error: ambiguous option: --img=images/faces.jpg could match --img2, --img1
means that face_match_demo.py is actually a utility to match two images, to say whether they contain the same face or not. So you have to provide two images to it, and it will tell if the face is the same. And you need to use the --img1 and --img2 options to do that like this:
python face_match_demo.py --img1 images/faces.jpg --img2 [[another face image]]

Categories