I have a face recognition project using features extraction.
I want to generate the embedding vector using DeepFace or VGGFace instead of ResNet-34 without changing the whole scripts. Any help will be appreciated
I've written a script (encoding.py) doing features extraction based on ResNet-34. The script loops throw images in the train-set and extract the encoding (a 128-d vector) and the name of each image from the train set, then put them in a dictionary.
And when executing this script we generate a file (encodings.pickle) that contains our dictionary.
The script is below:
from imutils import paths
import face_recognition
import argparse
import pickle
import cv2
import os
# construct the argument parser and parse the arguments
ap = argparse.ArgumentParser()
ap.add_argument("-i", "--dataset", required=True,
help="path to input directory of faces + images")
ap.add_argument("-e", "--encodings", required=True,
help="path to serialized db of facial encodings")
ap.add_argument("-d", "--detection-method", type=str, default="cnn",
help="face detection model to use: either `hog` or `cnn`")
args = vars(ap.parse_args())
# grab the paths to the input images in our dataset
print("[INFO] quantifying faces...")
imagePaths = list(paths.list_images(args["dataset"]))
# initialize the list of known encodings and known names
knownEncodings = []
knownNames = []
# loop over the image paths
for (i, imagePath) in enumerate(imagePaths):
# extract the person name from the image path
print("[INFO] processing image {}/{}".format(i + 1,
len(imagePaths)))
name = imagePath.split(os.path.sep)[-2]
# load the input image and convert it from BGR (OpenCV ordering)
# to dlib ordering (RGB)
image = cv2.imread(imagePath)
rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
# detect the (x, y)-coordinates of the bounding boxes
# corresponding to each face in the input image
boxes = face_recognition.face_locations(rgb,
model=args["detection_method"])
# compute the facial embedding for the face
encodings = face_recognition.face_encodings(rgb, boxes)
# loop over the encodings
for encoding in encodings:
# add each encoding + name to our set of known names and
# encodings
knownEncodings.append(encoding)
knownNames.append(name)
# dump the facial encodings + names to disk
print("[INFO] serializing encodings...")
data = {"encodings": knownEncodings, "names": knownNames}
f = open(args["encodings"], "wb")
f.write(pickle.dumps(data))
f.close()
And to make predictions I've used another script that loads a single image then do the encoding part and make a comparison between its encoding and the file generated before.
The 2nd script is below:
# import the necessary packages
import face_recognition
import argparse
import pickle
import cv2
# construct the argument parser and parse the arguments
ap = argparse.ArgumentParser()
ap.add_argument("-e", "--encodings", required=True,
help="path to serialized db of facial encodings")
ap.add_argument("-i", "--image", required=True,
help="path to input image")
ap.add_argument("-d", "--detection-method", type=str, default="hog",
help="face detection model to use: either `hog` or `cnn`")
args = vars(ap.parse_args())
# load the known faces and embeddings
print("[INFO] loading encodings...")
data = pickle.loads(open(args["encodings"], "rb").read())
# load the input image and convert it from BGR to RGB
image = cv2.imread(args["image"])
rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
# detect the (x, y)-coordinates of the bounding boxes corresponding
# to each face in the input image, then compute the facial embeddings
# for each face
print("[INFO] recognizing faces...")
boxes = face_recognition.face_locations(rgb,
model=args["detection_method"])
encodings = face_recognition.face_encodings(rgb, boxes)
# initialize the list of names for each face detected
names = []
# loop over the facial embeddings
for encoding in encodings:
# attempt to match each face in the input image to our known
# encodings
matches = face_recognition.compare_faces(data["encodings"],
encoding)
name = "Unknown"
# check to see if we have found a match
if True in matches:
# find the indexes of all matched faces then initialize a
# dictionary to count the total number of times each face
# was matched
matchedIdxs = [i for (i, b) in enumerate(matches) if b]
counts = {}
# loop over the matched indexes and maintain a count for
# each recognized face face
for i in matchedIdxs:
name = data["names"][i]
counts[name] = counts.get(name, 0) + 1
# determine the recognized face with the largest number of
# votes (note: in the event of an unlikely tie Python will
# select first entry in the dictionary)
name = max(counts, key=counts.get)
# update the list of names
names.append(name)
print(names)
# loop over the recognized faces
for ((top, right, bottom, left), name) in zip(boxes, names):
# draw the predicted face name on the image
cv2.rectangle(image, (left, top), (right, bottom), (0, 255, 0), 2)
y = top - 15 if top - 15 > 15 else top + 15
cv2.putText(image, name, (left, y), cv2.FONT_HERSHEY_SIMPLEX,
0.75, (0, 255, 0), 2)
# show the output image
cv2.imshow("Image", image)
cv2.waitKey(0)
DeepFace has a represent function in its interface.
from deepface import DeepFace
img_path = "img.jpg"
models = ["VGG-Face", "Facenet", "OpenFace", "DeepFace", "Dlib", "ArcFace"]
embedding = DeepFace.represent(img_path, model_name = models[0])
If you already read an image, you can pass it directly to the deepface instead of passing its exact path.
image = cv2.imread(imagePath)
embedding = DeepFace.represent(image, model_name = models[0])
Related
I am a novice in Computer Vision. I am trying to implement Real Time Face Recognition with Local Binary Patterns with its Face Detection part based on Deep Learning dnn module. I am using the caltech_faces dataset and have added a folder with my 20 photos to it.
So, this is my code. I basically transformed the code of the Face Recognition of sample images to a Real Time Face Recognition by making some changes and additions.
I get the following error when executing the below code:
predName = le.inverse_transform([predictions[i]])[0]
^
TabError: inconsistent use of tabs and spaces in indentation
I checked all the tabs and indentations, and cant find what and where to fix. I kindly ask you to give me a hint on what to do. Thank you very much!
# import the necessary packages
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from imutils.video import VideoStream
from imutils import paths
import matplotlib.pyplot as plt
import numpy as np
import argparse
import imutils
import time
import cv2
import os
#Creating our face detector
def detect_faces(net, frame, minConfidence=0.5):
# grab the dimensions of the image and then construct a blob
# from it
(h, w) = frame.shape[:2]
blob = cv2.dnn.blobFromImage(frame, 1.0, (300, 300),
(104.0, 177.0, 123.0))
# pass the blob through the network to obtain the face detections,
# then initialize a list to store the predicted bounding boxes
net.setInput(blob)
detections = net.forward()
boxes = []
# loop over the detections
for i in range(0, detections.shape[2]):
# extract the confidence (i.e., probability) associated with
# the detection
confidence = detections[0, 0, i, 2]
# filter out weak detections by ensuring the confidence is
# greater than the minimum confidence
if confidence > minConfidence:
# compute the (x, y)-coordinates of the bounding box for
# the object
box = detections[0, 0, i, 3:7] * np.array([w, h, w, h])
(startX, startY, endX, endY) = box.astype("int")
# update our bounding box results list
boxes.append((startX, startY, endX, endY))
# return the face detection bounding boxes
return boxes
#Loading the CALTECH Faces dataset
def load_face_dataset(inputPath, net, minConfidence=0.5,
minSamples=15):
# grab the paths to all images in our input directory, extract
# the name of the person (i.e., class label) from the directory
# structure, and count the number of example images we have per
# face
imagePaths = list(paths.list_images(inputPath))
names = [p.split(os.path.sep)[-2] for p in imagePaths]
(names, counts) = np.unique(names, return_counts=True)
names = names.tolist()
# initialize lists to store our extracted faces and associated
# labels
faces = []
labels = []
# loop over the image paths
for imagePath in imagePaths:
# load the image from disk and extract the name of the person
# from the subdirectory structure
frame = cv2.imread(imagePath)
name = imagePath.split(os.path.sep)[-2]
# only process images that have a sufficient number of
# examples belonging to the class
if counts[names.index(name)] < minSamples:
continue
# perform face detection
boxes = detect_faces(net, frame, minConfidence)
# loop over the bounding boxes
for (startX, startY, endX, endY) in boxes:
# extract the face ROI, resize it, and convert it to
# grayscale
faceROI = frame[startY:endY, startX:endX]
faceROI = cv2.resize(faceROI, (47, 62))
faceROI = cv2.cvtColor(faceROI, cv2.COLOR_BGR2GRAY)
# update our faces and labels lists
faces.append(faceROI)
labels.append(name)
# convert our faces and labels lists to NumPy arrays
faces = np.array(faces)
labels = np.array(labels)
# return a 2-tuple of the faces and labels
return (faces, labels)
#Implementing Local Binary Patterns for face recognition
# # construct the argument parser and parse the arguments
# ap = argparse.ArgumentParser()
# ap.add_argument("-i", "--input", type=str, required=True,
# help="path to input directory of images")
# ap.add_argument("-f", "--face", type=str,
# default="face_detector",
# help="path to face detector model directory")
# ap.add_argument("-c", "--confidence", type=float, default=0.5,
# help="minimum probability to filter weak detections")
# args = vars(ap.parse_args())
# since we are using Jupyter Notebooks we can replace our argument
# parsing code with *hard coded* arguments and values
args = {
"input": "caltech_faces",
"face": "face_detector",
"confidence": 0.5,
}
# load our serialized face detector model from disk
print("[INFO] loading face detector model...")
prototxtPath = os.path.sep.join([args["face"], "deploy.prototxt"])
weightsPath = os.path.sep.join([args["face"],
"res10_300x300_ssd_iter_140000.caffemodel"])
net = cv2.dnn.readNet(prototxtPath, weightsPath)
# load the CALTECH faces dataset
print("[INFO] loading dataset...")
(faces, labels) = load_face_dataset(args["input"], net,
minConfidence=0.5, minSamples=20)
print("[INFO] {} images in dataset".format(len(faces)))
# encode the string labels as integers
le = LabelEncoder()
labels = le.fit_transform(labels)
# construct our training and testing split
(trainX, testX, trainY, testY) = train_test_split(faces,
labels, test_size=0.25, stratify=labels, random_state=42)
# train our LBP face recognizer
print("[INFO] training face recognizer...")
recognizer = cv2.face.LBPHFaceRecognizer_create(
radius=2, neighbors=16, grid_x=8, grid_y=8)
start = time.time()
recognizer.train(trainX, trainY)
end = time.time()
print("[INFO] training took {:.4f} seconds".format(end - start))
# initialize the list of predictions and confidence scores
print("[INFO] gathering predictions...")
predictions = []
confidence = []
start = time.time()
# loop over the test data
for i in range(0, len(testX)):
# classify the face and update the list of predictions and
# confidence scores
(prediction, conf) = recognizer.predict(testX[i])
predictions.append(prediction)
confidence.append(conf)
# measure how long making predictions took
end = time.time()
print("[INFO] inference took {:.4f} seconds".format(end - start))
# show the classification report
print(classification_report(testY, predictions,
target_names=le.classes_))
# initialize the video stream and allow the cammera sensor to warmup
print("[INFO] starting video stream...")
vs = VideoStream(src=0).start()
time.sleep(2.0)
# loop over the frames from the video stream
while True:
# grab the frame from the threaded video stream and resize it
# to have a maximum width of 400 pixels
face = vs.read()
face = imutils.resize(face, width=400)
# loop over the detections
for i in range(0, detections.shape[2]):
# grab the predicted name and actual name
predName = le.inverse_transform([predictions[i]])[0]
actualName = le.classes_[testY[i]]
# draw the predicted name and actual name on the image
cv2.putText(face, "pred: {}".format(predName), (5, 25),
cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 255, 0), 2)
cv2.putText(face, "actual: {}".format(actualName), (5, 60),
cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 0, 255), 2)
# display the predicted name, actual name, and confidence of the
# prediction (i.e., chi-squared distance; the *lower* the distance
# is the *more confident* the prediction is)
print("[INFO] prediction: {}, actual: {}, confidence: {:.2f}".format(predName, actualName, confidence[i]))
# show the output frame
cv2.imshow("Face", face)
key = cv2.waitKey(1) & 0xFF
# if the `q` key was pressed, break from the loop
if key == ord("q"):
break
You have a for loop without any line of code but a comment just before the line that cause the problem:
# loop over the detections
for i in range(0, detections.shape[2]):
# grab the predicted name and actual name
predName = le.inverse_transform([predictions[i]])[0]
actualName = le.classes_[testY[i]]
The problem comes from this empty loop; if you have a loop, you must have at least one line of code inside. So delete it or add the pass keyword inside.
I am using google collab for this and first of all, make sure you have OpenCV installed. You can install it using pip:
pip install opencv-python
Before detecting the face we should have to open the web camera using google collab.
from IPython.display import display, Javascript
from google.colab.output import eval_js
from base64 import b64decode
def take_photo(filename='photo.jpg', quality=0.8):
js = Javascript('''
async function takePhoto(quality) {
const div = document.createElement('div');
const capture = document.createElement('button');
capture.textContent = 'Capture';
div.appendChild(capture);
const video = document.createElement('video');
video.style.display = 'block';
const stream = await navigator.mediaDevices.getUserMedia({video: true});
document.body.appendChild(div);
div.appendChild(video);
video.srcObject = stream;
await video.play();
// Resize the output to fit the video element. google.colab.output.setIframeHeight(document.documentElement.scrollHeight, true);
// Wait for Capture to be clicked.
await new Promise((resolve) => capture.onclick = resolve);
const canvas = document.createElement('canvas');
canvas.width = video.videoWidth;
canvas.height = video.videoHeight;
canvas.getContext('2d').drawImage(video, 0, 0);
stream.getVideoTracks()[0].stop();
div.remove();
return canvas.toDataURL('image/jpeg', quality);
}
''')
display(js)
data = eval_js('takePhoto({})'.format(quality))
binary = b64decode(data.split(',')[1])
with open(filename, 'wb') as f:
f.write(binary)
return filename
You have to run the below code as the second step.
from IPython.display import Image
try:
filename = take_photo()
print('Saved to {}'.format(filename))
# Show the image which was just taken.
display(Image(filename))
except Exception as err:
# Errors will be thrown if the user does not have a webcam or if they do
not
# grant the page permission to access it.
print(str(err))
After running these two codes, the web camera is opened and you can capture a photo.
The photo is saved as photo.jpg.
Face detection using Haar cascades is a machine learning-based approach where a cascade function is trained with a set of input data. OpenCV already contains many pre-trained classifiers for face, eyes, smiles, etc. Today we will be using the face classifier. You can experiment with other classifiers as well.
I am new to python also to openCv. I am trying out the code which detects edges with deep Learning and trying to understand it. I found this code this website https://www.pyimagesearch.com
# USAGE
# python detect_edges_image.py --edge-detector hed_model --image images/guitar.jpg
# import the necessary packages
import argparse
import cv2
import os
# construct the argument parser and parse the arguments
ap = argparse.ArgumentParser()
ap.add_argument("-d", "--edge-detector", type=str, required=True,
help="path to OpenCV's deep learning edge detector")
ap.add_argument("-i", "--image", type=str, required=True,
help="path to input image")
args = vars(ap.parse_args())
class CropLayer(object):
def __init__(self, params, blobs):
# initialize our starting and ending (x, y)-coordinates of
# the crop
self.startX = 0
self.startY = 0
self.endX = 0
self.endY = 0
def getMemoryShapes(self, inputs):
# the crop layer will receive two inputs -- we need to crop
# the first input blob to match the shape of the second one,
# keeping the batch size and number of channels
(inputShape, targetShape) = (inputs[0], inputs[1])
(batchSize, numChannels) = (inputShape[0], inputShape[1])
(H, W) = (targetShape[2], targetShape[3])
# compute the starting and ending crop coordinates
self.startX = int((inputShape[3] - targetShape[3]) / 2)
self.startY = int((inputShape[2] - targetShape[2]) / 2)
self.endX = self.startX + W
self.endY = self.startY + H
# return the shape of the volume (we'll perform the actual
# crop during the forward pass
return [[batchSize, numChannels, H, W]]
def forward(self, inputs):
# use the derived (x, y)-coordinates to perform the crop
return [inputs[0][:, :, self.startY:self.endY,
self.startX:self.endX]]
# load our serialized edge detector from disk
print("[INFO] loading edge detector...")
protoPath = os.path.sep.join([args["edge_detector"],
"deploy.prototxt"])
modelPath = os.path.sep.join([args["edge_detector"],
"hed_pretrained_bsds.caffemodel"])
net = cv2.dnn.readNetFromCaffe(protoPath, modelPath)
# register our new layer with the model
cv2.dnn_registerLayer("Crop", CropLayer)
# load the input image and grab its dimensions
image = cv2.imread(args["image"])
(H, W) = image.shape[:2]
print("Height: ",H)
print("Width: ",W)
# convert the image to grayscale, blur it, and perform Canny
# edge detection
print("[INFO] performing Canny edge detection...")
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
blurred = cv2.GaussianBlur(gray, (5, 5), 0)
canny = cv2.Canny(blurred, 30, 150)
# construct a blob out of the input image for the Holistically-Nested
# Edge Detector
blob = cv2.dnn.blobFromImage(image, scalefactor=1.0, size=(W, H),
mean=(104.00698793, 116.66876762, 122.67891434),
swapRB=False, crop=False)
# set the blob as the input to the network and perform a forward pass
# to compute the edges
print("[INFO] performing holistically-nested edge detection...")
net.setInput(blob)
hed = net.forward()
print("before: ",hed)
hed = cv2.resize(hed[0,0], (W, H))
print("after:",hed)
hed = (255 * hed).astype("uint8")
# show the output edge detection results for Canny and
# Holistically-Nested Edge Detection
cv2.imshow("Input", image)
cv2.imshow("Canny", canny)
cv2.imshow("HED", hed)
cv2.waitKey(0)
Can any one explain what hed[0,0] returns and what hed = cv2.resize(hed[0,0], (W, H)) does?
Also hed = (255 * hed).astype("uint8")
if possible can explain in different Language like JAVA or C++
What I found is here
after FeedForward process:
hed = net.forward()
print(hed.shape)
your answer is:
(1, 1, 2121, 4148)
when you doing resize:
hed = cv2.resize(hed[0,0], (W, H))
print (hed.shape)
your answer becomes is:
(2121, 4148)
so it seems you are reducing the dimension of the array (hed) to fit H,W of the original image.
yah. same question I also have.
But it seems like it makes a resize of the hed image to new dimensions H and W.
example below:
newwidth = 350
newheight = 450
dim = (width, height)
# resize image
resized = cv2.resize(img, dim, interpolation = cv2.INTER_AREA)
I'm using OpenCV + Python to apply a deep learning model and classify objects in 8 categories (animal types) namely, cat, dog, horse, deer, bear, lizard, monkey, no object detected (when the is no object detected in the image).
I have a folder that has images of all types of animals in it. I read all the images in one folder and then I apply the deep learning model to extract bounding box coordinates of each object in each image.
I want to first categorize each image by putting each type of animal image in the related folder. second, save the coordinate of the bounding box of that image in the same folder. For example if the network detected cat, I want to save that image and corresponding coordinates(as a text file .text) in the cat folder and if it did not find any of those objects in the image just put it in the no object detected folder.
My question is how can I save the original image and the bounding box coordinates of that object inside the 8 category folder?
here is my code:
import cv2
import numpy as np
import os
import glob
import argparse
import time
img_dir="/path/imgt/"
data_path=os.path.join(img_dir,'*g')
files=glob.glob(data_path)
data=[]
i = 0
for f1 in files:
image=cv2.imread(f1)
data.append(image)
# construct the argument parse and parse the arguments
ap = argparse.ArgumentParser()
ap.add_argument("-i", "--image", required=True,
help="path to input image")
ap.add_argument("-y", "--yolo", required=True,
help="base path to YOLO directory")
ap.add_argument("-c", "--confidence", type=float, default=0.5,
help="minimum probability to filter weak detections")
ap.add_argument("-t", "--threshold", type=float, default=0.3,
help="threshold when applyong non-maxima suppression")
args = vars(ap.parse_args())
# load the COCO class labels our YOLO model was trained on
labelsPath = os.path.sep.join([args["yolo"], "obj.names"])
LABELS = open(labelsPath).read().strip().split("\n")
# initialize a list of colors to represent each possible class label
np.random.seed(42)
COLORS = np.random.randint(0, 255, size=(len(LABELS), 3),
dtype="uint8")
# derive the paths to the YOLO weights and model configuration
weightsPath = os.path.sep.join([args["yolo"], "yolo-obj_last.weights"])
configPath = os.path.sep.join([args["yolo"], "yolo-obj.cfg"])
# load our YOLO object detector trained on COCO dataset (80 classes)
print("[INFO] loading YOLO from disk...")
net = cv2.dnn.readNetFromDarknet(configPath, weightsPath)
# load our input image and grab its spatial dimensions
# image = cv2.imread(args["image"])
(H, W) = image.shape[:2]
# determine only the *output* layer names that we need from YOLO
ln = net.getLayerNames()
ln = [ln[i[0] - 1] for i in net.getUnconnectedOutLayers()]
# construct a blob from the input image and then perform a forward
# pass of the YOLO object detector, giving us our bounding boxes and
# associated probabilities
blob = cv2.dnn.blobFromImage(image, 1 / 255.0, (416, 416),
swapRB=True, crop=False)
net.setInput(blob)
start = time.time()
layerOutputs = net.forward(ln)
end = time.time()
# show timing information on YOLO
print("[INFO] YOLO took {:.6f} seconds".format(end - start))
# initialize our lists of detected bounding boxes, confidences, and
# class IDs, respectively
boxes = []
confidences = []
classIDs = []
# loop over each of the layer outputs
for output in layerOutputs:
# loop over each of the detections
for detection in output:
# extract the class ID and confidence (i.e., probability) of
# the current object detection
scores = detection[5:]
classID = np.argmax(scores)
confidence = scores[classID]
# filter out weak predictions by ensuring the detected
# probability is greater than the minimum probability
if confidence > args["confidence"]:
# scale the bounding box coordinates back relative to the
# size of the image, keeping in mind that YOLO actually
# returns the center (x, y)-coordinates of the bounding
# box followed by the boxes' width and height
box = detection[0:4] * np.array([W, H, W, H])
(centerX, centerY, width, height) = box.astype("int")
# use the center (x, y)-coordinates to derive the top and
# and left corner of the bounding box
x = int(centerX - (width / 2))
y = int(centerY - (height / 2))
# update our list of bounding box coordinates, confidences,
# and class IDs
boxes.append([x, y, int(width), int(height)])
confidences.append(float(confidence))
classIDs.append(classID)
# apply non-maxima suppression to suppress weak, overlapping bounding
# boxes
idxs = cv2.dnn.NMSBoxes(boxes, confidences, args["confidence"],
args["threshold"])
# ensure at least one detection exists
if len(idxs) > 0:
# loop over the indexes we are keeping
for i in idxs.flatten():
# extract the bounding box coordinates
(x, y) = (boxes[i][0], boxes[i][1])
(w, h) = (boxes[i][2], boxes[i][3])
# draw a bounding box rectangle and label on the image
color = [int(c) for c in COLORS[classIDs[i]]]
cv2.rectangle(image, (x, y), (x + w, y + h), color, 2)
text = "{}: {:.4f}".format(LABELS[classIDs[i]], confidences[i])
cv2.putText(image, text, (x, y - 7), cv2.FONT_HERSHEY_SIMPLEX,0.6, color, 2)
path = '/path/imgr/' + LABELS[classIDs[i]] + '/'
cv2.imwrite(os.path.join(path, 'image' + str(i) + '.jpg'), image)
with open(os.path.join(path, 'image' + str(i) + '.txt'), 'a+') as f:
f.write(str(classIDs[i]) + ' ' + str(x) + ' ' + str(y) + ' ' + str(w) + ' ' + str(h))
how does the text file look like?
.txt -file for each .jpg-image-file - in the same directory and with the same name, but with .txt-extension, and put to file: object number and object coordinates on this image, for each object in new line: <object-class> <x> <y> <width> <height>
Where:
<object-class> - integer number of object from 0 to (classes-1)
<x> <y> <width> <height> - float values relative to width and height of image, it can be equal from (0.0 to 1.0]
for example: <x> = <absolute_x> / <image_width> or <height> = <absolute_height> / <image_height>
atention: <x> <y> - are center of rectangle (are not top-left corner)
For example for img1.jpg you will be created img1.txt containing:
1 0.716797 0.395833 0.216406 0.147222
0 0.687109 0.379167 0.255469 0.158333
1 0.420312 0.395833 0.140625 0.166667
Maybe something like this:
path = os.path.join('/path/imgr/', LABELS[classID], image_name)
cv2.imwrite(path + '.jpg', image)
with open(path + '.txt'), 'a+') as f:
f.write(str(classID) + ' ' + str(detection[0]) + ' ' + str(detection[1]) + ' ' + str(detection[2]) + ' ' + str(detection[3]) + '\n')
You may have multiple objects in an image, in which case it should write to each of the relevant folders and append to the text file if it exists.
image_name will be something you generate, you can use the name that you are reading in, or a counter.
This snippet should go somewhere under the if statement:
if confidence > args["confidence"]:
I would put it at the end. You may need to make minor adjustments, but that is the gist.
More explicitly:
import cv2
import numpy as np
import os
import glob
import argparse
import time
# construct the argument parse and parse the arguments
ap = argparse.ArgumentParser()
#ap.add_argument("-i", "--image", required=True,
# help="path to input image")
ap.add_argument("-y", "--yolo", required=True,
help="base path to YOLO directory")
ap.add_argument("-c", "--confidence", type=float, default=0.5,
help="minimum probability to filter weak detections")
ap.add_argument("-t", "--threshold", type=float, default=0.3,
help="threshold when applyong non-maxima suppression")
args = vars(ap.parse_args())
# load the COCO class labels our YOLO model was trained on
labelsPath = os.path.sep.join([args["yolo"], "obj.names"])
LABELS = open(labelsPath).read().strip().split("\n")
# derive the paths to the YOLO weights and model configuration
weightsPath = os.path.sep.join([args["yolo"], "yolo-obj_last.weights"])
configPath = os.path.sep.join([args["yolo"], "yolo-obj.cfg"])
# load our YOLO object detector trained on COCO dataset (80 classes)
print("[INFO] loading YOLO from disk...")
net = cv2.dnn.readNetFromDarknet(configPath, weightsPath)
# determine only the *output* layer names that we need from YOLO
ln = net.getLayerNames()
ln = [ln[i[0] - 1] for i in net.getUnconnectedOutLayers()]
img_dir="/path/imgt/"
data_path=os.path.join(img_dir,'*g')
files=glob.glob(data_path)
for f1 in files:
# load our input image and grab its spatial dimensions
image=cv2.imread(f1)
# construct a blob from the input image and then perform a forward
# pass of the YOLO object detector, giving us our bounding boxes and
# associated probabilities
blob = cv2.dnn.blobFromImage(image, 1 / 255.0, (416, 416),
swapRB=True, crop=False)
net.setInput(blob)
layerOutputs = net.forward(ln)
# loop over each of the layer outputs
for output in layerOutputs:
# loop over each of the detections
for detection in output:
# extract the class ID and confidence (i.e., probability) of
# the current object detection
scores = detection[5:]
classID = np.argmax(scores)
confidence = scores[classID]
box = detection[0:4]
# get upper left corner
box[0] = box[0] - box[2]/2
box[1] = box[1] - box[3]/2
# filter out weak predictions by ensuring the detected
# probability is greater than the minimum probability
if confidence > args["confidence"]:
# write output files
class_dir = os.path.join('/path/imgr/', LABELS[classID])
if not os.path.exists(class_dir):
os.makedirs(class_dir)
path = os.path.join(class_dir, f1.split('/')[-1][:-4])
cv2.imwrite(path + '.jpg', image)
with open(path + '.txt'), 'a+') as f:
f.write(str(classID) + ' ' + str(box[0]) + ' ' + str(box[1]) + ' ' + str(box[2]) + ' ' + str(box[3]) + '\n')
Read through it and make sure you understand what each part in the for loop is doing. Once you are comfortable with this minimal example you could add back in the non-maximal suppression and drawing the bounding boxes if you like.
I'm using Tesseract to detect a black written word with a white background. In some images after the black word occurs an info symbol. I'm not interested in detecting this symbol, I'm only interested in the word. Sometimes the info symbol (with a circle around) is detected as 0 or O, this is fine. But in other cases (probably if tesseract doesn't know how to handle this sign) it is just returning an empty string, so the word is not returned as well. I used the code given here and also tried the configuration suggested here
from PIL import Image
import pytesseract
import argparse
import cv2
import os
import numpy as np
# construct the argument parse and parse the arguments
ap = argparse.ArgumentParser()
ap.add_argument("-i", "--image", required=True,
help="path to input image to be OCR'd")
ap.add_argument("-p", "--preprocess", type=str, default="thresh",
help="type of preprocessing to be done")
args = vars(ap.parse_args())
# load the example image and convert it to grayscale
image = cv2.imread(args["image"])
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
# check to see if we should apply thresholding to preprocess the
# image
if args["preprocess"] == "thresh":
gray = cv2.threshold(gray, 0, 255,
cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]
# make a check to see if median blurring should be done to remove
# noise
elif args["preprocess"] == "blur":
gray = cv2.medianBlur(gray, 3)
# write the grayscale image to disk as a temporary file so we can
# apply OCR to it
filename = "{}.png".format(os.getpid())
cv2.imwrite(filename, gray)
# load the image as a PIL/Pillow image, apply OCR, and then delete
# the temporary file
text = pytesseract.image_to_string(gray, config='--psm 7')
os.remove(filename)
print("Output: " + text)
If anyone has an idea what else I could to I'm very grateful!
solved: config has to be config='-psm 7' with only one "-"
I am using opencv and want to stick with it.
I have 5 images with some common areas in a pairwise manner. I want to merge them together in a single image. I have been successful joining two images together, as they were of the same resolution(a little tweak brought them to the same resolution without distorting the contents significantly). But now this first stage of merging gives me a highly inflated image, the resolution has gone significantly up(kind of an addition of two images).
To merge the two images I had brought their resolutions to the same value and it didn't cause much distortion. But now there's this image with double the length. If I change its resolution to the level of the image next in line for stitching, it is going to highly distort the content of the first stage and hence the result from here on.
How do I fix this issue given that I need to go through 5-6 iterations of stitching where the resolution is going to keep increasing?
Also, if there is any text which goes into details of image processing with examples, like above.
Stitcher.py
# -*- coding: utf-8 -*-
"""
Spyder Editor
This is a temporary script file.
"""
# import the necessary packages
import numpy as np
import imutils
import cv2
class Stitcher:
def __init__(self):
# determine if we are using OpenCV v3.X
self.isv3 = imutils.is_cv3()
def stitch(self, images, ratio=0.75, reprojThresh=4.0,
showMatches=False):
# unpack the images, then detect keypoints and extract
# local invariant descriptors from them
(imageB, imageA) = images
#(b, g, r) = cv2.split(imageA)
#imageA = cv2.merge([r,g,b])
#(b, g, r) = cv2.split(imageB)
#imageB = cv2.merge([r,g,b])
(kpsA, featuresA) = self.detectAndDescribe(imageA)
(kpsB, featuresB) = self.detectAndDescribe(imageB)
# match features between the two images
M = self.matchKeypoints(kpsA, kpsB,
featuresA, featuresB, ratio, reprojThresh)
# if the match is None, then there aren't enough matched
# keypoints to create a panorama
if M is None:
return None
# otherwise, apply a perspective warp to stitch the images
# together
(matches, H, status) = M
result = cv2.warpPerspective(imageA, H,
(imageA.size[1] + imageB.size[1], imageA.size[0]))
result[0:imageB.size[0], 0:imageB.size[1]] = imageB
# check to see if the keypoint matches should be visualized
if showMatches:
vis = self.drawMatches(imageA, imageB, kpsA, kpsB, matches,
status)
# return a tuple of the stitched image and the
# visualization
return (result, vis)
# return the stitched image
return result
def detectAndDescribe(self, image):
# convert the image to grayscale
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
# check to see if we are using OpenCV 3.X
if self.isv3:
# detect and extract features from the image
descriptor = cv2.xfeatures2d.SIFT_create()
(kps, features) = descriptor.detectAndCompute(image, None)
# otherwise, we are using OpenCV 2.4.X
else:
# detect keypoints in the image
detector = cv2.FeatureDetector_create("SIFT")
kps = detector.detect(gray)
# extract features from the image
extractor = cv2.DescriptorExtractor_create("SIFT")
(kps, features) = extractor.compute(gray, kps)
# convert the keypoints from KeyPoint objects to NumPy
# arrays
kps = np.float32([kp.pt for kp in kps])
# return a tuple of keypoints and features
return (kps, features)
def matchKeypoints(self, kpsA, kpsB, featuresA, featuresB,
ratio, reprojThresh):
# compute the raw matches and initialize the list of actual
# matches
matcher = cv2.DescriptorMatcher_create("BruteForce")
rawMatches = matcher.knnMatch(featuresA, featuresB, 2)
matches = []
# loop over the raw matches
for m in rawMatches:
# ensure the distance is within a certain ratio of each
# other (i.e. Lowe's ratio test)
if len(m) == 2 and m[0].distance < m[1].distance * ratio:
matches.append((m[0].trainIdx, m[0].queryIdx))
# computing a homography requires at least 4 matches
if len(matches) > 4:
# construct the two sets of points
ptsA = np.float32([kpsA[i] for (_, i) in matches])
ptsB = np.float32([kpsB[i] for (i, _) in matches])
# compute the homography between the two sets of points
(H, status) = cv2.findHomography(ptsA, ptsB, cv2.RANSAC,
reprojThresh)
# return the matches along with the homograpy matrix
# and status of each matched point
return (matches, H, status)
# otherwise, no homograpy could be computed
return None
def drawMatches(self, imageA, imageB, kpsA, kpsB, matches, status):
# initialize the output visualization image
(hA, wA) = imageA.shape[:2]
(hB, wB) = imageB.shape[:2]
vis = np.zeros((max(hA, hB), wA + wB, 3), dtype="uint8")
vis[0:hA, 0:wA] = imageA
vis[0:hB, wA:] = imageB
# loop over the matches
for ((trainIdx, queryIdx), s) in zip(matches, status):
# only process the match if the keypoint was successfully
# matched
if s == 1:
# draw the match
ptA = (int(kpsA[queryIdx][0]), int(kpsA[queryIdx][1]))
ptB = (int(kpsB[trainIdx][0]) + wA, int(kpsB[trainIdx][1]))
cv2.line(vis, ptA, ptB, (0, 255, 0), 1)
# return the visualization
return vis
run.py
# -*- coding: utf-8 -*-
"""
Created on Mon Dec 18 11:13:23 2017
#author: user
"""
# import the necessary packages
import os
os.chdir('/home/user/Desktop/stitcher')
from str import Stitcher
import argparse
import imutils
import cv2
# construct the argument parse and parse the arguments
ap = argparse.ArgumentParser()
ap.add_argument("-f", "--first", required=True,
help="path to the first image")
ap.add_argument("-s", "--second", required=True,
help="path to the second image")
args = vars(ap.parse_args())
# load the two images and resize them to have a width of 400 pixels
# (for faster processing)
#from PIL import Image
#imageA = Image.open(args['first']).convert('RGB')
#imageB = Image.open(args['second']).convert('RGB')
imageA = cv2.imread(args["first"])
imageB = cv2.imread(args["second"])
#imageA = imutils.resize(imageA, width=400)
#imageB = imutils.resize(imageB, width=400)
imageA = cv2.resize(imageA,(2464,832)) #hardcoded values
imageB = cv2.resize(imageB,(2464,832)) #hardcoded values
# stitch the images together to create a panorama
stitcher = Stitcher()
(result, vis) = stitcher.stitch([imageA, imageB], showMatches=True)
cv2.imwrite('stage1.png',result)
# show the images
cv2.imshow("Image A", imageA)
cv2.imshow("Image B", imageB)
cv2.imshow("Keypoint Matches", vis)
cv2.imshow("Result", result)
cv2.waitKey(0)
As you can see, I have resized the images so that they have the same height and width with hardcoded values. I could have just got the minimum of two and put that as their length and breadth.
When I bring in the third image, I can't inflate it to match the resolution of stage1 or neither can I decrease the stage1's resolution to match the third image.
P.S. : imgutils didn't give me a way to choose both length and breadth.