I am trying to move my code from the CPU to CUDA using opencv-python (v4.4.5) on a Quadro P1000.
I see a massive speed-up for background subtraction, but morphological operations are slower than on the CPU.
There are several related questions to this, but most of them are related to c++ or do not propose any useful ideas. (Q1, Q2, Q3)
My test code reports that the CPU implementation of the morphological operators runs at 20 FPS, while the GPU implementation runs at 10 FPS on a 4K video stream.
import cv2
from vidgear.gears import VideoGear
from tqdm import tqdm
from time import time
# Half-widths of the square kernels: each kernel has side 2 * size + 1 and is
# anchored at its centre point (size, size).
erosion_size = 3
erosion_shape = cv2.MORPH_RECT
element_erosion = cv2.getStructuringElement(
    erosion_shape,
    (erosion_size * 2 + 1,) * 2,
    (erosion_size,) * 2,
)
dilatation_size = 3
dilation_shape = cv2.MORPH_RECT
element_dilation = cv2.getStructuringElement(
    dilation_shape,
    (dilatation_size * 2 + 1,) * 2,
    (dilatation_size,) * 2,
)
def morphological_filter(img):
    """Clean up a mask on the CPU: one erosion followed by four dilations.

    Args:
        img: Image to filter; passed straight to ``cv2.erode``.

    Returns:
        The image after eroding once with ``element_erosion`` and dilating
        with ``element_dilation`` (three iterations, then one more pass).
    """
    eroded = cv2.erode(img, element_erosion, iterations=1)
    # NOTE(review): 3 iterations + 1 extra pass = 4 dilations here, while
    # morphological_filter_gpu applies only 2 — the two benchmark paths do not
    # perform the same amount of work.
    dilated = cv2.dilate(eroded, element_dilation, iterations=3)
    return cv2.dilate(dilated, element_dilation)
# Open the input video through vidgear, requesting OpenCV's FFMPEG backend.
path = "path/to/vid"
cap = VideoGear(source=path,backend = cv2.CAP_FFMPEG).start()
# CUDA MOG2 background subtractor; it is applied in both benchmark loops below.
backSub = cv2.cuda.createBackgroundSubtractorMOG2()
img_c = cv2.cuda_GpuMat()
# NOTE(review): the right-hand side of the next assignment is missing in this
# copy of the code; presumably a frame is read here and uploaded into img_c
# before the colour conversion — confirm against the original.
img =
img_c = cv2.cuda.cvtColor(img_c, cv2.COLOR_BGR2GRAY)
background = cv2.cuda_GpuMat(img_c.size(),img_c.type())
# The CUDA morphology filters are created once, from the grayscale GpuMat's
# type and the same structuring elements the CPU filter uses.
dilation_cuda = cv2.cuda.createMorphologyFilter(cv2.MORPH_DILATE, img_c.type(), element_dilation)
erosion_cuda = cv2.cuda.createMorphologyFilter(cv2.MORPH_ERODE, img_c.type(), element_erosion)
def morphological_filter_gpu(img):
    """Clean up a mask on the GPU: one erosion followed by two dilations.

    Args:
        img: Input accepted by the CUDA morphology filters' ``apply`` method
            (presumably a ``cv2.cuda_GpuMat`` — confirm against the caller).

    Returns:
        The result of applying ``erosion_cuda`` once and ``dilation_cuda``
        twice, in that order.
    """
    eroded = erosion_cuda.apply(img)
    dilated_once = dilation_cuda.apply(eroded)
    return dilation_cuda.apply(dilated_once)
# Benchmark 1: background subtraction on the GPU, morphology on the CPU,
# timed over 1000 iterations.
start = time()
for i in tqdm(range(1000)):
# NOTE(review): the frame read on the next line is truncated in this copy.
img =
# NOTE(review): type(img) returns a class and therefore never equals None, so
# this condition is always False; `img is None` was presumably intended. The
# body of the branch is not visible here.
if type(img) ==None:
img_c = cv2.cuda.cvtColor(img_c, cv2.COLOR_BGR2GRAY)
img_d = backSub.apply(img_c, -1, cv2.cuda_Stream.Null())
# NOTE(review): truncated assignment; presumably the mask is downloaded from
# the GPU into `res` before the CPU filter runs — confirm.
res =
res = morphological_filter(res)
end = time()
# Report the wall-clock time and the frame rate over the 1000 iterations.
print("time elapsed: ", end-start)
print("FPS ",1000/(end-start))
# Benchmark 2: same stream reopened, with the morphology also done on the GPU.
path = "path/to/vid"
cap = VideoGear(source=path,backend = cv2.CAP_FFMPEG).start()
start = time()
for i in tqdm(range(1000)):
# NOTE(review): the frame read on the next line is truncated in this copy.
img =
# NOTE(review): type(img) never equals None, so this check is always False;
# `img is None` was presumably intended. The branch body is not visible here.
if type(img) ==None:
img_c = cv2.cuda.cvtColor(img_c, cv2.COLOR_BGR2GRAY)
img_d = backSub.apply(img_c, -1, cv2.cuda_Stream.Null())
img_d = morphological_filter_gpu(img_d)
# NOTE(review): truncated assignment; presumably the filtered mask is
# downloaded from the GPU into `res` — confirm.
res =
end = time()
# Report the wall-clock time and the frame rate over the 1000 iterations.
print("time elapsed: ", end-start)
print("FPS ",1000/(end-start))
Now the final question: is this due to the CUDA implementation in OpenCV, or am I doing something wrong? If so, how can I speed it up?
I wrote space-detection code using grayscale conversion and Gaussian blur, but I don't know where to put the following code to save my OpenCV video.
I have already tried inserting the code at various lines, but the resulting output file cannot be played and is only 5.6 KB, even when I record for a very long time.
My code runs fine without the save feature, but I want to add video saving:
# Writer for 'output.avi' using the DIVX fourcc at 20.0 FPS with a 640x480
# frame size. Creating the writer alone stores no frames.
fourcc = open_cv.VideoWriter_fourcc(*'DIVX')
out = open_cv.VideoWriter('output.avi',fourcc, 20.0, (640,480))
This is my code, to which I want to add the video-saving code from above:
import cv2 as open_cv
import numpy as np
import logging
from drawing_utils import draw_contours
# Detects occupancy changes in a set of polygonal regions of a video by
# thresholding the mean absolute Laplacian inside each region.
# NOTE(review): this copy of the class has lost its indentation and several
# statements are truncated; the notes below mark every gap that is visible.
# MotionDetector.DETECT_DELAY and MotionDetector.LAPLACIAN are read below but
# are not defined anywhere in the visible class body.
class MotionDetector:
# NOTE(review): the signature line is garbled (trailing "= 0"); presumably an
# assignment such as storing `video` was fused into it — confirm.
def __init__(self, video, coordinates, start_frame): = 0
self.coordinates_data = coordinates
self.start_frame = start_frame
self.contours = []
self.bounds = []
self.mask = []
# Main loop: prepares a mask per region, then reads frames, updates each
# region's status and displays the annotated frame.
def detect_motion(self):
# NOTE(review): the VideoCapture argument is truncated in this copy.
capture = open_cv.VideoCapture(
capture.set(open_cv.CAP_PROP_POS_FRAMES, self.start_frame)
coordinates_data = self.coordinates_data
logging.debug("coordinates data: %s", coordinates_data)
# Per region: bounding rectangle, coordinates shifted to the rectangle's
# origin, and a boolean mask of the polygon.
for p in coordinates_data:
coordinates = self._coordinates(p)
logging.debug("coordinates: %s", coordinates)
rect = open_cv.boundingRect(coordinates)
logging.debug("rect: %s", rect)
new_coordinates = coordinates.copy()
new_coordinates[:, 0] = coordinates[:, 0] - rect[0]
new_coordinates[:, 1] = coordinates[:, 1] - rect[1]
logging.debug("new_coordinates: %s", new_coordinates)
# NOTE(review): the drawContours call is truncated after its first argument.
mask = open_cv.drawContours(
np.zeros((rect[3], rect[2]), dtype=np.uint8),
mask = mask == 255
# NOTE(review): no append to self.contours / self.bounds / self.mask is
# visible, yet __apply indexes self.bounds and self.mask; presumably those
# appends were lost from this copy — confirm.
logging.debug("mask: %s", self.mask)
# statuses[i]: last committed status of region i; times[i]: video position (s)
# at which a not-yet-committed status change was first seen, or None.
statuses = [False] * len(coordinates_data)
times = [None] * len(coordinates_data)
while capture.isOpened():
# NOTE(review): the frame read is truncated, and the two checks that follow
# have no visible bodies.
result, frame =
if frame is None:
if not result:
raise CaptureReadError("Error reading video capture on frame %s" % str(frame))
# Blur, then convert to grayscale; the grayscale image is what __apply reads.
blurred = open_cv.GaussianBlur(frame.copy(), (5, 5), 3)
grayed = open_cv.cvtColor(blurred, open_cv.COLOR_BGR2GRAY)
new_frame = frame.copy()
logging.debug("new_frame: %s", new_frame)
position_in_seconds = capture.get(open_cv.CAP_PROP_POS_MSEC) / 1000.0
for index, c in enumerate(coordinates_data):
status = self.__apply(grayed, index, c)
# A pending change is cancelled when the status matches the committed one.
if times[index] is not None and self.same_status(statuses, index, status):
times[index] = None
# A pending change is committed once it has lasted at least DETECT_DELAY.
if times[index] is not None and self.status_changed(statuses, index, status):
if position_in_seconds - times[index] >= MotionDetector.DETECT_DELAY:
statuses[index] = status
times[index] = None
# A new change starts its timer at the current video position.
if times[index] is None and self.status_changed(statuses, index, status):
times[index] = position_in_seconds
# Draw every region, coloured by its committed status.
# NOTE(review): COLOR_GREEN / COLOR_BLUE / COLOR_WHITE are not defined or
# imported in the visible code.
for index, p in enumerate(coordinates_data):
coordinates = self._coordinates(p)
color = COLOR_GREEN if statuses[index] else COLOR_BLUE
draw_contours(new_frame, coordinates, str(p["id"] + 1), COLOR_WHITE, color)
# NOTE(review): the window-name argument of imshow is truncated.
open_cv.imshow(str(, new_frame)
k = open_cv.waitKey(1)
# NOTE(review): the body of the "q" key check is not visible.
if k == ord("q"):
# Returns True when the mean absolute Laplacian of the masked region is below
# MotionDetector.LAPLACIAN.
def __apply(self, grayed, index, p):
coordinates = self._coordinates(p)
logging.debug("points: %s", coordinates)
rect = self.bounds[index]
logging.debug("rect: %s", rect)
# Crop the grayscale frame to the region's bounding rectangle (x, y, w, h).
roi_gray = grayed[rect[1]:(rect[1] + rect[3]), rect[0]:(rect[0] + rect[2])]
laplacian = open_cv.Laplacian(roi_gray, open_cv.CV_64F)
logging.debug("laplacian: %s", laplacian)
# The shifted coordinates computed here are not used again in this method.
coordinates[:, 0] = coordinates[:, 0] - rect[0]
coordinates[:, 1] = coordinates[:, 1] - rect[1]
status = np.mean(np.abs(laplacian * self.mask[index])) < MotionDetector.LAPLACIAN
logging.debug("status: %s", status)
return status
# NOTE(review): the three helpers below take no `self` but are called through
# `self.` above; presumably @staticmethod decorators were lost — confirm.
# Returns the region's "coordinates" entry as a NumPy array.
def _coordinates(p):
return np.array(p["coordinates"])
# True when `status` equals the stored status at `index`.
def same_status(coordinates_status, index, status):
return status == coordinates_status[index]
# True when `status` differs from the stored status at `index`.
def status_changed(coordinates_status, index, status):
return status != coordinates_status[index]
class CaptureReadError(Exception):
    """Raised when a frame cannot be read from the video capture."""
Create the file for the video to go in. You can think of this file like a book, but a book with no pages. This code is used to create the file:
# Creates 'output.avi' with the DIVX fourcc, 20.0 FPS and a 640x480 frame size;
# no frames are stored until they are written to `out`.
fourcc = open_cv.VideoWriter_fourcc(*'DIVX')
out = open_cv.VideoWriter('output.avi',fourcc, 20.0, (640,480))
Not all codecs work on all systems. I use Ubuntu, and this is the code I use to create a video file:
# MJPG-encoded 'output.avi' at 30 FPS. (w, h) is not defined in this snippet —
# presumably the width and height of the frames to be written; confirm.
out = cv2.VideoWriter('output.avi',cv2.VideoWriter_fourcc('M','J','P','G'), 30,(w,h))
If you try to play this video, nothing will happen. There are no frames in the video, so there is nothing to play (like a book with no pages). After you process each frame, the frame needs to be written to the video (like putting a page in a book):
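You should do this around .imshow(), by calling out.write(new_frame) for every processed frame (the frame must have the same width and height that were given to VideoWriter), and call out.release() once the loop has finished: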
# Display step of the frame loop, quoted from the question.
# NOTE(review): the window-name argument of imshow is truncated in this copy,
# and the body of the "q" key check is not visible.
open_cv.imshow(str(, new_frame)
k = open_cv.waitKey(1)
if k == ord("q"):
I'm trying to speed up an extensive real-time object detection pipeline and the computations performed on its results.
I'm using OpenCV with a thread pool and a producer–consumer pattern for the video capture, but the execution speed is the same as in the serial version.
How can I improve the execution speed?
# Driver: reads frames, warps each with a fixed homography and hands it to
# `pipeline`, either through a thread pool or through DummyTask.
# NOTE(review): indentation is lost and several statements are truncated in
# this copy; the notes below mark the visible gaps.
if __name__ == "__main__":
video_name = '2016-11-18_07-30-01.h264'
cap = cv2.VideoCapture(video_name)
det = detector.CarDetector()
# Separate tracker instances for cars and pedestrians; the same two instances
# are passed to every pipeline task.
car_tracker = Sort_Algorithm.Sort()
ped_tracker = Sort_Algorithm.Sort()
df_region, df_line = load_filter()
region = Region(df_region)
# Buffer the region by 10% of the value compute_max_polygon_diagonal returns.
distance = compute_max_polygon_diagonal(df_region) * 0.1
region_buffered = region.buffer(distance)
# threadn caps the number of pending tasks; the pool itself has 2 workers.
threadn = cv2.getNumberOfCPUs()
pool = ThreadPool(processes = 2)
pending = deque()
threaded_mode = True
# NOTE(review): the lock is passed to pipeline, but the visible pipeline body
# never acquires it.
lock = threading.Lock()
while True:
# Show finished results in submission order.
while len(pending) > 0 and pending[0].ready():
res = pending.popleft().get()
cv2.imshow('video ', res)
if len(pending) < threadn:
# NOTE(review): the frame read is truncated in this copy.
ret, frame =
if threaded_mode:
t1 = time.time()
# Fixed 3x3 perspective matrix, given as nine row-major values.
H = [-2.01134074616, -16.6502442427, -1314.05715739, -3.35391526592, -22.3546973012, 2683.63584335,
-0.00130731963137, -0.0396207582264, 1]
matrix = np.reshape(H, (3, 3))
dst = cv2.warpPerspective(frame.copy(), matrix, (frame.shape[1], frame.shape[0]))
task = pool.apply_async(pipeline, (lock, frame.copy(),car_tracker, ped_tracker,df_region,region_buffered, df_line, det, dst, matrix))
cv2.imshow('dst', dst)
# NOTE(review): presumably the next line belongs to an `else:` branch that was
# lost; no `pending.append(task)` is visible either — confirm.
task = DummyTask(pipeline,(lock, frame.copy(),car_tracker, ped_tracker,df_region, region_buffered, df_line, det, dst, matrix))
ch = cv2.waitKey(1)
# Space toggles between threaded and DummyTask execution.
if ch == ord(' '):
threaded_mode = not threaded_mode
# Key code 27 is checked here; the branch body is not visible in this copy.
if ch == 27:
The code for pipeline:
# Per-frame work: detect and track cars and pedestrians, draw them on `img`,
# project box centres through H, accumulate per-object rows in module-level
# tables and assess a car/pedestrian interaction. Returns the annotated `img`.
# NOTE(review): indentation is lost in this copy, so the nesting of the
# statements below cannot be read from the text.
# NOTE(review): `lock` is accepted but never used in the visible body, although
# the function mutates module-level state.
def pipeline(lock, img, car_tracker, ped_tracker, df_region, region_buffered, df_line, det, dst, H):
global point_lists
global df_car_lists
global frame_idx
global counter
global data_peds
global data_cars
global genera_data_pd_cars
global genera_data_pd_peds
# Detect boxes, then update each tracker with its detections.
car_box, ped_box = det.get_localization(img)
car_detections = car_tracker.update(np.array(car_box))
ped_detections = ped_tracker.update(np.array(ped_box))
saved_region = df_region.values
# Drop column index 2 of the region array before drawing it.
saved_region = np.delete(saved_region, 2, 1)
# NOTE(review): the result of this warpPerspective call is discarded.
cv2.warpPerspective(np.array(df_line, dtype=np.float32), H, (df_line.shape[1], df_line.shape[0]))
cv2.polylines(dst, np.int32([[saved_region]]), False, color=(255, 0, 0))
cv2.polylines(dst, np.int32([np.array(df_line, dtype=np.float32)]), False, color=(255, 0, 0))
for trk in car_detections:
trk = trk.astype(np.int32)
helpers.draw_box_label(img, trk, trk[4]) # Draw the bounding boxes on the
for other in ped_detections:
other = other.astype(np.int32)
helpers.draw_box_label(img, other, other[4]) # Draw the bounding boxes on the
# Project each car's box centre through H and append its computed row.
for trk in car_detections:
trk = trk.astype(np.int32)
p = np.array([[((trk[1] + trk[3]) / 2, (trk[0] + trk[2]) / 2)]], dtype=np.float32)
center_pt = cv2.perspectiveTransform(p, H)
ptx = center_pt.T.item(0)
pty = center_pt.T.item(1)
df_cars = compute(trk[4], ptx, pty, frame_idx, df_region, region_buffered, df_line)
genera_data_pd_cars = genera_data_pd_cars.append(df_cars)
# Same for pedestrians.
for other in ped_detections:
other = other.astype(np.int32)
p = np.array([[((other[1] + other[3]) / 2, (other[0] + other[2]) / 2)]], dtype=np.float32)
center_pt = cv2.perspectiveTransform(p, H)
ptx = center_pt.T.item(0)
pty = center_pt.T.item(1)
df_peds = compute(other[4], ptx, pty, frame_idx, df_region, region_buffered, df_line)
# NOTE(review): the pedestrian table is rebuilt from the *car* table here;
# presumably genera_data_pd_peds.append(df_peds) was intended — confirm.
genera_data_pd_peds = genera_data_pd_cars.append(df_peds)
# Keep only rows flagged as inside the region.
query = "is_in_region == True and is_in_region_now == True"
df_peds = genera_data_pd_peds.query(query)
query = " is_in_region == True"
df_cars = genera_data_pd_cars.query(query)
if len(df_cars)> 1 and len(df_peds) > 1:
df_car_in_t_range_ped = select_slice(df_cars, df_peds)
df_ped_in_t_range_car = select_slice(df_peds, df_cars)
t_abs_crossing_car = df_cars['t_abs_at_crossing'].iloc[0]
t_abs_crossing_ped = df_peds['t_abs_at_crossing'].iloc[0]
# Negative when the car's crossing time precedes the pedestrian's.
dt_crossing = t_abs_crossing_car - t_abs_crossing_ped
is_able_to_pass_before_ped = \
((df_car_in_t_range_ped['t_abs_at_crossing_estimated'] -
t_abs_crossing_ped) > 0).any()
# NOTE(review): the Behavior(...) call is truncated in this copy — only the
# first argument survives and the closing parenthesis is missing.
behavior = Behavior( # is_passed_before_ped
dt_crossing < 0,
# is_able_to_stop
# is_too_fast
# is_close_enough
# is_able_to_pass_before_ped
# `trk` and `other` are whatever the loops above left behind (their last
# items); they are unbound if a loop never ran.
interaction = Interaction(trk[4], other[4])
interaction = interaction.assess_behavior(behavior)
code, res, msg = interaction.code, interaction.res, interaction.msg
# Reset both accumulators to empty with a 0:0 slice.
genera_data_pd_cars = genera_data_pd_cars.iloc[0:0]
genera_data_pd_peds = genera_data_pd_peds.iloc[0:0]
return img
Multi-threading in Python for CPU-bound tasks is limited by the GIL, which effectively lets only a single thread run at a time.
Of course, if you launch multiple threads for CPU-bound tasks, performance will degrade even further, because there is a lot of overhead for both the kernel and the Python interpreter in managing these threads.
The kernel wants to schedule these threads while Python wants to keep them from running simultaneously, and this results in a lot of context switches, which degrades performance.
If you are using only NumPy in the threads then you may be fine, since NumPy releases the GIL during many of its operations, but I am not sure whether that is true for OpenCV as well.
Threads in Python aren't meant for computation tasks.
This is a classic problem with threads in Python; consider using multiprocessing. There are a number of articles on this topic, and you might want to check a few of them.
Threads aren't executed in parallel in CPython. Try using the ProcessPoolExecutor instead.
I am working on an image stitching project with OpenCV in Python, in which I find homography matrices from point matches obtained by tracking points between the frames of a video with the Lucas-Kanade algorithm. After writing the program, when it came time to stitch the frames of a video together, I decided to run a test in which I simply display the perspective-warped version of each image on a black canvas to see how the homography matrix had warped it. When I did this, instead of shifting bit by bit from one frame to the next, the frames were translated by larger and larger distances, far more than the slight nudge expected between consecutive frames:
[----------------------------------------------------------------------------Empty Space------------------------------------]
[------------Frame1----------------------------------------------------------------------------------------------------------- ]
[-------------------------------------------Frame 2----------------------------------------------------------------------------]
[------------------------------------------------------------------------------------------------------------Frame 3-----------]
Subsequent frames would be out of visual range. I am not quite sure why this is happening. I implemented a back-projection error check to make sure only points with accurate optical flow calculations were passed on. I also set the back-projection threshold for findHomography to 10, 1, and then 0.5, all to no avail. I am stitching multiple images, so I am multiplying my homography matrices between frames. This seems to be compounding the error. Why is this happening and how can I fix my homography matrices? Here is my code (ignore the commented-out tests; also, some of the indentation may have been mangled while copying it over to the forum):
import numpy as np
import sys
import cv2
import math
# Module-level placeholders, all initialised to None; nothing in the visible
# code assigns them afterwards.
lastFeatures = currentFeatures = opticFlow = None
panRow = Rows = finalPanorama = None
def loadRow(dirPath, fType, numImages, column):
    """Load one row of images named ``<column>_<index>.<fType>`` from ``dirPath``.

    Args:
        dirPath: Directory that contains the image files.
        fType: File extension without the leading dot.
        numImages: Number of images to load; indices run from 0 to numImages - 1.
        column: Integer prefix used in every file name.

    Returns:
        A list with one ``cv2.imread`` result per index, read with
        ``cv2.IMREAD_COLOR``. The list is empty when ``numImages`` is 0 or less.
    """
    return [
        cv2.imread("%s/%i_%i.%s" % (dirPath, column, i, fType), cv2.IMREAD_COLOR)
        for i in range(numImages)
    ]
def findNthFeatures(prevImg, prevPnts, nxtImg):
    """Track points from ``prevImg`` into ``nxtImg`` with pyramidal Lucas-Kanade.

    Every point is tracked forward and then back again; a point is kept only
    when the backward track lands within ``back_threshold`` pixels of its
    starting position on both axes.

    Args:
        prevImg: BGR image the points were found in.
        prevPnts: Points to track, or ``None``. Assumes the (N, 1, 2) float32
            layout produced by ``cv2.goodFeaturesToTrack`` — confirm for other
            callers.
        nxtImg: BGR image to track the points into.

    Returns:
        ``(goodLast, goodNew)``: the surviving points in the previous image and
        their positions in the next image. Both are empty (0, 1, 2) float32
        arrays when ``prevPnts`` is ``None`` or empty.
    """
    # Nothing to track: return empty point sets instead of failing inside OpenCV.
    if prevPnts is None or len(prevPnts) == 0:
        return (
            np.empty((0, 1, 2), dtype=np.float32),
            np.empty((0, 1, 2), dtype=np.float32),
        )

    back_threshold = 0.5
    prevGrey = cv2.cvtColor(prevImg, cv2.COLOR_BGR2GRAY)
    nxtGrey = cv2.cvtColor(nxtImg, cv2.COLOR_BGR2GRAY)
    lucasKanadeParams = dict(
        winSize=(19, 19),
        maxLevel=100,
        criteria=(cv2.TERM_CRITERIA_EPS | cv2.TERM_CRITERIA_COUNT, 10, 0.03),
    )

    # Forward pass, then backward pass from the tracked positions.
    nxtPnts, _, _ = cv2.calcOpticalFlowPyrLK(
        prevGrey, nxtGrey, prevPnts, None, **lucasKanadeParams
    )
    backProjections, _, _ = cv2.calcOpticalFlowPyrLK(
        nxtGrey, prevGrey, nxtPnts, None, **lucasKanadeParams
    )

    # Largest per-axis round-trip error of each point.
    d = abs(prevPnts - backProjections).reshape(-1, 2).max(-1)
    keep = d < back_threshold
    return prevPnts[keep].copy(), nxtPnts[keep].copy()
# Walks through a video, tracks corner features from frame to frame and
# estimates a homography from each frame back to the previous one. Returns the
# list `Homographies`.
# NOTE(review): indentation is lost and the frame reads are truncated in this
# copy. No statement appending to `Homographies` is visible either; presumably
# it was lost — confirm.
def getHomographies(videoName):
color = np.random.randint(0,255,(100,3))
lastFrame = None
currentFrame = None
lastKeypoints = None
currentKeypoints = None
firstImage = True
# Features are re-detected whenever frameCount is a multiple of this value.
featureRefreshRate = 5
feature_params = dict( maxCorners = 100,
qualityLevel = 0.1,
minDistance = 8,
blockSize = 15)
frameCount = 0
Homographies = []
cv2.namedWindow('display', cv2.WINDOW_NORMAL)
cap = cv2.VideoCapture(videoName)
# NOTE(review): truncated frame read.
flags, frame =
while flags:
# First frame: detect the initial features, then move on to the next frame.
if firstImage:
firstImage = False
lastFrame = frame[:,:].copy()
lastGray = cv2.cvtColor(lastFrame, cv2.COLOR_BGR2GRAY)
lastKeypoints = cv2.goodFeaturesToTrack(lastGray, mask = None, **feature_params)
# NOTE(review): truncated frame read.
flags, frame =
frameCount += 1
mask = np.zeros_like(lastFrame)
currentFrame = frame[:,:].copy()
# NOTE(review): frameCount is incremented a second time here; whether both
# increments run for the same frame depends on the lost indentation.
frameCount += 1
lastKeypoints, currentKeypoints = findNthFeatures(lastFrame, lastKeypoints, currentFrame)
# for i,(new,old) in enumerate(zip(currentKeypoints, lastKeypoints)):
# a, b = new.ravel()
# c, d = old.ravel()
# mask = cv2.line(mask, (a,b), (c,d), color[i].tolist(), 2)
# frame =, (a,b), 5, color[i].tolist(), -1)
# img = cv2.add(frame,mask)
# cv2.imshow('display', img)
# cv2.waitKey(0)
# Maps points of the current frame onto the previous frame (RANSAC, threshold 0.5).
homographyMatrix, homographyStatus = cv2.findHomography(currentKeypoints, lastKeypoints, cv2.RANSAC, 0.5)
lastFrame = currentFrame
lastKeypoints = currentKeypoints
# Periodically replace the tracked points with freshly detected corners.
if frameCount % featureRefreshRate == 0:
grayBuf = cv2.cvtColor(lastFrame, cv2.COLOR_BGR2GRAY)
lastKeypoints = cv2.goodFeaturesToTrack(grayBuf, mask = None, **feature_params)
# NOTE(review): truncated frame read.
flags, frame =
return Homographies
# Displays every frame of the video warped by the homography accumulated back
# to the first frame, on a canvas twice as wide as a single frame.
# NOTE(review): indentation is lost and the frame reads are truncated in this copy.
def stitchRow(videoName):
cv2.namedWindow('display', cv2.WINDOW_NORMAL)
frameCount = 0
cap = cv2.VideoCapture(videoName)
# NOTE(review): truncated frame read.
ret, initialImage =
homographyMatrices = []
homographyMatrices = getHomographies(videoName)
warpHMat = homographyMatrices[frameCount]
while ret:
# NOTE(review): truncated frame read.
ret, nextImg =
frameCount += 1
result = cv2.warpPerspective(nextImg, warpHMat, (initialImage.shape[1] + nextImg.shape[1], nextImg.shape[0]))
#result[0:initialImage.shape[0], 0:initialImage.shape[1]] = initialImage
cv2.imshow('display', result)
# cv2.imshow('display', initialImage)
# cv2.waitKey(0)
# Rebuild the accumulated transform for the next frame from every earlier
# homography.
warpHMat = homographyMatrices[frameCount]
for j in range(frameCount, 0, -1):
# NOTE(review): if these are NumPy arrays (as returned by cv2.findHomography),
# `*` multiplies element-wise; a matrix product (np.dot or @) is presumably
# what is intended when chaining homographies — confirm.
warpHMat = warpHMat * homographyMatrices[j-1]
# initialImage = result[:, :].copy()
I'm running opencv 2.4.1 using python bindings and am having difficulty calculating the optical flow.
Specifically this section of code:
#calculate the opticalflow
# NOTE(review): the bodies of the two checks below are not visible in this copy.
if prev_saturation_thresh_img==None:
if i >=0:
# NOTE(review): the working call in the answer further down passes None as a
# fourth positional argument before the keyword parameters; this call omits it.
p1, st, err = cv2.calcOpticalFlowPyrLK(prev_img,next_img,tracks_np,**lk_params)
Returns the error:
<unknown> is not a numpy array
So then I try to convert the images to numpy arrays:
Now I have a new error:
<unknown> data type = 17 is not supported
In a last-ditch effort I convert the images to cvmat (from iplimage) before converting it to a numpy array, just to see what happens
error: ..\..\..\OpenCV-2.4.1\modules\video\src\lkpyramid.cpp:607: error: (-215) nextPtsMat.checkVector(2, CV_32F, true) == npoints
So now I'm stuck. Below is the code in its entirety for reference:
import cv
import cv2
import numpy as np
# Python 2 script class that reads "raw_gait_cropped.avi" with the legacy `cv`
# API, thresholds the saturation channel and tries to track features with
# cv2.calcOpticalFlowPyrLK.
# NOTE(review): indentation is lost and most statements of run() are missing in
# this copy; only the original step comments remain for those steps.
class Target:
def __init__(self):
self.capture = cv.CaptureFromFile("raw_gait_cropped.avi")
def run(self):
#initiate font
font = cv.InitFont(cv.CV_FONT_HERSHEY_SIMPLEX, 1, 1, 0, 3, 8)
#instantiate images
#create params for GoodFeaturesToTrack and calcOpticalFlowPyrLK
# NOTE(review): both dict(...) calls below are truncated (no closing parenthesis).
gftt_params = dict( cornerCount=11,
lk_params = dict( winSize = (15, 15),
maxLevel = 2,
criteria = (cv2.TERM_CRITERIA_EPS | cv2.TERM_CRITERIA_COUNT, 10, 0.03),
while True:
#grab a frame from the video capture
#break the loop when the video is over
if img == None:
#convert the image to HSV
#Get Saturation channel
#Apply threshold to saturation channel
#locate initial features to track
if i==0:
# eig_image and temp_image are bound to the same matrix object here.
eig_image=temp_image = cv.CreateMat(img.height, img.width, cv.CV_32FC1)
for (x,y) in cv.GoodFeaturesToTrack(saturation_thresh_img, eig_image, temp_image, **gftt_params):
print tracks
#calculate the opticalflow
if prev_saturation_thresh_img==None:
if i >=0:
# NOTE(review): prev_img, next_img and tracks_np are not assigned anywhere in
# the visible code.
p1, st, err = cv2.calcOpticalFlowPyrLK(prev_img,next_img,tracks_np,**lk_params)
print i
#display frames to users
cv.ShowImage("Raw Video",img)
cv.ShowImage("Saturation Channel",saturation_img)
cv.ShowImage("Saturation Thresholded",saturation_thresh_img)
# Listen for ESC or ENTER key
c = cv.WaitKey(7) % 0x100
if c == 27 or c == 10:
#close all windows once video is done
# Script entry point: builds the Target.
# NOTE(review): no call to t.run() is visible in this copy; presumably it was
# lost — confirm.
if __name__=="__main__":
t = Target()
OpenCV can be very picky about the data formats it accepts. The following code extract works for me:
# Load the previous image with the legacy API, convert it to a NumPy array via
# a full [:,:] slice, then to grayscale.
prev = cv.LoadImage('images/'+file_list[0])
prev = np.asarray(prev[:,:])
prev_gs = cv2.cvtColor(prev, cv2.COLOR_BGR2GRAY)
# Same conversion for the current image.
current = cv.LoadImage('images/'+file)
current = np.asarray(current[:,:])
current_gs = cv2.cvtColor(current, cv2.COLOR_BGR2GRAY)
# None is passed as the fourth positional argument.
# NOTE(review): the call is truncated after that argument in this copy.
features, status, track_error = cv2.calcOpticalFlowPyrLK(prev_gs, current_gs, good_features, None,
Note the [:,:] when converting from images to numpy arrays; I have found that it is required.
I hope that this may solve your problem.