I am using multiprocessing to get frames of a video using Opencv in python.
My class looks like this :-
import cv2
from multiprocessing import Process, Queue
class StreamVideos:
def __init__(self):
self.image_data = Queue()
def start_proces(self):
p = Process(target=self.echo)
p.start()
def echo(self):
cap = cv2.VideoCapture('videoplayback.mp4')
while cap.isOpened():
ret,frame = cap.read()
self.image_data.put(frame)
# print("frame")
I start the process "echo" using :-
p = Process(target=self.echo)
p.start()
the echo function looks like this :-
def echo(self):
cap = cv2.VideoCapture('videoplayback.mp4')
while cap.isOpened():
ret,frame = cap.read()
self.image_data.put(frame)
in which i am using queue where i put these frames
self.image_data.put(frame)
and then in another process I start reviving these frames
self.obj = StreamVideos()
def start_process(self):
self.obj.start_proces()
p = Process(target=self.stream_videos)
p.start()
def stream_videos(self):
while True:
self.img = self.obj.image_data.get()
print(self.img)
but as soon as I start putting frames to queue, the ram gets filled very quickly and the system gets stuck. The video I am using is just 25 fps and 39mb in size, so it does not make any sense.
One thing I noticed is that the "echo" process is putting a lot of frames in the queue before the "stream_videos" process retrives it.
What could be the root of this problem?
Thanks in advance.
Expectations: -
Able to retrieve the frames continuosly.
Tried :-
Not putting frames in queue, in which case the ram is not filled.
The following is a general purpose single producer/multiple consumer implementation. The producer (class StreamVideos) creates a shared memory array whose size is the number of bytes in the video frame. One or more consumers (you specify the number of consumers to StreamVideos) can then call StreamVideos.get_next_frame() to retrieve the next frame. This method converts the shared array back into a numpy array for subsequent processing. The producer will only read the next frame into the shared array after all consumers have called get_next_frame:
#!/usr/bin/env python3
import multiprocessing
import numpy as np
import ctypes
import cv2
class StreamVideos:
def __init__(self, path, n_consumers):
"""
path is the path to the video:
n_consumers is the number of tasks to which we will be sreaming this.
"""
self._path = path
self._event = multiprocessing.Event()
self._barrier = multiprocessing.Barrier(n_consumers + 1, self._reset_event)
# Discover how large a framesize is by getting the first frame
cap = cv2.VideoCapture(self._path)
ret, frame = cap.read()
if ret:
self._shape = frame.shape
frame_size = self._shape[0] * self._shape[1] * self._shape[2]
self._arr = multiprocessing.RawArray(ctypes.c_ubyte, frame_size)
else:
self._arr = None
cap.release()
def _reset_event(self):
self._event.clear()
def start_streaming(self):
cap = cv2.VideoCapture(self._path)
while True:
self._barrier.wait()
ret, frame = cap.read()
if not ret:
# No more readable frames:
break
# Store frame into shared array:
temp = np.frombuffer(self._arr, dtype=frame.dtype)
temp[:] = frame.flatten(order='C')
self._event.set()
cap.release()
self._arr = None
self._event.set()
def get_next_frame(self):
# Tell producer that this consumer is through with the previous frame:
self._barrier.wait()
# Wait for next frame to be read by the producer:
self._event.wait()
if self._arr is None:
return None
# Return shared array as a numpy array:
return np.ctypeslib.as_array(self._arr).reshape(self._shape)
def consumer(producer, id):
frame_name = f'Frame - {id}'
while True:
frame = producer.get_next_frame()
if frame is None:
break
cv2.imshow(frame_name, frame)
cv2.waitKey(1)
cv2.destroyAllWindows()
def main():
producer = StreamVideos('videoplayback.mp4', 2)
consumer1 = multiprocessing.Process(target=consumer, args=(producer, 1))
consumer1.start()
consumer2 = multiprocessing.Process(target=consumer, args=(producer, 2))
consumer2.start()
"""
# Run as a child process:
producer_process = multiprocessing.Process(target=producer.start_streaming)
producer_process.start()
producer_process.join()
"""
# Run in main process:
producer.start_streaming()
consumer1.join()
consumer2.join()
if __name__ == '__main__':
main()
below is the code i used to play multiple videos in parallel using multi threading pool. but only one video is playing for each input. i want each video to open separately. not combined
import concurrent.futures
RTSP_URL = "rtsp://wowzaec2demo.streamlock.net/vod/mp4:BigBuckBunny_115k.mp4"
RTSP_List = [RTSP_URL, RTSP_URL, RTSP_URL, RTSP_URL]
def url_to_video(url):
video = cv2.VideoCapture(url)
while True:
_, frame = video.read()
cv2.imshow("RTSP", frame)
k = cv2.waitKey(1)
if k == ord('q'):
break
video.release()
cv2.destroyAllWindows()
while True:
with concurrent.futures.ThreadPoolExecutor() as executor:
executor.map(url_to_video, RTSP_List)```
how to play each video separately.
you just need each thread to use a different name for the window in cv2.imshow, so that each thread will generate a different window, and you should place them somewhere distinct so that they aren't appearing one over the other, i just added in index to them so that each distinct index will have a position on screen and different title, also you shouldn't destroy all windows when one is done ...
import concurrent.futures
import cv2
RTSP_URL = "rtsp://wowzaec2demo.streamlock.net/vod/mp4:BigBuckBunny_115k.mp4"
RTSP_List = [(RTSP_URL,0), (RTSP_URL,1), (RTSP_URL,2), (RTSP_URL,3)]
def url_to_video(tup):
url,index = tup
video = cv2.VideoCapture(url)
while True:
_, frame = video.read()
cv2.imshow(f"RTSP {index}", frame)
cv2.moveWindow(f"RTSP {index}", index*300, 0)
k = cv2.waitKey(1)
if k == ord('q'):
break
video.release()
while True:
with concurrent.futures.ThreadPoolExecutor() as executor:
executor.map(url_to_video, RTSP_List)
cv2.destroyAllWindows()
This is a followup on this where #Aaron helped quite a lot.
Context:
OpenCV, Python, a USB Webcam and multiprocessing - Grabbing all the time from the camera on a subprocess and returning a single frame upon request to the main process.
#Aaron did most of the heavylifting, of not all :). What I am missing is how to turn this into a process that gets a single frame upon request.
Help Please?
import numpy as np
import cv2
from multiprocessing import Process, Queue
from multiprocessing.shared_memory import SharedMemory
def produce_frames(q):
#get the first frame to calculate size of buffer
cap = cv2.VideoCapture(0)
success, frame = cap.read()
shm = SharedMemory(create=True, size=frame.nbytes)
framebuffer = np.ndarray(frame.shape, frame.dtype, buffer=shm.buf) #could also maybe use array.array instead of numpy, but I'm familiar with numpy
framebuffer[:] = frame #in case you need to send the first frame to the main process
q.put(shm) #send the buffer back to main
q.put(frame.shape) #send the array details
q.put(frame.dtype)
try:
while True:
cap.read(framebuffer)
except KeyboardInterrupt:
pass
finally:
shm.close() #call this in all processes where the shm exists
shm.unlink() #call this in at least one process
def consume_frames(q):
shm = q.get() #get the shared buffer
shape = q.get()
dtype = q.get()
framebuffer = np.ndarray(shape, dtype, buffer=shm.buf) #reconstruct the array
try:
while True:
cv2.imshow("window title", framebuffer)
cv2.waitKey(100)
except KeyboardInterrupt:
pass
finally:
shm.close()
if __name__ == "__main__":
q = Queue()
producer = Process(target=produce_frames, args=(q,))
producer.start()
consume_frames(q)
I'm using OpenCv and Dlib to execute facial recognition w/ landmarks, live from the webcam stream. The language is Python. It works fine on my macbook laptop, but I need it to run from a desktop computer 24/7. The computer is a PC Intel® Core™2 Quad CPU Q6600 # 2.40GHz 32bit running Debian Jessie. The drop in performance is drastic : there is a 10 seconds delay due to processing !
I therefore looked into multi-threading to gain performance :
I first tried the sample code by OpenCv, and the result is great! All four cores hit 100%, and the performance is much better.
I then replaced the frame processing code with my code, and it doesn't improve performance at all ! Only one core hits the 100%, the other ones stay very low. I even think it's worse with multi-threading on.
I got the facial landmark code from the dlib sample code. I know it can probably be optimized, but I want to understand why am I not able to use my (old) computer's full power with multi-threading ?
I'll drop my code below, thanks a lot for reading :)
from __future__ import print_function
import numpy as np
import cv2
import dlib
from multiprocessing.pool import ThreadPool
from collections import deque
from common import clock, draw_str, StatValue
import video
class DummyTask:
def __init__(self, data):
self.data = data
def ready(self):
return True
def get(self):
return self.data
if __name__ == '__main__':
import sys
print(__doc__)
try:
fn = sys.argv[1]
except:
fn = 0
cap = video.create_capture(fn)
#Face detector
detector = dlib.get_frontal_face_detector()
#Landmarks shape predictor
predictor = dlib.shape_predictor("landmarks/shape_predictor_68_face_landmarks.dat")
# This is where the facial detection takes place
def process_frame(frame, t0, detector, predictor):
# some intensive computation...
gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8,8))
clahe_image = clahe.apply(gray)
detections = detector(clahe_image, 1)
for k,d in enumerate(detections):
shape = predictor(clahe_image, d)
for i in range(1,68): #There are 68 landmark points on each face
cv2.circle(frame, (shape.part(i).x, shape.part(i).y), 1, (0,0,255), thickness=2)
return frame, t0
threadn = cv2.getNumberOfCPUs()
pool = ThreadPool(processes = threadn)
pending = deque()
threaded_mode = True
latency = StatValue()
frame_interval = StatValue()
last_frame_time = clock()
while True:
while len(pending) > 0 and pending[0].ready():
res, t0 = pending.popleft().get()
latency.update(clock() - t0)
draw_str(res, (20, 20), "threaded : " + str(threaded_mode))
draw_str(res, (20, 40), "latency : %.1f ms" % (latency.value*1000))
draw_str(res, (20, 60), "frame interval : %.1f ms" % (frame_interval.value*1000))
cv2.imshow('threaded video', res)
if len(pending) < threadn:
ret, frame = cap.read()
t = clock()
frame_interval.update(t - last_frame_time)
last_frame_time = t
if threaded_mode:
task = pool.apply_async(process_frame, (frame.copy(), t, detector, predictor))
else:
task = DummyTask(process_frame(frame, t, detector, predictor))
pending.append(task)
ch = cv2.waitKey(1)
if ch == ord(' '):
threaded_mode = not threaded_mode
if ch == 27:
break
cv2.destroyAllWindows()
Performance issue was due to a bad compilation of dlib. Do not use pip install dlib which runs very very slowly for some reason compared to the proper compilation. I went from almost 10 seconds lag to about 2 seconds this way. So finally I didn't need multi-threading/processing, but I'm working on it to enhance the speed even more. Thanks for the help :)
i tried a simplified approach like P.Ro mentioned in his answer with processes writing to an output queue but somehow the queue got locked most of the time because all the processes wrote to it at the same time. (just my guess) i probably did something wrong.
in the end i ended up using pipes.
the code is nasty. but if i was me a few hours ago. i would still be glad to find an example that actually runs without effort.
from multiprocessing import Process, Queue, Manager,Pipe
import multiprocessing
import face_recognition as fik
import cv2
import time
video_input = 0
obama_image = fik.load_image_file("obama.png")
obama_face_encoding = fik.face_encodings(obama_image)[0]
quality = 0.7
def f(id,fi,fl):
import face_recognition as fok
while True:
small_frame = fi.get()
print("running thread"+str(id))
face_locations = fok.face_locations(small_frame)
if(len(face_locations)>0):
print(face_locations)
for (top7, right7, bottom7, left7) in face_locations:
small_frame_c = small_frame[top7:bottom7, left7:right7]
fl.send(small_frame_c)
fps_var =0
if __name__ == '__main__':
multiprocessing.set_start_method('spawn')
# global megaman
with Manager() as manager:
video_capture = cv2.VideoCapture(video_input)
fi = Queue(maxsize=14)
threads = 8
proc = []
parent_p = []
thread_p = []
# procids = range(0,threads)
for t in range(0,threads):
p_t,c_t = Pipe()
parent_p.append(p_t)
thread_p.append(c_t)
print(t)
proc.append(Process(target=f, args=(t,fi,thread_p[t])))
proc[t].start()
useframe = False
frame_id = 0
while True:
# Grab a single frame of video
ret, frame = video_capture.read()
effheight, effwidth = frame.shape[:2]
if effwidth < 20:
break
# Resize frame of video to 1/4 size for faster face recognition processing
xxx = 930
yyy = 10/16 #0.4234375
small_frame = cv2.resize(frame, (xxx, int(xxx*yyy)))
if frame_id%2 == 0:
if not fi.full():
fi.put(small_frame)
print(frame_id)
cv2.imshow('Video', small_frame)
print("FPS: ", int(1.0 / (time.time() - fps_var)))
fps_var = time.time()
#GET ALL DETECTIONS
for t in range(0,threads):
if parent_p[t].poll():
small_frame_c = parent_p[t].recv()
cv2.imshow('recc', small_frame_c)
height34, width34 = small_frame_c.shape[:2]
# print fsizeee
if(width34<20):
print("face 2 small")
print(width34)
break
face_encodings_cam = fik.face_encodings(small_frame_c,[(0, width34, height34, 0)])
match = fik.compare_faces([obama_face_encoding], face_encodings_cam[0])
name = "Unknown"
if match[0]:
name = "Barack"
print(name)
break
frame_id += 1
# Hit 'q' on the keyboard to quit!
if cv2.waitKey(1) & 0xFF == ord('q'):
break
Do not have much experience with using ThreadPool, but I always just use Process like shown below. You should be able to easily edit this code to fit your needs. I wrote this with your implementation in mind.
This code will get the number of cores and start however many worker processes that will all be implementing the desired function in parallel. They all share a Queue of frames for input and all put to the same output Queue for the main to get and show. Each Queue has a maximum size, in this case 5. This ensures that despite the CPU time it takes to process, it will always be relatively live time.
import numpy as np
import cv2
from multiprocessing import Process, Queue
import time
#from common import clock, draw_str, StatValue
#import video
class Canny_Process(Process):
def __init__(self,frame_queue,output_queue):
Process.__init__(self)
self.frame_queue = frame_queue
self.output_queue = output_queue
self.stop = False
#Initialize your face detectors here
def get_frame(self):
if not self.frame_queue.empty():
return True, self.frame_queue.get()
else:
return False, None
def stopProcess(self):
self.stop = True
def canny_frame(self,frame):
# some intensive computation...
gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
edges = cv2.Canny(gray, 50, 100)
#To simulate CPU Time
#############################
for i in range(1000000):
x = 546*546
res = x/(i+1)
#############################
'REPLACE WITH FACE DETECT CODE HERE'
if self.output_queue.full():
self.output_queue.get_nowait()
self.output_queue.put(edges)
def run(self):
while not self.stop:
ret, frame = self.get_frame()
if ret:
self.canny_frame(frame)
if __name__ == '__main__':
frame_sum = 0
init_time = time.time()
def put_frame(frame):
if Input_Queue.full():
Input_Queue.get_nowait()
Input_Queue.put(frame)
def cap_read(cv2_cap):
ret, frame = cv2_cap.read()
if ret:
put_frame(frame)
cap = cv2.VideoCapture(0)
threadn = cv2.getNumberOfCPUs()
threaded_mode = True
process_list = []
Input_Queue = Queue(maxsize = 5)
Output_Queue = Queue(maxsize = 5)
for x in range((threadn -1)):
canny_process = Canny_Process(frame_queue = Input_Queue,output_queue = Output_Queue)
canny_process.daemon = True
canny_process.start()
process_list.append(canny_process)
ch = cv2.waitKey(1)
cv2.namedWindow('Threaded Video', cv2.WINDOW_NORMAL)
while True:
cap_read(cap)
if not Output_Queue.empty():
result = Output_Queue.get()
cv2.imshow('Threaded Video', result)
ch = cv2.waitKey(5)
if ch == ord(' '):
threaded_mode = not threaded_mode
if ch == 27:
break
cv2.destroyAllWindows()
This should do the trick just change my canny function to do your face detection. I wrote this from your code and compared the two. This is significantly faster. I am using multiprocessing.Process here. In python processes are truly parallel and threads are not quite because of the GIL. I am using 2 queues to send data back and forth between the main and the processes. Queues are both Thread and Process safe.
you may use this, multithreaded:
from imutils.video import VideoStream
# Initialize multithreading the video stream.
videostream = "rtsp://192.168.x.y/user=admin=xxxxxxx_channel=vvvv=1.sdp?params"
vs = VideoStream(src=videostream, resolution=frameSize,
framerate=32).start()
frame = vs.read()
I am a beginner with multiprocessing in Python. I am developing a multiprocessing script for OpenCV, since my computer does not succeed in real-time processing of OpenCV frames.
I aim at loading and processing frames in the main process, and displaying them using a child process. My problem is that I do not understand how to build the display loop from the queued frames. Can someone please help?
My code:
#!/usr/bin/env python
from multiprocessing import Process, Queue
from Queue import Empty
from PIL import Image
import cv2
import cv2.cv as cv
import numpy as np
def image_display(taskqueue):
cv2.namedWindow ('image_display', cv2.CV_WINDOW_AUTOSIZE)
while True:
if taskqueue.get()==None:
continue
else:
image = taskqueue.get()
im = Image.fromstring(image['mode'], image['size'], image['pixels'])
num_im = np.asarray(im)
cv2.imshow ('image_display', num_im)
if __name__ == '__main__':
taskqueue = Queue()
vidFile = cv2.VideoCapture('doppler.wmv')
p = Process(target=image_display, args=(taskqueue,))
p.start()
while True:
flag, image=vidFile.read()
if flag == 0:
break
im = Image.fromarray(image)
im_dict = {
'pixels': im.tostring(),
'size': im.size,
'mode': im.mode,
}
taskqueue.put(im_dict)
p.join()
cv.DestroyAllWindows()
EDIT
Thanks to the answers, I was able to find the problem. Below is a modified script in which I slowed my loops on purpose and added an outqueue for debugging. It appears that although the frames captured with vidFile.read() are indeed passed as numpy arrays through the queue and are then passed unmodified as argument to cv2.imshow(),cv2.imshow() refuses to display the image for an unknown reason. Any help to fix that issue would be immensly appreciated!
modified code:
#!/usr/bin/env python
from multiprocessing import Process, Queue
from Queue import Empty
import cv2
import cv2.cv as cv
import numpy as np
import time
def image_display(taskqueue, outqueue):
cv2.namedWindow ('image_display', cv2.CV_WINDOW_AUTOSIZE)
while True:
try:
outqueue.put('trying')
time.sleep(1)
image = taskqueue.get()
outqueue.put(image)
cv2.imshow('image_display', image)
except:
continue
if __name__ == '__main__':
taskqueue = Queue()
outqueue = Queue()
vidFile = cv2.VideoCapture('doppler.wmv')
p = Process(target=image_display, args=(taskqueue, outqueue))
p.start()
while True:
print outqueue.get()
flag, image=vidFile.read()
if flag == 0:
break
taskqueue.put(image)
time.sleep(0.010)
p.join()
cv.DestroyAllWindows()
This should work (explanation of changes below):
#!/usr/bin/env python
from multiprocessing import Process, Queue
from Queue import Empty
from PIL import Image
import cv2
import cv2.cv as cv
import numpy as np
def image_display(taskqueue):
cv2.namedWindow ('image_display', cv2.CV_WINDOW_AUTOSIZE)
while True:
image = taskqueue.get() # Added
if image is None: break # Added
cv2.imshow ('image_display', image) # Added
cv2.waitKey(10) # Added
continue # Added
if taskqueue.get()==None:
continue
else:
image = taskqueue.get()
im = Image.fromstring(image['mode'], image['size'], image['pixels'])
num_im = np.asarray(im)
cv2.imshow ('image_display', num_im)
if __name__ == '__main__':
taskqueue = Queue()
vidFile = cv2.VideoCapture('doppler.wmv')
p = Process(target=image_display, args=(taskqueue,))
p.start()
while True:
flag, image=vidFile.read()
taskqueue.put(image) # Added
import time # Added
time.sleep(0.010) # Added
continue # Added
if flag == 0:
break
im = Image.fromarray(image)
im_dict = {
'pixels': im.tostring(),
'size': im.size,
'mode': im.mode,
}
taskqueue.put(im_dict)
taskqueue.put(None)
p.join()
cv.DestroyAllWindows()
I tried to make minimal changes to your code by just adding lines (lines containing comments # Added):
1) Just put the image itself (the original NumPy array) on the queue.
2) Pause a little bit in the master process before reading another frame. You need this so as not to overrun the queue, because imshow() in the spawned process may take a bit longer since it's calling X. You might need to increase this value (in seconds) depending on your system.
3) Spawned process has to do the waitKey() after every imshow().
4) Master process puts the special None image on the queue when it's done.