Python thread created in a callback not finishing

I'm trying to save image buffers as .pngs with Blender's Python API (but this is mainly just a Python question).
I'm trying to speed things up by spawning a thread to save the image. The function that creates the thread is called from a callback handler that fires whenever the 3D viewport is refreshed. Here is the full code (it's a little messy):
```python
import base64, io, os, bgl, gpu, bpy, threading, time, sys
import numpy as np
from gpu_extras.presets import draw_texture_2d
from PIL import Image
import multiprocessing.pool as mpool

finalPath = bpy.context.scene.render.filepath + "hithere.png"
WIDTH = 1920
HEIGHT = 1080

offscreen = gpu.types.GPUOffScreen(WIDTH, HEIGHT)


def draw2():
    global finalPath
    global array
    global WIDTH
    global HEIGHT
    global needsSaving

    context = bpy.context
    scene = context.scene
    view_matrix = scene.camera.matrix_world.inverted()
    projection_matrix = scene.camera.calc_matrix_camera(
        context.depsgraph, x=WIDTH, y=HEIGHT)

    offscreen.draw_view3d(
        scene,
        context.view_layer,
        context.space_data,
        context.region,
        view_matrix,
        projection_matrix)

    bgl.glDisable(bgl.GL_DEPTH_TEST)
    draw_texture_2d(offscreen.color_texture, (0, -125), WIDTH, HEIGHT)

    buffer = bgl.Buffer(bgl.GL_BYTE, WIDTH * HEIGHT * 4)
    bgl.glReadBuffer(bgl.GL_BACK)
    bgl.glReadPixels(0, -125, WIDTH, HEIGHT, bgl.GL_RGBA, bgl.GL_UNSIGNED_BYTE, buffer)

    needle = threading.Thread(target=saveIt, args=[buffer, finalPath, WIDTH, HEIGHT])
    needle.daemon = True
    needle.start()
    #### thread.start_new_thread(saveIt,(buffer, finalPath, WIDTH, HEIGHT))


def coby(scene):
    frame = scene.frame_current
    folder = scene.render.filepath
    myFormat = "png"  # scene.render.image_settings.renderformat.lower()
    outputPath = os.path.join(folder, "%05d.%s" % (frame, myFormat))
    global finalPath
    finalPath = outputPath


h = bpy.types.SpaceView3D.draw_handler_add(draw2, (), 'WINDOW', 'POST_PIXEL')
bpy.app.handlers.frame_change_pre.clear()
bpy.app.handlers.frame_change_pre.append(coby)


def saveIt(buffer, path, width, height):
    array = np.asarray(buffer, dtype=np.uint8)
    myBytes = array.tobytes()
    im = Image.frombytes("RGBA", (width, height), myBytes)

    rawBytes = io.BytesIO()
    im.save(rawBytes, "PNG")
    rawBytes.seek(0)
    base64Encoded = base64.b64encode(rawBytes.read())
    txt = "data:image/png;base64," + base64Encoded.decode()

    f = open(finalPath, "wb")
    f.write(base64.decodebytes(base64Encoded))
    f.close()
```
It actually does work, except when I play the timeline in Blender (which triggers the frame_change_pre callback and also the 3D view-refresh callback, though I'm not sure in which order). Most of my images are replaced, but some are not, as can be seen in this screenshot:
[![screenshot][1]][1]
I originally had all blue-plane placeholder images; after running the script with the thread, almost all of them were replaced, except some of the blue-plane images still remain (at seemingly random intervals). It works fine if I call .join() right after creating the thread, or if I don't use a thread at all, but the thread seems to be the only way to make it run a lot faster.
I've been looking around for how to use threads with queues and pooling (How to use python multiprocessing Pool.map within loop, What happened to thread.start_new_thread in python 3, How can I make a background, non-blocking input loop in python?, Creating Threads in python).
So: why aren't all of the threads finishing?
[1]: https://i.stack.imgur.com/nJCwH.png
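For what it's worth, here is a minimal sketch of one pattern that guarantees the saves finish (my own sketch, not from the original post): daemon threads are killed whenever the interpreter shuts down, so per-frame daemon threads can die before their file writes complete. A single non-daemon worker that drains a queue finishes every queued save; the names saveQueue, saveWorker, and the sentinel None are choices of mine:

```python
# Sketch (not from the original post): one persistent, non-daemon worker
# thread drains a queue of save jobs, so no image write is lost at shutdown.
import queue, threading

saveQueue = queue.Queue()

def saveWorker():
    while True:
        job = saveQueue.get()
        if job is None:          # sentinel: stop the worker
            break
        buf, path, w, h = job
        saveIt(buf, path, w, h)  # the save function from the post
        saveQueue.task_done()

worker = threading.Thread(target=saveWorker)  # non-daemon on purpose
worker.start()

# In draw2(), instead of spawning a new daemon thread per frame:
#     saveQueue.put((buffer, finalPath, WIDTH, HEIGHT))
# When recording is finished:
#     saveQueue.join()     # wait for all pending saves
#     saveQueue.put(None)  # shut the worker down
```

The draw handler then only pays the cost of a Queue.put() per frame, while every queued image is still written before the worker stops.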

Related

Flask app working really slow with opencv

I have a Flask application which reads frames from a camera and streams them to the website.
Camera.py
```python
from threading import Thread
from copy import deepcopy
import queue
import cv2

class Camera(Thread):
    def __init__(self, cam, normalQue, detectedQue):
        Thread.__init__(self)
        self.__cam = cam
        self.__normalQue = normalQue
        self.__detectedQue = detectedQue
        self.__shouldStop = False

    def __del__(self):
        self.__cam.release()
        print('Camera released')

    def run(self):
        while True:
            rval, frame = self.__cam.read()
            if rval:
                frame = cv2.resize(frame, None, fx=0.5, fy=0.5, interpolation=cv2.INTER_AREA)
                _, jpeg = cv2.imencode('.jpg', frame)
                self.__normalQue.put(jpeg.tobytes())
                self.__detectedQue.put(deepcopy(jpeg.tobytes()))
            if self.__shouldStop:
                break

    def stopCamera(self):
        self.__shouldStop = True
```
As you can see, I am just reading the frame, resizing it, and storing it in two different queues. Nothing too complex.
I also have two classes responsible for the MJPEG streams:
NormalVideoStream.py
```python
from threading import Thread
import traceback
import cv2

class NormalVideoStream(Thread):
    def __init__(self, framesQue):
        Thread.__init__(self)
        self.__frames = framesQue
        self.__img = None

    def run(self):
        while True:
            if self.__frames.empty():
                continue
            self.__img = self.__frames.get()

    def gen(self):
        while True:
            try:
                if self.__img is None:
                    print('Normal stream frame is none')
                    continue
                yield (b'--frame\r\n'
                       b'Content-Type: image/jpeg\r\n\r\n' + self.__img + b'\r\n')
            except:
                traceback.print_exc()
                print('Normal video stream generation exception')
```
and
DetectionVideoStream.py
```python
from threading import Thread
import cv2
import traceback

class DetectionVideoStream(Thread):
    def __init__(self, framesQue):
        Thread.__init__(self)
        self.__frames = framesQue
        self.__img = None
        self.__faceCascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')

    def run(self):
        while True:
            if self.__frames.empty():
                continue
            self.__img = self.__detectFace()

    def gen(self):
        while True:
            try:
                if self.__img is None:
                    print('Detected stream frame is none')
                yield (b'--frame\r\n'
                       b'Content-Type: image/jpeg\r\n\r\n' + self.__img + b'\r\n')
            except:
                traceback.print_exc()
                print('Detection video stream generation exception')

    def __detectFace(self):
        retImg = None
        try:
            img = self.__frames.get()
            gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
            faces = self.__faceCascade.detectMultiScale(gray, 1.1, 4)
            for (x, y, w, h) in faces:
                cv2.rectangle(img, (x, y), (x + w, y + h), (255, 0, 0), 2)
            (_, encodedImage) = cv2.imencode('.jpg', img)
            retImg = encodedImage.tobytes()
        except:
            traceback.print_exc()
            print('Face detection exception')
        return retImg
```
As you can see, both streams read camera frames from the queues in an infinite loop. Both classes also have a gen() method which generates frames for the site itself. The only difference is that the detection stream also does face recognition.
Now in my main file:
main.py
```python
from flask import Blueprint, render_template, Response, abort, redirect, url_for
from flask_login import login_required, current_user
from queue import Queue
from . import db
from .Camera import Camera
from .NormalVideoStream import NormalVideoStream
from .DetectionVideoStream import DetectionVideoStream
from .models import User
import cv2

main = Blueprint('main', __name__)

# Queues for both streams
framesNormalQue = Queue(maxsize=0)
framesDetectionQue = Queue(maxsize=0)
print('Queues created')

# RPi camera instance
camera = Camera(cv2.VideoCapture(0), framesNormalQue, framesDetectionQue)
camera.start()
print('Camera thread started')

# Streams
normalStream = NormalVideoStream(framesNormalQue)
detectionStream = DetectionVideoStream(framesDetectionQue)
print('Streams created')
normalStream.start()
print('Normal stream thread started')
detectionStream.start()
print('Detection stream thread started')

@main.route('/')
def index():
    return render_template('index.html')

@main.route('/profile', methods=["POST", "GET"])
def profile():
    if not current_user.is_authenticated:
        abort(403)
    return render_template('profile.html', name=current_user.name, id=current_user.id, detectionState=current_user.detectionState)

@main.route('/video_stream/<int:stream_id>')
def video_stream(stream_id):
    if not current_user.is_authenticated:
        abort(403)
    print(f'Current user detection: {current_user.detectionState}')
    global detectionStream
    global normalStream
    stream = None
    if current_user.detectionState:
        stream = detectionStream
        print('Stream set to detection one')
    else:
        stream = normalStream
        print('Stream set to normal one')
    return Response(stream.gen(), mimetype='multipart/x-mixed-replace; boundary=frame')

@main.route('/detection')
def detection():
    if not current_user.is_authenticated:
        abort(403)
    if current_user.detectionState:
        current_user.detectionState = False
    else:
        current_user.detectionState = True
    user = User.query.filter_by(id=current_user.id)
    user.detectionState = current_user.detectionState
    db.session.commit()
    return redirect(url_for('main.profile', id=current_user.id, user_name=current_user.name))

@main.errorhandler(404)
def page_not_found(e):
    return render_template('404.html'), 404

@main.errorhandler(403)
def page_forbidden(e):
    return render_template('403.html'), 403
```
I am creating the camera, queue, and stream objects globally. When a user logs in on the website, they can see the live video stream. There is also a button which changes the stream that is currently presented.
The whole project works well with one exception: when I change the stream to the detection one, it has a huge lag (around 10-15 seconds), which makes the whole thing unusable. I've tried to find a bug or an optimization on my own but can't find anything. I am deliberately running everything on separate threads to keep the app from being overloaded, but it looks like this is not enough. A lag on the level of 1-2 seconds would be acceptable, but not 10+. So guys, maybe you can see some bug here? Or know how to optimize it?
I also need to mention that the whole app is running on an RPi 4B 4GB and I am accessing the website from my desktop. The default server has been changed to Nginx and Gunicorn. From what I can see, the Pi's CPU usage is 100% when the app is working. When testing on the default server, the behaviour is the same. I'd guess that a 1.5 GHz CPU has enough power to run it more smoothly.
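One detail worth flagging (a side note of mine, not from the question or the answers below): both stream threads poll their queues with a bare continue, which busy-waits and can pin a core all by itself. A blocking get() sleeps until a frame arrives; a minimal sketch of the change to run():

```python
# Sketch: replace the polling loop in run() with a blocking Queue.get(),
# which sleeps instead of spinning while the queue is empty.
def run(self):
    while True:
        self.__img = self.__frames.get()  # blocks until a frame is available
```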
One option is using VideoStream.
The reason VideoCapture is so slow is that the VideoCapture pipeline spends most of its time reading and decoding the next frame. While the next frame is being read, decoded, and returned, the OpenCV application is completely blocked.
VideoStream solves the problem by using a queue structure to concurrently read, decode, and return the current frame.
VideoStream supports both PiCamera and webcam.
All you need to do is:
Install imutils:
For virtual environment: pip install imutils
For anaconda environment: conda install -c conda-forge imutils
Initialize VideoStream in main.py:

```python
import time
from imutils.video import VideoStream

vs = VideoStream(usePiCamera=True).start()    # For PiCamera
# vs = VideoStream(usePiCamera=False).start() # For Webcam
camera = Camera(vs, framesNormalQue, framesDetectionQue)
```
In your Camera.py
In the run(self) method:

```python
def run(self):
    while True:
        frame = self.__cam.read()  # VideoStream.read() returns the frame directly, no rval
        frame = cv2.resize(frame, None, fx=0.5, fy=0.5, interpolation=cv2.INTER_AREA)
        _, jpeg = cv2.imencode('.jpg', frame)
        self.__normalQue.put(jpeg.tobytes())
        self.__detectedQue.put(deepcopy(jpeg.tobytes()))
        if self.__shouldStop:
            break
```
One of the issues I also had was regarding encoding and decoding: OpenCV's encoder is slow, so try the encoder from simplejpeg instead. Install it with pip3 install simplejpeg, and use simplejpeg.encode_jpeg() where you would use cv2.imencode().
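A rough sketch of that swap (my wording; this assumes simplejpeg's encode_jpeg() accepts a BGR uint8 frame via its colorspace parameter, and encode_frame is a hypothetical helper name):

```python
import simplejpeg

def encode_frame(frame):
    # Hypothetical drop-in replacement for cv2.imencode('.jpg', frame)[1].tobytes();
    # expects a contiguous uint8 HxWx3 BGR array, returns JPEG bytes.
    return simplejpeg.encode_jpeg(frame, quality=85, colorspace='BGR')
```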
I'm not really surprised by your problem; in general, "detection" uses a lot of your computation time, because performing a cascaded classification algorithm is a computationally demanding task.
I found a source which compares cascaded classification algos for their performance: link
An easy solution would be to reduce the frame rate when processing your detection.
An easy implementation to reduce performance demand could be something like a skip counter e.g.
frameSkipFactor = 3 # use every third frame
frameCounter = 0
if (frameCounter%frameSkipFactor==0):
#process
else:
print("ignore frame", frameCounter)
frameCounter+=1
Nevertheless you will have a lag, because the detection calculation will always produce a time offset.
If you are planning to build a "real time" classification camera system, please look for another class of classification algos, which are designed for this use case. I followed a discussion here: real time class algos
Another solution could be using a bigger "hardware hammer" than the RPi, e.g. a GPU implementation of the algo via CUDA etc.
What I have found is that the main reason for getting slow frames is high resolution and frame rate.
To tackle this problem you can change the resolution to something like 640 width by 480 height, with 30 fps or less (down to 5 fps if you only need face detection), and resize with OpenCV's cv2.resize() function using a 0.25 scaling factor for fx and fy. Do this if you don't need higher-resolution streams; it works smoothly with OpenCV. I also wanted to try out that VideoStream code (from imutils), as it is used by Adrian Rosebrock of PyImageSearch; I will use it in later projects.
For reference, I am posting a code snippet in the following. Special thanks to Adrian Rosebrock and ageitgey's face_recognition, as their code helped me make it.
```python
class Camera(object):
    SHRINK_RATIO = 0.25
    FPS = 5
    FRAME_RATE = 5
    WIDTH = 640
    HEIGHT = 480

    def __init__(self):
        """ initializing camera with settings """
        self.cap = cv2.VideoCapture(0)
        self.cap.set(3, Camera.WIDTH)
        self.cap.set(4, Camera.HEIGHT)
        self.cap.set(5, Camera.FPS)
        self.cap.set(7, Camera.FRAME_RATE)

    def get_frame(self):
        """ get frames from the camera """
        success, frame = self.cap.read()
        if success == True:
            # Resizing to 0.25 scale
            rescale_frame = cv2.resize(frame, None, fx=Camera.SHRINK_RATIO, fy=Camera.SHRINK_RATIO)
            cascPath = "haarcascade_frontalface_default.xml"
            faceCascade = cv2.CascadeClassifier(cascPath)
            gray = cv2.cvtColor(rescale_frame, cv2.COLOR_BGR2GRAY)
            # detecting faces
            faces = faceCascade.detectMultiScale(
                gray,
                scaleFactor=1.1,
                minNeighbors=5,
                minSize=(30, 30)
            )
            # Draw a rectangle around the faces
            if len(faces) != 0:
                for (x, y, w, h) in faces:
                    x *= 4
                    y *= 4
                    w *= 4
                    h *= 4
                    # Draw rectangle across faces in current frame
                    cv2.rectangle(frame, (x, y), (x + w, y + h), (0, 255, 0), 2)
        # return frame outside if-statement
        return frame
```
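A side note on the cap.set() calls above (my addition): the numeric ids map to named OpenCV constants, which read more clearly; id 7 is cv2.CAP_PROP_FRAME_COUNT, which isn't meaningful to set on a live camera:

```python
import cv2

# Equivalent to cap.set(3, ...), cap.set(4, ...), cap.set(5, ...) above,
# using the named constants instead of the raw property ids.
cap = cv2.VideoCapture(0)
cap.set(cv2.CAP_PROP_FRAME_WIDTH, 640)   # property id 3
cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 480)  # property id 4
cap.set(cv2.CAP_PROP_FPS, 5)             # property id 5
```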
Also keep in mind that JPEG is the accelerated codec, use:
cv2.imencode(".jpg",frame)[1].tobytes()

How to use all available computing power in a function

I need a function that resizes a given set of photos, using all available computing power. I found that I can use the parallel function for this, but I don't know how to use it properly.
Here is my code:
```python
from random import randint
from fastai.core import parallel
import cv2

def image_conversion(list, size, directory, conversion):
    if conversion == '--resize':
        for img in list:
            imageAsNumpy = cv2.imread(img)
            dim = (size[1], size[0])
            resized = cv2.resize(imageAsNumpy, dim, interpolation=cv2.INTER_AREA)
            writeStatus = cv2.imwrite(directory + img, resized)
            if writeStatus is True:
                print('Image written successfully!')
            else:
                print('Something went wrong!')
    elif conversion == '--random crop':
        for img in list:
            image = cv2.imread(img)
            height, width = image.shape[:2]
            h = randint(0, height)
            w = randint(0, width)
            cropped = image[h:h+width, w:w+height]
            status = cv2.imwrite(directory + img, cropped)
            if status is True:
                print('Image cropped')
            else:
                print('Problem')
```
But when I try to call the parallel function, it fails every time; for example, the progress bar goes to 100%, but none of the work is completed. Could someone please help me understand what I am doing wrong?
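For what it's worth, a minimal sketch of one way this could be parallelized with the standard library instead of fastai's parallel (the per-image worker resize_one, the example file names, and the output directory are mine, hypothetical):

```python
# Sketch: fan the per-image work out over all CPU cores with a process pool.
from multiprocessing import Pool
import cv2

def resize_one(args):
    # Hypothetical per-image worker: resize a single file and write it out.
    img, size, directory = args
    imageAsNumpy = cv2.imread(img)
    resized = cv2.resize(imageAsNumpy, (size[1], size[0]), interpolation=cv2.INTER_AREA)
    return cv2.imwrite(directory + img, resized)

if __name__ == '__main__':
    images = ['a.jpg', 'b.jpg']  # example inputs
    jobs = [(img, (480, 640), 'out/') for img in images]
    with Pool() as pool:         # one process per CPU core by default
        results = pool.map(resize_one, jobs)
    print(results)
```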

why BitBlt() is slower in python compared to C++?

I am trying to use the BitBlt() function to copy bitmap data.
When I use the Python win32ui/win32gui API, the average time comes to around 30 ms, while the same API called from C++ averages around 10-15 ms.
So I am wondering: what could be the reason for this behavior?
Thanks
EDIT: Here is the snippet:
Python:
```python
hdesktop = win32gui.GetDesktopWindow()

# create a device context
desktop_dc = win32gui.GetWindowDC(hdesktop)
img_dc = win32ui.CreateDCFromHandle(desktop_dc)

# create a memory based device context
mem_dc = img_dc.CreateCompatibleDC()

# create a bitmap object
screenshot = win32ui.CreateBitmap()
screenshot.CreateCompatibleBitmap(img_dc, width, height)
oldbmp = mem_dc.SelectObject(screenshot)

# copy the screen into our memory device context
mem_dc.BitBlt((destUpLeftX, destUpLeftY), (width, height), img_dc, (srcUpLeftX, srcUpLeftY), win32con.SRCCOPY)

mem_dc.SelectObject(oldbmp)
win32gui.DeleteObject(screenshot.GetHandle())
img_dc.DeleteDC()
win32gui.ReleaseDC(hdesktop, desktop_dc)
mem_dc.DeleteDC()
```
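One measurement worth separating out (a sketch of my own, reusing the variables from the snippet above): time the BitBlt call by itself, since creating the DCs and the bitmap on every capture can dominate the per-call cost in Python:

```python
import time

# Time only the blit, not the DC/bitmap setup, to see where the 30 ms goes.
t0 = time.perf_counter()
mem_dc.BitBlt((destUpLeftX, destUpLeftY), (width, height), img_dc,
              (srcUpLeftX, srcUpLeftY), win32con.SRCCOPY)
t1 = time.perf_counter()
print(f"BitBlt alone: {(t1 - t0) * 1000:.1f} ms")  # compare with the C++ number
```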
C++:

```cpp
HDC hwindowDC = GetDC(GetDesktopWindow());
HDC hwindowCompatibleDC = CreateCompatibleDC(hwindowDC);

// create a bitmap
HBITMAP hbwindow = CreateCompatibleBitmap(hwindowDC, width, height);
LOG(LOG_DBG, "before SelectObject");

// SAVE OLD BITMAP
HGDIOBJ hOldBmp = SelectObject(hwindowCompatibleDC, hbwindow); // select hbwindow into hwindowCompatibleDC
BitBlt(hwindowCompatibleDC, destUpLeftX, destUpLeftY, width, height, hwindowDC, srcUpLeftX, srcUpLeftY, SRCCOPY);

SelectObject(hwindowCompatibleDC, hOldBmp);
DeleteDC(hwindowCompatibleDC);
DeleteObject(hbwindow);

// RELEASE WINDOW DC
ReleaseDC(GetDesktopWindow(), hwindowDC);
```

How to take snapshot from given region of the screen with python?

Here are my thoughts:
1) Take a snapshot of a given region of the screen, like (100, 100, 80, 60), and save the result as an image.
2) Process the image with the OpenCV Python interface.
I'm new to Python and wonder if this is a good solution; specifically, I wonder how to take the snapshot with Python.
Thanks,
It's fairly simple using CGRectMake from Apple's CoreGraphics API:
CG.CGRectMake(x, y, w, h)
This lets you define the horizontal/vertical position and the width/height, respectively.
Code:
```python
#!/usr/bin/python

import Quartz
import LaunchServices
from Cocoa import NSURL
import Quartz.CoreGraphics as CG

def screenshot(path, dpi, region=None):
    if region is None:
        region = CG.CGRectInfinite

    image = CG.CGWindowListCreateImage(
        region,
        CG.kCGWindowListOptionOnScreenOnly,
        CG.kCGNullWindowID,
        CG.kCGWindowImageDefault)

    url = NSURL.fileURLWithPath_(path)
    dest = Quartz.CGImageDestinationCreateWithURL(
        url,
        LaunchServices.kUTTypePNG, 1, None
    )

    prop = {
        Quartz.kCGImagePropertyDPIWidth: dpi,
        Quartz.kCGImagePropertyDPIHeight: dpi,
    }

    Quartz.CGImageDestinationAddImage(dest, image, prop)
    Quartz.CGImageDestinationFinalize(dest)

spec = (0, 0, 800, 600)                  # x, y, w, h
path = '/path/to/screnshot_region.png'   # save path
area = CG.CGRectMake(*spec)              # set area to cgrect
screenshot(path, dpi=72, region=area)    # call function
```
To use, just call the function:
screenshot(path, dpi=72, region=area)
dpi sets the image output resolution; leave out the region argument for fullscreen capture. In regards to the OpenCV portion, I don't use it often enough to provide anything at this time.
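For the OpenCV step the question asks about, a minimal follow-up sketch (my addition, not from the answer): load the saved PNG back as a numpy array and process it:

```python
import cv2

img = cv2.imread(path)                        # path from the snippet above
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)  # example processing step
print(img.shape)
```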

CreateCompatibleDC fails after calling it exactly 4,984 times

I've encountered a strange bug in my program. It's a little odd, as it occurs on exactly the 4984th call to the function. I've been tweaking this all day, and without fail, that's the number at which it fails.
The code in question is a small convenience function which creates and returns a DC and Bitmap. The context of this little function is that it's a piece in my stab at a screen recorder, so it's getting called tons and tons of times.
When I first noticed the error, after some sleuthing around, I found a very similar Stack Overflow question, so the code below is modeled after the answer in that thread. However, even after following the suggested deletion and releasing pattern, the problem remains, right on that 4984th iteration.
This is the specific failure point of the program:
```python
def _createDcAndBitmap(self, size, input_bitmap=None):
    hwnd = win32gui.GetDesktopWindow()
    zhwndDevice = win32gui.GetWindowDC(hwnd)
    zmfcDC = win32ui.CreateDCFromHandle(zhwndDevice)
    zsaveDC = zmfcDC.CreateCompatibleDC()
    zsaveBitMap = win32ui.CreateBitmap()
    zsaveBitMap.CreateCompatibleBitmap(zmfcDC, *size)
    hOldBmp = zsaveDC.SelectObject(zsaveBitMap)
    return zsaveDC, zsaveBitMap, hOldBmp, hwnd
```
The error is always thrown from the line:
zsaveBitMap.CreateCompatibleBitmap(zmfcDC, *size)
With the error reported by Python as:
error: CreateCompatibleDC failed
Calling FormatMessage from the win32api gives further information:
Invalid device context (DC) handle.
The Full Code:
```python
class Bitmap(object):
    def __init__(self, bytestring, sourceSize):  # assumed header; the original post omits it
        _sourceDC, _sourceBitmap, hOldBmp, hwnd = self._bytesToDcAndBitmap(bytestring, sourceSize)
        _bytes, _size = self._scaleBitmap(_sourceDC, _sourceBitmap, hOldBmp, hwnd, sourceSize)

    def _scaleBitmap(self, sourceDC, sourceBitmap, sourceHOldBmp, sourceHwnd, sourceSize):
        '''
        Resizes the current bitmap down to a target size
        of (X, 540), where the X is varied depending on the
        aspect ratio of the input bitmap
        '''
        target_size = self._getTargetSize(sourceSize)
        destDC, destBitmap, hOldBmp, hwnd = self._createDcAndBitmap(target_size)
        win32gui.SetStretchBltMode(destDC.GetHandleAttrib(), 4)
        win32gui.StretchBlt(pywintypes.HANDLE(destDC.GetHandleAttrib()), 0, 0, target_size[0], target_size[1],  # #UndefinedVariable HANDLE -- PyDev is dumb
                            sourceDC.GetHandleAttrib(), 0, 0, sourceSize[0], sourceSize[1], win32con.SRCCOPY)
        new_bytestring = destBitmap.GetBitmapBits(True)
        new_size = self._bitmapSize(destBitmap)
        self._deleteDCBitmapOldBmpAndHwmn(sourceDC, sourceBitmap, sourceHOldBmp, sourceHwnd)
        self._deleteDCBitmapOldBmpAndHwmn(destDC, destBitmap, hOldBmp, hwnd)
        return new_bytestring, new_size  # assumed return; the caller unpacks two values

    def _bytesToDcAndBitmap(self, bytestring, sourceSize):
        a = (ctypes.c_int * (sourceSize[0] * sourceSize[1]))()
        ctypes.memmove(a, bytestring, len(bytestring))
        hwnd = win32gui.GetDesktopWindow()
        zhwndDevice = win32gui.GetWindowDC(hwnd)
        zmfcDC = win32ui.CreateDCFromHandle(zhwndDevice)
        zsaveDC = zmfcDC.CreateCompatibleDC()
        zsaveBitMap = win32ui.CreateBitmap()
        zsaveBitMap.CreateCompatibleBitmap(zmfcDC, sourceSize[0], sourceSize[1])
        hOldBmp = zsaveDC.SelectObject(zsaveBitMap)
        ctypes.windll.gdi32.SetBitmapBits(zsaveBitMap.GetHandle(), len(bytestring), ctypes.byref(a))
        return zsaveDC, zsaveBitMap, hOldBmp, hwnd

    def _createDcAndBitmap(self, size, input_bitmap=None):
        hwnd = win32gui.GetDesktopWindow()
        zhwndDevice = win32gui.GetWindowDC(hwnd)
        zmfcDC = win32ui.CreateDCFromHandle(zhwndDevice)
        zsaveDC = zmfcDC.CreateCompatibleDC()
        zsaveBitMap = win32ui.CreateBitmap()
        zsaveBitMap.CreateCompatibleBitmap(zmfcDC, *size)
        hOldBmp = zsaveDC.SelectObject(zsaveBitMap)
        return zsaveDC, zsaveBitMap, hOldBmp, hwnd

    def _deleteDCBitmapOldBmpAndHwmn(self, dc, bitmap, old_bitmap, hwnd):
        win32gui.SelectObject(dc.GetHandleAttrib(), old_bitmap.GetHandle())
        win32gui.DeleteDC(dc.GetHandleAttrib())
        win32gui.DeleteObject(bitmap.GetHandle())
        win32gui.ReleaseDC(win32gui.GetDesktopWindow(), hwnd)
```
The code is a little peculiar, as it runs on the 'exit' end of a pipe, so its job is reconstructing a serialized byte string (gotten from GetBitmapBits()) back into a bitmap, scaling it, then going back to a byte string. Doing it this way is about a solid order of magnitude faster than using higher-level Python libraries :)
So, I'm guessing this is due to a memory leak somewhere, but as far as I can tell, I'm closing everything down correctly. And yet, it still fails right around the 5000th call.
Am I missing a leak somewhere?
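One way to confirm or rule out a handle leak (a diagnostic sketch of my own, not from the question): Windows caps a process at 10,000 GDI objects by default, which is suspiciously close to 4984 calls times two objects per call; GetGuiResources() reports the live count:

```python
# Diagnostic sketch (not from the question): watch the process's live GDI
# object count around the failing call. A count that climbs by a fixed
# amount per iteration points at a handle that is never freed.
import ctypes

GR_GDIOBJECTS = 0  # flag for GetGuiResources: count GDI handles

def gdi_count():
    hProcess = ctypes.windll.kernel32.GetCurrentProcess()
    return ctypes.windll.user32.GetGuiResources(hProcess, GR_GDIOBJECTS)

before = gdi_count()
# ... one _createDcAndBitmap / _deleteDCBitmapOldBmpAndHwmn round trip here ...
print("GDI objects gained this iteration:", gdi_count() - before)
```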
