How to make this task use full CPU power? - python

I'm trying to hash many blocks of a file, but it doesn't use the full CPU power; it only consumes about 25%. I tried moving the heavy processing into threads, but it made no difference. I come from Node.js, where the sharp library performs the same kind of task and consumes all the CPU. How can I get Python to use full power?
import cv2
import math
import datetime
import hashlib
import threading

def thread_function(image, yPos, xPos, wSizeBlock, hSizeBlock):
    # hash one block of the image
    block = image[yPos:yPos + hSizeBlock, xPos:xPos + wSizeBlock]
    h = hashlib.sha256()
    h.update(block.tobytes())
    print(h.hexdigest())

image = cv2.imread('frame323.jpg', cv2.IMREAD_COLOR)
dimension = {
    'width': image.shape[1],
    'height': image.shape[0]
}
wSizeBlock = 16
hSizeBlock = 16

wBlockLength = math.floor(dimension['width'] / wSizeBlock)
hBlockLength = math.floor(dimension['height'] / hSizeBlock)

count = 0
start_time = datetime.datetime.now()
print(start_time)
for k in range(500):
    for i in range(wBlockLength):
        for j in range(hBlockLength):
            xPos = i * wSizeBlock
            yPos = j * hSizeBlock
            x = threading.Thread(target=thread_function, args=(image, yPos, xPos, wSizeBlock, hSizeBlock))
            x.start()
            count += 1
    count = 0
end_time = datetime.datetime.now()
print(end_time)

For CPU-intensive operations that can be split up into smaller tasks, you want the multiprocessing module. It is similar to the threading module in that it allows multiple functions to run at once, but each function runs in its own process, so it can use a separate CPU core. Threads in CPython are limited by the global interpreter lock (GIL), which lets only one thread execute Python bytecode at a time; processes don't share a GIL. The syntax looks something like this:
import multiprocessing as mp

def add(a, b):
    return a + b

p = mp.Process(target=add, args=(1, 2))
p.start()
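Applied to the hashing question above, a rough sketch with a process pool might look like this (the chunking and chunksize values are illustrative, not tuned; frame323.jpg is the input from the question):

import cv2
import hashlib
import multiprocessing as mp

def hash_block(block_bytes):
    # hash one block; raw bytes pickle cheaply between processes
    return hashlib.sha256(block_bytes).hexdigest()

if __name__ == '__main__':
    image = cv2.imread('frame323.jpg', cv2.IMREAD_COLOR)
    size = 16
    blocks = [
        image[y:y + size, x:x + size].tobytes()
        for y in range(0, image.shape[0] - size + 1, size)
        for x in range(0, image.shape[1] - size + 1, size)
    ]
    with mp.Pool() as pool:  # defaults to one worker per core
        digests = pool.map(hash_block, blocks, chunksize=256)
    print(len(digests), 'blocks hashed')

Note that 16x16 blocks are tiny, so per-task pickling overhead can swallow the speedup; a generous chunksize (or hashing a whole row of blocks per task) keeps the workers busy.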

Related

How can I parallelize the following snippet of code in python?

I have a bunch of matrix multiplication operations that are performed only row-wise. I was wondering how to speed up the computation by parallelizing it:
data = np.random.randint(1, 100, (100000, 800))
indices_1 = np.equal(data, 1)
A = np.zeros((100000, 100))
B = np.random.randn(800, 100)

for i in range(100000):
    ones = indices_1[i]
    not_ones = ~indices_1[i]
    B_ones = B[ones]
    B_not_ones = B[not_ones]
    A[i] = (data[i][not_ones] @ B_not_ones) @ np.linalg.inv(B_not_ones.T @ B_not_ones)
    data[i][ones] = A[i] @ B_ones.T
I tried multiprocessing, but for some reason it did not perform better than the sequential version. Here is my multiprocessing implementation:
from multiprocessing.pool import ThreadPool, Pool

pool = ThreadPool()  # can also use Pool

def f(i):
    ones = indices_1[i]
    not_ones = ~indices_1[i]
    B_ones = B[ones]
    B_not_ones = B[not_ones]
    A[i] = (data[i][not_ones] @ B_not_ones) @ np.linalg.inv(B_not_ones.T @ B_not_ones)
    data[i][ones] = A[i] @ B_ones.T

pool.map(f, range(100000))
Both yielded the same running time (around 32 seconds). Other parallelization methods like concurrent.futures did not improve the runtime either (used as below):
with concurrent.futures.ThreadPoolExecutor() as executor:
    result = executor.map(f, range(100000))
I also tried to apply dask but could not make their framework work in my case. Any help will be much appreciated! Thanks!
import numpy as np
import multiprocessing as mp

data = list(np.random.randint(1, 100, (100000, 800)))
indices_1 = np.equal(data, 1)
A = list(np.zeros((100000, 100)))
B = np.random.randn(800, 100)

def f(data, A, i):
    ones = indices_1[i]
    not_ones = ~indices_1[i]
    B_ones = B[ones]
    B_not_ones = B[not_ones]
    A[i] = (data[i][not_ones] @ B_not_ones) @ np.linalg.inv(B_not_ones.T @ B_not_ones)
    data[i][ones] = A[i] @ B_ones.T

with mp.Manager() as manager:
    data_global = manager.list(data)
    A_global = manager.list(A)
    with mp.Pool() as p:
        results = [p.apply_async(f, (data_global, A_global, i)) for i in range(100000)]
        for r in results:
            r.wait()
    data_global = list(data_global)
    A_global = list(A_global)
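For what it's worth, the usual fix for this pattern is to drop manager.list entirely (every element access on a manager proxy is an IPC round trip) and instead send each worker a chunk of row indices and collect the returned blocks. A rough sketch, not from the original thread (it assumes a fork start method so workers inherit the arrays, and it leaves out the in-place update of data, which child processes cannot apply to the parent's copy):

import numpy as np
from multiprocessing import Pool

data = np.random.randint(1, 100, (100000, 800))
indices_1 = np.equal(data, 1)
B = np.random.randn(800, 100)

def process_chunk(rows):
    # each worker handles a contiguous block of rows and returns its results
    out = np.zeros((len(rows), 100))
    for k, i in enumerate(rows):
        not_ones = ~indices_1[i]
        B_not_ones = B[not_ones]
        out[k] = (data[i][not_ones] @ B_not_ones) @ np.linalg.inv(B_not_ones.T @ B_not_ones)
    return out

if __name__ == '__main__':
    chunks = np.array_split(np.arange(100000), 16)
    with Pool() as pool:
        A = np.vstack(pool.map(process_chunk, chunks))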

Multiprocessing - memory consumption

I've read all the related posts on the subject, but I can't for the life of me get multiprocessing to work properly with shared memory.
I'm using an EC2 instance with 96 cores, but for some reason despite using shared memory, my memory consumption explodes when using a worker pool with 96 workers.
EDIT: I had a bug earlier which caused not all the cores to be used (a silly bug where I didn't pass the right parameters to map) - anyway, I have clarified my current problem.
Any ideas? Attaching a screenshot of htop on my server to show the CPU usage + memory consumption.
For reference, I used the figtree package from here: https://github.com/ec2604/figtree (commit - 7ba197e45a5c6577fab56d469b4b1ccf02242e3d). It's a forked repository that ports C-level code to Python. I don't think it should really matter; you can plop any CPU-intensive code in there instead.
EDIT: In hindsight, the figtree package allocates memory for the result, (50000 * 9995 * 8) / (1024**3) GB per process. Multiply that by 96 processes and this is what causes the insane memory consumption.
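For reference, that works out as follows (assuming the result is one n x m float64 array per worker, which is what the slice sizes below imply):

>>> per_process = 50000 * 9995 * 8   # n * m entries, 8 bytes per float64
>>> per_process / 1024**3            # roughly 3.7 GB per worker
3.7234...
>>> 96 * per_process / 1024**3       # roughly 357 GB across the pool
357.45...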
import figtree
import numpy as np
import multiprocessing
import ctypes
from multiprocessing import Pool, sharedctypes

n = 50000
m = 9995

X_base = sharedctypes.RawArray(ctypes.c_double, n * 77)
X_shared = np.frombuffer(X_base)  # RawArray has no get_obj(); frombuffer reads it directly
X_shared = X_shared.reshape(n, 77)
X_shared[:] = np.random.normal(0, 1, (n, 77))
del X_shared

Q_base = sharedctypes.RawArray(ctypes.c_double, m ** 2)
Q_shared = np.frombuffer(Q_base)
Q_shared = Q_shared.reshape(m, m)
Q_shared[:] = np.random.normal(0, 1, (m, m))
del Q_shared

def fig_helper_efficient(slice):
    Q_shared = np.frombuffer(Q_base)
    Q_shared = Q_shared.reshape(9995, 9995)
    X_shared = np.frombuffer(X_base)
    X_shared = X_shared.reshape(n, 77)
    if Q_shared.shape[0] == Q_shared.shape[1]:
        res = figtree.figtree(**{'X': X_shared[slice, :], 'Y': X_shared,
                                 'Q': Q_shared[:, slice].copy(), 'epsilon': 1e-12,
                                 'h': 15})
        print("done")
        return res

def divide_batches_equally(num_examples, num_batches):
    div_result = num_examples // num_batches
    mod_result = num_examples % num_batches
    size = np.zeros((num_batches + 1, 1)).astype(np.int32)
    size[1:] = div_result
    if mod_result > 0:
        size[1:mod_result + 1] += 1
    return np.cumsum(size)

def parallel_fig_vert_efficient():
    n_proc = 96
    size = divide_batches_equally(m, n_proc)
    parallel_list = [slice(int(size[i]), int(size[i + 1])) for i in range(n_proc)]
    with Pool(n_proc) as pool:
        res = pool.map(fig_helper_efficient, parallel_list)
    return res

if __name__ == '__main__':
    parallel_fig_vert_efficient()
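One way to avoid that per-process result allocation is to have each worker write its slice into a single shared output buffer instead of returning it. A rough sketch under the same setup (compute_slice is a hypothetical stand-in for the figtree call, and this assumes a fork start method so the children inherit the shared array):

import ctypes
import numpy as np
from multiprocessing import Pool, sharedctypes

n, m = 50000, 9995

# one shared n x m result buffer instead of a private ~3.7 GB copy per worker
OUT_base = sharedctypes.RawArray(ctypes.c_double, n * m)

def compute_slice(slc):
    # hypothetical stand-in for the real figtree call on one column slice
    return np.zeros((n, slc.stop - slc.start))

def worker(slc):
    out = np.frombuffer(OUT_base).reshape(n, m)
    out[:, slc] = compute_slice(slc)  # write in place, return nothing

if __name__ == '__main__':
    slices = [slice(i, min(i + 105, m)) for i in range(0, m, 105)]
    with Pool() as pool:
        pool.map(worker, slices)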

Thread pool is slow and same speed as serial

I'm trying to speed up an extensive real-time object detection pipeline and the computation done on its output.
I'm using OpenCV with a thread pool and a producer/consumer pattern for the video capture, but the execution speed is the same as the serial version.
How can I improve the execution speed?
if __name__ == "__main__":
    video_name = '2016-11-18_07-30-01.h264'
    cap = cv2.VideoCapture(video_name)
    det = detector.CarDetector()
    car_tracker = Sort_Algorithm.Sort()
    ped_tracker = Sort_Algorithm.Sort()
    df_region, df_line = load_filter()
    region = Region(df_region)
    distance = compute_max_polygon_diagonal(df_region) * 0.1
    region_buffered = region.buffer(distance)
    threadn = cv2.getNumberOfCPUs()
    pool = ThreadPool(processes=2)
    pending = deque()
    threaded_mode = True
    lock = threading.Lock()
    while True:
        while len(pending) > 0 and pending[0].ready():
            res = pending.popleft().get()
            cv2.imshow('video ', res)
        if len(pending) < threadn:
            ret, frame = cap.read()
            if threaded_mode:
                t1 = time.time()
                H = [-2.01134074616, -16.6502442427, -1314.05715739,
                     -3.35391526592, -22.3546973012, 2683.63584335,
                     -0.00130731963137, -0.0396207582264, 1]
                matrix = np.reshape(H, (3, 3))
                dst = cv2.warpPerspective(frame.copy(), matrix, (frame.shape[1], frame.shape[0]))
                task = pool.apply_async(pipeline, (lock, frame.copy(), car_tracker, ped_tracker,
                                                   df_region, region_buffered, df_line, det, dst, matrix))
                cv2.imshow('dst', dst)
            else:
                task = DummyTask(pipeline, (lock, frame.copy(), car_tracker, ped_tracker,
                                            df_region, region_buffered, df_line, det, dst, matrix))
            pending.append(task)
        ch = cv2.waitKey(1)
        if ch == ord(' '):
            threaded_mode = not threaded_mode
        if ch == 27:
            break
The code for pipeline:
def pipeline(lock, img, car_tracker, ped_tracker, df_region, region_buffered, df_line, det, dst, H):
    lock.acquire()  # held for the entire function body, so worker calls run one at a time
    global point_lists
    global df_car_lists
    global frame_idx
    global counter
    global data_peds
    global data_cars
    global genera_data_pd_cars
    global genera_data_pd_peds
    car_box, ped_box = det.get_localization(img)
    car_detections = car_tracker.update(np.array(car_box))
    ped_detections = ped_tracker.update(np.array(ped_box))
    saved_region = df_region.values
    saved_region = np.delete(saved_region, 2, 1)
    frame_idx += 1
    cv2.warpPerspective(np.array(df_line, dtype=np.float32), H, (df_line.shape[1], df_line.shape[0]))
    cv2.polylines(dst, np.int32([[saved_region]]), False, color=(255, 0, 0))
    cv2.polylines(dst, np.int32([np.array(df_line, dtype=np.float32)]), False, color=(255, 0, 0))
    for trk in car_detections:
        trk = trk.astype(np.int32)
        helpers.draw_box_label(img, trk, trk[4])  # draw the car bounding boxes on the image
    for other in ped_detections:
        other = other.astype(np.int32)
        helpers.draw_box_label(img, other, other[4])  # draw the pedestrian bounding boxes on the image
    for trk in car_detections:
        trk = trk.astype(np.int32)
        p = np.array([[((trk[1] + trk[3]) / 2, (trk[0] + trk[2]) / 2)]], dtype=np.float32)
        center_pt = cv2.perspectiveTransform(p, H)
        ptx = center_pt.T.item(0)
        pty = center_pt.T.item(1)
        df_cars = compute(trk[4], ptx, pty, frame_idx, df_region, region_buffered, df_line)
        genera_data_pd_cars = genera_data_pd_cars.append(df_cars)
        for other in ped_detections:
            other = other.astype(np.int32)
            p = np.array([[((other[1] + other[3]) / 2, (other[0] + other[2]) / 2)]], dtype=np.float32)
            center_pt = cv2.perspectiveTransform(p, H)
            ptx = center_pt.T.item(0)
            pty = center_pt.T.item(1)
            df_peds = compute(other[4], ptx, pty, frame_idx, df_region, region_buffered, df_line)
            genera_data_pd_peds = genera_data_pd_cars.append(df_peds)
            query = "is_in_region == True and is_in_region_now == True"
            df_peds = genera_data_pd_peds.query(query)
            query = " is_in_region == True"
            df_cars = genera_data_pd_cars.query(query)
            if len(df_cars) > 1 and len(df_peds) > 1:
                df_car_in_t_range_ped = select_slice(df_cars, df_peds)
                df_ped_in_t_range_car = select_slice(df_peds, df_cars)
                t_abs_crossing_car = df_cars['t_abs_at_crossing'].iloc[0]
                t_abs_crossing_ped = df_peds['t_abs_at_crossing'].iloc[0]
                dt_crossing = t_abs_crossing_car - t_abs_crossing_ped
                is_able_to_pass_before_ped = \
                    ((df_car_in_t_range_ped['t_abs_at_crossing_estimated'] -
                      t_abs_crossing_ped) > 0).any()
                behavior = Behavior(
                    # is_passed_before_ped
                    dt_crossing < 0,
                    # is_able_to_stop
                    df_car_in_t_range_ped['is_able_to_halt'].any(),
                    # is_too_fast
                    df_car_in_t_range_ped['is_too_fast'].any(),
                    # is_close_enough
                    df_car_in_t_range_ped['is_close_enough'].any(),
                    # is_able_to_pass_before_ped
                    is_able_to_pass_before_ped)
                interaction = Interaction(trk[4], other[4])
                interaction = interaction.assess_behavior(behavior)
                code, res, msg = interaction.code, interaction.res, interaction.msg
                print(msg)
    genera_data_pd_cars = genera_data_pd_cars.iloc[0:0]
    genera_data_pd_peds = genera_data_pd_peds.iloc[0:0]
    lock.release()
    return img
Multi-threading in Python for CPU-bound tasks is limited by the GIL, which effectively lets only one thread run at a time.
In fact, launching multiple threads for CPU-bound tasks can degrade performance further, because there is a lot of overhead for both the kernel and the Python interpreter in managing those threads.
The kernel wants to schedule the threads while Python wants to keep them from running simultaneously, and the resulting context switches degrade performance.
If you are only calling NumPy from the threads you may be fine, since NumPy releases the GIL during many of its long-running C-level operations, but I am not sure whether the same is true for OpenCV.
Threads in Python aren't meant for computational tasks.
This is a classic problem with threads in Python; consider using multiprocessing instead. There are a number of articles on this topic that you might want to check.
Threads aren't executed in parallel in CPython. Try using ProcessPoolExecutor instead.
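A minimal sketch of that suggestion (heavy is a placeholder for the real per-frame work; the actual detector and tracker objects would have to be picklable or constructed inside each worker):

from concurrent.futures import ProcessPoolExecutor

def heavy(n):
    # placeholder CPU-bound work standing in for the detection pipeline
    return sum(i * i for i in range(n))

if __name__ == '__main__':
    with ProcessPoolExecutor() as executor:  # defaults to one worker per core
        results = list(executor.map(heavy, [10**6] * 8))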

Implement multithreading in Python Zelle graphics

I am creating a program which opens a world map in a window using Zelle's graphics.py. It has one function which draws dots on the map, and another which undraws those dots after they have been on the screen for 1 second (the dots are stored in a list after being drawn). I want these functions to run concurrently, but when the addDots() function is called in a thread it won't draw the dot in the window; it just stalls. Here is the module I run:
import threading
import time
import random
import sys
sys.path.append('..')
from Display import map
import tester
import datetime

dots = []

def deleteDots():
    while True:
        tF = datetime.datetime.now()
        a = 0
        for i in range(len(dots)):
            tD = tF - dots[i - a][2]
            tD = int(str(tD)[5:7])
            if tD >= 1:
                map.deletePoint(dots[i - a][0], dots[i - a][1])
                dots.pop(i - a)
                a = a + 1

def addDots():
    oldResponseCount = tester.getResponseCount()
    oldResponseCount = int(str(oldResponseCount))
    while True:
        print(oldResponseCount)
        newResponseCount = tester.getResponseCount()
        newResponseCount = int(str(newResponseCount))
        print(newResponseCount)
        if newResponseCount != oldResponseCount:
            difference = newResponseCount - oldResponseCount
            for i in range(difference):
                lat = random.randint(-90, 90)
                long = random.randint(-180, 180)
                map.drawPoint(lat, long)
                tI = datetime.datetime.now()
                dots.append([lat, long, tI])
            oldResponseCount = newResponseCount

if __name__ == '__main__':
    threading.Thread(target=addDots).start()
    threading.Thread(target=deleteDots).start()
And here is the map module which draws the map on a graphics window and contains the functions to plot and delete a point:
from graphics import *
import math
import time
import images

size = 0.6
Circles = []
win = GraphWin("My Window", 1920 * size, 1080 * size)
win.setBackground('blue')
images.test(size)
myImage = Image(Point(960 * size, 540 * size), "../Display/temp.gif")
myImage.draw(win)

def drawPoint(lat, long):
    x = int(long * 5.3 + 960) * size
    y = int(lat * (-5.92) + 540) * size
    pt = Point(x, y)
    cir = Circle(pt, 5)
    cir.setFill(color_rgb(255, 0, 0))
    Circles.append([cir, x, y])
    cir.draw(win)

def deletePoint(lat, long):
    x = int(long * 5.3 + 960) * size
    y = int(lat * (-5.92) + 540) * size
    for c in Circles:
        if c[1] == x and c[2] == y:
            c[0].undraw()
How should I go about doing this?
There are a couple of issues that have to be addressed. First, any graphics.py commands that invoke tkinter (i.e. commands that cause something to be drawn/undrawn) must be issued by the primary (main) thread. So we need the secondary threads to communicate drawing requests to the primary thread.
Second, you have both of your secondary threads modifying the Circles and dots lists -- you need to synchronize (lock) access to these lists so that only one thread at a time can modify or iterate them.
Below is my rework of your code as an example. I've eliminated map and tester routines as I'm just putting dots up on a window with one thread and deleting them after they are a second old from another thread:
from threading import Thread, Lock
from queue import Queue  # used for thread-safe communication
from random import randint
import time
from graphics import *

def drawPoint(lat, long):
    x = int(long * 5.3 + 960)
    y = int(lat * -5.92 + 540)
    point = Point(x, y)
    circle = Circle(point, 5)
    circle.setFill(color_rgb(255, 0, 0))
    circles_lock.acquire()
    circles.append(circle)
    circles_lock.release()
    actions.put((circle.draw, win))

def deletePoint(lat, long):
    global circles
    x = int(long * 5.3 + 960)
    y = int(lat * -5.92 + 540)
    keep_circles = []
    circles_lock.acquire()
    for circle in circles:
        center = circle.getCenter()
        if center.getX() == x and center.getY() == y:
            actions.put((circle.undraw,))
        else:
            keep_circles.append(circle)
    circles = keep_circles
    circles_lock.release()

def deleteDots():
    global dots
    while True:
        keep_dots = []
        dots_lock.acquire()
        now = time.time()
        for dot in dots:
            lat, long, then = dot
            if now - then >= 1.0:
                deletePoint(lat, long)
            else:
                keep_dots.append(dot)
        dots = keep_dots
        dots_lock.release()
        time.sleep(0.5)

def addDots():
    while True:
        lat = randint(-90, 90)
        long = randint(-180, 180)
        drawPoint(lat, long)
        dots_lock.acquire()
        dots.append((lat, long, time.time()))
        dots_lock.release()
        time.sleep(0.25)

win = GraphWin("My Window", 1920, 1080)

circles = []
circles_lock = Lock()

dots = []
dots_lock = Lock()

actions = Queue()

Thread(target=addDots, daemon=True).start()
Thread(target=deleteDots, daemon=True).start()

while True:
    if not actions.empty():
        action, *arguments = actions.get()
        action(*arguments)
    time.sleep(0.125)

Python Speed Optimization

I am creating a program (to test a theory), and to get the data I need, the program has to run as fast as possible.
Here's the problem: I have made it as fast as I can manage, and it is still too slow. It uses only a small amount of my computer's RAM and CPU capacity. I am running the program with PyCharm 2017 Community Edition.
The code is below; how would I further optimize or change it to make it run faster?
Main:
from functions import *
from graphics import *
import time

Alpha = True
x = timestamp()
while Alpha:
    master = GraphWin(title="Image", width=512, height=512)
    build_image(master)
    getter(master, x)
    x = timestamp()
    time.sleep(3)
    master.close()
Module "Functions":
from graphics import *
import random
import time  # needed by timestamp()
from PIL import ImageGrab

def build_image(window):
    for i in range(513):
        for j in range(513):
            fig = Rectangle(Point(j, i), Point(j + 1, i + 1))
            color = random.randrange(256)
            fig.setFill(color_rgb(color, color, color))
            fig.setOutline(color_rgb(color, color, color))
            fig.draw(window)

def getter(widget, counter):
    x = widget.winfo_rootx() + widget.winfo_x()
    y = widget.winfo_rooty() + widget.winfo_y()
    x1 = x + widget.winfo_width()
    y1 = y + widget.winfo_height()
    ImageGrab.grab().crop((x, y, x1, y1)).save("{}.png".format(str(counter)))

def timestamp():
    timelist = time.gmtime()
    filename = ("Image" + "_" + str(timelist[0]) + "_" + str(timelist[1]) + "_" + str(timelist[2]) + "_" +
                str(timelist[3]) + "_" + str(timelist[4]) + "_" + str(timelist[5]) + "_UTC")
    return filename
Note: Module "Graphics" is a module that allows for easy manipulation of Tkinter.
Your slowness probably comes from treating each pixel as a Rectangle in your window.
If all you want to do is generate random images, you can skip the window part entirely. I found this code lying about, after not too much ducking:
from PIL import Image
import random

def drawImage():
    testImage = Image.new("RGB", (600, 600), (255, 255, 255))
    pixel = testImage.load()
    for x in range(600):
        for y in range(600):
            red = random.randrange(0, 255)
            blue = random.randrange(0, 255)
            green = random.randrange(0, 255)
            pixel[x, y] = (red, blue, green)
    return testImage

def main():
    finalImage = drawImage()
    finalImage.save("finalImage.jpg")

if __name__ == "__main__":
    main()
Use a profiler to see where your program is fast and slow. Here is a profiling wrapper you can use on your functions to see what is taking too long in your program.
def line_profiler(view=None, extra_view=None):
    import line_profiler

    def wrapper(view):
        def wrapped(*args, **kwargs):
            prof = line_profiler.LineProfiler()
            prof.add_function(view)
            if extra_view:
                [prof.add_function(v) for v in extra_view]
            with prof:
                resp = view(*args, **kwargs)
            prof.print_stats()
            return resp
        return wrapped

    if view:
        return wrapper(view)
    return wrapper
Now here is how to use it:
@line_profiler
def simple():
    print("Hello")
    print("World")
Now when you run your function, you will get a printout of how long everything takes.
You might need to do pip install line_profiler
This may be a bit faster if you use numpy; loops inside loops will kill your speed.
from PIL import Image
import numpy as np

def drawImage():
    return Image.fromarray(np.random.randint(255, size=(600, 600, 3)).astype(np.uint8))
Since you do a lot of independent tasks, you could benefit from parallelism. Something like:
from concurrent.futures import ThreadPoolExecutor

def build_image(window, start, end, step):
    for i in range(start, end, step):
        for j in range(end):
            fig = Rectangle(Point(j, i), Point(j + 1, i + 1))
            color = random.randrange(256)
            fig.setFill(color_rgb(color, color, color))
            fig.setOutline(color_rgb(color, color, color))
            fig.draw(window)

max_workers = 8
with ThreadPoolExecutor(max_workers=max_workers) as executor:
    for id in range(max_workers):
        executor.submit(build_image, window, id, 513, max_workers)
