I've read all the related posts on the subject, but I can't for the life of me get multiprocessing to work properly with shared memory.
I'm using an EC2 instance with 96 cores, but for some reason despite using shared memory, my memory consumption explodes when using a worker pool with 96 workers.
EDIT: Had a bug earlier, which caused not all the cores to be used (stupid bug where I didn't give the right parameters for map) - anyways, clarified my current problem.
Any ideas? Attaching a screenshot of htop on my server to show the CPU usage + memory consumption.
For reference, I used the figtree package from here: https://github.com/ec2604/figtree (commit - 7ba197e45a5c6577fab56d469b4b1ccf02242e3d), it's a forked repository that ports C level code to python. Don't think it should really matter, you can plop any CPU intensive code in there instead.
!!!!!!EDIT!!!!: In hindsight, the figtree package allocates memory for the result: (50000 * 9995 * 8) / (1024**3) ≈ 3.7 GB per process. Multiplied by 96 processes, this is what causes the insane memory consumption.
import figtree
import numpy as np
import multiprocessing
import ctypes
from multiprocessing import Pool, sharedctypes
# Problem size: X is an (n, 77) matrix and Q is an (m, m) matrix of doubles.
n = 50000
m = 9995

# RawArray returns a plain ctypes array backed by shared memory. Unlike the
# lock-wrapped Array it has no .get_obj() method, so it is handed to
# np.frombuffer() directly (this matches how the worker function wraps it).
X_base = sharedctypes.RawArray(ctypes.c_double, n * 77)
X_shared = np.frombuffer(X_base).reshape(n, 77)
X_shared[:] = np.random.normal(0, 1, (n, 77))
del X_shared  # only the NumPy view is dropped; X_base keeps the data

Q_base = sharedctypes.RawArray(ctypes.c_double, m ** 2)
Q_shared = np.frombuffer(Q_base).reshape(m, m)
Q_shared[:] = np.random.normal(0, 1, (m, m))
del Q_shared  # only the NumPy view is dropped; Q_base keeps the data
def fig_helper_efficient(slice):
    """Run figtree for one batch selected by ``slice``.

    The module-level shared buffers ``X_base`` and ``Q_base`` are wrapped as
    NumPy arrays of shape ``(n, 77)`` and ``(m, m)``. Rows ``slice`` of X and
    a copy of columns ``slice`` of Q are passed to ``figtree.figtree`` together
    with the full X.

    Args:
        slice: Index object (the caller passes builtin ``slice`` instances)
            selecting this batch. The parameter name shadows the builtin and
            is kept only for interface compatibility.

    Returns:
        Whatever ``figtree.figtree`` returns for this batch.
    """
    # Wrap the shared buffers first; the original printed id(Q_shared) before
    # the local name was assigned, which raises UnboundLocalError.
    Q_shared = np.frombuffer(Q_base).reshape(m, m)
    X_shared = np.frombuffer(X_base).reshape(n, 77)
    print(id(Q_shared))
    res = figtree.figtree(
        X=X_shared[slice, :],
        Y=X_shared,
        Q=Q_shared[:, slice].copy(),
        epsilon=1e-12,
        h=15,
    )
    print("done")
    return res
def divide_batches_equally(num_examples, num_batches):
    """Return the cumulative batch boundaries for an even split.

    The result has ``num_batches + 1`` entries, starts at 0 and ends at
    ``num_examples``. When the split is uneven, the first
    ``num_examples % num_batches`` batches are one element larger.
    """
    base, extra = divmod(num_examples, num_batches)
    counts = np.full(num_batches + 1, base, dtype=np.int32)
    counts[0] = 0  # leading zero so the cumulative sum starts at offset 0
    counts[1:extra + 1] += 1  # empty slice (no-op) when extra == 0
    return np.cumsum(counts)
def parallel_fig_vert_efficient(n_proc=96):
    """Split the ``m`` indices into contiguous batches and process them in a Pool.

    Args:
        n_proc: Number of worker processes, which is also the number of
            batches. Defaults to 96, the previously hard-coded value. Lowering
            it reduces how many workers run (and allocate results) at once.

    Returns:
        List with one ``fig_helper_efficient`` result per batch, in batch order.
    """
    bounds = divide_batches_equally(m, n_proc)
    parallel_list = [
        slice(int(start), int(stop))
        for start, stop in zip(bounds[:-1], bounds[1:])
    ]
    with Pool(n_proc) as pool:
        res = pool.map(fig_helper_efficient, parallel_list)
    return res


if __name__ == '__main__':
    parallel_fig_vert_efficient()
Related
I am trying to hash many files, but Python does not use the full CPU power; it only consumes about 25%. I tried moving the heavy processing into threads, but it made no difference. I come from Node.js, where the same task using the sharp library consumes all available CPU. How can I make Python use the full CPU power?
import cv2
import math
import datetime
import hashlib
import threading
def thread_function(image, yPos, xPos, wSizeBlock, hSizeBlock):
    """Print the SHA-256 hex digest of one rectangular block of ``image``.

    Args:
        image: Array indexed as ``image[row, column, ...]`` that supports
            slicing and ``tobytes()``.
        yPos: First row of the block.
        xPos: First column of the block.
        wSizeBlock: Block width (number of columns).
        hSizeBlock: Block height (number of rows).
    """
    # Rows span the block height and columns span the block width; the
    # original had the two sizes swapped, which only worked for square blocks.
    block = image[yPos:yPos + hSizeBlock, xPos:xPos + wSizeBlock]
    digest = hashlib.sha256(block.tobytes())
    print(digest.hexdigest())
# Hash every 16x16 block of one frame 500 times, one thread per block, and
# print the wall-clock start and end times.
image = cv2.imread('frame323.jpg', cv2.IMREAD_COLOR)
if image is None:
    # cv2.imread signals failure by returning None instead of raising.
    raise FileNotFoundError('frame323.jpg')
dimension = {
    'width': image.shape[1],
    'height': image.shape[0],
}
wSizeBlock = 16
hSizeBlock = 16
wBlockLength = dimension['width'] // wSizeBlock
hBlockLength = dimension['height'] // hSizeBlock
count = 0
start_time = datetime.datetime.now()
print(start_time)
for k in range(0, 500):
    workers = []
    for i in range(0, wBlockLength):
        for j in range(0, hBlockLength):
            xPos = i * wSizeBlock
            yPos = j * hSizeBlock
            # thread_function expects (image, yPos, xPos, ...); the original
            # call passed xPos and yPos in the opposite order.
            x = threading.Thread(
                target=thread_function,
                args=(image, yPos, xPos, wSizeBlock, hSizeBlock),
            )
            x.start()
            workers.append(x)
            count += 1
    # Wait for this pass to finish so end_time measures completed work.
    for worker in workers:
        worker.join()
    count = 0
end_time = datetime.datetime.now()
print(end_time)
For CPU-intensive operations that can be split up into smaller tasks, you would want to use the multiprocessing module. It is similar to the threading module in that it allows multiple functions to be run at once. The syntax looks something like this:
import multiprocessing as mp
def add(a, b):
    """Return ``a + b``."""
    return a + b


# Start the child only when run as a script: start methods that re-import
# this module in the child would otherwise spawn processes recursively.
if __name__ == '__main__':
    p = mp.Process(target=add, args=(1, 2))
    p.start()
    p.join()  # wait for the child instead of leaving it unreaped
I am trying to use multiprocessing to speed up my data processing. I am working on a machine with 6 Cores, so I want to iterate through a table of 12 million rows, and for each of these rows I iterate through several time steps doing a calculation (executing a function).
This line I would like to split up that it runs in parallel on different cores:
test = [rowiteration(i, output, ini_cols, cols) for i in a] # this should run in parallel
I tried something with
from multiprocessing import Pool
but I did not manage to pass the arguments of the function and the iterator.
I would appreciate any idea. I am new to Python.
This is what i have:
import pyreadr
import pandas as pd
import numpy as np
import time
from datetime import timedelta
import functools
from pathlib import Path
def read_data():
    """Read ``dummy.RData`` from the parent of the current working directory.

    Returns:
        Tuple ``(pc, u)``: the objects stored under the keys ``"pc"`` and
        ``"u"`` in the result of ``pyreadr.read_r``.
    """
    # The snippet never imports ``os``; pathlib (already imported) builds the
    # same path without it.
    myfile = Path.cwd().parent / 'dummy.RData'
    result = pyreadr.read_r(str(myfile))
    pc = result["pc"]
    u = result["u"]
    return pc, u
def prepare_output_structure(pc):
    """Widen ``pc`` with one zero-filled column per time step plus ``'cat'``.

    Columns labelled 0..10 and a ``'cat'`` column are appended (filled with
    0), then the index is moved into a leading column via ``reset_index``.

    Returns:
        Tuple ``(widened_frame, ini_cols)`` where ``ini_cols`` holds the
        column labels ``pc`` had on entry.
    """
    ini_cols = pc.columns
    wanted = list(ini_cols) + list(np.arange(0, 11)) + ['cat']
    widened = pc.reindex(columns=wanted, fill_value=0)
    widened = widened.reset_index(level=0)
    return widened, ini_cols
def conjunction(*conditions):
    """Combine all ``conditions`` with an element-wise logical AND.

    The conditions are folded left to right with ``np.logical_and``; a single
    condition is returned as-is.
    """
    combine = np.logical_and
    return functools.reduce(combine, conditions)
def timeloop(t_final: int, count_final: int, tipo):
    """Linearly interpolate a count over the time steps 0..10.

    The line starts at 35 for ``tipo == 'A'`` and at 30 otherwise, and reaches
    ``count_final`` at ``t == t_final``. Each value is truncated with ``int``.

    Returns:
        List of 11 ints, one per time step.
    """
    count_ini = 35 if tipo == 'A' else 30
    slope = (count_final - count_ini) / t_final
    return [int(slope * t + count_ini) for t in np.arange(0, 11)]
def rowiteration(i, output, ini_cols, cols):
    """Classify row ``i`` of the global ``pc`` and fill in its time-step columns.

    Reads the module-level frames ``pc`` and ``u`` and writes the result back
    into ``pc`` in place.

    Args:
        i: Row label/position in ``pc`` (both are used, so the index is
            assumed to be 0..n-1).
        output: Frame the processed row is appended to.
        ini_cols: Original column labels of ``pc``; their count locates the
            time-step columns.
        cols: Column labels of the returned row.

    Returns:
        ``output`` with the processed row appended as one extra row.

    Raises:
        IndexError: If no row of ``u`` satisfies all three conditions.
    """
    # Each condition is a boolean Series over the rows of ``u``.
    c_2 = pc.loc[i, 'tipo'] == u.iloc[:, 0].str[:1]  # first character of category e.g. 'A1'
    c_5 = pc.loc[i, 't_final'] >= u.iloc[:, 1]  # t_min (u)
    c_6 = pc.loc[i, 't_final'] <= (u.iloc[:, 2])  # t_max (u)
    pc.loc[i, 'cat'] = u[conjunction(c_2, c_5, c_6)].iloc[0, 0]

    # The 11 time-step columns sit right after the index column and ini_cols.
    first = len(ini_cols) + 1
    pc.iloc[i, first:first + 11] = timeloop(
        int(pc.loc[i, 't_final']),
        int(pc.loc[i, 'count_final']),
        pc.loc[i, 'tipo'],
    )

    # One-row frame for row i; selecting with a list keeps the column dtypes.
    out = pc.iloc[[i]].reindex(columns=cols)
    # DataFrame.append was removed in pandas 2.0; pd.concat replaces it.
    if output.empty:
        return out
    return pd.concat([output, out])
if __name__ == '__main__':
    # Load the tables, process every row of pc, and report the wall-clock time.
    start_time = time.time()
    pc, u = read_data()
    nrowpc = len(pc.index)
    a = np.arange(0, nrowpc)  # row positions of table pc
    pc, ini_cols = prepare_output_structure(pc)
    cols = pc.columns
    output = pd.DataFrame()
    test = [rowiteration(i, output, ini_cols, cols) for i in a]  # this should run in parallel
    pc2 = pd.concat(test, ignore_index=True)
    pc2 = pc2.iloc[:, np.r_[5, (len(ini_cols)+1):(len(pc2.columns))]]
    print(pc2.head())  # call head(); printing pc2.head shows the bound method
    elapsed_time_secs = time.time() - start_time
    # time.time() differences are in seconds, not milliseconds.
    msg = "Execution took: %s secs (Wall clock time)" % timedelta(seconds=elapsed_time_secs)
    print(msg)
Replace your [rowiteration(i, output, ini_cols, cols) for i in a] with:
from multiprocessing import Pool
# Number of worker processes; set this to the number of CPU threads available.
n_cpu = 10
with Pool(n_cpu) as pool:
    # One argument tuple per row index; starmap unpacks each into rowiteration.
    ret = pool.starmap(
        rowiteration,
        ((i, output, ini_cols, cols) for i in a),
    )
Here is an approach that I think solves the problem and that only sends what is necessary to the worker processes. I haven't tested this as is (which would be difficult without the data your code reads in) but this is basic idea:
import multiprocessing as mp
# Worker pool with one process per CPU reported by mp.cpu_count().
p = mp.Pool(processes=mp.cpu_count())
# Note that you already define the static cols and ini_cols
# in global scope so you don't need to pass them to the Pool.
# ... Other functions you've defined ...
def rowiteration(row):
    """Fill ``'cat'`` and the time-step slots of one row and return it.

    Reads the module-level ``u`` and ``ini_cols``. ``row`` is modified in
    place and also returned.
    """
    type_matches = row['tipo'] == u.iloc[:, 0].str[:1]
    above_min = row['t_final'] >= u.iloc[:, 1]
    below_max = row['t_final'] <= (u.iloc[:, 2])
    candidates = u[conjunction(type_matches, above_min, below_max)]
    row['cat'] = candidates.iloc[0, 0]

    # The 11 time-step slots start one position past the initial columns.
    first = len(ini_cols) + 1
    steps = timeloop(int(row['t_final']), int(row['count_final']), row['tipo'])
    row[first:first + 11] = steps
    return row
# Gather processed rows as the workers finish; imap_unordered yields results
# in completion order, so rows may not come back in their original order.
out = []
for row in p.imap_unordered(rowiteration, [r for _, r in pc.iterrows()]):
    row.index = cols
    out.append(row)  # collect the processed row (the original appended cols)
# pd.DataFrame() has no ignore_index argument; renumber the rows afterwards.
pc2 = pd.DataFrame(out).reset_index(drop=True)
I have a bunch of matrix multiplication operations that are performed only row-wise. I was wondering how to speed-up the computation by parallelization:
# Row-wise computation: for each row, combine the entries that are not 1 with
# the matching rows of B, then overwrite the entries equal to 1.
data = np.random.randint(1, 100, (100000, 800))
indices_1 = np.equal(data, 1)
A = np.zeros((100000, 100))
B = np.random.randn(800, 100)
for i in range(100000):
    ones = indices_1[i]
    not_ones = ~ones
    B_ones = B[ones]
    B_not_ones = B[not_ones]
    # ``@`` is matrix multiplication; the operators had been mangled into
    # ``#``, which turned the rest of each line into a comment.
    A[i] = (data[i][not_ones] @ B_not_ones) @ np.linalg.inv(B_not_ones.T @ B_not_ones)
    # data holds integers, so the float results are cast on assignment.
    data[i][ones] = A[i] @ B_ones.T
I tried multiprocessing, but for some reason it did not perform better than the sequential version. Here is my multiprocessing implementation:
from multiprocessing.pool import ThreadPool, Pool
def f(i):
    """Process row ``i`` in place, writing into the module-level ``A`` and ``data``."""
    ones = indices_1[i]
    not_ones = ~ones
    B_ones = B[ones]
    B_not_ones = B[not_ones]
    # ``@`` is matrix multiplication; it had been mangled into ``#``.
    A[i] = (data[i][not_ones] @ B_not_ones) @ np.linalg.inv(B_not_ones.T @ B_not_ones)
    data[i][ones] = A[i] @ B_ones.T


# Threads share A and data with the caller. With a process Pool the writes
# inside f would land in each worker's own copy instead.
with ThreadPool() as pool:  # the context manager shuts the pool down afterwards
    pool.map(f, range(100000))
Both yielded the same amount of running time (around 32 seconds). Other parallelization method like concurrent.futures did not improve the runtime (used like below):
with concurrent.futures.ThreadPoolExecutor() as executor:
    # executor.map is lazy about results: an exception raised inside f only
    # surfaces when its result is retrieved, so drain the iterator here.
    result = list(executor.map(f, range(100000)))
I also tried to apply dask but could not make their framework work in my case. Any help will be much appreciated! Thanks!
import numpy as np
import multiprocessing as mp
# Inputs for the manager-based version. data and A are lists of per-row
# arrays so that each row can be stored as one element of a manager list.
data = list(np.random.randint(low=1, high=100, size=(100000, 800)))
indices_1 = np.equal(data, 1)
A = list(np.zeros(shape=(100000, 100)))
B = np.random.randn(800, 100)
def f(data, A, i):
    """Process row ``i`` and store the results back into ``data`` and ``A``.

    Args:
        data: Indexable of per-row arrays (a plain list or a manager list).
        A: Indexable that receives the coefficient vector for row ``i``.
        i: Row index.

    Reads the module-level ``indices_1`` and ``B``.
    """
    ones = indices_1[i]
    not_ones = ~ones
    B_ones = B[ones]
    B_not_ones = B[not_ones]
    row = data[i]
    # ``@`` is matrix multiplication; it had been mangled into ``#``.
    coeffs = (row[not_ones] @ B_not_ones) @ np.linalg.inv(B_not_ones.T @ B_not_ones)
    A[i] = coeffs
    row[ones] = coeffs @ B_ones.T
    # Indexing a manager list returns a copy, so an in-place edit of data[i]
    # is lost; assigning the modified row back is what publishes it.
    data[i] = row
with mp.Manager() as manager:
    # Manager lists are proxies that the worker processes can write into.
    data_global = manager.list(data)
    A_global = manager.list(A)
    with mp.Pool() as p:
        results = [p.apply_async(f, (data_global, A_global, i)) for i in range(100000)]
        for pending_result in results:
            # get() waits like wait() did, but also re-raises any exception
            # raised inside the worker instead of discarding it.
            pending_result.get()
    # Copy the contents out while the manager is still running.
    data_global = list(data_global)
    A_global = list(A_global)
I'm trying to speed up calculations for extensive real time object detection and doing computation on it.
I'm using OpenCV with thread pool and producer, consumer for the video capture. But the execution speed is the same as the serial one.
How would I improve the speed of the execution ?
if __name__ == "__main__":
    # Read frames, run pipeline() on each through a thread pool, and display
    # the results in submission order. Space toggles threaded mode, Esc quits.
    video_name = '2016-11-18_07-30-01.h264'
    cap = cv2.VideoCapture(video_name)
    det = detector.CarDetector()
    car_tracker = Sort_Algorithm.Sort()
    ped_tracker = Sort_Algorithm.Sort()
    df_region, df_line = load_filter()
    region = Region(df_region)
    distance = compute_max_polygon_diagonal(df_region) * 0.1
    region_buffered = region.buffer(distance)
    threadn = cv2.getNumberOfCPUs()
    pool = ThreadPool(processes = 2)
    pending = deque()
    threaded_mode = True
    lock = threading.Lock()

    # The homography is constant, so build it once instead of once per frame.
    H = [-2.01134074616, -16.6502442427, -1314.05715739, -3.35391526592, -22.3546973012, 2683.63584335,
         -0.00130731963137, -0.0396207582264, 1]
    matrix = np.reshape(H, (3, 3))

    try:
        while True:
            # Show every finished result at the head of the queue.
            while len(pending) > 0 and pending[0].ready():
                res = pending.popleft().get()
                cv2.imshow('video ', res)
            if len(pending) < threadn:
                ret, frame = cap.read()
                if not ret:
                    # No frame was returned (end of stream or read failure).
                    break
                # Warp in both modes so dst is always defined for pipeline().
                dst = cv2.warpPerspective(frame.copy(), matrix, (frame.shape[1], frame.shape[0]))
                args = (lock, frame.copy(), car_tracker, ped_tracker, df_region,
                        region_buffered, df_line, det, dst, matrix)
                if threaded_mode:
                    task = pool.apply_async(pipeline, args)
                    cv2.imshow('dst', dst)
                else:
                    task = DummyTask(pipeline, args)
                pending.append(task)
            ch = cv2.waitKey(1)
            if ch == ord(' '):
                threaded_mode = not threaded_mode
            if ch == 27:
                break
    finally:
        # Release the pool, the capture device and the windows on any exit path.
        pool.close()
        pool.join()
        cap.release()
        cv2.destroyAllWindows()
The code for pipeline:
def pipeline(lock, img, car_tracker, ped_tracker, df_region, region_buffered, df_line, det, dst, H):
    """Detect and track cars/pedestrians in one frame and assess their interaction.

    The whole body runs while holding ``lock``, so concurrent calls execute
    one at a time. Module-level state (``frame_idx`` and the two
    ``genera_data_pd_*`` frames) is updated in place.

    Args:
        lock: Lock guarding the shared trackers and module-level state.
        img: Frame to run detection on; bounding boxes are drawn onto it.
        car_tracker, ped_tracker: Trackers updated with this frame's boxes.
        df_region, region_buffered, df_line: Region/line data passed on to
            ``compute`` and drawn onto ``dst``.
        det: Detector providing ``get_localization(img)``.
        dst: Warped frame that the region and line polylines are drawn onto.
        H: 3x3 perspective transform applied to the box centres.

    Returns:
        ``img`` with the bounding boxes drawn on it.
    """
    global frame_idx
    global genera_data_pd_cars
    global genera_data_pd_peds

    # ``with`` releases the lock even when a step below raises; the explicit
    # acquire()/release() pair left it held forever in that case.
    with lock:
        car_box, ped_box = det.get_localization(img)
        car_detections = car_tracker.update(np.array(car_box))
        ped_detections = ped_tracker.update(np.array(ped_box))

        saved_region = df_region.values
        saved_region = np.delete(saved_region, 2, 1)
        frame_idx += 1

        # NOTE(review): the result of this warp is discarded - confirm intent.
        cv2.warpPerspective(np.array(df_line, dtype=np.float32), H, (df_line.shape[1], df_line.shape[0]))
        cv2.polylines(dst, np.int32([[saved_region]]), False, color=(255, 0, 0))
        cv2.polylines(dst, np.int32([np.array(df_line, dtype=np.float32)]), False, color=(255, 0, 0))

        # Draw the bounding boxes on the frame.
        for trk in car_detections:
            trk = trk.astype(np.int32)
            helpers.draw_box_label(img, trk, trk[4])
        for other in ped_detections:
            other = other.astype(np.int32)
            helpers.draw_box_label(img, other, other[4])

        # NOTE(review): DataFrame.append was removed in pandas 2.0. It is kept
        # here because the return type of compute() is not visible; switch to
        # pd.concat once that is confirmed.
        for trk in car_detections:
            trk = trk.astype(np.int32)
            p = np.array([[((trk[1] + trk[3]) / 2, (trk[0] + trk[2]) / 2)]], dtype=np.float32)
            center_pt = cv2.perspectiveTransform(p, H)
            ptx = center_pt.T.item(0)
            pty = center_pt.T.item(1)
            df_cars = compute(trk[4], ptx, pty, frame_idx, df_region, region_buffered, df_line)
            genera_data_pd_cars = genera_data_pd_cars.append(df_cars)

        for other in ped_detections:
            other = other.astype(np.int32)
            p = np.array([[((other[1] + other[3]) / 2, (other[0] + other[2]) / 2)]], dtype=np.float32)
            center_pt = cv2.perspectiveTransform(p, H)
            ptx = center_pt.T.item(0)
            pty = center_pt.T.item(1)
            df_peds = compute(other[4], ptx, pty, frame_idx, df_region, region_buffered, df_line)
            # Accumulate onto the pedestrian frame; the original rebuilt it
            # from the cars frame, losing earlier pedestrian rows.
            genera_data_pd_peds = genera_data_pd_peds.append(df_peds)

        query = "is_in_region == True and is_in_region_now == True"
        df_peds = genera_data_pd_peds.query(query)
        query = " is_in_region == True"
        df_cars = genera_data_pd_cars.query(query)

        if len(df_cars) > 1 and len(df_peds) > 1:
            df_car_in_t_range_ped = select_slice(df_cars, df_peds)
            df_ped_in_t_range_car = select_slice(df_peds, df_cars)
            t_abs_crossing_car = df_cars['t_abs_at_crossing'].iloc[0]
            t_abs_crossing_ped = df_peds['t_abs_at_crossing'].iloc[0]
            dt_crossing = t_abs_crossing_car - t_abs_crossing_ped
            is_able_to_pass_before_ped = \
                ((df_car_in_t_range_ped['t_abs_at_crossing_estimated'] -
                  t_abs_crossing_ped) > 0).any()
            behavior = Behavior(  # is_passed_before_ped
                dt_crossing < 0,
                # is_able_to_stop
                df_car_in_t_range_ped['is_able_to_halt'].any(),
                # is_too_fast
                df_car_in_t_range_ped['is_too_fast'].any(),
                # is_close_enough
                df_car_in_t_range_ped['is_close_enough'].any(),
                # is_able_to_pass_before_ped
                is_able_to_pass_before_ped)
            # trk / other are the last car and pedestrian from the loops above.
            interaction = Interaction(trk[4], other[4])
            interaction = interaction.assess_behavior(behavior)
            code, res, msg = interaction.code, interaction.res, interaction.msg
            print(msg)
            # NOTE(review): the source lost its indentation; the reset is
            # assumed to belong to this branch - confirm against the original.
            genera_data_pd_cars = genera_data_pd_cars.iloc[0:0]
            genera_data_pd_peds = genera_data_pd_peds.iloc[0:0]
    return img
Multi-threading in Python for CPU-bound tasks is limited by the GIL, which effectively lets only a single thread run at a time.
Of course, if you launch multiple threads for CPU-bound tasks, performance may even degrade, because there is a lot of overhead for both the kernel and the Python interpreter in managing these threads.
The kernel wants to schedule these threads while Python wants to prevent them from running simultaneously, and this results in a lot of context switches, which degrades performance.
If you are using just NumPy in the threads then you would be fine, as many NumPy operations release the GIL while they run in native code, but I am not sure if that is true for OpenCV as well.
Threads in Python aren't meant for computational tasks.
This is a classic problem with threads in Python. Consider using multiprocessing; there are a number of articles on this topic, and you might want to check a few of them.
Threads aren't executed in parallel in cpython. Try using the ProcessPoolExecutor instead.
I have an application in Tkinter.
Part of this application is a method:
It basically takes long lists of random values and checks if the random values are inside of a previously defined grid. Afterwards it writes them into another variable to export it.
This is a rather long process. So I would like to multiprocess it.
Read some stuff about how to do that. Here's the resulting code:
I've read around SO for stuff that might be relevant. I am running an up-to-date Spyder with Python 3.7 as part of the Anaconda-suite on both machines, all (at least included) packages are up-to-date and I've included the
if __name__ == '__main__':
-line. I've also experimented with indentation of
p.start()
and
processes.append(p)
Simply can't get it to work.
def ParallelStuff(myIn1, myIn2, myIn3, myIn4, anotherIn1, anotherIn2, anotherIn3, return_dict, processIterator):
    """Filter one chunk of values and store the result in ``return_dict``.

    For each position, ``myIn3`` is looked up in ``anotherIn1`` and the
    entries of ``anotherIn3`` at the matching positions are collected. When
    ``myIn4`` is among them, the ``myIn1`` / ``myIn2`` values at that position
    are kept. Kept values are packed at the front of zero-filled arrays.

    Args:
        myIn1, myIn2, myIn3, myIn4: Equal-length chunks processed in lockstep.
        anotherIn1, anotherIn3: Arrays the chunk is matched against.
        anotherIn2: Unused.
        return_dict: Mapping that receives ``[tempOut1, tempOut2, tempOut3]``
            under the key ``processIterator``.
        processIterator: Key identifying this chunk in ``return_dict``.
    """
    n_items = len(myIn1)
    tempOut1 = np.zeros(n_items)
    tempOut2 = np.zeros(n_items)
    tempOut3 = np.zeros(n_items)
    bb = 0
    for value1, value2, xx, yy in zip(myIn1, myIn2, myIn3, myIn4):
        hits = np.isin(anotherIn1, xx)
        goodY = anotherIn3[np.where(hits)]
        if np.isin(yy, goodY):
            tempOut1[bb] = value1
            tempOut2[bb] = value2
            # NOTE(review): this stores the whole anotherIn3 array in a single
            # slot, which only works when it has exactly one element. The
            # intended value is unclear - confirm and fix.
            tempOut3[bb] = anotherIn3
            bb += 1
    # Return all three outputs; the original returned tempOut1 twice and
    # silently dropped tempOut2.
    return_dict[processIterator] = [tempOut1, tempOut2, tempOut3]
# CPU count reported by multiprocessing; used as the number of worker processes.
nCores = multiprocessing.cpu_count()
def export_Function(self):
    """Split the inputs into ``nCores`` chunks, filter them in parallel, and merge.

    Each chunk is handed to ``ParallelStuff`` in its own process. The three
    result arrays of every chunk are concatenated onto ``out1``/``out2``/
    ``out3`` after dropping their zero entries.

    NOTE: the ``if __name__ == '__main__':`` guard does not belong inside this
    method; it must wrap the statement that launches the application at module
    level. Where child processes are started by re-importing the main module
    (the default on Windows), an unguarded entry point is run again in every
    child.
    """
    out1 = np.array([])
    out2 = np.array([])
    out3 = np.array([])
    for loop_one in range(0, N):
        # ...
        # stuff that works on both systems with only one core...
        # ... and on linux with all cores
        processes = []
        nTotal = int(len(xRand))
        # Ceiling division: every element lands in some chunk. The previous
        # int(nTotal/(nCores-1)) could leave tail elements unprocessed and
        # divided by zero on a single-core machine.
        o = -(-nTotal // nCores)
        # The context manager shuts the manager process down after each pass.
        with multiprocessing.Manager() as manager:
            return_dict = manager.dict()
            for processIterator in range(nCores):
                # The offset depends on the chunk number (was an undefined ``i``).
                offset = o * processIterator
                stop = min(nTotal, offset + o)
                myIn1 = in1[offset:stop]
                myIn2 = in2[offset:stop]
                myIn3 = in3[offset:stop]
                myIn4 = in4[offset:stop]
                p = multiprocessing.Process(
                    target=ParallelStuff,
                    args=(myIn1, myIn2, myIn3, myIn4, anotherIn1, anotherIn2,
                          anotherIn3, return_dict, processIterator),
                )
                p.start()
                processes.append(p)
            for chunk_id, proc in enumerate(processes):
                proc.join()
                myOut1, myOut2, myOut3 = return_dict[chunk_id]
                # Unused slots are still zero; keep only the filled entries.
                out1 = np.concatenate((out1, myOut1[np.where(myOut1 != 0)]))
                out2 = np.concatenate((out2, myOut2[np.where(myOut2 != 0)]))
                out3 = np.concatenate((out3, myOut3[np.where(myOut3 != 0)]))
When I run my program on my Linux machine, it does exactly what it's supposed to do: it distributes the work to all 8 cores, computes, concatenates the 3 results into the respective arrays, and exports.
When I run my program on my Windows machine, the application's window freezes, the process becomes inactive, a new kernel automatically opens, and a new window appears.