I'm trying to parallelise the training of scikit-learn classifiers (Gaussian mixture models in this case) using multiprocessing, but the resulting classifiers are much worse than when I train them sequentially. Additionally, the results differ after every training run, as if the code were not thread-safe. Can anyone explain what is going on? Here is the code, with the worker function at the end:
# Train one "positive" and one "negative" GMM per event label, each fit in its
# own process.  The semaphore (acquired inside trainProcess) caps the number
# of fits running at the same time at nrProc.
nrProc = 8
semaphore = Semaphore(nrProc)
m = Manager()
models = []
processes = []
# One private managed list per worker.  Workers finish in arbitrary order, so
# a single shared output list read back by a running counter pairs fitted
# models with the wrong label/polarity, and differently on every run.
outputs = []
for event_label in data_positive:
    if classifier_method != 'gmm':
        raise ValueError("Unknown classifier method ["+classifier_method+"]")
    # Submission order is positive, negative for every label; the read-back
    # loop below relies on exactly this order.
    for dataset in (data_positive, data_negative):
        model = mixture.GMM(**classifier_params)
        out = m.list()
        models.append(model)
        outputs.append(out)
        processes.append(Process(target=trainProcess, args=(model, dataset[event_label], semaphore, out)))
for proc in processes:
    proc.start()
for proc in processes:
    proc.join()
# Each worker appended exactly one fitted model to its own list, so this list
# is in submission order regardless of which worker finished first.
modelsOut = [out[0] for out in outputs]
cnt = 0
for event_label in data_positive:
    model_container['models'][event_label] = {}
    model_container['models'][event_label]['positive'] = modelsOut[cnt]
    cnt = cnt + 1
    model_container['models'][event_label]['negative'] = modelsOut[cnt]
    cnt = cnt + 1
def trainProcess(model, data, semaphore, modelsOut):
    """Fit ``model`` on ``data`` and append the result to ``modelsOut``.

    Args:
        model: Object exposing ``fit(data)``; whatever ``fit`` returns is
            what gets stored.
        data: Training data handed to ``model.fit`` unchanged.
        semaphore: Semaphore limiting how many fits run concurrently.
        modelsOut: List-like object with ``append`` that receives the result.

    Returns:
        Always ``0``.
    """
    # ``with`` releases the semaphore even when ``fit`` raises; a bare
    # acquire()/release() pair would leak the slot and could starve the
    # workers still waiting for it.
    with semaphore:
        modelsOut.append(model.fit(data))
    return 0
So the solution is to use the clone function from sklearn, which makes a deep copy of the estimator.
Related
I am trying to hash many blocks of an image file, but the script does not use the full CPU power; it only consumes about 25%. I tried moving the heavy work into threads, but it made no difference. I come from Node.js, where the sharp library uses all available CPU for the same task. How can I make Python use the full CPU?
import cv2
import math
import datetime
import hashlib
import threading
def thread_function(image, yPos, xPos, wSizeBlock, hSizeBlock):
    """Print the SHA-256 hex digest of one rectangular block of ``image``.

    Args:
        image: Array indexed as ``image[row, column, ...]``.
        yPos: Index of the first row of the block.
        xPos: Index of the first column of the block.
        wSizeBlock: Block width, in columns.
        hSizeBlock: Block height, in rows.
    """
    # Rows take the block height and columns the block width.  The two sizes
    # were swapped before, which only went unnoticed for square blocks.
    block = image[yPos:yPos + hSizeBlock, xPos:xPos + wSizeBlock]
    # Named ``digest`` rather than ``hash`` so the builtin is not shadowed.
    digest = hashlib.sha256(block.tobytes())
    print(digest.hexdigest())
# Hash every 16x16 block of the frame 500 times over and print the wall-clock
# time before and after.
image = cv2.imread('frame323.jpg', cv2.IMREAD_COLOR)
if image is None:
    # cv2.imread signals an unreadable file by returning None, not by raising.
    raise FileNotFoundError("could not read image 'frame323.jpg'")
dimension = {
    'width': image.shape[1],
    'height': image.shape[0]
}
wSizeBlock = 16
hSizeBlock = 16
# Number of whole blocks that fit along each axis; partial edge blocks are skipped.
wBlockLength = dimension['width'] // wSizeBlock
hBlockLength = dimension['height'] // hSizeBlock
count = 0
start_time = datetime.datetime.now()
print(start_time)
for k in range(0, 500):
    workers = []
    for i in range(0, wBlockLength):
        for j in range(0, hBlockLength):
            xPos = i * wSizeBlock
            yPos = j * hSizeBlock
            # thread_function expects (image, yPos, xPos, ...): row offset
            # first.  Passing x first indexed rows with column offsets.
            x = threading.Thread(target=thread_function, args=(image, yPos, xPos, wSizeBlock, hSizeBlock))
            x.start()
            workers.append(x)
    # Wait for this pass to finish so end_time measures the hashing itself,
    # not just how long it took to start the threads.
    for x in workers:
        x.join()
end_time = datetime.datetime.now()
print(end_time)
For CPU-intensive operations that can be split up into smaller tasks, you would want to use the multiprocessing module. It is similar to the threading module in that it allows multiple functions to be run at once. The syntax looks something like this:
import multiprocessing as mp
def add(a, b):
    """Return the result of ``a + b``."""
    total = a + b
    return total
# Under the "spawn" start method the child re-imports this module, so process
# creation must sit behind the __main__ guard or every child would try to
# spawn another child.
if __name__ == "__main__":
    p = mp.Process(target=add, args=(1, 2))
    p.start()
    # Wait for the child to finish.  The value returned by add() is not sent
    # back to the parent; use a Queue, Pipe or Pool if the result is needed.
    p.join()
I am working on a custom environment using gym and currently trying to parallelize the training of my D3QN model as it is taking a lot of time to finish an episode.
Is there a way to parallelize the training and take only best cases for refinement using Keras and tensorflow?
def run(self):
# Training loop: for each of self.EPISODES episodes, reset the environment,
# pick actions with self.act, step the environment, store every transition
# with self.remember and call self.replay.
# NOTE(review): the indentation of this snippet was lost, so the nesting of the
# statements below (how far `if done:` extends, and whether self.replay runs
# per step or per episode) cannot be confirmed from here.
reward_list = []
ave_reward_list = []
decay_step = 0
start_time = time.time()
for e in range(self.EPISODES):
state = self.env.reset()
# Reshape the observation to (1, 24) and standardise it.
# NOTE(review): state.std() is 0 when all 24 values are equal, which makes
# the division below produce nan/inf.
state = np.asarray(state).reshape((1, 24))
state = (state - state.mean()) / state.std()
done = False
i = 0
# first_ps is 0 for the first step of an episode and 1 afterwards; it is
# passed to env.step alongside the action.
first_ps = 0
total_reward = 0
#counter = 0
while not done:
#self.env.render()
# decay_step counts steps across all episodes and is passed to self.act.
decay_step += 1
action, explore_probability = self.act(state, decay_step)
acting = [action, first_ps]
next_state, reward, done, _ = self.env.step(acting)
# Same reshape + standardisation as for the initial state.
next_state = np.asarray(next_state).reshape((1, 24))
next_state = (next_state - next_state.mean()) / next_state.std()
#print('next_state: {}'.format(next_state))
first_ps = 1
self.remember(state, action, reward, next_state, done)
state = next_state
i += 1
total_reward += reward
#print(total_reward)
#counter +=1
#if counter==100:
#self.update_target_model()
#counter = 0
if done:
# track the reward list
reward_list.append(total_reward)
# When (e+1) is a multiple of 100, store the mean of reward_list and clear it.
if (e+1) % 100 == 0:
ave_reward = np.mean(reward_list)
ave_reward_list.append(ave_reward)
reward_list = []
# every step update target model
self.update_target_model()
# every episode, plot the result
average = self.PlotModel(i, e)
# every episode, plot the total_reward
#average_reward = self.PlotModel_reward(total_reward, e)
print("episode: {}/{}, iterations: {}, e: {:.2}, average: {}, tot_reward: {}".format(e, self.EPISODES, i, explore_probability, average, total_reward))
# On the last episode: print the elapsed time as HH:MM:SS.ss and save the
# model under a name that embeds the integer part of total_reward.
if e==self.EPISODES-1:
hours, rem = divmod((time.time() - start_time), 3600)
minutes, seconds = divmod(rem, 60)
print("The running time is: {:0>2}:{:0>2}:{:05.2f}".format(int(hours),int(minutes),seconds))
print("Saving trained model to", self.Model_name)
self.save(self.Model_name+'_'+str(int(total_reward))+".h5")
self.replay(done)
My main function:
if __name__ == "__main__":
    # Script entry point: create the agent for the named environment and
    # start its training loop.
    env_name = "trainSim-v0"
    agent = DQNAgent(env_name)
    agent.run()
You can only do this if you have multiple GPUs; a single GPU can effectively focus on only one task at a time. Since your model is already slow, you need to upgrade your hardware: either add more GPUs to train a single model (the opposite of what you asked), or get a faster GPU to train the model.
https://keras.io/guides/distributed_training/
I tried to use multiprocessing on this for loop:
def segment_features(segment_pixels):
    """Return per-band summary statistics of a segment as one flat list.

    ``segment_pixels`` is a 2-D array of shape ``(npixels, nbands)``.  Each
    band contributes six values, in this order: min, max, mean, variance,
    skewness, kurtosis (taken from ``scipy.stats.describe``).
    """
    pixel_count, band_count = segment_pixels.shape
    collected = []
    for band in range(band_count):
        described = scipy.stats.describe(segment_pixels[:, band])
        # Drop nobs and minmax from the result tuple and put min/max in front.
        per_band = [*described.minmax, *list(described)[2:]]
        if pixel_count == 1:
            # A single pixel has an undefined (nan) variance; report 0.0.
            per_band[3] = 0.0
        collected.extend(per_band)
    return collected
# Compute the feature vector of every segment, sequentially.
# objects[k] holds the features of the segment whose id is object_ids[k].
segment_ids = np.unique(segments)
objects = []
object_ids = []
# The loop variable is deliberately not called ``id``: at module level that
# would replace the builtin for the rest of the program.
for segment_id in segment_ids:
    segment_pixels = img[segments == segment_id]
    object_features = segment_features(segment_pixels)
    objects.append(object_features)
    object_ids.append(segment_id)
By replacing the for loop section with this:
def segment_features_calc(segment_id):
    """Compute the feature vector of one segment inside a worker process.

    Args:
        segment_id: One value from ``segment_ids``; selects the pixels of
            ``img`` where ``segments == segment_id``.

    Returns:
        ``(segment_id, features)``.  The result is returned rather than
        appended to module-level lists, because each worker process has its
        own copy of those lists and the parent would never see the appends.
    """
    segment_pixels = img[segments == segment_id]
    object_features = segment_features(segment_pixels)
    print("segment "+str(segment_id)+" features calculated")
    return segment_id, object_features


n=mp.cpu_count()
if __name__ == '__main__':
    start = time.time()
    # map() blocks until every task is done and re-raises worker exceptions;
    # an un-fetched map_async() result silently hides them.
    with mp.Pool(processes = n) as p:
        results = p.map(segment_features_calc, list(segment_ids))
    # Results come back in input order, so the two lists stay aligned.
    object_ids = [segment_id for segment_id, _ in results]
    objects = [features for _, features in results]
    print("Complete")
    end = time.time()
    print('total time (s)= ' + str(end-start))
However, the multiprocessing version does not execute properly: it stops after 0.17 seconds, even though there are more than 270,000 segment IDs to loop over. Any insights on how to solve the issue?
I'm trying to speed up calculations for extensive real time object detection and doing computation on it.
I'm using OpenCV with thread pool and producer, consumer for the video capture. But the execution speed is the same as the serial one.
How would I improve the speed of the execution ?
if __name__ == "__main__":
    # Producer/consumer loop: read frames, submit each one to `pipeline`
    # (through the thread pool or synchronously) and display results in
    # submission order as they become ready.
    video_name = '2016-11-18_07-30-01.h264'
    cap = cv2.VideoCapture(video_name)
    det = detector.CarDetector()
    car_tracker = Sort_Algorithm.Sort()
    ped_tracker = Sort_Algorithm.Sort()
    df_region, df_line = load_filter()
    region = Region(df_region)
    distance = compute_max_polygon_diagonal(df_region) * 0.1
    region_buffered = region.buffer(distance)
    threadn = cv2.getNumberOfCPUs()
    pool = ThreadPool(processes = 2)
    pending = deque()
    threaded_mode = True
    lock = threading.Lock()
    # The homography is constant, so build it once instead of once per frame.
    H = [-2.01134074616, -16.6502442427, -1314.05715739, -3.35391526592, -22.3546973012, 2683.63584335,
         -0.00130731963137, -0.0396207582264, 1]
    matrix = np.reshape(H, (3, 3))
    while True:
        # Show every finished result at the head of the queue, oldest first.
        while pending and pending[0].ready():
            res = pending.popleft().get()
            cv2.imshow('video ', res)
        # Keep at most `threadn` frames in flight.
        if len(pending) < threadn:
            ret, frame = cap.read()
            if not ret:
                # End of stream or read error: frame is None, and frame.copy()
                # below would raise.
                break
            # Warp for every frame, so the non-threaded branch never passes a
            # stale `dst` left over from an earlier frame.
            dst = cv2.warpPerspective(frame.copy(), matrix, (frame.shape[1], frame.shape[0]))
            task_args = (lock, frame.copy(), car_tracker, ped_tracker, df_region, region_buffered, df_line, det, dst, matrix)
            if threaded_mode:
                task = pool.apply_async(pipeline, task_args)
                cv2.imshow('dst', dst)
            else:
                task = DummyTask(pipeline, task_args)
            pending.append(task)
        ch = cv2.waitKey(1)
        # Space toggles threaded / synchronous mode; Esc quits.
        if ch == ord(' '):
            threaded_mode = not threaded_mode
        if ch == 27:
            break
The code for pipeline:
def pipeline(lock, img, car_tracker, ped_tracker, df_region, region_buffered, df_line, det, dst, H):
# Process one frame: detect cars and pedestrians in img, update both trackers,
# draw the tracked boxes on img, project each box centre through H, accumulate
# the per-object results from compute() in module-level frames, and assess a
# car/pedestrian interaction when both filtered frames have more than one row.
# Returns img with the boxes drawn on it.
# NOTE(review): the indentation of this snippet was lost; the nesting of the
# statements after the loops (in particular the two `.iloc[0:0]` resets near
# the end) cannot be confirmed from here.
# NOTE(review): the lock is taken here and released only just before
# `return img`, so concurrent calls run one at a time; it is not released if
# anything in between raises.
lock.acquire()
global point_lists
global df_car_lists
global frame_idx
global counter
global data_peds
global data_cars
global genera_data_pd_cars
global genera_data_pd_peds
car_box, ped_box = det.get_localization(img)
car_detections = car_tracker.update(np.array(car_box))
ped_detections = ped_tracker.update(np.array(ped_box))
# Region values with the column at index 2 removed.
saved_region = df_region.values
saved_region = np.delete(saved_region, 2, 1)
frame_idx+=1
# NOTE(review): the result of this warpPerspective call is discarded.
cv2.warpPerspective(np.array(df_line, dtype=np.float32), H, (df_line.shape[1], df_line.shape[0]))
# Draw the region outline and the line onto dst.
cv2.polylines(dst, np.int32([[saved_region]]), False, color=(255, 0, 0))
cv2.polylines(dst, np.int32([np.array(df_line, dtype=np.float32)]), False, color=(255, 0, 0))
for trk in car_detections:
trk = trk.astype(np.int32)
helpers.draw_box_label(img, trk, trk[4]) # Draw the bounding boxes on the
for other in ped_detections:
other = other.astype(np.int32)
helpers.draw_box_label(img, other, other[4]) # Draw the bounding boxes on the
for trk in car_detections:
trk = trk.astype(np.int32)
# Box centre, transformed with H, then handed to compute() with the value at trk[4].
p = np.array([[((trk[1] + trk[3]) / 2, (trk[0] + trk[2]) / 2)]], dtype=np.float32)
center_pt = cv2.perspectiveTransform(p, H)
ptx = center_pt.T.item(0)
pty = center_pt.T.item(1)
df_cars = compute(trk[4], ptx, pty, frame_idx, df_region, region_buffered, df_line)
genera_data_pd_cars = genera_data_pd_cars.append(df_cars)
for other in ped_detections:
other = other.astype(np.int32)
p = np.array([[((other[1] + other[3]) / 2, (other[0] + other[2]) / 2)]], dtype=np.float32)
center_pt = cv2.perspectiveTransform(p, H)
ptx = center_pt.T.item(0)
pty = center_pt.T.item(1)
df_peds = compute(other[4], ptx, pty, frame_idx, df_region, region_buffered, df_line)
# NOTE(review): this appends to genera_data_pd_cars, not genera_data_pd_peds,
# so the pedestrian accumulator is overwritten with car rows plus this row --
# looks like a copy-paste slip; confirm.
genera_data_pd_peds = genera_data_pd_cars.append(df_peds)
query = "is_in_region == True and is_in_region_now == True"
df_peds = genera_data_pd_peds.query(query)
query = " is_in_region == True"
df_cars = genera_data_pd_cars.query(query)
if len(df_cars)> 1 and len(df_peds) > 1:
df_car_in_t_range_ped = select_slice(df_cars, df_peds)
df_ped_in_t_range_car = select_slice(df_peds, df_cars)
t_abs_crossing_car = df_cars['t_abs_at_crossing'].iloc[0]
t_abs_crossing_ped = df_peds['t_abs_at_crossing'].iloc[0]
# A negative difference is what Behavior receives as "is_passed_before_ped".
dt_crossing = t_abs_crossing_car - t_abs_crossing_ped
is_able_to_pass_before_ped = \
((df_car_in_t_range_ped['t_abs_at_crossing_estimated'] -
t_abs_crossing_ped) > 0).any()
behavior = Behavior( # is_passed_before_ped
dt_crossing < 0,
# is_able_to_stop
df_car_in_t_range_ped['is_able_to_halt'].any(),
# is_too_fast
df_car_in_t_range_ped['is_too_fast'].any(),
# is_close_enough
df_car_in_t_range_ped['is_close_enough'].any(),
# is_able_to_pass_before_ped
is_able_to_pass_before_ped)
# trk and other are whatever the loops above bound last.
interaction = Interaction(trk[4], other[4])
interaction = interaction.assess_behavior(behavior)
code, res, msg = interaction.code, interaction.res, interaction.msg
print(msg)
# Replace both accumulators with their empty [0:0] slices.
genera_data_pd_cars = genera_data_pd_cars.iloc[0:0]
genera_data_pd_peds = genera_data_pd_peds.iloc[0:0]
lock.release()
return img
Multi-threading in Python for CPU-bound tasks is limited by the GIL, which effectively allows only a single thread to run at a time.
Of course, if you launch multiple threads for CPU-bound tasks, performance will degrade even further, because there is a lot of overhead for both the kernel and the Python interpreter in managing these threads.
The kernel wants to schedule these threads while Python wants to prevent them from running simultaneously, which results in a lot of context switches that degrade performance.
If you are using just numpy in the threads then you would be fine as numpy isn't impacted by GIL since it uses atomic operations, but I am not sure if that is true for OpenCV as well.
Threads in Python aren't meant for computation-heavy tasks.
This is a classic problem with threads in Python. Consider using multiprocessing; there are a number of articles on this topic, and you might want to check a few of them.
Threads aren't executed in parallel in cpython. Try using the ProcessPoolExecutor instead.
I have an application in Tkinter.
Part of this application is a method:
It basically takes long lists of random values and checks if the random values are inside of a previously defined grid. Afterwards it writes them into another variable to export it.
This is a rather long process. So I would like to multiprocess it.
Read some stuff about how to do that. Here's the resulting code:
I've read around SO for stuff that might be relevant. I am running an up-to-date Spyder with Python 3.7 as part of the Anaconda-suite on both machines, all (at least included) packages are up-to-date and I've included the
if __name__ == '__main__':
-line. I've also experimented with indentation of
p.start()
and
processes.append(p)
Simply can't get it to work.
def ParallelStuff(myIn1, myIn2, myIn3, myIn4, anotherIn1, anotherIn2, anotherIn3, return_dict, processIterator):
    """Filter one chunk of points and store the survivors in ``return_dict``.

    Index ``i`` is kept when ``myIn4[i]`` occurs among the entries of
    ``anotherIn3`` whose counterpart in ``anotherIn1`` equals ``myIn3[i]``.
    Kept values are packed at the front of zero-initialised arrays of length
    ``len(myIn1)``; unused slots stay 0.

    Args:
        myIn1, myIn2: Values copied to the first and second output array.
        myIn3, myIn4: Values tested against ``anotherIn1`` / ``anotherIn3``.
        anotherIn1, anotherIn3: Arrays the chunk is matched against.
        anotherIn2: Accepted but not used.
        return_dict: Mapping that receives the result.
        processIterator: Key under which the result is stored.

    Returns:
        None.  ``return_dict[processIterator]`` is set to
        ``[tempOut1, tempOut2, tempOut3]``.
    """
    size = len(myIn1)  # myIn1, myIn2, myIn3 are of the same length
    tempOut1 = np.zeros(size)
    tempOut2 = np.zeros(size)
    tempOut3 = np.zeros(size)
    bb = 0  # next free slot in the output arrays
    for i in range(len(myIn3)):
        xx = myIn3[i]
        yy = myIn4[i]
        hits = np.isin(anotherIn1, xx)
        goodY = anotherIn3[np.where(hits)]
        if np.isin(yy, goodY):
            tempOut1[bb] = myIn1[i]
            tempOut2[bb] = myIn2[i]
            # NOTE(review): this stores the whole anotherIn3 array into one
            # scalar slot, which only works when anotherIn3 has exactly one
            # element.  The intended value is unclear, so it is left as is.
            tempOut3[bb] = anotherIn3
            bb += 1
    # The second entry used to be tempOut1 again, which dropped tempOut2.
    return_dict[processIterator] = [tempOut1, tempOut2, tempOut3]
# CPU count reported by the system; used below as the number of chunks and
# worker processes.
nCores = multiprocessing.cpu_count()
def export_Function(self):
# Split in1..in4 into nCores slices, run ParallelStuff on each slice in its own
# process, then concatenate the non-zero entries of every result into
# out1/out2/out3.
# NOTE(review): the indentation of this snippet was lost and part of the loop
# body is elided ("# ..."), so the nesting below and the origin of xRand,
# in1..in4 and anotherIn1..3 cannot be confirmed from here.
out1 = np.array([])
out2 = np.array([])
out3 = np.array([])
for loop_one in range(0, N):
# ...
# stuff that works on both systems with only one core...
# ... and on linux with all cores
processes = []
nTotal = int(len(xRand))
# o is the slice length handed to each process.
# NOTE(review): in the non-divisible case o*nCores can be smaller than nTotal
# (e.g. nTotal=10, nCores=8 gives o=1), leaving trailing items unprocessed --
# confirm the chunking.
if nTotal%nCores == 0:
o = int(nTotal/nCores)
else:
o = int(nTotal/(nCores-1))
# Manager dict shared with the workers; each worker stores its result under
# its processIterator key.
manager = multiprocessing.Manager()
return_dict = manager.dict()
for processIterator in range (nCores):
# NOTE(review): `i` is not defined anywhere in the visible code; the loop
# variable here is processIterator -- confirm which one was intended.
offset = o*i
myIn1 = in1[offset : min(nTotal, offset + o)]
myIn2 = in2[offset : min(nTotal, offset + o)]
myIn3 = in3[offset : min(nTotal, offset + o)]
myIn4 = in4[offset : min(nTotal, offset + o)]
# NOTE(review): this guard sits inside a method; when the module is imported
# rather than run as a script the condition is false and no process is created.
if __name__ == '__main__':
p = multiprocessing.Process(target = ParallelStuff, args = (myIn1, myIn2, myIn3, myIn4, anotherIn1, anotherIn2, anotherIn3, return_dict, processIterator))
p.start()
processes.append(p)
# From here on p is an integer index, no longer a Process object.
for p in range(len(processes)):
processes[p].join()
myOut1 = return_dict[p][0]
myOut2 = return_dict[p][1]
myOut3 = return_dict[p][2]
# Keep only non-zero entries.  This drops the zero padding of the np.zeros
# outputs, and also any genuine 0 values.
out1 = np.concatenate((out1, myOut1[np.where(myOut1 != 0)]))
out2 = np.concatenate((out2, myOut2[np.where(myOut2 != 0)]))
out3 = np.concatenate((out3, myOut3[np.where(myOut3 != 0)]))
When I run my program on my Linux machine, it does exactly what it's supposed to do: it distributes the work to all 8 cores, computes, concatenates the 3 results into the respective arrays, and exports them.
When I run my program on my Windows machine, the application's window freezes, the process becomes inactive, a new kernel automatically opens, and a new window appears.