I was trying to parallelize one of our more time-consuming tasks, but I couldn't get it working and can't really figure out why.
My code:
from scipy.spatial import distance
import numpy as np
from scipy import linalg
import os
from multiprocessing import Queue
from multiprocessing import Process
from typing import Tuple
num_cpu = os.cpu_count()
arr_1 = np.random.random((20000, 1000))
arr_2 = np.random.random((20000, 1000))
def cosine_similarity(a: np.ndarray, b: np.ndarray):
    assert len(a) == len(b)
    len_a = linalg.norm(a)
    len_b = linalg.norm(b)
    # Check if one vector is all zeros. Possible, not probable
    if len_a == 0 or len_b == 0:
        return 0
    return a.dot(b) / (len_a * len_b)

def cosine_distance(a: np.ndarray, b: np.ndarray):
    assert len(a) == len(b)
    return 1 - cosine_similarity(a, b)

def run_task(start: int, end: int, queue_out: Queue):
    print("Called with " + str(start) + " to " + str(end))
    if end > len(arr_1):
        end = len(arr_1)
    for i in range(start, end):
        min_dist = 2.0
        labeled_data = arr_1[i]
        for unlabeled_data in arr_2:
            min_dist = min(min_dist, cosine_distance(labeled_data, unlabeled_data))
        queue_out.put((i, min_dist))

step = len(arr_1) // num_cpu
t_s = []
queue = Queue()
for i in range(0, len(arr_1), step):
    print("Calling with " + str(i) + " to " + str(i + step))
    p = Process(target=run_task, args=(i, i + step, queue))
    p.start()
    t_s.append(p)
for p in t_s:
    p.join()
When I call queue.qsize(), it's 0. The code also finishes almost instantly, and I only get the "Calling with..." output, not the "Called with..." one.
If I manually call run_task(0, 1000, queue), it runs for several minutes (about 3), and queue.qsize() is 1000.
When I look at t_s, I get 20 lines like <Process name='Process-21' pid=17948 parent=4044 stopped exitcode=1> (with different process names and PIDs).
What am I doing wrong here?
Edit:
Tried with Pool.imap:
from multiprocessing import Pool

def run_task(index):
    print("Called index " + str(index))
    min_dist = 2.0
    labeled_data = arr_1[index]
    for unlabeled_data in arr_2:
        min_dist = min(min_dist, cosine_distance(labeled_data, unlabeled_data))
    return (index, min_dist)

a = 0
with Pool(processes=num_cpu) as pool:
    a = pool.imap(run_task, range(len(arr_1)))
    for i in a:
        print(f"showing the result as it is ready {i}")
Same behaviour: nothing seems to get called.
Okay, it turns out it had something to do with my imports, for whatever reason. After I changed Process to multiprocessing.Process, it worked, so I guess there was some namespace masking happening.
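For reference, here is a minimal sketch of that change, using module-qualified names so nothing else can shadow Process or Queue. It reuses the arr_1, step and run_task definitions from above, and it drains the queue before joining, since joining a process whose queue output has not yet been consumed can deadlock:

import multiprocessing

queue = multiprocessing.Queue()
t_s = []
for i in range(0, len(arr_1), step):
    # module-qualified Process/Queue cannot be shadowed by another import
    p = multiprocessing.Process(target=run_task, args=(i, i + step, queue))
    p.start()
    t_s.append(p)

# drain the queue before joining: run_task puts one item per row of arr_1,
# and join() can block forever while the queue's pipe buffer is still full
results = [queue.get() for _ in range(len(arr_1))]
for p in t_s:
    p.join()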
Related
I have a single-threaded function that I would like to parallelize. The code is a bit too complex to show here, but here is a model of its behaviour:
from time import sleep

R = list(range(4))

def compute(val):
    res = sum(val)
    if res % 2 == 0:          # first condition on res
        sleep(0.5)            # expensive operation
        if res % 4 == 0:      # second condition on res
            sleep(0.5)        # expensive operation
            return 2
        else:
            return 1
    else:
        return 1

def f_single(idx, val):
    if idx == len(R):
        return 1
    else:
        val = val + [R[idx]]
        ret = compute(val)
        if ret == 1:
            return f_single(idx + 1, val)
        else:
            # DISJUNCT
            return f_single(idx + 1, val) + f_single(idx + 1, val)
Basically, each recursion performs an update of the variable val, and I may need to perform a double call under certain conditions, depending on the result of compute(val), which is an expensive computation.
(Sidenote: this implementation does not scale up to large lists, since I hit a stack overflow quite rapidly; the multiprocessing effort is also an excuse to rewrite this code.)
Ideally, I would like to spawn a new process for computing the new call to f_single.
I started refactoring the code like this:
from time import time
from concurrent.futures import ProcessPoolExecutor

# list of indices
def process_idxs():
    return list(range(len(R)))

def are_two_path(idx, val):
    val = val + [R[idx]]
    ret = compute(val)
    if ret == 1:
        return False  # simulate "only one path"
    return True       # simulate "two paths available"
if __name__ == '__main__':
    ret = f_single(0, [])
    now = time()
    idxs = process_idxs()
    # start a job when the job queue is not full
    # when the job is complete, return the results (solvers with call stacks)
    # add the new results to the job queue
    # program terminates when the job queue is done
    # TODO: how to do this?
    with ProcessPoolExecutor(max_workers=12) as executor:
        for idx in idxs:
            f = executor.submit(are_two_path, idx, val)
            print(f.result())
    print("multi proc: ", time() - now, "s")
I don't know how to write my parallelization routine so that it produces the same return value as f_single (the last few lines are an attempt to do just that).
When looking into concurrent.futures and multiprocessing, I did not find an easy way to collect the result of the computation for the current index, conditionally spawn a new process, and proceed to the next recursion while passing the updated value of val.
I don't have any shared state except R, which is read-only, so it shouldn't be an issue here.
Do you have any suggestions or guides on how to convert f_single to a multiprocessing function?
A possible way to do it is the following:
import os
from time import time, sleep
from multiprocessing import Queue, Process
from queue import Empty

R = list(range(16))
NUMBER_OF_PROCESSES = 32
TIMEOUT = 1

def compute(val):
    res = sum(val)
    if res % 2 == 0:          # first condition on res
        sleep(1)              # expensive operation
        if res % 4 == 0:      # second condition on res
            sleep(1)          # expensive operation
            return 2
        else:
            return 1
    else:
        return 1

def are_two_path(idx, val):
    val = val + [R[idx]]
    ret = compute(val)
    if ret == 1:
        return False  # simulate "only one path"
    return True       # simulate "two paths available"

def worker(q, r, start_val, start_idx):
    """Worker spawned in a new process, in charge of
    going through the list iteratively.
    Sends a new job to the tasks queue if two paths are available.
    """
    val = start_val
    for idx in range(start_idx, len(R) + 1):
        if idx == len(R):
            r.put(1)
        else:
            result = are_two_path(idx, val)
            if result:
                q.put((idx + 1, val + [R[idx]]))
            val = val + [R[idx]]

def overseer():
    """Running in the initial process,
    this function creates the tasks and results queues,
    maintains the number of currently running processes
    and spawns new processes when there is enough room.
    """
    tasks = Queue()
    results = Queue()
    init_p = Process(target=worker,
                     args=(tasks, results, [], 0))
    init_p.start()
    working = 1
    completed_last_cycle = 0
    while True:
        completed_tasks = results.qsize()
        if working < NUMBER_OF_PROCESSES:
            # if there is enough room in the working queue,
            # spawn a new process and add it
            try:
                (idx, val) = tasks.get(timeout=5)
            except Empty:
                break
            p = Process(target=worker, args=(tasks, results, val, idx))
            p.start()
            working += 1
        if completed_tasks > completed_last_cycle:
            # if some processes terminated during the last cycle,
            # update the working counter
            working -= (completed_tasks - completed_last_cycle)
            completed_last_cycle = completed_tasks
    tasks.close()
    tasks.join_thread()
    results.close()
    results.join_thread()
    return results

def test():
    res = overseer()
    print("Number of case splits: ", res.qsize())

if __name__ == '__main__':
    now = time()
    test()
    print("multi proc: ", time() - now, "s")
I am trying to write some calculation code using threads in Python. My code is the following:
from threading import Thread
from time import sleep

class Fact(Thread):
    def __init__(self, start, end):
        Thread.__init__(self)
        self.start = start
        self.end = end

    def factorial(self):
        p = 1
        for i in range(self.start, self.end):
            p = p * i
        return p

    def run(self):
        global s
        s = s + self.factorial()
        sleep(2)
        print("output is ", s)

s = 0

def main():
    n = 6
    mid = n / 2
    obj1 = Fact(1, mid)
    obj1.start()
    obj2 = Fact(mid + 1, n)
    obj2.start()

if __name__ == "__main__":
    main()
What I want is to compute, iteratively, the factorial of a number. For example, if I enter 6 as an input it should do the following products:
product(1, 3) + product(4, 6)
The problem is that when I run this code:
obj1.start()
I got the following error:
TypeError: 'int' object is not callable
I cannot figure out where the mistake is.
I have used a multiprocessing Pool to get some performance benefit over my sequential approach. However, the result is just the opposite: the Pool takes more time than the sequential code:
import multiprocessing as mp
import datetime

class A:
    def __init__(self):
        self.result_list = []

    # parallel processing function
    def foo_pool(self, data):
        for d in data:
            d[0] = d[0] * 10
        return data

    # sequential function
    def foo_seq(self, data):
        data[0] = data[0] * 10
        return data

    def log_result(self, result):
        # This is called whenever foo_pool(i) returns a result.
        self.result_list.extend([result])

    def apply_async_with_callback(self):
        pool = mp.Pool(8)

        # Data Creation
        lst = []
        for i in range(100000):
            lst.append([i, i + 1, i + 2])
        print('length of data ', len(lst))

        dtStart = datetime.datetime.now()
        print('start time:', str(datetime.datetime.now()))

        # Multiprocessing takes 2 secs
        for data in self.chunks(lst, 1000):
            pool.apply_async(self.foo_pool, args=(data,),
                             callback=self.log_result)

        # Sequential. It is 10x faster than the pool
        # for d in lst:
        #     self.result_list.extend([self.foo_seq(d)])

        pool.close()
        pool.join()

        print('output data length:', len(self.result_list))
        dtEnd = datetime.datetime.now()
        print('end time:', str(datetime.datetime.now()))
        print('Time taken:', str(dtEnd - dtStart))

    # Divide big data into chunks
    def chunks(self, data, n):
        for i in range(0, len(data), n):
            res = data[i:i + n]
            yield res

if __name__ == '__main__':
    a = A()
    a.apply_async_with_callback()
In the Python code above, in apply_async_with_callback(), if you uncomment the sequential code and run it, it is about 10 times faster than the multiprocessing Pool code.
Can someone help me understand what I am doing wrong?
Edit:
After applying the code provided in Why is multiprocessed code in given code taking more time than usual sequential execution?, the sequential version is now only 2 times faster than the parallel processing code. Updated code below:
import multiprocessing as mp
import datetime

class A:
    def __init__(self):
        self.result_list = []

    # parallel processing function
    def foo_pool(self, data):
        for d in data:
            d[0] = d[0] * float(10) + 10 * (float(d[0]) / 100)
        return data

    def log_result(self, result):
        # This is called whenever foo_pool(i) returns a result.
        self.result_list.extend([result])

    def flatten(self, ll):
        lst = []
        for l in ll:
            lst.extend(l)
        return lst

    def square(self, x):
        return x * x

    def squareChunk(self, chunk):
        return self.foo_pool(chunk)  # [self.foo_pool(x) for x in chunk]

    def apply_async_with_callback(self):
        # Data Creation
        lst = []
        for i in range(1000000):
            lst.append([i, i + 1, i + 2])
        print('length of data ', len(lst))

        chunked = self.chunks(lst, 10000)  # split original list in decent sized chunks
        pool = mp.Pool(2)

        dtStart = datetime.datetime.now()
        print('start time:', str(datetime.datetime.now()))

        results = self.flatten(pool.map(self.squareChunk, chunked))

        pool.close()
        pool.join()

        print('output data length:', len(results))
        dtEnd = datetime.datetime.now()
        print('end time:', str(datetime.datetime.now()))
        print('multi proc Time taken:', str(dtEnd - dtStart))

    def chunks(self, l, n):
        n = max(1, n)
        return (l[i:i + n] for i in range(0, len(l), n))

if __name__ == '__main__':
    a = A()
    a.apply_async_with_callback()
I can see the difference from using Pool.map instead of Pool.apply_async: the code is faster now. Earlier it was 10 times slower than sequential, now it is only 2 times slower. But it is still slower...
Is this how multiprocessing behaves? Then what is the point of using multiprocessing? Or am I still doing something wrong?
I've been playing with the multiprocessing module to gain a better understanding of the implementation side. The code below does the following, first serially and then in parallel:
A set of random numbers are generated. Each number is used as a constant in an exponential function. The goal is to find, for each random number, a scalar needed such that the integral of the exponential function is 20.
The code below seems to work. However, once the value of num is set to 500, the code will just hang and I have no idea why. For what it's worth, this is on a Windows machine with everything running in Spyder.
from scipy import optimize as op
from scipy.integrate import trapz as intg
import numpy as np
import multiprocessing as mp
import random
import timeit
import time

def to_solve(a=None, x=None, y=None):
    return intg(a*y, x) - 20

def worker(lst, x, out_q):
    ans = np.zeros(shape=(len(lst), 2))
    for i, a in enumerate(lst):
        y = func(a=a, x=x)
        ans[i, 0] = a
        ans[i, 1] = op.newton(func=to_solve, x0=1, args=(x, y))
    out_q.put(ans)

def func(a=None, x=None):
    return 1 - np.exp(-a*x)

def main_p(nums):
    start = timeit.default_timer()
    x = np.linspace(0, 100)
    procs = []
    out_q = mp.Queue()
    num_procs = 2
    step = int(len(nums)/num_procs)
    first = 0
    last = 0
    for i in range(num_procs):
        first = last
        last = first + step
        if i == num_procs-1:
            out = nums[first:]
        else:
            out = nums[first:last]
        p = mp.Process(target=worker, args=(out, x, out_q))
        procs.append(p)
        p.start()
    for p in procs:
        p.join()
    for i in range(len(procs)):
        if i == 0:
            results = out_q.get()
        else:
            results = np.vstack((results, out_q.get()))
    results = results[results[:, 0].argsort()]
    print timeit.default_timer() - start
    return results

def main_s(nums):
    start = timeit.default_timer()
    results = np.zeros(shape=(len(nums), 2))
    x = np.linspace(0, 100)
    for i, a in enumerate(nums):
        results[i, 0] = a
        y = func(a=a, x=x)
        results[i, 1] = op.newton(func=to_solve, x0=1, args=(x, y))
    results = results[results[:, 0].argsort()]
    print timeit.default_timer() - start
    return results

if __name__ == '__main__':
    num = 400
    nums = np.random.rand(1, num)
    nums = nums.tolist()[0]
    a = main_s(nums)
    b = main_p(nums)
The object you are putting on the Queue is too large. The worker process will not terminate until the buffer feeding the Queue is empty, which in turn will not happen until the parent process reads from the queue, which happens after the join(), which waits -> deadlock.
Here is the description:
"An example which will deadlock is the following."
It follows that the problem goes away if you move the loop

for p in procs:
    p.join()

after the for i in range(len(procs)): loop.
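Concretely, a sketch of the tail of main_p from the question with that reordering (same procs, out_q, results and np as in the question):

# inside main_p, after the processes have been started

# read every worker's output first, so the queue's pipe buffer drains
# and the workers can flush their results and exit
for i in range(len(procs)):
    if i == 0:
        results = out_q.get()
    else:
        results = np.vstack((results, out_q.get()))

# only join once the queue has been emptied; join() now returns normally
for p in procs:
    p.join()

results = results[results[:, 0].argsort()]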
I would like to construct an algorithm that whittles a queue down to one element but may (temporarily) empty the queue. I tried to implement this with a multi-number GCD algorithm, but it isn't working at all. I suspect the problem is with the while loop, but I am not sure how to fix it.
Any help would be appreciated. I'm new to programming, so apologies if this is too basic a question.
import threading
import Queue
import time, random

#Worker Class
class Worker(threading.Thread):
    def __init__(self, queue, flag):
        self.__queue = queue
        self.__flag = flag
        threading.Thread.__init__(self)

    def run(self):
        while (queue.qsize() > 1):
            a = self.__queue.get()
            b = self.__queue.get()
            #Worker task
            g = gcd(a, b)
            if g == 1:
                flag = False
            queue.put(g)

    def quit(self):
        return

#gcd algorithm
def gcd(a, b):
    while b:
        a, b = b, a%b
    return a

#kill all workers
def killall(workers):
    for worker in workers:
        worker.quit()

#queue starts empty
queue = Queue.Queue(0)

#input thread number
WORKERS = int(raw_input("Type in the number of threads (int > 0):"))

#input list length
k = int(raw_input("Length of integer list (int > 0):"))

#random list of integers generated
list = [random.randint(1,1000) for i in range(k)]
while (len(list)):
    queue.put(list.pop())

#list of workers
wrkrs = []

#flag to kill if 1 is found
flag = True

#Master function
for i in range(WORKERS):
    w = Worker(queue, flag)
    wrkrs.append(w)
    w.start()

#kill process if 1 is found
if flag == False:
    killall(w)
    print "gcd is: 1"

#if this worked, answer would be the only remaining element
if queue.qsize() == 1:
    print "gcd is:", queue.get()