I have a python iterator that solves a time-consuming task each iteration. It would be nice if the return values of the iterator could be precomputed in the background, such that when the iterator is called, the result can be yielded right away.
eg
import numpy as np
def sample_iterator():
while True:
x = np.random.rand(int(1e8)).mean()
yield x
Here is a iterator (precomputing_iterator) that takes an iterator (sample_iterator) as input. precomputing_iterator precomputes the return values of sample_iterator. When precomputing_iterator is created the precomputation of return values of sample_iterator is started right away. The return values are saved on a multiprocessing.Queue object. If there are values on the queue, precomputing_iterator can yield them right away.
from multiprocessing import Process, Queue
import numpy as np
import time
def sample_iterator():
while True:
x = np.random.rand(int(1e8)).mean()
yield x
def precomputing_iterator(iterator, maxsize = 5):
def enqueue(q):
while True:
q.put(iterator.next())
q = Queue(maxsize = maxsize)
p = Process(target=enqueue, args=(q,))
p.start()
while True:
yield q.get()
i1 = sample_iterator()
i2 = precomputing_iterator(i1)
t = time.time()
i2.next()
print "execution time:", time.time() - t
time.sleep(3)
t = time.time()
i2.next()
print "execution time:", time.time() - t
Here for me the first execution time is 1.4 seconds (queue is empty. No return values precomputed). The second execution time is 0.00031 seconds (the precomputed result is just returned)
Related
How come the multi function that uses a multiprocessing pool to segment and process data on multiple "processes" is slower (8 seconds) than just calling the map function (6 seconds)?
from multiprocessing import Pool
import timeit
def timer(function):
def new_function():
start_time = timeit.default_timer()
function()
elapsed = timeit.default_timer() - start_time
print('Function "{name}" took {time} seconds to complete.'.format(name=function.__name__, time=elapsed))
return new_function
def cube(n):
return n*n*n
nums = range(20000000)
if __name__ == '__main__':
#timer
def multi():
pool = Pool()
res = pool.map(cube,nums)
pool.close()
pool.join()
#timer
def test():
a = map(cube,nums)
multi()
test()
Because all the dispatching logic behind pool.map creates an overhead.
Multiprocessing always create overhead of some sort, which heavily depends on its underlying implementation.
You are running a lot of very simple tasks here, hence the overhead caused by the threading logic is not compensated by the gain of parallel execution. Try to do the same test with a lesser number of more cpu-intensive task, you should see different results.
Exemple
See this modified test. Here, we have a silly cubes function that computes n^3 1000 times.
from multiprocessing import Pool
import timeit
def timer(function):
def new_function():
start_time = timeit.default_timer()
function()
elapsed = timeit.default_timer() - start_time
print('Function "{name}" took {time} seconds to complete.'.format(name=function.__name__, time=elapsed))
return new_function
def cubes(n):
for _ in range(999):
n * n * n
return n * n * n
nums = range(20000)
if __name__ == '__main__':
#timer
def multi():
pool = Pool()
res = pool.map(cubes, nums)
pool.close()
pool.join()
#timer
def test():
# On Python 3, simply calling map() returns an iterator
# tuple() collects its values for timing
a = tuple(map(cubes, nums))
multi()
test()
We now see multiprocessing is improving our timing:
Function "multi" took 0.6272498000000001 seconds to complete.
Function "test" took 2.130454 seconds to complete.
I have a mono threaded function that I would like to parallelize. The code is a bit too complex to show you, but here is a modelization of its behaviours
R = list(range(4))
def compute(val):
res = sum(val)
if res%2 == 0: #first condition on res
sleep(0.5) #expansive operation
if res%4 == 0: #second condition on res
sleep(0.5) #expansive operation
return 2
else:
return 1
else:
return 1
def f_single(idx, val):
if idx == len(R):
return 1
else:
val = val + [R[idx]]
ret = compute(val)
if ret == 1:
return f_single(idx+1, val)
else:
#DISJUNCT
return f_single(idx+1, val) + f_single(idx+1, val)
Basically, each recursion performs an update of the variable val, and I may need to perform a double call on certain conditions depending on the result of compute(val), which is an expansive computation.
(Sidenote: this implementation does not scale up to large lists, since I will StackOverflow quite rapidly; the multiprocessing effort is also an excuse to rewrite this code).
Ideally, I would like to spawn a new process for computing the new call to f_single.
I started refactoring the code like this:
# list of indices
def process_idxs():
return list(range(len(R)))
def are_two_path(idx, val):
val = val + [R[idx]]
ret = compute(val)
if ret == 1:
return False #simulate a "only one path"
return True #simulate a "two path available"
if __name__ == '__main__':
ret = f_single(0,[])
now = time()
idxs = process_idxs()
# start a job when the job queue is not full
# when the job is complete, return the results (solvers with call stacks)
# add the new results to the job queue
# program terminates when the job queue is done
# TODO: how to do this?
with ProcessPoolExecutor(max_workers=12) as executor:
for idx in idxs:
f = executor.submit(are_two_path, idx, val)
print(f.result())
print("multi proc: ", time()-now, "s")
I don't know how to write my parallelization routine to obtain the same return value than f_single (the last few lines are a tentative to do just that).
When looking in concurrent.future and multiprocessing, I did not found an easy way to collect the results of computation for the current index, conditionally spawn the process and perform to the next recursion, while passing the updated value of val.
I don't have any shared state, except R which is read-only so it shouldn't be an issue here.
Do you have any suggestions or guides on how to convert f_single to a multiprocessing function?
A possible way to do it is to do the following:
import os
from time import time, sleep
from multiprocessing import Queue, Process
from queue import Empty
R = list(range(16))
NUMBER_OF_PROCESSES = 32
TIMEOUT = 1
def compute(val):
res = sum(val)
if res%2 == 0: #first condition on res
sleep(1) #expansive operation
if res%4 == 0: #second condition on res
sleep(1) #expansive operation
return 2
else:
return 1
else:
return 1
def are_two_path(idx, val):
val = val + [R[idx]]
ret = compute(val)
if ret == 1:
return False #simulate a "only one path"
return True #simulate a "two path available"
def worker(q, r, start_val, start_idx):
"""Worker spawned in a new process, in charge of
going through the list iteratively.
Sends a new job to the tasks queue if two path are available
"""
val = start_val
for idx in range(start_idx, len(R)+1):
if idx == len(R):
r.put(1)
else:
result = are_two_path(idx, val)
if result:
q.put((idx+1, val+[R[idx]]))
val = val + [R[idx]]
def overseer():
"""Running in the initial process,
this function create tasks and results queues,
maintain the number of current running processes
and spawn new processes when there is enough room
"""
tasks = Queue()
results = Queue()
init_p = Process(target=worker,
args=(tasks, results, [], 0))
init_p.start()
working = 1
completed_last_cycle = 0
while True:
completed_tasks = results.qsize()
if working < NUMBER_OF_PROCESSES:
# if there is enough room in the working queue,
# spawn a new process and add it
try:
(idx, val) = tasks.get(timeout=5)
except Empty:
break
p = Process(target=worker, args=(tasks, results, val, idx))
p.start()
working += 1
if completed_tasks > completed_last_cycle:
# if some processes terminated during last cycle,
# update the working counter
working -= (completed_tasks - completed_last_cycle)
completed_last_cycle = completed_tasks
tasks.close()
tasks.join_thread()
results.close()
results.join_thread()
return results
def test():
res = overseer()
print("Number of case splits: ", res.qsize())
if __name__ == '__main__':
now = time()
test()
print("multi proc: ", time()-now, "s")
So far, I only used Multiprocessing and Multi-threading on functions that return a result at the end. I know multiprocessing.Queue and multiprocessing.Queue.get() but I just don't understand how I could apply this to a data_loader..
I struggle with the following task:
def data_loader():
for _ in range(10**6):
#calculates for some seconds
yield result
for data in data_loader():
train_AI(data)
#Here an AI is being trained for another some seconds
So my question is: Is there any easy way to have my existing data_loader calculate (pre-buffer) its next yield while the AI is being trained on the GPU?
Or would I have to completely restructure this, with an external iterator that calls an inner smaller data_loader that returns a single batch each time it's called?
Yeah, you can use Python's Queue:
from multiprocessing import Process, Queue
from time import sleep
FINISHED_LOADING_DATA = 'LAST ONE' # just make sure it's not something that can be returned by some_function()
def some_function():
print('getting data')
sleep(0.5)
return 'some_result'
def train_AI(x):
print('training AI')
sleep(2)
q = Queue()
final_results = []
def data_loader(q):
for _ in range(10):
result = some_function()
q.put(result)
q.put(FINISHED_LOADING_DATA)
def train_if_data_available():
while True:
data = q.get()
if data == FINISHED_LOADING_DATA:
return 'DONE'
train_AI(data)
t = Process(target=data_loader, args=(q,))
t.daemon = True
t.start()
train_if_data_available()
I have used multiprocessing Pool to get some performance benefit over my sequential approach. However result is just opposite and Pool takes more time than sequential:
import multiprocessing as mp
import datetime
class A:
def __init__(self):
self.result_list = []
# parallel processing function
def foo_pool(self, data):
for d in data:
d[0] = d[0] * 10
return data
# sequential function
def foo_seq(self, data):
data[0] = data[0] * 10
return data
def log_result(self, result):
# This is called whenever foo_pool(i) returns a result.
self.result_list.extend([result])
def apply_async_with_callback(self):
pool = mp.Pool(8)
# Data Creation
lst = []
for i in range(100000):
lst.append([i, i + 1, i + 2])
print('length of data ', len(lst))
dtStart = datetime.datetime.now()
print('start time:', str(datetime.datetime.now()))
# Multiprocessing takes 2 secs
for data in self.chunks(lst, 1000):
pool.apply_async(self.foo_pool, args=(data,),
callback=self.log_result)
# Sequential. It is 10x faster than pool
# for d in lst:
# self.result_list.extend([self.foo_seq(d)])
pool.close()
pool.join()
print('output data length:', len(self.result_list))
dtEnd = datetime.datetime.now()
print('end time:', str(datetime.datetime.now()))
print('Time taken:', str(dtEnd - dtStart))
# Divide big data into chunks
def chunks(self, data, n):
for i in range(0, len(data), n):
res = data[i:i + n]
yield res
if __name__ == '__main__':
a = A()
a.apply_async_with_callback()
In above python code, in apply_async_with_callback(). If you un-comment the sequential code and run, result would get 10 times faster then multiprocessing Pool code.
Can someone help me understand, what is the wrong thing i am doing?
Edit:
After applying the code provided in Why is multiprocessed code in given code taking more time than usual sequential execution?
sequential is now only 2 times faster than parallel processing code. Updated code below:
import multiprocessing as mp
import datetime
class A:
def __init__(self):
self.result_list = []
# parallel processing function
def foo_pool(self, data):
for d in data:
d[0] = d[0] * float(10) + 10 * (float(d[0]) / 100)
return data
def log_result(self, result):
# This is called whenever foo_pool(i) returns a result.
self.result_list.extend([result])
def flatten(self, ll):
lst = []
for l in ll:
lst.extend(l)
return lst
def square(self, x):
return x * x
def squareChunk(self, chunk):
return self.foo_pool(chunk) #[self.foo_pool(x) for x in chunk]
def apply_async_with_callback(self):
# Data Creation
lst = []
for i in range(1000000):
lst.append([i, i + 1, i + 2])
print('length of data ', len(lst))
chunked = self.chunks(lst, 10000) # split original list in decent sized chunks
pool = mp.Pool(2)
dtStart = datetime.datetime.now()
print('start time:', str(datetime.datetime.now()))
results = self.flatten(pool.map(self.squareChunk, chunked))
pool.close()
pool.join()
print('output data length:', len(results))
dtEnd = datetime.datetime.now()
print('end time:', str(datetime.datetime.now()))
print('multi proc Time taken:', str(dtEnd - dtStart))
def chunks(self, l, n):
n = max(1, n)
return (l[i:i + n] for i in range(0, len(l), n))
if __name__ == '__main__':
a = A()
a.apply_async_with_callback()
I can see the difference of using Pool.map instead of Pool.apply_async. Code is faster now. Earlier it was 10 times slower than sequential, now it is 2 times slower. But... slower....
This is how multiprocessing behaves? Then what is the point of using multiprocessing? Or am i still doing something wrong?
I'm been playing with the multiprocessing module to gain a better understanding from the implementation side. The code below does the following serial and then parallel way:
A set of random numbers are generated. Each number is used as a constant in an exponential function. The goal is to find, for each random number, a scalar needed such that the integral of the exponential function is 20.
The code below seems to work. However, once the value of num is set to 500, the code will just hang and I have no idea why. For what it's worth, this is on a Windows machine with everything running in Spyder.
from scipy import optimize as op
from scipy.integrate import trapz as intg
import numpy as np
import multiprocessing as mp
import random
import timeit
import time
def to_solve(a=None, x=None, y=None):
return intg(a*y, x)-20
def worker(lst, x, out_q):
ans = np.zeros(shape=(len(lst), 2))
for i, a in enumerate(lst):
y = func(a=a, x=x)
ans[i,0] = a
ans[i,1] = op.newton(func=to_solve, x0=1, args=(x, y))
out_q.put(ans)
def func(a=None, x=None):
return 1-np.exp(-a*x)
def main_p(nums):
start = timeit.default_timer()
x = np.linspace(0,100)
procs = []
out_q = mp.Queue()
num_procs = 2
step = int(len(nums)/num_procs)
first = 0
last = 0
for i in range(num_procs):
first = last
last = first+step
if i == num_procs-1:
out = nums[first:]
else:
out = nums[first:last]
p = mp.Process(target=worker, args=(out, x, out_q))
procs.append(p)
p.start()
for p in procs:
p.join()
for i in range(len(procs)):
if i == 0:
results = out_q.get()
else:
results = np.vstack((results, out_q.get()))
results = results[results[:,0].argsort()]
print timeit.default_timer() - start
return results
def main_s(nums):
start = timeit.default_timer()
results = np.zeros(shape=(len(nums),2))
x = np.linspace(0,100)
for i, a in enumerate(nums):
results[i,0] = a
y = func(a=a, x=x)
results[i,1] = op.newton(func=to_solve, x0=1, args=(x,y))
results = results[results[:,0].argsort()]
print timeit.default_timer() - start
return results
if __name__ == '__main__':
num = 400
nums = np.random.rand(1,num)
nums = nums.tolist()[0]
a = main_s(nums)
b = main_p(nums)
The object you are putting on the Queue is too large. Th worker process will not terminate until the buffer feeding the Queue is empty, which in turn will not happen until the parent process reads from the queue, which will happen after the join(), which waits -> Deadlock.
Here ist the description:
"An example which will deadlock is the following."
It follows that the problem goes away if you move the loop
for p in procs:
p.join()
after the for i in range(len(procs)): - loop.