Python multiprocessing pool is slower than sequential - python

I have used a multiprocessing Pool to get some performance benefit over my sequential approach. However, the result is just the opposite: the Pool takes more time than the sequential version:
import multiprocessing as mp
import datetime
class A:
    """Benchmark harness comparing ``multiprocessing.Pool`` with a plain loop.

    Rows returned by the pool workers are accumulated in ``self.result_list``.
    """

    def __init__(self):
        # Rows collected from finished work units (filled by ``log_result``).
        self.result_list = []

    def foo_pool(self, data):
        """Parallel work unit: multiply the first field of every row by 10.

        ``data`` is one chunk (a list of rows). The rows are modified in place
        and the same chunk object is returned.
        """
        for row in data:
            row[0] = row[0] * 10
        return data

    def foo_seq(self, data):
        """Sequential work unit: multiply the first field of ONE row by 10."""
        data[0] = data[0] * 10
        return data

    def log_result(self, result):
        """Pool callback: store the rows of one finished chunk.

        ``result`` is what ``foo_pool`` returned, i.e. a list of rows. The rows
        are added individually so that ``len(self.result_list)`` counts rows,
        matching the sequential variant (the original appended the chunk as a
        single element, so the reported length was the number of chunks).
        """
        self.result_list.extend(result)

    def apply_async_with_callback(self):
        """Process 100000 rows in chunks of 1000 on an 8-process pool.

        Prints the input size, start/end timestamps, the number of collected
        rows and the elapsed wall-clock time.
        """
        pool = mp.Pool(8)
        try:
            # Data Creation
            lst = [[i, i + 1, i + 2] for i in range(100000)]
            print('length of data ', len(lst))
            dtStart = datetime.datetime.now()
            print('start time:', str(dtStart))
            # Multiprocessing takes 2 secs
            for data in self.chunks(lst, 1000):
                pool.apply_async(self.foo_pool, args=(data,),
                                 callback=self.log_result)
            # Sequential. It is 10x faster than pool
            # for d in lst:
            #     self.result_list.extend([self.foo_seq(d)])
        finally:
            # Always release the worker processes, even if submission failed.
            pool.close()
            pool.join()
        print('output data length:', len(self.result_list))
        dtEnd = datetime.datetime.now()
        print('end time:', str(dtEnd))
        print('Time taken:', str(dtEnd - dtStart))

    def chunks(self, data, n):
        """Yield consecutive slices of ``data`` holding at most ``n`` items.

        ``n`` is clamped to at least 1 so a zero step never reaches ``range``.
        """
        n = max(1, n)
        for i in range(0, len(data), n):
            yield data[i:i + n]
# Entry-point guard: required so worker processes that re-import this module
# do not start the benchmark again.
if __name__ == '__main__':
    a = A()
    a.apply_async_with_callback()
In the Python code above, if you un-comment the sequential code in apply_async_with_callback() and run it, the result is about 10 times faster than the multiprocessing Pool code.
Can someone help me understand what I am doing wrong?
Edit:
After applying the code provided in Why is multiprocessed code in given code taking more time than usual sequential execution?
sequential is now only 2 times faster than parallel processing code. Updated code below:
import multiprocessing as mp
import datetime
class A:
    """Benchmark harness timing ``Pool.map`` over pre-chunked row data."""

    def __init__(self):
        # Rows collected through ``log_result`` (unused by the ``map`` path).
        self.result_list = []

    def foo_pool(self, data):
        """Parallel work unit: rescale the first field of every row.

        Each row's first field ``v`` becomes ``v * 10.0 + 10 * (v / 100)``.
        Rows are modified in place and the same chunk object is returned.
        """
        for row in data:
            row[0] = row[0] * float(10) + 10 * (float(row[0]) / 100)
        return data

    def log_result(self, result):
        """Callback helper: store the rows of one finished chunk.

        Rows are added individually so ``len(self.result_list)`` counts rows
        (the original appended the whole chunk as a single element).
        """
        self.result_list.extend(result)

    def flatten(self, ll):
        """Concatenate a sequence of lists into one flat list."""
        return [item for sub in ll for item in sub]

    def square(self, x):
        """Return ``x`` multiplied by itself."""
        return x * x

    def squareChunk(self, chunk):
        """Process one chunk; thin wrapper that delegates to ``foo_pool``."""
        return self.foo_pool(chunk)

    def apply_async_with_callback(self):
        """Process 1000000 rows in chunks of 10000 with ``Pool.map`` on 2 workers.

        Prints the input size, start/end timestamps, the number of result rows
        and the elapsed wall-clock time.
        """
        # Data Creation
        lst = [[i, i + 1, i + 2] for i in range(1000000)]
        print('length of data ', len(lst))
        chunked = self.chunks(lst, 10000)  # split original list in decent sized chunks
        pool = mp.Pool(2)
        try:
            dtStart = datetime.datetime.now()
            print('start time:', str(dtStart))
            results = self.flatten(pool.map(self.squareChunk, chunked))
        finally:
            # Always release the worker processes, even if ``map`` raised.
            pool.close()
            pool.join()
        print('output data length:', len(results))
        dtEnd = datetime.datetime.now()
        print('end time:', str(dtEnd))
        print('multi proc Time taken:', str(dtEnd - dtStart))

    def chunks(self, l, n):
        """Return a generator of consecutive slices of ``l`` of at most ``n`` items."""
        n = max(1, n)  # a zero or negative step would break range()
        return (l[i:i + n] for i in range(0, len(l), n))
# Entry-point guard: required so worker processes that re-import this module
# do not start the benchmark again.
if __name__ == '__main__':
    a = A()
    a.apply_async_with_callback()
I can see the difference of using Pool.map instead of Pool.apply_async. Code is faster now. Earlier it was 10 times slower than sequential, now it is 2 times slower. But... slower....
Is this how multiprocessing is supposed to behave? If so, what is the point of using multiprocessing? Or am I still doing something wrong?

Related

Python Process spawning within for loop

I was trying to parallelize one of our more time consuming tasks, but couldn't get it working, and can't really figure out why.
My code:
from scipy.spatial import distance
import numpy as np
from scipy import linalg
import os
from multiprocessing import Queue
from multiprocessing import Process
from typing import Tuple
# Number of CPUs reported by the OS; used below to size the per-process slices.
# NOTE(review): os.cpu_count() may return None on some platforms — confirm
# callers cope with that.
num_cpu = os.cpu_count()
# Two 20000 x 1000 matrices of random floats. run_task treats each row of
# arr_1 as "labeled" data and compares it against every row of arr_2.
arr_1 = np.random.random((20000, 1000))
arr_2 = np.random.random((20000, 1000))
def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
    """Return the cosine similarity of vectors ``a`` and ``b``.

    The result is ``a.dot(b) / (|a| * |b|)``. If either vector has zero norm
    the similarity is defined here as ``0.0`` to avoid dividing by zero.
    """
    assert len(a) == len(b)
    len_a = linalg.norm(a)
    len_b = linalg.norm(b)
    # Check if one vector is all zeros. Possible, not probable
    if len_a == 0 or len_b == 0:
        return 0.0
    return a.dot(b) / (len_a * len_b)
def cosine_distance(a: np.ndarray, b: np.ndarray) -> float:
    """Return the cosine distance ``1 - cosine_similarity(a, b)``."""
    assert len(a) == len(b)
    return 1 - cosine_similarity(a, b)
def run_task(start: int, end: int, queue_out: Queue):
    """Compute, for rows ``start..end-1`` of ``arr_1``, the minimum cosine
    distance to any row of ``arr_2``.

    ``end`` is clamped to ``len(arr_1)``. For every processed row index ``i``
    one ``(i, min_dist)`` tuple is put on ``queue_out``.
    """
    print("Called with " + str(start) + " to " + str(end))
    end = min(end, len(arr_1))
    for i in range(start, end):
        # 2.0 is the start value the running minimum is compared against.
        min_dist = 2.0
        labeled_data = arr_1[i]
        for unlabeled_data in arr_2:
            min_dist = min(min_dist, cosine_distance(labeled_data, unlabeled_data))
        queue_out.put((i, min_dist))
# Start the workers only when run as a script. With the "spawn" start method
# (the default on Windows) every child re-imports this module; without this
# guard each child would try to start its own set of processes and fail.
if __name__ == "__main__":
    from queue import Empty as QueueEmpty

    # At least one row per slice, and tolerate os.cpu_count() returning None.
    step = max(1, len(arr_1) // (num_cpu or 1))
    t_s = []
    queue = Queue()
    for i in range(0, len(arr_1), step):
        print("Calling with " + str(i) + " to " + str(i + step))
        p = Process(target=run_task, args=(i, i + step, queue))
        p.start()
        t_s.append(p)

    # Drain the queue BEFORE joining: a child that still has buffered queue
    # items cannot exit, so join()-then-get() can deadlock.
    results = []
    while len(results) < len(arr_1):
        try:
            results.append(queue.get(timeout=1.0))
        except QueueEmpty:
            if not any(p.is_alive() for p in t_s):
                break  # every worker has exited; nothing more will arrive

    for p in t_s:
        p.join()
When I call queue.qsize(), it's 0. The code also basically finishes instantly, and I get only the "Calling with..." output, but not the "Called with" output.
If I manually call run_task(0, 1000, queue), it runs for several minutes (about 3), and queue.qsize() is 1000.
When I look at t_s, it gives me 20 lines of <Process name='Process-21' pid=17948 parent=4044 stopped exitcode=1>, (with different process names and pid's).
What am I doing wrong here?
Edit:
tried with Pool.map:
def run_task(index):
    """Return ``(index, min_dist)`` for row ``index`` of ``arr_1``.

    ``min_dist`` is the smallest cosine distance between that row and any row
    of ``arr_2``.
    """
    print("Called index " + str(index))
    min_dist = 2.0
    labeled_data = arr_1[index]
    for unlabeled_data in arr_2:
        min_dist = min(min_dist, cosine_distance(labeled_data, unlabeled_data))
    # The original built this tuple but never returned it, so every task
    # yielded None.
    return (index, min_dist)
# Run only as a script: worker processes re-import this module under the
# "spawn" start method and must not create pools of their own.
if __name__ == "__main__":
    from multiprocessing import Pool  # not among the imports shown above

    with Pool(processes=num_cpu) as pool:
        a = pool.imap(run_task, range(len(arr_1)))
        # Consume the results inside the ``with`` block: leaving it terminates
        # the pool, after which pending results would never arrive.
        for i in a:
            print(f"showing the result as it is ready {i}")
Same behaviour, nothing seems to be called
Okay, turns out it had something to do with my imports for whatever reason. After I changed it from Process to multiprocessing.Process, it worked, so I guess there was some namespace masking happening.

Python iterator that precomputes return (enqueue)

I have a python iterator that solves a time-consuming task each iteration. It would be nice if the return values of the iterator could be precomputed in the background, such that when the iterator is called, the result can be yielded right away.
eg
import numpy as np
def sample_iterator(n=int(1e8)):
    """Yield, forever, the mean of ``n`` freshly drawn uniform random samples.

    ``n`` defaults to 1e8 so each value is deliberately expensive to produce;
    pass a smaller ``n`` for quick experiments.
    """
    while True:
        yield np.random.rand(n).mean()
Here is a iterator (precomputing_iterator) that takes an iterator (sample_iterator) as input. precomputing_iterator precomputes the return values of sample_iterator. When precomputing_iterator is created the precomputation of return values of sample_iterator is started right away. The return values are saved on a multiprocessing.Queue object. If there are values on the queue, precomputing_iterator can yield them right away.
from multiprocessing import Process, Queue
import numpy as np
import time
def sample_iterator(n=int(1e8)):
    """Yield, forever, the mean of ``n`` freshly drawn uniform random samples.

    ``n`` defaults to 1e8 so each value is deliberately expensive to produce;
    pass a smaller ``n`` for quick experiments.
    """
    while True:
        yield np.random.rand(n).mean()
def precomputing_iterator(iterator, maxsize=5):
    """Yield the items of ``iterator``, computing them ahead in a child process.

    A child process pulls items from ``iterator`` and puts them on a bounded
    queue holding at most ``maxsize`` entries; this generator yields them in
    order. Because this is a generator function, the child is started on the
    first ``next()`` call, not when the generator object is created.

    Differences from the original:
    * uses ``for ... in iterator`` instead of the Python-2-only ``.next()``;
    * the end of ``iterator`` is signalled, so this generator finishes instead
      of blocking forever;
    * the child is a daemon and is terminated when the generator is closed or
      exhausted, so it is not leaked.

    NOTE: the child must inherit ``iterator`` and the nested ``enqueue``
    function, which requires the "fork" start method; it is selected
    explicitly where the platform offers it.
    """
    import multiprocessing

    if "fork" in multiprocessing.get_all_start_methods():
        ctx = multiprocessing.get_context("fork")
    else:
        ctx = multiprocessing.get_context()

    def enqueue(q):
        # Items are wrapped as (True, value); (False, None) marks exhaustion.
        # A tagged tuple is used because an identity-based sentinel object
        # would not survive pickling across the process boundary.
        for item in iterator:
            q.put((True, item))
        q.put((False, None))

    q = ctx.Queue(maxsize=maxsize)
    p = ctx.Process(target=enqueue, args=(q,))
    p.daemon = True
    p.start()
    try:
        while True:
            has_item, item = q.get()
            if not has_item:
                return
            yield item
    finally:
        # Runs on exhaustion and on generator.close(): stop the producer.
        if p.is_alive():
            p.terminate()
        p.join()
# Demo: time two consecutive reads from the precomputing iterator.
# Guarded so that child processes re-importing this module do not rerun it.
if __name__ == "__main__":
    i1 = sample_iterator()
    i2 = precomputing_iterator(i1)
    t = time.time()
    next(i2)  # built-in next() works on Python 2.6+ and 3; ``.next()`` is Python 2 only
    print("execution time:", time.time() - t)
    # Pause between the two reads (3 seconds).
    time.sleep(3)
    t = time.time()
    next(i2)
    print("execution time:", time.time() - t)
    i2.close()
Here for me the first execution time is 1.4 seconds (queue is empty. No return values precomputed). The second execution time is 0.00031 seconds (the precomputed result is just returned)

timing a python function timeit vs time.clock disparity

import time
import logging
from functools import reduce
logging.basicConfig(filename='debug.log', level=logging.DEBUG)
def read_large_file(file_object):
    """Lazily yield lines from ``file_object`` using ``readline()``.

    Iteration stops at the first falsy value returned by ``readline()``
    (the empty string at end of file).
    """
    while True:
        data = file_object.readline()
        if not data:
            break
        yield data
def process_file_1(file_path):
    """Read ``file_path`` line by line via ``read_large_file`` and log each
    line at DEBUG level.

    I/O errors are reported on stdout instead of being raised.
    """
    try:
        with open(file_path) as fp:
            for line in read_large_file(fp):
                logging.debug(line)
    except (IOError, OSError):
        print('Error Opening or Processing file')
def process_file_2(file_path):
    """Read ``file_path`` by calling ``next()`` on the file object directly
    and log each line at DEBUG level.

    I/O errors are reported on stdout instead of being raised; the
    ``StopIteration`` raised at end of file ends the loop.
    """
    try:
        # Open the file named by the parameter; the original opened the
        # module-level global ``path`` and ignored its argument.
        with open(file_path) as file_handler:
            while True:
                logging.debug(next(file_handler))
    except (IOError, OSError):
        print("Error opening / processing file")
    except StopIteration:
        pass
if __name__ == "__main__":
    path = "TB_data_dictionary_2016-04-15.csv"

    # Average wall-clock time of 9 runs of the generator-based reader.
    # time.perf_counter() replaces time.clock(), which was removed in Python 3.8.
    l1 = []
    for i in range(1, 10):
        start = time.perf_counter()
        process_file_1(path)
        end = time.perf_counter()
        diff = (end - start)
        l1.append(diff)
    avg = sum(l1) / len(l1)
    print('processing time (with generators) {}'.format(avg))

    # Same measurement for the reader that calls next() on the file object.
    l2 = []
    for i in range(1, 10):
        start = time.perf_counter()
        process_file_2(path)
        end = time.perf_counter()
        diff = (end - start)
        l2.append(diff)
    avg = sum(l2) / len(l2)
    print('processing time (with iterators) {}'.format(avg))
Output of the program:
C:\Python34\python.exe C:/pypen/data_structures/generators/generators1.py
processing time (with generators) 0.028033358176432314
processing time (with iterators) 0.02699498330810426
Another approach that I used is as follows.
def wrapper(func, *args, **kwargs):
    """Bind ``args``/``kwargs`` to ``func`` and return a zero-argument callable.

    Calling the returned function invokes ``func(*args, **kwargs)`` and
    returns its result; useful for APIs such as ``timeit.timeit`` that expect
    a callable taking no arguments.
    """
    def wrapped():
        return func(*args, **kwargs)
    return wrapped
if __name__ == "__main__":
    import timeit  # not among the imports shown for this snippet

    path = "TB_data_dictionary_2016-04-15.csv"
    # NOTE: timeit.timeit returns the TOTAL time for all ``number`` runs, not
    # a per-run average, so these figures are not directly comparable with a
    # hand-computed average over single runs.
    wrapped = wrapper(process_file_1, path)
    t = timeit.timeit(wrapped, number=100)
    print('processing time (with generators) {}'.format(t))
    wrapped = wrapper(process_file_2, path)
    t = timeit.timeit(wrapped, number=100)
    print('processing time (with iterators) {}'.format(t))
This gives me a different result.
C:\Python34\python.exe C:/pypen/data_structures/generators/generators1.py
processing time (with generators) 3.0999817624283916
processing time (with iterators) 3.2149597826018304
I would expect the implementation that uses generators to be faster than the one that uses iterators (which is what I am getting when I use timeit).
Why am I not getting the same result using the other method.

Multiprocessing code hangs when input list is beyond a certain length

I'm been playing with the multiprocessing module to gain a better understanding from the implementation side. The code below does the following serial and then parallel way:
A set of random numbers are generated. Each number is used as a constant in an exponential function. The goal is to find, for each random number, a scalar needed such that the integral of the exponential function is 20.
The code below seems to work. However, once the value of num is set to 500, the code will just hang and I have no idea why. For what it's worth, this is on a Windows machine with everything running in Spyder.
from scipy import optimize as op
from scipy.integrate import trapz as intg
import numpy as np
import multiprocessing as mp
import random
import timeit
import time
def to_solve(a=None, x=None, y=None):
    """Residual for the root finder: integral of ``a * y`` over ``x``, minus 20.

    ``a`` is the scalar being solved for; the root is the ``a`` for which the
    trapezoidal integral of ``a * y`` equals 20.

    NOTE(review): ``intg`` is ``scipy.integrate.trapz``, which recent SciPy
    releases expose as ``trapezoid`` — confirm against the installed version.
    """
    return intg(a * y, x) - 20
def worker(lst, x, out_q):
    """Solve for the scaling factor of every constant in ``lst``.

    Builds a ``len(lst) x 2`` array whose rows are ``(a, root)``, where
    ``root`` is found by ``op.newton`` on ``to_solve`` starting from 1, and
    puts the whole array on ``out_q`` as a single item.
    """
    ans = np.zeros(shape=(len(lst), 2))
    for i, a in enumerate(lst):
        y = func(a=a, x=x)
        ans[i, 0] = a
        ans[i, 1] = op.newton(func=to_solve, x0=1, args=(x, y))
    out_q.put(ans)
def func(a=None, x=None):
    """Return ``1 - exp(-a * x)``, evaluated element-wise when ``x`` is an array."""
    return 1 - np.exp(-a * x)
def main_p(nums):
    """Parallel variant: split ``nums`` across 2 processes and merge results.

    Each process runs ``worker`` on its slice and puts one result array on a
    shared queue. The merged array is sorted by its first column, the elapsed
    time is printed, and the array is returned.
    """
    start = timeit.default_timer()
    x = np.linspace(0, 100)
    procs = []
    out_q = mp.Queue()
    num_procs = 2
    step = len(nums) // num_procs
    for i in range(num_procs):
        first = i * step
        if i == num_procs - 1:
            out = nums[first:]  # last worker also takes the remainder
        else:
            out = nums[first:first + step]
        p = mp.Process(target=worker, args=(out, x, out_q))
        procs.append(p)
        p.start()

    # Read the results BEFORE joining. A child cannot terminate until the data
    # it put on the queue has been flushed to the pipe, which for large items
    # only happens once the parent reads; join()-then-get() therefore
    # deadlocks for big inputs.
    results = np.vstack([out_q.get() for _ in procs])
    for p in procs:
        p.join()

    results = results[results[:, 0].argsort()]
    print(timeit.default_timer() - start)
    return results
def main_s(nums):
    """Serial variant: solve for every constant in ``nums`` in this process.

    Returns a ``len(nums) x 2`` array of ``(a, root)`` rows sorted by ``a``
    and prints the elapsed time.
    """
    start = timeit.default_timer()
    results = np.zeros(shape=(len(nums), 2))
    x = np.linspace(0, 100)
    for i, a in enumerate(nums):
        results[i, 0] = a
        y = func(a=a, x=x)
        results[i, 1] = op.newton(func=to_solve, x0=1, args=(x, y))
    results = results[results[:, 0].argsort()]
    print(timeit.default_timer() - start)
    return results
# Entry-point guard: needed on Windows, where child processes re-import this
# module and must not rerun the benchmark.
if __name__ == '__main__':
    num = 400
    # Draw ``num`` uniform random constants as a plain Python list.
    nums = np.random.rand(1, num)
    nums = nums.tolist()[0]
    a = main_s(nums)
    b = main_p(nums)
The object you are putting on the Queue is too large. The worker process will not terminate until the buffer feeding the Queue is empty, which in turn will not happen until the parent process reads from the queue, which only happens after the join(), which is itself waiting for the worker -> deadlock.
Here is the description:
"An example which will deadlock is the following."
It follows that the problem goes away if you move the loop
for p in procs:
p.join()
after the for i in range(len(procs)): - loop.

Why does importing have a cost in multiprocessing?

Basically the more imports from different modules I include the longer these multiprocessing tasks take, even if none of the module functions are used. Is each process having to reimport everything or something? What is going on?
import time
time1 = time.time()
import multiprocessing as mp
import numpy as np # Random imports (not used)
import PIL
import PySide
import pandas
# print time.time() - time1 # here this prints 0.0
class Multi(object):
    """Runs ``f`` in a single child process and prints the value it reports."""

    def __init__(self, queue):
        # Queue on which the child process reports its result.
        self.q = queue

    def run(self, a):
        """Start ``f(a, self.q)`` in a child process, print its result, then join."""
        # Use the instance's queue; the original passed the module-level
        # global ``q``, which only worked when that global happened to exist.
        p = mp.Process(target=f, args=(a, self.q))
        p.start()
        # Read the result before join() so the child is never left waiting on
        # an unread queue.
        print(self.q.get())
        p.join()
class MultiPool(object):
    """Runs ``f1`` on a pool of ``N`` worker processes."""

    def __init__(self, N):
        self.N = N
        # The pool is created here and kept for the lifetime of the instance.
        self.pool = mp.Pool(processes=self.N)

    def run(self):
        """Submit ``N`` tasks (arguments ``(0,)`` .. ``(N-1,)``) and print the results."""
        result = self.pool.map_async(f1, ((i,) for i in range(self.N)))
        print(result.get())
def f(a, q, n=10000000):
    """Busy-loop ``n`` times, then put the last loop index on ``q``.

    ``a`` is accepted but unused. ``n`` defaults to the original hard-coded
    10000000; ``None`` is put on the queue when ``n`` is not positive.
    """
    b = None
    for i in range(n):
        b = i
    q.put(b)
def f1(a, n=10000000):
    """Busy-loop ``n`` times and return the last loop index.

    ``a`` is accepted but unused. ``n`` defaults to the original hard-coded
    10000000; ``None`` is returned when ``n`` is not positive.
    """
    b = None
    for i in range(n):
        b = i
    return b
if __name__ == '__main__':
    q = mp.Queue()
    e = Multi(q)
    # time1 = time.time()
    # 1) Plain call in this process (timed from the top of the module, so
    #    the import time is included).
    print(f1(0))
    print(time.time() - time1)
    # 2) Same work in one child process.
    time1 = time.time()
    e.run('123')
    print(time.time() - time1)
    # 3) Same work on a two-process pool.
    time1 = time.time()
    mpool = MultiPool(2)
    mpool.run()
    print(time.time() - time1)
# Output with random imports:
>9999999
>0.246000051498
>9999999
>0.693000078201
>[9999999, 9999999]
>0.720999956131
# Output without imports:
>9999999
>0.246000051498
>9999999
>0.315999984741
>[9999999, 9999999]
>0.313999891281
Yes: multiprocessing must import everything again in every process, because each one is a separate process (a new application instance), not a thread.
What your script measures is the cost of executing the methods plus the cost of creating the processes. You can measure the cost of the imports separately; they are executed exactly where the import statements appear.

Categories