Why does importing have a cost in multiprocessing? - python

Basically, the more imports from different modules I include, the longer these multiprocessing tasks take, even if none of the module functions are used. Is each process having to re-import everything, or something? What is going on?
import time
time1 = time.time()
import multiprocessing as mp
import numpy as np  # Random imports (not used)
import PIL
import PySide
import pandas
# print time.time() - time1  # here this prints 0.0

class Multi(object):
    def __init__(self, queue):
        self.q = queue

    def run(self, a):
        p = mp.Process(target=f, args=(a, q))
        p.start()
        print self.q.get()
        p.join()

class MultiPool(object):
    def __init__(self, N):
        self.N = N
        self.pool = mp.Pool(processes=self.N)

    def run(self):
        result = self.pool.map_async(f1, ((i,) for i in range(self.N)))
        print result.get()

def f(a, q):
    for i in range(10000000):
        b = i
    q.put(b)

def f1(a):
    for i in range(10000000):
        b = i
    return b

if __name__ == '__main__':
    q = mp.Queue()
    e = Multi(q)
    # time1 = time.time()
    print f1(0)
    print time.time() - time1
    time1 = time.time()
    e.run('123')
    print time.time() - time1
    time1 = time.time()
    mpool = MultiPool(2)
    mpool.run()
    print time.time() - time1
# Output with random imports:
>9999999
>0.246000051498
>9999999
>0.693000078201
>[9999999, 9999999]
>0.720999956131
# Output without imports:
>9999999
>0.246000051498
>9999999
>0.315999984741
>[9999999, 9999999]
>0.313999891281

Yes, multiprocessing has to import everything again in each child process, because a process is a new application (a fresh interpreter), not a thread. (With the fork start method on Unix the child inherits the parent's already-imported modules; with spawn, the default on Windows, the main module and all of its imports are re-executed in every child.)
What your script measures is the cost of executing the functions plus the cost of creating the processes. You can measure the import cost separately: imports are executed in place, exactly where the import statements appear.
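To see where that cost shows up, here is a minimal Python 3 sketch (added here, not from the original post; it assumes numpy is installed and uses it only as a stand-in for a heavy import). With the spawn start method, which is the default on Windows, the child re-runs the module-level imports before the target function is called, so heavy imports inflate the process start-up time; with fork on Unix they do not.

import time
time1 = time.time()
import multiprocessing as mp
import numpy as np  # stand-in for a "heavy" import
# Under 'spawn', this line runs again in every child process: that re-run is the import cost.
print('module import time:', time.time() - time1)

def child():
    # By the time this runs under 'spawn', the child has already re-imported
    # the main module (and numpy).
    print('child started')

if __name__ == '__main__':
    mp.set_start_method('spawn')  # try 'fork' on Unix to skip the re-import
    t = time.time()
    p = mp.Process(target=child)
    p.start()
    p.join()
    print('process create/run time:', time.time() - t)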

Related

Is there a way to utilize all cpu cores and power in calculations?

First, I tried a multithreading solution for this problem and discovered that it is not suitable for this purpose. Then, as the community suggested, I tried a multiprocessing solution to bypass the GIL, and even that performs poorly compared to single-process, single-thread code. Is Python flawed in this domain?
Is the only solution for heavy CPU calculations to drop Python for another language?
I am posting my multiprocessing test code so you can get an impression.
from itertools import cycle
import random
import multiprocessing as mp
import time

# The class that represents the process
class Task(mp.Process):
    def __init__(self, group=None, target=None, name=None, args=(), kwargs={}, *, daemon=None):
        mp.Process.__init__(self, group=group, target=target, name=name, args=args, kwargs=kwargs, daemon=daemon)
        self.inputs = []

    def run(self):
        print(f"{self.name} is running")
        for arr in self.inputs:
            arr.sort()

    def add_input(self, arr):
        self.inputs.append(arr)

# A util function to cycle on an iterable a finite number of times.
def finite_cycle(cycle_on, times):
    infinite_cycle = cycle(cycle_on)
    for _ in range(times):
        yield next(infinite_cycle)

# Constants
THOUSAND = 1000
MILION = THOUSAND ** 2
PCNT = 2
TASK_CNT = 50 * THOUSAND

# Main
def main():
    processes = [Task(name=f"p{pid}") for pid in range(PCNT)]
    for pid in finite_cycle(range(PCNT), TASK_CNT):
        processes[pid].add_input([random.randint(1, 10) for _ in range(100)])
    stime = time.time()
    for p in processes:
        p.start()
    for p in processes:
        p.join()
    print(f"execution time: {round(time.time() - stime, 2)}")
    print("finish.")
And this is the single-process, single-thread code, which is faster for every variation of the constants.
def main():
    inputs = [[random.randint(1, 10) for _ in range(100)] for _ in range(TASK_CNT)]
    stime = time.time()
    for arr in inputs:
        arr.sort()
    print(f"execution time: {round(time.time() - stime, 2)}")
    print("finish.")
On my desktop each of the run methods took approximately .125 seconds on average, while the time elapsed between calling the first start method and the start of the first run method was approximately .23 seconds (i.e. 1628456465.1061594 - 1628456464.8741603); most of that time, I believe, is taken by the serialization/de-serialization of self.inputs. See below for the original program with a few timings added.
The point is that multiprocessing has two sources of overhead that the non-multiprocessing program does not have:
1. Overhead in creating the processes.
2. Overhead in passing arguments to and getting results back from the processes. In many cases this involves moving data from one address space to another (via various mechanisms), unless shared memory is being used.
Multiprocessing therefore only becomes advantageous when the processing itself (the run method in this case) is so CPU-intensive that the aforementioned costs are offset by being able to "divide and conquer" the problem. (A short illustration of this trade-off follows the timings below.)
from itertools import cycle
import random
import multiprocessing as mp
import time

# The class that represents the process
class Task(mp.Process):
    def __init__(self, group=None, target=None, name=None, args=(), kwargs={}, *, daemon=None):
        mp.Process.__init__(self, group=group, target=target, name=name, args=args, kwargs=kwargs, daemon=daemon)
        self.inputs = []

    def run(self):
        t = time.time()
        print(f"{self.name} is running at:", t)
        for arr in self.inputs:
            arr.sort()
        print('elapsed time =', time.time() - t)

    def add_input(self, arr):
        self.inputs.append(arr)

# A util function to cycle on an iterable a finite number of times.
def finite_cycle(cycle_on, times):
    infinite_cycle = cycle(cycle_on)
    for _ in range(times):
        yield next(infinite_cycle)

# Constants
THOUSAND = 1000
MILION = THOUSAND ** 2
PCNT = 2
TASK_CNT = 50 * THOUSAND

# Main
def main():
    processes = [Task(name=f"p{pid}") for pid in range(PCNT)]
    for pid in finite_cycle(range(PCNT), TASK_CNT):
        processes[pid].add_input([random.randint(1, 10) for _ in range(100)])
    stime = time.time()
    print('stime =', stime)
    for p in processes:
        p.start()
    for p in processes:
        p.join()
    print(f"execution time: {round(time.time() - stime, 2)}")
    print("finish.")

if __name__ == '__main__':
    main()
Prints:
stime = 1628456464.8741603
p0 is running at: 1628456465.1061594
elapsed time = 0.1320023536682129
p1 is running at: 1628456465.3201597
elapsed time = 0.11999750137329102
execution time: 0.62
finish.
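To illustrate the overhead point with something self-contained (this is a sketch added here, not part of the original answer; cheap_task and heavy_task are made-up names), compare a task that is too cheap to be worth a process with one that does enough CPU work to amortize the start-up and data-transfer cost. Exact numbers will vary by machine and start method.

import time
from multiprocessing import Pool

def cheap_task(n):
    return n * 2  # almost no work: process/pool overhead dominates

def heavy_task(n):
    return sum(i * i for i in range(n))  # enough CPU work to amortize the overhead

if __name__ == '__main__':
    args = [2_000_000] * 16
    for func in (cheap_task, heavy_task):
        t = time.time()
        list(map(func, args))
        print(func.__name__, 'sequential:', round(time.time() - t, 3))
        t = time.time()
        with Pool(4) as pool:  # pool creation is deliberately inside the timed region
            pool.map(func, args)
        print(func.__name__, 'Pool(4):', round(time.time() - t, 3))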

How to use redlock using python

I have been working for a few days to understand Redlock, and I have seen that acquiring a lock takes around 1 second, which seems a bit too much just to take a lock in my opinion, but I could be wrong.
I have created a small script:
redis_test.py
import serialized_redis
from pottery import Redlock

redis_connection = serialized_redis.MsgpackSerializedRedis(host='localhost', port=6379, db=0)

def lock(argument):
    return Redlock(key=argument, auto_release_time=120 * 1000)
main.py
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import math
import random
import sys
import time
from threading import Thread

from loguru import logger

from lib.redis_test import lock, redis_connection

class StopWatch:
    def __init__(self):
        self.start()

    def start(self):
        self._startTime = time.time()

    def getStartTime(self):
        return self._startTime

    def elapsed(self, prec=3):
        prec = 3 if prec is None or not isinstance(prec, int) else prec
        diff = time.time() - self._startTime
        return self.round(diff, prec)

    def round(self, n, p=0):
        m = 10 ** p
        return math.floor(n * m + 0.5) / m

def algorithm_with_lock(random_number):
    print("Keys inside Redis", redis_connection.keys())
    ourWatch = StopWatch()
    ourWatch.start()
    if not redis_connection.exists(f'redlock:{random_number}'):
        # print("Time taken before redis_connection.exists", ourWatch.elapsed())  -> 0.0 seconds
        with lock(f'{random_number}'):
            print("Time taken before redis_connection.exists", ourWatch.elapsed())  # 1.002 seconds
            time.sleep(5)
            redis_connection.set("Hello_world", random.randint(1, 5))
            return True
    else:
        return False

def main():
    while True:
        chosen_number = f"number_{random.randint(1, 3)}"
        response = algorithm_with_lock(chosen_number)
        if response:
            logger.info(f"Yay, finished my job! -> {chosen_number}")
            sys.exit()
        else:
            logger.debug(f"Trying new number! -> {chosen_number}")
            time.sleep(1)

for i in range(1):
    Thread(
        target=main,
    ).start()
    time.sleep(.1)
The issue with that is that it takes too long to actually lock a Redis key, which means multiple threads can try to lock the same key and end up stuck waiting on the lock. My guess is that it should not take 1 second to actually acquire the lock, but I could be wrong. I wonder what could be the reason for the long locking time, and whether there is a chance I am using it incorrectly?

How to get the return value of a function in multiprocessing code

This is my Python code. I am trying to get the returned value (aa1) from print_cube().
Is there a way to get the value of aa1 inside main()? I have to use multiprocessing to call other functions as well.
import multiprocessing

def print_cube(num):
    aa1 = num * num * num
    return aa1

def main():
    # creating processes
    p1 = multiprocessing.Process(target=print_cube, args=(10, ))
    p1.start()

main()
Use multiprocessing.Pool when you want to retrieve return values.
from multiprocessing import Pool

def print_cube(num):
    aa1 = num * num * num
    return aa1

def main():
    with Pool(5) as p:
        results = p.map(print_cube, range(10, 15))
        print(results)

if __name__ == "__main__":
    main()
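If you only need the result of a single call, the same Pool API works with apply_async as well; its AsyncResult gives you the return value via get(). A small sketch along the same lines as the answer above:

from multiprocessing import Pool

def print_cube(num):
    return num * num * num

if __name__ == "__main__":
    with Pool(1) as p:
        async_result = p.apply_async(print_cube, (10,))
        print(async_result.get())  # 1000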
You can use Queue from multiprocessing, then pass it to print_cube() as shown below:
from multiprocessing import Process, Queue

def print_cube(num, q):
    aa1 = num * num * num
    q.put(aa1)

def main():
    queue = Queue()
    p1 = Process(target=print_cube, args=(10, queue))
    p1.start()
    print(queue.get())  # 1000

main()
This is the result below:
1000
Be careful: if you use the standard-library queue module below with a process, the program doesn't work properly, because queue.Queue only passes data between threads within a single process:
import queue
queue = queue.Queue()
So, just use Queue from the multiprocessing module with processes, as I did in the code above:
from multiprocessing import Queue
queue = Queue()
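Another option, not mentioned in the answers above, is concurrent.futures from the standard library: ProcessPoolExecutor returns Future objects whose result() method gives you the return value. A minimal sketch:

from concurrent.futures import ProcessPoolExecutor

def print_cube(num):
    return num * num * num

if __name__ == "__main__":
    with ProcessPoolExecutor(max_workers=2) as executor:
        future = executor.submit(print_cube, 10)
        print(future.result())  # 1000
        print(list(executor.map(print_cube, range(10, 15))))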

multiprocessing.Pipe is even slower than multiprocessing.Queue?

I tried to benchmark the speedup of Pipe over Queue from the multiprocessing package. I thought Pipe would be faster, as Queue uses a Pipe internally.
Strangely, Pipe is slower than Queue when sending large numpy arrays. What am I missing here?
Pipe:
import sys
import time
from multiprocessing import Process, Pipe
import numpy as np

NUM = 1000

def worker(conn):
    for task_nbr in range(NUM):
        conn.send(np.random.rand(400, 400, 3))
    sys.exit(1)

def main():
    parent_conn, child_conn = Pipe(duplex=False)
    Process(target=worker, args=(child_conn,)).start()
    for num in range(NUM):
        message = parent_conn.recv()

if __name__ == "__main__":
    start_time = time.time()
    main()
    end_time = time.time()
    duration = end_time - start_time
    msg_per_sec = NUM / duration
    print "Duration: %s" % duration
    print "Messages Per Second: %s" % msg_per_sec

# Took 10.86s.
Queue
import sys
import time
from multiprocessing import Process
from multiprocessing import Queue
import numpy as np

NUM = 1000

def worker(q):
    for task_nbr in range(NUM):
        q.put(np.random.rand(400, 400, 3))
    sys.exit(1)

def main():
    recv_q = Queue()
    Process(target=worker, args=(recv_q,)).start()
    for num in range(NUM):
        message = recv_q.get()

if __name__ == "__main__":
    start_time = time.time()
    main()
    end_time = time.time()
    duration = end_time - start_time
    msg_per_sec = NUM / duration
    print "Duration: %s" % duration
    print "Messages Per Second: %s" % msg_per_sec

# Took 6.86s.
You can do an experiment and put the following into your Pipe code above:
def worker(conn):
    for task_nbr in range(NUM):
        data = np.random.rand(400, 400, 3)
    sys.exit(1)

def main():
    parent_conn, child_conn = Pipe(duplex=False)
    p = Process(target=worker, args=(child_conn,))
    p.start()
    p.join()
This gives you the time that it takes to create the data for your test. On my system this takes about 2.9 seconds.
Under the hood, the Queue object implements a buffer and a threaded send. The thread is still in the same process, but by using it, the data creation doesn't have to wait for the system IO to complete. It effectively parallelizes the operations. Try your Pipe code modified with some simple threading added (disclaimer: the code here is for testing only and is not production ready):
import sys
import time
import threading
from multiprocessing import Process, Pipe, Lock
import numpy as np
import copy

NUM = 1000

def worker(conn):
    _conn = conn
    _buf = []
    _wlock = Lock()
    _sentinel = object()  # signal that we're done

    def thread_worker():
        while 1:
            if _buf:
                _wlock.acquire()
                obj = _buf.pop(0)
                if obj is _sentinel:
                    _wlock.release()
                    return
                _conn.send(obj)
                _wlock.release()

    t = threading.Thread(target=thread_worker)
    t.start()
    for task_nbr in range(NUM):
        data = np.random.rand(400, 400, 3)
        data[0][0][0] = task_nbr  # just for integrity check
        _wlock.acquire()
        _buf.append(data)
        _wlock.release()
    _wlock.acquire()
    _buf.append(_sentinel)
    _wlock.release()
    t.join()
    sys.exit(1)

def main():
    parent_conn, child_conn = Pipe(duplex=False)
    Process(target=worker, args=(child_conn,)).start()
    for num in range(NUM):
        message = parent_conn.recv()
        assert num == message[0][0][0], 'Data was corrupted'

if __name__ == "__main__":
    start_time = time.time()
    main()
    end_time = time.time()
    duration = end_time - start_time
    msg_per_sec = NUM / duration
    print "Duration: %s" % duration
    print "Messages Per Second: %s" % msg_per_sec
On my machine this takes 3.4 seconds to run which is almost exactly the same as your Queue code above.
From https://docs.python.org/2/library/threading.html:
In CPython, due to the Global Interpreter Lock, only one thread can execute Python code at once... however, threading is still an appropriate model if you want to run multiple I/O-bound tasks simultaneously.
The queue and pipe differences are definitely an odd implementation detail until you dig into it a bit.
I assume from your print statements that you are using Python 2. However, the strange behavior cannot be replicated with Python 3, where Pipe is actually faster than Queue.
import sys
import time
from multiprocessing import Process, Pipe, Queue
import numpy as np

NUM = 20000

def worker_pipe(conn):
    for task_nbr in range(NUM):
        conn.send(np.random.rand(40, 40, 3))
    sys.exit(1)

def main_pipe():
    parent_conn, child_conn = Pipe(duplex=False)
    Process(target=worker_pipe, args=(child_conn,)).start()
    for num in range(NUM):
        message = parent_conn.recv()

def pipe_test():
    start_time = time.time()
    main_pipe()
    end_time = time.time()
    duration = end_time - start_time
    msg_per_sec = NUM / duration
    print("Pipe")
    print("Duration: " + str(duration))
    print("Messages Per Second: " + str(msg_per_sec))

def worker_queue(q):
    for task_nbr in range(NUM):
        q.put(np.random.rand(40, 40, 3))
    sys.exit(1)

def main_queue():
    recv_q = Queue()
    Process(target=worker_queue, args=(recv_q,)).start()
    for num in range(NUM):
        message = recv_q.get()

def queue_test():
    start_time = time.time()
    main_queue()
    end_time = time.time()
    duration = end_time - start_time
    msg_per_sec = NUM / duration
    print("Queue")
    print("Duration: " + str(duration))
    print("Messages Per Second: " + str(msg_per_sec))

if __name__ == "__main__":
    for i in range(2):
        queue_test()
        pipe_test()
Results in:
Queue
Duration: 3.44321894646
Messages Per Second: 5808.51822408
Pipe
Duration: 2.69065594673
Messages Per Second: 7433.13169575
Queue
Duration: 3.45295906067
Messages Per Second: 5792.13354361
Pipe
Duration: 2.78426194191
Messages Per Second: 7183.23218766
On my system Pipe(duplex=False) is slower (twice the time, i.e. half the rate) than Pipe(duplex=True). For anyone looking for performance, here is a side-by-side comparison:
from time import time
from multiprocessing import Process, Queue, Pipe

n = 1000
buffer = b'\0' * (1000*1000)  # 1 megabyte

def print_elapsed(name, start):
    elapsed = time() - start
    spi = elapsed / n
    ips = n / elapsed
    print(f'{name}: {spi*1000:.3f} ms/item, {ips:.0f} item/sec')

def producer(q):
    start = time()
    for i in range(n):
        q.put(buffer)
    print_elapsed('producer', start)

def consumer(q):
    start = time()
    for i in range(n):
        out = q.get()
    print_elapsed('consumer', start)

class PipeQueue():
    def __init__(self, **kwargs):
        self.out_pipe, self.in_pipe = Pipe(**kwargs)

    def put(self, item):
        self.in_pipe.send_bytes(item)

    def get(self):
        return self.out_pipe.recv_bytes()

    def close(self):
        self.out_pipe.close()
        self.in_pipe.close()

print('duplex=True')
q = PipeQueue(duplex=True)
producer_process = Process(target=producer, args=(q,))
consumer_process = Process(target=consumer, args=(q,))
consumer_process.start()
producer_process.start()
consumer_process.join()
producer_process.join()
q.close()

print('duplex=False')
q = PipeQueue(duplex=False)
producer_process = Process(target=producer, args=(q,))
consumer_process = Process(target=consumer, args=(q,))
consumer_process.start()
producer_process.start()
consumer_process.join()
producer_process.join()
q.close()
Results:
duplex=True
consumer: 0.301 ms/item, 3317 item/sec
producer: 0.298 ms/item, 3358 item/sec
duplex=False
consumer: 0.673 ms/item, 1486 item/sec
producer: 0.669 ms/item, 1494 item/sec
I think this must come down to CPython using os.pipe vs socket.socketpair, but I'm not sure.
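One way to check that guess on your own platform (just a quick sketch, not a definitive answer) is to print the source of multiprocessing.connection.Pipe and see which primitive each duplex mode is built on:

import inspect
from multiprocessing import connection

# Prints the implementation of Pipe, which shows what duplex=False and
# duplex=True use under the hood on this CPython version/platform.
print(inspect.getsource(connection.Pipe))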

Python multiprocessing pool is slower than sequential

I used a multiprocessing Pool to try to get some performance benefit over my sequential approach. However, the result is just the opposite: the Pool takes more time than the sequential code:
import multiprocessing as mp
import datetime

class A:
    def __init__(self):
        self.result_list = []

    # parallel processing function
    def foo_pool(self, data):
        for d in data:
            d[0] = d[0] * 10
        return data

    # sequential function
    def foo_seq(self, data):
        data[0] = data[0] * 10
        return data

    def log_result(self, result):
        # This is called whenever foo_pool(i) returns a result.
        self.result_list.extend([result])

    def apply_async_with_callback(self):
        pool = mp.Pool(8)

        # Data Creation
        lst = []
        for i in range(100000):
            lst.append([i, i + 1, i + 2])
        print('length of data ', len(lst))

        dtStart = datetime.datetime.now()
        print('start time:', str(datetime.datetime.now()))

        # Multiprocessing takes 2 secs
        for data in self.chunks(lst, 1000):
            pool.apply_async(self.foo_pool, args=(data,),
                             callback=self.log_result)

        # Sequential. It is 10x faster than pool
        # for d in lst:
        #     self.result_list.extend([self.foo_seq(d)])

        pool.close()
        pool.join()
        print('output data length:', len(self.result_list))
        dtEnd = datetime.datetime.now()
        print('end time:', str(datetime.datetime.now()))
        print('Time taken:', str(dtEnd - dtStart))

    # Divide big data into chunks
    def chunks(self, data, n):
        for i in range(0, len(data), n):
            res = data[i:i + n]
            yield res

if __name__ == '__main__':
    a = A()
    a.apply_async_with_callback()
In the above Python code, in apply_async_with_callback(), if you uncomment the sequential code and run it, the result is about 10 times faster than the multiprocessing Pool code.
Can someone help me understand what I am doing wrong?
Edit:
After applying the code provided in Why is multiprocessed code in given code taking more time than usual sequential execution?
sequential is now only 2 times faster than parallel processing code. Updated code below:
import multiprocessing as mp
import datetime

class A:
    def __init__(self):
        self.result_list = []

    # parallel processing function
    def foo_pool(self, data):
        for d in data:
            d[0] = d[0] * float(10) + 10 * (float(d[0]) / 100)
        return data

    def log_result(self, result):
        # This is called whenever foo_pool(i) returns a result.
        self.result_list.extend([result])

    def flatten(self, ll):
        lst = []
        for l in ll:
            lst.extend(l)
        return lst

    def square(self, x):
        return x * x

    def squareChunk(self, chunk):
        return self.foo_pool(chunk)  # [self.foo_pool(x) for x in chunk]

    def apply_async_with_callback(self):
        # Data Creation
        lst = []
        for i in range(1000000):
            lst.append([i, i + 1, i + 2])
        print('length of data ', len(lst))

        chunked = self.chunks(lst, 10000)  # split original list in decent sized chunks
        pool = mp.Pool(2)

        dtStart = datetime.datetime.now()
        print('start time:', str(datetime.datetime.now()))

        results = self.flatten(pool.map(self.squareChunk, chunked))

        pool.close()
        pool.join()
        print('output data length:', len(results))
        dtEnd = datetime.datetime.now()
        print('end time:', str(datetime.datetime.now()))
        print('multi proc Time taken:', str(dtEnd - dtStart))

    def chunks(self, l, n):
        n = max(1, n)
        return (l[i:i + n] for i in range(0, len(l), n))

if __name__ == '__main__':
    a = A()
    a.apply_async_with_callback()
I can see the difference from using Pool.map instead of Pool.apply_async. The code is faster now: earlier it was 10 times slower than sequential, now it is only 2 times slower. But still... slower.
Is this how multiprocessing behaves? Then what is the point of using multiprocessing? Or am I still doing something wrong?
