I tried to benchmark the speed-up of Pipe over Queue from the multiprocessing package. I thought Pipe would be faster, as Queue uses Pipe internally.
Strangely, Pipe is slower than Queue when sending a large numpy array. What am I missing here?
Pipe:
import sys
import time
from multiprocessing import Process, Pipe
import numpy as np
NUM = 1000
def worker(conn):
    """Send NUM random 400x400x3 arrays over ``conn``, then exit the process."""
    sent = 0
    while sent < NUM:
        conn.send(np.random.rand(400, 400, 3))
        sent += 1
    # The child always terminates with exit status 1.
    sys.exit(1)
def main():
    """Start the sending process and receive NUM messages from the pipe."""
    # duplex=False returns a (receive end, send end) pair.
    receiver, sender = Pipe(duplex=False)
    child = Process(target=worker, args=(sender,))
    child.start()
    for _ in range(NUM):
        receiver.recv()
if __name__ == "__main__":
    # Wall-clock time for the whole run: process start-up plus NUM transfers.
    start_time = time.time()
    main()
    end_time = time.time()
    duration = end_time - start_time
    msg_per_sec = NUM / duration
    # Single-argument print(...) produces the same output under the Python 2
    # print statement and the Python 3 print function.
    print("Duration: %s" % duration)
    print("Messages Per Second: %s" % msg_per_sec)
    # Took 10.86s.
Queue
import sys
import time
from multiprocessing import Process
from multiprocessing import Queue
import numpy as np
NUM = 1000
def worker(q):
    """Put NUM random 400x400x3 arrays on ``q``, then exit the process."""
    remaining = NUM
    while remaining > 0:
        q.put(np.random.rand(400, 400, 3))
        remaining -= 1
    # The child always terminates with exit status 1.
    sys.exit(1)
def main():
    """Start the producing process and take NUM messages off the queue."""
    inbox = Queue()
    producer_proc = Process(target=worker, args=(inbox,))
    producer_proc.start()
    for _ in range(NUM):
        inbox.get()
if __name__ == "__main__":
    # Wall-clock time for the whole run: process start-up plus NUM transfers.
    start_time = time.time()
    main()
    end_time = time.time()
    duration = end_time - start_time
    msg_per_sec = NUM / duration
    # Single-argument print(...) produces the same output under the Python 2
    # print statement and the Python 3 print function.
    print("Duration: %s" % duration)
    print("Messages Per Second: %s" % msg_per_sec)
    # Took 6.86s.
You can do an experiment and put the following into your Pipe code above..
def worker(conn):
    """Create NUM random arrays without sending them, then exit.

    ``conn`` is accepted but unused; this variant isolates the cost of
    generating the data.
    """
    for _ in range(NUM):
        np.random.rand(400, 400, 3)
    sys.exit(1)
def main():
    """Run the worker in a child process and wait for it to finish."""
    _receiver, sender = Pipe(duplex=False)
    child = Process(target=worker, args=(sender,))
    child.start()
    # Nothing is received here; the parent only waits for the child to exit.
    child.join()
This gives you the time that it takes to create the data for your test. On my system this takes about 2.9 seconds.
Under the hood the queue object implements a buffer and a threaded send. The thread is still in the same process but by using it, the data creation doesn't have to wait for the system IO to complete. It effectively parallelizes the operations. Try your Pipe code modified with some simple threading implemented (disclaimer, code here is for test only and is not production ready)..
import sys
import time
import threading
from multiprocessing import Process, Pipe, Lock
import numpy as np
import copy
NUM = 1000
def worker(conn):
    """Produce NUM random arrays and send them over ``conn`` from a helper thread.

    The main thread of this process only creates data and appends it to an
    in-memory buffer; a background thread drains the buffer and performs the
    ``conn.send`` calls, so data creation can overlap with pipe I/O.
    Exits the process with status 1 once everything has been sent.
    """
    pending = []                    # FIFO of arrays waiting to be sent
    ready = threading.Condition()   # guards ``pending`` and wakes the sender
    sentinel = object()             # signal that we're done

    def thread_worker():
        while True:
            with ready:
                # Sleep until the producer queues something instead of spinning.
                while not pending:
                    ready.wait()
                obj = pending.pop(0)
            if obj is sentinel:
                return
            # Send the item that was dequeued. The earlier version sent the
            # producer's ``data`` variable, which may already refer to a newer
            # array. Sending outside the lock keeps the producer from waiting
            # on I/O when it appends.
            conn.send(obj)

    t = threading.Thread(target=thread_worker)
    t.start()
    for task_nbr in range(NUM):
        data = np.random.rand(400, 400, 3)
        data[0][0][0] = task_nbr  # just for integrity check
        with ready:
            pending.append(data)
            ready.notify()
    with ready:
        pending.append(sentinel)
        ready.notify()
    t.join()
    sys.exit(1)
def main():
    """Receive NUM arrays from the worker and check they arrive in order."""
    receiver, sender = Pipe(duplex=False)
    child = Process(target=worker, args=(sender,))
    child.start()
    for num in range(NUM):
        message = receiver.recv()
        # The worker stamps each array's first element with its sequence number.
        assert num == message[0][0][0], 'Data was corrupted'
if __name__ == "__main__":
    # Wall-clock time for the whole run: process start-up plus NUM transfers.
    start_time = time.time()
    main()
    end_time = time.time()
    duration = end_time - start_time
    msg_per_sec = NUM / duration
    # Single-argument print(...) produces the same output under the Python 2
    # print statement and the Python 3 print function.
    print("Duration: %s" % duration)
    print("Messages Per Second: %s" % msg_per_sec)
On my machine this takes 3.4 seconds to run which is almost exactly the same as your Queue code above.
From https://docs.python.org/2/library/threading.html
In CPython, due to the Global Interpreter Lock, only one thread can execute Python code at once... however, threading is still an appropriate model if you want to run multiple I/O-bound tasks simultaneously.
The queue and pipe differences are definitely an odd implementation detail until you dig into it a bit.
I assume by your print command you are using Python2. However the strange behavior cannot be replicated with Python3, where Pipe is actually faster than Queue.
import sys
import time
from multiprocessing import Process, Pipe, Queue
import numpy as np
NUM = 20000
def worker_pipe(conn):
    """Send NUM random 40x40x3 arrays over ``conn``, then exit the process."""
    sent = 0
    while sent < NUM:
        conn.send(np.random.rand(40, 40, 3))
        sent += 1
    sys.exit(1)
def main_pipe():
    """Start worker_pipe in a child process and receive NUM messages."""
    receiver, sender = Pipe(duplex=False)
    child = Process(target=worker_pipe, args=(sender,))
    child.start()
    for _ in range(NUM):
        receiver.recv()
def pipe_test():
    """Time one main_pipe() run and print its duration and message rate."""
    began = time.time()
    main_pipe()
    duration = time.time() - began
    rate = NUM / duration
    print("Pipe")
    # %s formats through str(), the same as the explicit str() concatenation.
    print("Duration: %s" % duration)
    print("Messages Per Second: %s" % rate)
def worker_queue(q):
    """Put NUM random 40x40x3 arrays on ``q``, then exit the process."""
    remaining = NUM
    while remaining > 0:
        q.put(np.random.rand(40, 40, 3))
        remaining -= 1
    sys.exit(1)
def main_queue():
    """Start worker_queue in a child process and take NUM messages."""
    inbox = Queue()
    child = Process(target=worker_queue, args=(inbox,))
    child.start()
    for _ in range(NUM):
        inbox.get()
def queue_test():
    """Time one main_queue() run and print its duration and message rate."""
    began = time.time()
    main_queue()
    duration = time.time() - began
    rate = NUM / duration
    print("Queue")
    # %s formats through str(), the same as the explicit str() concatenation.
    print("Duration: %s" % duration)
    print("Messages Per Second: %s" % rate)
if __name__ == "__main__":
    # Two rounds, alternating Queue then Pipe in each round.
    for _round in range(2):
        queue_test()
        pipe_test()
Results in:
Queue
Duration: 3.44321894646
Messages Per Second: 5808.51822408
Pipe
Duration: 2.69065594673
Messages Per Second: 7433.13169575
Queue
Duration: 3.45295906067
Messages Per Second: 5792.13354361
Pipe
Duration: 2.78426194191
Messages Per Second: 7183.23218766
------------------
(program exited with code: 0)
Press return to continue
On my system Pipe(duplex=False) is slower (twice the time, or half the rate) than Pipe(duplex=True). For anyone looking for performance here is a side-by-side comparison:
from time import time
from multiprocessing import Process, Queue, Pipe
n = 1000
buffer = b'\0' * (1000*1000) # 1 megabyte
def print_elapsed(name, start):
    """Print latency per item and throughput for ``n`` items since ``start``.

    ``start`` is a timestamp taken with ``time()``; ``name`` labels the line.
    """
    elapsed = time() - start
    seconds_per_item = elapsed / n
    items_per_second = n / elapsed
    print(f'{name}: {seconds_per_item*1000:.3f} ms/item, {items_per_second:.0f} item/sec')
def producer(q):
    """Put the shared ``buffer`` on ``q`` ``n`` times and report the timing."""
    began = time()
    for _ in range(n):
        q.put(buffer)
    print_elapsed('producer', began)
def consumer(q):
    """Take ``n`` items off ``q`` (discarding them) and report the timing."""
    began = time()
    for _ in range(n):
        q.get()
    print_elapsed('consumer', began)
class PipeQueue():
    """Queue-like put/get wrapper around a multiprocessing Pipe carrying bytes."""

    def __init__(self, **kwargs):
        # Keyword arguments are forwarded unchanged to Pipe().
        ends = Pipe(**kwargs)
        self.out_pipe = ends[0]  # end that get() reads from
        self.in_pipe = ends[1]   # end that put() writes to

    def put(self, item):
        """Send ``item`` through the pipe with ``send_bytes``."""
        self.in_pipe.send_bytes(item)

    def get(self):
        """Return the next message read with ``recv_bytes``."""
        return self.out_pipe.recv_bytes()

    def close(self):
        """Close both ends of the pipe."""
        for end in (self.out_pipe, self.in_pipe):
            end.close()
# Run the same producer/consumer benchmark for duplex=True, then duplex=False.
# NOTE(review): this runs at import time with no ``if __name__ == '__main__'``
# guard; multiprocessing start methods that re-import the main module
# (e.g. spawn) need one.
for duplex in (True, False):
    print(f'duplex={duplex}')
    q = PipeQueue(duplex=duplex)
    producer_process = Process(target=producer, args=(q,))
    consumer_process = Process(target=consumer, args=(q,))
    # The consumer is started first so it is ready when data arrives.
    consumer_process.start()
    producer_process.start()
    consumer_process.join()
    producer_process.join()
    q.close()
Results:
duplex=True
consumer: 0.301 ms/item, 3317 item/sec
producer: 0.298 ms/item, 3358 item/sec
duplex=False
consumer: 0.673 ms/item, 1486 item/sec
producer: 0.669 ms/item, 1494 item/sec
I think this must come down to CPython using os.pipe vs socket.socketpair, but I'm not sure.
Related
I'd like to check how much difference the for statement takes with multiprocessing. I don't think the for statement of the function do_something can be executed when I run the code. Please help me out on which part I did wrong.
The sum result kept on going to zero.
import time
import multiprocessing
from sys import stdout
sum=0
def do_something():
    """Add 1 to the module-level ``sum`` 1000 times, printing progress."""
    global sum
    steps = 1000
    for i in range(steps):
        sum = sum + 1
        progress = 100*(i+1)/steps  # process percentage
        # "\r" rewrites the same console line on every iteration.
        stdout.write("\r  ===== %d%% completed =====" % progress)
        stdout.flush()
    stdout.write("\n")
# str=StringVar()
if __name__ == '__main__':
    start = time.perf_counter()
    # One child process running do_something.
    processes = [multiprocessing.Process(target=do_something) for _ in range(1)]
    for p in processes:
        p.start()
    for process in processes:
        process.join()
    finish = time.perf_counter()
    elapsed = round(finish-start,2)
    print(f'{elapsed} sec completed')
    # Prints this (the parent) process's module-level ``sum``.
    print(sum)
#Result
0.16 sec completed
0
As @tdelaney commented, the subprocess created will be updating an instance of sum that "lives" in its own address space, distinct from the address space of the main process that launched it. The usual solution would be to pass to do_something a multiprocessing.Queue instance that it can write the sum to and which the main process can then read (which should be done before joining the subprocess).
In the code below, however, I am using a multiprocessing.Pipe on which the multiprocessing.Queue is built. It is not as flexible as a queue in that it only readily supports a single reader and writer, but for this application that is all you need and it is a much better performer. The call to Pipe() returns two connections, one for sending objects and the other for receiving objects.
Note that in your code the final print statement needs to be indented.
You should also refrain from naming variables the same as builtin functions, e.g. sum.
import time
import multiprocessing
from sys import stdout
def do_something(send_conn):
    """Count to 1000 with a progress display, then send the total on ``send_conn``."""
    steps = 1000
    the_sum = 0
    for i in range(steps):
        the_sum += 1
        progress = 100*(i+1)/steps  # process percentage
        # "\r" rewrites the same console line on every iteration.
        stdout.write("\r  ===== %d%% completed =====" % progress)
        stdout.flush()
    stdout.write("\n")
    send_conn.send(the_sum)
# str=StringVar()
if __name__ == '__main__':
    start = time.perf_counter()
    # duplex=False returns (receive end, send end).
    read_conn, send_conn = multiprocessing.Pipe(duplex=False)
    p = multiprocessing.Process(target=do_something, args=(send_conn,))
    p.start()
    # Read the result before joining the child.
    the_sum = read_conn.recv()
    p.join()
    finish = time.perf_counter()
    elapsed = round(finish-start,2)
    print(f'{elapsed} sec completed')
    print(the_sum)
Prints:
===== 100% completed =====
0.16 sec completed
1000
Here is the same code using a multiprocessing.Queue:
import time
import multiprocessing
from sys import stdout
def do_something(queue):
    """Count to 1000 with a progress display, then put the total on ``queue``."""
    steps = 1000
    the_sum = 0
    for i in range(steps):
        the_sum += 1
        progress = 100*(i+1)/steps  # process percentage
        # "\r" rewrites the same console line on every iteration.
        stdout.write("\r  ===== %d%% completed =====" % progress)
        stdout.flush()
    stdout.write("\n")
    queue.put(the_sum)
# str=StringVar()
if __name__ == '__main__':
    start = time.perf_counter()
    queue = multiprocessing.Queue()
    p = multiprocessing.Process(target=do_something, args=(queue,))
    p.start()
    # Read the result before joining the child.
    the_sum = queue.get()
    p.join()
    finish = time.perf_counter()
    elapsed = round(finish-start,2)
    print(f'{elapsed} sec completed')
    print(the_sum)
Prints:
===== 100% completed =====
0.17 sec completed
1000
I am trying to measure how much time one of my spiders needs to parse a website. For that purpose, I have the following code:
class TestClass():
    """Time repeated crawls of ``MySpider``, each run in its own child process."""

    # Start/end timestamps of the crawl, set by run_experiment in the process
    # that executes it.
    t0 = 0
    t1 = 0

    def run_experiment(self, p):
        """Run one crawl and put its wall-clock duration in seconds on ``p``.

        ``p`` is the queue on which the duration is reported to the caller.
        """
        process = CrawlerProcess()
        process.crawl(MySpider)
        # Only process.start() is timed: it runs the reactor and blocks until
        # the crawl has finished. Running the reactor by hand beforehand (as
        # the earlier version did) completes the crawl before t0 is taken,
        # which makes t0 and t1 nearly equal.
        self.t0 = time.time()
        print("t0: ", self.t0)
        process.start()
        self.t1 = time.time()
        print("t1: ", self.t1)
        p.put(self.t1 - self.t0)

    def init(self):
        """Run 22 experiments one after another and print the total time."""
        total_time = 0
        for i in range(22):
            q = Queue()
            p = Process(target=self.run_experiment, args=(q,))
            p.start()
            # Read the result before joining the child.
            spent_time = q.get()
            # self.t0 / self.t1 here are this (parent) process's attributes;
            # the timestamps taken in the child only reach the parent via q.
            print("t0: ", self.t0, "--- t1: ", self.t1, " Spent time = ", spent_time)
            p.join()
            total_time = total_time + spent_time
        print(total_time)
class _main_test():
    # The guard lives in the class body, so it is evaluated once, when the
    # class statement itself executes.
    if __name__ == '__main__':
        testClass = TestClass()
        testClass.init()
The problem I am having is that both t0 and t1 have the same value, so it seems like the process has not finished by the time I compute t1. How could I make sure the process has finished by the time I calculate t1 without penalizing the test? (without sleeping the program or similar solutions, as those would increase the tested time).
I tried to run the following code:
import multiprocessing
import time
def init_queue():
    """Empty the global ``g_queue`` and refill it with the integers 0..9."""
    print("init g_queue start")
    # Drain whatever is currently queued.
    while True:
        if g_queue.empty():
            break
        g_queue.get()
    for value in range(10):
        g_queue.put(value)
    print("init g_queue end")
def task_io(task_id):
    """Consume items from the global ``g_queue``, logging each one.

    ``task_id`` only labels the printed messages.
    """
    print("IOTask[%s] start" % task_id)
    print("the size of queue is %s" % g_queue.qsize())
    while True:
        if g_queue.empty():
            break
        time.sleep(1)
        try:
            data = g_queue.get(block=True, timeout=1)
            print("IOTask[%s] get data: %s" % (task_id, data))
        except Exception as excep:
            # %s applies str() to the exception.
            print("IOTask[%s] error: %s" % (task_id, excep))
    print("IOTask[%s] end" % task_id)
# Module-level queue; the functions above reference it as a global.
g_queue = multiprocessing.Queue()

if __name__ == '__main__':
    print("the size of queue is %s" % g_queue.qsize())
    init_queue()
    print("the size of queue is %s" % g_queue.qsize())
    time_0 = time.time()
    # One task_io process per CPU, each given its index as task id.
    process_list = []
    for i in range(multiprocessing.cpu_count()):
        process_list.append(multiprocessing.Process(target=task_io, args=(i,)))
    for p in process_list:
        p.start()
    for p in process_list:
        if not p.is_alive():
            continue
        p.join()
    print("End:", time.time() - time_0, "\n")
what I got was the following:
the size of queue is 0
init g_queue start
init g_queue end
the size of queue is 10
IOTask[0] start
the size of queue is 0
IOTask[0] end
IOTask[1] start
the size of queue is 0
IOTask[1] end
('End:', 0.6480000019073486, '\n')
What I was expecting was
IOTask[0] start
the size of queue is 10
Because after initialization of g_queue, the size of queue was supposed to be 10, not 0. It seems like the queue is not in the shared memory. When the sub process starts, a copy of g_queue is created and its size is 0.
Why is multiprocessing.Queue not in shared memory? Please advise. Many thanks!
You should pass your g_queue as a parameter, then it will work.
demo for using multiprocessing with queue
import multiprocessing
import time
def long_time_calculate(n, result_queue):
    """Sleep for one second, then put ``n`` on ``result_queue``."""
    delay_seconds = 1
    time.sleep(delay_seconds)
    result_queue.put(n)
if __name__ == '__main__':
    pool_size = multiprocessing.cpu_count() * 2
    pool = multiprocessing.Pool(processes=pool_size, maxtasksperchild=4)
    # The queue is passed to the pool workers as a task argument, so it is a
    # Manager().Queue() proxy. The earlier version also created a plain
    # multiprocessing.Queue() that was immediately overwritten and never used.
    manager = multiprocessing.Manager()
    result_queue = manager.Queue()
    inputs = [(n, result_queue) for n in (1, 2, 3, 4)]
    # ``task_args`` rather than ``input``, which would shadow the builtin.
    for task_args in inputs:
        pool.apply_async(long_time_calculate, task_args)
    # No more tasks will be submitted; wait for every task to finish.
    pool.close()
    pool.join()
    print([result_queue.get() for _ in inputs])
Basically the more imports from different modules I include the longer these multiprocessing tasks take, even if none of the module functions are used. Is each process having to reimport everything or something? What is going on?
import time
time1 = time.time()
import multiprocessing as mp
import numpy as np # Random imports (not used)
import PIL
import PySide
import pandas
# print time.time() - time1 # here this prints 0.0
class Multi(object):
    """Run ``f`` in a single child process and print the value it reports."""

    def __init__(self, queue):
        # Queue on which the child process reports its result.
        self.q = queue

    def run(self, a):
        """Start ``f(a, self.q)`` in a child process and print its result."""
        # Use the queue given to the constructor; the earlier version passed
        # the module-level ``q``, which only worked because the script happens
        # to define a global with that name.
        p = mp.Process(target=f, args=(a, self.q))
        p.start()
        # Read the result before joining the child.
        print(self.q.get())
        p.join()
class MultiPool(object):
    """Run ``f1`` on a pool of ``N`` worker processes."""

    def __init__(self, N):
        self.N = N
        self.pool = mp.Pool(processes=self.N)

    def run(self):
        """Map ``f1`` over N one-element tuples and print the list of results."""
        task_args = [(i,) for i in range(self.N)]
        result = self.pool.map_async(f1, task_args)
        # print(...) with one argument behaves the same on Python 2 and 3.
        print(result.get())
def f(a, q):
    """Busy-loop over 10,000,000 integers, then put the last one on ``q``.

    ``a`` is accepted but not used.
    """
    iterations = 10000000
    for value in range(iterations):
        last = value
    q.put(last)
def f1(a):
    """Busy-loop over 10,000,000 integers and return the last one.

    ``a`` is accepted but not used.
    """
    iterations = 10000000
    for value in range(iterations):
        last = value
    return last
if __name__ == '__main__':
    q = mp.Queue()
    e = Multi(q)
    # time1 = time.time()
    # Case 1: f1 called directly in this process. time1 was set at the top of
    # the script, so this first measurement also includes the imports.
    # print(...) with one argument behaves the same on Python 2 and 3.
    print(f1(0))
    print(time.time() - time1)
    # Case 2: a single child process reporting through a queue.
    time1 = time.time()
    e.run('123')
    print(time.time() - time1)
    # Case 3: a pool of two worker processes.
    time1 = time.time()
    mpool = MultiPool(2)
    mpool.run()
    print(time.time() - time1)
# Output with random imports:
>9999999
>0.246000051498
>9999999
>0.693000078201
>[9999999, 9999999]
>0.720999956131
# Output without imports:
>9999999
>0.246000051498
>9999999
>0.315999984741
>[9999999, 9999999]
>0.313999891281
Yes, multiprocessing must import everything in every process, because each one is a process (a new application instance) and not a thread.
What you will measure with your script is the cost of executing the methods plus the cost of process creation. You can measure the cost of the imports separately; they are executed in place, exactly where the import statements are.
How do I speed up this test code that writes from Python to Redis on WinXP using Python 2.7?
Would multiprocessing be better? The load rate is 6000/s vs. publish rates of 100,000/s.
I chose 100,000, but could lower in testing. The process takes 15 seconds.
Would changing setting on server help???
import time
from time import strftime
import redis
import threading, Queue
start_time = time.time()
cxn = redis.StrictRedis('127.0.0.1',6379,1)
class WorkerMain(threading.Thread):
    """Thread that drains ``queue`` and stores each item as a Redis key."""

    def __init__(self, queue):
        threading.Thread.__init__(self)
        self.queue = queue

    def run(self):
        """Set one key per queued row until the queue is empty."""
        while True:
            try:  # take a job from the queue
                row = self.queue.get_nowait()
            except Queue.Empty:
                # Queue drained: end this thread by returning rather than by
                # raising SystemExit.
                return
            try:
                cxn.set(row, "Row")
            except Exception:
                # Keep the best-effort behaviour (report and continue), but no
                # longer swallow KeyboardInterrupt/SystemExit as a bare except
                # did.
                print('Setup Error')
if __name__ == '__main__':
    connections = 5
    sml = range(1,100000)
    # Fill the work queue with the keys before any worker starts.
    queue = Queue.Queue()
    for row in sml:
        queue.put(str(row))
    threads = []
    for dummy in range(connections):
        t = WorkerMain(queue)
        t.start()
        threads.append(t)
    # wait for all threads to finish
    for thread in threads:
        thread.join()
    # print("") emits an empty line on both Python 2 and Python 3.
    print("")
    end_time = time.time()
    duration = end_time - start_time
    print("Duration: %s" % duration)
I used the code below for multiprocessing and "monitored" the data with the CLI... not all of the data went into the server.
from multiprocessing import Pool
import time
import redis
start_time = time.time()
cxn = redis.Redis('127.0.0.1',6379,1)
def rset(var):
    """Store the string "value" in Redis under the key ``var``."""
    key = var
    cxn.set(key, "value")
if __name__ == '__main__':
    sml = range(1,10000)
    pool = Pool(processes=5)
    for row in sml:
        # The second argument is the argument tuple itself; ``[(row,)]`` passed
        # the tuple ``(row,)`` to rset as the key instead of ``row``.
        pool.apply_async(rset, (row,))
    # Wait for the queued tasks. Without close()/join() the script can reach
    # its end while tasks are still pending, so not every key gets written.
    pool.close()
    pool.join()
    end_time = time.time()
    duration = end_time - start_time
    # print(...) with one argument behaves the same on Python 2 and 3.
    print("Duration: %s" % duration)
Here is the pipelined code...... I just commented out the threading stuff.
# ``time.time()`` is called below and in the main block, so the ``time``
# module itself must be imported; importing ``strftime`` alone does not
# bind the name ``time``.
import time
from time import strftime
import redis
import threading, Queue
start_time = time.time()
cxn = redis.StrictRedis('127.0.0.1',6379,0)
# Non-transactional pipeline used by the main block below.
pipe = cxn.pipeline(transaction=False)
class WorkerMain(threading.Thread):
    """Thread that drains ``queue`` and stores each item as a Redis key."""

    def __init__(self, queue):
        threading.Thread.__init__(self)
        self.queue = queue

    def run(self):
        """Set one key per queued row until the queue is empty."""
        while True:
            try:  # take a job from the queue
                row = self.queue.get_nowait()
            except Queue.Empty:
                # Queue drained: end this thread by returning rather than by
                # raising SystemExit.
                return
            try:
                cxn.set(row, "Row")
            except Exception:
                # Keep the best-effort behaviour (report and continue), but no
                # longer swallow KeyboardInterrupt/SystemExit as a bare except
                # did.
                print('Setup Error')
if __name__ == '__main__':
    #connections = 5
    sml = range(1,100000)
    #queue = Queue.Queue()
    for row in sml:
        #queue.put(str(row))
        # Only buffer the command here. Calling .execute() after every set()
        # sends each command separately, which defeats the pipeline.
        pipe.set(str(row),"value")  # key, value
    # Send all buffered commands to the server in one batch.
    pipe.execute()
    # threads = []
    # for dummy in range(connections):
    #     t = WorkerMain(queue)
    #     t.start()
    #     threads.append(t)
    #
    # # wait for all threads to finish
    # for thread in threads:
    #     thread.join()
    # print("") emits an empty line on both Python 2 and Python 3.
    print("")
    end_time = time.time()
    duration = end_time - start_time
    print("Duration: %s" % duration)
Use Pipelines. A Pipeline batches commands so you don't pay for network overheads.
See :
Section on Pipelines over here https://github.com/andymccurdy/redis-py
Pipelining on Redis.io - http://redis.io/topics/pipelining
Using threading for better performance is not really a good idea if you use CPython (the standard Python interpreter), because of the GIL.
http://wiki.python.org/moin/GlobalInterpreterLock
multiprocessing should work better