I would like to put output data into a queue in a multiprocessing computation. It seems that when the size of the returned data is too large, the program gets stuck. To illustrate the problem, here is a minimal example. Can anyone help make this work?
from multiprocessing import Process, Queue
import numpy as np

def foo(q, qid):
    x = np.random.randint(0, 5, 7)
    y = np.random.random(100*10*10).reshape(100, 10, 10)
    q.put([qid, x, y])

def main():
    processes = []
    q = Queue()
    for qid in range(5):
        p = Process(target=foo, args=(q, qid))
        p.start()
        processes.append(p)
    for process in processes:
        process.join()
    for qid in range(5):
        [_, x, y] = q.get()
        print(x)
        print(y)

if __name__ == '__main__':
    main()
I figured out one solution, shown below: swap the join and the get, draining the queue before joining the processes. A child process that has put large items on a Queue does not exit until that data has been flushed to the underlying pipe, so joining it before the queue is emptied can deadlock. By default, the get method blocks.
from multiprocessing import Process, Queue
import numpy as np

def foo(q, qid):
    x = np.random.randint(0, 5, 7)
    y = np.random.random(100*10*10).reshape(100, 10, 10)
    q.put([qid, x, y])

def main():
    processes = []
    q = Queue()
    for qid in range(5):
        p = Process(target=foo, args=(q, qid))
        p.start()
        processes.append(p)
    for qid in range(5):
        [_, x, y] = q.get()
        print(x)
        print(y)
    for process in processes:
        process.join()

if __name__ == '__main__':
    main()
I want to implement the producer-consumer pattern using multiprocessing.pool.Pool.
Since a JoinableQueue cannot be used with Pool (it raises RuntimeError: JoinableQueue objects should only be shared between processes through inheritance), I have to use multiprocessing.Manager(), inspired by this answer.
The problem is that the program may now hang when there are more consumer jobs than producer jobs.
import queue
import random
from multiprocessing import Manager, Pool

def consumer(q):
    while True:
        try:
            res = q.get(block=False)
            if res is None:
                break
            print(f'Consume {res}')
        except queue.Empty:
            pass

def producer(q, food):
    for i in range(2):
        res = f'{food} {i}'
        print(f'Produce {res}')
        q.put(res)
    q.put(None)  # sentinel

if __name__ == "__main__":
    with Pool() as pool:
        jobs = 2
        foods = ['apple', 'banana', 'melon', 'salad']
        q = Manager().Queue()

        [pool.apply_async(func=consumer, args=(q, )) for _ in range(jobs + 1)]  # would hang
        # does not hang only when the number of consumer jobs is equal to or less than the number of producer jobs
        # [pool.apply_async(func=consumer, args=(q, )) for _ in range(jobs)]

        [
            pool.apply_async(func=producer, args=(q, random.choice(foods)))
            for _ in range(jobs)
        ]

        pool.close()
        pool.join()
It seems the extra consumers never receive a sentinel and just wait there forever.
So what's the elegant way to implement the producer-consumer pattern in multiprocessing.pool.Pool?
Or is it only possible with multiprocessing.Process + JoinableQueue?
You can use a multiprocessing.JoinableQueue by having your process pool workers access it as a global variable that gets initialized by a pool initializer:
import multiprocessing

def init_pool(input_q, output_q):
    global in_q, out_q
    in_q = input_q
    out_q = output_q

def worker():
    print(type(in_q))

# required by Windows
if __name__ == '__main__':
    in_q = multiprocessing.JoinableQueue()
    out_q = multiprocessing.JoinableQueue()
    pool = multiprocessing.Pool(2, initializer=init_pool, initargs=(in_q, out_q))
    pool.apply(worker)
Prints:
<class 'multiprocessing.queues.JoinableQueue'>
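To sketch how this could actually be used (this extension is my own, not part of the original answer): the same initializer pattern lets pool tasks consume from in_q, report results on out_q, and call task_done so the main process can in_q.join(). The worker body here is a made-up placeholder that just doubles each item:

import multiprocessing

def init_pool(input_q, output_q):
    global in_q, out_q
    in_q = input_q
    out_q = output_q

def worker(_):
    # each call consumes exactly one item from the shared JoinableQueue
    item = in_q.get()
    out_q.put(item * 2)   # placeholder "work": double the item
    in_q.task_done()

if __name__ == '__main__':
    in_q = multiprocessing.JoinableQueue()
    out_q = multiprocessing.JoinableQueue()
    for i in range(4):
        in_q.put(i)
    with multiprocessing.Pool(2, initializer=init_pool, initargs=(in_q, out_q)) as pool:
        pool.map(worker, range(4))  # 4 calls, one queue item each
        in_q.join()                 # returns once every put has been matched by task_done
    for _ in range(4):
        print(out_q.get())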
Using multiprocessing.Process + JoinableQueue seems like the more elegant way.
import queue
import random
from multiprocessing import JoinableQueue, Process

def consumer(q: JoinableQueue):
    while True:
        try:
            res = q.get(block=False)
            print(f'Consume {res}')
            q.task_done()
        except queue.Empty:
            pass

def producer(q: JoinableQueue, food):
    for i in range(2):
        res = f'{food} {i}'
        print(f'Produce {res}')
        q.put(res)
    q.join()

if __name__ == "__main__":
    foods = ['apple', 'banana', 'melon', 'salad']
    jobs = 2
    q = JoinableQueue()

    producers = [
        Process(target=producer, args=(q, random.choice(foods)))
        for _ in range(jobs)
    ]

    # daemon=True is important here
    consumers = [
        Process(target=consumer, args=(q, ), daemon=True)
        for _ in range(jobs * 2)
    ]

    # + order here doesn't matter
    for p in consumers + producers:
        p.start()

    for p in producers:
        p.join()
In funkid's self-answer, when using JoinableQueue you no longer have to send/receive the trailing None sentinel in the producer/consumer. All producers wait until every item in the queue has been consumed (after the final task_done) and are then joined by the main process.
As pointed out by #Koby,
daemon=True is important because it allows the consumers (blocked by the empty queue) to be killed when the main process terminates.
Modified program:
import random
from multiprocessing import JoinableQueue, Process

def consumer(q: JoinableQueue):
    while True:
        res = q.get()
        print(f'Consume {res}')
        q.task_done()

def producer(q: JoinableQueue, food):
    for i in range(2):
        res = f'{food} {i}'
        print(f'Produce {res}')
        q.put(res)
    q.join()

if __name__ == "__main__":
    foods = ['apple', 'banana', 'melon', 'salad']
    jobs = 2
    q = JoinableQueue()

    producers = [
        Process(target=producer, args=(q, random.choice(foods)))
        for _ in range(jobs)
    ]

    # daemon=True is important here
    consumers = [
        Process(target=consumer, args=(q, ), daemon=True)
        for _ in range(jobs * 2)
    ]

    # + order here doesn't matter
    for p in consumers + producers:
        p.start()

    for p in producers:
        p.join()
When running this code, it just prints out an empty array at the end:
[]
So why is it not appending either the value a or the value b?
import multiprocessing as mu

array_values = []

def a(array):
    array.append('a')

def b(array):
    array.append('b')

def runInParallel(*fns):
    z = 0
    while z < 6:
        if __name__ == '__main__':
            proc = []
            for fn in fns:
                p = mu.Process(target=fn, args=(array_values,))
                p.start()
                proc.append(p)
            for p in proc:
                p.join()
        z += 1

runInParallel(a, b)
print(array_values)
DESIRED FINAL OUTPUT OF FUNCTION:
['a','b','a','b','a','b','a','b','a','b','a','b']
Thanks in advance!
The reason it doesn't work is that multiprocessing doesn't use shared memory: each child process gets its own copy of array_values, so appends made in the children never reach the parent's list.
You can use the following code to get your desired output (it uses threading, which does share memory):
import threading

array_values = []

def a(array):
    array.append('a')

def b(array):
    array.append('b')

def runInParallel(*fns):
    z = 0
    while z < 6:
        if __name__ == '__main__':
            proc = []
            for fn in fns:
                p = threading.Thread(target=fn, args=(array_values,))
                p.start()
                proc.append(p)
            for p in proc:
                p.join()
        z += 1

runInParallel(a, b)
print(array_values)
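If you want to stay with processes instead of threads, one option (my own sketch, not part of the original answer) is to share the list through a multiprocessing.Manager, so appends made in child processes are reflected in the parent:

import multiprocessing as mu

def a(array):
    array.append('a')

def b(array):
    array.append('b')

def runInParallel(*fns):
    # a Manager proxies the list, so child-process appends reach the parent
    with mu.Manager() as manager:
        array_values = manager.list()
        for _ in range(6):
            proc = []
            for fn in fns:
                p = mu.Process(target=fn, args=(array_values,))
                p.start()
                proc.append(p)
            for p in proc:
                p.join()
        return list(array_values)  # copy before the manager shuts down

if __name__ == '__main__':
    print(runInParallel(a, b))

Note that within each round the order of 'a' and 'b' is not guaranteed, since the two processes run concurrently.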
I want to start 4 processes that put an integer into a queue whenever a counter is divisible by 100, while another process continuously reads the queue and prints the values. Please correct my code so it runs; I am getting the error 'Queue' object is not iterable.
from multiprocessing import Lock, Process, Queue, current_process
import time
import queue

def doFirstjob(process_Queue):
    i = 0
    while True:
        if i % 100 == 0:
            process_Queue.put(i)
        else:
            i += 1

def doSecondjob(process_Queue):
    while(1):
        if not process_Queue.Empty:
            task = process_Queue.get()
            print("task: ", task)
        else:
            time.sleep(0.2)

def main():
    number_of_processes = 4
    process_Queue = Queue()
    processes = []
    process_Queue.put(1)
    q = Process(target=doSecondjob, args=(process_Queue))
    q.start()
    for w in range(number_of_processes):
        p = Process(target=doFirstjob, args=(process_Queue))
        processes.append(p)
        p.start()

if __name__ == '__main__':
    main()
You were getting the error because Process expects a tuple (or list) for args, so (process_Queue) must be written as (process_Queue,).
Also, empty is a method, so it should be process_Queue.empty(), not process_Queue.Empty.
Change the code as below.
from multiprocessing import Lock, Process, Queue, current_process
import time
import queue

def doFirstjob(process_Queue):
    i = 0
    while True:
        print("foo")
        if i % 100 == 0:
            process_Queue.put(i)
        else:
            i += 1

def doSecondjob(process_Queue):
    while(1):
        print("bar")
        if not process_Queue.empty():
            task = process_Queue.get()
            print("task: ", task)
        else:
            time.sleep(0.2)

def main():
    number_of_processes = 4
    process_Queue = Queue()
    processes = []
    process_Queue.put(1)
    q = Process(target=doSecondjob, args=(process_Queue,))
    q.start()
    for w in range(number_of_processes):
        p = Process(target=doFirstjob, args=(process_Queue,))
        processes.append(p)
        p.start()

if __name__ == '__main__':
    main()
from multiprocessing import Process, Queue

class A:
    def F1(self, q, code1, code2):
        data = -1
        q.put(data)

    def F2(self, q, code1):
        q2 = Queue()
        for i in range(10):
            '''
            some processing here
            '''
            p = Process(target=self.F1, args=(q2, i, j))
            p.start()
            print(q2.get())
            p.join()

    def Handler(self):
        q = Queue()
        for i in range(10):
            p = Process(target=self.F2, args=(q, i))
            p.start()
            print(q.get())
            p.join()

if __name__ == "__main__":
    app = A()
    app.Handler()
After executing it, I observed that the code runs sequentially, not utilizing multiprocessing. I can't figure out the reason.
The problem is that by calling q2.get (for instance) inside the for loop, you wait for each process to finish before starting the next one. Your dispatchers could be changed as follows to get all 10 working in the background simultaneously.
def F2(self, q, code1):
    q2 = Queue()
    processes = []
    # start all of the processes
    for i in range(10):
        '''
        some processing here
        '''
        p = Process(target=self.F1, args=(q2, i, j))
        p.start()
        processes.append(p)
    # get data for all processes. buggy because an exception in the
    # child is not caught and will cause program to hang
    for i in range(10):
        print(q2.get())
    # dispose of the processes
    for p in processes:
        p.join()
multiprocessing already has the Pool class that does the work for you and handles exceptions to boot.
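As a rough illustration of that point (my own sketch, not the poster's code, with square standing in for whatever F1 really computes), the inner dispatch can be collapsed into a single pool call:

from multiprocessing import Pool

def square(i):
    # stand-in for the real work done in F1
    return i * i

if __name__ == "__main__":
    with Pool() as pool:
        # dispatches all 10 tasks to the pool, gathers results in order,
        # and re-raises any child exception in the parent instead of hanging
        results = pool.map(square, range(10))
    print(results)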
I'm facing problems with the following example code:
from multiprocessing import Lock, Process, Queue, current_process

def worker(work_queue, done_queue):
    for item in iter(work_queue.get, 'STOP'):
        print("adding ", item, "to done queue")
        # this works: done_queue.put(item*10)
        done_queue.put(item*1000)  # this doesn't!
    return True

def main():
    workers = 4
    work_queue = Queue()
    done_queue = Queue()
    processes = []
    for x in range(10):
        work_queue.put("hi" + str(x))
    for w in range(workers):
        p = Process(target=worker, args=(work_queue, done_queue))
        p.start()
        processes.append(p)
        work_queue.put('STOP')
    for p in processes:
        p.join()
    done_queue.put('STOP')
    for item in iter(done_queue.get, 'STOP'):
        print(item)

if __name__ == '__main__':
    main()
When the done queue becomes big enough (a limit of about 64 KB, I think), the whole thing freezes without any further notice.
What is the general approach for such a situation, when the queue becomes too big? Is there some way to remove elements on the fly once they are processed? The Python docs recommend removing the p.join(); in a real application, however, I cannot estimate when the processes have finished. Is there a simple solution for this problem besides looping indefinitely and using .get_nowait()?
This works for me with 3.4.0alpha4, 3.3, 3.2, 3.1 and 2.6. It raises a traceback with 2.7 and 3.0. I pylint'd it, BTW.
#!/usr/local/cpython-3.3/bin/python

'''SSCCE for a queue deadlock'''

import sys
import multiprocessing

def worker(workerno, work_queue, done_queue):
    '''Worker function'''
    #reps = 10 # this worked for the OP
    #reps = 1000 # this worked for me
    reps = 10000 # this didn't
    for item in iter(work_queue.get, 'STOP'):
        print("adding", item, "to done queue")
        #this works: done_queue.put(item*10)
        for thing in item * reps:
            #print('workerno: {}, adding thing {}'.format(workerno, thing))
            done_queue.put(thing)
    done_queue.put('STOP')
    print('workerno: {0}, exited loop'.format(workerno))
    return True

def main():
    '''main function'''
    workers = 4
    work_queue = multiprocessing.Queue(maxsize=0)
    done_queue = multiprocessing.Queue(maxsize=0)
    processes = []
    for integer in range(10):
        work_queue.put("hi" + str(integer))
    for workerno in range(workers):
        dummy = workerno
        process = multiprocessing.Process(target=worker, args=(workerno, work_queue, done_queue))
        process.start()
        processes.append(process)
        work_queue.put('STOP')
    itemno = 0
    stops = 0
    while True:
        item = done_queue.get()
        itemno += 1
        sys.stdout.write('itemno {0}\r'.format(itemno))
        if item == 'STOP':
            stops += 1
            if stops == workers:
                break
    print('exited done_queue empty loop')
    for workerno, process in enumerate(processes):
        print('attempting process.join() of workerno {0}'.format(workerno))
        process.join()
    done_queue.put('STOP')

if __name__ == '__main__':
    main()
HTH