How to set the maximum number of concurrent workers in multiprocessing? - python

Let's say we start with vartec's answer, which shows how to use a multiprocessing worker:
import multiprocessing

def worker(procnum, return_dict):
    '''worker function'''
    print str(procnum) + ' represent!'
    return_dict[procnum] = procnum

if __name__ == '__main__':
    manager = multiprocessing.Manager()
    return_dict = manager.dict()
    jobs = []
    for i in range(5):
        p = multiprocessing.Process(target=worker, args=(i, return_dict))
        jobs.append(p)
        p.start()

    for proc in jobs:
        proc.join()
    print return_dict.values()
I want to do the same thing, only limit the number of concurrent processes to X. How can I do that using workers?
Using pool/map is not really the best option here as I have a for loop like this:
for item in items:
    result = heavy_lifting_which_cannot_be_parallelized(item)
    process_result_in_a_way_that_can_be_parallelized(result)
Therefore I'd like to start process_result_in_a_way_that_can_be_parallelized and continue with my for loop, rather than waiting until the for loop has ended and only then multiprocessing, which would be much more time-consuming.

You do not have to use map with a Pool. You can use apply_async to submit jobs to the pool on your own schedule.
pool = multiprocessing.Pool(processes=3)
for i in range(30):
    pool.apply_async(worker, (i, return_dict))
pool.close()
pool.join()
print return_dict.values()
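Applied to the loop from the question, a minimal sketch might look like the following (the two heavy_lifting/process_result functions are placeholders standing in for the ones named above, and the pool size of 3 is arbitrary): the serial step runs in the main process, and each result is handed off to the pool without waiting for it to finish.

import multiprocessing

# Hypothetical stand-ins for the functions named in the question.
def heavy_lifting_which_cannot_be_parallelized(item):
    return item * 2

def process_result_in_a_way_that_can_be_parallelized(result):
    print("processing", result)

if __name__ == '__main__':
    items = range(10)
    pool = multiprocessing.Pool(processes=3)  # at most 3 workers run at once
    for item in items:
        # The serial, heavy step stays in the main process.
        result = heavy_lifting_which_cannot_be_parallelized(item)
        # Hand the follow-up work to the pool and continue the loop immediately.
        pool.apply_async(process_result_in_a_way_that_can_be_parallelized, (result,))
    pool.close()
    pool.join()  # wait for all queued work before exiting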

Related

Multiprocessing in Python using Process, how to limit the number of processes

I'd like to run multiple processes concurrently, but when I use Process I cannot limit the number of processes running at a time, so my computer becomes unusable for anything else.
In my problem I have to run main_function for all of the data in my_dataset. Here is a short sample of my code; is it possible to limit the number of processes running at a time?
from multiprocessing import Process

def my_function(my_dataset):
    processes = []
    for data in my_dataset:
        transformed_data = transform(data)
        p = Process(target=main_function, args=(data, transformed_data))
        p.start()
        processes.append(p)
    for p in processes:
        p.join()
You can use multiprocessing's Pool:
https://docs.python.org/3/library/multiprocessing.html#multiprocessing.pool.Pool
from multiprocessing import Pool

names = ["Joe", "James", "Jimmy"] * 10

def print_name(name):
    print(f"Got Name: {name}")

def runner():
    p = Pool(4)
    p.map(print_name, names)

if __name__ == "__main__":
    runner()
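Adapted to the code from the question, a sketch along the same lines could look like this (transform and main_function are the names from the question; their bodies here are invented placeholders, and the pool size of 4 is arbitrary). Pool(4) guarantees that at most 4 worker processes run at any time, no matter how large the dataset is.

from multiprocessing import Pool

# Placeholder implementations for the functions named in the question.
def transform(data):
    return data * 10

def main_function(data, transformed_data):
    print(data, transformed_data)

def my_function(my_dataset, max_workers=4):
    # Pre-compute the arguments, then let a fixed-size pool work through them.
    args = [(data, transform(data)) for data in my_dataset]
    with Pool(max_workers) as pool:
        pool.starmap(main_function, args)

if __name__ == "__main__":
    my_function(range(20))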

Pass Value("i", 0) to process

I have this code (it is a snippet from my program):
from multiprocessing import Process, Manager, cpu_count, Pool, Value, Lock

def grab_future_products(p, test):
    print("Starting process %s" % p)

if __name__ == "__main__":  # Main program
    n = 4
    test = Value('i', 0)
    pool = Pool(processes=n)  # n processes per every CPU core
    for i in range(n):
        pool.apply_async(grab_future_products, args=(i, test))
    pool.close()
    pool.join()
If I run it with python test.py I get no output, no errors, just nothing.
I wanted to use the variable test as a shared integer between all processes, so that in another process I can do something like:
if test.value == X:
    break
The interesting thing is that if I replace args=(i, test) with args=(i, 1), it works as desired.
So my question is: why can I not pass a Value() object into the process? And how can I solve this problem?
Many thanks.
The reason you see nothing is that a plain Value() cannot be passed to pool workers (synchronized objects can only be shared through inheritance), and because apply_async reports errors only through the result object's .get(), the failure is silent. The trick is to use multiprocessing.Manager, as also mentioned here: Sharing a result queue among several processes:
from multiprocessing import Process, Manager, cpu_count, Pool, Value, Lock

def grab_future_products(p, test):
    print("Starting process %s, value=%i" % (p, test.value))

if __name__ == "__main__":  # Main program
    n = 4
    pool = Pool(processes=n)  # n processes per every CPU core
    m = Manager()
    v = m.Value('i', 0)
    for i in range(n):
        res = pool.apply_async(grab_future_products, args=(i, v))
    pool.close()
    pool.join()
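As a quick check (a sketch, not part of the original answer), calling .get() on each AsyncResult makes any worker-side or dispatching error visible instead of disappearing silently, which is also the easiest way to diagnose the original no-output behaviour:

from multiprocessing import Pool, Manager

def grab_future_products(p, test):
    print("Starting process %s, value=%i" % (p, test.value))

if __name__ == "__main__":
    n = 4
    m = Manager()
    v = m.Value('i', 0)
    results = []
    with Pool(processes=n) as pool:
        for i in range(n):
            results.append(pool.apply_async(grab_future_products, args=(i, v)))
        # .get() re-raises any exception that happened while dispatching or
        # running the task, so a bad argument no longer fails silently.
        for res in results:
            res.get()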

Python multiprocessing with Queue (split loads dynamically)

I am trying to use multiprocessing to process a very large number of files.
I tried to put the list of files into a queue and have 3 workers split the load through a common Queue. However, this does not seem to work. I am probably misunderstanding the queue in the multiprocessing package.
Below is the example source code:
import multiprocessing
from multiprocessing import Queue

def worker(i, qu):
    """worker function"""
    while ~qu.empty():
        val = qu.get()
        print 'Worker:', i, ' start with file:', val
        j = 1
        for k in range(i*10000, (i+1)*10000):  # some time consuming process
            for j in range(i*10000, (i+1)*10000):
                j = j + k
        print 'Worker:', i, ' end with file:', val

if __name__ == '__main__':
    jobs = []
    qu = Queue()
    for j in range(100, 110):  # files numbers are from 100 to 110
        qu.put(j)
    for i in range(3):  # 3 multiprocess
        p = multiprocessing.Process(target=worker, args=(i, qu))
        jobs.append(p)
        p.start()
    p.join()
Thanks for the comments.
I came to realize that using Pool is the best solution.
import multiprocessing
import time

def worker(val):
    """worker function"""
    print 'Worker: start with file:', val
    time.sleep(1.1)
    print 'Worker: end with file:', val

if __name__ == '__main__':
    file_list = range(100, 110)
    p = multiprocessing.Pool(2)
    p.map(worker, file_list)
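If the results need to be consumed as each file finishes rather than after map() has returned everything, a sketch along the same lines could use imap_unordered (the pool size of 2 and the sleep are placeholders mirroring the example above):

import multiprocessing
import time

def worker(val):
    """Pretend to process one file and return a status string."""
    time.sleep(1.1)
    return 'done with file %d' % val

if __name__ == '__main__':
    file_list = range(100, 110)
    with multiprocessing.Pool(2) as pool:
        # imap_unordered yields each result as soon as its worker finishes,
        # so the load is split dynamically between the two processes.
        for status in pool.imap_unordered(worker, file_list):
            print(status)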
Three issues:
1) you are joining only on the 3rd process
2) why not use multiprocessing.Pool?
3) race condition on qu.get()
1 & 3)
import multiprocessing
from multiprocessing import Queue
from Queue import Empty  # on Python 3: from queue import Empty

def worker(i, qu):
    """worker function"""
    while 1:
        try:
            val = qu.get(timeout=1)
        except Empty:
            break  # Yay no race condition
        print 'Worker:', i, ' start with file:', val
        j = 1
        for k in range(i*10000, (i+1)*10000):  # some time consuming process
            for j in range(i*10000, (i+1)*10000):
                j = j + k
        print 'Worker:', i, ' end with file:', val

if __name__ == '__main__':
    jobs = []
    qu = Queue()
    for j in range(100, 110):  # files numbers are from 100 to 110
        qu.put(j)
    for i in range(3):  # 3 multiprocess
        p = multiprocessing.Process(target=worker, args=(i, qu))
        jobs.append(p)
        p.start()
    for p in jobs:  # <--- join on all processes ...
        p.join()
2) For how to use the Pool, see:
https://docs.python.org/2/library/multiprocessing.html
You are joining only the last of your created processes. That means if the first or the second process is still working while the third is finished, your main process is going down and kills the remaining processes before they are finished.
You should join them all in order to wait until they are finished:
for p in jobs:
    p.join()
Another thing is you should consider using qu.get_nowait() in order to get rid of the race condition between qu.empty() and qu.get().
For example:
try:
    while 1:
        message = self.queue.get_nowait()
        """ do something fancy here """
except Queue.Empty:
    pass
I hope that helps

Python Multiprocessing Pipe "Deadlock"

I'm facing problems with the following example code:
from multiprocessing import Lock, Process, Queue, current_process

def worker(work_queue, done_queue):
    for item in iter(work_queue.get, 'STOP'):
        print("adding ", item, "to done queue")
        #this works: done_queue.put(item*10)
        done_queue.put(item*1000)  # this doesn't!
    return True

def main():
    workers = 4
    work_queue = Queue()
    done_queue = Queue()
    processes = []
    for x in range(10):
        work_queue.put("hi" + str(x))
    for w in range(workers):
        p = Process(target=worker, args=(work_queue, done_queue))
        p.start()
        processes.append(p)
        work_queue.put('STOP')
    for p in processes:
        p.join()
    done_queue.put('STOP')
    for item in iter(done_queue.get, 'STOP'):
        print(item)

if __name__ == '__main__':
    main()
When the done queue becomes big enough (a limit of about 64k, I think), the whole thing freezes without any further notice.
What is the general approach for such a situation when the queue becomes too big? Is there some way to remove elements on the fly once they are processed? The Python docs recommend removing the p.join(); in a real application, however, I cannot estimate when the processes have finished. Is there a simple solution to this problem besides looping indefinitely and using .get_nowait()?
This works for me with 3.4.0alpha4, 3.3, 3.2, 3.1 and 2.6. It tracebacks with 2.7 and 3.0. I pylint'd it, BTW. The key change is that the main process drains done_queue, counting one 'STOP' sentinel per worker, before joining the workers; joining first is what deadlocks once the pipe's buffer fills, because a process that has put items on a queue waits at exit until the queue's feeder thread has flushed them all.
#!/usr/local/cpython-3.3/bin/python

'''SSCCE for a queue deadlock'''

import sys
import multiprocessing

def worker(workerno, work_queue, done_queue):
    '''Worker function'''
    #reps = 10 # this worked for the OP
    #reps = 1000 # this worked for me
    reps = 10000 # this didn't
    for item in iter(work_queue.get, 'STOP'):
        print("adding", item, "to done queue")
        #this works: done_queue.put(item*10)
        for thing in item * reps:
            #print('workerno: {}, adding thing {}'.format(workerno, thing))
            done_queue.put(thing)
    done_queue.put('STOP')
    print('workerno: {0}, exited loop'.format(workerno))
    return True

def main():
    '''main function'''
    workers = 4
    work_queue = multiprocessing.Queue(maxsize=0)
    done_queue = multiprocessing.Queue(maxsize=0)
    processes = []
    for integer in range(10):
        work_queue.put("hi" + str(integer))
    for workerno in range(workers):
        dummy = workerno
        process = multiprocessing.Process(target=worker, args=(workerno, work_queue, done_queue))
        process.start()
        processes.append(process)
        work_queue.put('STOP')
    itemno = 0
    stops = 0
    while True:
        item = done_queue.get()
        itemno += 1
        sys.stdout.write('itemno {0}\r'.format(itemno))
        if item == 'STOP':
            stops += 1
            if stops == workers:
                break
    print('exited done_queue empty loop')
    for workerno, process in enumerate(processes):
        print('attempting process.join() of workerno {0}'.format(workerno))
        process.join()
    done_queue.put('STOP')

if __name__ == '__main__':
    main()
HTH
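The same idea in isolation (a minimal sketch, not part of the original answer): drain the results queue until every worker's sentinel has been seen, and only then join, so no child is ever left blocked with undelivered items in its queue buffer.

import multiprocessing

def worker(work_queue, done_queue):
    # Process items until the 'STOP' sentinel, then announce completion.
    for item in iter(work_queue.get, 'STOP'):
        done_queue.put(item * 2)
    done_queue.put('STOP')

if __name__ == '__main__':
    workers = 4
    work_queue = multiprocessing.Queue()
    done_queue = multiprocessing.Queue()
    processes = []
    for x in range(1000):
        work_queue.put(x)
    for _ in range(workers):
        p = multiprocessing.Process(target=worker, args=(work_queue, done_queue))
        p.start()
        processes.append(p)
        work_queue.put('STOP')  # one sentinel per worker
    stops = 0
    while stops < workers:  # drain BEFORE joining
        item = done_queue.get()
        if item == 'STOP':
            stops += 1
        else:
            print(item)
    for p in processes:  # safe to join now: nothing is left buffered
        p.join()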

How do you pass a Queue reference to a function managed by pool.map_async()?

I want a long-running process to return its progress over a Queue (or something similar) which I will feed to a progress bar dialog. I also need the result when the process is completed. A test example here fails with a RuntimeError: Queue objects should only be shared between processes through inheritance.
import multiprocessing, time

def task(args):
    count = args[0]
    queue = args[1]
    for i in xrange(count):
        queue.put("%d mississippi" % i)
    return "Done"

def main():
    q = multiprocessing.Queue()
    pool = multiprocessing.Pool()
    result = pool.map_async(task, [(x, q) for x in range(10)])
    time.sleep(1)
    while not q.empty():
        print q.get()
    print result.get()

if __name__ == "__main__":
    main()
I've been able to get this to work using individual Process objects (where I am allowed to pass a Queue reference), but then I don't have a pool to manage the many processes I want to launch. Any advice on a better pattern for this?
The following code seems to work:
import multiprocessing, time

def task(args):
    count = args[0]
    queue = args[1]
    for i in xrange(count):
        queue.put("%d mississippi" % i)
    return "Done"

def main():
    manager = multiprocessing.Manager()
    q = manager.Queue()
    pool = multiprocessing.Pool()
    result = pool.map_async(task, [(x, q) for x in range(10)])
    time.sleep(1)
    while not q.empty():
        print q.get()
    print result.get()

if __name__ == "__main__":
    main()
Note that the Queue comes from manager.Queue() rather than multiprocessing.Queue(): the manager's queue proxy can be pickled and passed through map_async, which a plain Queue cannot. Thanks Alex for pointing me in this direction.
Making q global works...:
import multiprocessing, time

q = multiprocessing.Queue()

def task(count):
    for i in xrange(count):
        q.put("%d mississippi" % i)
    return "Done"

def main():
    pool = multiprocessing.Pool()
    result = pool.map_async(task, range(10))
    time.sleep(1)
    while not q.empty():
        print q.get()
    print result.get()

if __name__ == "__main__":
    main()
If you need multiple queues, e.g. to avoid mixing up the progress of the various pool processes, a global list of queues should work (of course, each process will then need to know what index in the list to use, but that's OK to pass as an argument;-).
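A minimal sketch of that idea (the three-queue split and the payloads are invented for illustration, and like the answer above it relies on the pool workers inheriting the module-level queues): each task is told the index of the global queue it should report to.

import multiprocessing, time

NUM_QUEUES = 3
# One global queue per progress "slot"; workers pick theirs by index.
queues = [multiprocessing.Queue() for _ in range(NUM_QUEUES)]

def task(args):
    count, queue_index = args
    for i in range(count):
        queues[queue_index].put("%d mississippi" % i)
    return "Done"

def main():
    pool = multiprocessing.Pool()
    jobs = [(x, x % NUM_QUEUES) for x in range(10)]
    result = pool.map_async(task, jobs)
    time.sleep(1)
    for index, q in enumerate(queues):
        while not q.empty():
            print(index, q.get())
    print(result.get())

if __name__ == "__main__":
    main()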
