Use generator to iterate through data from a multiprocess - python

I would like to perform the following below using multiprocess, instead of subprocess.Popen. This is because I cannot pass objects using popen. I know my simple example below does not use/pass objects, but that is what I want to do.
Sample code is:
main.py
import subprocess
class ProcReader():
def __init__(self, python_file):
self.proc = subprocess.Popen(['python', python_file], stdout=subprocess.PIPE)
def __iter__(self):
return self
def __next__(self):
while True:
line = self.proc.stdout.readline()
if not line:
raise StopIteration
return line
if __name__ == "__main__":
r1 = ProcReader("test1.py")
r2 = ProcReader("test2.py")
r3 = ProcReader("test3.py")
for l1, l2, l3 in zip(r1, r2, r3):
d1 = l1.decode('utf-8').strip().split(",")
d2 = l2.decode('utf-8').strip().split(",")
d3 = l3.decode('utf-8').strip().split(",")
print(f"{d1[0]}:{d1[1]},{d2[0]}:{d2[1]},{d3[1]}:{d3[1]}")
test#.py
for x in range(10):
print("test1,{}".format(x))
My sample code is in python3, but I would like an equivalent, using multiprocess, in python2.7. Should the equivalent also read from stdout? Or should it utilize the queue and just have a worker reading from the queue?
Update---------
My example using multiprocessing:
import time
from multiprocessing import Process, Queue
def writer1(queue):
for x in range(10):
time.sleep(1)
queue.put("test1,{}".format(x))
def writer2(queue):
for x in range(10):
time.sleep(2)
queue.put("test2,{}".format(x))
def writer3(queue):
for x in range(10):
queue.put("test3,{}".format(x))
if __name__=='__main__':
q1 = Queue()
q2 = Queue()
q3 = Queue()
writer_1 = Process(target=writer1, args=((q1),))
writer_1.daemon = True
writer_1.start()
writer_2 = Process(target=writer2, args=((q2),))
writer_2.daemon = True
writer_2.start()
writer_3 = Process(target=writer3, args=((q3),))
writer_3.daemon = True
writer_3.start()
while True:
msg1 = q1.get()
msg2 = q2.get()
msg3 = q3.get()
if msg1 and msg2 and msg3:
d1 = msg1.strip().split(",")
d2 = msg2.strip().split(",")
d3 = msg3.strip().split(",")
print("{}:{},{}:{},{}:{}".format(d1[0],d1[1],
d2[0],d2[1],
d3[0],d3[1]))
else:
break
Didnt realize q1.get() waits until something is there, I added sleep to verify this. Also, how do I check that the process is done writing? Seems to be just waiting at the end

To adapt your second example for my comment about sentinel objects, maybe you're looking for something like
import os
import time
from multiprocessing import Process, Queue
def writer(queue):
value = os.getpid()
for x in range(10):
time.sleep(0.1)
queue.put("{},{}".format(value, x))
queue.put(None)
def spawn_process():
q = Queue()
p = Process(target=writer, args=(q,))
p.daemon = True
p.start()
return (p, q)
if __name__ == "__main__":
processes_and_queues = [spawn_process() for x in range(3)]
processes, queues = zip(*processes_and_queues)
live_queues = list(queues)
while live_queues:
messages = []
for queue in live_queues:
message = queue.get()
if message is None:
live_queues.remove(queue)
messages.append(message)
if len(messages) == len(processes):
print(messages)
It outputs (e.g.)
['51748,0', '51749,0', '51750,0']
['51748,1', '51749,1', '51750,1']
['51748,2', '51749,2', '51750,2']
['51748,3', '51749,3', '51750,3']
['51748,4', '51749,4', '51750,4']
['51748,5', '51749,5', '51750,5']
['51748,6', '51749,6', '51750,6']
['51748,7', '51749,7', '51750,7']
['51748,8', '51749,8', '51750,8']
['51748,9', '51749,9', '51750,9']

Related

Share queue between processes

I am pretty new to multiprocessing in python and trying to achieve something which should be a rather common thing to do. But I cannot find an easy way when searching the web.
I want to put data in a queue and then make this queue available to different consumer functions. Of course when getting an element from the queue, all consumer functions should get the same element. The following example should make clear what I want to achieve:
from multiprocessing import Process, Queue
def producer(q):
for i in range(10):
q.put(i)
q.put(None)
def consumer1(q):
while True:
data = q.get()
if data is None:
break
print(data)
def consumer2(q):
while True:
data = q.get()
if data is None:
break
print(data)
def main():
q = Queue()
p1 = Process(target=producer, args=(q,))
p2 = Process(target=consumer1, args=(q,))
p3 = Process(target=consumer2, args=(q,))
p1.start()
p2.start()
p3.start()
p1.join()
p2.join()
p3.join()
if __name__ == '__main__':
main()
Since the script is not terminating and I only get the print output of one function I guess this is not the way to do it. I think sharing a queue implies some things to consider? It works fine when using only one consumer function.
Appreciate the help!
If the values you are storing can be represented by one of the fundamental data types defined in the ctypes module, then the following could work. Here we are implementing a "queue" that can hold int values or None:
from multiprocessing import Process, Condition
import ctypes
from multiprocessing.sharedctypes import RawArray, RawValue
from threading import local
import time
my_local = local()
my_local.current = 0
class StructuredInt(ctypes.Structure):
"""
This class is necessary because we want to be able to store in the RawArray
either an int or None, which requires using ctypes.c_void_p as the array type.
But, infortunately, ctypes.c_void_p(0) is interpreted as None.
So we need a way to represent 0. Field value 'value' is the
actual int value being stored and we use an arbitrarty 'ptr'
field value that will not be interpreted as None.
To store a None value, we set 'ptr' to ctypes.c_void_p(None) and field
'value' is irrelevant.
To store an integer. we set 'ptr' to ctypes.c_void_p(1) and field
'value' has the actual value.
"""
_fields_ = [('ptr', ctypes.c_void_p), ('value', ctypes.c_int)]
class MultiIntQueue:
"""
An integer queue that can be processed by multiple threads where each thread
can retrieve all the values added to the queue.
:param maxsize: The maximum queue capacity (defaults to 20 if specified as None)
:type maxsize: int
"""
def __init__(self, maxsize=None):
if maxsize is None:
maxsize = 20
self.maxsize = maxsize
self.q = RawArray(StructuredInt, maxsize)
self.condition = Condition()
self.size = RawValue(ctypes.c_int, 0)
def get(self):
with self.condition:
while my_local.current >= self.size.value:
self.condition.wait()
i = self.q[my_local.current]
my_local.current += 1
return None if i.ptr is None else i.value
def put(self, i):
assert 0 <= self.size.value < self.maxsize
with self.condition:
self.q[self.size.value] = (ctypes.c_void_p(None), 0) if i is None else (ctypes.c_void_p(1), i)
self.size.value += 1
self.condition.notify_all()
def producer(q):
for i in range(10):
q.put(i)
time.sleep(.3) # simulate processing
q.put(None)
def consumer1(q):
while True:
data = q.get()
if data is None:
break
time.sleep(.1) # simulate processing
print('Consumer 1:', data)
def consumer2(q):
while True:
data = q.get()
if data is None:
break
time.sleep(.1) # simulate processing
print('Consumer 2:', data)
def main():
q = MultiIntQueue()
p1 = Process(target=producer, args=(q,))
p2 = Process(target=consumer1, args=(q,))
p3 = Process(target=consumer2, args=(q,))
p1.start()
p2.start()
p3.start()
p1.join()
p2.join()
p3.join()
if __name__ == '__main__':
main()
Prints:
Consumer 1: 0
Consumer 2: 0
Consumer 2: 1
Consumer 1: 1
Consumer 2: 2
Consumer 1: 2
Consumer 2: 3
Consumer 1: 3
Consumer 2: 4
Consumer 1: 4
Consumer 1: 5
Consumer 2: 5
Consumer 1: 6
Consumer 2: 6
Consumer 1: 7
Consumer 2: 7
Consumer 2: 8
Consumer 1: 8
Consumer 1: 9
Consumer 2: 9
Your question exemplifies the misunderstanding
"all consumer functions should get the same element"
That's just not how queues work. Queues are automatically managed (there's quite a lot under the hood) such if one item is put in, only one item can be taken out. That item is not duplicated to all consumers. It seems like you actually need two separate queues to guarantee that each consumer gets each input without competing against the other consumer:
from multiprocessing import Process, Queue
def producer(q1, q2):
for i in range(10):
q1.put(i)
q2.put(i)
q1.put(None)
q2.put(None)
def consumer1(q):
while True:
data = q.get()
if data is None:
break
print(data)
def consumer2(q):
while True:
data = q.get()
if data is None:
break
print(data)
def main():
q1 = Queue()
q2 = Queue()
p1 = Process(target=producer, args=(q1, q2))
p2 = Process(target=consumer1, args=(q1,))
p3 = Process(target=consumer2, args=(q2,))
p1.start()
p2.start()
p3.start()
p1.join()
p2.join()
p3.join()
if __name__ == '__main__':
main()

How multiprocess share a common queue?

I want to start 4 process which put an integer in queue when counter is divisible by 100.Same time another process continuously read it and print it.Please correct my code to run...I am getting an error ['Queue' object is not iterable]
from multiprocessing import Lock, Process, Queue, current_process
import time
import queue
def doFirstjob(process_Queue):
i=0
while True:
if i%100==0:
process_Queue.put(i)
else:
i+=1
def doSecondjob(process_Queue):
while(1):
if not process_Queue.Empty:
task = process_Queue.get()
print("task: ",task)
else:
time.sleep(0.2)
def main():
number_of_processes = 4
process_Queue = Queue()
processes = []
process_Queue.put(1)
q = Process(target=doSecondjob, args=(process_Queue))
q.start()
for w in range(number_of_processes):
p = Process(target=doFirstjob, args=(process_Queue))
processes.append(p)
p.start()
if __name__ == '__main__':
main()
You were getting error because Process was expecting a list/tuple in arguments/args.
Also instead of Empty it should be empty.
change the code to below.
from multiprocessing import Lock, Process, Queue, current_process
import time
import queue
def doFirstjob(process_Queue):
i=0
while True:
print("foo")
if i%100==0:
process_Queue.put(i)
else:
i+=1
def doSecondjob(process_Queue):
while(1):
print("bar")
if not process_Queue.empty:
task = process_Queue.get()
print("task: ",task)
else:
time.sleep(0.2)
def main():
number_of_processes = 4
process_Queue = Queue()
processes = []
process_Queue.put(1)
q = Process(target=doSecondjob, args=(process_Queue,))
q.start()
for w in range(number_of_processes):
p = Process(target=doFirstjob, args=(process_Queue,))
processes.append(p)
p.start()
if __name__ == '__main__':
main()

Cannot obtain values while parallelizing 2 for loops

I am trying to run the following snippet which appends data to lists 'tests1' and 'tests2'. But when I print 'tests1' and 'tests2', the displayed list is empty. Anything incorrect here?
tests1 = []
tests2 = []
def func1():
for i in range(25,26):
tests1.append(test_loader.get_tests(test_prefix=new_paths[i],tags=params.get('tags', None),
exclude=params.get('exclude', False)))
def func2():
for i in range(26,27):
tests2.append(test_loader.get_tests(test_prefix=new_paths[i],tags=params.get('tags', None),
exclude=params.get('exclude', False)))
p1 = mp.Process(target=func1)
p2 = mp.Process(target=func2)
p1.start()
p2.start()
p1.join()
p2.join()
print tests1
print tests2
The worker processes don't actually share the same object. It gets copied (pickled).
You can send values between processes using a multiprocessing.Queue (or by various other means). See my simple example (in which I've made your tests into integers for simplicity).
from multiprocessing import Process, Queue
def add_tests1(queue):
for i in range(10):
queue.put(i)
queue.put(None)
def add_tests2(queue):
for i in range(100,110):
queue.put(i)
queue.put(None)
def run_tests(queue):
while True:
test = queue.get()
if test is None:
break
print test
if __name__ == '__main__':
queue1 = Queue()
queue2 = Queue()
add_1 = Process(target = add_tests1, args = (queue1,))
add_2 = Process(target = add_tests2, args = (queue2,))
run_1 = Process(target = run_tests, args = (queue1,))
run_2 = Process(target = run_tests, args = (queue2,))
add_1.start(); add_2.start(); run_1.start(); run_2.start()
add_1.join(); add_2.join(); run_1.join(); run_2.join()
Note that the parent program can also access the queues.

return value from spawned multiprocessing.process [duplicate]

In the example code below, I'd like to get the return value of the function worker. How can I go about doing this? Where is this value stored?
Example Code:
import multiprocessing
def worker(procnum):
'''worker function'''
print str(procnum) + ' represent!'
return procnum
if __name__ == '__main__':
jobs = []
for i in range(5):
p = multiprocessing.Process(target=worker, args=(i,))
jobs.append(p)
p.start()
for proc in jobs:
proc.join()
print jobs
Output:
0 represent!
1 represent!
2 represent!
3 represent!
4 represent!
[<Process(Process-1, stopped)>, <Process(Process-2, stopped)>, <Process(Process-3, stopped)>, <Process(Process-4, stopped)>, <Process(Process-5, stopped)>]
I can't seem to find the relevant attribute in the objects stored in jobs.
Use shared variable to communicate. For example like this:
import multiprocessing
def worker(procnum, return_dict):
"""worker function"""
print(str(procnum) + " represent!")
return_dict[procnum] = procnum
if __name__ == "__main__":
manager = multiprocessing.Manager()
return_dict = manager.dict()
jobs = []
for i in range(5):
p = multiprocessing.Process(target=worker, args=(i, return_dict))
jobs.append(p)
p.start()
for proc in jobs:
proc.join()
print(return_dict.values())
I think the approach suggested by #sega_sai is the better one. But it really needs a code example, so here goes:
import multiprocessing
from os import getpid
def worker(procnum):
print('I am number %d in process %d' % (procnum, getpid()))
return getpid()
if __name__ == '__main__':
pool = multiprocessing.Pool(processes = 3)
print(pool.map(worker, range(5)))
Which will print the return values:
I am number 0 in process 19139
I am number 1 in process 19138
I am number 2 in process 19140
I am number 3 in process 19139
I am number 4 in process 19140
[19139, 19138, 19140, 19139, 19140]
If you are familiar with map (the Python 2 built-in) this should not be too challenging. Otherwise have a look at sega_Sai's link.
Note how little code is needed. (Also note how processes are re-used).
For anyone else who is seeking how to get a value from a Process using Queue:
import multiprocessing
ret = {'foo': False}
def worker(queue):
ret = queue.get()
ret['foo'] = True
queue.put(ret)
if __name__ == '__main__':
queue = multiprocessing.Queue()
queue.put(ret)
p = multiprocessing.Process(target=worker, args=(queue,))
p.start()
p.join()
print(queue.get()) # Prints {"foo": True}
Note that in Windows or Jupyter Notebook, with multithreading you have to save this as a file and execute the file. If you do it in a command prompt you will see an error like this:
AttributeError: Can't get attribute 'worker' on <module '__main__' (built-in)>
For some reason, I couldn't find a general example of how to do this with Queue anywhere (even Python's doc examples don't spawn multiple processes), so here's what I got working after like 10 tries:
from multiprocessing import Process, Queue
def add_helper(queue, arg1, arg2): # the func called in child processes
ret = arg1 + arg2
queue.put(ret)
def multi_add(): # spawns child processes
q = Queue()
processes = []
rets = []
for _ in range(0, 100):
p = Process(target=add_helper, args=(q, 1, 2))
processes.append(p)
p.start()
for p in processes:
ret = q.get() # will block
rets.append(ret)
for p in processes:
p.join()
return rets
Queue is a blocking, thread-safe queue that you can use to store the return values from the child processes. So you have to pass the queue to each process. Something less obvious here is that you have to get() from the queue before you join the Processes or else the queue fills up and blocks everything.
Update for those who are object-oriented (tested in Python 3.4):
from multiprocessing import Process, Queue
class Multiprocessor():
def __init__(self):
self.processes = []
self.queue = Queue()
#staticmethod
def _wrapper(func, queue, args, kwargs):
ret = func(*args, **kwargs)
queue.put(ret)
def run(self, func, *args, **kwargs):
args2 = [func, self.queue, args, kwargs]
p = Process(target=self._wrapper, args=args2)
self.processes.append(p)
p.start()
def wait(self):
rets = []
for p in self.processes:
ret = self.queue.get()
rets.append(ret)
for p in self.processes:
p.join()
return rets
# tester
if __name__ == "__main__":
mp = Multiprocessor()
num_proc = 64
for _ in range(num_proc): # queue up multiple tasks running `sum`
mp.run(sum, [1, 2, 3, 4, 5])
ret = mp.wait() # get all results
print(ret)
assert len(ret) == num_proc and all(r == 15 for r in ret)
This example shows how to use a list of multiprocessing.Pipe instances to return strings from an arbitrary number of processes:
import multiprocessing
def worker(procnum, send_end):
'''worker function'''
result = str(procnum) + ' represent!'
print result
send_end.send(result)
def main():
jobs = []
pipe_list = []
for i in range(5):
recv_end, send_end = multiprocessing.Pipe(False)
p = multiprocessing.Process(target=worker, args=(i, send_end))
jobs.append(p)
pipe_list.append(recv_end)
p.start()
for proc in jobs:
proc.join()
result_list = [x.recv() for x in pipe_list]
print result_list
if __name__ == '__main__':
main()
Output:
0 represent!
1 represent!
2 represent!
3 represent!
4 represent!
['0 represent!', '1 represent!', '2 represent!', '3 represent!', '4 represent!']
This solution uses fewer resources than a multiprocessing.Queue which uses
a Pipe
at least one Lock
a buffer
a thread
or a multiprocessing.SimpleQueue which uses
a Pipe
at least one Lock
It is very instructive to look at the source for each of these types.
It seems that you should use the multiprocessing.Pool class instead and use the methods .apply() .apply_async(), map()
http://docs.python.org/library/multiprocessing.html?highlight=pool#multiprocessing.pool.AsyncResult
You can use the exit built-in to set the exit code of a process. It can be obtained from the exitcode attribute of the process:
import multiprocessing
def worker(procnum):
print str(procnum) + ' represent!'
exit(procnum)
if __name__ == '__main__':
jobs = []
for i in range(5):
p = multiprocessing.Process(target=worker, args=(i,))
jobs.append(p)
p.start()
result = []
for proc in jobs:
proc.join()
result.append(proc.exitcode)
print result
Output:
0 represent!
1 represent!
2 represent!
3 represent!
4 represent!
[0, 1, 2, 3, 4]
The pebble package has a nice abstraction leveraging multiprocessing.Pipe which makes this quite straightforward:
from pebble import concurrent
#concurrent.process
def function(arg, kwarg=0):
return arg + kwarg
future = function(1, kwarg=1)
print(future.result())
Example from: https://pythonhosted.org/Pebble/#concurrent-decorators
Thought I'd simplify the simplest examples copied from above, working for me on Py3.6. Simplest is multiprocessing.Pool:
import multiprocessing
import time
def worker(x):
time.sleep(1)
return x
pool = multiprocessing.Pool()
print(pool.map(worker, range(10)))
You can set the number of processes in the pool with, e.g., Pool(processes=5). However it defaults to CPU count, so leave it blank for CPU-bound tasks. (I/O-bound tasks often suit threads anyway, as the threads are mostly waiting so can share a CPU core.) Pool also applies chunking optimization.
(Note that the worker method cannot be nested within a method. I initially defined my worker method inside the method that makes the call to pool.map, to keep it all self-contained, but then the processes couldn't import it, and threw "AttributeError: Can't pickle local object outer_method..inner_method". More here. It can be inside a class.)
(Appreciate the original question specified printing 'represent!' rather than time.sleep(), but without it I thought some code was running concurrently when it wasn't.)
Py3's ProcessPoolExecutor is also two lines (.map returns a generator so you need the list()):
from concurrent.futures import ProcessPoolExecutor
with ProcessPoolExecutor() as executor:
print(list(executor.map(worker, range(10))))
With plain Processes:
import multiprocessing
import time
def worker(x, queue):
time.sleep(1)
queue.put(x)
queue = multiprocessing.SimpleQueue()
tasks = range(10)
for task in tasks:
multiprocessing.Process(target=worker, args=(task, queue,)).start()
for _ in tasks:
print(queue.get())
Use SimpleQueue if all you need is put and get. The first loop starts all the processes, before the second makes the blocking queue.get calls. I don't think there's any reason to call p.join() too.
If you are using Python 3, you can use concurrent.futures.ProcessPoolExecutor as a convenient abstraction:
from concurrent.futures import ProcessPoolExecutor
def worker(procnum):
'''worker function'''
print(str(procnum) + ' represent!')
return procnum
if __name__ == '__main__':
with ProcessPoolExecutor() as executor:
print(list(executor.map(worker, range(5))))
Output:
0 represent!
1 represent!
2 represent!
3 represent!
4 represent!
[0, 1, 2, 3, 4]
A simple solution:
import multiprocessing
output=[]
data = range(0,10)
def f(x):
return x**2
def handler():
p = multiprocessing.Pool(64)
r=p.map(f, data)
return r
if __name__ == '__main__':
output.append(handler())
print(output[0])
Output:
[0, 1, 4, 9, 16, 25, 36, 49, 64, 81]
You can use ProcessPoolExecutor to get a return value from a function as shown below:
from concurrent.futures import ProcessPoolExecutor
def test(num1, num2):
return num1 + num2
with ProcessPoolExecutor() as executor:
feature = executor.submit(test, 2, 3)
print(feature.result()) # 5
I modified vartec's answer a bit since I needed to get the error codes from the function. (Thanks vertec!!! its an awesome trick)
This can also be done with a manager.list but I think is better to have it in a dict and store a list within it. That way, way we keep the function and the results since we can't be sure of the order in which the list will be populated.
from multiprocessing import Process
import time
import datetime
import multiprocessing
def func1(fn, m_list):
print 'func1: starting'
time.sleep(1)
m_list[fn] = "this is the first function"
print 'func1: finishing'
# return "func1" # no need for return since Multiprocess doesnt return it =(
def func2(fn, m_list):
print 'func2: starting'
time.sleep(3)
m_list[fn] = "this is function 2"
print 'func2: finishing'
# return "func2"
def func3(fn, m_list):
print 'func3: starting'
time.sleep(9)
# if fail wont join the rest because it never populate the dict
# or do a try/except to get something in return.
raise ValueError("failed here")
# if we want to get the error in the manager dict we can catch the error
try:
raise ValueError("failed here")
m_list[fn] = "this is third"
except:
m_list[fn] = "this is third and it fail horrible"
# print 'func3: finishing'
# return "func3"
def runInParallel(*fns): # * is to accept any input in list
start_time = datetime.datetime.now()
proc = []
manager = multiprocessing.Manager()
m_list = manager.dict()
for fn in fns:
# print fn
# print dir(fn)
p = Process(target=fn, name=fn.func_name, args=(fn, m_list))
p.start()
proc.append(p)
for p in proc:
p.join() # 5 is the time out
print datetime.datetime.now() - start_time
return m_list, proc
if __name__ == '__main__':
manager, proc = runInParallel(func1, func2, func3)
# print dir(proc[0])
# print proc[0]._name
# print proc[0].name
# print proc[0].exitcode
# here you can check what did fail
for i in proc:
print i.name, i.exitcode # name was set up in the Process line 53
# here will only show the function that worked and where able to populate the
# manager dict
for i, j in manager.items():
print dir(i) # things you can do to the function
print i, j

How to get the return value of a function passed to multiprocessing.Process?

In the example code below, I'd like to get the return value of the function worker. How can I go about doing this? Where is this value stored?
Example Code:
import multiprocessing
def worker(procnum):
'''worker function'''
print str(procnum) + ' represent!'
return procnum
if __name__ == '__main__':
jobs = []
for i in range(5):
p = multiprocessing.Process(target=worker, args=(i,))
jobs.append(p)
p.start()
for proc in jobs:
proc.join()
print jobs
Output:
0 represent!
1 represent!
2 represent!
3 represent!
4 represent!
[<Process(Process-1, stopped)>, <Process(Process-2, stopped)>, <Process(Process-3, stopped)>, <Process(Process-4, stopped)>, <Process(Process-5, stopped)>]
I can't seem to find the relevant attribute in the objects stored in jobs.
Use shared variable to communicate. For example like this:
import multiprocessing
def worker(procnum, return_dict):
"""worker function"""
print(str(procnum) + " represent!")
return_dict[procnum] = procnum
if __name__ == "__main__":
manager = multiprocessing.Manager()
return_dict = manager.dict()
jobs = []
for i in range(5):
p = multiprocessing.Process(target=worker, args=(i, return_dict))
jobs.append(p)
p.start()
for proc in jobs:
proc.join()
print(return_dict.values())
I think the approach suggested by #sega_sai is the better one. But it really needs a code example, so here goes:
import multiprocessing
from os import getpid
def worker(procnum):
print('I am number %d in process %d' % (procnum, getpid()))
return getpid()
if __name__ == '__main__':
pool = multiprocessing.Pool(processes = 3)
print(pool.map(worker, range(5)))
Which will print the return values:
I am number 0 in process 19139
I am number 1 in process 19138
I am number 2 in process 19140
I am number 3 in process 19139
I am number 4 in process 19140
[19139, 19138, 19140, 19139, 19140]
If you are familiar with map (the Python 2 built-in) this should not be too challenging. Otherwise have a look at sega_Sai's link.
Note how little code is needed. (Also note how processes are re-used).
For anyone else who is seeking how to get a value from a Process using Queue:
import multiprocessing
ret = {'foo': False}
def worker(queue):
ret = queue.get()
ret['foo'] = True
queue.put(ret)
if __name__ == '__main__':
queue = multiprocessing.Queue()
queue.put(ret)
p = multiprocessing.Process(target=worker, args=(queue,))
p.start()
p.join()
print(queue.get()) # Prints {"foo": True}
Note that in Windows or Jupyter Notebook, with multithreading you have to save this as a file and execute the file. If you do it in a command prompt you will see an error like this:
AttributeError: Can't get attribute 'worker' on <module '__main__' (built-in)>
For some reason, I couldn't find a general example of how to do this with Queue anywhere (even Python's doc examples don't spawn multiple processes), so here's what I got working after like 10 tries:
from multiprocessing import Process, Queue
def add_helper(queue, arg1, arg2): # the func called in child processes
ret = arg1 + arg2
queue.put(ret)
def multi_add(): # spawns child processes
q = Queue()
processes = []
rets = []
for _ in range(0, 100):
p = Process(target=add_helper, args=(q, 1, 2))
processes.append(p)
p.start()
for p in processes:
ret = q.get() # will block
rets.append(ret)
for p in processes:
p.join()
return rets
Queue is a blocking, thread-safe queue that you can use to store the return values from the child processes. So you have to pass the queue to each process. Something less obvious here is that you have to get() from the queue before you join the Processes or else the queue fills up and blocks everything.
Update for those who are object-oriented (tested in Python 3.4):
from multiprocessing import Process, Queue
class Multiprocessor():
def __init__(self):
self.processes = []
self.queue = Queue()
#staticmethod
def _wrapper(func, queue, args, kwargs):
ret = func(*args, **kwargs)
queue.put(ret)
def run(self, func, *args, **kwargs):
args2 = [func, self.queue, args, kwargs]
p = Process(target=self._wrapper, args=args2)
self.processes.append(p)
p.start()
def wait(self):
rets = []
for p in self.processes:
ret = self.queue.get()
rets.append(ret)
for p in self.processes:
p.join()
return rets
# tester
if __name__ == "__main__":
mp = Multiprocessor()
num_proc = 64
for _ in range(num_proc): # queue up multiple tasks running `sum`
mp.run(sum, [1, 2, 3, 4, 5])
ret = mp.wait() # get all results
print(ret)
assert len(ret) == num_proc and all(r == 15 for r in ret)
This example shows how to use a list of multiprocessing.Pipe instances to return strings from an arbitrary number of processes:
import multiprocessing
def worker(procnum, send_end):
'''worker function'''
result = str(procnum) + ' represent!'
print result
send_end.send(result)
def main():
jobs = []
pipe_list = []
for i in range(5):
recv_end, send_end = multiprocessing.Pipe(False)
p = multiprocessing.Process(target=worker, args=(i, send_end))
jobs.append(p)
pipe_list.append(recv_end)
p.start()
for proc in jobs:
proc.join()
result_list = [x.recv() for x in pipe_list]
print result_list
if __name__ == '__main__':
main()
Output:
0 represent!
1 represent!
2 represent!
3 represent!
4 represent!
['0 represent!', '1 represent!', '2 represent!', '3 represent!', '4 represent!']
This solution uses fewer resources than a multiprocessing.Queue which uses
a Pipe
at least one Lock
a buffer
a thread
or a multiprocessing.SimpleQueue which uses
a Pipe
at least one Lock
It is very instructive to look at the source for each of these types.
It seems that you should use the multiprocessing.Pool class instead and use the methods .apply() .apply_async(), map()
http://docs.python.org/library/multiprocessing.html?highlight=pool#multiprocessing.pool.AsyncResult
You can use the exit built-in to set the exit code of a process. It can be obtained from the exitcode attribute of the process:
import multiprocessing
def worker(procnum):
print str(procnum) + ' represent!'
exit(procnum)
if __name__ == '__main__':
jobs = []
for i in range(5):
p = multiprocessing.Process(target=worker, args=(i,))
jobs.append(p)
p.start()
result = []
for proc in jobs:
proc.join()
result.append(proc.exitcode)
print result
Output:
0 represent!
1 represent!
2 represent!
3 represent!
4 represent!
[0, 1, 2, 3, 4]
The pebble package has a nice abstraction leveraging multiprocessing.Pipe which makes this quite straightforward:
from pebble import concurrent
#concurrent.process
def function(arg, kwarg=0):
return arg + kwarg
future = function(1, kwarg=1)
print(future.result())
Example from: https://pythonhosted.org/Pebble/#concurrent-decorators
Thought I'd simplify the simplest examples copied from above, working for me on Py3.6. Simplest is multiprocessing.Pool:
import multiprocessing
import time
def worker(x):
time.sleep(1)
return x
pool = multiprocessing.Pool()
print(pool.map(worker, range(10)))
You can set the number of processes in the pool with, e.g., Pool(processes=5). However it defaults to CPU count, so leave it blank for CPU-bound tasks. (I/O-bound tasks often suit threads anyway, as the threads are mostly waiting so can share a CPU core.) Pool also applies chunking optimization.
(Note that the worker method cannot be nested within a method. I initially defined my worker method inside the method that makes the call to pool.map, to keep it all self-contained, but then the processes couldn't import it, and threw "AttributeError: Can't pickle local object outer_method..inner_method". More here. It can be inside a class.)
(Appreciate the original question specified printing 'represent!' rather than time.sleep(), but without it I thought some code was running concurrently when it wasn't.)
Py3's ProcessPoolExecutor is also two lines (.map returns a generator so you need the list()):
from concurrent.futures import ProcessPoolExecutor
with ProcessPoolExecutor() as executor:
print(list(executor.map(worker, range(10))))
With plain Processes:
import multiprocessing
import time
def worker(x, queue):
time.sleep(1)
queue.put(x)
queue = multiprocessing.SimpleQueue()
tasks = range(10)
for task in tasks:
multiprocessing.Process(target=worker, args=(task, queue,)).start()
for _ in tasks:
print(queue.get())
Use SimpleQueue if all you need is put and get. The first loop starts all the processes, before the second makes the blocking queue.get calls. I don't think there's any reason to call p.join() too.
If you are using Python 3, you can use concurrent.futures.ProcessPoolExecutor as a convenient abstraction:
from concurrent.futures import ProcessPoolExecutor
def worker(procnum):
'''worker function'''
print(str(procnum) + ' represent!')
return procnum
if __name__ == '__main__':
with ProcessPoolExecutor() as executor:
print(list(executor.map(worker, range(5))))
Output:
0 represent!
1 represent!
2 represent!
3 represent!
4 represent!
[0, 1, 2, 3, 4]
A simple solution:
import multiprocessing
output=[]
data = range(0,10)
def f(x):
return x**2
def handler():
p = multiprocessing.Pool(64)
r=p.map(f, data)
return r
if __name__ == '__main__':
output.append(handler())
print(output[0])
Output:
[0, 1, 4, 9, 16, 25, 36, 49, 64, 81]
You can use ProcessPoolExecutor to get a return value from a function as shown below:
from concurrent.futures import ProcessPoolExecutor
def test(num1, num2):
return num1 + num2
with ProcessPoolExecutor() as executor:
feature = executor.submit(test, 2, 3)
print(feature.result()) # 5
I modified vartec's answer a bit since I needed to get the error codes from the function. (Thanks vertec!!! its an awesome trick)
This can also be done with a manager.list but I think is better to have it in a dict and store a list within it. That way, way we keep the function and the results since we can't be sure of the order in which the list will be populated.
from multiprocessing import Process
import time
import datetime
import multiprocessing
def func1(fn, m_list):
print 'func1: starting'
time.sleep(1)
m_list[fn] = "this is the first function"
print 'func1: finishing'
# return "func1" # no need for return since Multiprocess doesnt return it =(
def func2(fn, m_list):
print 'func2: starting'
time.sleep(3)
m_list[fn] = "this is function 2"
print 'func2: finishing'
# return "func2"
def func3(fn, m_list):
print 'func3: starting'
time.sleep(9)
# if fail wont join the rest because it never populate the dict
# or do a try/except to get something in return.
raise ValueError("failed here")
# if we want to get the error in the manager dict we can catch the error
try:
raise ValueError("failed here")
m_list[fn] = "this is third"
except:
m_list[fn] = "this is third and it fail horrible"
# print 'func3: finishing'
# return "func3"
def runInParallel(*fns): # * is to accept any input in list
start_time = datetime.datetime.now()
proc = []
manager = multiprocessing.Manager()
m_list = manager.dict()
for fn in fns:
# print fn
# print dir(fn)
p = Process(target=fn, name=fn.func_name, args=(fn, m_list))
p.start()
proc.append(p)
for p in proc:
p.join() # 5 is the time out
print datetime.datetime.now() - start_time
return m_list, proc
if __name__ == '__main__':
manager, proc = runInParallel(func1, func2, func3)
# print dir(proc[0])
# print proc[0]._name
# print proc[0].name
# print proc[0].exitcode
# here you can check what did fail
for i in proc:
print i.name, i.exitcode # name was set up in the Process line 53
# here will only show the function that worked and where able to populate the
# manager dict
for i, j in manager.items():
print dir(i) # things you can do to the function
print i, j

Categories