Loop Not Appending Value to Array in Python - python

When running this code it just prints out a blank array at the end:
[]
So why is it not appending either the value a or the value b?
import multiprocessing as mu

array_values = []

def a(array):
    array.append('a')

def b(array):
    array.append('b')

def runInParallel(*fns):
    z = 0
    while z < 6:
        if __name__ == '__main__':
            proc = []
            for fn in fns:
                p = mu.Process(target=fn, args=(array_values,))
                p.start()
                proc.append(p)
            for p in proc:
                p.join()
            z += 1

runInParallel(a, b)
print(array_values)
DESIRED FINAL OUTPUT OF FUNCTION:
['a','b','a','b','a','b','a','b','a','b','a','b']
Thanks in advance!

The reason it doesn't work is that multiprocessing doesn't use shared memory: each child process gets its own copy of the list, so the appends never reach the parent's array_values.
You can use the following code to get your desired output (it uses threading, and threads share the same memory):
import threading

array_values = []

def a(array):
    array.append('a')

def b(array):
    array.append('b')

def runInParallel(*fns):
    z = 0
    while z < 6:
        if __name__ == '__main__':
            proc = []
            for fn in fns:
                p = threading.Thread(target=fn, args=(array_values,))
                p.start()
                proc.append(p)
            for p in proc:
                p.join()
            z += 1

runInParallel(a, b)
print(array_values)
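If you'd rather stay with multiprocessing, a multiprocessing.Manager list can be shared between processes. Here is a minimal sketch of that idea (my illustration, not from the original answer; note the order of 'a' and 'b' within each round is not guaranteed):

import multiprocessing as mu

def a(array):
    array.append('a')

def b(array):
    array.append('b')

def runInParallel(shared, *fns):
    for _ in range(6):
        proc = []
        for fn in fns:
            p = mu.Process(target=fn, args=(shared,))
            p.start()
            proc.append(p)
        for p in proc:
            p.join()

if __name__ == '__main__':
    manager = mu.Manager()
    array_values = manager.list()   # proxy list shared between processes
    runInParallel(array_values, a, b)
    print(list(array_values))       # 12 elements, e.g. ['a', 'b', 'a', 'b', ...]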

Related

Python multiprocessing share large data through Queue?

I would like to put output data into a queue in a multiprocessing computation. It seems that when the size of the return value is too large, the program gets stuck. To illustrate the problem, here is a minimal code example. Can anyone help to make this work?
from multiprocessing import Process, Queue
import numpy as np

def foo(q, qid):
    x = np.random.randint(0, 5, 7)
    y = np.random.random(100*10*10).reshape(100, 10, 10)
    q.put([qid, x, y])

def main():
    processes = []
    q = Queue()
    for qid in range(5):
        p = Process(target=foo, args=(q, qid))
        p.start()
        processes.append(p)
    for process in processes:
        process.join()
    for qid in range(5):
        [_, x, y] = q.get()
        print(x)
        print(y)

if __name__ == '__main__':
    main()
I figured out one solution, shown below: switch the join and the get. By default, the get method blocks. (The hang happens because a child process will not exit until everything it has put on the queue has been flushed to the underlying pipe, so joining before getting can deadlock when the payload is large.)
from multiprocessing import Process, Queue
import numpy as np

def foo(q, qid):
    x = np.random.randint(0, 5, 7)
    y = np.random.random(100*10*10).reshape(100, 10, 10)
    q.put([qid, x, y])

def main():
    processes = []
    q = Queue()
    for qid in range(5):
        p = Process(target=foo, args=(q, qid))
        p.start()
        processes.append(p)
    for qid in range(5):
        [_, x, y] = q.get()
        print(x)
        print(y)
    for process in processes:
        process.join()

if __name__ == '__main__':
    main()
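As a hedged alternative (not from the original post), multiprocessing.Pool collects the results for you and sidesteps the join/get ordering problem entirely:

from multiprocessing import Pool
import numpy as np

def foo(qid):
    x = np.random.randint(0, 5, 7)
    y = np.random.random(100*10*10).reshape(100, 10, 10)
    return qid, x, y    # returned values are shipped back to the parent

if __name__ == '__main__':
    with Pool(5) as pool:
        for qid, x, y in pool.map(foo, range(5)):
            print(x)
            print(y)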

How to append values to the same array from two different functions?

Current Code
import multiprocessing as mu
import time

global_array = []

def add_array1(array):
    while True:
        time.sleep(2.5)
        global_array.append(1)
        print(global_array)

def add_array2(array):
    while True:
        time.sleep(3)
        global_array.append(2)
        print(global_array)

def runInParallel(*fns):
    if __name__ == '__main__':
        proc = []
        for fn in fns:
            p = mu.Process(target=fn)
            p.start()
            proc.append(p)
        for p in proc:
            p.join()

runInParallel(
    add_array1(global_array),
    add_array2(global_array)
)
When running my code above, only the first function add_array1() appends its value to the array and prints it, so instead of both functions contributing I get the wrong output:
[1]
[1,1]
[1,1,1]
The actual desired output is:
[1]
[1,2]
[1,2,1]
[1,2,1,2]
Your problem is that the function call
runInParallel( add_array1(global_array), add_array2(global_array))
executes the functions and passes their return values as parameters to runInParallel. As add_array1 is an endless loop, it never returns from the execution. You need to provide the functions themselves - not their return values - as parameters to runInParallel(...).
Start with
runInParallel(add_array1, add_array2)  # names of the functions, don't execute them
and change
def runInParallel(*fns):
    proc = []
    for fn in fns:
        p = mu.Process(target=fn, args=(global_array,))  # provide the parameter here
        p.start()
        proc.append(p)
    for p in proc:
        p.join()
and then fix the "not joining" problem due to your threaded functions never returning.
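For illustration only (not part of the original answer), here is a hedged sketch where the loops are bounded and a multiprocessing.Manager list is shared, so the processes actually terminate and join() returns with both values in the list:

import multiprocessing as mu
import time

def add_array1(array):
    for _ in range(3):          # bounded loop so the process can finish
        time.sleep(2.5)
        array.append(1)
        print(list(array))

def add_array2(array):
    for _ in range(3):
        time.sleep(3)
        array.append(2)
        print(list(array))

def runInParallel(shared, *fns):
    proc = []
    for fn in fns:
        p = mu.Process(target=fn, args=(shared,))
        p.start()
        proc.append(p)
    for p in proc:
        p.join()

if __name__ == '__main__':
    manager = mu.Manager()
    global_array = manager.list()   # shared between the processes
    runInParallel(global_array, add_array1, add_array2)
    print(list(global_array))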
Example from the official documentation of multiprocessing.Process:
from multiprocessing import Process
import os

def info(title):
    print(title)
    print('module name:', __name__)
    print('parent process:', os.getppid())
    print('process id:', os.getpid())

# Function name is f
def f(name):
    info('function f')
    print('hello', name)

if __name__ == '__main__':
    info('main line')
    # f is provided, and args is provided - not f("bob")
    p = Process(target=f, args=('bob',))
    p.start()
    p.join()
You can use the code below to get your desired output:
import threading
import time

global_array = []

def add_array1():
    while True:
        time.sleep(2.5)
        global_array.append(1)
        print(global_array)

def add_array2():
    while True:
        time.sleep(3)
        global_array.append(2)
        print(global_array)

def runInParallel(*fns):
    if __name__ == '__main__':
        proc = []
        for fn in fns:
            p = threading.Thread(target=fn)
            p.start()
            proc.append(p)
        for p in proc:
            p.join()

if __name__ == '__main__':
    runInParallel(
        add_array1,
        add_array2
    )
You can use this simple code to get your desired output:
import time

global_array = []

def add_array1(array):
    while True:
        time.sleep(2.5)
        if len(global_array) % 2 == 0:
            global_array.append(1)
            print(global_array)
        else:
            global_array.append(2)
            print(global_array)

add_array1(global_array)

How to run functions with arguments in parallel?

Thanks to How to run functions in parallel? the following code works.
import time
from multiprocessing import Process

def worker():
    time.sleep(2)
    print("Working")

def runInParallel(*fns):
    proc = []
    for fn in fns:
        p = Process(target=fn)
        p.start()
        proc.append(p)
    for p in proc:
        p.join()

if __name__ == '__main__':
    start = time.time()
    runInParallel(worker, worker, worker, worker)
    print("Total time taken: ", time.time()-start)
However, if I add an argument to worker(), it does not run in parallel anymore.
import time
from multiprocessing import Process

def worker(ii):
    time.sleep(ii)
    print("Working")

def runInParallel(*fns):
    proc = []
    for fn in fns:
        p = Process(target=fn)
        p.start()
        proc.append(p)
    for p in proc:
        p.join()

if __name__ == '__main__':
    start = time.time()
    runInParallel(worker(2), worker(2), worker(2), worker(2))
    print("Total time taken: ", time.time()-start)
What might be the reason for that?
You should modify runInParallel to accept (function, arguments) tuples and unpack them:
import time
from multiprocessing import Process

def worker(ii):
    time.sleep(ii)
    print("Working")

def runInParallel(*fns):
    proc = []
    for fn in fns:
        func, *args = fn
        p = Process(target=func, args=args)
        p.start()
        proc.append(p)
    for p in proc:
        p.join()

if __name__ == '__main__':
    start = time.time()
    runInParallel((worker, 2), (worker, 3), (worker, 5), (worker, 2))
    print("Total time taken: ", time.time()-start)
It's because of the difference between worker and worker(). The first is the function, and the latter is a function call. What is happening on the line runInParallel(worker(2), worker(2), worker(2), worker(2)) is that all four calls are run before the execution of runInParallel has even begun. If you add a print(fns) at the beginning of runInParallel you will see some difference.
Quick fix:
def worker_caller():
    worker(2)
and:
runInParallel(worker_caller, worker_caller, worker_caller, worker_caller)
That's not very convenient but it's mostly intended to show what the problem is. The problem is not in the function worker. The problem is that you're mixing up passing a function and passing a function call. If you changed your first version to:
runInParallel(worker(), worker(), worker(), worker())
then you would run into exactly the same issue.
But you can do this:
runInParallel(lambda:worker(2), lambda: worker(2), lambda: worker(2), lambda: worker(2))
Lambdas are very useful. Here is another version:
a = lambda:worker(2)
b = lambda:worker(4)
c = lambda:worker(3)
d = lambda:worker(1)
runInParallel(a, b, c, d)
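One caveat worth adding (not in the original answer): lambdas cannot be pickled, so on platforms that use the spawn start method (Windows, and macOS on newer Python versions) passing lambdas to Process will fail. functools.partial is a pickle-friendly alternative; a hedged sketch:

import time
from functools import partial
from multiprocessing import Process

def worker(ii):
    time.sleep(ii)
    print("Working")

def runInParallel(*fns):
    proc = []
    for fn in fns:
        p = Process(target=fn)
        p.start()
        proc.append(p)
    for p in proc:
        p.join()

if __name__ == '__main__':
    # partial objects pickle cleanly as long as the wrapped function is module-level
    runInParallel(partial(worker, 2), partial(worker, 4), partial(worker, 3), partial(worker, 1))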
To pass arguments, you need to pass them to the Process constructor:
p = Process(target=fn, args=(arg1,))
The Process constructor accepts args and kwargs parameters, which are then passed to the process when it is executed.
The documentation is quite clear about this.
So your code should be modified to something like this:
def worker(ii):
    time.sleep(ii)
    print("Working")

def runInParallel(*fns):
    proc = []
    for fn in fns:
        p = Process(target=fn, args=(2,))
        p.start()
        proc.append(p)
    for p in proc:
        p.join()

if __name__ == '__main__':
    start = time.time()
    runInParallel(worker, worker, worker, worker)
    print("Total time taken: ", time.time()-start)
Of course the parameters can be different for each process; you need to arrange for the right one to be passed to each in args (or kwargs for keyword parameters).
This can be achieved by passing tuples such as runInParallel((worker, 2), (worker, 3), (worker, 5), (worker, 1)), for example, and then processing the tuples inside runInParallel, as in the sketch below.
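For completeness, a hedged sketch of that tuple-processing variant (my illustration; it mirrors the unpacking already shown in the earlier answer):

import time
from multiprocessing import Process

def worker(ii):
    time.sleep(ii)
    print("Working")

def runInParallel(*fn_args):
    proc = []
    for fn, *args in fn_args:               # each item is (function, arg1, arg2, ...)
        p = Process(target=fn, args=tuple(args))
        p.start()
        proc.append(p)
    for p in proc:
        p.join()

if __name__ == '__main__':
    start = time.time()
    runInParallel((worker, 2), (worker, 3), (worker, 5), (worker, 1))
    print("Total time taken: ", time.time() - start)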

How can you use multiprocessing in Python to divide up iterating through a dictionary?

I have a project to read millions of lines of input and then to run calculations on a very large dictionary. I have implemented Pool/Queue techniques on some of the IO parts of the program earlier. However, is there a way to use multiprocessing to divide up the standard:
for i in xdictionary.iteritems():
Thanks!
You could use imap:
import multiprocessing
import os

def f(key_value):
    print "pid={}, key={}, value={}".format(os.getpid(), *key_value)

pool = multiprocessing.Pool(2)
for _ in pool.imap(f, {1: 2, 3: 4, 5: 6, 7: 8}.iteritems()):
    pass
Out:
pid=1689, key=1, value=2
pid=1689, key=3, value=4
pid=1689, key=5, value=6
pid=1690, key=7, value=8
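Since iteritems() and the print statement are Python 2, a Python 3 sketch of the same idea (my adaptation, not part of the original answer) might look like this:

import multiprocessing
import os

def f(key_value):
    key, value = key_value
    print("pid={}, key={}, value={}".format(os.getpid(), key, value))

if __name__ == '__main__':
    with multiprocessing.Pool(2) as pool:
        for _ in pool.imap(f, {1: 2, 3: 4, 5: 6, 7: 8}.items()):
            pass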
import multiprocessing as mp

def main():
    numProcs = mp.cpu_count()
    qIn, qOut = [mp.Queue() for _ in xrange(2)]
    procs = [mp.Process(target=dowork, args=(qIn, qOut)) for _ in range(numProcs)]

    for p in procs: p.start()

    # xdict is your large dictionary; feed its items to the workers
    for k, v in xdict.iteritems():
        qIn.put((k, v))
    for _ in xrange(numProcs):
        qIn.put(None)           # one sentinel per worker

    done = 0
    while done < numProcs:
        res = qOut.get()
        if res is None:
            done += 1
            continue
        k, v, ans = res
        print "processing", k, v, "gives:", ans

    for p in procs: p.terminate()

def dowork(qIn, qOut):
    # doStuff is your per-item computation
    for k, v in iter(qIn.get, None):
        answer = doStuff(k, v)
        qOut.put((k, v, answer))
    qOut.put(None)

How to get the return value of a function passed to multiprocessing.Process?

In the example code below, I'd like to get the return value of the function worker. How can I go about doing this? Where is this value stored?
Example Code:
import multiprocessing

def worker(procnum):
    '''worker function'''
    print str(procnum) + ' represent!'
    return procnum

if __name__ == '__main__':
    jobs = []
    for i in range(5):
        p = multiprocessing.Process(target=worker, args=(i,))
        jobs.append(p)
        p.start()

    for proc in jobs:
        proc.join()
    print jobs
Output:
0 represent!
1 represent!
2 represent!
3 represent!
4 represent!
[<Process(Process-1, stopped)>, <Process(Process-2, stopped)>, <Process(Process-3, stopped)>, <Process(Process-4, stopped)>, <Process(Process-5, stopped)>]
I can't seem to find the relevant attribute in the objects stored in jobs.
Use a shared variable to communicate. For example, like this:
import multiprocessing

def worker(procnum, return_dict):
    """worker function"""
    print(str(procnum) + " represent!")
    return_dict[procnum] = procnum

if __name__ == "__main__":
    manager = multiprocessing.Manager()
    return_dict = manager.dict()
    jobs = []
    for i in range(5):
        p = multiprocessing.Process(target=worker, args=(i, return_dict))
        jobs.append(p)
        p.start()

    for proc in jobs:
        proc.join()
    print(return_dict.values())
I think the approach suggested by @sega_sai is the better one. But it really needs a code example, so here goes:
import multiprocessing
from os import getpid

def worker(procnum):
    print('I am number %d in process %d' % (procnum, getpid()))
    return getpid()

if __name__ == '__main__':
    pool = multiprocessing.Pool(processes=3)
    print(pool.map(worker, range(5)))
Which will print the return values:
I am number 0 in process 19139
I am number 1 in process 19138
I am number 2 in process 19140
I am number 3 in process 19139
I am number 4 in process 19140
[19139, 19138, 19140, 19139, 19140]
If you are familiar with map (the Python 2 built-in) this should not be too challenging. Otherwise have a look at sega_Sai's link.
Note how little code is needed. (Also note how processes are re-used).
For anyone else who is seeking how to get a value from a Process using Queue:
import multiprocessing

ret = {'foo': False}

def worker(queue):
    ret = queue.get()
    ret['foo'] = True
    queue.put(ret)

if __name__ == '__main__':
    queue = multiprocessing.Queue()
    queue.put(ret)
    p = multiprocessing.Process(target=worker, args=(queue,))
    p.start()
    p.join()
    print(queue.get())  # Prints {"foo": True}
Note that in Windows or Jupyter Notebook, with multiprocessing you have to save this as a file and execute the file. If you run it interactively instead, you will see an error like this:
AttributeError: Can't get attribute 'worker' on <module '__main__' (built-in)>
For some reason, I couldn't find a general example of how to do this with Queue anywhere (even Python's doc examples don't spawn multiple processes), so here's what I got working after like 10 tries:
from multiprocessing import Process, Queue

def add_helper(queue, arg1, arg2):  # the func called in child processes
    ret = arg1 + arg2
    queue.put(ret)

def multi_add():  # spawns child processes
    q = Queue()
    processes = []
    rets = []
    for _ in range(0, 100):
        p = Process(target=add_helper, args=(q, 1, 2))
        processes.append(p)
        p.start()
    for p in processes:
        ret = q.get()  # will block
        rets.append(ret)
    for p in processes:
        p.join()
    return rets
Queue is a blocking, thread-safe queue that you can use to store the return values from the child processes, so you have to pass the queue to each process. Something less obvious here is that you have to get() from the queue before you join the processes; otherwise the children block trying to flush their results into the queue's pipe and join() never returns. A minimal usage sketch follows below.
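A minimal usage sketch (my addition; it assumes the two functions above are saved in the same script):

if __name__ == '__main__':
    results = multi_add()
    print(len(results))   # 100
    print(results[:5])    # [3, 3, 3, 3, 3]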
Update for those who are object-oriented (tested in Python 3.4):
from multiprocessing import Process, Queue

class Multiprocessor():

    def __init__(self):
        self.processes = []
        self.queue = Queue()

    @staticmethod
    def _wrapper(func, queue, args, kwargs):
        ret = func(*args, **kwargs)
        queue.put(ret)

    def run(self, func, *args, **kwargs):
        args2 = [func, self.queue, args, kwargs]
        p = Process(target=self._wrapper, args=args2)
        self.processes.append(p)
        p.start()

    def wait(self):
        rets = []
        for p in self.processes:
            ret = self.queue.get()
            rets.append(ret)
        for p in self.processes:
            p.join()
        return rets

# tester
if __name__ == "__main__":
    mp = Multiprocessor()
    num_proc = 64
    for _ in range(num_proc):  # queue up multiple tasks running `sum`
        mp.run(sum, [1, 2, 3, 4, 5])
    ret = mp.wait()  # get all results
    print(ret)
    assert len(ret) == num_proc and all(r == 15 for r in ret)
This example shows how to use a list of multiprocessing.Pipe instances to return strings from an arbitrary number of processes:
import multiprocessing

def worker(procnum, send_end):
    '''worker function'''
    result = str(procnum) + ' represent!'
    print result
    send_end.send(result)

def main():
    jobs = []
    pipe_list = []
    for i in range(5):
        recv_end, send_end = multiprocessing.Pipe(False)
        p = multiprocessing.Process(target=worker, args=(i, send_end))
        jobs.append(p)
        pipe_list.append(recv_end)
        p.start()

    for proc in jobs:
        proc.join()
    result_list = [x.recv() for x in pipe_list]
    print result_list

if __name__ == '__main__':
    main()
Output:
0 represent!
1 represent!
2 represent!
3 represent!
4 represent!
['0 represent!', '1 represent!', '2 represent!', '3 represent!', '4 represent!']
This solution uses fewer resources than a multiprocessing.Queue, which uses a Pipe, at least one Lock, a buffer, and a thread, or a multiprocessing.SimpleQueue, which uses a Pipe and at least one Lock.
It is very instructive to look at the source for each of these types.
It seems that you should use the multiprocessing.Pool class instead and use the methods .apply(), .apply_async(), and .map():
http://docs.python.org/library/multiprocessing.html?highlight=pool#multiprocessing.pool.AsyncResult
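A hedged sketch of that approach (my illustration, not from the original answer): apply_async returns an AsyncResult whose .get() yields the worker's return value.

import multiprocessing

def worker(procnum):
    print(str(procnum) + ' represent!')
    return procnum

if __name__ == '__main__':
    with multiprocessing.Pool(processes=3) as pool:
        async_results = [pool.apply_async(worker, (i,)) for i in range(5)]
        print([res.get() for res in async_results])   # [0, 1, 2, 3, 4]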
You can use the exit built-in to set the exit code of a process. It can be obtained from the exitcode attribute of the process:
import multiprocessing

def worker(procnum):
    print str(procnum) + ' represent!'
    exit(procnum)

if __name__ == '__main__':
    jobs = []
    for i in range(5):
        p = multiprocessing.Process(target=worker, args=(i,))
        jobs.append(p)
        p.start()
    result = []
    for proc in jobs:
        proc.join()
        result.append(proc.exitcode)
    print result
Output:
0 represent!
1 represent!
2 represent!
3 represent!
4 represent!
[0, 1, 2, 3, 4]
The pebble package has a nice abstraction leveraging multiprocessing.Pipe which makes this quite straightforward:
from pebble import concurrent

@concurrent.process
def function(arg, kwarg=0):
    return arg + kwarg

future = function(1, kwarg=1)
print(future.result())
Example from: https://pythonhosted.org/Pebble/#concurrent-decorators
Thought I'd simplify the simplest examples copied from above, working for me on Py3.6. Simplest is multiprocessing.Pool:
import multiprocessing
import time

def worker(x):
    time.sleep(1)
    return x

pool = multiprocessing.Pool()
print(pool.map(worker, range(10)))
You can set the number of processes in the pool with, e.g., Pool(processes=5). However it defaults to CPU count, so leave it blank for CPU-bound tasks. (I/O-bound tasks often suit threads anyway, as the threads are mostly waiting so can share a CPU core.) Pool also applies chunking optimization.
(Note that the worker method cannot be nested within a method. I initially defined my worker method inside the method that makes the call to pool.map, to keep it all self-contained, but then the processes couldn't import it, and threw "AttributeError: Can't pickle local object outer_method..inner_method". More here. It can be inside a class.)
(Appreciate the original question specified printing 'represent!' rather than time.sleep(), but without it I thought some code was running concurrently when it wasn't.)
Py3's ProcessPoolExecutor is also two lines (.map returns a generator so you need the list()):
from concurrent.futures import ProcessPoolExecutor

with ProcessPoolExecutor() as executor:
    print(list(executor.map(worker, range(10))))
With plain Processes:
import multiprocessing
import time

def worker(x, queue):
    time.sleep(1)
    queue.put(x)

queue = multiprocessing.SimpleQueue()
tasks = range(10)

for task in tasks:
    multiprocessing.Process(target=worker, args=(task, queue,)).start()

for _ in tasks:
    print(queue.get())
Use SimpleQueue if all you need is put and get. The first loop starts all the processes, before the second makes the blocking queue.get calls. I don't think there's any reason to call p.join() too.
If you are using Python 3, you can use concurrent.futures.ProcessPoolExecutor as a convenient abstraction:
from concurrent.futures import ProcessPoolExecutor

def worker(procnum):
    '''worker function'''
    print(str(procnum) + ' represent!')
    return procnum

if __name__ == '__main__':
    with ProcessPoolExecutor() as executor:
        print(list(executor.map(worker, range(5))))
Output:
0 represent!
1 represent!
2 represent!
3 represent!
4 represent!
[0, 1, 2, 3, 4]
A simple solution:
import multiprocessing

output = []
data = range(0, 10)

def f(x):
    return x**2

def handler():
    p = multiprocessing.Pool(64)
    r = p.map(f, data)
    return r

if __name__ == '__main__':
    output.append(handler())
    print(output[0])
Output:
[0, 1, 4, 9, 16, 25, 36, 49, 64, 81]
You can use ProcessPoolExecutor to get a return value from a function as shown below:
from concurrent.futures import ProcessPoolExecutor

def test(num1, num2):
    return num1 + num2

with ProcessPoolExecutor() as executor:
    feature = executor.submit(test, 2, 3)
    print(feature.result())  # 5
I modified vartec's answer a bit since I needed to get the error codes from the function. (Thanks vartec!!! It's an awesome trick.)
This can also be done with a manager.list, but I think it is better to have it in a dict and store a list within it. That way, we keep the function and the results, since we can't be sure of the order in which the list will be populated.
from multiprocessing import Process
import time
import datetime
import multiprocessing

def func1(fn, m_list):
    print 'func1: starting'
    time.sleep(1)
    m_list[fn] = "this is the first function"
    print 'func1: finishing'
    # return "func1"  # no need for return since Multiprocess doesnt return it =(

def func2(fn, m_list):
    print 'func2: starting'
    time.sleep(3)
    m_list[fn] = "this is function 2"
    print 'func2: finishing'
    # return "func2"

def func3(fn, m_list):
    print 'func3: starting'
    time.sleep(9)
    # if fail wont join the rest because it never populate the dict
    # or do a try/except to get something in return.
    raise ValueError("failed here")
    # if we want to get the error in the manager dict we can catch the error
    try:
        raise ValueError("failed here")
        m_list[fn] = "this is third"
    except:
        m_list[fn] = "this is third and it fail horrible"
    # print 'func3: finishing'
    # return "func3"

def runInParallel(*fns):  # * is to accept any input in list
    start_time = datetime.datetime.now()
    proc = []
    manager = multiprocessing.Manager()
    m_list = manager.dict()
    for fn in fns:
        # print fn
        # print dir(fn)
        p = Process(target=fn, name=fn.func_name, args=(fn, m_list))
        p.start()
        proc.append(p)
    for p in proc:
        p.join()  # 5 is the time out
    print datetime.datetime.now() - start_time
    return m_list, proc

if __name__ == '__main__':
    manager, proc = runInParallel(func1, func2, func3)
    # print dir(proc[0])
    # print proc[0]._name
    # print proc[0].name
    # print proc[0].exitcode

    # here you can check what did fail
    for i in proc:
        print i.name, i.exitcode  # name was set up in the Process line 53

    # here will only show the function that worked and where able to populate the
    # manager dict
    for i, j in manager.items():
        print dir(i)  # things you can do to the function
        print i, j
