Python: extract element from iterator

I have the following code but cannot get the results out of the iterator:
from multiprocess import freeze_support
from pathos.multiprocessing import ProcessPool

if __name__ == "__main__":
    freeze_support()
    pool = ProcessPool(nodes=4)
    results = pool.uimap(pow, [1, 2, 3, 4], [5, 6, 7, 8])
    print("...")
    print(list(results))
The code does not error; it just hangs.

There are a couple of subtleties to getting this to work, but the short version is that imap and uimap return iterators, unlike map in the Python multiprocessing examples. To extract the results you need to iterate over them, for example in a for loop. If the call happens inside a class, you also need the called method to be a @staticmethod.
from multiprocessing import freeze_support
from multiprocessing import Pool

def f(vars):
    return vars[0] ** vars[1]

if __name__ == "__main__":
    freeze_support()
    pool = Pool(4)
    for run in pool.imap(f, [(1, 5), (2, 8), (3, 9)]):
        print(run)
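For the class case mentioned above, here is a minimal sketch, assuming the standard multiprocessing Pool (the Runner class and its method names are made up for illustration, not from the original post):

from multiprocessing import Pool, freeze_support

class Runner:
    @staticmethod
    def f(vars):
        # a @staticmethod can be pickled by reference without dragging
        # the instance along, which is what the worker processes need
        return vars[0] ** vars[1]

    def run(self):
        with Pool(4) as pool:
            # imap returns an iterator, so the results are pulled out here
            for run in pool.imap(Runner.f, [(1, 5), (2, 8), (3, 9)]):
                print(run)

if __name__ == "__main__":
    freeze_support()
    Runner().run()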

Related

How to use `Manager` when using "spawn"

Here is some pseudocode for what I'm doing:
import multiprocessing as mp
from multiprocessing import Manager
from tqdm import tqdm

def loop(arg):
    # do stuff
    # ...
    results.append(result_of_stuff)

if __name__ == '__main__':
    manager = Manager()
    results = manager.list()
    with mp.get_context('spawn').Pool(4) as pool:
        list(tqdm(pool.imap(loop, ls), total=len(ls)))
    # do stuff with `results`
    # ...
So the issue here is that `loop` doesn't know about `results`. I have one working way to do this, and that is by using "fork" instead of "spawn". But I need to use "spawn" for reasons beyond the scope of my question.
So what is the minimal change I need to make for this to work? And I really want to keep tqdm, hence the use of imap.
PS: I'm on Linux
You can use functools.partial to add the extra parameters:
import multiprocessing as mp
import os
from functools import partial
from multiprocessing import Manager
from tqdm import tqdm

def loop(results, arg):
    results.append(len(arg))

def main():
    ctx = mp.get_context("spawn")
    manager = Manager()
    l = manager.list()
    partial_loop = partial(loop, l)
    ls = os.listdir("/tmp")
    with ctx.Pool() as pool:
        results = list(tqdm(pool.imap(partial_loop, ls), total=len(ls)))
    print(f"Sum: {sum(l)}")

if __name__ == "__main__":
    main()
There is some overhead with this approach as it will spawn a child process to host the Manager server.
Since you will process the results in the main process anyway, I would do something like this instead (but this depends on your circumstances, of course):
import multiprocessing as mp
import os
from tqdm import tqdm

def loop(arg):
    return len(arg)

def main():
    ctx = mp.get_context("spawn")
    ls = os.listdir("/tmp")
    with ctx.Pool() as pool:
        results = list(tqdm(pool.imap(loop, ls), total=len(ls)))
    print(f"Sum: {sum(results)}")

if __name__ == "__main__":
    main()
I know you have already accepted an answer, but let me add my "two cents":
The other way of solving your issue is by initializing each process in your pool with the global variable `results`, as originally intended. The problem is that when using spawn, newly created processes do not inherit the address space of the main process (which included the definition of `results`); instead, execution starts from the top of the program. But the code that creates `results` never gets executed in the children because of the `if __name__ == '__main__'` check. That is a good thing, because you do not want a separate instance of this list anyway.
So how do we share the same instance of the global variable `results` across all processes? This is accomplished with a pool initializer, as follows. Also, if you want an accurate progress bar, you should really use imap_unordered instead of imap so that the progress bar is updated in task-completion order rather than submission order. For example, if the first task submitted happens to be the last one to complete, using imap would leave the progress bar stuck until all the tasks completed, at which point it would jump to 100% all at once.
But note: the documentation for imap_unordered only states that the results will be returned in arbitrary order, not completion order. It does, however, seem that when a chunksize argument of 1 is used (the default if not explicitly specified), the results are returned in completion order. If you do not want to rely on this, use apply_async with a callback function that updates the progress bar instead. See the last code example.
import multiprocessing as mp
from multiprocessing import Manager
from tqdm import tqdm

def init_pool(the_results):
    global results
    results = the_results

def loop(arg):
    import time
    # do stuff
    # ...
    time.sleep(1)
    results.append(arg ** 2)

if __name__ == '__main__':
    manager = Manager()
    results = manager.list()
    ls = list(range(1, 10))
    with mp.get_context('spawn').Pool(4, initializer=init_pool, initargs=(results,)) as pool:
        list(tqdm(pool.imap_unordered(loop, ls), total=len(ls)))
    print(results)
Update: Another (Better) Way
import multiprocessing as mp
from tqdm import tqdm

def loop(arg):
    import time
    # do stuff
    # ...
    time.sleep(1)
    return arg ** 2

if __name__ == '__main__':
    results = []
    ls = list(range(1, 10))
    with mp.get_context('spawn').Pool(4) as pool:
        with tqdm(total=len(ls)) as pbar:
            for v in pool.imap_unordered(loop, ls):
                results.append(v)
                pbar.update(1)
    print(results)
Update: The Safest Way
import multiprocessing as mp
from tqdm import tqdm

def loop(arg):
    import time
    # do stuff
    # ...
    time.sleep(1)
    return arg ** 2

def my_callback(v):
    results.append(v)
    pbar.update(1)

if __name__ == '__main__':
    results = []
    ls = list(range(1, 10))
    with mp.get_context('spawn').Pool(4) as pool:
        with tqdm(total=len(ls)) as pbar:
            for arg in ls:
                pool.apply_async(loop, args=(arg,), callback=my_callback)
            pool.close()
            pool.join()
    print(results)

Python multiprocessing with dill.load

I'm having great difficulty getting functions to execute in a multiprocessing Pool when they are loaded through
dill.load('somefile.sav','rb')
The code is something like this:
import dill as dill
import multiprocessing_on_dill as mp

dill_func = dill.load('somefile.sav', 'rb')

def some_mp_func(x):
    dill_func(x)

if __name__ == '__main__':
    __spec__ = "ModuleSpec(name='builtins', loader=<class '_frozen_importlib.BuiltinImporter'>)"
    x = "test"
    pool = mp.Pool(processes=8)
    pool.map(some_mp_func, x)
    pool.close()
    pool.join()
dill_func is an sklearn Pipeline.
The output is:
NameError: name 'Y' is not defined
where 'Y' is a function inside dill_func, part of a class within dill_func.
Running some_mp_func(x) without multiprocessing works perfectly fine, with no NameError. Any suggestions?
SOLUTION: when dumping, use this setting:
dill.settings['recurse'] = True
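For context, a minimal sketch of what the dump side might look like with that setting; the file name and the `pipeline` stand-in are placeholders, not from the original post:

import dill

def pipeline(x):
    # stand-in for the fitted sklearn Pipeline from the question
    return x.upper()

# serialize objects referenced by the function (nested/global helpers)
# into the file, rather than storing them by name only
dill.settings['recurse'] = True

with open('somefile.sav', 'wb') as fh:
    dill.dump(pipeline, fh)

# load it back from a file object
with open('somefile.sav', 'rb') as fh:
    dill_func = dill.load(fh)

print(dill_func('test'))  # -> TEST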

Python multiprocessing pool with imported function

Using Windows and Python 3.6, I am having trouble with multiprocessing once I import the function which is to be mapped.
This code works (gives output [2, 12, 30, 56, 90]):
from multiprocessing import Pool

def f(x, y):
    return x * y

if __name__ == '__main__':
    args = [(1, 2), (3, 4), (5, 6), (7, 8), (9, 10)]
    with Pool(4) as p:
        result = p.starmap(f, args)
        print(result)
Now I move the function f to a different .py file called 'test' and instead import it:
from multiprocessing import Pool
from test import f

if __name__ == '__main__':
    args = [(1, 2), (3, 4), (5, 6), (7, 8), (9, 10)]
    with Pool(4) as p:
        result = p.starmap(f, args)
        print(result)
with test.py only containing:
def f(x, y):
    return x * y
Running this leads to an infinite loop (it doesn't return anything, and CPU usage stays high).
What is causing this, and is there a way to fix it? I have successfully got multiprocessing to work by copying all the code into one huge .py file, which obviously is not ideal.

Suppress output in multiprocessing process

I have some code which runs in parallel using the multiprocessing Pool class. Unfortunately, some of the functions I use from another library produce verbose output. To abstract the problem, have a look at the following example:
from multiprocessing import Pool

def f(x):
    print('hello')
    return x * x

p = Pool(10)
p.map(f, [1, 2, 3])
So this will print 'hello' once per input, three times in this example.
Is there a possibility to mute the output of the processes, or to capture their stdout in some variable? Thanks for any suggestion.
EDIT:
I don't want to redirect stdout globally, just for the processes of my pool.
Use the initializer parameter to call mute in the worker processes:
import sys
import os
from multiprocessing import Pool

def mute():
    sys.stdout = open(os.devnull, 'w')

def f(x):
    print('hello')
    return x * x

if __name__ == '__main__':
    p = Pool(10, initializer=mute)
    p.map(f, [1, 2, 3])
    print('Hello')
prints
Hello
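If you would rather capture the output than discard it (the second part of the question), one possible sketch is to redirect stdout to a buffer inside the worker and return the captured text alongside the result. The `noisy` function is a made-up stand-in for the verbose library call:

import io
from contextlib import redirect_stdout
from multiprocessing import Pool

def noisy(x):
    print('hello')  # stand-in for the chatty library function
    return x * x

def f(x):
    buf = io.StringIO()
    with redirect_stdout(buf):      # capture this task's stdout
        result = noisy(x)
    return result, buf.getvalue()   # ship the captured text back to the parent

if __name__ == '__main__':
    with Pool(4) as p:
        for result, captured in p.map(f, [1, 2, 3]):
            print(result, repr(captured))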

python multiprocessing, manager initiates process spawn loop

I have a simple Python multiprocessing script that sets up a pool of workers that attempt to append work output to a Manager list. The script has three call levels: main calls f1, which spawns several worker processes that call another function, g1. When one attempts to debug the script (incidentally on Windows 7/64-bit/VS 2010/PyTools), it runs into a nested process-creation loop, spawning an endless number of processes. Can anyone determine why? I'm sure I am missing something very simple. Here's the problematic code:
import multiprocessing
import logging

manager = multiprocessing.Manager()
results = manager.list()

def g1(x):
    y = x * x
    print "processing: y = %s" % y
    results.append(y)

def f1():
    logger = multiprocessing.log_to_stderr()
    logger.setLevel(multiprocessing.SUBDEBUG)
    pool = multiprocessing.Pool(processes=4)
    for i in range(0, 15):
        pool.apply_async(g1, [i])
    pool.close()
    pool.join()

def main():
    f1()

if __name__ == "__main__":
    main()
PS: tried adding multiprocessing.freeze_support() to main to no avail.
Basically, what sr2222 mentions in his comment is correct. The multiprocessing manager docs say that the __main__ module must be importable by the children. Each manager "object corresponds to a spawned child process", so each child is basically re-importing your module (you can see this by adding a print statement at module scope to my fixed version!)... which leads to infinite recursion.
One solution would be to move your manager code into f1():
import multiprocessing
import logging

def g1(results, x):
    y = x * x
    print "processing: y = %s" % y
    results.append(y)

def f1():
    logger = multiprocessing.log_to_stderr()
    logger.setLevel(multiprocessing.SUBDEBUG)
    manager = multiprocessing.Manager()
    results = manager.list()
    pool = multiprocessing.Pool(processes=4)
    for i in range(0, 15):
        pool.apply_async(g1, [results, i])
    pool.close()
    pool.join()

def main():
    f1()

if __name__ == "__main__":
    main()
