Good day to you,
Today I was moving code from threading to multiprocessing. Everything seemed okay until I got the following error:
Error
Traceback (most recent call last):
  File "run.py", line 93, in <module>
    main()
  File "run.py", line 82, in main
    emenu.executemenu(components, _path)
  File "/home/s1810979/paellego/lib/execute/execute_menu.py", line 29, in executemenu
    e.executeall(installed, _path)
  File "/home/s1810979/paellego/lib/execute/execute.py", line 153, in executeall
    pool.starmap(phase2, args)
  File "/usr/lib64/python3.4/multiprocessing/pool.py", line 268, in starmap
    return self._map_async(func, iterable, starmapstar, chunksize).get()
  File "/usr/lib64/python3.4/multiprocessing/pool.py", line 608, in get
    raise self._value
  File "/usr/lib64/python3.4/multiprocessing/pool.py", line 385, in _handle_tasks
    put(task)
  File "/usr/lib64/python3.4/multiprocessing/connection.py", line 206, in send
    self._send_bytes(ForkingPickler.dumps(obj))
  File "/usr/lib64/python3.4/multiprocessing/reduction.py", line 50, in dumps
    cls(buf, protocol).dump(obj)
_pickle.PicklingError: Can't pickle <class 'module'>: attribute lookup module on builtins failed
Code
execute.py
def executeall(components, _path):
    args = []
    manager = multiprocessing.Manager()
    q = manager.Queue()
    resultloc = '/some/result.log'
    for component in components:
        for apkpath, resultpath in zip(execonfig.apkpaths, execonfig.resultpaths):
            args.append((component, apkpath, resultpath, q,))  # Args for subprocesses
    cores = askcores()
    with multiprocessing.Pool(processes=cores) as pool:
        watcher = pool.apply_async(lgr.log, (resultloc+'/results.txt', q,))
        pool.starmap(phase2, args)
component.py
class Component(object):
    def __init__(self, installmodule, runmodule, installerloc, installationloc, dependencyloc):
        self.installmodule = installmodule
        self.runmodule = runmodule
        self.installerloc = installerloc
        self.installationloc = installationloc
        self.dependencyloc = dependencyloc
        self.config = icnf.Installconfiguration(installerloc+'/conf.conf')
    # lots of functions...
installconfig.py
class State(Enum):
    BEGIN = 0    # Look for units
    UNIT = 1     # Look for unit keypairs
    KEYPAIR = 3

class Phase(Enum):
    NONE = 0
    DEPS = 1
    PKGS = 2

class Installconfiguration(object):
    def __init__(self, config):
        dictionary = self.reader(config)  # Fill a dictionary
        # dictionary (key: Phase, value: (dictionary key: str, job))
        self.deps = dictionary[Phase.DEPS]
        self.pkgs = dictionary[Phase.PKGS]
job.py
class Job(object):
    def __init__(self, directory=None, url=None):
        self.directory = directory if directory else ''
        self.url = url if url else ''
As you can see, I pass a component as an argument to the function phase2(component, str, str, multiprocessing.Manager().Queue()).
The second and third arguments of the constructor of Component are modules imported with importlib.
What I tried
I am new to python, but not to programming. Here is what I tried:
Because the error itself did not point out what exactly the problem was, I tried removing arguments one by one to find out which one could not be pickled. Remove component, and everything works fine, so this appears to be the cause of the trouble. However, I need this object passed to my processes.
I searched the internet for hours, but found nothing beyond basic tutorials about multiprocessing and explanations of how pickle works. I did find this, saying it should work except on Windows. However, it does not work on Unix (which I use) either.
My ideas
As I understand it, nothing suggests I cannot send a class containing two importlib modules. I do not know what exactly the problem is with the component class, but the importlib modules as members are the only unusual thing about it. This is why I believe the problem occurs there.
Question
Do you know why a class containing modules is unsuitable for 'pickling'? How can one get a better idea of why and where Can't pickle <class 'module'> errors occur?
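As a diagnostic aid, here is a minimal sketch (find_unpicklable_attrs is a hypothetical helper, not from the code above) that pickles each attribute of an instance separately to locate the offender:

import pickle

def find_unpicklable_attrs(obj):
    # Try to pickle each attribute on its own and report the ones that fail.
    for name, value in vars(obj).items():
        try:
            pickle.dumps(value)
        except Exception as e:
            print('unpicklable attribute %s: %s' % (name, e))

Run against a Component instance, this would report installmodule and runmodule, the module objects that pickle rejects.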
More code
Full source code for this can be found on https://github.com/Sebastiaan-Alvarez-Rodriguez/paellego
Questions to me
Please leave comments requesting clarifications/more code snippets/??? if you would like me to edit this question
A last request
I would like solutions to use python standard library only, python 3.3 preferably. Also, a requirement of my code is that it runs on Unix systems.
Thanks in advance
Edit
As requested, here is a minimal example which greatly simplifies the problem:
main.py (you could execute as python main.py foo)
#!/usr/bin/env python
import sys
import importlib
import multiprocessing

class clazz(object):
    def __init__(self, moduly):
        self.moduly = moduly

    def foopass(self, stringy):
        self.moduly.foo(stringy)

    def barpass(self, stringy, numbery):
        self.moduly.bar(stringy)
        print('Second argument: '+str(numbery))

def worker(clazzy, numbery):
    clazzy.barpass('wow', numbery)

def main():
    clazzy = clazz(importlib.import_module(sys.argv[1]))
    clazzy.foopass('init')
    args = [(clazzy, 2,)]
    with multiprocessing.Pool(processes=2) as pool:
        pool.starmap(worker, args)

if __name__ == "__main__":
    main()
foo.py (needs to be in same directory for above call suggestion):
#!/usr/bin/env python
globaly = 0

def foo(stringy):
    print('foo '+stringy)
    global globaly
    globaly = 5

def bar(stringy):
    print('bar '+stringy)
    print(str(globaly))
This gives an error upon running: TypeError: can't pickle module objects
Now we know that pickling module objects is (sadly) not possible.
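For reference, the limitation can be reproduced in isolation (a minimal check; any module object behaves the same way):

import pickle
import sys

try:
    pickle.dumps(sys)
except TypeError as e:
    print(e)  # "can't pickle module objects" (exact wording varies by Python version)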
In order to get rid of the error, let clazz not take a module as an attribute, however convenient, but let it take "modpathy": the string importlib needs to import the module specified by the user.
It looks like this (foo.py remains exactly the same as above):
#!/usr/bin/env python
import sys
import importlib
import multiprocessing

class clazz(object):
    def __init__(self, modpathy):
        self.modpathy = modpathy

    def foopass(self, stringy):
        moduly = importlib.import_module(self.modpathy)
        moduly.foo(stringy)

    def barpass(self, stringy, numbery):
        moduly = importlib.import_module(self.modpathy)
        moduly.bar(stringy)
        print('Second argument: '+str(numbery))

def worker(clazzy, number):
    clazzy.barpass('wow', number)

def main():
    clazzy = clazz(sys.argv[1])
    clazzy.foopass('init')
    args = [(clazzy, 2,)]
    with multiprocessing.Pool(processes=2) as pool:
        pool.starmap(worker, args)

if __name__ == "__main__":
    main()
If you require that your globals, such as globaly, are guaranteed to maintain state, then you need to pass a mutable object (e.g. a list or a dictionary) to hold this data, thanks @DavisHerring:
Module attributes are called “global variables” in Python, but they are no more persistent or accessible than any other data. Why not just use dictionaries?
The example code would look like this:
#!/usr/bin/env python
import sys
import importlib
import multiprocessing

class clazz(object):
    def __init__(self, modpathy):
        self.modpathy = modpathy
        self.dictionary = {}

    def foopass(self, stringy):
        moduly = importlib.import_module(self.modpathy)
        moduly.foo(stringy, self.dictionary)

    def barpass(self, stringy, numbery):
        moduly = importlib.import_module(self.modpathy)
        moduly.bar(stringy, self.dictionary)
        print('Second argument: '+str(numbery))

def worker(clazzy, number):
    clazzy.barpass('wow', number)

def main():
    clazzy = clazz(sys.argv[1])
    clazzy.foopass('init')
    args = [(clazzy, 2,)]
    with multiprocessing.Pool(processes=2) as pool:
        pool.starmap(worker, args)

if __name__ == "__main__":
    main()
foo.py (no more globals):
#!/usr/bin/env python
def foo(stringy, dictionary):
    print('foo '+stringy)
    globaly = 5
    dictionary['globaly'] = globaly

def bar(stringy, dictionary):
    print('bar '+stringy)
    globaly = dictionary['globaly']
    print(str(globaly))
This way you can work around the problem without annoying can't pickle ... errors, while maintaining state.
Related
I faced this problem in my work code, so I can't show it here. But I wrote a short example which reproduces the error exactly and cuts out the redundant logic.
The example has two files: Example.py and ImportedExample.py.
Example.py
from multiprocessing import Process
from ImportedExample import Imported

class Example:
    def __init__(self, number):
        self.imported = Imported(number)

def func(example: Example):
    print(example)

if __name__ == "__main__":
    ex = Example(3)
    p = Process(target=func, args=(ex,))
    p.start()
ImportedExample.py
class Imported:
    def __init__(self, number):
        self.number = number
        self.ref = self.__private_method

    def __private_method(self):
        print(self.number)
And the traceback looks like this:
Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "C:\Python\Python36\lib\multiprocessing\spawn.py", line 105, in spawn_main
    exitcode = _main(fd)
  File "C:\Python\Python36\lib\multiprocessing\spawn.py", line 115, in _main
    self = reduction.pickle.load(from_parent)
AttributeError: 'Imported' object has no attribute '__private_method'
The main detail is that when I make __private_method() non-private (renaming it to private_method()), everything works fine.
I don't understand why this happens. Any suggestions?
The multiprocessing module uses pickle for transferring objects between processes.
For an object to be picklable, it has to be accessible by name. Because of private name mangling, referenced private methods don't fall into that category.
I suggest making the method protected, that is, naming it with only one leading underscore. From a design point of view, protected methods should be treated just like private methods, but they are not subject to name mangling.
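To see why the lookup fails, here is a minimal sketch of the mangling at work:

class Imported:
    def __private_method(self):
        pass

obj = Imported()
print('_Imported__private_method' in dir(obj))  # True: the attribute's real name is mangled
try:
    getattr(obj, '__private_method')  # the unmangled name, which pickle looks up
except AttributeError as e:
    print(e)  # 'Imported' object has no attribute '__private_method'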
I have a pickling problem. I want to serialize a function in my main script, then load it and run it in another script. To demonstrate this, I've made 2 scripts:
Attempt 1: The naive way:
dill_pickle_script_1.py
import pickle
import time

def my_func(a, b):
    time.sleep(0.1)  # The purpose of this will become evident at the end
    return a+b

if __name__ == '__main__':
    with open('testfile.pkl', 'wb') as f:
        pickle.dump(my_func, f)
dill_pickle_script_2.py
import pickle

if __name__ == '__main__':
    with open('testfile.pkl', 'rb') as f:  # 'rb': pickle files are binary
        func = pickle.load(f)
    assert func(1, 2) == 3
Problem: when I run script 2, I get AttributeError: 'module' object has no attribute 'my_func'. I understand why: because when my_func is serialized in script1, it belongs to the __main__ module. dill_pickle_script_2 can't know that __main__ there referred to the namespace of dill_pickle_script_1, and therefore cannot find the reference.
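You can confirm what gets recorded (a quick check, not part of the scripts above) by printing the function's module and the protocol-0 payload:

# At the bottom of dill_pickle_script_1.py, before pickling:
print(my_func.__module__)                 # '__main__' when run as a script
print(pickle.dumps(my_func, protocol=0))  # the payload embeds the names __main__ and my_func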
Attempt 2: Inserting an absolute import
I fix the problem by adding a little hack - I add an absolute import to my_func in dill_pickle_script_1 before pickling it.
dill_pickle_script_1.py
import pickle
import time

def my_func(a, b):
    time.sleep(0.1)
    return a+b

if __name__ == '__main__':
    from dill_pickle_script_1 import my_func  # Added absolute import
    with open('testfile.pkl', 'wb') as f:
        pickle.dump(my_func, f)
Now it works! However, I'd like to avoid having to do this hack every time. (Also, I want my pickling to be done inside some other module, which wouldn't know which module my_func came from.)
Attempt 3: Dill
I heard that the package dill lets you serialize things in main and load them elsewhere. So I tried that:
dill_pickle_script_1.py
import dill
import time

def my_func(a, b):
    time.sleep(0.1)
    return a+b

if __name__ == '__main__':
    with open('testfile.pkl', 'wb') as f:
        dill.dump(my_func, f)
dill_pickle_script_2.py
import dill

if __name__ == '__main__':
    with open('testfile.pkl', 'rb') as f:  # 'rb': pickle files are binary
        func = dill.load(f)
    assert func(1, 2) == 3
Now, however, I have another problem: when running dill_pickle_script_2.py, I get a NameError: global name 'time' is not defined. It seems that dill did not realize that my_func referenced the time module and had to import it on load.
My question
How can I serialize an object in main, and load it again in another script so that all the imports used by that object are also loaded, without doing the nasty little hack in Attempt 2?
Well, I found a solution. It is a horrible but tidy kludge and not guaranteed to work in all cases. Any suggestions for improvement are welcome. The solution involves replacing the main reference with an absolute module reference in the pickle string, using the following helper functions:
import sys
import os
import pickle

def pickle_dumps_without_main_refs(obj):
    """
    Yeah this is horrible, but it allows you to pickle an object in the main
    module so that it can be reloaded in another module.
    :param obj:
    :return:
    """
    currently_run_file = sys.argv[0]
    module_path = file_path_to_absolute_module(currently_run_file)
    pickle_str = pickle.dumps(obj, protocol=0)
    pickle_str = pickle_str.replace('__main__', module_path)  # Hack!
    return pickle_str

def pickle_dump_without_main_refs(obj, file_obj):
    string = pickle_dumps_without_main_refs(obj)
    file_obj.write(string)

def file_path_to_absolute_module(file_path):
    """
    Given a file path, return an import path.
    :param file_path: A file path.
    :return:
    """
    assert os.path.exists(file_path)
    file_loc, ext = os.path.splitext(file_path)
    assert ext in ('.py', '.pyc')
    directory, module = os.path.split(file_loc)
    module_path = [module]
    while True:
        if os.path.exists(os.path.join(directory, '__init__.py')):
            directory, package = os.path.split(directory)
            module_path.append(package)
        else:
            break
    path = '.'.join(module_path[::-1])
    return path
Now, I can simply change dill_pickle_script_1.py to say
import time
from artemis.remote.child_processes import pickle_dump_without_main_refs

def my_func(a, b):
    time.sleep(0.1)
    return a+b

if __name__ == '__main__':
    with open('testfile.pkl', 'wb') as f:
        pickle_dump_without_main_refs(my_func, f)
And then dill_pickle_script_2.py works!
You can use dill.dump with recurse=True or dill.settings["recurse"] = True. It will capture closures:
In file A:
import time
import dill

def my_func(a, b):
    time.sleep(0.1)
    return a + b

with open("tmp.pkl", "wb") as f:
    dill.dump(my_func, f, recurse=True)
In file B:
import dill

with open("tmp.pkl", "rb") as f:
    my_func = dill.load(f)
Here's another solution that modifies the serialization so that it will deserialize without any special measures. You could argue it is less hacky than Peter's solution.
Instead of hacking the output from pickle.dumps(), this subclasses Pickler to modify the way it pickles objects that refer back to __main__. This does mean that the fast (C implementation) pickler can't be used, so there is a performance penalty with this method. It also overrides the save_pers() method of Pickler, which isn't intended to be overridden. So this could break in a future version of Python (unlikely though).
import importlib
import inspect
import io
import pickle
import sys
from pathlib import Path

def get_function_module_str(func):
    """Returns a dotted module string suitable for importlib.import_module() from a
    function reference.
    """
    source_file = Path(inspect.getsourcefile(func))
    # (Doesn't work with built-in functions)
    if not source_file.is_absolute():
        rel_path = source_file
    else:
        # It's an absolute path so find the longest entry in sys.path that shares a
        # common prefix and remove the prefix.
        for path_str in sorted(sys.path, key=len, reverse=True):
            try:
                rel_path = source_file.relative_to(Path(path_str))
                break
            except ValueError:
                pass
        else:
            raise ValueError(f"{source_file!r} is not on the Python path")
    # Replace path separators with dots.
    modules_str = ".".join(p for p in rel_path.with_suffix("").parts if p != "__init__")
    return modules_str, func.__name__

class ResolveMainPickler(pickle._Pickler):
    """Subclass of Pickler that replaces __main__ references with the actual module
    name."""

    def persistent_id(self, obj):
        """Override to see if this object is defined in "__main__" and if so to replace
        __main__ with the actual module name."""
        if getattr(obj, "__module__", None) == "__main__":
            module_str, obj_name = get_function_module_str(obj)
            obj_ref = getattr(importlib.import_module(module_str), obj_name)
            return obj_ref
        return None

    def save_pers(self, pid):
        """Override the function to save a persistent ID so that it saves it as a
        normal reference. So it can be unpickled with no special arrangements.
        """
        self.save(pid, save_persistent_id=False)

with io.BytesIO() as pickled:
    pickler = ResolveMainPickler(pickled)
    pickler.dump(obj)
    print(pickled.getvalue())
If you already know the name of the __main__ module then you could dispense with get_function_module_str() and just supply the name directly.
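For instance (a hedged sketch of that shortcut; "mypkg.mymodule" is a placeholder for whatever module __main__ really is):

class KnownMainPickler(pickle._Pickler):
    MAIN_MODULE = "mypkg.mymodule"  # placeholder: the real name of __main__

    def persistent_id(self, obj):
        # Same idea as above, minus the path lookup.
        if getattr(obj, "__module__", None) == "__main__":
            return getattr(importlib.import_module(self.MAIN_MODULE), obj.__name__)
        return None

    def save_pers(self, pid):
        self.save(pid, save_persistent_id=False)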
I am getting the following error with a unittest implementation in my program:
  File "/usr/lib/python2.7/unittest/case.py", line 493, in _getAssertEqualityFunc
    asserter = self._type_equality_funcs.get(type(first))
AttributeError: 'Utility' object has no attribute '_type_equality_funcs'
When I try to create a common class and execute test functions through that common utility class, I get the above error, but with a normal unittest class implementation there is no error.
Below is a detailed explanation of the program, which executes without any errors:
class BaseTestCase(unittest.TestCase):
    def __init__(self, methodName='runTest', param=None):
        super(BaseTestCase, self).__init__(methodName)
        self.param = param

    @staticmethod
    def parametrize(testcase_klass, param=None):
        testloader = unittest.TestLoader()
        testnames = testloader.getTestCaseNames(testcase_klass)
        suite = unittest.TestSuite()
        for name in testnames:
            suite.addTest(testcase_klass(name, param=param))
        return suite
Now I am inheriting the BaseTestCase class and calling test cases:
class salesgrowth_DevInt(BaseTestCase):
    def setUp(self):
        print "constructor"
        pwd = os.getcwd()

    def test4_refactoring(self, log):
        if (STATUS.lower() == "completed" or STATUS == "Actor : SUCCESS"):
            self.assertEqual(os.stat(OUTPUT + '/tes1.txt').st_size, 0,
                             'employee count is not matching with master data. Different entries are in test1.txt\n')
Up to this point everything works OK.
Now, like salesgrowth_DevInt, there are a number of other test cases that inherit BaseTestCase and execute the test4_refactoring test case (a number of lines have been removed here for brevity). To avoid duplicating code, I created a common Utility class containing the test4_refactoring function, serving all the test cases such as salesgrowth_DevInt.
Below is the common utility class code:
import sys
import json, sys, os, argparse, commands, time, string, filecmp
import unittest

class Utility(object):
    '''common utility class for common test case operations'''

    def __init__(self):
        print "constructor"
        pwd = os.getcwd()
        print "Current working directory %s\n" % pwd
        global scriptpath
        scriptpath = os.path.join(pwd, "src/Runner/")
        maxDiff = int(80)

    def test4_refactoring(self, STATUS, BASE, ANALYSIS_DIR, OUTPUT, log):
        print "common function"
        log.write('\n')
        if (STATUS.lower() == "completed" or STATUS == "Actor : SUCCESS"):
            self.assertEqual(os.stat(OUTPUT + '/tes1.txt').st_size, 0,
                             'employee count is not matching with master data. Different entries are in test1.txt\n')
But using the utility code, when I try to execute the statement below:
self.assertEqual(os.stat(OUTPUT + '/tes1.txt').st_size, 0,
                 'employee count is not matching with master data. Different entries are in test1.txt\n')
I get the following error:
Traceback (most recent call last):
  File "/src/testCases/salesgrowth_DevInt.py", line 96, in test4_refactoring
    utils_obj.test4_refactoring(self.STATUS, self.BASE, self.ANALYSIS_DIR, self.OUTPUT, log)
  File "/src/common/Utils.py", line 436, in test4_refactoring
    'employee count is not matching with master data. Different entries are in test1.txt\n')
  File "/usr/lib/python2.7/unittest/case.py", line 512, in assertEqual
    assertion_func = self._getAssertEqualityFunc(first, second)
  File "/usr/lib/python2.7/unittest/case.py", line 493, in _getAssertEqualityFunc
    asserter = self._type_equality_funcs.get(type(first))
AttributeError: 'Utility' object has no attribute '_type_equality_funcs'
Please let me know if anyone has any pointers or suggestions for the above issue, and what is wrong in the above implementation.
self.assertEqual is only available to classes that inherit from unittest.TestCase, which your Utility class does not do.
I suggest putting your Utility methods in the BaseTestCase class instead.
Give the helper a name not starting with test_, and call this new function to validate your asserts from the numerous other test cases, as sketched below.
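A minimal sketch of that restructuring, using the names from the question (check_refactoring is a hypothetical name; anything not starting with test works):

class BaseTestCase(unittest.TestCase):
    # __init__ and parametrize as shown earlier...

    def check_refactoring(self, STATUS, BASE, ANALYSIS_DIR, OUTPUT, log):
        # Shared assertion helper. self is now a real TestCase, so
        # _type_equality_funcs exists and assertEqual works.
        log.write('\n')
        if STATUS.lower() == "completed" or STATUS == "Actor : SUCCESS":
            self.assertEqual(os.stat(OUTPUT + '/tes1.txt').st_size, 0,
                             'employee count is not matching with master data\n')

class salesgrowth_DevInt(BaseTestCase):
    def test4_refactoring(self):
        # log opened elsewhere, as in the question
        self.check_refactoring(self.STATUS, self.BASE, self.ANALYSIS_DIR,
                               self.OUTPUT, log)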
I have a problem with multiprocessing. Below is the code (it is spread over a couple of classes and files, but I have simplified it here).
I suppose the problem lies in passing the method name that I want to run in multiple processes.
Information:
"args" is a list such as [(0,1),(1,2),(2,3)], so a single "arg" is, for example, (0,1)
These two files are in completely different directories
!!First file!!
from ... import EF
from ... import someclass

class performs():
    def action(self):
        for i, arg in enumerate(args):
            data.append(EF(self.method, list(arg), i))
        someclass.create_processes(*data)

    def method(self, fa, la):
        ...
!!second file!!
from multiprocessing import Process, Event

class EF(object):
    def __init__(self, name, args=list(), proc=1):
        self.name = name
        self.args = args
        self.proc = proc

class someclass:
    @staticmethod
    def create_processes(*functions):
        processes = dict()
        for function in functions:
            process = Process(target=function.name, args=function.args)
            process.start()
            processes[process.pid] = process
        for process in processes.values():
            process.join()
When I'm debugging, the error comes when the program performs the instruction process.start().
Console:
File "C:\Python32\lib\multiprocessing\forking.py", line 371, in main
self = load(from_parent)
AttributeError: 'performs' object has no attribute 'function'
or, in another situation:
File "C:\Python32\lib\multiprocessing\process.py", line 267, in _bootstrap
self.run()
File "C:\Python32\lib\multiprocessing\process.py", line 116, in run
self._target(*self._args, **self._kwargs)
File "...\performs.py", line 88, in method
...
I don't know if it's important, but I have a 64-bit system with 32-bit Python and accessories installed.
The comment box is too small for this, but it seems like your code is working fine. I tested it by copying it into two files (like your environment), but it of course also works in a single file. That file you can find below: it works as expected (save for the print statements' output being mixed up, but that is to be expected).
So, most likely, your issue lies elsewhere. The error seems to indicate perhaps some inclusion order, as stated in this question?
from multiprocessing import Process, Event

class EF:
    def __init__(self, name, args=list(), proc=1):
        self.name = name
        self.args = args
        self.proc = proc

class someclass:
    @staticmethod
    def create_processes(*functions):
        processes = dict()
        for function in functions:
            process = Process(target=function.name, args=function.args)
            process.start()
            processes[process.pid] = process
        for process in processes.values():
            process.join()

class performs:
    def action(self, args):
        data = []
        for i, arg in enumerate(args):
            data.append(EF(self.mult, list(arg), i))
        someclass.create_processes(*data)

    def mult(self, fa, la):
        print '%d x %d = %d' % (fa, la, fa * la)

if __name__ == '__main__':
    p = performs()
    p.action([(x, x+1) for x in xrange(10)])
I am sorry that I can't reproduce the error with a simpler example, and my code is too complicated to post. If I run the program in an IPython shell instead of regular Python, things work out well.
I looked up some previous notes on this problem. They were all caused by using pool to call a function defined within a class function. But this is not the case for me.
Exception in thread Thread-3:
Traceback (most recent call last):
  File "/usr/lib64/python2.7/threading.py", line 552, in __bootstrap_inner
    self.run()
  File "/usr/lib64/python2.7/threading.py", line 505, in run
    self.__target(*self.__args, **self.__kwargs)
  File "/usr/lib64/python2.7/multiprocessing/pool.py", line 313, in _handle_tasks
    put(task)
PicklingError: Can't pickle <type 'function'>: attribute lookup __builtin__.function failed
I would appreciate any help.
Update: The function I pickle is defined at the top level of the module, though it calls a function that contains a nested function. That is, f() calls g(), which calls h(), which has a nested function i(), and I am calling pool.apply_async(f). f(), g(), and h() are all defined at the top level. I tried a simpler example with this pattern, though, and it works.
Here is a list of what can be pickled. In particular, functions are only picklable if they are defined at the top-level of a module.
This piece of code:
import multiprocessing as mp

class Foo():
    @staticmethod
    def work(self):
        pass

if __name__ == '__main__':
    pool = mp.Pool()
    foo = Foo()
    pool.apply_async(foo.work)
    pool.close()
    pool.join()
yields an error almost identical to the one you posted:
Exception in thread Thread-2:
Traceback (most recent call last):
  File "/usr/lib/python2.7/threading.py", line 552, in __bootstrap_inner
    self.run()
  File "/usr/lib/python2.7/threading.py", line 505, in run
    self.__target(*self.__args, **self.__kwargs)
  File "/usr/lib/python2.7/multiprocessing/pool.py", line 315, in _handle_tasks
    put(task)
PicklingError: Can't pickle <type 'function'>: attribute lookup __builtin__.function failed
The problem is that the pool methods all use an mp.SimpleQueue to pass tasks to the worker processes. Everything that goes through the mp.SimpleQueue must be picklable, and foo.work is not picklable since it is not defined at the top level of the module.
It can be fixed by defining a function at the top level which calls foo.work():
def work(foo):
    foo.work()

pool.apply_async(work, args=(foo,))
Notice that foo is picklable, since Foo is defined at the top level and foo.__dict__ is picklable.
I'd use pathos.multiprocessing instead of multiprocessing. pathos.multiprocessing is a fork of multiprocessing that uses dill. dill can serialize almost anything in Python, so you are able to send a lot more around in parallel. The pathos fork also has the ability to work directly with multiple-argument functions, as you need for class methods.
>>> from pathos.multiprocessing import ProcessingPool as Pool
>>> p = Pool(4)
>>> class Test(object):
...     def plus(self, x, y):
...         return x+y
...
>>> t = Test()
>>> p.map(t.plus, x, y)
[4, 6, 8, 10]
>>>
>>> class Foo(object):
...     @staticmethod
...     def work(self, x):
...         return x+1
...
>>> f = Foo()
>>> p.apipe(f.work, f, 100)
<processing.pool.ApplyResult object at 0x10504f8d0>
>>> res = _
>>> res.get()
101
Get pathos (and if you like, dill) here:
https://github.com/uqfoundation
When this problem comes up with multiprocessing, a simple solution is to switch from Pool to ThreadPool. This can be done with no change of code other than the import:
from multiprocessing.pool import ThreadPool as Pool
This works because ThreadPool shares memory with the main thread rather than creating a new process, which means that pickling is not required.
The downside to this method is that Python isn't the greatest language at handling threads: it uses something called the Global Interpreter Lock to stay thread safe, which can slow down some use cases here. However, if you're primarily interacting with other systems (running HTTP commands, talking with a database, writing to filesystems), then your code is likely not bound by CPU and won't take much of a hit. In fact, when writing HTTP/HTTPS benchmarks, I've found that the threaded model used here has less overhead and fewer delays, as the overhead from creating new processes is much higher than the overhead for creating new threads, and the program was otherwise just waiting for HTTP responses.
So if you're processing a ton of stuff in Python userspace, this might not be the best method.
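As an illustration (a minimal sketch, assuming an I/O-bound task; Fetcher is a made-up class), a bound method that plain Pool would have to pickle runs unmodified under ThreadPool:

from multiprocessing.pool import ThreadPool as Pool

class Fetcher(object):
    def fetch(self, url):
        # Stand-in for an I/O-bound call (HTTP request, DB query, ...)
        return len(url)

if __name__ == '__main__':
    pool = Pool(4)
    f = Fetcher()
    print(pool.map(f.fetch, ['http://a', 'http://bb']))  # no pickling happens
    pool.close()
    pool.join()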
As others have said, multiprocessing can only transfer Python objects to worker processes which can be pickled. If you cannot reorganize your code as described by unutbu, you can use dill's extended pickling/unpickling capabilities for transferring data (especially code data), as I show below.
This solution requires only the installation of dill and no other libraries such as pathos:
import os
from multiprocessing import Pool
import dill

def run_dill_encoded(payload):
    fun, args = dill.loads(payload)
    return fun(*args)

def apply_async(pool, fun, args):
    payload = dill.dumps((fun, args))
    return pool.apply_async(run_dill_encoded, (payload,))

if __name__ == "__main__":
    pool = Pool(processes=5)

    # async execution of lambda
    jobs = []
    for i in range(10):
        job = apply_async(pool, lambda a, b: (a, b, a * b), (i, i + 1))
        jobs.append(job)

    for job in jobs:
        print job.get()
    print

    # async execution of static method
    class O(object):
        @staticmethod
        def calc():
            return os.getpid()

    jobs = []
    for i in range(10):
        job = apply_async(pool, O.calc, ())
        jobs.append(job)

    for job in jobs:
        print job.get()
I have found that I can also generate exactly that error output on a perfectly working piece of code by attempting to use the profiler on it.
Note that this was on Windows (where the forking is a bit less elegant).
I was running:
python -m profile -o output.pstats <script>
And found that removing the profiling removed the error and placing the profiling restored it. Was driving me batty too because I knew the code used to work. I was checking to see if something had updated pool.py... then had a sinking feeling and eliminated the profiling and that was it.
Posting here for the archives in case anybody else runs into it.
Can't pickle <type 'function'>: attribute lookup __builtin__.function failed
This error will also occur if you have any inbuilt function inside the model object that is passed to the async job.
So make sure to check that the model objects you pass don't have inbuilt functions. (In our case, we were using the FieldTracker() function of django-model-utils inside the model to track a certain field.) Here is the link to the relevant GitHub issue.
This solution requires only the installation of dill and no other libraries such as pathos:
import dill

def apply_packed_function_for_map((dumped_function, item, args, kwargs),):
    """
    Unpack dumped function as target function and call it with arguments.

    :param (dumped_function, item, args, kwargs):
        a tuple of dumped function and its arguments
    :return:
        result of target function
    """
    target_function = dill.loads(dumped_function)
    res = target_function(item, *args, **kwargs)
    return res

def pack_function_for_map(target_function, items, *args, **kwargs):
    """
    Pack function and arguments to an object that can be sent from one
    multiprocessing.Process to another. The main problem is:
        «multiprocessing.Pool.map*» or «apply*»
        cannot use class methods or closures.
    It solves this problem with «dill».
    It works with the target function as an argument, dumps it («with dill»)
    and returns the dumped function with the arguments of the target function.
    For more performance we dump only the target function itself
    and don't dump its arguments.
    How to use (pseudo-code):

        ~>>> import multiprocessing
        ~>>> images = [...]
        ~>>> pool = multiprocessing.Pool(100500)
        ~>>> features = pool.map(
        ~...     *pack_function_for_map(
        ~...         super(Extractor, self).extract_features,
        ~...         images,
        ~...         type='png',
        ~...         **options,
        ~...     )
        ~... )
        ~>>>

    :param target_function:
        function that you want to execute like target_function(item, *args, **kwargs).
    :param items:
        list of items for map
    :param args:
        positional arguments for target_function(item, *args, **kwargs)
    :param kwargs:
        named arguments for target_function(item, *args, **kwargs)
    :return: tuple(function_wrapper, dumped_items)
        It returns a tuple with
        * a function wrapper that unpacks and calls the target function;
        * a list of the packed target function and its arguments.
    """
    dumped_function = dill.dumps(target_function)
    dumped_items = [(dumped_function, item, args, kwargs) for item in items]
    return apply_packed_function_for_map, dumped_items
It also works for numpy arrays.
A quick fix is to make the function global:
from multiprocessing import Pool

class Test:
    def __init__(self, x):
        self.x = x

    @staticmethod
    def test(x):
        return x**2

    def test_apply(self, list_):
        global r
        def r(x):
            return Test.test(x + self.x)
        with Pool() as p:
            l = p.map(r, list_)
        return l

if __name__ == '__main__':
    o = Test(2)
    print(o.test_apply(range(10)))
Building on @rocksportrocker's solution, it would make sense to dill when sending and receiving the results:
import dill
import itertools

def run_dill_encoded(payload):
    fun, args = dill.loads(payload)
    res = fun(*args)
    res = dill.dumps(res)
    return res

def dill_map_async(pool, fun, args_list,
                   as_tuple=True,
                   **kw):
    if as_tuple:
        args_list = ((x,) for x in args_list)

    it = itertools.izip(
        itertools.cycle([fun]),
        args_list)
    it = itertools.imap(dill.dumps, it)
    return pool.map_async(run_dill_encoded, it, **kw)

if __name__ == '__main__':
    import multiprocessing as mp
    import sys, os

    p = mp.Pool(4)
    res = dill_map_async(p, lambda x: [sys.stdout.write('%s\n' % os.getpid()), x][-1],
                         [lambda x: x+1] * 10,)
    res = res.get(timeout=100)
    res = map(dill.loads, res)
    print(res)
As @penky Suresh suggested in this answer, don't use built-in keywords.
Apparently args is a built-in keyword when dealing with multiprocessing:
from concurrent.futures import ProcessPoolExecutor, as_completed

class TTS:
    def __init__(self):
        pass

    def process_and_render_items(self):
        multiprocessing_args = [{"a": "b", "c": "d"}, {"e": "f", "g": "h"}]

        with ProcessPoolExecutor(max_workers=10) as executor:
            # Using args here is fine.
            future_processes = {
                executor.submit(TTS.process_and_render_item, args)
                for args in multiprocessing_args
            }

            for future in as_completed(future_processes):
                try:
                    data = future.result()
                except Exception as exc:
                    print(f"Generated an exception: {exc}")
                else:
                    print(f"Generated data for comment process: {future}")

    # Don't use 'args' here. It seems to be a built-in keyword.
    # Changing 'args' to 'arg' worked for me.
    def process_and_render_item(arg):
        print(arg)
        # This will print {"a": "b", "c": "d"} for the first process
        # and {"e": "f", "g": "h"} for the second process.
PS: The tabs/spaces maybe a bit off.