Related
Context:
I'm writing a personal Python module to simplify some scripts I have lying around. One of the functions I have is untested and may have undesirable edge cases that I still have to consider. To keep myself from relying on it in other modules or functions, I was wondering whether I could force it to raise an error when it is not called directly from the REPL.
I'm not asking whether this is a good idea or not. It obviously isn't, because it defeats the purpose of writing a function in the first place. I'm wondering whether it is possible in Python, and how to do it.
Question:
Is it possible to have a function raise an error if not called interactively? For example:
def is_called_from_top_level():
    "How to implement this?"
    pass

def shady_func():
    "Only for testing at the REPL. Calling from elsewhere will raise."
    if not is_called_from_top_level():
        raise NotImplementedError("Shady function can only be called directly.")
    return True

def other_func():
    "Has an indirect call to shady."
    return shady_func()
And then at a REPL:
[In:1] shady_func()
[Out:1] True
[In:2] other_func()
[Out:2] NotImplementedError: "Shady function can only be called directly."
Try checking for ps1 on sys.
import sys

def dangerous_util_func(a, b):
    is_interactive = bool(getattr(sys, 'ps1', False))
    print(is_interactive)  # Prints True or False
    return a + b
You can even get fancy and make a decorator for this to make it more reusable.
import sys
from functools import wraps

def repl_only(func):
    @wraps(func)
    def wrapped(*args, **kwargs):
        is_interactive = bool(getattr(sys, 'ps1', False))
        if not is_interactive:
            raise NotImplementedError("Can only be called from REPL")
        return func(*args, **kwargs)
    return wrapped

@repl_only
def dangerous_util_func(a, b):
    return a + b
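To sanity-check the decorator, note that sys.ps1 only exists while an interactive prompt is active, so a call from a script (or from another module) raises, while the same call typed at the REPL succeeds. A minimal check, assuming the code above is saved as a script:

if __name__ == "__main__":
    # Run as a script: sys.ps1 is absent, so the guard trips.
    try:
        dangerous_util_func(1, 2)
    except NotImplementedError as exc:
        print(exc)  # Can only be called from REPL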
DISCLAIMER: This is a bit of a hack, and may not work across different Python / IPython / Jupyter versions, but the underlying idea still holds, i.e. use inspect to get an idea of who is calling.
The code below was tested with Python 3.7.3, IPython 7.6.1 and Jupyter Notebook Server 5.7.8.
Using inspect (obviously), one can look for distinctive features of the REPL frame:
- inside a Jupyter Notebook, you can check whether the repr() of the previous frame contains the string 'code <module>';
- in Python / IPython, you can check whether the code representation of the previous frame starts at line 1.
In code, this would look like:
import inspect

def is_called_from_top_level():
    "How to implement this?"
    pass

def shady_func():
    "Only for testing at the REPL. Calling from elsewhere will raise."
    frame = inspect.currentframe()
    is_interactive = (
        'code <module>' in repr(frame.f_back)        # Jupyter
        or 'line 1>' in repr(frame.f_back.f_code))   # Python / IPython
    if not is_interactive:
        raise NotImplementedError("Shady function can only be called directly.")
    return True

def other_func():
    "Has an indirect call to shady."
    return shady_func()

shady_func()
# True

other_func()
# raises NotImplementedError
(EDITED to include support for both Jupyter Notebook and Python / IPython).
As suggested by @bananafish, this is actually a good use case for a decorator:
import inspect
import functools

def repl_only(func):
    @functools.wraps(func)
    def wrapped(*args, **kwargs):
        frame = inspect.currentframe()
        is_interactive = (
            'code <module>' in repr(frame.f_back)        # Jupyter
            or 'line 1>' in repr(frame.f_back.f_code))   # Python / IPython
        if not is_interactive:
            raise NotImplementedError('Can only be called from REPL')
        return func(*args, **kwargs)
    return wrapped

@repl_only
def foo():
    return True

def bar():
    return foo()

print(foo())
# True

print(bar())
# raises NotImplementedError
You can do something like this:
import inspect

def other():
    shady()

def shady():
    curfrm = inspect.currentframe()
    calframe = inspect.getouterframes(curfrm, 2)
    caller = calframe[1][3]
    if '<module>' not in caller:
        raise Exception("Not an acceptable caller")
    print("that's fine")

if __name__ == '__main__':
    import sys
    args = sys.argv[1:]
    shady()
    other()
The inspect module allows you to get information such as the function's caller. You may have to dig a bit deeper if you have edge cases.
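For instance, to see what there is to dig through, here is a minimal diagnostic sketch (an addition of mine, not part of the code above; it assumes Python 3, where inspect.stack() yields FrameInfo objects):

import inspect

def show_callers():
    # Print each outer frame's function name and file, from the
    # immediate caller outwards.
    for frame_info in inspect.stack()[1:]:
        print(frame_info.function, frame_info.filename)

Called at the REPL, the chain ends in a '<module>' entry, which is exactly the marker shady() keys on.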
Inspired by the comment on the OP suggesting looking at the stack trace, by @norok2's solution based on direct caller inspection, and by @bananafish's use of the decorator, I came up with an alternative solution that requires neither inspect nor sys.
The idea is to throw and catch to get a handle on a traceback object (essentially our stack trace), and then do the direct caller inspection.
from functools import wraps

def repl_only(func):
    @wraps(func)
    def wrapped(*args, **kwargs):
        try:
            raise Exception
        except Exception as e:
            if "module" not in str(e.__traceback__.tb_frame.f_back)[-10:]:
                raise NotImplementedError(f"{func.__name__} has to be called from the REPL!")
        return func(*args, **kwargs)
    return wrapped

@repl_only
def dangerous_util_func(a, b):
    return a + b

def foo():
    return dangerous_util_func(1, 2)
Here dangerous_util_func will run and foo will throw.
I originally wrote a CLI with click as a module and it worked fine. But since my project has grown, I now need attributes the CLI can work with, so I tried to turn it into a class, and I'm running into an error doing so. My code looks like the following:
import click
import click_repl
import os
from prompt_toolkit.history import FileHistory

class CLI:
    def __init__(self):
        pass

    @click.group(invoke_without_command=True)
    @click.pass_context
    def cli(self, ctx):
        if ctx.invoked_subcommand is None:
            ctx.invoke(self.repl)

    @cli.command()
    def foo(self):
        print("foo")

    @cli.command()
    def repl(self):
        prompt_kwargs = {
            'history': FileHistory(os.path.expanduser('~/.repl_history'))
        }
        click_repl.repl(click.get_current_context(), prompt_kwargs)

    def main(self):
        while True:
            try:
                self.cli(obj={})
            except SystemExit:
                pass

if __name__ == "__main__":
    foo = CLI()
    foo.main()
Without all the selfs and the class CLI: wrapper, the CLI works as expected, but as a class it runs into an error: TypeError: cli() missing 1 required positional argument: 'ctx'. I don't understand why this happens. As far as I know, calling self.cli() should pass self automatically, so obj={} should be passed as ctx.obj; it shouldn't make any difference to cli whether it's wrapped in a class or not.
Can someone explain to me why this happens and, more importantly, how I can fix it?
In case it's relevant here is the complete error stack trace:
Traceback (most recent call last):
  File "C:/Users/user/.PyCharmCE2018.2/config/scratches/exec.py", line 37, in <module>
    foo.main()
  File "C:/Users/user/.PyCharmCE2018.2/config/scratches/exec.py", line 30, in main
    self.cli(obj={})
  File "C:\Users\user\AppData\Local\Programs\Python\Python37\lib\site-packages\click\core.py", line 764, in __call__
    return self.main(*args, **kwargs)
  File "C:\Users\user\AppData\Local\Programs\Python\Python37\lib\site-packages\click\core.py", line 717, in main
    rv = self.invoke(ctx)
  File "C:\Users\user\AppData\Local\Programs\Python\Python37\lib\site-packages\click\core.py", line 1114, in invoke
    return Command.invoke(self, ctx)
  File "C:\Users\user\AppData\Local\Programs\Python\Python37\lib\site-packages\click\core.py", line 956, in invoke
    return ctx.invoke(self.callback, **ctx.params)
  File "C:\Users\user\AppData\Local\Programs\Python\Python37\lib\site-packages\click\core.py", line 555, in invoke
    return callback(*args, **kwargs)
  File "C:\Users\user\AppData\Local\Programs\Python\Python37\lib\site-packages\click\decorators.py", line 17, in new_func
    return f(get_current_context(), *args, **kwargs)
TypeError: cli() missing 1 required positional argument: 'ctx'
EDIT: The problem seems to be the pass_context call. Usually pass_context provides the current context as the first parameter to the function, so that obj={} is passed to the context instance. But since I wrapped the click group in a class, the first spot is taken by the self reference, so the current context can't be passed to the function. Any ideas of how to work around that?
I tried changing def cli() in the following way:
@click.group(invoke_without_command=True)
def cli(self):
    ctx = click.get_current_context()
    ctx.obj = {}
    if ctx.invoked_subcommand is None:
        ctx.invoke(self.repl)
This way I don't pass the context in the call, avoiding the conflict with self, but running it with self.cli() raises TypeError: cli() missing 1 required positional argument: 'self'.
Calling it with self.cli(self) instead runs into TypeError: 'CLI' object is not iterable.
I am afraid the click library is not designed to work as a class. Click makes use of decorators, and decorators shouldn't be taken too lightly: they literally take your function as an argument and return a different function.
For example:
@cli.command()
def foo(self):

is something along the lines of:

foo = cli.command()(foo)
So I am afraid that click does not support decorating functions bound to classes; it can only decorate unbound functions. Basically, the solution to your problem is: don't use a class.
You might be wondering how to organize your code now. Most languages present the class as a unit of organization.
Python, however, goes one step further and gives you modules as well. Basically, a file is a module, and everything you put in that file is automatically associated with it as a module.
So just name a file cli.py and create your attributes as global variables. This might give you other problems, since you cannot rebind global variables from a function scope without a global declaration, but you can use a class to contain your variables instead.
class Variables:
    pass

variables = Variables()
variables.something = "Something"

def f():
    variables.something = "Nothing"
I am trying to wrap the pyspark Pipeline.__init__ constructor and monkey patch in the newly wrapped constructor. However, I am running into an error that seems to have something to do with the way Pipeline.__init__ uses decorators.
Here is the code that actually does the monkey patch:
def monkeyPatchPipeline():
    oldInit = Pipeline.__init__

    def newInit(self, **keywordArgs):
        oldInit(self, stages=keywordArgs["stages"])

    Pipeline.__init__ = newInit
However, when I run a simple program:
import PythonSparkCombinatorLibrary
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import HashingTF, Tokenizer
PythonSparkCombinatorLibrary.TransformWrapper.monkeyPatchPipeline()
tokenizer = Tokenizer(inputCol="text", outputCol="words")
hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(),outputCol="features")
lr = LogisticRegression(maxIter=10, regParam=0.001)
pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])
I get this error:
Traceback (most recent call last):
  File "C:\<my path>\PythonApplication1\main.py", line 26, in <module>
    pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])
  File "C:\<my path>\PythonApplication1\PythonSparkCombinatorLibrary.py", line 36, in newInit
    oldInit(self, stages=keywordArgs["stages"])
  File "C:\<pyspark_path>\pyspark\__init__.py", line 98, in wrapper
    return func(*args, **kwargs)
  File "C:\<pyspark_path>\pyspark\ml\pipeline.py", line 63, in __init__
    kwargs = self.__init__._input_kwargs
AttributeError: 'function' object has no attribute '_input_kwargs'
Looking into the pyspark interface, I see that Pipeline.__init__ looks like this:
@keyword_only
def __init__(self, stages=None):
    """
    __init__(self, stages=None)
    """
    if stages is None:
        stages = []
    super(Pipeline, self).__init__()
    kwargs = self.__init__._input_kwargs
    self.setParams(**kwargs)
And noting the @keyword_only decorator, I inspected that code as well:
def keyword_only(func):
    """
    A decorator that forces keyword arguments in the wrapped method
    and saves actual input keyword arguments in `_input_kwargs`.
    """
    @wraps(func)
    def wrapper(*args, **kwargs):
        if len(args) > 1:
            raise TypeError("Method %s forces keyword arguments." % func.__name__)
        wrapper._input_kwargs = kwargs
        return func(*args, **kwargs)
    return wrapper
I'm totally confused both about how this code works in the first place and about why it causes problems for my own wrapper. I see that wrapper adds an _input_kwargs field to itself, but how is Pipeline.__init__ able to read that field with self.__init__._input_kwargs? And why doesn't the same thing happen when I wrap Pipeline.__init__ again?
Decorators 101: a decorator is a higher-order function which takes a function as its first (and typically only) argument and returns a function. The @ annotation is just syntactic sugar for a simple function call, so the following
@decorator
def decorated(x):
    ...
can be rewritten for example as:
def decorated_(x):
    ...

decorated = decorator(decorated_)
So Pipeline.__init__ is actually a functools.wraps-decorated wrapper which captures the defined __init__ (the func argument of keyword_only) as part of its closure. When called, it stores the kwargs it receives as a function attribute on itself. Basically, what happens here can be simplified to:
>>> def f(**kwargs):
...     f._input_kwargs = kwargs  # f is in the current scope
...
>>> hasattr(f, "_input_kwargs")
False
>>> f(foo=1, bar="x")
>>> hasattr(f, "_input_kwargs")
True
When you further wrap (decorate) __init__, the external function won't have _input_kwargs attached, hence the error. If you want to make it work, you have to apply the same process as the original __init__ to your own version, for example with the same decorator:
@keyword_only
def newInit(self, **keywordArgs):
    oldInit(self, stages=keywordArgs["stages"])
but as I mentioned in the comments, you should rather consider subclassing.
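For reference, a rough sketch of that subclassing route: decorating the subclass __init__ with the same keyword_only decorator makes self.__init__._input_kwargs resolve to the subclass wrapper's attribute inside the parent's __init__. The class name and the print are made up for illustration, and the details vary across pyspark versions:

from pyspark import keyword_only
from pyspark.ml import Pipeline

class LoggingPipeline(Pipeline):
    @keyword_only
    def __init__(self, stages=None):
        # Custom behavior goes here instead of monkey patching.
        print("building pipeline with stages: %s" % stages)
        super(LoggingPipeline, self).__init__(stages=stages)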
I'm using ctypes to work with a library written in C. This C library allows me to register a callback function, which I'm implementing in Python.
Here is the callback function type, according to the ctypes API:
_command_callback = CFUNCTYPE(
    UNCHECKED(c_int),
    POINTER(vedis_context),
    c_int,
    POINTER(POINTER(vedis_value)))
Here is a decorator I've written to mark a function as a callback:
def wrap_callback(fn):
    return _command_callback(fn)
To use this, I am able to simply write:
@wrap_callback
def my_callback(*args):
    print args
    return 1  # Needed by C library to indicate OK response.

c_library_func.register_callback(my_callback)
I can now invoke my callback (my_callback) from C and this works perfectly well.
The problem I'm encountering is that there is some boilerplate behavior I would like to perform as part of these callbacks (such as returning a success flag). To minimize the boilerplate, I tried to write a decorator:
def wrap_callback(fn):
    def inner(*args, **kwargs):
        return fn(*args, **kwargs)
    return _command_callback(inner)
Note that this is functionally equivalent to the previous example.
@wrap_callback
def my_callback(*args):
    print args
    return 1
When I attempt to invoke the callback using this approach, however, I receive the following exception, originating from _ctypes/callbacks.c:
Traceback (most recent call last):
  File "_ctypes/callbacks.c", line 314, in 'calling callback function'
  File "/home/charles/tmp/scrap/z1/src/vedis/vedis/core.py", line 28, in inner
    return fn(*args, **kwargs)
SystemError: Objects/cellobject.c:24: bad argument to internal function
I am not sure what is going on here that would cause the first example to work but the second example to fail. Can anyone shed some light on this? Bonus points if you can help me find a way to decorate these callbacks so I can reduce boilerplate code!
Thanks to eryksyn, I was able to fix this issue. The fix looks like:
def wrap_callback(fn):
    def inner(*args, **kwargs):
        return fn(*args, **kwargs)
    return _command_callback(inner), inner

def my_callback(*args):
    print args
    return 1

ctypes_cb, my_callback = wrap_callback(my_callback)
I am sorry that I can't reproduce the error with a simpler example, and my code is too complicated to post. If I run the program in an IPython shell instead of the regular Python, things work out well.
I looked up some previous notes on this problem. They were all caused by using pool to call a function defined within a class function. But this is not the case for me.
Exception in thread Thread-3:
Traceback (most recent call last):
  File "/usr/lib64/python2.7/threading.py", line 552, in __bootstrap_inner
    self.run()
  File "/usr/lib64/python2.7/threading.py", line 505, in run
    self.__target(*self.__args, **self.__kwargs)
  File "/usr/lib64/python2.7/multiprocessing/pool.py", line 313, in _handle_tasks
    put(task)
PicklingError: Can't pickle <type 'function'>: attribute lookup __builtin__.function failed
I would appreciate any help.
Update: The function I pickle is defined at the top level of the module, though it calls a function that contains a nested function. That is, f() calls g(), which calls h(), which has a nested function i(), and I am calling pool.apply_async(f). f(), g(), and h() are all defined at the top level. I tried a simpler example with this pattern, though, and it works.
Here is a list of what can be pickled. In particular, functions are only picklable if they are defined at the top-level of a module.
This piece of code:
import multiprocessing as mp

class Foo():
    @staticmethod
    def work(self):
        pass

if __name__ == '__main__':
    pool = mp.Pool()
    foo = Foo()
    pool.apply_async(foo.work)
    pool.close()
    pool.join()
yields an error almost identical to the one you posted:
Exception in thread Thread-2:
Traceback (most recent call last):
  File "/usr/lib/python2.7/threading.py", line 552, in __bootstrap_inner
    self.run()
  File "/usr/lib/python2.7/threading.py", line 505, in run
    self.__target(*self.__args, **self.__kwargs)
  File "/usr/lib/python2.7/multiprocessing/pool.py", line 315, in _handle_tasks
    put(task)
PicklingError: Can't pickle <type 'function'>: attribute lookup __builtin__.function failed
The problem is that the pool methods all use an mp.SimpleQueue to pass tasks to the worker processes. Everything that goes through the mp.SimpleQueue must be picklable, and foo.work is not picklable since it is not defined at the top level of the module.
It can be fixed by defining a function at the top level, which calls foo.work():
def work(foo):
    foo.work()

pool.apply_async(work, args=(foo,))
Notice that foo is picklable, since Foo is defined at the top level and foo.__dict__ is picklable.
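If you are unsure up front what will survive the queue, a small helper sketch (not part of the fix above; it uses only the standard pickle module, which multiprocessing builds on) lets you test candidates:

import pickle

def is_picklable(obj):
    # A plain pickle round trip is a good predictor of whether
    # Pool will accept an object.
    try:
        pickle.dumps(obj)
        return True
    except (pickle.PicklingError, AttributeError, TypeError):
        return False

With the Foo above (on the Python 2 this question targets), is_picklable(foo) is True while is_picklable(foo.work) is False.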
I'd use pathos.multiprocessing instead of multiprocessing. pathos.multiprocessing is a fork of multiprocessing that uses dill. dill can serialize almost anything in Python, so you are able to send a lot more around in parallel. The pathos fork also has the ability to work directly with multiple-argument functions, as you need for class methods.
>>> from pathos.multiprocessing import ProcessingPool as Pool
>>> p = Pool(4)
>>> class Test(object):
...     def plus(self, x, y):
...         return x + y
...
>>> t = Test()
>>> x, y = [0, 1, 2, 3], [4, 5, 6, 7]
>>> p.map(t.plus, x, y)
[4, 6, 8, 10]
>>>
>>> class Foo(object):
...     @staticmethod
...     def work(self, x):
...         return x + 1
...
>>> f = Foo()
>>> p.apipe(f.work, f, 100)
<processing.pool.ApplyResult object at 0x10504f8d0>
>>> res = _
>>> res.get()
101
Get pathos (and if you like, dill) here:
https://github.com/uqfoundation
When this problem comes up with multiprocessing, a simple solution is to switch from Pool to ThreadPool. This can be done with no change of code other than the import:
from multiprocessing.pool import ThreadPool as Pool
This works because ThreadPool shares memory with the main thread rather than creating a new process, which means that pickling is not required.
The downside to this method is that Python isn't the greatest language at handling threads: it uses something called the Global Interpreter Lock to stay thread safe, which can slow down some use cases here. However, if you're primarily interacting with other systems (running HTTP commands, talking with a database, writing to filesystems), then your code is likely not bound by CPU and won't take much of a hit. In fact, I've found when writing HTTP/HTTPS benchmarks that the threaded model used here has less overhead and fewer delays, since the overhead of creating new processes is much higher than that of creating new threads, and the program was otherwise just waiting for HTTP responses.
So if you're processing a ton of stuff in Python userspace, this might not be the best method.
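Here is a minimal sketch of the switch (the Fetcher class and URLs are made up; the point is that an instance method Pool would refuse to pickle can be handed to ThreadPool unchanged):

from multiprocessing.pool import ThreadPool as Pool

class Fetcher:
    def fetch(self, url):
        # Stand-in for I/O-bound work (HTTP, database, filesystem).
        return len(url)

if __name__ == '__main__':
    fetcher = Fetcher()
    with Pool(4) as pool:
        # Threads share the interpreter's memory, so nothing is pickled.
        print(pool.map(fetcher.fetch, ['https://a.example', 'https://b.example']))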
As others have said, multiprocessing can only transfer Python objects to worker processes that can be pickled. If you cannot reorganize your code as described by unutbu, you can use dill's extended pickling/unpickling capabilities for transferring data (especially code) as I show below.
This solution requires only the installation of dill and no other libraries such as pathos:
import os
from multiprocessing import Pool
import dill

def run_dill_encoded(payload):
    fun, args = dill.loads(payload)
    return fun(*args)

def apply_async(pool, fun, args):
    payload = dill.dumps((fun, args))
    return pool.apply_async(run_dill_encoded, (payload,))

if __name__ == "__main__":
    pool = Pool(processes=5)

    # async execution of a lambda
    jobs = []
    for i in range(10):
        job = apply_async(pool, lambda a, b: (a, b, a * b), (i, i + 1))
        jobs.append(job)

    for job in jobs:
        print job.get()
    print

    # async execution of a static method
    class O(object):
        @staticmethod
        def calc():
            return os.getpid()

    jobs = []
    for i in range(10):
        job = apply_async(pool, O.calc, ())
        jobs.append(job)

    for job in jobs:
        print job.get()
I have found that I can also generate exactly that error output on a perfectly working piece of code by attempting to use the profiler on it.
Note that this was on Windows (where the forking is a bit less elegant).
I was running:
python -m profile -o output.pstats <script>
And I found that removing the profiling removed the error and that restoring the profiling brought it back. It was driving me batty too, because I knew the code used to work. I was checking to see if something had updated pool.py... then I had a sinking feeling, eliminated the profiling, and that was it.
Posting here for the archives in case anybody else runs into it.
Can't pickle <type 'function'>: attribute lookup __builtin__.function failed
This error will also come up if you have any built-in function inside the model object that is passed to the async job.
So make sure to check that the model objects being passed don't have built-in functions. (In our case, we were using the FieldTracker() function of django-model-utils inside the model to track a certain field.) Here is the link to the relevant GitHub issue.
This solution requires only the installation of dill and no other libraries such as pathos:
import dill

def apply_packed_function_for_map((dumped_function, item, args, kwargs),):
    """
    Unpack dumped function as target function and call it with arguments.

    :param (dumped_function, item, args, kwargs):
        a tuple of dumped function and its arguments
    :return:
        result of target function
    """
    target_function = dill.loads(dumped_function)
    res = target_function(item, *args, **kwargs)
    return res

def pack_function_for_map(target_function, items, *args, **kwargs):
    """
    Pack function and arguments to an object that can be sent from one
    multiprocessing.Process to another. The main problem is:
        «multiprocessing.Pool.map*» or «apply*»
        cannot use class methods or closures.
    It solves this problem with «dill».
    It works with the target function as an argument, dumps it («with dill»)
    and returns the dumped function with the arguments of the target function.
    For more performance we dump only the target function itself
    and don't dump its arguments.
    How to use (pseudo-code):

        ~>>> import multiprocessing
        ~>>> images = [...]
        ~>>> pool = multiprocessing.Pool(100500)
        ~>>> features = pool.map(
        ~...     *pack_function_for_map(
        ~...         super(Extractor, self).extract_features,
        ~...         images,
        ~...         type='png',
        ~...         **options,
        ~...     )
        ~... )
        ~>>>

    :param target_function:
        function that you want to execute like target_function(item, *args, **kwargs).
    :param items:
        list of items for map
    :param args:
        positional arguments for target_function(item, *args, **kwargs)
    :param kwargs:
        named arguments for target_function(item, *args, **kwargs)
    :return: tuple(function_wrapper, dumped_items)
        It returns a tuple with
            * a function wrapper that unpacks and calls the target function;
            * a list of the packed target function and its arguments.
    """
    dumped_function = dill.dumps(target_function)
    dumped_items = [(dumped_function, item, args, kwargs) for item in items]
    return apply_packed_function_for_map, dumped_items
It also works for numpy arrays.
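For example, here is a minimal usage sketch (the add function and its arguments are made up; like the helper itself, it assumes Python 2):

import multiprocessing

def add(item, offset):
    return item + offset

if __name__ == '__main__':
    pool = multiprocessing.Pool(2)
    results = pool.map(*pack_function_for_map(add, [1, 2, 3], 10))
    print(results)  # [11, 12, 13]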
A quick fix is to make the function global
from multiprocessing import Pool

class Test:
    def __init__(self, x):
        self.x = x

    @staticmethod
    def test(x):
        return x**2

    def test_apply(self, list_):
        global r
        def r(x):
            return Test.test(x + self.x)
        with Pool() as p:
            l = p.map(r, list_)
        return l

if __name__ == '__main__':
    o = Test(2)
    print(o.test_apply(range(10)))
Building on @rocksportrocker's solution, it would make sense to use dill both when sending and when receiving the results:
import dill
import itertools

def run_dill_encoded(payload):
    fun, args = dill.loads(payload)
    res = fun(*args)
    res = dill.dumps(res)
    return res

def dill_map_async(pool, fun, args_list,
                   as_tuple=True,
                   **kw):
    if as_tuple:
        args_list = ((x,) for x in args_list)

    it = itertools.izip(
        itertools.cycle([fun]),
        args_list)
    it = itertools.imap(dill.dumps, it)
    return pool.map_async(run_dill_encoded, it, **kw)

if __name__ == '__main__':
    import multiprocessing as mp
    import sys, os
    p = mp.Pool(4)
    res = dill_map_async(p, lambda x: [sys.stdout.write('%s\n' % os.getpid()), x][-1],
                         [lambda x: x + 1] * 10,)
    res = res.get(timeout=100)
    res = map(dill.loads, res)
    print(res)
As @penky Suresh suggested in this answer, don't use built-in keywords.
Apparently args is a built-in keyword when dealing with multiprocessing:
from concurrent.futures import ProcessPoolExecutor, as_completed

class TTS:
    def __init__(self):
        pass

    def process_and_render_items(self):
        multiprocessing_args = [{"a": "b", "c": "d"}, {"e": "f", "g": "h"}]

        with ProcessPoolExecutor(max_workers=10) as executor:
            # Using args here is fine.
            future_processes = {
                executor.submit(TTS.process_and_render_item, args)
                for args in multiprocessing_args
            }

            for future in as_completed(future_processes):
                try:
                    data = future.result()
                except Exception as exc:
                    print(f"Generated an exception: {exc}")
                else:
                    print(f"Generated data for comment process: {future}")

    # Don't use 'args' here. It seems to be a built-in keyword.
    # Changing 'args' to 'arg' worked for me.
    def process_and_render_item(arg):
        print(arg)
        # This will print {"a": "b", "c": "d"} for the first process
        # and {"e": "f", "g": "h"} for the second process.

PS: The tabs/spaces may be a bit off.