Can luigi rerun tasks when the task dependencies become out of date?

As far as I know, a luigi.Target can either exist, or not.
Therefore, if a luigi.Target exists, it wouldn't be recomputed.
I'm looking for a way to force recomputation of the task, if one of its dependencies is modified, or if the code of one of the tasks changes.

One way you could accomplish your goal is by overriding the complete(...) method.
The documentation for complete is straightforward.
Simply implement a function that checks your constraint, and returns False if you want to recompute the task.
For example, to force recomputation when a dependency has been updated, you could do:
def complete(self):
    """Flag this task as incomplete if any requirement is incomplete or has been updated more recently than this task"""
    import os
    import time

    def mtime(path):
        return time.ctime(os.path.getmtime(path))

    # assuming 1 output
    if not os.path.exists(self.output().path):
        return False

    self_mtime = mtime(self.output().path)

    # the below assumes a list of requirements, each with a list of outputs. YMMV
    for el in self.requires():
        if not el.complete():
            return False
        for output in el.output():
            if mtime(output.path) > self_mtime:
                return False

    return True
This returns False when the current task's output does not exist, when any requirement is incomplete, or when any requirement's output has been modified more recently than this task's output.
Detecting when code has changed is harder. You could use a similar scheme (checking mtime), but it'd be hit-or-miss unless every task has its own file.
Because of the ability to override complete, any logic you want for recomputation can be implemented. If you want a particular complete method for many tasks, I'd recommend sub-classing luigi.Task, implementing your custom complete there, and then inheriting your tasks from the sub-class.
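For instance, a minimal sketch of that sub-classing pattern (the class names here are illustrative, not from Luigi; luigi.task.flatten is used to normalize whatever requires()/output() return):

import os
import luigi

class MTimeTask(luigi.Task):
    # hypothetical base class: subclasses are re-run whenever any
    # requirement's output is newer than their own output
    def complete(self):
        out = self.output().path
        if not os.path.exists(out):
            return False
        self_mtime = os.path.getmtime(out)
        for req in luigi.task.flatten(self.requires()):
            if not req.complete():
                return False
            for target in luigi.task.flatten(req.output()):
                if os.path.getmtime(target.path) > self_mtime:
                    return False
        return True

class MyTask(MTimeTask):
    def output(self):
        return luigi.LocalTarget('out.txt')

    def run(self):
        with self.output().open('w') as f:
            f.write('done')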

I'm late to the game, but here's a mixin that improves the accepted answer to support multiple input / output files.
import os
import time

class MTimeMixin:
    """
    Mixin that flags a task as incomplete if any requirement
    is incomplete or has been updated more recently than this task.

    This is based on http://stackoverflow.com/a/29304506, but extends
    it to support multiple input / output dependencies.
    """
    def complete(self):
        def to_list(obj):
            if isinstance(obj, (list, tuple)):
                return obj
            else:
                return [obj]

        def mtime(path):
            return time.ctime(os.path.getmtime(path))

        if not all(os.path.exists(out.path) for out in to_list(self.output())):
            return False

        self_mtime = min(mtime(out.path) for out in to_list(self.output()))

        # the below assumes a list of requirements, each with a list of outputs. YMMV
        for el in to_list(self.requires()):
            if not el.complete():
                return False
            for output in to_list(el.output()):
                if mtime(output.path) > self_mtime:
                    return False

        return True
To use it, you would just declare your class using, for example, class MyTask(MTimeMixin, luigi.Task).

The above code works well for me, except that for proper timestamp comparison mtime(path) must return a float instead of a string (as strings, "Sat " > "Mon "). Thus simply,
def mtime(path):
    return os.path.getmtime(path)
instead of:
def mtime(path):
    return time.ctime(os.path.getmtime(path))
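To see why: time.ctime returns strings like 'Thu Jan  1 00:00:00 1970', which compare lexicographically by weekday abbreviation rather than chronologically. A quick demonstration (the exact strings depend on your timezone):

import time

earlier = time.ctime(0)        # e.g. 'Thu Jan  1 00:00:00 1970'
later = time.ctime(2 * 86400)  # e.g. 'Sat Jan  3 00:00:00 1970'
print(later > earlier)         # False, because 'Sat' < 'Thu' as strings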

Regarding the Mixin suggestion from Shilad Sen posted above, consider this example:
# Filename: run_luigi.py
import luigi
from MTimeMixin import MTimeMixin


class PrintNumbers(luigi.Task):
    def requires(self):
        return []

    def output(self):
        return luigi.LocalTarget("numbers_up_to_10.txt")

    def run(self):
        with self.output().open('w') as f:
            for i in range(1, 11):
                f.write("{}\n".format(i))


class SquaredNumbers(MTimeMixin, luigi.Task):
    def requires(self):
        return [PrintNumbers()]

    def output(self):
        return luigi.LocalTarget("squares.txt")

    def run(self):
        with self.input()[0].open() as fin, self.output().open('w') as fout:
            for line in fin:
                n = int(line.strip())
                out = n * n
                fout.write("{}:{}\n".format(n, out))


if __name__ == '__main__':
    luigi.run()
where MTimeMixin is as in the post above. I run the task once using
luigi --module run_luigi SquaredNumbers
Then I touch the file numbers_up_to_10.txt and run the task again. Luigi then gives the following complaint:
File "c:\winpython-64bit-3.4.4.6qt5\python-3.4.4.amd64\lib\site-packages\luigi-2.7.1-py3.4.egg\luigi\local_target.py", line 40, in move_to_final_destination
os.rename(self.tmp_path, self.path)
FileExistsError: [WinError 183] Cannot create a file when that file already exists: 'squares.txt-luigi-tmp-5391104487' -> 'squares.txt'
This may just be a Windows problem; it is not an issue on Linux, where "mv a b" simply overwrites the old b if it already exists and is not write-protected. We can fix this with the following patch to luigi/local_target.py:
def move_to_final_destination(self):
    if os.path.exists(self.path):
        os.rename(self.path, self.path + time.strftime("_%Y%m%d%H%M%S.txt"))
    os.rename(self.tmp_path, self.path)
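As an aside (an alternative, not from the original post): since Python 3.3 the standard library's os.replace overwrites the destination atomically on both POSIX and Windows, so the patch could avoid the timestamped backup entirely:

def move_to_final_destination(self):
    # os.replace overwrites an existing destination, even on Windows
    os.replace(self.tmp_path, self.path)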
Also for completeness here is the Mixin again as a separate file, from the other post:
import os

class MTimeMixin:
    """
    Mixin that flags a task as incomplete if any requirement
    is incomplete or has been updated more recently than this task.

    This is based on http://stackoverflow.com/a/29304506, but extends
    it to support multiple input / output dependencies.
    """
    def complete(self):
        def to_list(obj):
            if isinstance(obj, (list, tuple)):
                return obj
            else:
                return [obj]

        def mtime(path):
            return os.path.getmtime(path)

        if not all(os.path.exists(out.path) for out in to_list(self.output())):
            return False

        self_mtime = min(mtime(out.path) for out in to_list(self.output()))

        # the below assumes a list of requirements, each with a list of outputs. YMMV
        for el in to_list(self.requires()):
            if not el.complete():
                return False
            for output in to_list(el.output()):
                if mtime(output.path) > self_mtime:
                    return False

        return True


understanding of some Luigi issues

Look at class ATask
class ATask(luigi.Task):
    config = luigi.Parameter()

    def requires(self):
        # Some Tasks maybe
        pass

    def output(self):
        return luigi.LocalTarget("A.txt")

    def run(self):
        with open("A.txt", "w") as f:
            f.write("Complete")
Now look at class BTask
class BTask(luigi.Task):
    config = luigi.Parameter()

    def requires(self):
        return ATask(config=self.config)

    def output(self):
        return luigi.LocalTarget("B.txt")

    def run(self):
        with open("B.txt", "w") as f:
            f.write("Complete")
The first question: is there a chance that, while ATask is running and writing "A.txt", BTask will start before ATask has finished writing?
The second question: if I start execution like
luigi.build([BTask(config=some_config)], local_scheduler=True)
and the pipeline fails somewhere inside, can I somehow find out about this from the outside, e.g. via the return value of luigi.build or something else?
No, luigi won't start executing BTask until ATask has finished (i.e., until it has finished writing the target file).
If you want a detailed response from luigi.build in case of error, you must pass an extra keyword argument detailed_summary=True to the build/run methods and then access summary_text, this way:
luigi_run_result = luigi.build(..., detailed_summary=True)
print(luigi_run_result.summary_text)
For details on that, please read Response of luigi.build()/luigi.run() in Luigi documentation.
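A sketch of the whole check, assuming a reasonably recent Luigi version (which also exposes a status code on the result object):

import luigi
from luigi.execution_summary import LuigiStatusCode

result = luigi.build([BTask(config=some_config)],
                     local_scheduler=True,
                     detailed_summary=True)
print(result.summary_text)
if result.status != LuigiStatusCode.SUCCESS:
    # at least one task failed or was left pending
    raise SystemExit(1)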
Also, you may be interested in this answer about how to access the error / exception: https://stackoverflow.com/a/33396642/3219121

luigi - how to create a dependency not between files, but between tasks? (or how to not involve the output method)

Given two luigi tasks, how can I add one as a requirement for the other, in a way that once the required task is done, the second task can start, with no output involved?
Currently I get RuntimeError: Unfulfilled dependency at run time: MyTask___home_... even though the task completed OK, because my requires / output methods are not configured right:
class ShellTask(ExternalProgramTask):
    """
    ExternalProgramTask's subclass dedicated for one task with the capture output ability.

    Args:
        shell_cmd (str): The shell command to be run in a subprocess.
        capture_output (bool, optional): If True the output is not displayed to console,
            and printed after the task is done via
            logger.info (both stdout + stderr).
            Defaults to True.
    """
    shell_cmd = luigi.Parameter()
    requirement = luigi.Parameter(default='')
    succeeded = False

    def on_success(self):
        self.succeeded = True

    def requires(self):
        return eval(self.requirement) if self.requirement else None

    def program_args(self):
        """
        Must be implemented in an ExternalProgramTask subclass.

        Returns:
            A script that would be run in a subprocess.Popen.

        Args:
            shell_cmd (luigi.Parameter (str)): the shell command to be passed as args
                to the run method (run should not be overridden!).
        """
        return self.shell_cmd.split()


class MyTask(ShellTask):
    """
    Example:
        if __name__ == '__main__':
            clean_output_files(['_.txt'])
            task = MyTask(
                shell_cmd='...',
                requirement="MyTask(shell_cmd='...', output_file='_.txt')",
            )
    """
    pass


if __name__ == '__main__':
    task_0 = MyTask(
        shell_cmd='...',
        requirement="MyTask(shell_cmd='...')",
    )
    luigi.build([task_0], workers=2, local_scheduler=False)
I hoped using on_success could signal something to the calling task, but I didn't figure out how.
I'm currently working around this in the following way:
1) implementing the output method based on the input of the task (much like the eval(requirement) above),
2) implementing the run method (calling the super run and then writing "ok" to the output),
3) deleting the output files from main,
4) calling it something like this:
if __name__ == '__main__':
    clean_output_files(['_.txt'])
    task = MyTask(
        shell_cmd='...',
        requirement="MyTask(shell_cmd='...', output_file='_.txt')",
    )
So within your first luigi task, you could invoke your second task by making it a requirement.
For example:
class TaskB(luigi.Task):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.complete_flag = False

    def run(self):
        self.complete_flag = True
        print('do something')

    def complete(self):
        return self.complete_flag


class TaskA(luigi.Task):
    def requires(self):
        return TaskB()

    def run(self):
        print('Carry on with other logic')
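For completeness, a minimal way to run the pair (note that the in-memory complete_flag only lives within a single process, so this pattern suits a local scheduler):

import luigi

if __name__ == '__main__':
    # TaskB runs first as the requirement, then TaskA
    luigi.build([TaskA()], local_scheduler=True)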

How to check output dynamically with Luigi

I realize I likely need to use dynamic requirements to accomplish the following task, however I have not been able to wrap my head around what this would look like in practice.
The goal is to use Luigi to generate data and add it to a database, without knowing ahead of time what data will be generated.
Take the following example using mongodb:
import luigi
from uuid import uuid4
from luigi.contrib import mongodb
import pymongo


# Make up IDs, though in practice the IDs may be generated from an API
class MakeID(luigi.Task):
    def run(self):
        with self.output().open('w') as f:
            f.write(','.join([str(uuid4()) for e in range(10)]))

    # Write the data to file
    def output(self):
        return luigi.LocalTarget('data.csv')


class ToDataBase(luigi.Task):
    def requires(self):
        return MakeID()

    def run(self):
        with self.input().open('r') as f:
            ids = f.read().split(',')
        # Add some fake data to simulate generating new data
        count_data = {key: value for value, key in enumerate(ids)}
        # Add data to the database
        self.output().write(count_data)

    def output(self):
        # Attempt to read non-existent file to get the IDs to check if task is complete
        with self.input().open('r') as f:
            valid_ids = f.read().split(',')
        client = pymongo.MongoClient('localhost',
                                     27017,
                                     ssl=False)
        return mongodb.MongoRangeTarget(client,
                                        'myDB',
                                        'myData',
                                        valid_ids,
                                        'myField')


if __name__ == '__main__':
    luigi.run()
The goal is to obtain data, modify it and then add it to a database.
The above code fails when run because the output method of ToDataBase runs before the requires method, so while the function has access to the input, the input does not yet exist. Regardless, I still need to check that the data was actually added to the database.
This github issue is close to what I am looking for, though as I mentioned I have not been able to figure out dynamic requirements for this use case in practice.
The solution is to create a third task (Dynamic in the example below) that yields the task that is waiting on dynamic input, and to make that dependency a parameter rather than a requires method.
class ToDatabase(luigi.Task):
    fp = luigi.Parameter()

    def output(self):
        with open(self.fp, 'r') as f:
            valid_ids = [str(e) for e in f.read().split(',')]
        client = pymongo.MongoClient('localhost', 27017, ssl=False)
        return mongodb.MongoRangeTarget(client, 'myDB', 'myData',
                                        valid_ids, 'myField')

    def run(self):
        with open(self.fp, 'r') as f:
            valid_ids = [str(e) for e in f.read().split(',')]
        self.output().write({k: 5 for k in valid_ids})


class Dynamic(luigi.Task):
    def output(self):
        return self.input()

    def requires(self):
        return MakeID()

    def run(self):
        yield ToDatabase(fp=self.input().path)
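Running it would then look something like this (a sketch: Dynamic's run yields ToDatabase as a dynamic dependency only once MakeID's output exists, so the Mongo target is never constructed before the ID file is readable):

if __name__ == '__main__':
    luigi.build([Dynamic()], local_scheduler=True)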

manually open context manager

My question is, how can I execute any context manager without using with?
Python has the idea of context managers,
instead of
file = open('some_file', 'w')
try:
    file.write('Hola!')
finally:
    file.close()
# end try
you can write
with open('some_file', 'w') as opened_file:
    opened_file.write('Hola!')
# end with
While in most cases the second one is the better solution, for the specific cases of unit testing and exploring in the interactive console the first one can be more convenient, as you can write it line by line.
>>> file = open('some_file', 'w')
>>> file.write('Hola!')
>>> file.close()
My question is, how can I execute any context manager like this, best suited for exploring?
My actual use case follows below, but please try to give an answer which is generic and will work for other context managers too.
import flask
app = flask.Flask(__name__)
with app.test_request_context('/?name=Peter'):
    assert flask.request.path == '/'
    assert flask.request.args['name'] == 'Peter'
(from the Flask docs)
You can still use the with syntax in the interactive console; however, a context manager is based on two magic methods, __enter__ and __exit__, so you can just call them yourself:
class MyCtx(object):
    def __init__(self, f):
        self.f = f

    def __enter__(self):
        print("Enter")
        return self.f

    def __exit__(self, *args, **kwargs):
        print("Exit")


def foo():
    print("Hello")
usually you do:
with MyCtx(foo) as f:
    f()
Same as:
ctx = MyCtx(foo)
f = ctx.__enter__()
f()
ctx.__exit__()
Remember that a context manager's __exit__ method is used for managing errors within the context, so most of them have the signature __exit__(exception_type, exception_value, traceback). If you don't need to handle exceptions in your tests, just pass it None values:
__exit__(None, None, None)
You can assign app.test_request_context('/?name=Peter') to a variable (e.g. ctx), then call ctx.__enter__() on it to enter the context manager, and ctx.__exit__(None, None, None) to perform the cleanup. Note that you lose the safety guarantees of context managers unless you put the ctx.__exit__ call in a finally clause.
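For interactive exploration there is also a middle ground in the standard library: contextlib.ExitStack lets you enter any context manager line by line and still get proper cleanup. A sketch with a plain file (the same approach works for app.test_request_context):

from contextlib import ExitStack

stack = ExitStack()
# enter_context() calls __enter__ for you and registers the cleanup
f = stack.enter_context(open('some_file', 'w'))
f.write('Hola!')
# one call runs all the registered __exit__ methods
stack.close()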

Using context managers without "with" block

Below is an example of my my_create method, and an example of that method in use.
from contextlib import contextmanager

@contextmanager
def my_create(**attributes):
    obj = MyObject(**attributes)
    yield obj
    obj.save()
with my_create(a=10) as new_obj:
    new_obj.b = 7

new_obj.a           # => 10
new_obj.b           # => 7
new_obj.is_saved()  # => True
To users of Ruby/Rails, this may look familiar. It's similar to the ActiveRecord::create method, with the code inside the with block acting as, well, a block.
However:
with my_create(a=10) as new_obj:
    pass

new_obj.a           # => 10
new_obj.is_saved()  # => True
In the above example, I've passed an empty "block" to my my_create function. Things work as expected (new_obj was initialized and saved), but the formatting looks a little wonky, and the with block seems unnecessary.
I would prefer to be able to call my_create directly, without having to setup a passing with block. Unfortunately, that's not possible with my current implementation of my_create.
my_obj = my_create(a=10)
my_obj # => <contextlib.GeneratorContextManager at 0x107c21050>
I'd have to call both __enter__ and __exit__ on the GeneratorContextManager to get my desired result.
The question:
Is there a way to write my my_create function so that it can be called with a "block" as an optional "parameter"? I don't want to pass an optional function to my_create. I want my_create to optionally yield execution to a block of code.
The solution doesn't have to involve with or contextmanager. For instance, the same results as above can be achieved with a generator and a for loop, although the syntax becomes even more unclear.
At this point I'm afraid that a readable-enough-to-be-sensibly-usable solution doesn't exist, but I'm still interested to see what everyone comes up with.
Some clarification:
Another example would be:
@contextmanager
def header_file(path):
    touch(path)
    f = open(path, 'w')
    f.write('This is the header')
    yield f
    f.close()
with header_file('some/path') as f:
    f.write('some more stuff')

another_f = header_file('some/other/path')
I always want to do the __enter__ and __exit__ parts of the context manager. I don't always want to supply a block. I don't want to have to set up a passing with block if I don't have to.
This is possible and easy in Ruby. It would be cool if it were possible in Python too, since we're already so close (we just have to set up a passing with block). I understand that the language mechanics make it difficult (technically impossible?), but a close-enough solution is interesting to me.
Add a new method on MyObject which creates and saves.
class MyObject:
    @classmethod
    def create(cls, **attributes):
        obj = cls(**attributes)
        obj.save()
        return obj
This is an alternate initializer, a factory, and the design pattern has precedent in the Python standard library and in many popular frameworks. Django models use this pattern, where an alternate initializer Model.objects.create(**args) can offer additional features that the usual Model(**args) would not (e.g. persisting to the database).
Is there a way to write my my_create function so that it can be called with a "block" as an optional "parameter"?
No.
I'd suggest using different functions to get a context manager that saves an object on __exit__ and to get an automatically saved object. There's no easy way to have one function do both things. (There are no "blocks" that you can pass around, other than functions, which you say you don't want.)
For instance, you could create a second function that just creates and immediately saves an object without running any extra code to run in between:
def create_and_save(**args):
    obj = MyObject(**args)
    obj.save()
    return obj
So you could make it work with two functions. But a more Pythonic approach would probably be to get rid of the context manager function and make the MyObject class serve as its own context manager. You can give it very simple __enter__ and __exit__ methods:
def __enter__(self):
    return self

def __exit__(self, exception_type, exception_value, traceback):
    if exception_type is None:
        self.save()
Your first example would become:
with MyObject(a=10) as new_obj:
    new_obj.b = 7
You could also turn the create_and_save function I showed above into a classmethod:
@classmethod
def create_and_save(cls, **args):
    obj = cls(**args)
    obj.save()
    return obj
Your second example would then be:
new_obj = MyObject.create_and_save(a=10)
Both of those methods could be written in a base class and simply inherited by other classes, so you don't need to rewrite them every time.
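A minimal sketch of such a base class (the names are illustrative; save() is assumed to be provided by the subclass):

class Saveable:
    # use either as a context manager or via create_and_save()
    def __enter__(self):
        return self

    def __exit__(self, exception_type, exception_value, traceback):
        # only persist if the block finished without raising
        if exception_type is None:
            self.save()

    @classmethod
    def create_and_save(cls, **attributes):
        obj = cls(**attributes)
        obj.save()
        return obj

class MyObject(Saveable):
    def __init__(self, **attributes):
        self.__dict__.update(attributes)

    def save(self):
        print('saving', self.__dict__)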
Ok, there seems to be some confusion so I've been forced to come up with an example solution. Here's the best I've been able to come up with so far.
class my_create(object):
    def __new__(cls, **attributes):
        with cls.block(**attributes) as obj:
            pass
        return obj

    @classmethod
    @contextmanager
    def block(cls, **attributes):
        obj = MyClass(**attributes)
        yield obj
        obj.save()
If we design my_create like above, we can use it normally without a block:
new_obj = my_create(a=10)
new_obj.a # => 10
new_obj.is_saved() # => True
And we can call it slightly differently with a block.
with my_create.block(a=10) as new_obj:
    new_obj.b = 7

new_obj.a      # => 10
new_obj.b      # => 7
new_obj.saved  # => True
Calling my_create.block is kind of similar to calling a Celery task's Task.s, and users who don't want to call my_create with a block just call it normally, so I'll allow it.
However, this implementation of my_create looks wonky, so we can create a wrapper to make it more like the contextmanager(my_create) implementation in the question.
import types

# The abstract base class for a block-accepting "function"
class BlockAcceptor(object):
    def __new__(cls, *args, **kwargs):
        with cls.block(*args, **kwargs) as yielded_value:
            pass
        return yielded_value

    @classmethod
    @contextmanager
    def block(cls, *args, **kwargs):
        raise NotImplementedError

# The wrapper
def block_acceptor(f):
    block_accepting_f = type(f.__name__, (BlockAcceptor,), {})
    f.__name__ = 'block'
    block_accepting_f.block = types.MethodType(contextmanager(f), block_accepting_f)
    return block_accepting_f
Then my_create becomes:
@block_acceptor
def my_create(cls, **attributes):
    obj = MyClass(**attributes)
    yield obj
    obj.save()
In use:
# creating with a block
with my_create.block(a=10) as new_obj:
    new_obj.b = 7

new_obj.a      # => 10
new_obj.b      # => 7
new_obj.saved  # => True

# creating without a block
new_obj = my_create(a=10)
new_obj.a      # => 10
new_obj.saved  # => True
Ideally the my_create function wouldn't need to accept a cls, and the block_acceptor wrapper would handle that, but I haven't got time to make those changes just now.
pythonic? no. useful? possibly?
I'm still interested to see what others come up with.
With a slight change, you can get really close to what you want, just not via an implementation using contextlib.contextmanager:
creator = build_creator_obj()
# "with" contextmanager interface
with creator as obj:
    obj.attr = 'value'

# "call" interface
obj = creator(attr='value')
Where creator is an object that implements __enter__ and __exit__ for the first usage and implements __call__ for the second usage.
You can also hide the construction of creator inside a property on some persistent object, e.g.:
class MyDatabase():
    @property
    def create(self):
        return build_creator_obj()

db = MyDatabase()

# so that you can do either/both:
with db.create as obj:
    obj.attr = 'value'

obj = db.create(attr='value')
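Note that build_creator_obj is left undefined in the answer above. A minimal sketch of what such a dual-interface object might look like (the Creator class and the MyObject / save() names are assumptions for illustration):

class Creator:
    # supports both `with creator as obj:` and `creator(attr=...)`
    def __enter__(self):
        self._obj = MyObject()
        return self._obj

    def __exit__(self, exception_type, exception_value, traceback):
        # only save if the block finished without raising
        if exception_type is None:
            self._obj.save()

    def __call__(self, **attributes):
        obj = MyObject(**attributes)
        obj.save()
        return obj

def build_creator_obj():
    return Creator()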
