We get run_id in Airflow; how do we get the timestamp (ts)?
First:
In your task, set provide_context=True:
bye_operator = PythonOperator(
    task_id='bye_task',
    python_callable=print_goodbye,
    provide_context=True,
    dag=dag
)
Second:
Ensure your callback function accepts the keyword arguments that are passed in:
def print_goodbye(**kwargs):
    ts = kwargs.get('ts', None)
    print(ts)
    return 'Good bye world!'
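As a side note, on Airflow 2.x provide_context is deprecated and the context is injected into the callable automatically, so a minimal sketch of the same task (same names, just without the flag) would be:
from airflow.operators.python import PythonOperator

def print_goodbye(**context):
    # 'ts' is the logical/execution date in ISO 8601 format,
    # 'run_id' is the run id you already have
    print(context['ts'], context['run_id'])
    return 'Good bye world!'

bye_operator = PythonOperator(
    task_id='bye_task',
    python_callable=print_goodbye,
    dag=dag,
)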
Related
The code below works, but my requirement is to pass totalbuckets as an input to the function rather than as a global variable. I am having trouble passing it as a variable and doing the xcom_pull in the next task. This DAG basically creates buckets based on the number of inputs, and totalbuckets is a constant. I appreciate your help in advance.
from collections import defaultdict

from airflow import DAG
from airflow.operators.python import PythonOperator, BranchPythonOperator
from airflow.utils.trigger_rule import TriggerRule

# inputs_to_process, SF_CONN_ID and args are defined elsewhere
with DAG('test-live', catchup=False, schedule_interval=None, default_args=args) as test_live:
    totalbuckets = 3

    # branches based on number of buckets
    def branch_buckets(**context):
        buckets = defaultdict(list)
        for i in range(len(inputs_to_process)):
            buckets[f'bucket_{(1 + i % totalbuckets)}'].append(inputs_to_process[i])
        for bucket_name, input_sublist in buckets.items():
            context['ti'].xcom_push(key=bucket_name, value=input_sublist)
        return list(buckets.keys())

    # BranchPythonOperator will launch the buckets and distribute inputs among the buckets
    branch_buckets = BranchPythonOperator(
        task_id='branch_buckets',
        python_callable=branch_buckets,
        trigger_rule=TriggerRule.NONE_FAILED,
        provide_context=True,
        dag=test_live
    )

    # update provider tables with merge sql
    def update_inputs(sf_conn_id, bucket_name, **context):
        input_sublist = context['ti'].xcom_pull(task_ids='branch_buckets', key=bucket_name)
        print(f"Processing inputs {input_sublist} in {bucket_name}")
        from custom.hooks.snowflake_hook import SnowflakeHook
        for p in input_sublist:
            merge_sql = f"""
                merge into ......"""

    bucket_tasks = []
    for i in range(totalbuckets):
        task = PythonOperator(
            task_id=f'bucket_{i + 1}',
            python_callable=update_inputs,
            provide_context=True,
            op_kwargs={'bucket_name': f'bucket_{i + 1}', 'sf_conn_id': SF_CONN_ID},
            dag=test_live
        )
        bucket_tasks.append(task)
If totalbuckets is different from one run to another, it should be a run conf variable; you can provide it for each run created from the UI, CLI, Airflow REST API or even the Python API.
from collections import defaultdict

from airflow import DAG
from airflow.operators.python import PythonOperator, BranchPythonOperator
from airflow.models.param import Param
from airflow.utils.trigger_rule import TriggerRule

with DAG(
    'test-live',
    catchup=False,
    schedule_interval=None,
    default_args=args,
    params={"totalbuckets": Param(default=3, type="integer")},
) as test_live:
    # branches based on number of buckets
    def branch_buckets(**context):
        # read the value from the run conf / params at run time
        totalbuckets = int(context['params']['totalbuckets'])
        buckets = defaultdict(list)
        for i in range(len(inputs_to_process)):
            buckets[f'bucket_{(1 + i % totalbuckets)}'].append(inputs_to_process[i])
        for bucket_name, input_sublist in buckets.items():
            context['ti'].xcom_push(key=bucket_name, value=input_sublist)
        return list(buckets.keys())
    # BranchPythonOperator will launch the buckets and distribute inputs among the buckets
    branch_buckets = BranchPythonOperator(
        task_id='branch_buckets',
        python_callable=branch_buckets,
        trigger_rule=TriggerRule.NONE_FAILED,
        provide_context=True,
        dag=test_live
    )

    # update provider tables with merge sql
    def update_inputs(sf_conn_id, bucket_name, **context):
        input_sublist = context['ti'].xcom_pull(task_ids='branch_buckets', key=bucket_name)
        print(f"Processing inputs {input_sublist} in {bucket_name}")
        from custom.hooks.snowflake_hook import SnowflakeHook
        for p in input_sublist:
            merge_sql = f"""
                merge into ......"""

    bucket_tasks = []
    # NOTE: the run conf / params value is not available when the DAG file is
    # parsed, so the number of bucket tasks created here has to be a parse-time
    # constant (e.g. the Param default); at run time only the branches returned
    # by branch_buckets are actually executed.
    for i in range(3):
        task = PythonOperator(
            task_id=f'bucket_{i + 1}',
            python_callable=update_inputs,
            provide_context=True,
            op_kwargs={'bucket_name': f'bucket_{i + 1}', 'sf_conn_id': SF_CONN_ID},
            dag=test_live
        )
        bucket_tasks.append(task)
Example to run it:
airflow dags trigger --conf '{"totalbuckets": 10}' test-live
Or via the UI.
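The same conf can also be sent through the stable REST API. A minimal sketch using requests, assuming the API is enabled at localhost:8080 with basic auth and placeholder credentials:
import requests

# Trigger a run of 'test-live' with a custom totalbuckets value via the
# stable REST API (POST /api/v1/dags/{dag_id}/dagRuns).
resp = requests.post(
    "http://localhost:8080/api/v1/dags/test-live/dagRuns",
    auth=("airflow", "airflow"),  # placeholder credentials
    json={"conf": {"totalbuckets": 10}},
)
resp.raise_for_status()
print(resp.json())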
Update:
And if it's static but differs from one environment to another, it can be an Airflow Variable, read directly in the tasks using Jinja to avoid reading it at every DAG file processing.
But if it's completely static, the most recommended solution is using a Python variable as you do, because reading the dag run conf or an Airflow Variable sends a query to the database.
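For the Airflow-Variable case, a minimal sketch (assuming a Variable named totalbuckets has been created): since op_kwargs is a templated field, the Jinja expression is only resolved when the task runs, not at every DAG file parse:
update_bucket_1 = PythonOperator(
    task_id='bucket_1',
    python_callable=update_inputs,
    provide_context=True,
    op_kwargs={
        'bucket_name': 'bucket_1',
        'sf_conn_id': SF_CONN_ID,
        # rendered at run time; note the rendered value arrives as a string
        'totalbuckets': '{{ var.value.totalbuckets }}',
    },
    dag=test_live,
)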
@hussein awala I am doing something like the below but cannot pass totalbuckets into bucket_tasks:
from collections import defaultdict

from airflow import DAG
from airflow.operators.python import PythonOperator, BranchPythonOperator
from airflow.utils.trigger_rule import TriggerRule

with DAG('test-live', catchup=False, schedule_interval=None, default_args=args) as test_live:
    #totalbuckets = 3

    def branch_buckets(totalbuckets, **context):
        buckets = defaultdict(list)
        for i in range(len(inputs_to_process)):
            buckets[f'bucket_{(1 + i % totalbuckets)}'].append(inputs_to_process[i])
        for bucket_name, input_sublist in buckets.items():
            context['ti'].xcom_push(key=bucket_name, value=input_sublist)
        return list(buckets.keys())

    # BranchPythonOperator will launch the buckets and distribute inputs among the buckets
    branch_buckets = BranchPythonOperator(
        task_id='branch_buckets',
        python_callable=branch_buckets,
        trigger_rule=TriggerRule.NONE_FAILED,
        provide_context=True,
        op_kwargs={'totalbuckets': 3},
        dag=test_live
    )

    # update provider tables with merge sql
    def update_inputs(sf_conn_id, bucket_name, **context):
        input_sublist = context['ti'].xcom_pull(task_ids='branch_buckets', key=bucket_name)
        print(f"Processing inputs {input_sublist} in {bucket_name}")
        from custom.hooks.snowflake_hook import SnowflakeHook
        for p in input_sublist:
            merge_sql = f"""
                merge into ......"""

    bucket_tasks = []
    for i in range(totalbuckets):  # fails: totalbuckets is not defined at this point
        task = PythonOperator(
            task_id=f'bucket_{i + 1}',
            python_callable=update_inputs,
            provide_context=True,
            op_kwargs={'bucket_name': f'bucket_{i + 1}', 'sf_conn_id': SF_CONN_ID},
            dag=test_live
        )
        bucket_tasks.append(task)
Broken DAG: [/opt/airflow/dags/my_dag.py] Traceback (most recent call last):
File "/home/airflow/.local/lib/python3.6/site-packages/airflow/models/baseoperator.py", line 179, in apply_defaults
result = func(self, *args, **kwargs)
File "/home/airflow/.local/lib/python3.6/site-packages/airflow/operators/python.py", line 136, in init
raise AirflowException('python_callable param must be callable')
airflow.exceptions.AirflowException: python_callable param must be callable
import airflow
from airflow import DAG
from airflow.operators.python import BranchPythonOperator, PythonOperator
from datetime import datetime
from random import randint
from airflow.operators.bash import BashOperator

def _training_model():
    return randint(1, 11)

def _choose_best_model(ti):
    accuracies = ti.xcom_pull(task_ids=[
        'training_model_A',
        'training_model_B',
        'training_model_C'
    ])
    best_accuracy = max(accuracies)
    if best_accuracy > 8:
        return 'accurate'
    return 'inaccurate'

with DAG(
        dag_id="mobile_app_usage", start_date=datetime(2021, 1, 1),
        schedule_interval="@daily", catchup=False) as dag:
    training_model_A = PythonOperator(
        task_id="training_model_A",
        python_callable=_training_model()
    )
    training_model_B = PythonOperator(
        task_id="training_model_B",
        python_callable=_training_model()
    )
    training_model_C = PythonOperator(
        task_id="training_model_B",
        python_callable=_training_model()
    )
    choose_best_model = BranchPythonOperator(
        task_id="choose_best_model",
        python_callable=_choose_best_model()
    )
    accurate = BashOperator(
        task_id="accurate",
        bash_command="echo Accurate"
    )
    inaccurate = BashOperator(
        task_id="inaccurate",
        bash_command="echo Inacurate"
    )
[Screenshot of my Airflow webserver showing the Broken DAG error]
The python_callable parameter only needs a reference to the callable to be executed, not an actual call to it. Like this:
...
training_model_A = PythonOperator(
    task_id="training_model_A",
    python_callable=_training_model
)
training_model_B = PythonOperator(
    task_id="training_model_B",
    python_callable=_training_model
)
training_model_C = PythonOperator(
    task_id="training_model_C",
    python_callable=_training_model
)
choose_best_model = BranchPythonOperator(
    task_id="choose_best_model",
    python_callable=_choose_best_model
)
...
FYI - In the snippet above I also updated the task_id argument for the task assigned as "training_model_C" since it had the same task_id as "training_model_B". In Airflow the task_id values need to be unique within a DAG.
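If the callable needs arguments, keep the bare reference and pass the arguments through op_kwargs rather than calling the function. A small sketch (the model_name parameter is made up for illustration):
def _training_model(model_name, **kwargs):
    # model_name comes from op_kwargs; any remaining context lands in kwargs
    print(f"training model {model_name}")
    return randint(1, 11)

training_model_A = PythonOperator(
    task_id="training_model_A",
    python_callable=_training_model,   # still a reference, not a call
    op_kwargs={"model_name": "A"},
)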
I created a DAG which contains a SubDAG to loop through a list that is the return value of a task.
subdag function
def mySubDag(parent: Text, child: Text, args, **context):
    task = context['task_instance']
    data = task.xcom_pull(task_ids='task1', dag_id=parent)
    for d in data:
        # do something...
        pass
parent dag
with DAG(...) as dag:
    task1 = PythonOperator(task_id="task1", ..., provide_context=True, dag=dag)
    task2 = SubDagOperator(subdag=mySubDag(...), ..., provide_context=True, dag=dag)
    task1 >> task2
I don't know where to put the 'context' argument, or how to pass it so the subdag function can use it.
I would really appreciate it if anyone could help resolve this.
The code defining xcom_pull in taskinstance.py:
def xcom_pull(
        self,
        task_ids=None,
        dag_id=None,
        key=XCOM_RETURN_KEY,
        include_prior_dates=False):
    if dag_id is None:
        dag_id = self.dag_id
    ...
It passes the dag_id of the current DAG to xcom_pull, so if you want to get data from the parent DAG, override the dag_id argument with the parent's dag_id.
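Inside the subdag's callable that boils down to something like the following minimal sketch (parent_dag_id here stands for whatever id the parent DAG was created with):
def my_subdag_callable(parent_dag_id, **context):
    # by default xcom_pull looks in the current (sub)dag, so point it at the parent
    data = context['task_instance'].xcom_pull(
        task_ids='task1',
        dag_id=parent_dag_id,
    )
    return data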
For your example, pass more context with op_kwargs:
def set_cookies_func(config, **kwargs):
    cookies = service_get_cookies_login(config)
    kwargs['ti'].xcom_push(key="SESSION", value=cookies)

def get_data_func(parent_dag_name, **kwargs):
    cookies = kwargs['ti'].xcom_pull(
        task_ids='set_cookies_task',
        key="SESSION",
        dag_id=parent_dag_name  # the parent dag's id, here "my_dag_id"
    )

def sub_cache_load_to_gcs(parent_dag_name, child_dag_name):
    sub_dag = DAG(
        dag_id="{}.{}".format(parent_dag_name, child_dag_name),
        ...
    )
    PythonOperator(
        task_id="get_data_func",
        python_callable=get_data_func,
        op_kwargs={"parent_dag_name": parent_dag_name},
        provide_context=True,
        dag=sub_dag
    )
    return sub_dag
with DAG(
    dag_id="my_dag_id",
    ...
) as dag:
    task1 = PythonOperator(
        task_id="set_cookies_task",
        python_callable=set_cookies_func,
        op_kwargs={"config": config},
        provide_context=True,
        dag=dag
    )
    task2 = SubDagOperator(
        task_id='branch_cache_task',
        subdag=sub_cache_load_to_gcs(dag.dag_id, 'branch_cache_task'),
        provide_context=True,
        dag=dag
    )
    task1 >> task2
I just started using Airflow; can anyone enlighten me on how to pass a parameter into PythonOperator, like below:
t5_send_notification = PythonOperator(
    task_id='t5_send_notification',
    provide_context=True,
    python_callable=SendEmail,
    op_kwargs=None,
    #op_kwargs=(key1='value1', key2='value2'),
    dag=dag,
)
def SendEmail(**kwargs):
    msg = MIMEText("The pipeline for client1 is completed, please check.")
    msg['Subject'] = "xxxx"
    msg['From'] = "xxxx"
    ......
    s = smtplib.SMTP('localhost')
    s.send_message(msg)
    s.quit()
I would like to be able to pass some parameters into t5_send_notification's callable, which is SendEmail. Ideally I want to attach the full log and/or part of the log (which is essentially from the kwargs) to the email to be sent out; I'm guessing t5_send_notification is the place to gather that information.
Thank you very much.
Pass a dict object to op_kwargs, then use the keys to access their values from the kwargs dict in your Python callable:
def SendEmail(**kwargs):
    print(kwargs['key1'])
    print(kwargs['key2'])
    msg = MIMEText("The pipeline for client1 is completed, please check.")
    msg['Subject'] = "xxxx"
    msg['From'] = "xxxx"
    ......
    s = smtplib.SMTP('localhost')
    s.send_message(msg)
    s.quit()

t5_send_notification = PythonOperator(
    task_id='t5_send_notification',
    provide_context=True,
    python_callable=SendEmail,
    op_kwargs={'key1': 'value1', 'key2': 'value2'},
    dag=dag,
)
PythonOperator has a named parameter op_kwargs that accepts a dict object. You can have:
t5_send_notification = PythonOperator(
    task_id='t5_send_notification',
    provide_context=True,
    python_callable=SendEmail,
    op_kwargs={"my_param": 'value1'},
    dag=dag,
)

def SendEmail(my_param, **kwargs):
    print(my_param)  # 'value1'
    msg = MIMEText("The pipeline for client1 is completed, please check.")
    msg['Subject'] = "xxxx"
    msg['From'] = "xxxx"
    ......
    s = smtplib.SMTP('localhost')
    s.send_message(msg)
    s.quit()
I have 3 tasks to run in the same DAG. Task1 returns a list of dictionaries, while task2 and task3 try to use one dictionary element from the result returned by task1.
def get_list():
    ....
    return listOfDict

def parse_1(example_dict):
    ...

def parse_2(example_dict):
    ...

dag = DAG('dagexample', default_args=default_args)
data_list = PythonOperator(
    task_id='get_lists',
    python_callable=get_list,
    dag=dag)

for data in data_list:
    sub_task1 = PythonOperator(
        task_id='data_parse1' + data['id'],
        python_callable=parse_1,
        op_kwargs={'dataObject': data},
        dag=dag,
    )
    sub_task2 = PythonOperator(
        task_id='data_parse2' + data['id'],
        python_callable=parse_2,
        op_kwargs={'dataObject': data},
        dag=dag,
    )
You should use XCom for passing variables/messages between different tasks. Take a look at this example: https://github.com/apache/incubator-airflow/blob/master/airflow/example_dags/example_xcom.py
For your case, it should be something similar to the below:
default_args = {
    'owner': 'airflow',
    'start_date': airflow.utils.dates.days_ago(2),
    'provide_context': True,  # This is needed
}

def get_list():
    ....
    return listOfDict

def parse_1(**kwargs):
    ti = kwargs['ti']
    # get listOfDict
    v1 = ti.xcom_pull(key=None, task_ids='get_lists')
    # You can now use this v1 dictionary as a normal python dict
    ...

def parse_2(**kwargs):
    ti = kwargs['ti']
    # get listOfDict
    v1 = ti.xcom_pull(key=None, task_ids='get_lists')
    ...

dag = DAG('dagexample', default_args=default_args)
data_list = PythonOperator(
    task_id='get_lists',
    python_callable=get_list,
    dag=dag)

for data in get_list():
    sub_task1 = PythonOperator(
        task_id='data_parse1' + data['id'],
        python_callable=parse_1,
        op_kwargs={'dataObject': data},
        dag=dag,
    )
    sub_task2 = PythonOperator(
        task_id='data_parse2' + data['id'],
        python_callable=parse_2,
        op_kwargs={'dataObject': data},
        dag=dag,
    )
You can use XComs, as they are designed for inter-task communication. If your dictionary is very big, then I recommend storing it as a CSV file.
Generally, tasks in Airflow don't share data between them, so XComs are a way to achieve this, but they are limited to small amounts of data.
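For example, instead of pushing the big list itself, the first task can write it to a file and only push the path through XCom. A sketch, assuming both tasks run where /tmp is the same filesystem (true for local/sequential executors, not for distributed ones):
import csv

def get_list(**kwargs):
    list_of_dict = [{'id': '1', 'value': 'a'}, {'id': '2', 'value': 'b'}]
    path = '/tmp/get_lists_output.csv'
    with open(path, 'w', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=['id', 'value'])
        writer.writeheader()
        writer.writerows(list_of_dict)
    return path  # only the small path string ends up in XCom

def parse_1(**kwargs):
    path = kwargs['ti'].xcom_pull(task_ids='get_lists')
    with open(path) as f:
        for row in csv.DictReader(f):
            print(row['id'], row['value'])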