Airflow - Bash command failed - python

I am trying to execute the following Airflow DAG file, but I am getting the following error.
import json
from airflow.models import DAG
from airflow.operators.dummy_operator import DummyOperator
from airflow.operators.bash_operator import BashOperator
from airflow.utils.dates import days_ago
from datetime import datetime, timedelta, time

args = {
    'owner': 'test',
    'start_date': days_ago(2),
    'depends_on_past': False,
}

dag = DAG(
    dag_id='test',
    default_args=args,
    schedule_interval='0 5 * * *',
    tags=['test']
)

start_test = DummyOperator(
    task_id='start_test',
    dag=dag,
)

end_test = DummyOperator(
    task_id='end_test',
    dag=dag,
)

load_complete = DummyOperator(
    task_id='load_complete',
    dag=dag,
)

execution_date = datetime.now()

def check_monthstart_trigger(execution_date, **kwargs):
    return execution_date.day == 1

for i in json.loads(open('/home/test_123/list_of_files.json', 'r').read())['tables'].keys():
    extract_phase = BashOperator(
        task_id='extract_' + str(i),
        bash_command='python3 /home/python/extract_code.py -t {}'.format(i),
        dag=dag,
    )
    create_phase = BashOperator(
        task_id='modify_' + str(i),
        bash_command='python3 /home/python/table_create.py -t {}'.format(i),
        dag=dag,
    )
    load_phase = BashOperator(
        task_id='load_' + str(i),
        bash_command='python3 /home/python/load_test.py -t {}'.format(i),
        dag=dag,
    )
    start_test >> extract_phase >> create_phase >> load_phase >> load_complete >> end_test
This is the error I am getting:
I have also tried this solution (Error calling BashOperator: Bash command failed), but it didn't work. Any idea how to resolve this?
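One way to narrow this down (a suggestion, not part of the original post) is to run the exact command the BashOperator executes outside of Airflow and inspect its exit code, since the operator raises "Bash command failed" whenever the command exits non-zero. A minimal sketch, reusing the script path from the question and a placeholder table name:

# Minimal reproduction sketch: run the same command the BashOperator would run
# and surface its exit code and stderr. "tbl1" is a placeholder table name; the
# script path is taken from the question and may differ on your system.
import subprocess

result = subprocess.run(
    ["python3", "/home/python/extract_code.py", "-t", "tbl1"],
    capture_output=True,
    text=True,
)
print("exit code:", result.returncode)  # any non-zero exit code makes the BashOperator fail
print("stdout:", result.stdout)
print("stderr:", result.stderr)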

Related

airflow dynamic task returns list instead of dictionary

Please refer to my code below.
While the recon_rule_setup task is running, each time it gets a dictionary (recon_conf) as input from the previous read_recon_config task.
However, while recon_rule_exec is running, it gets a list (recon_rule) as input from the previous task.
My expectation was that recon_rule_setup should run 2 times and recon_rule_exec should run 4 times, depending on the return values.
Why is expand behaving differently each time?
from datetime import datetime
from airflow.models import DAG, XCom
from airflow.utils.dates import days_ago
from airflow.decorators import task
from airflow.utils.db import provide_session
from airflow.operators.bash import BashOperator
from airflow.operators.python import PythonOperator
from airflow.utils.task_group import TaskGroup

@provide_session
def clear_xcom_data(session=None, **kwargs):
    dag_instance = kwargs["dag"]
    dag_id = dag_instance._dag_id
    session.query(XCom).filter(XCom.dag_id == dag_id).delete()

@task(task_id="read_recon_config")
def read_recon_config(dag_run=None):
    parent_dict = dag_run.conf
    d1 = {"name": "Santanu"}
    d2 = {"name": "Ghosh"}
    return [d1, d2]

@task(task_id="recon_rule_setup")
def recon_rule_setup(recon_conf):
    print(f"type of recon_conf_dict: {type(recon_conf)}")
    print(f"recon_conf_dict: {recon_conf}")
    return [recon_conf, {"name": "Kolkata"}]

@task(task_id="recon_rule_exec")
def recon_rule_exec(recon_rule, master_key):
    print(f"master_key type: {type(master_key)}")
    print(f"master_key: {master_key}")
    print(f"recon_rule type: {type(recon_rule)}")
    print(f"recon_rule: {recon_rule}")

default_args = {
    'owner': 'Airflow',
    'start_date': days_ago(1),
    'depends_on_past': False,
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1
}

dag_name_id = "dynamic_demo"
cur_datetime = datetime.utcnow().strftime("%Y%m%d%H%M%S%f")[:-3]
master_dag_key = f"{dag_name_id}_{cur_datetime}"

with DAG(
    dag_id=dag_name_id,
    default_args=default_args,
    schedule_interval=None,
    catchup=False
) as dag:
    start = BashOperator(task_id="start", bash_command='echo "starting reconciliation"', do_xcom_push=False)
    stop = BashOperator(task_id="stop", bash_command='echo "stopping reconciliation"', do_xcom_push=False)
    delete_xcom = PythonOperator(
        task_id="delete_xcom",
        python_callable=clear_xcom_data
    )

    with TaskGroup(group_id="reconciliation_process") as tg1:
        recon_config_list = read_recon_config()
        recon_rule_list = recon_rule_setup.expand(recon_conf=recon_config_list)
        recon_rule_exec.partial(master_key=master_dag_key).expand(recon_rule=recon_rule_list)

    start >> tg1 >> delete_xcom >> stop
Regards,
Santanu
Since you are returning the dictionaries as a list, that is why the dictionary comes back wrapped inside a list. For your requirement, you can try the code below, which returns a dictionary instead.
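For reference, here is a minimal standalone sketch (not the poster's config, and assuming Airflow 2.3+ with the TaskFlow API) of how expand() fans out: one mapped task instance is created per element of the list it receives, while partial() pins the arguments that stay constant. The corrected code for this question follows under "Code:" below.

# Standalone fan-out sketch; dag_id and task names are hypothetical.
from airflow.decorators import dag, task
from airflow.utils.dates import days_ago

@dag(dag_id="expand_demo", schedule_interval=None, start_date=days_ago(1), catchup=False)
def expand_demo():

    @task
    def make_items():
        # two elements -> two mapped instances of process()
        return [{"name": "a"}, {"name": "b"}]

    @task
    def process(item, run_key):
        # each mapped instance receives exactly one element of the list
        print(run_key, item)

    process.partial(run_key="demo").expand(item=make_items())

demo = expand_demo()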
Code:
from datetime import datetime
from airflow.models import DAG, XCom
from airflow.utils.dates import days_ago
from airflow.decorators import task
from airflow.utils.db import provide_session
from airflow.operators.bash import BashOperator
from airflow.operators.python import PythonOperator
from airflow.utils.task_group import TaskGroup

@provide_session
def clear_xcom_data(session=None, **kwargs):
    dag_instance = kwargs["dag"]
    dag_id = dag_instance._dag_id
    session.query(XCom).filter(XCom.dag_id == dag_id).delete()

@task(task_id="read_recon_config")
def read_recon_config(dag_run=None):
    parent_dict = dag_run.conf
    d1 = {1: {"name": "S"}, 2: {"name": "G"}}
    return d1

@task(task_id="recon_rule_setup")
def recon_rule_setup(recon_conf):
    print(f"type of recon_conf_dict: {type(recon_conf)}")
    print(f"recon_conf_dict: {recon_conf}")
    return [recon_conf, {"name": "Kolkata"}]

@task(task_id="recon_rule_exec")
def recon_rule_exec(recon_rule, master_key):
    print(f"master_key type: {type(master_key)}")
    print(f"master_key: {master_key}")
    print(f"recon_rule type: {type(recon_rule)}")
    print(f"recon_rule: {recon_rule}")

default_args = {
    'owner': 'Airflow',
    'start_date': days_ago(1),
    'depends_on_past': False,
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1
}

dag_name_id = "dynamic_demo"
cur_datetime = datetime.utcnow().strftime("%Y%m%d%H%M%S%f")[:-3]
master_dag_key = f"{dag_name_id}_{cur_datetime}"

with DAG(
    dag_id=dag_name_id,
    default_args=default_args,
    schedule_interval=None,
    catchup=False
) as dag:
    start = BashOperator(task_id="start", bash_command='echo "starting reconciliation"', do_xcom_push=False)
    stop = BashOperator(task_id="stop", bash_command='echo "stopping reconciliation"', do_xcom_push=False)
    delete_xcom = PythonOperator(
        task_id="delete_xcom",
        python_callable=clear_xcom_data
    )

    with TaskGroup(group_id="reconciliation_process") as tg1:
        recon_config_list = read_recon_config()
        recon_rule_list = recon_rule_setup.expand(recon_conf=recon_config_list)
        recon_rule_exec.partial(master_key=master_dag_key).expand(recon_rule=recon_rule_list)

    start >> tg1 >> delete_xcom >> stop
Output:

airflow - creating dag and task dynamically to create the pipeline for one object

In Airflow I want to export some tables from Postgres (PG) to BigQuery (BQ).
task1: get the max id from BQ
task2: export the data from PG (id>maxid)
task3: GCS to BQ stage
task4: BQ stage to BQ main
But there is a slight challenge: the schedule interval differs per table. So I created a JSON file to define the sync interval; if it is 2mins the table uses the DAG upsert_2mins, otherwise the 10mins interval (upsert_10mins). I used the syntax below to generate it dynamically.
JSON config file:
{
  "tbl1": ["update_timestamp", "2mins", "stg"],
  "tbl2": ["update_timestamp", "2mins", "stg"]
}
Code:
import json
from airflow import DAG
from datetime import datetime, timedelta
from airflow.utils.dates import days_ago
from airflow.contrib.hooks.bigquery_hook import BigQueryHook
from airflow.contrib.operators.bigquery_operator import BigQueryOperator
from customoperator.custom_PostgresToGCSOperator import custom_PostgresToGCSOperator
from airflow.contrib.operators.gcs_to_bq import custom_PostgresToGoogleCloudStorageOperator
from airflow.operators.python_operator import PythonOperator
from airflow.operators.dummy_operator import DummyOperator

table_list = ['tbl1', 'tbl2']

# DAG details
docs = """test"""

# Add args and Dag
default_args = {
    'owner': 'DBteam',
    'depends_on_past': False,
    'start_date': days_ago(1),
    'email_on_failure': True,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=1)
}

with open('/home/airflow/gcs/dags/upsert_dag/config.json', 'r') as conf:
    config = json.loads(conf.read())

def get_max_ts(dag, tablename, **kwargs):
    # code to find the max record
    return records[0][0]

def pgexport(dag, tablename, **kwargs):
    # code to export the data from PG to GCS
    export_tables.execute(None)

def stg_bqimport(dag, tablename, **kwargs):
    # code to import GCS to BQ
    bqload.execute(None)

def prd_merge(dag, tablename, **kwargs):
    # code to merge the BQ stage table into the main BQ table
    bqmerge.execute(context=kwargs)

for table_name in table_list:
    sync_interval = config[table_name][1]
    cron_time = ''
    if sync_interval == '2mins':
        cron_time = '*/20 * * * *'
    else:
        cron_time = '*/10 * * * *'

    dag = DAG(
        'upsert_every_{}'.format(sync_interval),
        default_args=default_args,
        description='Incremental load - Every 10mins',
        schedule_interval=cron_time,
        catchup=False,
        max_active_runs=1,
        doc_md=docs
    )

    max_ts = PythonOperator(
        task_id="get_maxts_{}".format(table_name),
        python_callable=get_max_ts,
        op_kwargs={'tablename': table_name, 'dag': dag},
        provide_context=True,
        dag=dag
    )
    export_gcs = PythonOperator(
        task_id='export_gcs_{}'.format(table_name),
        python_callable=pgexport,
        op_kwargs={'tablename': table_name, 'dag': dag},
        provide_context=True,
        dag=dag
    )
    stg_load = PythonOperator(
        task_id='stg_load_{}'.format(table_name),
        python_callable=stg_bqimport,
        op_kwargs={'tablename': table_name, 'dag': dag},
        provide_context=True,
        dag=dag
    )
    merge = PythonOperator(
        task_id='merge_{}'.format(table_name),
        python_callable=prd_merge,
        op_kwargs={'tablename': table_name, 'dag': dag},
        provide_context=True,
        dag=dag
    )

    globals()[sync_interval] = dag
    max_ts >> export_gcs >> stg_load >> merge
It actually created the DAG, but the issue is that from the web UI I am only able to see the tasks for the last table. It should show the tasks for both tables.
Your code is creating 2 dags, one for each table, but overwriting the first one with the second.
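In other words, both tables map to the same sync_interval, so every loop iteration builds a DAG with the same dag_id and rebinds the same global name; only the DAG from the last iteration survives for the scheduler to pick up. A minimal sketch of the overwrite, reusing names from the question:

# Sketch of the overwrite only (no tasks): both iterations use the dag_id
# "upsert_every_2mins" and the same global key, so the first DAG object is
# discarded when the second one is assigned.
from airflow import DAG
from airflow.utils.dates import days_ago

for table_name in ['tbl1', 'tbl2']:
    dag = DAG(
        'upsert_every_2mins',
        start_date=days_ago(1),
        schedule_interval='*/20 * * * *',
    )
    # ... tasks for table_name would be attached to this dag here ...
    globals()['2mins'] = dag  # same key both times -> the previous DAG is overwritten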
My suggestion is to change the format of the JSON file to:
{
  "2mins": {
    "tbl1": ["update_timestamp", "stg"],
    "tbl2": ["update_timestamp", "stg"]
  },
  "10mins": {
    "tbl3": ["update_timestamp", "stg"],
    "tbl4": ["update_timestamp", "stg"]
  }
}
And have your code iterate over the schedules and create the needed tasks for each table (you will need two loops):
# looping on the schedules to create two dags
for schedule, tables in config.items():
    cron_time = '*/10 * * * *'
    if schedule == '2mins':
        cron_time = '*/20 * * * *'

    dag_id = 'upsert_every_{}'.format(schedule)
    dag = DAG(
        dag_id,
        default_args=default_args,
        description='Incremental load - Every 10mins',
        schedule_interval=cron_time,
        catchup=False,
        max_active_runs=1,
        doc_md=docs
    )

    # Looping over the tables to create the tasks for
    # each table in the current schedule
    for table_name, table_config in tables.items():
        max_ts = PythonOperator(
            task_id="get_maxts_{}".format(table_name),
            python_callable=get_max_ts,
            op_kwargs={'tablename': table_name, 'dag': dag},
            provide_context=True,
            dag=dag
        )
        export_gcs = PythonOperator(
            task_id='export_gcs_{}'.format(table_name),
            python_callable=pgexport,
            op_kwargs={'tablename': table_name, 'dag': dag},
            provide_context=True,
            dag=dag
        )
        stg_load = PythonOperator(
            task_id='stg_load_{}'.format(table_name),
            python_callable=stg_bqimport,
            op_kwargs={'tablename': table_name, 'dag': dag},
            provide_context=True,
            dag=dag
        )
        merge = PythonOperator(
            task_id='merge_{}'.format(table_name),
            python_callable=prd_merge,
            op_kwargs={'tablename': table_name, 'dag': dag},
            provide_context=True,
            dag=dag
        )

        # Tasks for the same table will be chained
        max_ts >> export_gcs >> stg_load >> merge

    # DAG is created among the global objects
    globals()[dag_id] = dag

Scheduling Airflow DAGs to run exclusively Monday through Friday, i.e. only weekdays

I have a DAG executing a Python script which takes a date argument (the current date). I'm scheduling the DAG to run at 6:00 AM Monday through Friday, i.e. weekdays, Eastern Standard Time. The DAG has to run the Python script on Monday with Monday's date as an argument, and likewise for Tuesday through Friday with each day's date as an argument.
I noticed that a schedule interval of '0 6 * * 1-5' didn't work, because Friday's execution didn't occur until the following Monday.
I changed the schedule interval to '0 6 * * *' to run every day at 6:00 AM and, at the start of my DAG, filter for dates that fall within '0 6 * * 1-5', so effectively Monday to Friday. For Saturday and Sunday, the downstream tasks should be skipped.
This is my code:
from __future__ import print_function
import pendulum
import logging
from airflow.models import DAG
from airflow.models import Variable
from datetime import datetime, timedelta
from airflow.contrib.operators.ssh_operator import SSHOperator
from airflow.operators.python_operator import ShortCircuitOperator
from airflow.operators.dummy_operator import DummyOperator
from airflow.utils.trigger_rule import TriggerRule
from croniter import croniter

log = logging.getLogger(__name__)

def filter_processing_date(**context):
    execution_date = context['execution_date']
    cron = croniter('0 6 * * 1-5', execution_date)
    log.info('cron is: {}'.format(cron))
    log.info('execution date is: {}'.format(execution_date))
    #prev_date = cron.get_prev(datetime)
    #log.info('prev_date is: {}'.format(prev_date))
    return execution_date == cron.get_next(datetime).get_prev(datetime)

local_tz = pendulum.timezone("America/New_York")

# DAG parameters
default_args = {
    'owner': 'Managed Services',
    'depends_on_past': False,
    'start_date': datetime(2020, 8, 3, tzinfo=local_tz),
    'dagrun_timeout': None,
    'email': Variable.get('email'),
    'email_on_failure': True,
    'email_on_retry': False,
    'provide_context': True,
    'retries': 12,
    'retry_delay': timedelta(minutes=5)
}

with DAG(
    'execute_python',
    schedule_interval='0 6 * * *',
    default_args=default_args
) as dag:
    start_dummy = DummyOperator(
        task_id='start',
        dag=dag
    )
    end_dummy = DummyOperator(
        task_id='end',
        trigger_rule=TriggerRule.NONE_FAILED,
        dag=dag
    )
    weekdays_only = ShortCircuitOperator(
        task_id='weekdays_only',
        python_callable=filter_processing_date,
        dag=dag
    )
    run_python = SSHOperator(
        ssh_conn_id="oci_connection",
        task_id='run_python',
        command='/usr/bin/python3 /home/sb/local/bin/runProcess.py -d {{ ds_nodash }}',
        dag=dag
    )

    start_dummy >> weekdays_only >> run_python >> end_dummy
Unfortunately, the weekdays_only task is failing with the error message below. What is going wrong?
Airflow error message
Airflow error message continuation
Airflow version: v1.10.9-composer
Python 3.
I managed to solve my problem by hacking something together: checking whether the next execution date is a weekday and returning True if so, False otherwise. I call the function in a ShortCircuitOperator, which proceeds with the downstream tasks if True or skips them if False.
This is my code below, but I'm open to better solutions.
from __future__ import print_function
import pendulum
import logging
from airflow.models import DAG
from airflow.models import Variable
from datetime import datetime, timedelta
from airflow.contrib.operators.ssh_operator import SSHOperator
from airflow.operators.python_operator import ShortCircuitOperator
from airflow.operators.dummy_operator import DummyOperator
from airflow.utils.trigger_rule import TriggerRule

log = logging.getLogger(__name__)

def checktheday(**context):
    next_execution_date = context['next_execution_date']
    log.info('next_execution_date is: {}'.format(next_execution_date))
    date_check = next_execution_date.weekday()
    log.info('date_check is: {}'.format(date_check))
    if date_check == 0 or date_check == 1 or date_check == 2 or date_check == 3 or date_check == 4:
        decision = True
    else:
        decision = False
    log.info('decision is: {}'.format(decision))
    return decision

local_tz = pendulum.timezone("America/New_York")

# DAG parameters
default_args = {
    'owner': 'Managed Services',
    'depends_on_past': False,
    'start_date': datetime(2020, 8, 3, tzinfo=local_tz),
    'dagrun_timeout': None,
    'email': Variable.get('email'),
    'email_on_failure': True,
    'email_on_retry': False,
    'provide_context': True,
    'retries': 12,
    'retry_delay': timedelta(minutes=5)
}

with DAG(
    'execute_python',
    schedule_interval='0 6 * * *',
    default_args=default_args
) as dag:
    start_dummy = DummyOperator(
        task_id='start',
        dag=dag
    )
    end_dummy = DummyOperator(
        task_id='end',
        trigger_rule=TriggerRule.NONE_FAILED,
        dag=dag
    )
    weekdays_only = ShortCircuitOperator(
        task_id='weekdays_only',
        python_callable=checktheday,
        dag=dag
    )
    run_python = SSHOperator(
        ssh_conn_id="oci_connection",
        task_id='run_python',
        command='/usr/bin/python3 /home/sb/local/bin/runProcess.py -d {{ macros.ds_format(macros.ds_add(ds, 1), "%Y-%m-%d", "%Y%m%d") }}',
        dag=dag
    )

    start_dummy >> weekdays_only >> run_python >> end_dummy
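As a small follow-up (a sketch, not part of the original answer): since weekday() returns 0-4 for Monday through Friday, the same check can be written as a single comparison and used with the same ShortCircuitOperator.

# Compact version of the weekday check above; behaviour is identical.
def checktheday(**context):
    next_execution_date = context['next_execution_date']
    return next_execution_date.weekday() < 5  # True Mon-Fri, False Sat/Sun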

Apache Airflow - Prescript rerun at each task of the dag and date change

I am new to using Airflow.
I noticed that if you define a global variable (a timestamp) in the code, its value changes for each task. For example, in the very basic example below, I define a variable now, but each time I print it in a task, the value changes.
from datetime import timedelta
from airflow import DAG
from airflow.operators.python_operator import PythonOperator
from airflow.utils.dates import days_ago
import time

now = int(time.time() * 1000)
RANGE = range(1, 10)

def init_step():
    print("Run on RANGE {}".format(RANGE))
    print("Date of the Scans {}".format(now))
    return RANGE

def trigger_step(index):
    time.sleep(10)
    print("index {} - date {}".format(index, now))
    return index

default_args = {
    'owner': 'airflow',
    'start_date': days_ago(1),
    'retries': 2,
    'retry_delay': timedelta(minutes=15)
}

with DAG('test',
         default_args=default_args,
         schedule_interval='0 16 */7 * *',
         ) as dag:
    init = PythonOperator(task_id='init',
                          python_callable=init_step,
                          dag=dag)
    for index in init_step():
        run = PythonOperator(task_id='trigger-port-' + str(index),
                             op_kwargs={'index': index},
                             python_callable=trigger_step, dag=dag)
        dag >> init >> run
Is this normal behavior? Is there a way to change it?
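For what it's worth (an observation, not an answer from the original thread): the DAG file is re-parsed by every scheduler and worker process, so module-level code such as now = int(time.time() * 1000) is re-executed for each task, which is why the printed value differs. One common workaround is to derive the value from the run's own execution date inside the callable, so every task of the same run sees the same number. A minimal sketch, assuming an Airflow 1.10-style PythonOperator as in the question and a hypothetical dag_id:

# Sketch: compute the timestamp from the DAG run's execution date instead of
# at import time, so it is stable across all tasks of one run.
from airflow import DAG
from airflow.operators.python_operator import PythonOperator
from airflow.utils.dates import days_ago

def trigger_step(index, **context):
    # Derived from the run's execution date: identical for every task in the run,
    # no matter how many times the file is re-parsed.
    run_ts = int(context['execution_date'].timestamp() * 1000)
    print("index {} - date {}".format(index, run_ts))
    return index

with DAG('timestamp_demo',  # hypothetical dag_id
         default_args={'owner': 'airflow', 'start_date': days_ago(1)},
         schedule_interval=None) as dag:
    run = PythonOperator(task_id='trigger-port-1',
                         op_kwargs={'index': 1},
                         python_callable=trigger_step,
                         provide_context=True,  # required on Airflow 1.x for **context
                         dag=dag)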

Usage of variable in airflow DAG

I set the variable with the "airflow variables" command in the CLI.
I want to use this variable in a DAG.
I executed the following commands in the terminal.
The error keeps occurring:
Broken DAG: [/root/airflow/dags/param_test.py] invalid syntax (param_test.py, line 13)
airflow variables -s sh_path = "/tmp/echo_test.sh"
airflow scheduler
Here is the code:
from airflow import DAG
from airflow.models import Variable
from airflow.operators.bash_operator import BashOperator
tmpl_search_path = Variable.get ("sh_path")
dag = DAG ('param_test', schedule_interval = '* / 5 * * * *'
           start_date = datetime (2018,9,4), catchup = False)
bash_task = BashOperator (
      task_id = "bash_task"
      bash_command = 'sh '+ {{var.value.tmpl_search_path}},
      dag = dag)
bash_task.set_downstream (python_task)
bash_task1 = BashOperator (
      task_id = 'echo',
      bash_command = 'echo 1',
      dag = dag)
bash_task.set_downstream (bash_task1)
You need to quote the Jinja template. Use it as below:
bash_task = BashOperator(
    task_id="bash_task",
    bash_command="sh {{var.value.tmpl_search_path}}",
    dag=dag
)
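One more detail worth checking (an observation based on the commands in the question, not part of the original answer): {{ var.value.<name> }} looks up an Airflow Variable by its name, and the Variable created above is called sh_path, so the template would reference that name directly rather than the Python variable tmpl_search_path:

bash_task = BashOperator(
    task_id="bash_task",
    bash_command="sh {{ var.value.sh_path }}",  # sh_path is the Variable set via the CLI
    dag=dag
)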
