import logging
import pandas as pd
import boto3
from datetime import datetime
from airflow import DAG, settings
from airflow.operators.python import PythonOperator, BranchPythonOperator
from airflow.operators.dummy_operator import DummyOperator
from airflow.contrib.hooks.snowflake_hook import SnowflakeHook
from airflow.models import Variable, Connection
from airflow.utils.trigger_rule import TriggerRule
from settings import settings_manager
from config.misc import misc_config
logger = logging.getLogger(__name__)
list_tables_query = f"""SHOW TABLES IN SCHEMA {settings_manager.SNOWFLAKE_DATABASE}.STAGING"""
overwrite_variable_name = 'overwrite_staging_tables'
dag = DAG("v4_functional_table_creation", schedule_interval=None,
start_date=datetime(2019, 2, 20))
start = DummyOperator(task_id='start',
dag=dag)
def print_str(file):
    logger.info(file)

def list_existing_tables():
    # left out for brevity
    return table_list

def does_table_exists(file, table_name, list_of_tables):
    if table_name.upper() in list_of_tables:
        return f"""overwrite_check_for_{table_name}"""
    else:
        return f"""get_latest_{file}_file"""

def overwrite_check(source_name, table_name):
    overwrite = 'False'
    if overwrite == 'True':
        return f"""drop_{source_name}.{table_name}"""
    else:
        return "end"

def create_table(table_name, file_path):
    logger.info(f"""creating table {table_name} using {file_path} file.""")

def get_latest_uploaded_file(file, source):
    """
    Given an s3 prefix, returns the path of the latest uploaded file.
    """
    # left out for brevity
    return latest.get('Key', '')
list_existing_tables = PythonOperator(task_id="list_existing_tables",
                                      python_callable=list_existing_tables,
                                      dag=dag)
end = DummyOperator(task_id='end',
                    dag=dag)
start >> list_existing_tables
for source in misc_config.get('s3_sources_to_parse'):
    file_list_str = Variable.get(f"""file_list_{source}""")
    file_list_str = file_list_str[2:-2]
    file_list = file_list_str.split(',')
    for file_str in file_list:
        file = file_str.strip(" ").strip('"').strip("'")
        table_name = f"""{source}_{file}"""
        check_table_exists = BranchPythonOperator(task_id=f"""check_{table_name}_exists""",
                                                  python_callable=does_table_exists,
                                                  op_kwargs={'table_name': table_name,
                                                             'list_of_tables': list_existing_tables.output,
                                                             'file': file},
                                                  dag=dag)
        check_overwrite_condition = BranchPythonOperator(task_id=f"""overwrite_check_for_{table_name}""",
                                                         python_callable=overwrite_check,
                                                         op_kwargs={'source_name': source,
                                                                    'table_name': table_name},
                                                         dag=dag)
        get_latest_file = PythonOperator(task_id=f"""get_latest_{file}_file""",
                                         python_callable=get_latest_uploaded_file,
                                         op_kwargs={'file': file,
                                                    'source': source},
                                         trigger_rule='none_failed_or_skipped',
                                         dag=dag)
        drop_table = PythonOperator(task_id=f"""drop_{table_name}""",
                                    python_callable=print_str,
                                    op_kwargs={'file': f"""dropping_{table_name}"""},
                                    trigger_rule='none_failed_or_skipped',
                                    dag=dag)
        create_table_task = PythonOperator(task_id=f"""create_{table_name}""",
                                           python_callable=create_table,
                                           op_kwargs={'table_name': table_name,
                                                      'file_path': get_latest_file.output},
                                           dag=dag)
        list_existing_tables >> check_table_exists >> [check_overwrite_condition, get_latest_file]
        check_overwrite_condition >> [drop_table, end]
        drop_table >> get_latest_file >> create_table_task >> end
For the DAG declared at the bottom of the file, and referring to the attached screenshot, I'm hoping for get_latest_waffle_switch_file and create_sos_waffle_switch to be skipped.
I'm not sure how or why the get_latest_waffle_switch_file task is getting triggered.
I think it should just be a matter of declaring the DAG/task relations correctly.
Any pointers would be really helpful! Thanks in advance _/\_
[1]: https://i.stack.imgur.com/eUUbh.png
I am trying to implement the example below from the Airflow documentation, but using the new ExternalPythonOperator.
from datetime import datetime
from airflow import DAG
from airflow.decorators import task
with DAG(dag_id="simple_mapping", start_date=datetime(2022, 3, 4)) as dag:
#task
def add_one(x: int):
return x + 1
#task
def sum_it(values):
total = sum(values)
print(f"Total was {total}")
added_values = add_one.expand(x=[1, 2, 3])
sum_it(added_values)
My code looks like this:
from airflow.decorators import dag, task
from airflow.sensors.filesystem import FileSensor
from airflow.operators.trigger_dagrun import TriggerDagRunOperator
from datetime import datetime
import os
from company.my_task import task_one_func, task_two_func, task_three_func
default_args: dict = {
    "owner": "admin",
    "depends_on_past": False,
    "email": [],
    "email_on_failure": False,
    "email_on_retry": False,
    "retries": 1,
}
AIRFLOW_HOME = os.getcwd()
DROP_DIR = f"{AIRFLOW_HOME}/resources/drop_folder"
VENVS_DIR = "/opt/airflow/dags/company/venvs"
DAG_ENV = f"{VENVS_DIR}/my_env_name/bin/python"
@dag(
    schedule="@daily",
    start_date=datetime(2022, 11, 17, 8, 0, 0, 0),
    catchup=False,
    default_args=default_args,
    tags=[],
)
def my_dag():
    now: str = "{{ execution_date.strftime('%Y-%m-%dT%H:%M:%S') }}"

    # Wait for a file in the drop folder
    wait_files = FileSensor(
        task_id="wait_file",
        fs_conn_id="fs_default",
        poke_interval=30*60,
        filepath=DROP_DIR,
        mode="reschedule",
        timeout=60*60*8
    )

    # ExternalPython operators created with the manual decorator form
    task_one_dec = task.external_python(
        task_id="t_one", python=os.fspath(DAG_ENV), retries=0, expect_airflow=False)(task_one_func)
    task_two_dec = task.external_python(
        task_id="t_two", python=os.fspath(DAG_ENV), retries=0, expect_airflow=False)(task_two_func)
    task_three_dec = task.external_python(
        task_id="t_three", python=os.fspath(DAG_ENV), trigger_rule="one_success", expect_airflow=False)(task_three_func)

    # Get the sub-directories of the drop folder
    # Returns a list
    task_one = task_one_dec(DROP_DIR)

    # Expand over each sub-directory found
    # Verify each file and return its path
    task_two = task_two_dec.expand(
        file_path=task_one)

    # Loop through the task_two list and move each subdir to another folder
    task_three = task_three_dec(
        task_two)

    trigger_next_dag = TriggerDagRunOperator(
        task_id="trigger_next",
        trigger_dag_id="next_dag_name",
        execution_date=now
    )

    wait_files >> task_one >> task_two >> task_three >> trigger_next_dag

my_dag()
My task file:
def task_one_func(drop_dir: str) -> list:
    # Analyze drop_dir
    return ['subdir1', 'subdir2']

def task_two_func(file_path: str) -> str:
    # Verify the file
    print("Verified", file_path)
    return file_path

def task_three_func(file_paths: list) -> None:
    for item in file_paths:
        # Move the file to another place
        print(item)
    return None
task_three returns an error:
_pickle.PicklingError: Can't pickle <class 'sqlalchemy.orm.session.Session'>: it's not the same object as sqlalchemy.orm.session.Session
What I tried:
Installing the correct Airflow version for the venv Python version and adding expect_airflow=True to my decorators:
The error still remains.
Replacing task_three (ExternalPythonOperator) with a simple PythonOperator:
The error does not appear anymore and I am able to browse the list.
Adding a PythonOperator between task_two and task_three that copies the output into a new list solves the problem, but why?
Enabling XCom pickling inside airflow.cfg.
# Declare task two here...

@task
def lazyXcomToList(li) -> list:
    new_list = []
    for item in li:
        new_list.append(item)
    return new_list

lazyXcomToListTask = lazyXcomToList(task_two)

# Declare task three here...

wait_files >> task_one >> task_two >> lazyXcomToListTask >> task_three >> trigger_next_dag
I have 2 DAGs: dag_a and dag_b (dag_a -> dag_b)
After dag_a is executed, a TriggerDagRunOperator is called, which starts dag_b. The problem is that when dag_b is off (paused), dag_a's TriggerDagRunOperator creates runs in dag_b that queue up for as long as dag_a is running. After turning dag_b back on, the queued runs begin executing.
I'm trying to find a solution for TriggerDagRunOperator, namely a conditionally_trigger function that would skip the TriggerDagRunOperator task if dag_b is paused (off). How can I do this?
You can use a ShortCircuitOperator to execute or skip the downstream trigger for dag_b. Then use the Airflow REST API (or the shell/CLI) to figure out whether dag_b is paused or not.
dag_a = TriggerDagRunOperator(
    trigger_dag_id='dag_a',
    ...
)

pause_check = ShortCircuitOperator(
    task_id='pause_check',
    python_callable=is_dag_paused,
    op_kwargs={
        'dag_id': 'dag_b'
    }
)

dag_b = TriggerDagRunOperator(
    trigger_dag_id='dag_b',
    ...
)

dag_a >> pause_check >> dag_b
The is_dag_paused function can look like this (here I use the REST API):
def is_dag_paused(**kwargs):
    import requests
    from requests.auth import HTTPBasicAuth

    dag_id = kwargs['dag_id']
    res = requests.get(f'http://{airflow_host}/api/v1/dags/{dag_id}/details',
                       auth=HTTPBasicAuth('username', 'password'))  # the auth method could be different for you
    if res.status_code == 200:
        rjson = res.json()
        # if you return True, the downstream tasks will be executed
        # if False, they will be skipped
        return not rjson['is_paused']
    else:
        print('Error: ', res)
        exit(1)
import airflow.settings
from airflow.models import DagModel

def check_status_dag(*op_args):
    session = airflow.settings.Session()
    qry = session.query(DagModel).filter(DagModel.dag_id == op_args[0])
    if not qry.value(DagModel.is_paused):
        return op_args[1]
    else:
        return op_args[2]
Here check_status_dag makes the branching decision: op_args[0] is the dag_id of the DAG being checked for its pause status, and op_args[1] and op_args[2] are the task_ids to follow, in line with how BranchPythonOperator works.
start = DummyOperator(
    task_id='start',
    dag=dag
)

check_dag_B = BranchPythonOperator(
    task_id="check_dag_B",
    python_callable=check_status_dag,
    op_args=['dag_B', 'trigger_dag_B', 'skip_trigger_dag_B'],
    trigger_rule='all_done',
    dag=dag
)

trigger_dag_B = TriggerDagRunOperator(
    task_id='trigger_dag_B',
    trigger_dag_id='dag_B',
    dag=dag
)

skip_trigger_dag_B = DummyOperator(
    task_id='skip_trigger_dag_B',
    dag=dag
)

finish = DummyOperator(
    task_id='finish',
    trigger_rule='all_done',
    dag=dag
)

start >> check_dag_B >> [trigger_dag_B, skip_trigger_dag_B] >> finish  # or continue working
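One note on the last line: finish uses trigger_rule='all_done', so it still runs even though the branch always skips one of trigger_dag_B / skip_trigger_dag_B; with the default all_success rule the skip would propagate and finish would be skipped as well.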
I am new to Python and Airflow, and I am using a GCP Composer environment to create a DAG.
In this Python code I created two tasks: one reads a zip or csv file, and the other creates a Dataproc cluster. In the first task I call a readYML method that reads the YAML configuration file for the Dataproc cluster arguments (cluster_name, project_id, etc.), and I use the same arguments in the second task. See the code below for a better understanding.
# Importing Modules
from airflow import DAG
from airflow.operators.python_operator import PythonOperator
from datetime import datetime, timedelta
from zipfile import ZipFile
from airflow.models import Variable
import yaml
from google.cloud import storage
from airflow.contrib.operators import dataproc_operator
import pandas as pd
global cfg

def readYML():
    print("inside readzip")
    file_name = "/home/airflow/gcs/data/cluster_config.yml"
    with open(file_name, 'r') as ymlfile:
        cfg = yaml.load(ymlfile)
    print("inside readYML method : ", cfg['configs']['project_id'])

def iterate_bucket():
    global blobs
    bucket_name = 'europe-west1-airflow-test-9bbb5fc7-bucket'
    storage_client = storage.Client.from_service_account_json(
        '/home/airflow/gcs/data/service_account_key_gcp_compute_bmg.json')
    bucket = storage_client.get_bucket(bucket_name)
    blobs = bucket.list_blobs()

def print_PcsvData():
    iterate_bucket()
    readYML()
    global readPcsv
    for blob in blobs:
        if "physical.zip" in blob.name:
            print("hello : ", blob.name)
            file_name = "/home/airflow/gcs/" + blob.name
            with ZipFile(file_name, 'r') as zip:
                # printing all the contents of the zip file
                for info in zip.infolist():
                    readfilename = info.filename
                    print(readfilename)
    readPcsv = pd.read_csv("/home/airflow/gcs/data/" + readfilename)
    print("physical.csv : ", readPcsv)
    print('Done!')
dag_name = Variable.get("dag_name")
default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': datetime.now(),
    'email': ['airflow@example.com'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
    'cluster_name': cfg['configs']['cluster_name'],
}
# Instantiate a DAG
dag = DAG(dag_id='read_yml', default_args=default_args,
          schedule_interval=timedelta(days=1))

# Creating Tasks
t1 = PythonOperator(task_id='Raw1', python_callable=print_PcsvData,
                    dag=dag)

create_dataproc_cluster = dataproc_operator.DataprocClusterCreateOperator(
    task_id='create_dataproc_cluster',
    project_id=cfg['configs']['project_id'],
    cluster_name=cfg['configs']['cluster_name'],
    num_workers=cfg['configs']['num_workers'],
    zone=cfg['configs']['zone'],
    master_machine_type=cfg['configs']['master_machine_type'],
    worker_machine_type=cfg['configs']['worker_machine_type'],
    dag=dag)
t1 >> create_dataproc_cluster
In this code I want to use the cfg variable globally, and I also want to access it in default_args, but I am getting an error. I don't know whether it is a scope-related issue or something else; I even declared the cfg variable inside the readYML method, but the error persists.
Any help would be appreciated.
Thanks in advance.
Check the updated DAG file below; that is what you should use.
A few changes you should make:
Replace the global variables with values returned from the functions.
Never use datetime.now() as the start_date - see https://airflow.apache.org/faq.html#what-s-the-deal-with-start-date
Updated file:
# Importing Modules
from airflow import DAG
from airflow.operators.python_operator import PythonOperator
from datetime import datetime, timedelta
from zipfile import ZipFile
from airflow.models import Variable
import yaml
from google.cloud import storage
from airflow.contrib.operators import dataproc_operator
import pandas as pd
import airflow.utils.dates  # needed for days_ago() used in default_args below
def readYML():
    print("inside readzip")
    file_name = "/home/airflow/gcs/data/cluster_config.yml"
    with open(file_name, 'r') as ymlfile:
        cfg = yaml.load(ymlfile)
    print("inside readYML method : ", cfg['configs']['project_id'])
    return cfg

def iterate_bucket():
    bucket_name = 'europe-west1-airflow-test-9bbb5fc7-bucket'
    storage_client = storage.Client.from_service_account_json(
        '/home/airflow/gcs/data/service_account_key_gcp_compute_bmg.json')
    bucket = storage_client.get_bucket(bucket_name)
    blobs = bucket.list_blobs()
    return blobs

def print_PcsvData():
    blobs = iterate_bucket()
    for blob in blobs:
        if "physical.zip" in blob.name:
            print("hello : ", blob.name)
            file_name = "/home/airflow/gcs/" + blob.name
            with ZipFile(file_name, 'r') as zip:
                # printing all the contents of the zip file
                for info in zip.infolist():
                    readfilename = info.filename
                    print(readfilename)
    readPcsv = pd.read_csv("/home/airflow/gcs/data/" + readfilename)
    print("physical.csv : ", readPcsv)
    print('Done!')
    return readPcsv
dag_name = Variable.get("dag_name")
cfg = readYML()
default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': airflow.utils.dates.days_ago(2),
    'email': ['airflow@example.com'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
    'cluster_name': cfg['configs']['cluster_name'],
}
# Instantiate a DAG
dag = DAG(dag_id='read_yml', default_args=default_args,
          schedule_interval=timedelta(days=1))

# Creating Tasks
t1 = PythonOperator(task_id='Raw1', python_callable=print_PcsvData,
                    dag=dag)

create_dataproc_cluster = dataproc_operator.DataprocClusterCreateOperator(
    task_id='create_dataproc_cluster',
    project_id=cfg['configs']['project_id'],
    cluster_name=cfg['configs']['cluster_name'],
    num_workers=cfg['configs']['num_workers'],
    zone=cfg['configs']['zone'],
    master_machine_type=cfg['configs']['master_machine_type'],
    worker_machine_type=cfg['configs']['worker_machine_type'],
    dag=dag)
t1 >> create_dataproc_cluster
I have searched Airflow blogs and documentation extensively to debug a problem I have.
What I am trying to solve
Check if a particular file exists on an FTP server
If it exists, upload it to the cloud
If it doesn't exist, send an email to the client reporting that no file was found
What I have
A custom operator extending BaseOperator that uses the SSH Hook and pushes a value (true or false).
A task that uses BranchPythonOperator to pull the value from XCom, check whether the previous task returned true or false, and decide on the next task.
Please look at the code below. This code is a simplified version of what I am trying to do.
If anyone is interested in my original code, please scroll down to the end of the question.
Here the custom operator simply returns a string, Even or Odd, based on whether the current minute is even or odd.
import logging
from airflow.models import BaseOperator
from airflow.plugins_manager import AirflowPlugin
from airflow.utils.decorators import apply_defaults
from datetime import datetime
log = logging.getLogger(__name__)
class MediumTestOperator(BaseOperator):

    @apply_defaults
    def __init__(self,
                 do_xcom_push=True,
                 *args,
                 **kwargs):
        super(MediumTestOperator, self).__init__(*args, **kwargs)
        self.do_xcom_push = do_xcom_push
        self.args = args
        self.kwargs = kwargs

    def execute(self, context):
        # from IPython import embed; embed()
        current_minute = datetime.now().minute
        context['ti'].xcom_push(key="Airflow", value="Apache Incubating")
        if current_minute % 2 == 0:
            context['ti'].xcom_push(key="minute", value="Even")
        else:
            context['ti'].xcom_push(key="minute", value="Odd")
        # from IPython import embed; embed()

class MediumTestOperatorPlugin(AirflowPlugin):
    name = "medium_test"
    operators = [MediumTestOperator]
File: caller.py
from airflow import DAG
from airflow.operators.python_operator import PythonOperator
from airflow.operators.python_operator import BranchPythonOperator
from airflow.operators.dummy_operator import DummyOperator
from medium_payen_op import MediumTestOperator
from datetime import datetime, timedelta
default_args = {
    'owner': 'guillaume',
    'depends_on_past': False,
    'start_date': datetime(2018, 6, 18),
    'email': ['hello@moonshots.ai'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=1)
}

dag = DAG(
    'Weekday',
    default_args=default_args,
    schedule_interval="@once")

sample_task = MediumTestOperator(
    task_id='task_1',
    provide_context=True,
    dag=dag
)

def get_branch_follow(**kwargs):
    x = kwargs['ti'].xcom_pull(task_ids='task_1', key="minute")
    print("From Kwargs: ", x)
    if x == 'Even':
        return 'task_3'
    else:
        return 'task_4'

task_2 = BranchPythonOperator(
    task_id='task_2_branch',
    python_callable=get_branch_follow,
    provide_context=True,
    dag=dag
)

def get_dample(**kwargs):
    x = kwargs['ti'].xcom_pull(task_ids='task_1', key="minute")
    y = kwargs['ti'].xcom_pull(task_ids='task_1', key="Airflow")
    print("Minute is:", x, " Airflow is from: ", y)
    print("Task 3 Running")

task_3 = PythonOperator(
    python_callable=get_dample,
    provide_context=True,
    dag=dag,
    task_id='task_3'
)

def get_dample(**kwargs):
    x = kwargs['ti'].xcom_pull(task_ids='task_1', key="minute")
    y = kwargs['ti'].xcom_pull(task_ids='task_1', key="Airflow")
    print("Minute is:", x, " Airflow is from: ", y)
    print("Task 4 Running")

task_4 = PythonOperator(
    python_callable=get_dample,
    provide_context=True,
    dag=dag,
    task_id='task_4'
)
sample_task >> task_3
task_2 >> task_3
task_2 >> task_4
As you can see from the attached images, the XCom push did work, and I can pull the values from the PythonOperator but not from the BranchPythonOperator.
Any help is appreciated.
The XCom pull from inside the Python callable of the BranchPythonOperator always returns 'None', so the else block always runs.
A Tree View of the DAG
XCom Values from the Admin Screen
Xcom Pull from the PythonOperator returns proper values.
This is the original code that I am working with.
The custom operator pushes a string, True or False, as an XCom value, which is then read by the BranchPythonOperator.
I want to read the value pushed by a task created using the above custom operator inside a BranchPythonOperator task and choose a different path based on the returned value.
File: check_file_exists_operator.py
import logging
from tempfile import NamedTemporaryFile
from airflow.contrib.hooks.ssh_hook import SSHHook
from airflow.models import BaseOperator
from airflow.plugins_manager import AirflowPlugin
from airflow.utils.decorators import apply_defaults
log = logging.getLogger(__name__)
class CheckFileExistsOperator(BaseOperator):
    """
    This operator checks if a given file name exists on
    the sftp server.
    Returns true if it exists, false otherwise.

    :param sftp_path_prefix: The sftp remote path. This is the specified file path
        for downloading the file from the SFTP server.
    :type sftp_path_prefix: string
    :param file_to_be_processed: File that is to be searched
    :type file_to_be_processed: str
    :param sftp_conn_id: The sftp connection id. The name or identifier for
        establishing a connection to the SFTP server.
    :type sftp_conn_id: string
    :param timeout: timeout (in seconds) for executing the command.
    :type timeout: int
    :param do_xcom_push: return the stdout which also gets set in XCom by
        the Airflow platform
    :type do_xcom_push: bool
    """

    FORWARD_SLASH_LITERAL = '/'

    template_fields = ('file_to_be_processed',)

    @apply_defaults
    def __init__(self,
                 sftp_path_prefix,
                 file_to_be_processed,
                 sftp_conn_id='ssh_default',
                 timeout=10,
                 do_xcom_push=True,
                 *args,
                 **kwargs):
        super(CheckFileExistsOperator, self).__init__(*args, **kwargs)
        self.sftp_path_prefix = sftp_path_prefix
        self.file_to_be_processed = file_to_be_processed
        self.sftp_conn_id = sftp_conn_id
        self.timeout = timeout
        self.do_xcom_push = do_xcom_push
        self.args = args
        self.kwargs = kwargs

    def execute(self, context):
        # Refer to https://docs.paramiko.org/en/2.4/api/sftp.html
        ssh_hook = SSHHook(ssh_conn_id=self.sftp_conn_id)
        sftp_client = ssh_hook.get_conn().open_sftp()

        sftp_file_absolute_path = self.sftp_path_prefix.strip() + \
            self.FORWARD_SLASH_LITERAL + \
            self.file_to_be_processed.strip()

        task_instance = context['task_instance']

        log.debug('Checking if the following file exists: %s', sftp_file_absolute_path)

        try:
            with NamedTemporaryFile("w") as temp_file:
                sftp_client.get(sftp_file_absolute_path, temp_file.name)

            # Return a string equivalent of the boolean.
            # Returning a boolean will make the key unreadable
            params = {'file_exists': True}
            self.kwargs['params'] = params
            task_instance.xcom_push(key="file_exists", value='True')
            log.info('File Exists, returning True')
            return 'True'
        except FileNotFoundError:
            params = {'file_exists': False}
            self.kwargs['params'] = params
            task_instance.xcom_push(key="file_exists", value='False')
            log.info('File Does not Exist, returning False')
            return 'False'

class CheckFilePlugin(AirflowPlugin):
    name = "check_file_exists"
    operators = [CheckFileExistsOperator]
File: airflow_dag_sample.py
import logging
from airflow import DAG
from check_file_exists_operator import CheckFileExistsOperator
from airflow.contrib.operators.sftp_to_s3_operator import SFTPToS3Operator
from airflow.operators.python_operator import BranchPythonOperator
from datetime import timedelta, datetime
from dateutil.relativedelta import relativedelta
from airflow.operators.email_operator import EmailOperator
log = logging.getLogger(__name__)
FORWARD_SLASH_LITERAL = '/'
default_args = {
    'owner': 'gvatreya',
    'depends_on_past': False,
    'start_date': datetime(2019, 1, 1),
    'email': ['***@***.com'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 5,
    'retry_delay': timedelta(minutes=2),
    'timeout': 10,
    'sftp_conn_id': 'sftp_local_cluster',
    'provide_context': True
}

dag = DAG('my_test_dag',
          description='Another tutorial DAG',
          schedule_interval='0 12 * * *',
          start_date=datetime(2017, 3, 20),
          default_args=default_args,
          template_searchpath='/Users/your_name/some_path/airflow_home/sql',
          catchup=False)

template_filename_from_xcom = """
{{ task_instance.xcom_pull(task_ids='get_fname_ships', key='file_to_be_processed', dag_id='my_test_dag') }}
"""

template_file_prefix_from_xcom = """
{{ task_instance.xcom_pull(task_ids='get_fname_ships', key="month_prefix_for_file", dag_id='my_test_dag') }}
"""

t_check_file_exists = CheckFileExistsOperator(
    sftp_path_prefix='/toDjembe',
    file_to_be_processed=template_filename_from_xcom.strip(),
    sftp_conn_id='sftp_local_cluster',
    task_id='check_file_exists',
    dag=dag
)

def branch(**kwargs):
    file_exist = kwargs['task_instance'].xcom_pull(task_ids='get_fname_ships', key="file_exists",
                                                   dag_id='my_test_dag')
    print(template_filename_from_xcom)
    from IPython import embed; embed()
    log.debug("FILE_EXIST(from branch): %s", file_exist)
    if file_exist:
        return 's3_upload'
    else:
        return 'send_file_not_found_email'

t_branch_on_file_existence = BranchPythonOperator(
    task_id='branch_on_file_existence',
    python_callable=branch,
    dag=dag
)

t_send_file_not_found_email = EmailOperator(
    task_id='send_file_not_found_email',
    to='***@***.com',
    subject=template_email_subject.format(state='FAILURE', filename=template_filename_from_xcom.strip(), content='Not found on SFTP Server'),
    html_content='File Not Found in SFTP',
    mime_charset='utf-8',
    dag=dag
)

t_upload_to_s3 = SFTPToS3Operator(
    task_id='s3_upload',
    sftp_conn_id='sftp_local_cluster',
    sftp_path='/djembe/' + template_filename_from_xcom.strip(),
    s3_conn_id='s3_conn',
    s3_bucket='djembe-users',
    s3_key='gvatreya/experiment/' + template_file_prefix_from_xcom.strip() + FORWARD_SLASH_LITERAL + template_filename_from_xcom.strip(),
    dag=dag
)
t_check_file_exists >> t_branch_on_file_existence
t_branch_on_file_existence >> t_upload_to_s3
t_branch_on_file_existence >> t_send_file_not_found_email
However, when I run the code, the branch operator always sees the string 'None', even though the XCom has the value True.
I tried debugging with IPython's embed() and saw that the task instance does not hold the value of the XCom. I tried using params and other things I could think of, but to no avail.
After spending days on this, I am now starting to think I have missed something crucial about XCom in Airflow.
Hoping someone can help.
Thanks in advance.
I think the issue is with the dependencies.
You currently have the following:
sample_task >> task_3
task_2 >> task_3
task_2 >> task_4
Change it to the following, i.e. add the sample_task >> task_2 line:
sample_task >> task_3
sample_task >> task_2
task_2 >> task_3
task_2 >> task_4
The task that pushes to XCom should run before the task that uses the BranchPythonOperator.
In your second example, the branch function uses xcom_pull(task_ids='get_fname_ships'), but I can't find any task with the task_id get_fname_ships.
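For reference, here is a minimal sketch of what the branch callable could look like, assuming the value is pushed by the check_file_exists task shown above (the operator pushes the strings 'True'/'False', so compare against the string explicitly rather than relying on truthiness):
def branch(**kwargs):
    # pull from the task_id that actually pushed the value
    file_exist = kwargs['task_instance'].xcom_pull(task_ids='check_file_exists', key='file_exists')
    log.debug('FILE_EXIST (from branch): %s', file_exist)
    # the operator pushes the string 'True' or 'False'; the string 'False' is truthy,
    # so a plain `if file_exist:` would always take the upload path
    if file_exist == 'True':
        return 's3_upload'
    return 'send_file_not_found_email'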
How does the task_ids argument work when multiple tasks are specified?
In this particular code example I expected to retrieve load_cycle_id_2 from both tasks as a tuple (5555, 22222), but instead it comes out as (None, 22222).
Why is that?
from airflow.models import DAG
from airflow.operators.python_operator import PythonOperator
from datetime import datetime
args = {
    'owner': 'airflow',
    'start_date': datetime.now(),
    'provide_context': True
}

demo_dag = DAG(dag_id='first', start_date=datetime.now(), schedule_interval='@once', default_args=args)

def push_load_id(**kwargs):
    kwargs['ti'].xcom_push(key='load_cycle_id_2', value=22222)
    kwargs['ti'].xcom_push(key='load_cycle_id_3', value=44444)

def another_push_load_id(**kwargs):
    kwargs['ti'].xcom_push(key='load_cycle_id_2', value=5555)
    kwargs['ti'].xcom_push(key='anotherload_cycle_id_3', value=6666)

def pull_load_id(**kwargs):
    ti = kwargs['ti'].xcom_pull(key='load_cycle_id_2', task_ids=['another_push_load_id', 'push_load_id'])
    print(ti)

push_operator = PythonOperator(task_id='push_load_id', python_callable=push_load_id, dag=demo_dag)
pull_operator = PythonOperator(task_id='pull_load_id', python_callable=pull_load_id, dag=demo_dag)

push_operator >> pull_operator
Your DAG only runs the push_load_id and pull_load_id functions. You never create an operator that uses the another_push_load_id function.
The end of your code should look like this:
push_operator = PythonOperator(task_id='push_load_id', python_callable=push_load_id, dag=demo_dag)
another_push_operator = PythonOperator(task_id='another_push_load_id', python_callable=another_push_load_id, dag=demo_dag)
pull_operator = PythonOperator(task_id='pull_load_id', python_callable=pull_load_id, dag=demo_dag)

push_operator >> another_push_operator >> pull_operator
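With that change, pull_load_id pulls from both tasks, and since xcom_pull returns the values in the same order as the task_ids list, it should print (5555, 22222) as you expected.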