I am trying to convert a Unix timestamp into a date/time format in an Airflow DAG.
The function get_execution_time() in the Airflow script below throws a parsing error:
ERROR - HTTP error: Bad Request
[2021-10-04 21:39:46,187] ERROR - {"error":"unable to parse: metric parse error: expected field at 1:94: \"model_i7,model_owner=cgrm_developer,model_name=m1,execution_time=**<function get_execution_time at 0x7f46df0b1b70>\**""}
[2021-10-04 21:39:46,216] ERROR - 400:Bad Request
The parser failed to resolve the Unix timestamp into a date/time string; instead, the request contains execution_time=<function get_execution_time at 0x7f46df0b1b70>.
Here is the executable script :-
from airflow import DAG
from random import random, seed, choice
from datetime import datetime, timedelta
from airflow.operators.python_operator import PythonOperator
from airflow.operators.postgres_operator import PostgresOperator
from airflow.operators.http_operator import SimpleHttpOperator
import time
import warnings
import requests
from datetime import datetime
import calendar
from functools import wraps
# Ignore every DeprecationWarning raised in this process so it does not clutter the logs.
warnings.filterwarnings("ignore",category=DeprecationWarning)
def get_execution_time(ts=1284101485):
    """Format a Unix timestamp as a UTC ``YYYY-MM-DD HH:MM:SS`` string.

    Args:
        ts: Seconds since the Unix epoch, as an int or a numeric string.
            Defaults to the sample value the original script hard-coded.

    Returns:
        The timestamp rendered in UTC, e.g. ``'2010-09-10 06:51:25'``.
    """
    # Local import: the file's top-level imports only bring in
    # ``datetime`` and ``timedelta`` from the datetime module.
    from datetime import timezone

    # Timezone-aware equivalent of the deprecated ``utcfromtimestamp``.
    moment = datetime.fromtimestamp(int(ts), tz=timezone.utc)
    return moment.strftime('%Y-%m-%d %H:%M:%S')


# Model names for which one metrics payload is built.
model_list = ['m1', 'm2', 'm3', 'm4']


def get_data_str(model_name):
    """Build the comma-separated metrics payload for one model.

    Args:
        model_name: Value written after ``model_name=`` in the payload.

    Returns:
        A string of the form
        ``model_i7,model_owner=test_developer,model_name=<name>,execution_time=<ts>``.
    """
    table_name = 'model_i7,'
    execution_time = get_execution_time()
    # Pass the computed string. The original passed the function object
    # ``get_execution_time``, which rendered as ``<function ... at 0x...>``.
    # NOTE(review): the reported error looks like a line-protocol parse error;
    # the space inside the timestamp may need escaping -- confirm with the API.
    parameters = ',model_name={model_name},execution_time={execution_time}'.format(
        model_name=model_name, execution_time=execution_time)
    return table_name + 'model_owner=test_developer' + parameters
# Default arguments handed to the DAG below through ``default_args``.
default_args = {
    'owner': 'developer',
    'depends_on_past': False,
    # ``04`` (leading zero) is a SyntaxError in Python 3; use a plain 4.
    'start_date': datetime(2021, 10, 4),
    'retries': 1,
    'retry_delay': timedelta(minutes=1),
    # NOTE(review): 'catchup' and 'dagrun_timeout' look like DAG-level
    # settings rather than per-task defaults -- confirm they take effect here.
    'catchup': False,
    'dagrun_timeout': timedelta(hours=3),
    'email_on_success': False,
    'email_on_failure': False,
    'email_on_retry': False,
}
# One HTTP task per model; each task posts that model's payload to /write.
with DAG(
    dag_id='testi7',
    schedule_interval=None,
    tags=['testi7'],
    access_control={'developer': {'can_dag_read', 'can_dag_edit'}},
    default_args=default_args
) as dag:
    for model_name in model_list:
        extracting_metrics = SimpleHttpOperator(
            # The original reused one task_id on every loop iteration;
            # suffix it with the model name so each task id is distinct.
            task_id='extracting_metrics_{}'.format(model_name),
            http_conn_id='metrics_api',
            endpoint='/write',
            # ``python_callable`` / ``provide_context`` were dropped: they are
            # PythonOperator arguments. The payload is built here instead.
            data=get_data_str(model_name)
        )
Expected Output :-
table_name
model_owner
model_name
execution_time
model_output
developer
model1
2010-09-10T10:59:33+00:00
I would appreciate it if someone could help resolve the above exception.
Related
I am trying to load Parquet file data from GCS into BigQuery in append mode, with new columns being added to the table.
I am able to append to the schema in BigQuery, but there is one column that is not loading/appearing in my final BigQuery table.
Below is the code I am using:
# -*- coding: utf-8 -*-
import datetime,time
from airflow import DAG,models
from airflow.contrib.operators.gcs_to_bq import GoogleCloudStorageToBigQueryOperator
from airflow.operators import dummy_operator
import logging
import json,os
from airflow.operators import bash_operator
from airflow.providers.google.cloud.sensors.bigquery import (
BigQueryTableExistenceSensor,
BigQueryTablePartitionExistenceSensor,
)
from airflow.utils import timezone
# Start date: 24 hours before the moment this module is evaluated.
yesterday = datetime.datetime.now() - datetime.timedelta(days=1)

# Handed to the DAG below through ``default_args``.
default_args = dict(owner='airflow', start_date=yesterday)
# start -> load Parquet objects from GCS into BigQuery -> end
with models.DAG(
        'Parquet_GCS_to_BQ',
        default_args=default_args,
        catchup=False,
        #schedule_interval='* 5 * * * ') as dag:
        schedule_interval=None) as dag:
    start = dummy_operator.DummyOperator(
        task_id='start',
        trigger_rule='all_success'
    )
    end = dummy_operator.DummyOperator(
        task_id='end',
        trigger_rule='all_success'
    )
    GCS_to_BQ = GoogleCloudStorageToBigQueryOperator(
        task_id='Loading_Parquet_Into_BQ',
        bucket='my bucket',
        source_objects=['path/to/parquet/year=2021/month=8/*.parquet', 'path/to/parquet/year=2022/month=9/*.parquet'],
        source_format='parquet',
        destination_project_dataset_table='project_name.dataset_name.table_name',
        create_disposition='CREATE_IF_NEEDED',
        # Append rows; the schema may gain fields or relax field modes.
        write_disposition='WRITE_APPEND',
        schema_update_options=['ALLOW_FIELD_RELAXATION', 'ALLOW_FIELD_ADDITION'],
        autodetect=True,
        # NOTE(review): skip_leading_rows is presumably a CSV-only option and
        # may have no effect for Parquet -- confirm against the BigQuery docs.
        skip_leading_rows=1,
        google_cloud_storage_conn_id='google_cloud_default',
        bigquery_conn_id='google_cloud_default'
    )
    start >> GCS_to_BQ >> end
Please refer to my code below.
While the recon_rule_setup task is running, each time it gets a dictionary (recon_conf) as input from the previous read_recon_config task.
However, while recon_rule_exec is running, it gets a list (recon_rule) as input from the previous task.
My expectation was that recon_rule_setup would run 2 times and recon_rule_exec would run 4 times, depending on the return values.
Why is expand behaving differently each time?
from datetime import datetime
from airflow.models import DAG, XCom
from airflow.utils.dates import days_ago
from airflow.decorators import task
from airflow.utils.db import provide_session
from airflow.operators.bash import BashOperator
from airflow.operators.python import PythonOperator
from airflow.utils.task_group import TaskGroup
@provide_session
def clear_xcom_data(session=None, **kwargs):
    """Delete every XCom row whose ``dag_id`` matches the calling DAG.

    Args:
        session: Database session used for the delete; presumably injected
            by ``@provide_session`` when the caller omits it.
        **kwargs: Task context; must contain a ``"dag"`` entry.

    Raises:
        KeyError: If ``kwargs`` has no ``"dag"`` entry.
    """
    dag_instance = kwargs["dag"]
    # Use the public attribute rather than the private ``_dag_id``.
    dag_id = dag_instance.dag_id
    session.query(XCom).filter(XCom.dag_id == dag_id).delete()
@task(task_id="read_recon_config")
def read_recon_config(dag_run=None):
    """Return two sample name dictionaries as a list.

    Args:
        dag_run: Object exposing ``.conf``; presumably supplied by Airflow
            at run time.

    Returns:
        ``[{"name": "Santanu"}, {"name": "Ghosh"}]``.
    """
    # The run configuration is read but not used by this demo.
    parent_dict = dag_run.conf
    d1 = {"name": "Santanu"}
    d2 = {"name": "Ghosh"}
    return [d1, d2]
@task(task_id="recon_rule_setup")
def recon_rule_setup(recon_conf):
    """Print ``recon_conf`` and return it paired with a fixed dictionary.

    Args:
        recon_conf: The value this task instance was called with.

    Returns:
        A two-item list: ``[recon_conf, {"name": "Kolkata"}]``.
    """
    print(f"type of recon_conf_dict: {type(recon_conf)}")
    print(f"recon_conf_dict: {recon_conf}")
    return [recon_conf, {"name": "Kolkata"}]
@task(task_id="recon_rule_exec")
def recon_rule_exec(recon_rule, master_key):
    """Print the type and value of both arguments; returns nothing.

    Args:
        recon_rule: The value this task instance was called with.
        master_key: Key shared by every call (bound with ``partial`` in the DAG).
    """
    print(f"master_key type: {type(master_key)}")
    print(f"master_key: {master_key}")
    print(f"recon_rule type: {type(recon_rule)}")
    print(f"recon_rule: {recon_rule}")
# Handed to the DAG below through ``default_args``.
default_args = dict(
    owner='Airflow',
    start_date=days_ago(1),
    depends_on_past=False,
    email_on_failure=False,
    email_on_retry=False,
    retries=1,
)

dag_name_id = "dynamic_demo"
# UTC timestamp with the last three microsecond digits cut off (millisecond precision);
# evaluated every time this module is loaded.
cur_datetime = datetime.utcnow().strftime("%Y%m%d%H%M%S%f")[:-3]
master_dag_key = "{}_{}".format(dag_name_id, cur_datetime)
# start -> reconciliation_process group -> delete_xcom -> stop
with DAG(
    dag_id=dag_name_id,
    default_args=default_args,
    schedule_interval=None,
    catchup=False
) as dag:
    start = BashOperator(task_id="start", bash_command='echo "starting reconciliation"', do_xcom_push=False)
    stop = BashOperator(task_id="stop", bash_command='echo "stopping reconciliation"', do_xcom_push=False)
    delete_xcom = PythonOperator(
        task_id="delete_xcom",
        python_callable=clear_xcom_data
    )
    with TaskGroup(group_id="reconciliation_process") as tg1:
        recon_config_list = read_recon_config()
        # Expand recon_rule_setup over the output of read_recon_config.
        recon_rule_list = recon_rule_setup.expand(recon_conf=recon_config_list)
        # master_key is fixed for every call; recon_rule is expanded over
        # the output of recon_rule_setup.
        recon_rule_exec.partial(master_key=master_dag_key).expand(recon_rule=recon_rule_list)
    start >> tg1 >> delete_xcom >> stop
Regards,
Santanu
Because you are returning the dictionary as an element of a list, the downstream task receives the dictionary inside a list. For your requirement, you can try the code below, which returns a dictionary instead.
Code:
from datetime import datetime
from airflow.models import DAG, XCom
from airflow.utils.dates import days_ago
from airflow.decorators import task
from airflow.utils.db import provide_session
from airflow.operators.bash import BashOperator
from airflow.operators.python import PythonOperator
from airflow.utils.task_group import TaskGroup
@provide_session
def clear_xcom_data(session=None, **kwargs):
    """Remove all XCom rows recorded for the DAG found in the task context.

    Args:
        session: Database session used for the delete; presumably injected
            by ``@provide_session`` when the caller omits it.
        **kwargs: Task context; must contain a ``"dag"`` entry.

    Raises:
        KeyError: If ``kwargs`` has no ``"dag"`` entry.
    """
    dag_instance = kwargs["dag"]
    # Use the public attribute rather than the private ``_dag_id``.
    dag_id = dag_instance.dag_id
    session.query(XCom).filter(XCom.dag_id == dag_id).delete()
@task(task_id="read_recon_config")
def read_recon_config(dag_run=None):
    """Return a sample dictionary of name dictionaries keyed by 1 and 2.

    Args:
        dag_run: Object exposing ``.conf``; presumably supplied by Airflow
            at run time.

    Returns:
        ``{1: {"name": "S"}, 2: {"name": "G"}}``.
    """
    # The run configuration is read but not used by this demo.
    parent_dict = dag_run.conf
    d1 = {1: {"name": "S"}, 2: {"name": "G"}}
    return d1
@task(task_id="recon_rule_setup")
def recon_rule_setup(recon_conf):
    """Log the received value and return it alongside a fixed dictionary.

    Args:
        recon_conf: The value this task instance was called with.

    Returns:
        A two-item list: ``[recon_conf, {"name": "Kolkata"}]``.
    """
    print(f"type of recon_conf_dict: {type(recon_conf)}")
    print(f"recon_conf_dict: {recon_conf}")
    return [recon_conf, {"name": "Kolkata"}]
@task(task_id="recon_rule_exec")
def recon_rule_exec(recon_rule, master_key):
    """Print the type and value of ``master_key`` and ``recon_rule``.

    Args:
        recon_rule: The value this task instance was called with.
        master_key: Key shared by every call (bound with ``partial`` in the DAG).

    Returns:
        None.
    """
    print(f"master_key type: {type(master_key)}")
    print(f"master_key: {master_key}")
    print(f"recon_rule type: {type(recon_rule)}")
    print(f"recon_rule: {recon_rule}")
# Handed to the DAG below through ``default_args``.
default_args = dict(
    owner='Airflow',
    start_date=days_ago(1),
    depends_on_past=False,
    email_on_failure=False,
    email_on_retry=False,
    retries=1,
)

dag_name_id = "dynamic_demo"
# UTC timestamp with the last three microsecond digits cut off (millisecond precision);
# evaluated every time this module is loaded.
cur_datetime = datetime.utcnow().strftime("%Y%m%d%H%M%S%f")[:-3]
master_dag_key = "{}_{}".format(dag_name_id, cur_datetime)
# start -> reconciliation_process group -> delete_xcom -> stop
with DAG(
    dag_id=dag_name_id,
    default_args=default_args,
    schedule_interval=None,
    catchup=False
) as dag:
    start = BashOperator(task_id="start", bash_command='echo "starting reconciliation"', do_xcom_push=False)
    stop = BashOperator(task_id="stop", bash_command='echo "stopping reconciliation"', do_xcom_push=False)
    delete_xcom = PythonOperator(
        task_id="delete_xcom",
        python_callable=clear_xcom_data
    )
    with TaskGroup(group_id="reconciliation_process") as tg1:
        recon_config_list = read_recon_config()
        # Expand recon_rule_setup over the output of read_recon_config.
        recon_rule_list = recon_rule_setup.expand(recon_conf=recon_config_list)
        # master_key is fixed for every call; recon_rule is expanded over
        # the output of recon_rule_setup.
        recon_rule_exec.partial(master_key=master_dag_key).expand(recon_rule=recon_rule_list)
    start >> tg1 >> delete_xcom >> stop
Output:
I have a DAG executing a Python script which takes a date argument (the current date). I'm scheduling the DAG to run at 6:00 AM Monday through Friday, i.e. on weekdays, Eastern Standard Time. The DAG has to run the Python script on Monday with Monday's date as an argument, and likewise for Tuesday all the way through Friday with Friday's date as an argument.
I noticed that using a schedule interval of '0 6 * * 1-5' didn't work because Friday's execution didn't occur until the following Monday.
I changed the schedule interval to '0 6 * * *' to run every day at 6:00 AM and, at the start of my DAG, filter for dates that fall within '0 6 * * 1-5', i.e. effectively Monday to Friday. For Saturday and Sunday, the downstream tasks should be skipped.
This is my code
from __future__ import print_function
import pendulum
import logging
from airflow.models import DAG
from airflow.models import Variable
from datetime import datetime, timedelta
from airflow.contrib.operators.ssh_operator import SSHOperator
from airflow.operators.python_operator import ShortCircuitOperator
from airflow.operators.dummy_operator import DummyOperator
from airflow.utils.trigger_rule import TriggerRule
from croniter import croniter
log = logging.getLogger(__name__)


def filter_processing_date(**context):
    """Tell whether the run's execution date lies on the ``0 6 * * 1-5`` schedule.

    Args:
        **context: Task context; must contain ``'execution_date'``.

    Returns:
        True when ``execution_date`` is itself a tick of the weekday cron
        expression, False otherwise.
    """
    execution_date = context['execution_date']
    cron = croniter('0 6 * * 1-5', execution_date)
    log.info('cron is: {}'.format(cron))
    log.info('execution date is: {}'.format(execution_date))
    # Step the iterator forward one tick, then back one. The original chained
    # ``.get_prev`` onto the datetime returned by ``get_next``, which has no
    # such method; both calls belong to the croniter object.
    cron.get_next(datetime)
    # NOTE(review): the comparison uses execution_date's own timezone; if that
    # is UTC while 06:00 is meant in America/New_York, convert first -- confirm.
    return execution_date == cron.get_prev(datetime)
local_tz = pendulum.timezone("America/New_York")

# DAG parameters, handed to the DAG below through ``default_args``.
default_args = dict(
    owner='Managed Services',
    depends_on_past=False,
    start_date=datetime(2020, 8, 3, tzinfo=local_tz),
    dagrun_timeout=None,
    # Looked up from the Airflow Variable named 'email' when this module is evaluated.
    email=Variable.get('email'),
    email_on_failure=True,
    email_on_retry=False,
    provide_context=True,
    retries=12,
    retry_delay=timedelta(minutes=5),
)
# start -> weekdays_only (short-circuit check) -> run_python -> end
with DAG(
    'execute_python',
    schedule_interval='0 6 * * *',
    default_args=default_args
) as dag:
    start_dummy = DummyOperator(
        task_id='start',
        dag=dag
    )
    end_dummy = DummyOperator(
        task_id='end',
        trigger_rule=TriggerRule.NONE_FAILED,
        dag=dag
    )
    # The callable's boolean result is what the ShortCircuitOperator acts on.
    weekdays_only = ShortCircuitOperator(
        task_id='weekdays_only',
        python_callable=filter_processing_date,
        dag=dag
    )
    run_python = SSHOperator(
        ssh_conn_id="oci_connection",
        task_id='run_python',
        command='/usr/bin/python3 /home/sb/local/bin/runProcess.py -d {{ ds_nodash }}',
        dag=dag)
    start_dummy >> weekdays_only >> run_python >> end_dummy
Unfortunately, the weekdays_only task is failing with the error message below. What is going wrong?
Airflow error message
Airflow error message continuation
Airflow version: v1.10.9-composer
Python 3.
I managed to solve my problem by hacking something together: I check whether the next execution date is a weekday and return True if it is, or False otherwise. I call the function in a ShortCircuitOperator, which proceeds with the downstream tasks if the result is True and skips them if it is False.
This is my code below, but I'm open to better solutions.
from __future__ import print_function
import pendulum
import logging
from airflow.models import DAG
from airflow.models import Variable
from datetime import datetime, timedelta
from airflow.contrib.operators.ssh_operator import SSHOperator
from airflow.operators.python_operator import ShortCircuitOperator
from airflow.operators.dummy_operator import DummyOperator
from airflow.utils.trigger_rule import TriggerRule
log = logging.getLogger(__name__)


def checktheday(**context):
    """Report whether the next execution date falls on a weekday.

    Args:
        **context: Task context; must contain ``'next_execution_date'``,
            an object with a ``weekday()`` method (Monday is 0, Sunday is 6).

    Returns:
        True for Monday through Friday, False for Saturday and Sunday.
    """
    next_execution_date = context['next_execution_date']
    log.info('next_execution_date is: {}'.format(next_execution_date))
    date_check = next_execution_date.weekday()
    log.info('date_check is: {}'.format(date_check))
    # weekday() values 0..4 are Monday..Friday.
    decision = 0 <= date_check <= 4
    log.info('decision is: {}'.format(decision))
    return decision
local_tz = pendulum.timezone("America/New_York")

# DAG parameters, handed to the DAG below through ``default_args``.
default_args = dict(
    owner='Managed Services',
    depends_on_past=False,
    start_date=datetime(2020, 8, 3, tzinfo=local_tz),
    dagrun_timeout=None,
    # Looked up from the Airflow Variable named 'email' when this module is evaluated.
    email=Variable.get('email'),
    email_on_failure=True,
    email_on_retry=False,
    provide_context=True,
    retries=12,
    retry_delay=timedelta(minutes=5),
)
# start -> weekdays_only (short-circuit check) -> run_python -> end
with DAG(
    'execute_python',
    schedule_interval='0 6 * * *',
    default_args=default_args
) as dag:
    start_dummy = DummyOperator(
        task_id='start',
        dag=dag
    )
    end_dummy = DummyOperator(
        task_id='end',
        trigger_rule=TriggerRule.NONE_FAILED,
        dag=dag
    )
    # The callable's boolean result is what the ShortCircuitOperator acts on.
    weekdays_only = ShortCircuitOperator(
        task_id='weekdays_only',
        python_callable=checktheday,
        dag=dag
    )
    # The templated -d argument is ``ds`` plus one day, reformatted from
    # %Y-%m-%d to %Y%m%d.
    run_python = SSHOperator(
        ssh_conn_id="oci_connection",
        task_id='run_python',
        command='/usr/bin/python3 /home/sb/local/bin/runProcess.py -d {{ macros.ds_format(macros.ds_add(ds, 1), "%Y-%m-%d", "%Y%m%d") }}',
        dag=dag)
    start_dummy >> weekdays_only >> run_python >> end_dummy
I'm scheduling a DAG to run at 04:00 AM, Tuesday through Saturday, Eastern Standard Time (NY), starting from today, 2020/08/11. After writing the code and deploying it, I expected the DAG to be triggered. I refreshed my Airflow UI page a couple of times, but it still isn't triggering. I am using Airflow version v1.10.9-composer with Python 3.
This is my DAG code:
"""
This DAG executes a retrieval job
"""
# Required packages to execute DAG
from __future__ import print_function
import pendulum
from airflow.models import DAG
from airflow.models import Variable
from datetime import datetime, timedelta
from airflow.contrib.operators.ssh_operator import SSHOperator
from airflow.operators.dummy_operator import DummyOperator
from airflow.utils.trigger_rule import TriggerRule
local_tz = pendulum.timezone("America/New_York")

# DAG parameters, handed to the DAG below through ``default_args``.
default_args = dict(
    owner='Me',
    depends_on_past=False,
    start_date=datetime(2020, 8, 10, 4, tzinfo=local_tz),
    dagrun_timeout=None,
    # Looked up from the Airflow Variable named 'email' when this module is evaluated.
    email=Variable.get('email'),
    email_on_failure=True,
    email_on_retry=False,
    provide_context=True,
    retries=None,
    retry_delay=timedelta(minutes=5),
)
# create DAG object with Name and default_args
with DAG(
    'retrieve_files',
    schedule_interval='0 4 * * 2-6',
    description='Retrieves files from sftp',
    max_active_runs=1,
    catchup=True,
    default_args=default_args
) as dag:
    # Define tasks - below are dummy tasks and a task instantiated by SSHOperator- calling methods written in other py class
    start_dummy = DummyOperator(
        task_id='start',
        dag=dag
    )
    end_dummy = DummyOperator(
        task_id='end',
        trigger_rule=TriggerRule.NONE_FAILED,
        dag=dag
    )
    retrieve_file = SSHOperator(
        ssh_conn_id="my_conn",
        task_id='retrieve_file',
        command='/usr/bin/python3 /path_to_file/getFile.py',
        dag=dag)
    # Reuse the module docstring as the DAG's documentation.
    dag.doc_md = __doc__
    # The string body stays at column 0 so its content is unchanged.
    retrieve_file.doc_md = """\
#### Task Documentation
Connects to sftp and retrieves files.
"""
    start_dummy >> retrieve_file >> end_dummy
Referring to the official documentation:
The scheduler runs your job one schedule_interval AFTER the start date.
If your start_date is 2020-01-01 and schedule_interval is @daily, the
first run will be created on 2020-01-02, i.e., after your start date
has passed.
In order to run a DAG at a specific time every day (including today), the start_date needs to be set to a time in the past and schedule_interval needs to contain the desired time in cron format. It is very important to set yesterday's datetime properly, or the trigger won't work.
In that case, we should set the start_date to the Tuesday of the previous week, which is (2020, 8, 4). One full week must have passed since your start date, because the schedule repeats weekly.
Let's take a look at the following example, which shows how to run a job at 04:00 AM, Tuesday through Saturday, EST:
from datetime import datetime, timedelta
from airflow import models
import pendulum
from airflow.operators import bash_operator
local_tz = pendulum.timezone("America/New_York")

# Handed to the DAG below through ``default_args``.
default_dag_args = dict(
    start_date=datetime(2020, 8, 4, 4, tzinfo=local_tz),
    retries=0,
)
# Single-task DAG on the cron schedule '00 04 * * 2-6'.
with models.DAG(
        'Test',
        default_args=default_dag_args,
        schedule_interval='00 04 * * 2-6') as dag:
    # DAG code
    print_dag_run_conf = bash_operator.BashOperator(
        task_id='print_dag_run_conf', bash_command='echo {{ dag_run.id }}')
I recommend that you check the "What's the deal with start_date?" section of the documentation.
I am new to using Airflow.
I noticed that if you define a global variable (a timestamp) in the code, its value changes for each task. For example, in the very basic example below, I define a variable now, but each time I print it in a task, the value changes.
from datetime import timedelta
from airflow import DAG
from airflow.operators.python_operator import PythonOperator
from airflow.utils.dates import days_ago
import time
# Milliseconds since the epoch, computed each time this module is executed.
# NOTE(review): this is presumably why every task prints a different value --
# confirm how often Airflow re-imports the DAG file.
now = int(time.time() * 1000)
RANGE = range(1, 10)


def init_step():
    """Print the module-level range and timestamp, then return the range.

    Returns:
        The module-level ``RANGE`` object.
    """
    print("Run on RANGE {}".format(RANGE))
    print("Date of the Scans {}".format(now))
    return RANGE
def trigger_step(index):
    """Sleep ten seconds, print ``index`` with the module timestamp, return ``index``.

    Args:
        index: Value to print and hand back unchanged.

    Returns:
        ``index``.
    """
    time.sleep(10)
    print("index {} - date {}".format(index, now))
    return index
# Handed to the DAG below through ``default_args``.
default_args = dict(
    owner='airflow',
    start_date=days_ago(1),
    retries=2,
    retry_delay=timedelta(minutes=15),
)
# init fans out to one 'trigger-port-<index>' task per value from init_step().
with DAG('test',
         default_args=default_args,
         schedule_interval='0 16 */7 * *',
         ) as dag:
    init = PythonOperator(task_id='init',
                          python_callable=init_step,
                          dag=dag)
    # init_step() is also called here, while the file is being evaluated,
    # to obtain the values to loop over.
    for index in init_step():
        run = PythonOperator(task_id='trigger-port-' + str(index),
                             op_kwargs={'index': index},
                             python_callable=trigger_step, dag=dag)
        # Wire every generated task, not only the last one. The tasks are
        # already attached through ``dag=dag``, so the DAG object is left
        # out of the chain.
        init >> run
Is this normal behavior? Is there a way to change it?