How to poll and keep track of external job status through Airflow? - python

I am working on polling boto3 to check the status of a SageMaker Autopilot job using Airflow. I am using a PythonSensor to wait for both JobStatus and JobSecondaryStatus to return Completed, and then end the entire pipeline. These are the values they can contain, which I made enums of in the code:
'AutoMLJobStatus': 'Completed'|'InProgress'|'Failed'|'Stopped'|'Stopping',
'AutoMLJobSecondaryStatus': 'Starting'|'AnalyzingData'|'FeatureEngineering'|'ModelTuning'|'MaxCandidatesReached'|'Failed'|'Stopped'|'MaxAutoMLJobRuntimeReached'|'Stopping'|'CandidateDefinitionsGenerated'|'GeneratingExplainabilityReport'|'Completed'|'ExplainabilityError'|'DeployingModel'|'ModelDeploymentError'
_sagemaker_job_status takes automl_job_name through XCom from an upstream task, and it gets passed successfully. With this job name I can call describe_auto_ml_job() to get the status through AutoMLJobStatus and AutoMLJobSecondaryStatus.
The main point of this is messaging through Slack, to see all the unique stages the job goes through. Currently, I am trying to save all the unique job statuses to a set and then check that set before sending a message with the job statuses in it.
But every time _sagemaker_job_status is poked, the sets seem to start out empty again (I logged them and both are empty), so a Slack message is sent on every poke. Below this I made a simpler example that worked.
import airflow
from airflow import DAG
from airflow.exceptions import AirflowFailException
from airflow.operators.dummy import DummyOperator
from airflow.operators.python import PythonOperator
from airflow.sensors.python import PythonSensor
import boto3
def _sagemaker_job_status(templates_dict, **context):
    """
    Checks the SageMaker AutoMLJobStatus and AutoMLJobSecondaryStatus
    for updates and when both are complete the entire process is marked as
    successful
    """
    automl_job_name = templates_dict.get("automl_job_name")
    if not automl_job_name:
        error_message = "AutoMLJobName was not passed from upstream"
        print(error_message)
        task_fail_slack_alert(
            context=context,
            extra_message=error_message,
        )
    client = boto3.client("sagemaker", "us-east-1")
    response = client.describe_auto_ml_job(
        AutoMLJobName=automl_job_name,
    )
    job_status = response.get("AutoMLJobStatus")
    secondary_job_status = response.get("AutoMLJobSecondaryStatus")
    past_job_statuses = set()
    past_secondary_job_statuses = set()
    print(f"Past Job Statuses : {past_job_statuses}")
    print(f"Past Secondary Job Statuses : {past_secondary_job_statuses}")
    # If the job status has not been already seen
    if (
        job_status not in past_job_statuses
        and secondary_job_status not in past_secondary_job_statuses
    ):
        message = f"""
        JobStatus : {job_status}
        JobSecondaryStatus : {secondary_job_status}
        """
        print(message)
        task_success_slack_alert(
            context=context,
            extra_message=message,
        )
        past_job_statuses.add(job_status)
        past_secondary_job_statuses.add(secondary_job_status)
    # If the main job fails
    if job_status == JobStatus.Failed.value:
        error_message = "SageMaker Autopilot Job Failed!"
        task_fail_slack_alert(
            context=context,
            extra_message=error_message,
        )
        raise AirflowFailException(error_message)
    return (
        job_status == JobStatus.Completed.value
        and secondary_job_status == JobSecondaryStatus.Completed.value
    )

args = {
    "owner": "Yudhiesh",
    "start_date": airflow.utils.dates.days_ago(1),
    "schedule_interval": "@once",
    "on_failure_callback": task_fail_slack_alert,
}

with DAG(
    dag_id="02_lasic_retraining_sagemaker_autopilot",
    default_args=args,
    render_template_as_native_obj=True,
) as dag:
    sagemaker_job_status = PythonSensor(
        task_id="sagemaker_job_status",
        python_callable=_sagemaker_job_status,
        templates_dict={
            "automl_job_name": "{{task_instance.xcom_pull(task_ids='train_model_sagemaker_autopilot')}}",  # noqa: E501
        },
    )

    end = DummyOperator(
        task_id="end",
    )

    sagemaker_job_status >> end
I created a similar setup as before, but this time I randomly generated the values from the JobStatus & JobSecondaryStatus enums and tried to only print the values if they are unique, and it turns out it works perfectly. Could anyone explain why this happens and what I can do to the main example to get it to work?
import airflow
import random
from airflow import DAG
from airflow.sensors.python import PythonSensor
from airflow.operators.dummy import DummyOperator
from airflow.exceptions import AirflowFailException
def _mimic_sagemaker_job_status():
    job_statuses = [status.value for status in JobStatus]
    job_secondary_statuses = [
        secondary_status.value for secondary_status in JobSecondaryStatus
    ]
    past_job_statuses = set()
    past_secondary_job_statuses = set()
    job_status = random.choice(job_statuses)
    job_secondary_status = random.choice(job_secondary_statuses)
    if (
        job_status not in past_job_statuses
        and job_secondary_status not in past_secondary_job_statuses
    ):
        message = f"""
        JobStatus : {job_status}
        JobSecondaryStatus : {job_secondary_status}
        """
        # Send alerts on every new job status update
        print(message)
        past_job_statuses.add(job_status)
        past_secondary_job_statuses.add(job_secondary_status)
    if (
        job_status == JobStatus.Failed.value
        or job_secondary_status == JobSecondaryStatus.Failed.value
    ):
        raise AirflowFailException("SageMaker Autopilot Job Failed!")
    return (
        job_secondary_status == JobSecondaryStatus.Completed.value
        and job_status == JobStatus.Completed.value
    )

with DAG(
    dag_id="04_sagemaker_sensor",
    start_date=airflow.utils.dates.days_ago(3),
    schedule_interval="@once",
    render_template_as_native_obj=True,
) as dag:
    wait_for_status = PythonSensor(
        task_id="wait_for_status",
        python_callable=_mimic_sagemaker_job_status,
        dag=dag,
    )

    end = DummyOperator(
        task_id="end",
    )

    wait_for_status >> end
Enums used in the above code:
from enum import Enum


class JobStatus(Enum):
    """
    Enum of all the potential values of a SageMaker Autopilot job status
    """

    Completed = "Completed"
    InProgress = "InProgress"
    Failed = "Failed"
    Stopped = "Stopped"
    Stopping = "Stopping"


class JobSecondaryStatus(Enum):
    """
    Enum of all the potential values of a SageMaker Autopilot job secondary
    status
    """

    Starting = "Starting"
    AnalyzingData = "AnalyzingData"
    FeatureEngineering = "FeatureEngineering"
    ModelTuning = "ModelTuning"
    MaxCandidatesReached = "MaxCandidatesReached"
    Failed = "Failed"
    Stopped = "Stopped"
    MaxAutoMLJobRuntimeReached = "MaxAutoMLJobRuntimeReached"
    Stopping = "Stopping"
    CandidateDefinitionsGenerated = "CandidateDefinitionsGenerated"
    GeneratingExplainabilityReport = "GeneratingExplainabilityReport"
    Completed = "Completed"
    ExplainabilityError = "ExplainabilityError"
    DeployingModel = "DeployingModel"
    ModelDeploymentError = "ModelDeploymentError"
EDIT:
I suppose another workaround for the main example would be to have an operator create a temporary file containing JSON of the set before the SageMaker job status sensor runs; then within the sensor I can check the job statuses saved to the file and only print them if they are unique. I just realised that I can make use of the Airflow database as well.
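For reference, a minimal sketch of the database idea using an Airflow Variable to persist the already-seen statuses between pokes (the Variable name and the helper are illustrative, not part of the original DAG):

import json

from airflow.models import Variable

SEEN_STATUSES_KEY = "sagemaker_autopilot_seen_statuses"  # illustrative name

def _notify_if_new(job_status, secondary_job_status):
    # Load whatever previous pokes recorded; default to empty lists.
    seen = json.loads(
        Variable.get(
            SEEN_STATUSES_KEY,
            default_var='{"JobStatus": [], "JobSecondaryStatus": []}',
        )
    )
    is_new = (
        job_status not in seen["JobStatus"]
        or secondary_job_status not in seen["JobSecondaryStatus"]
    )
    if is_new:
        seen["JobStatus"].append(job_status)
        seen["JobSecondaryStatus"].append(secondary_job_status)
        # Persist back to the metadata database so the next poke sees it.
        Variable.set(SEEN_STATUSES_KEY, json.dumps(seen))
    return is_new

The sensor callable would call this on each poke, and the Variable could be deleted once the job completes or fails.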

So I couldn't seem to get it working as it was, so I resorted to creating a JSON file that stores the different SageMaker Autopilot job statuses, which I read and write to in the PythonSensor.
This takes in the AutoMLJobName from the previous step, creates a temporary file of the job statuses, and returns the AutoMLJobName and the name of the JSON file.
import json
import tempfile

def _create_job_status_json(templates_dict, **context):
    automl_job_name = templates_dict.get("sagemaker_autopilot_data_paths")
    if not automl_job_name:
        error_message = "AutoMLJobName was not passed from upstream"
        print(error_message)
        task_fail_slack_alert(
            context=context,
            extra_message=error_message,
        )
    initial = {
        "JobStatus": [],
        "JobSecondaryStatus": [],
    }
    file = tempfile.NamedTemporaryFile(mode="w", delete=False)
    json.dump({"Status": initial}, file)
    file.flush()
    return (file.name, automl_job_name)
Next, this function reads the JSON file by name and then checks the different job statuses through the boto3 SageMaker client. If the main job fails, the whole run fails. It adds the job statuses to a dictionary if either of them is unique, and once that is done it writes the dictionary back to the JSON file. When the entire job finishes, it sends some details about the best model as a Slack message. It returns True when both job statuses are Completed. Just a note, I also remove the JSON file if the job succeeds or fails.
import json
import os

import airflow
from airflow import DAG
from airflow.exceptions import AirflowFailException
import boto3

def _sagemaker_job_status(templates_dict, **context):
    """
    Checks the SageMaker AutoMLJobStatus and AutoMLJobSecondaryStatus
    for updates and when both are complete the entire process is marked as
    successful
    """
    file_name, automl_job_name = templates_dict.get("automl_job_data")
    job_status_dict = {}
    client = boto3.client("sagemaker", "us-east-1")
    if not client:
        raise AirflowFailException(
            "Unable to get access to boto3 sagemaker client",
        )
    with open(file_name, "r") as json_file:
        response = client.describe_auto_ml_job(
            AutoMLJobName=automl_job_name,
        )
        job_status = response.get("AutoMLJobStatus")
        secondary_job_status = response.get("AutoMLJobSecondaryStatus")
        job_status_dict = json.load(json_file)
        status = job_status_dict.get("Status")
        past_job_statuses = status.get("JobStatus")
        past_secondary_job_statuses = status.get("JobSecondaryStatus")
        if job_status == JobStatus.Failed.value:
            error_message = "SageMaker Autopilot Job Failed!"
            task_fail_slack_alert(
                context=context,
                extra_message=error_message,
            )
            os.remove(file_name)
            raise AirflowFailException(error_message)
        if (
            job_status not in past_job_statuses
            or secondary_job_status not in past_secondary_job_statuses
        ):
            message = f"""
            JobStatus : {job_status}
            JobSecondaryStatus : {secondary_job_status}
            """
            print(message)
            task_success_slack_alert(
                context=context,
                extra_message=message,
            )
            past_job_statuses.append(job_status)
            past_secondary_job_statuses.append(secondary_job_status)
    with open(file_name, "w") as file:
        json.dump(job_status_dict, file)
    if (
        job_status == JobStatus.Completed.value
        and secondary_job_status == JobSecondaryStatus.Completed.value
    ):
        os.remove(file_name)
        response = client.describe_auto_ml_job(
            AutoMLJobName=automl_job_name,
        )
        best_candidate = response.get("BestCandidate")
        best_candidate_id = best_candidate.get("CandidateName")
        best_metric_name = (
            best_candidate.get("FinalAutoMLJobObjectiveMetric")
            .get("MetricName")
            .split(":")[1]
            .upper()
        )
        best_metric_value = round(
            best_candidate.get("FinalAutoMLJobObjectiveMetric").get(
                "Value",
            ),
            3,
        )
        message = f"""
        Best Candidate ID : {best_candidate_id}
        Best Candidate Metric Score : {best_metric_value}{best_metric_name}
        """  # noqa: E501
        task_success_slack_alert(
            context=context,
            extra_message=message,
        )
    return (
        job_status == JobStatus.Completed.value
        and secondary_job_status == JobSecondaryStatus.Completed.value
    )
DAG code:
import airflow
from airflow import DAG
from airflow.operators.python import PythonOperator
from airflow.sensors.python import PythonSensor
args = {
    "owner": "Yudhiesh",
    "start_date": airflow.utils.dates.days_ago(1),
    "schedule_interval": "@once",
    "on_failure_callback": task_fail_slack_alert,
}

with DAG(
    dag_id="02_lasic_retraining_sagemaker_autopilot",
    default_args=args,
    render_template_as_native_obj=True,
) as dag:
    create_job_status_json = PythonOperator(
        task_id="create_job_status_json",
        python_callable=_create_job_status_json,
        templates_dict={
            "sagemaker_autopilot_data_paths": "{{task_instance.xcom_pull(task_ids='train_model_sagemaker_autopilot')}}",  # noqa: E501
        },
    )

    sagemaker_job_status = PythonSensor(
        task_id="sagemaker_job_status",
        python_callable=_sagemaker_job_status,
        templates_dict={
            "automl_job_data": "{{task_instance.xcom_pull(task_ids='create_job_status_json')}}",  # noqa: E501
        },
    )

    # train_model_sagemaker_autopilot is not included but it initiates the training through boto3
    train_model_sagemaker_autopilot >> create_job_status_json
    create_job_status_json >> sagemaker_job_status
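One follow-up worth considering (not part of the original setup): because Autopilot jobs can run for hours, the sensor can be switched to reschedule mode so it releases its worker slot between pokes. A hedged sketch of the same sensor with poke settings added; the intervals are arbitrary:

    sagemaker_job_status = PythonSensor(
        task_id="sagemaker_job_status",
        python_callable=_sagemaker_job_status,
        mode="reschedule",     # free the worker slot between pokes
        poke_interval=300,     # poll the job every 5 minutes
        timeout=12 * 60 * 60,  # fail the sensor after 12 hours
        templates_dict={
            "automl_job_data": "{{task_instance.xcom_pull(task_ids='create_job_status_json')}}",  # noqa: E501
        },
    )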

Related

How to pass a variable from one task to another in airflow

The below code works, but my requirement is to pass totalbuckets as an input to the function as opposed to a global variable. I am having trouble passing it as a variable and doing an xcom_pull in the next task. This DAG basically creates buckets based on the number of inputs, and totalbuckets is a constant. Appreciate your help in advance.
from airflow import DAG
from airflow.operators.python import PythonOperator, BranchPythonOperator
with DAG('test-live', catchup=False, schedule_interval=None, default_args=args) as test_live:
totalbuckets = 3
# branches based on number of buckets
def branch_buckets(**context):
buckets = defaultdict(list)
for i in range(len(inputs_to_process)):
buckets[f'bucket_{(1+i % totalbuckets)}'].append(inputs_to_process[i])
for bucket_name, input_sublist in buckets.items():
context['ti'].xcom_push(key = bucket_name, value = input_sublist)
return list(buckets.keys())
# BranchPythonOperator will launch the buckets and distributes inputs among the buckets
branch_buckets = BranchPythonOperator(
task_id='branch_buckets',
python_callable=branch_buckets,
trigger_rule=TriggerRule.NONE_FAILED,
provide_context=True,
dag=test_live
)
# update provider tables with merge sql
def update_inputs(sf_conn_id, bucket_name, **context):
input_sublist = context['ti'].xcom_pull(task_ids='branch_buckets', key=bucket_name)
print(f"Processing inputs {input_sublist} in {bucket_name}")
from custom.hooks.snowflake_hook import SnowflakeHook
for p in input_sublist:
merge_sql=f"""
merge into ......"""
bucket_tasks = []
for i in range(totalbuckets):
task= PythonOperator(
task_id=f'bucket_{i+1}',
python_callable=update_inputs,
provide_context=True,
op_kwargs={'bucket_name':f'bucket_{i+1}','sf_conn_id': SF_CONN_ID},
dag=test_live
)
bucket_tasks.append(task)
If totalbuckets differs from one run to another, it should be a run conf variable; you can provide it for each run created from the UI, CLI, Airflow REST API or even the Python API.
from airflow import DAG
from airflow.operators.python import PythonOperator, BranchPythonOperator
from airflow.models.param import Param
with DAG(
'test-live',
catchup=False,
schedule_interval=None,
default_args=args,
params={"totalbuckets": Param(default=3, type="integer")},
) as test_live:
# branches based on number of buckets
def branch_buckets(**context):
buckets = defaultdict(list)
for i in range(len(inputs_to_process)):
buckets[f'bucket_{(1+i % int("{{ params.totalbuckets }}"))}'].append(inputs_to_process[i])
for bucket_name, input_sublist in buckets.items():
context['ti'].xcom_push(key = bucket_name, value = input_sublist)
return list(buckets.keys())
# BranchPythonOperator will launch the buckets and distributes inputs among the buckets
branch_buckets = BranchPythonOperator(
task_id='branch_buckets',
python_callable=branch_buckets,
trigger_rule=TriggerRule.NONE_FAILED,
provide_context=True,
dag=test_live
)
# update provider tables with merge sql
def update_inputs(sf_conn_id, bucket_name, **context):
input_sublist = context['ti'].xcom_pull(task_ids='branch_buckets', key=bucket_name)
print(f"Processing inputs {input_sublist} in {bucket_name}")
from custom.hooks.snowflake_hook import SnowflakeHook
for p in input_sublist:
merge_sql=f"""
merge into ......"""
bucket_tasks = []
for i in range(int("{{ params.totalbuckets }}")):
task= PythonOperator(
task_id=f'bucket_{i+1}',
python_callable=update_inputs,
provide_context=True,
op_kwargs={'bucket_name':f'bucket_{i+1}','sf_conn_id': SF_CONN_ID},
dag=test_live
)
bucket_tasks.append(task)
Example to run it:
airflow dags trigger --conf '{"totalbuckets": 10}' test-live
Or via the UI.
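Or, roughly equivalently, through the stable REST API (a sketch; the host, credentials and enabled basic-auth API backend are assumptions):

import requests

# Hypothetical host and credentials; the conf dict becomes the run conf.
requests.post(
    "http://localhost:8080/api/v1/dags/test-live/dagRuns",
    auth=("admin", "admin"),
    json={"conf": {"totalbuckets": 10}},
)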
Update:
And if it's static but differs from one environment to another, it can be an Airflow Variable, read directly in the tasks using Jinja to avoid reading it during every DAG file parse.
But if it's completely static, the most recommended solution is using a Python variable as you do, because reading the dag run conf or an Airflow Variable makes the task/DAG send a query to the database.
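As an illustration of the Variable-via-Jinja idea (the Variable name is made up; op_kwargs is a templated field of PythonOperator, so the value is rendered when the task runs rather than when the DAG file is parsed):

# this fragment would live inside the `with DAG(...)` block
def update_inputs(sf_conn_id, bucket_name, totalbuckets, **context):
    # totalbuckets arrives already rendered from the Jinja template below
    print(f"Processing {bucket_name} of {totalbuckets} buckets")

bucket_task = PythonOperator(
    task_id="bucket_1",
    python_callable=update_inputs,
    op_kwargs={
        "sf_conn_id": "snowflake_default",
        "bucket_name": "bucket_1",
        # reads the Airflow Variable named 'totalbuckets' at task run time
        "totalbuckets": "{{ var.value.totalbuckets }}",
    },
)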
@hussein awala I am doing something like below but cannot parse totalbuckets in bucket_tasks
from airflow.operators.python import PythonOperator, BranchPythonOperator
with DAG('test-live', catchup=False, schedule_interval=None, default_args=args) as test_live:
#totalbuckets = 3
def branch_buckets(totalbuckets, **context):
buckets = defaultdict(list)
for i in range(len(inputs_to_process)):
buckets[f'bucket_{(1+i % totalbuckets)}'].append(inputs_to_process[i])
for bucket_name, input_sublist in buckets.items():
context['ti'].xcom_push(key = bucket_name, value = input_sublist)
return list(buckets.keys())
# BranchPythonOperator will launch the buckets and distributes inputs among the buckets
branch_buckets = BranchPythonOperator(
task_id='branch_buckets',
python_callable=branch_buckets,
trigger_rule=TriggerRule.NONE_FAILED,
provide_context=True, op_kwargs={'totalbuckets':3},
dag=test_live
)
# update provider tables with merge sql
def update_inputs(sf_conn_id, bucket_name, **context):
input_sublist = context['ti'].xcom_pull(task_ids='branch_buckets', key=bucket_name)
print(f"Processing inputs {input_sublist} in {bucket_name}")
from custom.hooks.snowflake_hook import SnowflakeHook
for p in input_sublist:
merge_sql=f"""
merge into ......"""
bucket_tasks = []
for i in range(totalbuckets):
task= PythonOperator(
task_id=f'bucket_{i+1}',
python_callable=update_inputs,
provide_context=True,
op_kwargs={'bucket_name':f'bucket_{i+1}','sf_conn_id': SF_CONN_ID},
dag=test_live
)
bucket_tasks.append(task)

Database Query returning empty in test cases - Django

I have updated the Django version for my project from Django 2.2.16 to Django 3.2.14.
But with this update, some of my test cases are failing and I cannot understand the reason for the failure.
My test-case file:
import json
from os import access
from unittest import mock
from unittest.mock import patch
import asyncio
from app.models import UserProfile
from django.test import TestCase, TransactionTestCase
from requests.models import Response
from services.check_status import check_status
loop = asyncio.get_event_loop()
#mock.patch("services.check_status.save_status")
#mock.patch("services..check_status.send_wss_message")
class TestUserStatus(TransactionTestCase):
def setUp(self):
super().setUp()
self.account_id = 4
self.sim_profile = UserProfile.objects.create()
def test_check_status_completed(
self,
mock_send_wss_message,
mock_save_status,
):
mock_save_status.return_value = {}
send_wss_message_future = asyncio.Future()
send_wss_message_future.set_result(True)
mock_send_wss_message.return_value = send_wss_message_future
loop.run_until_complete(
check_status(
self.sim_profile.id,
)
)
self.assertTrue(mock_save_status.called)
self.assertTrue(mock_send_wss_message.called)
My pseudo check_status file is:
import logging
from app.models import UserProfile, UserStatus
from services.constants import WebsocketGroups
from services.user.constants import USER
from app.api.serializers import UserStatusSerializer
from services.utils import send_wss_message, Client
logger = logging.getLogger(__name__)
def save_status(**kwargs):
status = UserStatus.objects.filter(
status_id=kwargs.get("status_id")
).first()
data = kwargs
user_status_serializer = UserStatusSerializer(status, data, partial=True)
if user_status_serializer.is_valid():
user_status_serializer.save()
async def check_status(
profile_id
):
user_profile = UserProfile.objects.get(id=profile_id)
login_token = get_login_token(user_profile)
user_creds = env["user_api"]
headers = USER["headers"]
subscription_details = Client.get(
USER["url"], headers
)
transaction_status = subscription_details.json()["Status"]
subscription_data = subscription_details.json()["Data"][0]
transaction_status_details = subscription_data["TransactionStatusDetails"]
error_message = ""
status = ""
if transaction_status == "Success":
#perform some actions and save status...
message = {
"type": "user_profile",
"data": [user_profile.id, transaction_status, {"results": {}},],
}
await send_wss_message(
user_profile.id, message=message, group_name=WebsocketGroups.USER_PROFILE,
)
else:
#perform some actions ...
When I run my test-case file it creates the UserProfile object, but when control goes to the check_status function, UserProfile.objects.all() returns <QuerySet []>.
I made a temporary sync function to return a list of all user profiles and called it inside my test_check_status_completed, and it returned the list. But async functions that are called through loop.run_until_complete all returned <QuerySet []>.
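Not part of the original question, but for context: newer Django versions are stricter about touching the ORM from an async context, and a common pattern is to wrap ORM calls with sync_to_async. A sketch only, assuming the same models as above; it may or may not be the fix for this particular test setup:

from asgiref.sync import sync_to_async

from app.models import UserProfile

async def check_status(profile_id):
    # Run the synchronous ORM call in a worker thread instead of
    # executing it directly on the event loop.
    user_profile = await sync_to_async(UserProfile.objects.get)(id=profile_id)
    ...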

Run Multiple Athena Queries in Airflow 2.0

I am trying to create a DAG in which one of the tasks runs an Athena query using boto3. It worked for one query; however, I am facing issues when I try to run multiple Athena queries.
This problem can be broken down as follows:
If one goes through this blog, it can be seen that Athena uses start_query_execution to trigger a query and get_query_execution to get the status, QueryExecutionId and other data about the query (docs for Athena).
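(For reference, the minimal shape of that pattern looks roughly like this; the query, database and bucket are placeholders.)

import boto3

client = boto3.client("athena", region_name="us-east-1")
# start the query and remember its execution id
query_execution_id = client.start_query_execution(
    QueryString="SELECT 1;",
    QueryExecutionContext={"Database": "sampledb"},
    ResultConfiguration={"OutputLocation": "s3://my-bucket/results/"},
)["QueryExecutionId"]
# poll its state until it reaches SUCCEEDED / FAILED / CANCELLED
state = client.get_query_execution(
    QueryExecutionId=query_execution_id,
)["QueryExecution"]["Status"]["State"]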
After following the above pattern I have the following code:
import json
import time
import asyncio
import boto3
import logging
from airflow import DAG
from airflow.operators.python import PythonOperator
def execute_query(client, query, database, output_location):
response = client.start_query_execution(
QueryString=query,
QueryExecutionContext={
'Database': database
},
ResultConfiguration={
'OutputLocation': output_location
}
)
return response['QueryExecutionId']
async def get_ids(client_athena, query, database, output_location):
query_responses = []
for i in range(5):
query_responses.append(execute_query(client_athena, query, database, output_location))
res = await asyncio.gather(*query_responses, return_exceptions=True)
return res
def run_athena_query(query, database, output_location, region_name, **context):
BOTO_SESSION = boto3.Session(
aws_access_key_id = 'YOUR_KEY',
aws_secret_access_key = 'YOUR_ACCESS_KEY')
client_athena = BOTO_SESSION.client('athena', region_name=region_name)
loop = asyncio.get_event_loop()
query_execution_ids = loop.run_until_complete(get_ids(client_athena, query, database, output_location))
loop.close()
repetitions = 900
error_messages = []
s3_uris = []
while repetitions > 0 and len(query_execution_ids) > 0:
repetitions = repetitions - 1
query_response_list = client_athena.batch_get_query_execution(
QueryExecutionIds=query_execution_ids)['QueryExecutions']
for query_response in query_response_list:
if 'QueryExecution' in query_response and \
'Status' in query_response['QueryExecution'] and \
'State' in query_response['QueryExecution']['Status']:
state = query_response['QueryExecution']['Status']['State']
if state in ['FAILED', 'CANCELLED']:
error_reason = query_response['QueryExecution']['Status']['StateChangeReason']
error_message = 'Final state of Athena job is {}, query_execution_id is {}. Error: {}'.format(
state, query_response['QueryExecutionId'], error_reason
)
error_messages.append(error_message)
query_execution_ids.remove(query_response['QueryExecutionId'])
elif state == 'SUCCEEDED':
result_location = query_response['QueryExecution']['ResultConfiguration']['OutputLocation']
s3_uris.append(result_location)
query_execution_ids.remove(query_response['QueryExecutionId'])
time.sleep(2)
logging.exception(error_messages)
return s3_uris
DEFAULT_ARGS = {
'owner': 'ubuntu',
'depends_on_past': True,
'start_date': datetime(2021, 6, 8),
'retries': 0,
'concurrency': 2
}
with DAG('resync_job_dag', default_args=DEFAULT_ARGS, schedule_interval=None) as dag:
ATHENA_QUERY = PythonOperator(
task_id='athena_query',
python_callable=run_athena_query,
provide_context=True,
op_kwargs={
'query': 'SELECT request_timestamp FROM "sampledb"."elb_logs" limit 10;', # query provide in athena tutorial
'database':'sampledb',
'output_location':'YOUR_BUCKET',
'region_name':'YOUR_REGION'
}
)
ATHENA_QUERY
On running the above code, I am getting the following error:
[2021-06-16 20:34:52,981] {taskinstance.py:1455} ERROR - An asyncio.Future, a coroutine or an awaitable is required
Traceback (most recent call last):
File "/home/ubuntu/venv/lib/python3.6/site-packages/airflow/models/taskinstance.py", line 1112, in _run_raw_task
self._prepare_and_execute_task_with_callbacks(context, task)
File "/home/ubuntu/venv/lib/python3.6/site-packages/airflow/models/taskinstance.py", line 1285, in _prepare_and_execute_task_with_callbacks
result = self._execute_task(context, task_copy)
File "/home/ubuntu/venv/lib/python3.6/site-packages/airflow/models/taskinstance.py", line 1315, in _execute_task
result = task_copy.execute(context=context)
File "/home/ubuntu/venv/lib/python3.6/site-packages/airflow/operators/python.py", line 117, in execute
return_value = self.execute_callable()
File "/home/ubuntu/venv/lib/python3.6/site-packages/airflow/operators/python.py", line 128, in execute_callable
return self.python_callable(*self.op_args, **self.op_kwargs)
File "/home/ubuntu/iac-airflow/dags/helper/tasks.py", line 93, in run_athena_query
query_execution_ids = loop.run_until_complete(get_ids(client_athena, query, database, output_location))
File "/usr/lib/python3.6/asyncio/base_events.py", line 484, in run_until_complete
return future.result()
File "/home/ubuntu/iac-airflow/dags/helper/tasks.py", line 79, in get_ids
res = await asyncio.gather(*query_responses, return_exceptions=True)
File "/usr/lib/python3.6/asyncio/tasks.py", line 602, in gather
fut = ensure_future(arg, loop=loop)
File "/usr/lib/python3.6/asyncio/tasks.py", line 526, in ensure_future
raise TypeError('An asyncio.Future, a coroutine or an awaitable is '
TypeError: An asyncio.Future, a coroutine or an awaitable is required
I am unable to see where I am going wrong. I would appreciate a hint about the issue.
I think what you are doing here isn't really needed.
Your issues are:
Executing multiple queries in parallel.
Being able to recover queryExecutionId per query.
Both issues are solved simply by using AWSAthenaOperator. The operator already handles everything you mentioned for you.
Example:
from airflow.models import DAG
from airflow.utils.dates import days_ago
from airflow.operators.dummy import DummyOperator
from airflow.providers.amazon.aws.operators.athena import AWSAthenaOperator

with DAG(
    dag_id="athena",
    schedule_interval='@daily',
    start_date=days_ago(1),
    catchup=False,
) as dag:
    start_op = DummyOperator(task_id="start_task")
    query_list = ["SELECT 1;", "SELECT 2;", "SELECT 3;"]
    for i, sql in enumerate(query_list):
        run_query = AWSAthenaOperator(
            task_id=f'run_query_{i}',
            query=sql,
            output_location='s3://my-bucket/my-path/',
            database='my_database'
        )
        start_op >> run_query
Athena tasks will be created dynamically simply by adding more queries to query_list.
Note that the QueryExecutionId is pushed to XCom, so you can access it in a downstream task if needed.
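For example, a downstream task could read the ids back roughly like this (a sketch; the task ids match the loop above and the range length is an assumption):

from airflow.operators.python import PythonOperator

def collect_execution_ids(**context):
    # AWSAthenaOperator returns the QueryExecutionId, so it lands in XCom
    # under the default 'return_value' key of each run_query_<i> task.
    ids = context["ti"].xcom_pull(task_ids=[f"run_query_{i}" for i in range(3)])
    print(ids)

collect_ids = PythonOperator(
    task_id="collect_execution_ids",
    python_callable=collect_execution_ids,
)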
The following worked for me as well. I had just complicated a simple problem with asyncio.
Since I ultimately needed the S3 URI for each query, I went for writing the script from scratch. With the current implementation of AWSAthenaOperator, one can get the QueryExecutionId and then do the remaining processing (i.e. create another task) to get the S3 URI of the CSV result file. This can add some overhead in terms of delay between the two tasks (getting the QueryExecutionId and retrieving the S3 URI), along with added resource usage.
Therefore I went for doing the complete operation in a single operator as follows:
Code:
import json
import time
import asyncio
import boto3
import logging
from airflow import DAG
from airflow.operators.python import PythonOperator
def execute_query(client, query, database, output_location):
response = client.start_query_execution(
QueryString=query,
QueryExecutionContext={
'Database': database
},
ResultConfiguration={
'OutputLocation': output_location
}
)
return response
def run_athena_query(query, database, output_location, region_name, **context):
BOTO_SESSION = boto3.Session(
aws_access_key_id = 'YOUR_KEY',
aws_secret_access_key = 'YOUR_ACCESS_KEY')
client_athena = BOTO_SESSION.client('athena', region_name=region_name)
query_execution_ids = []
if message_list:
for parameter in message_list:
query_response = execute_query(client_athena, query, database, output_location)
query_execution_ids.append(query_response['QueryExecutionId'])
else:
raise Exception(
'Error in upstream value recived from kafka consumer. Got message list as - {}, with type {}'
.format(message_list, type(message_list))
)
repetitions = 900
error_messages = []
s3_uris = []
while repetitions > 0 and len(query_execution_ids) > 0:
repetitions = repetitions - 1
query_response_list = client_athena.batch_get_query_execution(
QueryExecutionIds=query_execution_ids)['QueryExecutions']
for query_response in query_response_list:
if 'QueryExecution' in query_response and \
'Status' in query_response['QueryExecution'] and \
'State' in query_response['QueryExecution']['Status']:
state = query_response['QueryExecution']['Status']['State']
if state in ['FAILED', 'CANCELLED']:
error_reason = query_response['QueryExecution']['Status']['StateChangeReason']
error_message = 'Final state of Athena job is {}, query_execution_id is {}. Error: {}'.format(
state, query_response['QueryExecutionId'], error_reason
)
error_messages.append(error_message)
query_execution_ids.remove(query_response['QueryExecutionId'])
elif state == 'SUCCEEDED':
result_location = query_response['QueryExecution']['ResultConfiguration']['OutputLocation']
s3_uris.append(result_location)
query_execution_ids.remove(query_response['QueryExecutionId'])
time.sleep(2)
logging.exception(error_messages)
return s3_uris
DEFAULT_ARGS = {
'owner': 'ubuntu',
'depends_on_past': True,
'start_date': datetime(2021, 6, 8),
'retries': 0,
'concurrency': 2
}
with DAG('resync_job_dag', default_args=DEFAULT_ARGS, schedule_interval=None) as dag:
ATHENA_QUERY = PythonOperator(
task_id='athena_query',
python_callable=run_athena_query,
provide_context=True,
op_kwargs={
'query': 'SELECT request_timestamp FROM "sampledb"."elb_logs" limit 10;', # query provide in athena tutorial
'database':'sampledb',
'output_location':'YOUR_BUCKET',
'region_name':'YOUR_REGION'
}
)
ATHENA_QUERY
However, the approach shared by @Elad is cleaner and more apt if one wants to get the QueryExecutionIds of all the queries.

Unable to import module - Lambda handler Error

When I am trying to run my Python code in Lambda, passing the handler as function.module, I am getting the below error. Any suggestions on how I could resolve this?
The below file test_client_visitor is triggered to call client_visitor and send an email to the clients accordingly. When I run the Python file test_client_visitor locally, the email is triggered successfully, but in Lambda I am facing the issue.
file_name: test_client_visitor
import unittest

import jsonpickle

function = __import__('client_visitor')
handler = function.scan_clients
class TestFunction(unittest.TestCase):
def test_function(self):
file = open('event.json', 'rb')
try:
ba = bytearray(file.read())
event = jsonpickle.decode(ba)
print('## EVENT')
print(jsonpickle.encode(event))
context = {'requestid': '1234'}
result = handler(event, context)
print(result)
self.assertTrue(result, 'Emails could not be sent!')
finally:
file.close()
file.close()
if __name__ == '__main__':
unittest.main()
file_name: client_visitor.py
import datetime
import boto3
from aws_ses import send_bulk_templated_email
# boto3.set_stream_logger('botocore', level='DEBUG')
from mongodb import get_mongo_db
def process_clients(clients, developers, clients_to_be_notified, days):
if not clients:
pass
check_date = datetime.datetime.now() + datetime.timedelta(days)
for client in clients:
client_id_ = client['client_id']
if 'developer_id' in client:
developers[client_id_] = client['developer_id']
else:
if 'secrets' in client:
secrets = client['secrets']
for secret in secrets:
if 'not_on_or_after' in secret and secret['not_on_or_after'] < check_date.timestamp():
clients_to_be_notified.append({'client_id': client_id_,
'expiration_date': datetime.datetime.fromtimestamp(
secret['not_on_or_after']).strftime('%m/%d/%Y')})
print("adding client to notify List", client_id_, ":", client['sort'])
def notify_clients(clients_to_be_notified, developers):
developer_id_list = []
for client_secret in clients_to_be_notified:
developer_id_list.append(developers[client_secret['client_id']])
if developer_id_list:
db = get_mongo_db()
if db:
users = list(db.users.find({'guid': {'$in': developer_id_list}}, {'email', 'guid'}))
need_to_send_email = False
for user in users:
for client_secret in clients_to_be_notified:
if developers[client_secret['client_id']] == user['guid']:
client_secret['email'] = user['email']
need_to_send_email = True
break
if need_to_send_email:
return send_bulk_templated_email(clients_to_be_notified)
else:
return False
return True
def scan_clients(event, context):
local = False
if 'local' in event:
local = event['local'] == 'True'
if local:
dynamodb = boto3.resource('dynamodb', endpoint_url="http://localhost:8000")
else:
dynamodb = boto3.resource('dynamodb')
days = 30
if 'days' in event:
days = int(event['days'])
print(f"Scanning Clients with {days} or less to secret expiration")
table = dynamodb.Table('****')
scan_kwargs = {
'ProjectionExpression': 'client_id, sort, developer_id, secrets, approved'
}
test = False
if 'test' in event:
test = event['test'] == 'True'
done = False
start_key = None
developers = {}
clients_to_be_notified = []
if test:
developers['idm-portal1'] = '***'
clients_to_be_notified = [{'client_id': 'idm-portal1', 'expiration_date': '04/17/2021'}]
while not done:
if start_key:
scan_kwargs['ExclusiveStartKey'] = start_key
response = table.scan(**scan_kwargs)
process_clients(response.get('Items', []), developers, clients_to_be_notified, days)
start_key = response.get('LastEvaluatedKey', None)
done = start_key is None
print("total developers ", len(developers), " total clients_to_be_notified ", len(clients_to_be_notified))
return notify_clients(clients_to_be_notified, developers)
if __name__ == '__main__':
scan_clients(event={'days': 30, 'local': False, 'test': True}, context=None)
Response
{
"errorMessage": "Unable to import module 'test_client_visitor': No module named 'test_client_visitor'",
"errorType": "Runtime.ImportModuleError",
"stackTrace": []
}
Your file must be named test_client_visitor.py. The way lambda runs the code is by trying to import the main file and call the handler function. See the AWS docs to set up a handler for Python.
The reason you didn't run into this issue locally is because I assume you are calling python directly on the command line — python test_client_visitor. When you import a module in Python, the file has to end in the .py extension.
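To make that concrete, if test_client_visitor.py sits at the root of the deployment package, the runtime handler setting would presumably be the following (illustrative; it points at the module-level handler variable defined in the question):

Handler: test_client_visitor.handler
(i.e. <file name without .py>.<module-level callable taking (event, context)>)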
I was able to fix this issue with the right packaging of the contents into the zip, avoiding the creation of an extra folder, with the below command.
Command:
cd folder; zip -r ../filename.zip *
Thank you everyone for your inputs.

How to handle incoming PubSub messages in Python?

I have created a Cloud Compute Engine instance on Debian, and have successfully created a PUSH subscription to a topic with
from google.cloud import pubsub_v1
project_id = "censored"
topic_name = "censored"
subscription_name = "censored"
endpoint = "https://censored.appspot.com/pubsub/push?token=censored"
def create_push_subscription(project_id,
topic_name,
subscription_name,
endpoint):
"""Create a new push subscription on the given topic."""
# [START pubsub_create_push_subscription]
subscriber = pubsub_v1.SubscriberClient()
topic_path = subscriber.topic_path(project_id, topic_name)
subscription_path = subscriber.subscription_path(
project_id, subscription_name)
push_config = pubsub_v1.types.PushConfig(
push_endpoint=endpoint)
subscription = subscriber.create_subscription(
subscription_path, topic_path, push_config)
print('Push subscription created: {}'.format(subscription))
print('Endpoint for subscription is: {}'.format(endpoint))
# [END pubsub_create_push_subscription]
create_push_subscription(project_id, topic_name, subscription_name, endpoint)
but I'm not sure how exactly incoming messages arrive. I have found this sample code to parse messages, but I'm not sure how to get it to monitor in the background and 'activate' whenever incoming messages arrive.
import argparse
import base64
import json
import sys
import time
from google.cloud import pubsub_v1
def summarize(message):
# [START parse_message]
data = message.data.decode('utf-8')
attributes = message.attributes
name = attributes['name']
time_created = attributes['timeCreated']
bucket_id = attributes['bucketId']
object_id = attributes['objectId']
generation = attributes['objectGeneration']
description = (
'\tName: {name}\n'
'\tTime Created: {time_created}\n'
'\tBucket ID: {bucket_id}\n'
'\tObject ID: {object_id}\n'
'\tGeneration: {generation}\n'
).format(
name=name,
time_created=time_created,
bucket_id=bucket_id,
object_id=object_id,
generation=generation
)
if 'overwroteGeneration' in attributes:
description += '\tOverwrote generation: %s\n' % (
attributes['overwroteGeneration'])
if 'overwrittenByGeneration' in attributes:
description += '\tOverwritten by generation: %s\n' % (
attributes['overwrittenByGeneration'])
payload_format = attributes['payloadFormat']
if payload_format == 'JSON_API_V1':
object_metadata = json.loads(data)
name = object_metadata['name']
time_created = object_metadata['timeCreated']
size = object_metadata['size']
content_type = object_metadata['contentType']
metageneration = object_metadata['metageneration']
description += (
'\tName: {name}\n'
'\tTime Created: {time_created}\n'
'\tContent type: {content_type}\n'
'\tSize: {object_size}\n'
'\tMetageneration: {metageneration}\n'
).format(
name=name,
time_created=time_created,
content_type=content_type,
object_size=size,
metageneration=metageneration
)
return description
print('Note for developer: If BucketId and ObjectId listed, utf encoding.')
print('If not, JSON_V1 encoding. Adjust accordingly.')
# [END parse_message]
while(True):
print("signpost 1")
summarize(message)
print("signpost 2")
time.sleep(10)
print("signpost 3")
For example, this code will return
NameError: name 'message' is not defined
which is expected...
Could someone please help me set it up properly?
I know it's different in PULL because then the message will be defined during the pull, but I'd like to keep it as PUSH, if possible.
You need to create a long-running process which is either able to continuously poll for new messages (pull subscription) or have a reachable endpoint to receive new messages (push subscription).
See the example here: https://github.com/GoogleCloudPlatform/python-docs-samples/blob/master/pubsub/cloud-client/subscriber.py, as well as the differences between push and pull here: https://cloud.google.com/pubsub/docs/subscriber
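As a rough illustration of the push side (not from the linked samples): the endpoint you registered receives an HTTPS POST whose JSON body wraps the Pub/Sub message, so a minimal handler might look like this sketch, with Flask, the route and the port as assumptions:

import base64

from flask import Flask, request

app = Flask(__name__)

@app.route("/pubsub/push", methods=["POST"])
def pubsub_push():
    envelope = request.get_json()
    message = envelope["message"]
    # The payload is base64-encoded; attributes arrive as a plain dict.
    data = base64.b64decode(message.get("data", "")).decode("utf-8")
    attributes = message.get("attributes", {})
    print(f"Received data: {data}, attributes: {attributes}")
    # Returning a 2xx acknowledges the message; anything else causes a retry.
    return ("", 204)

if __name__ == "__main__":
    app.run(host="0.0.0.0", port=8080)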
