'S3Hook' object has no attribute 'download_file' in AIRFLOW DAG - python

[2023-02-05 11:32:43,293] {{taskinstance.py:887}} INFO - Executing <Task(PythonOperator): download_from_s3> on 2023-02-05T11:32:34.016335+00:00
[2023-02-05 11:32:43,299] {{standard_task_runner.py:53}} INFO - Started process 87503 to run task
[2023-02-05 11:32:43,474] {{logging_mixin.py:112}} INFO - Running %s on host %s <TaskInstance: s3_download.download_from_s3 2023-02-05T11:32:34.016335+00:00 [running]> 67c7842be21b
[2023-02-05 11:32:43,555] {{taskinstance.py:1128}} ERROR - 'S3Hook' object has no attribute 'download_file'
Traceback (most recent call last):
File "/usr/local/lib/python3.7/site-packages/airflow/models/taskinstance.py", line 966, in _run_raw_task
result = task_copy.execute(context=context)
File "/usr/local/lib/python3.7/site-packages/airflow/operators/python_operator.py", line 113, in execute
return_value = self.execute_callable()
File "/usr/local/lib/python3.7/site-packages/airflow/operators/python_operator.py", line 118, in execute_callable
return self.python_callable(*self.op_args, **self.op_kwargs)
File "/usr/local/airflow/dags/dwnld_frm_awss3.py", line 12, in download_from_s3
file_name = hook.download_file(key=key, bucket_name=bucket_name, local_path=local_path)
AttributeError: 'S3Hook' object has no attribute 'download_file'
[2023-02-05 11:32:43,570] {{taskinstance.py:1185}} INFO - Marking task as FAILED.dag_id=s3_download, task_id=download_from_s3, execution_date=20230205T113234, start_date=20230205T113243, end_date=20230205T113243
I am getting an error on download_file.
My code is:
import os
from datetime import datetime
from airflow.models import DAG
from airflow.operators.python_operator import PythonOperator
from airflow.hooks.S3_hook import S3Hook
from airflow.contrib.hooks.aws_hook import AwsHook

# Function of the DAG
def download_from_s3(key: str, bucket_name: str, local_path: str) -> str:
    hook = S3Hook('my_conn_S3')
    file_name = hook.download_file(key=key, bucket_name=bucket_name, local_path=local_path)
    return file_name

with DAG(
    dag_id='s3_download',
    schedule_interval='@daily',
    start_date=datetime(2023, 2, 4),
    catchup=False
) as dag:
    task_download_from_s3 = PythonOperator(
        task_id='download_from_s3',
        python_callable=download_from_s3,
        op_kwargs={
            'key': 'sample.txt',
            'bucket_name': 'airflow-sample-s3-bucket',
            'local_path': '/usr/local/airflow/'
        }
    )

The imports suggest that you are using an older version of Airflow.
You should install the Amazon backport provider and then import the hook as from airflow.providers.amazon.aws.hooks.s3 import S3Hook.
Note that Airflow 1.10 has been end-of-life for 2+ years; you should upgrade your Airflow version as soon as possible. To upgrade Airflow you can follow this guide.
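A minimal sketch of the callable after switching to the provider-package import (assuming the backport provider package, e.g. apache-airflow-backport-providers-amazon, is installed; whether download_file accepts a local_path argument depends on the provider version):

# Provider-package hook; the legacy airflow.hooks.S3_hook.S3Hook in
# Airflow 1.10 does not have a download_file method.
from airflow.providers.amazon.aws.hooks.s3 import S3Hook

def download_from_s3(key: str, bucket_name: str, local_path: str) -> str:
    hook = S3Hook('my_conn_S3')
    # Older provider releases download to a temporary file and return its name;
    # newer releases also accept a local_path argument.
    file_name = hook.download_file(key=key, bucket_name=bucket_name)
    return file_name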

Related

Airflow triggers Sagemaker job in test mode

I have an Airflow (v1.10.12) DAG that triggers a Sagemaker Processor job as part of one of its tasks. I've written a few tests (pytest 6.2.2) to check the basic sanity of the DAG.
It seems like just fetching the DAG by id from the DagBag triggers a Sagemaker job, i.e. when I run pytest test_file_name.py, a job is triggered, which isn't ideal.
from airflow.models import DagBag

class TestSagemakerDAG:
    @classmethod
    def setup(cls):
        cls.dagbag = DagBag()
        cls.dag = DagBag().get_dag(dag_id='sagemaker-processor')

    def test_dag_loaded(self):
        """
        To verify that the DAGs are loaded into the DagBag
        :return:
        """
        assert self.dagbag.import_errors == {}
        assert self.dag is not None
        assert len(self.dag.tasks) == 2
For more clarity, this is how the Sagemaker Processor job (sagemaker 2.24.1) definition looks:
def initiate_sage_maker_job(self, session):
    return Processor(
        image_uri=self.ecr_uri,
        role=self.iam_role,
        instance_count=self.instance_count,
        instance_type=self.instance_type,
        base_job_name=self.processor_name,
        sagemaker_session=session,
    ).run()
And the boto3(v 1.16.63) session is generated as
def get_session(self):
    boto_session = boto3.session.Session()
    client = boto_session.client('sagemaker', region_name=self.region)
    session = sagemaker.session.Session(boto_session=boto_session, sagemaker_client=client)
    return session
Finally, the Dag itself
from airflow import DAG
from airflow.operators.bash_operator import BashOperator
from airflow.operators.python_operator import PythonOperator

with DAG('sagemaker-processor',
         default_args=default_args,
         schedule_interval='@hourly',
         ) as dag:
    t1 = BashOperator(
        task_id='print_current_date',
        bash_command='date'
    )
    t2 = PythonOperator(
        task_id='sagemaker_trigger', python_callable=initiate_sage_maker_job()
    )
    t1 >> t2
I'm just trying to import DAGs from a folder, check for import errors, and check the upstream and downstream lists.
On a side note, I've made sure the DAG is turned off in the Airflow UI and that I haven't run airflow scheduler to start queueing tasks. It's really just a standard test I want to execute using pytest.
The issue pops up as follows
Job Name: airflow-ecr-test-2021-02-26-21-10-39-935
Inputs: []
Outputs: []
[2021-02-26 16:10:39,935] {session.py:854} INFO - Creating processing-job with name airflow-ecr-test-2021-02-26-21-10-39-935
Traceback (most recent call last):
File "<input>", line 1, in <module>
File "/Applications/PyCharm.app/Contents/plugins/python/helpers/pydev/_pydev_bundle/pydev_umd.py", line 197, in runfile
pydev_imports.execfile(filename, global_vars, local_vars) # execute the script
File "/Applications/PyCharm.app/Contents/plugins/python/helpers/pydev/_pydev_imps/_pydev_execfile.py", line 18, in execfile
exec(compile(contents+"\n", file, 'exec'), glob, loc)
File "/Users/Ajeya.Kempegowda/dags/sample_mwaa.py", line 31, in <module>
initiate_sage_maker_job()
File "/Users/Ajeya.Kempegowda/anaconda3/envs/airflow/lib/python3.7/site-packages/sagemaker/processing.py", line 180, in run
experiment_config=experiment_config,
File "/Users/Ajeya.Kempegowda/anaconda3/envs/airflow/lib/python3.7/site-packages/sagemaker/processing.py", line 695, in start_new
processor.sagemaker_session.process(**process_args)
File "/Users/Ajeya.Kempegowda/anaconda3/envs/airflow/lib/python3.7/site-packages/sagemaker/session.py", line 856, in process
self.sagemaker_client.create_processing_job(**process_request)
File "/Users/Ajeya.Kempegowda/anaconda3/envs/airflow/lib/python3.7/site-packages/botocore/client.py", line 357, in _api_call
return self._make_api_call(operation_name, kwargs)
File "/Users/Ajeya.Kempegowda/anaconda3/envs/airflow/lib/python3.7/site-packages/botocore/client.py", line 676, in _make_api_call
raise error_class(parsed_response, operation_name)
botocore.exceptions.ClientError: An error occurred (ExpiredTokenException) when calling the CreateProcessingJob operation: The security token included in the request is expired
The error displayed says the token is expired, but the real problem is that a processing job is being created at all.
Is there something obvious I'm missing while testing Airflow? My understanding is that the Airflow scheduler should only queue up DAGs, and tasks should be triggered only when told to execute (turning the DAG on in the Airflow UI/CLI).
Any help would be appreciated. Thanks!
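One thing the traceback makes visible: the job starts from sample_mwaa.py at import time, because python_callable=initiate_sage_maker_job() calls the function while the DAG file is being parsed, and DagBag() parses every DAG file. PythonOperator expects a callable object, not its return value. A hedged sketch of the difference (names taken from the DAG above):

# Runs the processor job whenever the DAG file is imported (e.g. by DagBag()):
#   t2 = PythonOperator(task_id='sagemaker_trigger',
#                       python_callable=initiate_sage_maker_job())

# Defers the call until the task instance actually executes:
t2 = PythonOperator(
    task_id='sagemaker_trigger',
    python_callable=initiate_sage_maker_job,  # note: no parentheses
)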

rqscheduler Docker container is stopping with a time mismatch error

I have created 3 Docker containers in the same network:
redis queue
rq scheduler
Python-based container
The error occurs when Redis tries to schedule the task on the scheduler.
docker ps output
b18b7d21894f redis "docker-entrypoint.s…" 27 minutes ago Up 27 minutes 6379/tcp test_redis_1
140a7c31b87d python "python3" 13 hours ago Up 13 hours pyRed5
55dc5bcd3f57 anarchy/rq-scheduler "rqscheduler --host …" 27 minutes ago Exited (1) 13 minutes ago boring_bohr
I am trying to schedule the periodic task.
File iss.py
from rq_scheduler import Scheduler
from redis import Redis
from datetime import datetime, timedelta, timezone
import pytz
import mail

scheduler = Scheduler(connection=Redis("test_redis_1"))

def get_next_pass():
    x = datetime.now() + timedelta(minutes=1)
    return x.replace(tzinfo=timezone.utc)
    # .strftime("%Y-%m-%dT%H:%M:%SZ")

def send_text_message(time):
    mail.mail()
    scheduler.enqueue_at(time+100, iss.send_text_message, time+100)
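As an aside, enqueue_at takes a datetime as its first argument, so an offset from an existing datetime needs a timedelta rather than a bare integer (a sketch of the re-scheduling call above, assuming a 100-second offset was intended):

from datetime import timedelta

def send_text_message(time):
    mail.mail()
    # The original `time + 100` adds an int to a datetime and raises TypeError;
    # offsets from a datetime are expressed with timedelta.
    next_time = time + timedelta(seconds=100)
    scheduler.enqueue_at(next_time, send_text_message, next_time)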
File scheduler.py
from datetime import datetime
from redis import Redis
from rq_scheduler import Scheduler
import iss

scheduler = Scheduler(connection=Redis("test_redis_1"))  # Get a scheduler for the "default" queue
next_pass = iss.get_next_pass()
if next_pass:
    print(next_pass)
    next_pass
    print("reached here")
    scheduler.enqueue_at(next_pass, iss.send_text_message, next_pass)
I am calling scheduler.py from the Python container. The task reaches RQ, but it fails in the rq scheduler with the error below:
root@healthbot-build-vm1:~/redis# docker logs 55dc5bcd3f57
19:09:55 Running RQ scheduler...
19:09:55 Checking for scheduled jobs...
19:10:55 Checking for scheduled jobs...
19:11:55 Checking for scheduled jobs...
19:12:55 Checking for scheduled jobs...
19:13:55 Checking for scheduled jobs...
19:14:55 Checking for scheduled jobs...
19:15:56 Checking for scheduled jobs...
19:16:56 Checking for scheduled jobs...
19:17:56 Checking for scheduled jobs...
19:18:56 Checking for scheduled jobs...
19:19:56 Checking for scheduled jobs...
19:20:56 Checking for scheduled jobs...
19:21:56 Checking for scheduled jobs...
19:22:56 Checking for scheduled jobs...
19:23:56 Checking for scheduled jobs...
Traceback (most recent call last):
File "/usr/local/lib/python3.5/site-packages/rq/utils.py", line 164, in utcparse
return datetime.datetime.strptime(string, '%Y-%m-%dT%H:%M:%SZ')
File "/usr/local/lib/python3.5/_strptime.py", line 510, in _strptime_datetime
tt, fraction = _strptime(data_string, format)
File "/usr/local/lib/python3.5/_strptime.py", line 343, in _strptime
(data_string, format))
ValueError: time data '2021-01-14T19:22:07.242474Z' does not match format '%Y-%m-%dT%H:%M:%SZ'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/usr/local/bin/rqscheduler", line 11, in <module>
sys.exit(main())
File "/usr/local/lib/python3.5/site-packages/rq_scheduler/scripts/rqscheduler.py", line 53, in main
scheduler.run(burst=args.burst)
File "/usr/local/lib/python3.5/site-packages/rq_scheduler/scheduler.py", line 340, in run
self.enqueue_jobs()
File "/usr/local/lib/python3.5/site-packages/rq_scheduler/scheduler.py", line 322, in enqueue_jobs
jobs = self.get_jobs_to_queue()
File "/usr/local/lib/python3.5/site-packages/rq_scheduler/scheduler.py", line 271, in get_jobs_to_queue
return self.get_jobs(to_unix(datetime.utcnow()), with_times=with_times)
File "/usr/local/lib/python3.5/site-packages/rq_scheduler/scheduler.py", line 254, in get_jobs
job = Job.fetch(job_id, connection=self.connection)
File "/usr/local/lib/python3.5/site-packages/rq/job.py", line 294, in fetch
job.refresh()
File "/usr/local/lib/python3.5/site-packages/rq/job.py", line 410, in refresh
self.created_at = to_date(as_text(obj.get('created_at')))
File "/usr/local/lib/python3.5/site-packages/rq/job.py", line 403, in to_date
return utcparse(as_text(date_str))
File "/usr/local/lib/python3.5/site-packages/rq/utils.py", line 167, in utcparse
return datetime.datetime.strptime(string, '%Y-%m-%dT%H:%M:%S.%f+00:00')
File "/usr/local/lib/python3.5/_strptime.py", line 510, in _strptime_datetime
tt, fraction = _strptime(data_string, format)
File "/usr/local/lib/python3.5/_strptime.py", line 343, in _strptime
(data_string, format))
ValueError: time data '2021-01-14T19:22:07.242474Z' does not match format '%Y-%m-%dT%H:%M:%S.%f+00:00'
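For what it's worth, both format strings in the traceback come from rq's utcparse, and the stored timestamp '2021-01-14T19:22:07.242474Z' matches neither of them, which suggests the rq version baked into the anarchy/rq-scheduler image differs from the one in the Python container that wrote the job. A way to compare the two (assuming pip is present in both images; container and image names are taken from the docker ps output above):

docker exec pyRed5 pip show rq rq-scheduler
docker run --rm --entrypoint pip anarchy/rq-scheduler show rq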

Airflow PythonVirtualenvOperator, No such file or directory: 'virtualenv'

I am trying to run the Apache Airflow PythonVirtualenvOperator in one of my DAGs but Airflow is throwing the following error:
[2020-12-14 20:06:32,291] {python_operator.py:316} INFO - Executing cmd
['virtualenv', '/tmp/venvwtqb3rki', '--python=python3.8']
[2020-12-14 20:06:32,301] {taskinstance.py:1150} ERROR - [Errno 2] No such file or directory: 'virtualenv'
Traceback (most recent call last):
File "/opt/airflow/airflow_env/lib/python3.8/site-packages/airflow/models/taskinstance.py", line 984, in _run_raw_task
result = task_copy.execute(context=context)
File "/opt/airflow/airflow_env/lib/python3.8/site-packages/airflow/operators/python_operator.py", line 113, in execute
return_value = self.execute_callable()
File "/opt/airflow/airflow_env/lib/python3.8/site-packages/airflow/operators/python_operator.py", line 292, in execute_callable
self._execute_in_subprocess(self._generate_virtualenv_cmd(tmp_dir))
File "/opt/airflow/airflow_env/lib/python3.8/site-packages/airflow/operators/python_operator.py", line 317, in _execute_in_subprocess
output = subprocess.check_output(cmd,
File "/usr/lib/python3.8/subprocess.py", line 411, in check_output
return run(*popenargs, stdout=PIPE, timeout=timeout, check=True,
File "/usr/lib/python3.8/subprocess.py", line 489, in run
with Popen(*popenargs, **kwargs) as process:
File "/usr/lib/python3.8/subprocess.py", line 854, in __init__
self._execute_child(args, executable, preexec_fn, close_fds,
File "/usr/lib/python3.8/subprocess.py", line 1702, in _execute_child
raise child_exception_type(errno_num, err_msg, err_filename)
I have Airflow and all my DAGs running as an airflow user. I thought maybe Airflow can't find the virtualenv command in its path during task setup/execution.
Here is the code that I have in place currently to test.
import logging
import datetime
from airflow import DAG
import airflow
from airflow.hooks.S3_hook import S3Hook
from airflow.contrib.hooks import aws_hook
from airflow.models import Variable
from airflow.operators.python_operator import PythonOperator, PythonVirtualenvOperator
from airflow.utils.dates import days_ago
import time

default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': days_ago(2),
    'retries': 0
}

dag = DAG(
    dag_id='list_reqestor',
    default_args=default_args,
    catchup=False,
    schedule_interval=None
)

def setup_driver(ti):
    from libcloud.compute.types import Provider
    from libcloud.compute.providers import get_driver
    """
    Sets up Apache libcloud AWS ec2 node driver.
    Args:
        region: AWS region to perform credential check.
    """
    try:
        shopper_logger.info("Setting up node deployment driver.")
        region = Variable.get('REGION')
        cls = get_driver(Provider.EC2)
        a_hook = aws_hook.AwsHook()
        driver = cls(creds, region=region)
        ti.xcom_push(XCOM_REQUESTOR_LIB_CLOUD_DRIVER, driver)
        time.sleep(30)

setup_driver_task = PythonVirtualenvOperator(
    task_id='setup_driver_task',
    python_callable=setup_driver,
    retries=0,
    requirements=['apache-libcloud'],
    python_version="3.8",
    system_site_packages=False,
    provide_context=True,
    xcom_push=True,
    dag=dag
)

setup_driver
I am not sure what I am missing.
Most likely it is due to the lack of virtualenv in your Airflow environment.
You can check (while in the same environment as Airflow) with virtualenv --version in your terminal.
If it does not find virtualenv, just install it with:
pip install virtualenv
and it should work.
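A quick way to confirm what this answer describes, from the same interpreter and user that run the Airflow tasks (a sketch; run it in a Python shell or a throwaway task):

import shutil

# PythonVirtualenvOperator shells out to the `virtualenv` command (see the
# "Executing cmd ['virtualenv', ...]" log line above), so the command must be
# resolvable on the PATH of the user running the worker.
print(shutil.which("virtualenv"))  # None means the command is not on this user's PATH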

AttributeError when I'm running Celery worker

I'm trying to use Celery for background processing in my Django application. The Django version is 1.4.8 and the latest suitable Celery version is 3.1.25.
I use Redis (redis-py 3.1.0) as broker and result backend, and JSON as the serializer.
When I start the worker with
celery -A celery_app worker -l info
I get AttributeError: 'unicode' object has no attribute 'iteritems'.
My settings.py file:
BROKER_URL = 'redis://localhost'
CELERY_RESULT_BACKEND = 'redis://localhost/'
CELERY_ACCEPT_CONTENT = ['json']
CELERY_TASK_SERIALIZER = 'json'
CELERY_RESULT_SERIALIZER = 'json'
CELERY_STORE_ERRORS_EVEN_IF_IGNORED = True
celery_app.py:
import os
import sys
from django.conf import settings
from celery import Celery

project_root = os.path.dirname(__file__)
sys.path.insert(0, os.path.join(project_root, '../env'))
sys.path.insert(0, os.path.join(project_root, '../'))
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'project.settings')

app = Celery('project')
app.config_from_object('project.settings')
app.autodiscover_tasks(lambda: settings.INSTALLED_APPS, force=True)
tasks.py:
@celery_app.task
def sample_task(x):
    return 'Test response'
and that's how I run this task:
sample_task.delay({'key': 'test'})
And I get the following error:
File "/Users/user/project/venv/lib/python2.7/site-packages/redis/_compat.py", line 94, in iteritems
return x.iteritems()
AttributeError: 'unicode' object has no attribute 'iteritems'
full traceback:
[2019-01-31 16:43:08,909: ERROR/MainProcess] Unrecoverable error: AttributeError("'unicode' object has no attribute 'iteritems'",)
Traceback (most recent call last):
File "/Users/user/project/venv/lib/python2.7/site-packages/celery/worker/__init__.py", line 206, in start
self.blueprint.start(self)
File "/Users/user/project/venv/lib/python2.7/site-packages/celery/bootsteps.py", line 123, in start
step.start(parent)
File "/Users/user/project/venv/lib/python2.7/site-packages/celery/bootsteps.py", line 374, in start
return self.obj.start()
File "/Users/user/project/venv/lib/python2.7/site-packages/celery/worker/consumer.py", line 280, in start
blueprint.start(self)
File "/Users/user/project/venv/lib/python2.7/site-packages/celery/bootsteps.py", line 123, in start
step.start(parent)
File "/Users/user/project/venv/lib/python2.7/site-packages/celery/worker/consumer.py", line 884, in start
c.loop(*c.loop_args())
File "/Users/user/project/venv/lib/python2.7/site-packages/celery/worker/loops.py", line 76, in asynloop
next(loop)
File "/Users/user/project/venv/lib/python2.7/site-packages/kombu/async/hub.py", line 340, in create_loop
cb(*cbargs)
File "/Users/user/project/venv/lib/python2.7/site-packages/kombu/transport/redis.py", line 1019, in on_readable
self._callbacks[queue](message)
File "/Users/user/project/venv/lib/python2.7/site-packages/kombu/transport/virtual/__init__.py", line 534, in _callback
self.qos.append(message, message.delivery_tag)
File "/Users/user/project/venv/lib/python2.7/site-packages/kombu/transport/redis.py", line 146, in append
pipe.zadd(self.unacked_index_key, delivery_tag, time()) \
File "/Users/user/project/venv/lib/python2.7/site-packages/redis/client.py", line 2320, in zadd
for pair in iteritems(mapping):
File "/Users/user/project/venv/lib/python2.7/site-packages/redis/_compat.py", line 94, in iteritems
return x.iteritems()
AttributeError: 'unicode' object has no attribute 'iteritems'
I tried to find the issue on the internet and tried passing different params to the task. I don't know how to debug the Celery process and could not find the solution by myself. Please help me.
It seems that this Celery version doesn't support redis-py 3.x. Try installing redis 2.10.6 instead.
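To double-check which redis-py the worker's virtualenv actually has, and to pin the 2.x release suggested above (package name as on PyPI):

import redis

# Celery 3.1 / kombu's Redis transport still call zadd() with the pre-3.0
# argument order, which redis-py 3.x rejects (hence the iteritems error above).
print(redis.VERSION)       # e.g. (3, 1, 0)
print(redis.__version__)   # e.g. '3.1.0'

# Downgrade inside the same virtualenv:
#   pip install "redis==2.10.6"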

Apache Airflow SqlSensor MSSQL issues on Docker running on Windows 10

I'm trying to run the SqlSensor locally under Docker on a Windows 10 machine. It runs on Linux, but I get the errors below when trying to run the same simple DAG locally.
The reason I'm trying to set this up is so that I can develop locally and test to speed up the development cycle.
Error from Airflow log:
[2018-05-22 08:27:04,929] {{models.py:1428}} INFO - Executing <Task(SqlSensor): limits_test> on 2018-05-21 08:00:00
[2018-05-22 08:27:04,929] {{base_task_runner.py:115}} INFO - Running: ['bash', '-c', 'airflow run sql-sensor-test-dag limits_test 2018-05-21T08:00:00 --job_id 8 --raw -sd DAGS_FOLDER/sql_sensor_test.py']
[2018-05-22 08:27:05,685] {{base_task_runner.py:98}} INFO - Subtask: [2018-05-22 08:27:05,684] {{__init__.py:45}} INFO - Using executor CeleryExecutor
[2018-05-22 08:27:05,749] {{base_task_runner.py:98}} INFO - Subtask: [2018-05-22 08:27:05,749] {{models.py:189}} INFO - Filling up the DagBag from /usr/local/airflow/dags/sql_sensor_test.py
[2018-05-22 08:27:05,791] {{cli.py:374}} INFO - Running on host 0f8e7a60dbab
[2018-05-22 08:27:05,858] {{base_task_runner.py:98}} INFO - Subtask: [2018-05-22 08:27:05,858] {{base_hook.py:80}} INFO - Using connection to: LABCHGVA-SQL295
[2018-05-22 08:27:05,888] {{base_task_runner.py:98}} INFO - Subtask: [2018-05-22 08:27:05,888] {{sensors.py:111}} INFO - Poking: SELECT max(snapshot_id) FROM limits_run
[2018-05-22 08:27:05,896] {{base_task_runner.py:98}} INFO - Subtask: [2018-05-22 08:27:05,896] {{base_hook.py:80}} INFO - Using connection to: LABCHGVA-SQL295
[2018-05-22 08:27:05,924] {{models.py:1595}} ERROR - Connection to the database failed for an unknown reason.
Traceback (most recent call last):
File "pymssql.pyx", line 635, in pymssql.connect (pymssql.c:10734)
File "_mssql.pyx", line 1902, in _mssql.connect (_mssql.c:21821)
File "_mssql.pyx", line 638, in _mssql.MSSQLConnection.__init__ (_mssql.c:6594)
_mssql.MSSQLDriverException: Connection to the database failed for an unknown reason.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/usr/local/lib/python3.6/site-packages/airflow/models.py", line 1493, in _run_raw_task
result = task_copy.execute(context=context)
File "/usr/local/lib/python3.6/site-packages/airflow/operators/sensors.py", line 78, in execute
while not self.poke(context):
File "/usr/local/lib/python3.6/site-packages/airflow/operators/sensors.py", line 112, in poke
records = hook.get_records(self.sql)
File "/usr/local/lib/python3.6/site-packages/airflow/hooks/dbapi_hook.py", line 106, in get_records
with closing(self.get_conn()) as conn:
File "/usr/local/lib/python3.6/site-packages/airflow/hooks/mssql_hook.py", line 43, in get_conn
port=conn.port)
File "pymssql.pyx", line 644, in pymssql.connect (pymssql.c:10892)
pymssql.InterfaceError: Connection to the database failed for an unknown reason.
Using this Docker image:
FROM puckel/docker-airflow:1.9.0-2
USER root
RUN apt-get update
RUN apt-get install freetds-dev -yqq && \
    pip install apache-airflow[mssql]
USER airflow
And the following simple DAG:
from datetime import timedelta
import airflow
from airflow import DAG
from airflow.operators.dummy_operator import DummyOperator
from airflow.operators.sensors import SqlSensor

default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'catchup': False,
    'start_date': airflow.utils.dates.days_ago(1),
    'email': ['myemail@company.com'],
    'email_on_failure': True,
    'email_on_retry': True,
    'retries': 10,
    'retry_delay': timedelta(minutes=15),
    'sla': timedelta(hours=3)
}

dag = DAG(
    'sql-sensor-test-dag',
    default_args=default_args,
    description='Sensor tests',
    schedule_interval='0 8 * * *'
    # schedule_interval='@once'
)

with dag:
    sql_sensor = SqlSensor(
        task_id='limits_test',
        conn_id='bpeak_limits_ro',
        sql="SELECT max(snapshot_id) FROM limits_run"
    )
    done = DummyOperator(task_id='done')
    sql_sensor >> done
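One way to take Airflow out of the picture is to test the pymssql/FreeTDS layer directly from a Python shell inside the same container (a sketch; user, password and database are placeholders, the server name is taken from the log above):

import pymssql

# If this raises the same InterfaceError, the problem is in the container's
# FreeTDS/pymssql setup or in network reachability from Docker on Windows,
# not in the SqlSensor itself.
conn = pymssql.connect(
    server='LABCHGVA-SQL295',   # host from the Airflow connection in the log
    user='<user>',
    password='<password>',
    database='<database>',
)
print(conn)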
