I'm trying to create an Airflow pipeline that downloads data from an API, processes it, saves it as a CSV, and then loads the data into a Postgres database (all within a Docker container).
The code looks something like this:
from datetime import datetime, timedelta

import pandas as pd
import requests

from airflow import DAG
from airflow.providers.postgres.operators.postgres import PostgresOperator
from airflow.operators.python import PythonOperator

default_args = {
    "owner": "airflow",
    "retries": 5,
    "retry_delay": timedelta(minutes=1),
    "email": ["airflow@domain.com"],
    "email_on_failure": True,
    "email_on_retry": False
}

def get_data():
    response = requests.get("some_url")
    request_data = response.json()
    all_data = pd.DataFrame.from_dict(request_data["data"])
    all_data.to_csv("/opt/airflow/data/all_data.csv", index=False)
with DAG(
    dag_id="my_dag",
    default_args=default_args,
    start_date=datetime(2022, 1, 24),
    catchup=False,
    schedule_interval=timedelta(minutes=5)
) as dag:

    create_table = PostgresOperator(
        task_id="create_table",
        postgres_conn_id="postgres_localhost",
        sql="""
            create table if not exists my_table(
                created_at timestamp,
                col1 double precision,
                col2 smallint,
                primary key (created_at, col1)
            )
        """
    )

    get_data = PythonOperator(
        task_id="get_data",
        python_callable=get_data
    )

    load_data = PostgresOperator(
        task_id="load_data",
        postgres_conn_id="postgres_localhost",
        sql="""
            copy my_table
            from '/opt/airflow/data/all_data.csv'
            delimiter ',' csv;
        """
    )

    create_table >> get_data >> load_data
The problem is that when I try to run the DAG, the load_data task fails with:
psycopg2.errors.UndefinedFile: could not open file "/opt/***/data/all_data.csv" for reading: No such file or directory
HINT: COPY FROM instructs the PostgreSQL server process to read a file. You may want a client-side facility such as psql's \copy.
I don't know why the word airflow is getting replaced by *** in the path, or how to save the file properly so that the CSV can be copied into Postgres.
This error occurs because the Postgres server is a separate instance within Docker, so the file sitting on the Airflow worker is not visible to the Postgres server process. You could try one of the following ways around this:
Copy the file between servers by using scp to place the data file onto the Postgres server.
Copy the file between servers by using the SFTPOperator (which requires SSH Hook instantiation), then run the COPY statement.
Connect manually to the Postgres db via the BashOperator and run the copy CLI command for Postgres (see the sketch after this list).
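As a rough sketch of the third option, assuming the Postgres container is reachable as host postgres with user and database airflow, that the password is supplied via PGPASSWORD or .pgpass, and that the psql client is installed in the Airflow image, a BashOperator can run psql's client-side \copy, which reads the file from the Airflow worker instead of the Postgres server:

from airflow.operators.bash import BashOperator

# Hypothetical connection details -- adjust host, user and database to your setup.
load_csv = BashOperator(
    task_id="load_csv",
    bash_command=(
        "psql -h postgres -U airflow -d airflow "
        "-c \"\\copy my_table from '/opt/airflow/data/all_data.csv' delimiter ',' csv\""
    ),
)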
If anyone has a more elegant solution, please answer. I have this same problem right now and am working on it. Once I have it, I'll post it back here.
Related
I have an Airflow DAG with a PostgresOperator that executes a SQL query. I want to switch between my test database and my prod database via the run config (Trigger DAG w/ config). But postgres_conn_id is not a template field, so PostgresOperator says that "{{ dag_run.conf.get('CONN_ID_TEST', 'pg_database') }}" is not a connection.
I trigger this DAG with the config {"CONN_ID_TEST": "pg_database_test"}.
I tried to create a custom PostgreSQL operator with the same code as in the Airflow GitHub repository and added template_fields: Sequence[str] = ("postgres_conn_id",) at the top of my CustomPostgresOperator class, but that doesn't work either (same error).
I have two connection environment variables:
AIRFLOW_CONN_ID_PG_DATABASE (prod)
AIRFLOW_CONN_ID_PG_DATABASE_TEST (test)
My script looks like:
from airflow import DAG
from airflow.providers.postgres.operators.postgres import PostgresOperator
from airflow.operators.dummy import DummyOperator
import datetime as dt

DAG_ID = "init_database"
POSTGRES_CONN_ID = "{{ dag_run.conf.get('CONN_ID_TEST', 'pg_database') }}"

with DAG(
    dag_id=DAG_ID,
    description="My dag",
    schedule_interval="@once",
    start_date=dt.datetime(2022, 1, 1),
    catchup=False,
) as dag:

    start = DummyOperator(task_id='start')

    my_task = PostgresOperator(  #### OR CustomPostgresOperator
        task_id="select",
        sql="SELECT * FROM pets LIMIT 1;",
        postgres_conn_id=POSTGRES_CONN_ID,
        autocommit=True
    )

    start >> my_task
How can I solve this problem? And if it is not possible, how can I switch my PostgresOperator connection to my dev database without recreating another DAG script?
Thanks, Léo
Subclassing is a solid way to modify template_fields as you wish. Since template_fields is a class attribute, your subclass only really needs to be the following (assuming you're just adding the connection ID to the existing template_fields):
from airflow.providers.postgres.operators.postgres import PostgresOperator as _PostgresOperator

class PostgresOperator(_PostgresOperator):
    template_fields = [*_PostgresOperator.template_fields, "conn_id"]
The above uses Postgres provider version 5.3.1, which uses the Common SQL provider under the hood, so the connection attribute is actually conn_id. (template_fields refers to the instance attribute name rather than the parameter name.)
For example, assume the below DAG gets triggered with a run config of {"environment": "dev"}:
from pendulum import datetime

from airflow.decorators import dag
from airflow.providers.postgres.operators.postgres import PostgresOperator as _PostgresOperator

class PostgresOperator(_PostgresOperator):
    template_fields = [*_PostgresOperator.template_fields, "conn_id"]

@dag(start_date=datetime(2023, 1, 1), schedule=None)
def template_postgres_conn():
    PostgresOperator(task_id="run_sql", sql="SELECT 1;", postgres_conn_id="{{ dag_run.conf['environment'] }}")

template_postgres_conn()
Looking at the task log, the connection ID "dev" is used to execute the SQL.
I want to audit all the information about the DAGs' execution status into a BigQuery table.
I want to do this through Python code in the DAG. I have already written code that loads the data into the BigQuery table (given below); I need help appending the audit logic to the existing code.
with models.DAG(
        'C360_GBL_CCN2DPN_CLASSIC',
        default_args=default_args,
        # schedule_interval='0 9 * * *',
        schedule_interval=None) as dag:

    start = dummy_operator.DummyOperator(
        task_id='start',
        trigger_rule='all_success'
    )

    read_json_file(config_file_path)

    end = dummy_operator.DummyOperator(
        task_id='end',
        trigger_rule='all_success'
    )

    a = []
    if len(configurations) > 1:
        for k in range(0, len(configurations)):
            config = configurations[k]
            project_id = config['Project_Id']
            staging_dataset = config['Dataset']
            table_name = config['Table-Name']
            write_disposition = config['write_disposition']
            sql = config['Sql']
            create_disposition = config['create_disposition']
            a.append(BigQueryOperator(
                task_id=table_name + '_ccn_2_dpn_bq',
                sql=sql,
                write_disposition=write_disposition,
                create_disposition=create_disposition,
                use_legacy_sql=False
            ))
            if k != 0:
                a[k - 1].set_downstream(a[k])
            else:
                a[k].set_upstream(start)
        a[len(configurations) - 1].set_downstream(end)
    else:
        config = configurations[0]
        project_id = config['Project_Id']
        staging_dataset = config['Dataset']
        table_name = config['Table-Name']
        write_disposition = config['write_disposition']
        sql = config['Sql']
        create_disposition = config['create_disposition']
        Task1 = BigQueryOperator(
            task_id=table_name + '_ccn_2_dpn_bq',
            sql=sql,
            write_disposition=write_disposition,
            create_disposition=create_disposition,
            use_legacy_sql=False
        )
        Task1.set_upstream(start)
        Task1.set_downstream(end)
Airflow writes all the audit logs to the log table in its metadata database. You can check whether that information is enough for your needs; if it is, you can export it to BigQuery either with a streaming app (if you need the records as soon as they are created) or with an Airflow DAG in two steps using the official operators (see the sketch after this list):
a task using PostgresToGCSOperator to export the data to GCS,
then a second task using GCSToBigQueryOperator to load the exported files into your BigQuery table,
or in one step using a new operator (which you would need to develop yourself).
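As a rough sketch of the two-step approach (the connection ID, bucket, dataset and table names below are placeholders, and the query assumes the default Postgres metadata database with its log table):

from datetime import datetime

from airflow import DAG
from airflow.providers.google.cloud.transfers.postgres_to_gcs import PostgresToGCSOperator
from airflow.providers.google.cloud.transfers.gcs_to_bigquery import GCSToBigQueryOperator

with DAG(
    dag_id="export_audit_log_to_bq",
    start_date=datetime(2022, 1, 1),
    schedule_interval="@daily",
    catchup=False,
) as dag:

    # Export the day's rows from the metadata "log" table to GCS as newline-delimited JSON.
    export_log = PostgresToGCSOperator(
        task_id="export_log",
        postgres_conn_id="airflow_metadata_db",   # placeholder connection ID
        sql="SELECT * FROM log WHERE dttm::date = '{{ ds }}'",
        bucket="my-audit-bucket",                 # placeholder bucket
        filename="airflow_log/{{ ds }}/part-{}.json",
        export_format="json",
    )

    # Load the exported files into the BigQuery audit table.
    load_to_bq = GCSToBigQueryOperator(
        task_id="load_to_bq",
        bucket="my-audit-bucket",
        source_objects=["airflow_log/{{ ds }}/*.json"],
        destination_project_dataset_table="my_project.audit.airflow_log",  # placeholder
        source_format="NEWLINE_DELIMITED_JSON",
        write_disposition="WRITE_APPEND",
        autodetect=True,
    )

    export_log >> load_to_bq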
After pushing my DAG I get this error.
I am new to data engineering. I tried to solve this error in different ways with the knowledge I have, but nothing worked. I want to write a DAG that consists of two tasks: the first exports data from a database table on one server as CSV files, and the second imports those CSV files into database tables on another server. A variable contains the DAG configuration and the SQL scripts for exporting and importing the data.
Please tell me how I can solve this error.
I have this exporting code:
def export_csv():
    import json
    import pandas as pd
    from airflow.models import Variable
    from airflow.providers.microsoft.mssql.hooks.mssql import MsSqlHook

    instruction_data = json.loads(Variable.get('MAIN_SOURCE_DAMDI_INSTRUCTIONS'))
    requirement_data = instruction_data['requirements']
    lst = requirement_data['scripts']
    ms_hook = MsSqlHook(mssql_conn_id='OKTELL')
    connection = ms_hook.get_conn()
    cursor = connection.cursor()
    for i in lst:
        result = cursor.execute(i['export_script'])
        df = pd.DataFrame(result)
        df.to_csv(i['filename'], index=False, header=None, sep=',', encoding='utf-8')
    cursor.close()
And this is my task for exporting:
export_csv_func = PythonOperator(
    task_id='export_csv_func',
    python_callable=export_csv,
    mssql_conn_id='OKTELL'
)
P.S. I import the libraries and Airflow variables inside the function because there was previously a lot of load on the server, and this approach helped reduce it.
When using the PythonOperator, you pass args to the callable via op_args and/or op_kwargs. In this case, if you want to pass the mssql_conn_id arg, you can try:
export_csv_func = PythonOperator(
    task_id='export_csv_func',
    python_callable=export_csv,
    op_kwargs={'mssql_conn_id': 'OKTELL'},
)
Then you need to update the export_csv() function signature to accept this kwarg too.
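For example, a minimal sketch of the updated function; only the signature and the hook construction change, the rest of the body stays as before:

def export_csv(mssql_conn_id):
    import json
    import pandas as pd
    from airflow.models import Variable
    from airflow.providers.microsoft.mssql.hooks.mssql import MsSqlHook

    # Use the connection ID passed in via op_kwargs instead of hard-coding it.
    ms_hook = MsSqlHook(mssql_conn_id=mssql_conn_id)
    # ... the rest of the export logic is unchanged ...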
I have an Airflow DAG written to work with the PythonOperator. I need to use the PostgreSQL operator for the same DAG without changing its functionality. Here is the code with the PythonOperator. How should I replace the PythonOperator with the PostgreSQL operator? Or can we use two different operators in a single DAG?
from airflow import DAG
from airflow.models import dag
from airflow.utils.dates import days_ago
from airflow.operators.python_operator import PythonOperator
import os

script_dir_path = os.path.dirname(os.path.realpath(__file__))

import time
from time import sleep
from xlwt import Workbook
import pandas as pd
from csv import writer
from csv import DictReader
from datetime import datetime
from selenium import webdriver
import psycopg2

opt = webdriver.FirefoxOptions()
wb = Workbook()
sheet1 = wb.add_sheet('Sheet 1', cell_overwrite_ok=False)
i = 0
j = 0

default_args = {
    'owner': 'airflow',
    'retries': 1
}

dag = DAG(
    'Yahoo_Finance',
    default_args=default_args,
    description='fetching ticker symbol',
    catchup=False,
    start_date=datetime.now(),
    schedule_interval='* 7 * * *'
)

def extract_tickers():
    conn = psycopg2.connect(dbname='postgres', user='airflow', password='airflow', host='postgres')
    cur = conn.cursor()
    with open(r'./fromlocal/EQUITY_L.csv') as read_obj:
        csv_dict_reader = DictReader(read_obj)
        url = "https://finance.yahoo.com"
        driver = webdriver.Remote("http://selenium:4444/wd/hub", options=opt)
        driver.get(url)
        for row in csv_dict_reader:
            time.sleep(4)
            # action = ActionChains(driver)
            time.sleep(4)
            searchBox = driver.find_element_by_id('yfin-usr-qry')
            time.sleep(4)
            searchBox.send_keys(row['SYMBOL'])
            time.sleep(4)
            # clicking on search
            driver.find_element_by_xpath('//*[@id="header-desktop-search-button"]').click()
            time.sleep(15)
            companyname = driver.find_elements_by_xpath('//*[@id="quote-header-info"]/div[2]/div[1]/div[1]/h1')
            ticker = companyname = str(companyname[0].text)
            print("company name: " + companyname)
            ticker = ticker[::-1]
            ticker = ticker[1:ticker.find("(")]
            ticker = ticker[::-1]
            print("extracted ticker: " + ticker)
            companyname = companyname[:companyname.find(" (")]
            companyname = companyname.replace("'", "''")
            cur.execute("INSERT INTO tickers1 (keyword,companyName) values ('" + ticker + "','" + companyname + "')")
    conn.commit()
    cur.close()
    conn.close()
    print(script_dir_path)

Yahoo_Finance = PythonOperator(
    task_id='extract_tickers',
    python_callable=extract_tickers,
    provide_context=True,
    dag=dag
)

Yahoo_Finance
PostgresOperator runs SQL. Your code is querying an API, generating a CSV, and loading it into the DB. You cannot do that with PostgresOperator.
What you can do is replace the direct use of psycopg2 with PostgresHook.
The hook is a wrapper around psycopg2 that exposes functions you can interact with. This means that, for example, you don't need to handle how to connect to Postgres on your own. Simply define the connection in Admin -> Connections and reference the connection name in the hook:
from airflow.providers.postgres.hooks.postgres import PostgresHook

def extract_tickers():
    with PostgresHook(postgres_conn_id="postgres_default").get_conn() as conn:
        with conn.cursor() as cur:
            cur.execute("Your SQL CODE")
To see other methods available in the hook, check the hook source code; a couple of them are sketched below.
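For instance, a minimal sketch using convenience methods inherited from DbApiHook (the tickers1 table comes from the question; the connection ID and values are just placeholders):

from airflow.providers.postgres.hooks.postgres import PostgresHook

hook = PostgresHook(postgres_conn_id="postgres_default")

# Run a parameterized statement without managing a cursor yourself.
hook.run(
    "INSERT INTO tickers1 (keyword, companyName) VALUES (%s, %s)",
    parameters=("AAPL", "Apple Inc."),
)

# Fetch query results directly.
rows = hook.get_records("SELECT keyword, companyName FROM tickers1")

# Bulk-insert rows into a table.
hook.insert_rows(
    table="tickers1",
    rows=[("GOOG", "Alphabet Inc.")],
    target_fields=["keyword", "companyName"],
)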
Rewrite the tasks as follows.
Task0 uses the PostgresOperator and creates the table:
create_table = PostgresOperator(
    task_id='create_table',
    postgres_conn_id='airflo_db_id',
    sql="""CREATE TABLE IF NOT EXISTS yahoo_data_rev(id BIGSERIAL, stock_name VARCHAR, open NUMERIC, close NUMERIC,
           high NUMERIC, low NUMERIC)"""
)
Task1 uses the PythonOperator: it drives the webdriver, scrapes the site, and creates the CSV.
Task2 uses the PostgresOperator as below:
insert_table = PostgresOperator(
    task_id='insert_table',
    postgres_conn_id='airflo_db_id',
    sql="""COPY yahoo_data_rev FROM '/file/path/yahoo_rev.csv'
           WITH DELIMITER ',' CSV HEADER"""
)
Name the tables created above uniquely for each DAG run; the CSV file names similarly have to be unique for each DAG run (see the sketch below).
The tables created by this DAG can be merged with the main table, which is maintained by another DAG scheduled after market hours. That DAG will also use the PostgreSQL operator to run its queries.
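One way to get unique names per run, as a sketch: the sql field of PostgresOperator is templated, so the execution-date macro ds_nodash can be baked into the table name, and the same value can be passed to the scraping task via op_kwargs (this assumes the scraping callable is adapted to accept a csv_path argument):

create_table = PostgresOperator(
    task_id='create_table',
    postgres_conn_id='airflo_db_id',
    sql="""CREATE TABLE IF NOT EXISTS yahoo_data_rev_{{ ds_nodash }}(
               id BIGSERIAL, stock_name VARCHAR, open NUMERIC, close NUMERIC,
               high NUMERIC, low NUMERIC)""",
)

scrape_to_csv = PythonOperator(
    task_id='scrape_to_csv',
    python_callable=extract_tickers,  # hypothetical: the callable must accept csv_path
    op_kwargs={'csv_path': '/file/path/yahoo_rev_{{ ds_nodash }}.csv'},
)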
https://stackoverflow.com/a/66112728/16388185
discusses how to upsert data into an existing table. You may refer to it if you are thinking about using upsert instead of inserting and later merging.
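For reference, a minimal sketch of a Postgres upsert run through the PostgresOperator (yahoo_data_main is a hypothetical main table, and the ON CONFLICT target assumes stock_name carries a unique constraint; adjust to your schema):

upsert_main_table = PostgresOperator(
    task_id='upsert_main_table',
    postgres_conn_id='airflo_db_id',
    sql="""
        INSERT INTO yahoo_data_main (stock_name, open, close, high, low)
        SELECT stock_name, open, close, high, low
        FROM yahoo_data_rev_{{ ds_nodash }}
        ON CONFLICT (stock_name)
        DO UPDATE SET open = EXCLUDED.open,
                      close = EXCLUDED.close,
                      high = EXCLUDED.high,
                      low = EXCLUDED.low;
    """,
)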
I am new to Airflow. I have to check whether a file generated by a DAG (e.g. sample.txt) has been moved out of a bucket. In my case, the file I generate is picked up by another system and removed from the bucket, so the output file will eventually no longer be there (it might take a few minutes for it to be removed).
How can I add a task to the same DAG that waits/retries until sample.txt has been moved out of the bucket, and only then proceeds with the next task?
Is there any operator which satisfies the above criteria? Please throw some light on how to proceed.
You can create a custom sensor based on the existing GCSObjectExistenceSensor.
The modification is simple:
from airflow.providers.google.cloud.hooks.gcs import GCSHook
from airflow.providers.google.cloud.sensors.gcs import GCSObjectExistenceSensor

class GCSObjectNotExistenceSensor(GCSObjectExistenceSensor):

    def poke(self, context: dict) -> bool:
        self.log.info('Sensor checks that %s, %s does not exist', self.bucket, self.object)
        hook = GCSHook(
            gcp_conn_id=self.google_cloud_conn_id,
            delegate_to=self.delegate_to,
            impersonation_chain=self.impersonation_chain,
        )
        return not hook.exists(self.bucket, self.object)
Then use the sensor GCSObjectNotExistenceSensor in your code like:
gcs_object_does_not_exists = GCSObjectNotExistenceSensor(
    bucket=BUCKET_1,
    object=PATH_TO__FILE,
    mode='poke',
    task_id="gcs_object_does_not_exists_task",
)
The sensor will not let the pipeline continue until the object PATH_TO__FILE has been removed.
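Since it might take a few minutes for the file to disappear, the standard sensor arguments poke_interval and timeout (inherited from BaseSensorOperator) control how often the check runs and how long to wait before failing; mode='reschedule' can be used instead of 'poke' to free the worker slot between checks. For example:

gcs_object_does_not_exists = GCSObjectNotExistenceSensor(
    task_id="gcs_object_does_not_exists_task",
    bucket=BUCKET_1,
    object=PATH_TO__FILE,
    mode='poke',
    poke_interval=60,   # check the bucket every 60 seconds
    timeout=60 * 30,    # give up and fail the task after 30 minutes
)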
You can use the Airflow PythonOperator to achieve this. Make the Python callable poll GCS repeatedly and check whether the file has been removed. Return from the Python function once the file is gone.
import time

from airflow.operators.python_operator import PythonOperator
from google.cloud import storage
import google.auth

def check_file_in_gcs():
    credentials, project = google.auth.default()
    storage_client = storage.Client('your_Project_id', credentials=credentials)
    name = 'sample.txt'
    bucket_name = 'Your_Bucket_name'
    bucket = storage_client.bucket(bucket_name)
    while True:
        stats = storage.Blob(bucket=bucket, name=name).exists(storage_client)
        if not stats:
            print("Returning as file is removed!!!!")
            return
        time.sleep(60)  # wait before checking again to avoid hammering GCS

check_gcs_file_removal = PythonOperator(
    task_id='check_gcs_file_removal',
    python_callable=check_file_in_gcs,
    # op_kwargs={'params': xyz},
    # Pass the bucket name and other details if needed by uncommenting the line above
    dag=dag
)
You might need to install Python packages for the Google Cloud libraries to work. Please install from the list below (I'm not sure exactly which ones are required; this is taken from my virtualenv):
google-api-core==1.16.0
google-api-python-client==1.8.0
google-auth==1.12.0
google-auth-httplib2==0.0.3
google-auth-oauthlib==0.4.1
google-cloud-core==1.3.0
google-cloud-storage==1.27.0