I have an Airflow DAG written with the PythonOperator. I need to use the PostgreSQL operator for the same DAG without changing its functionality. Here is the code with the PythonOperator. How should I replace the PythonOperator with the PostgreSQL operator? Or can we use two different operators in a single DAG?
from airflow import DAG
from airflow.models import dag
from airflow.utils.dates import days_ago
from airflow.operators.python_operator import PythonOperator
import os
script_dir_path = os.path.dirname(os.path.realpath(__file__))
import time
from time import sleep
from xlwt import Workbook
import pandas as pd
from csv import writer
from csv import DictReader
from datetime import datetime
from selenium import webdriver
import psycopg2
opt = webdriver.FirefoxOptions()
wb=Workbook()
sheet1=wb.add_sheet('Sheet 1',cell_overwrite_ok=False)
i=0
j=0
default_args = {
    'owner': 'airflow',
    'retries': 1
}
dag = DAG('Yahoo_Finance',
          default_args=default_args,
          description='fetching ticker symbol',
          catchup=False,
          start_date=datetime.now(),
          schedule_interval='* 7 * * *'
          )
def extract_tickers():
    conn = psycopg2.connect(dbname='postgres', user='airflow', password='airflow', host='postgres')
    cur = conn.cursor()
    with open(r'./fromlocal/EQUITY_L.csv') as read_obj:
        csv_dict_reader = DictReader(read_obj)
        url = "https://finance.yahoo.com"
        driver = webdriver.Remote("http://selenium:4444/wd/hub", options=opt)
        driver.get(url)
        for row in csv_dict_reader:
            time.sleep(4)
            # action = ActionChains(driver)
            time.sleep(4)
            searchBox = driver.find_element_by_id('yfin-usr-qry')
            time.sleep(4)
            searchBox.send_keys(row['SYMBOL'])
            time.sleep(4)
            # clicking on search
            driver.find_element_by_xpath('//*[@id="header-desktop-search-button"]').click()
            time.sleep(15)
            companyname = driver.find_elements_by_xpath('//*[@id="quote-header-info"]/div[2]/div[1]/div[1]/h1')
            ticker = companyname = str(companyname[0].text)
            print("company name: " + companyname)
            ticker = ticker[::-1]
            ticker = ticker[1:ticker.find("(")]
            ticker = ticker[::-1]
            print("extracted ticker: " + ticker)
            companyname = companyname[:companyname.find(" (")]
            companyname = companyname.replace("'", "''")
            cur.execute("INSERT INTO tickers1 (keyword,companyName) values ('" + ticker + "','" + companyname + "')")
            conn.commit()
    cur.close()
    conn.close()
print(script_dir_path)
Yahoo_Finance = PythonOperator(task_id='extract_tickers',
                               python_callable=extract_tickers,
                               provide_context=True,
                               dag=dag)
Yahoo_Finance
PostgresOperator runs SQL. Your code scrapes a website, reads a CSV, and loads the data into the DB. You cannot do that with PostgresOperator.
What you can do is replace the use of psycopg2 with PostgresHook.
The hook is a wrapper around psycopg2 that exposes functions you can interact with. This means, for example, that you don't need to handle the connection to Postgres on your own. Simply define the connection in Admin -> Connections and reference the connection name in the hook:
from airflow.providers.postgres.hooks.postgres import PostgresHook

def extract_tickers():
    with PostgresHook(postgres_conn_id="postgres_default").get_conn() as conn:
        with conn.cursor() as cur:
            cur.execute("Your SQL CODE")
To see the other methods available in the hook, check the hook source code.
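For example, a minimal sketch of the insert from the question rewritten with the hook (assuming a connection named postgres_default and the tickers1 table used above; the parameterized query also removes the manual quote escaping):

from airflow.providers.postgres.hooks.postgres import PostgresHook

def insert_ticker(ticker, companyname):
    # Hypothetical helper; the connection id is an assumption about your setup.
    hook = PostgresHook(postgres_conn_id="postgres_default")
    conn = hook.get_conn()
    try:
        with conn.cursor() as cur:
            # Parameterized query: psycopg2 handles quoting, so the manual
            # replace("'", "''") escaping is no longer needed.
            cur.execute(
                "INSERT INTO tickers1 (keyword, companyName) VALUES (%s, %s)",
                (ticker, companyname),
            )
        conn.commit()
    finally:
        conn.close()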
Rewrite the tasks as follows.
Task0 uses the PostgresOperator and creates the table.
from airflow.providers.postgres.operators.postgres import PostgresOperator

create_table = PostgresOperator(
    task_id='create_table',
    postgres_conn_id='airflo_db_id',
    sql="""CREATE TABLE IF NOT EXISTS yahoo_data_rev(id BIGSERIAL, stock_name VARCHAR, open NUMERIC, close NUMERIC,
           high NUMERIC, low NUMERIC)"""
)
Task1 uses the PythonOperator: it runs the webdriver, scrapes the site, and creates the CSV.
Task2 uses the PostgresOperator as below:
insert_table = PostgresOperator(
    task_id='insert_table',
    postgres_conn_id='airflo_db_id',
    sql="""COPY yahoo_data_rev FROM '/file/path/yahoo_rev.csv'
           WITH DELIMITER ',' CSV HEADER"""
)
Name the tables created above uniquely for each DAG run. The CSV file names similarly have to be unique for each DAG run, as in the sketch below.
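A minimal sketch of one way to do that, assuming you template the logical date ({{ ds_nodash }}) into both the table name and the CSV path; the sql field of PostgresOperator is Jinja-templated, and the connection id and columns are simply the ones used above:

from airflow.providers.postgres.operators.postgres import PostgresOperator

create_table = PostgresOperator(
    task_id='create_table',
    postgres_conn_id='airflo_db_id',
    # The run date is baked into the table name, so each DAG run gets its own table.
    sql="""CREATE TABLE IF NOT EXISTS yahoo_data_rev_{{ ds_nodash }}(
               id BIGSERIAL, stock_name VARCHAR, open NUMERIC, close NUMERIC,
               high NUMERIC, low NUMERIC)""",
)

insert_table = PostgresOperator(
    task_id='insert_table',
    postgres_conn_id='airflo_db_id',
    sql="""COPY yahoo_data_rev_{{ ds_nodash }}
           FROM '/file/path/yahoo_rev_{{ ds_nodash }}.csv'
           WITH DELIMITER ',' CSV HEADER""",
)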
The tables created by this DAG can be merged with the main table, which is maintained by another DAG scheduled after market hours. That DAG will also use the PostgreSQL operator to run the queries.
https://stackoverflow.com/a/66112728/16388185
discusses how to upsert data into an existing table. You may refer to it if you are thinking about using an upsert instead of inserting and merging later.
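For reference, a minimal sketch of a Postgres upsert run through the PostgresOperator, assuming a unique constraint on keyword and a hypothetical staging table tickers_staging holding the latest scrape:

upsert_tickers = PostgresOperator(
    task_id='upsert_tickers',
    postgres_conn_id='airflo_db_id',
    # Standard Postgres upsert: insert new rows, update the ones that collide
    # on the keyword unique constraint.
    sql="""INSERT INTO tickers1 (keyword, companyName)
           SELECT keyword, companyName FROM tickers_staging
           ON CONFLICT (keyword)
           DO UPDATE SET companyName = EXCLUDED.companyName""",
)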
I want to audit all the information about DAG execution status into a BigQuery table.
I want to do this through Python code in the DAGs. The already-written code loads the data into the BigQuery table (as given below); I need help appending the audit logic to this existing code.
with models.DAG(
        'C360_GBL_CCN2DPN_CLASSIC',
        default_args=default_args,
        # schedule_interval='0 9 * * *') as dag:
        schedule_interval=None) as dag:

    start = dummy_operator.DummyOperator(
        task_id='start',
        trigger_rule='all_success'
    )

    read_json_file(config_file_path)

    end = dummy_operator.DummyOperator(
        task_id='end',
        trigger_rule='all_success'
    )

    a = []
    if (len(configurations) > 1):
        for k in range(0, len(configurations)):
            config = configurations[k]
            project_id = config['Project_Id']
            staging_dataset = config['Dataset']
            table_name = config['Table-Name']
            write_disposition = config['write_disposition']
            sql = config['Sql']
            create_disposition = config['create_disposition']
            a.append(BigQueryOperator(
                task_id=table_name + '_ccn_2_dpn_bq',
                sql=sql,
                write_disposition=write_disposition,
                create_disposition=create_disposition,
                use_legacy_sql=False
            ))
            if k != 0:
                a[k-1].set_downstream(a[k])
            else:
                a[k].set_upstream(start)
        a[len(configurations)-1].set_downstream(end)
    else:
        config = configurations[0]
        project_id = config['Project_Id']
        staging_dataset = config['Dataset']
        table_name = config['Table-Name']
        write_disposition = config['write_disposition']
        sql = config['Sql']
        create_disposition = config['create_disposition']
        Task1 = BigQueryOperator(
            task_id=table_name + '_ccn_2_dpn_bq',
            sql=sql,
            write_disposition=write_disposition,
            create_disposition=create_disposition,
            use_legacy_sql=False
        )
        Task1.set_upstream(start)
        Task1.set_downstream(end)
Airflow writes all the audit logs to the log table in its metadata database. You can check whether that information is enough for your needs, and if it is, you can export it to BigQuery either with a streaming app (if you need the records as soon as they are created) or with an Airflow DAG in two steps using the official operators (see the sketch after this list):
a task from PostgresToGCSOperator to export the data to GCS,
then a second task from GCSToBigQueryOperator to import the exported files into your BigQuery table,
or in one step using a new operator (which you would need to develop yourself).
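A minimal sketch of the two-step approach, assuming the metadata database is Postgres and reachable through a connection named airflow_db, and that the bucket, dataset, and table names are placeholders to replace with your own:

from airflow.providers.google.cloud.transfers.postgres_to_gcs import PostgresToGCSOperator
from airflow.providers.google.cloud.transfers.gcs_to_bigquery import GCSToBigQueryOperator

# Export the Airflow log table from the metadata database to GCS as CSV.
export_logs = PostgresToGCSOperator(
    task_id='export_logs',
    postgres_conn_id='airflow_db',  # assumption: a connection pointing at the metadata DB
    sql="SELECT * FROM log WHERE dag_id = 'C360_GBL_CCN2DPN_CLASSIC'",
    bucket='my-audit-bucket',  # placeholder
    filename='airflow_audit/log_{{ ds_nodash }}.csv',
    export_format='csv',
)

# Load the exported file into a BigQuery audit table.
load_logs = GCSToBigQueryOperator(
    task_id='load_logs',
    bucket='my-audit-bucket',  # placeholder
    source_objects=['airflow_audit/log_{{ ds_nodash }}.csv'],
    destination_project_dataset_table='my_project.audit.airflow_dag_log',  # placeholder
    source_format='CSV',
    skip_leading_rows=1,
    write_disposition='WRITE_APPEND',
    autodetect=True,
)

export_logs >> load_logs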
After pushing my DAG I get this error
I am new to data engineering. I tried to solve this error in different ways with the knowledge I have, but nothing worked. I want to write a DAG that consists of two tasks: the first exports data from a database table on one server as CSV files, and the second imports these CSV files into database tables on another server. The Airflow variable contains the DAG configuration and the SQL scripts for exporting and importing the data.
Please tell me how I can solve this error.
I have this exporting code:
def export_csv():
    import json
    from airflow.models import Variable
    import pandas as pd

    instruction_data = json.loads(Variable.get('MAIN_SOURCE_DAMDI_INSTRUCTIONS'))
    requirement_data = instruction_data['requirements']
    lst = requirement_data['scripts']
    ms_hook = MsSqlHook(mssql_conn_id='OKTELL')
    connection = ms_hook.get_conn()
    cursor = connection.cursor()
    for i in lst:
        result = cursor.execute(i['export_script'])
        df = pd.DataFrame(result)
        df.to_csv(i['filename'], index=False, header=None, sep=',', encoding='utf-8')
    cursor.close()
And this is my task for exporting:
export_csv_func = PythonOperator(
    task_id='export_csv_func',
    python_callable=export_csv,
    mssql_conn_id='OKTELL'
)
P.S. I import the libraries and Airflow variables inside the function because previously there was a lot of load on the server, and this method helped reduce it.
When using the PythonOperator you pass args to a callable via op_args and/or op_kwargs. In this case, if you wanted to pass the mssql_conn_id arg you can try:
export_csv_func = PythonOperator(
    task_id='export_csv_func',
    python_callable=export_csv,
    op_kwargs={'mssql_conn_id': 'OKTELL'},
)
Then you need to update the export_csv() function signature to accept this kwarg too.
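For example, a minimal sketch of the updated signature (the body is just the relevant lines from the function above):

def export_csv(mssql_conn_id, **kwargs):
    # The connection id now arrives via op_kwargs instead of being hard-coded.
    ms_hook = MsSqlHook(mssql_conn_id=mssql_conn_id)
    connection = ms_hook.get_conn()
    cursor = connection.cursor()
    # ... the rest of the export logic stays unchanged ...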
I'm trying to create an Airflow pipeline that downloads data from an API, processes it, saves it as a CSV and then loads the data to a Postgres database (all within a docker container).
The code looks something like this:
from datetime import datetime, timedelta

import pandas as pd
import requests

from airflow import DAG
from airflow.providers.postgres.operators.postgres import PostgresOperator
from airflow.operators.python import PythonOperator

default_args = {
    "owner": "airflow",
    "retries": 5,
    "retry_delay": timedelta(minutes=1),
    "email": ['airflow@domain.com'],
    "email_on_failure": True,
    "email_on_retry": False
}

def get_data():
    request = requests.get("some_url")
    request_data = request.json()
    all_data = pd.DataFrame.from_dict(request_data["data"])
    all_data.to_csv("/opt/airflow/data/all_data.csv", index=False)

with DAG(
    dag_id="my_dag",
    default_args=default_args,
    start_date=datetime(2022, 1, 24),
    catchup=False,
    schedule_interval=timedelta(minutes=5)
) as dag:

    create_table = PostgresOperator(
        task_id="create_table",
        postgres_conn_id="postgres_localhost",
        sql="""
            create table if not exists my_table(
                created_at timestamp,
                col1 double precision,
                col2 smallint,
                primary key (created_at, col1)
            )
        """
    )

    get_data = PythonOperator(
        task_id="get_data",
        python_callable=get_data
    )

    load_data = PostgresOperator(
        task_id="load_data",
        postgres_conn_id="postgres_localhost",
        sql="""
            copy my_table
            from '/opt/airflow/data/all_data.csv'
            delimiter ',' csv;
        """
    )

    create_table >> get_data >> load_data
The problem is that when I try to run the DAG I get an error in the load_data task saying psycopg2.errors.UndefinedFile: could not open file "/opt/***/data/all_data.csv" for reading: No such file or directory HINT: COPY FROM instructs the PostgreSQL server process to read a file. You may want a client-side facility such as psql's \copy.
I don't know why the word airflow is getting replaced in the path or how to save it properly so that the CSV file can be copied into postgres.
This error occurs because the Postgres server is a separate container within Docker: COPY FROM makes the PostgreSQL server process read the file, so the path must exist on the Postgres container's filesystem, not the Airflow worker's. You could try one of the following ways around this:
Copy the file between containers by using scp to place the data file onto the Postgres server.
Copy the file between containers by using the SFTPOperator (which requires SSH hook instantiation), then run the COPY statement.
Connect manually to the Postgres db via the BashOperator and run the copy CLI command for Postgres (see the sketch below).
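A minimal sketch of the third option, assuming the Airflow worker image has the psql client installed and a hypothetical POSTGRES_URI environment variable holding the connection string; unlike COPY, \copy is read by psql on the client side, so the CSV only needs to exist on the worker:

from airflow.operators.bash import BashOperator

load_data = BashOperator(
    task_id="load_data",
    # \copy streams the local file through psql to the server, so the Postgres
    # container never needs direct access to /opt/airflow/data.
    bash_command=(
        'psql "$POSTGRES_URI" -c '
        "\"\\copy my_table FROM '/opt/airflow/data/all_data.csv' DELIMITER ',' CSV\""
    ),
)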
If anyone has a more elegant solution, please answer. I have this same problem right now and am working on it. Once I have it, I'll post it back here.
I have a Python script (written in a Jupyter notebook) and I would like to run it in Azure. The script gets data from an API source (which is updated every 24 hours) and updates a SQL database in Azure, so this automated script will update the database table whenever it runs.
Can someone please help me with this?
Below is the Python code I have written:
import pyodbc
import requests
import json
import pandas as pd
responses = requests.get("https://data.buffalony.gov/resource/d6g9-xbgu.json")
crime_data = json.loads(responses.text)
dic = {}
dic = crime_data
df = pd.DataFrame.from_dict(dic)
dff = df[['case_number','day_of_week','incident_datetime','incident_description','incident_id','incident_type_primary']].copy()
connection = pyodbc.connect ('Driver={ODBC Driver 17 for SQL Server};Server=servername;Database=Databasename;UID=admin;PWD=admin')
cur = connection.cursor()
row = []
for i in range(dff.shape[0]):
    row.append(dff.iloc[i].tolist())
sql = '''\
INSERT INTO [dbo].[FF] ([case_number],[day_of_week],[incident_datetime],[incident_description],[incident_id],[incident_type_primary]) values (?,?,?,?,?,?)
'''
for i in range(dff.shape[0]):
    cur.execute(sql, row[i])
connection.commit()
I don't use Azure or Jupyter notebooks, but I think I have a solution.
If you leave your computer running all night, change your code to this:
import time
import pyodbc
import requests
import json
import pandas as pd
while 1:
    responses = requests.get("https://data.buffalony.gov/resource/d6g9-xbgu.json")
    crime_data = json.loads(responses.text)
    dic = {}
    dic = crime_data
    df = pd.DataFrame.from_dict(dic)
    dff = df[['case_number','day_of_week','incident_datetime','incident_description','incident_id','incident_type_primary']].copy()
    connection = pyodbc.connect('Driver={ODBC Driver 17 for SQL Server};Server=servername;Database=Databasename;UID=admin;PWD=admin')
    cur = connection.cursor()
    row = []
    for i in range(dff.shape[0]):
        row.append(dff.iloc[i].tolist())
    sql = '''\
INSERT INTO [dbo].[FF] ([case_number],[day_of_week],[incident_datetime],[incident_description],[incident_id],[incident_type_primary]) values (?,?,?,?,?,?)
'''
    for i in range(dff.shape[0]):
        cur.execute(sql, row[i])
    connection.commit()
    time.sleep(86400)
If not, create a new Python program in the startup folder like this:
import time, os
while 1:
    if time.ctime()[11:13] >= "update hour" and time.ctime()[0:4] != open("path/to/any_file.txt").read():
        file = open("path/to/any_file.txt", "w")
        file.write(time.ctime()[0:4])
        file.close()
        os.system("python /path/to/file.py")
A task scheduler like Azure WebJobs will do this for you.
In Python version 2.7.6
Pandas version 0.18.1
MySQL 5.7
import MySQLdb as dbapi
import sys
import csv
import os
import sys, getopt
import pandas as pd
df = pd.read_csv('test.csv')
rows = df.apply(tuple, 1).unique().tolist()
db=dbapi.connect(host=dbServer,user=dbUser,passwd=dbPass)
cur=db.cursor()
for (CLIENT_ID, PROPERTY_ID, YEAR) in rows:
    INSERT_QUERY = ("INSERT INTO {DATABASE}.TEST SELECT * FROM {DATABASE}_{CLIENT_ID}.TEST WHERE PROPERTY_ID = {PROPERTY_ID} AND YEAR = {YEAR};".format(
        CLIENT_ID=CLIENT_ID,
        PROPERTY_ID=PROPERTY_ID,
        YEAR=YEAR,
        DATABASE=DATABASE
    ))
    print INSERT_QUERY
    cur.execute(INSERT_QUERY)
    db.query(INSERT_QUERY)
This prints out the query I am looking for; however, the INSERT INTO has no effect when I check the results in MySQL:
INSERT INTO test.TEST SELECT * FROM test_1.TEST WHERE PROPERTY_ID = 1 AND YEAR = 2015;
However, if I just copy and paste this query into the MySQL GUI, it executes without any problem. Could anyone enlighten me?
I also tried the following
cur.execute(INSERT_QUERY, multi=True)
Returns an error
TypeError: execute() got an unexpected keyword argument 'multi'
The answer here is that we need to use mysql.connector and commit the changes with a db.commit(). Here is a good example:
http://www.mysqltutorial.org/python-mysql-insert/
import MySQLdb as dbapi
import mysql.connector
import sys
import csv
import os
import sys, getopt
import pandas as pd
df = pd.read_csv('test.csv')
rows = df.apply(tuple, 1).unique().tolist()
db=dbapi.connect(host=dbServer,user=dbUser,passwd=dbPass)
cur=db.cursor()
conn = mysql.connector.connect(host=dbServer,user=dbUser,port=dbPort,password=dbPass)
cursor=conn.cursor()
for (CLIENT_ID, PROPERTY_ID, YEAR) in rows:
    INSERT_QUERY = ("INSERT INTO {DATABASE}.TEST SELECT * FROM {DATABASE}_{CLIENT_ID}.TEST WHERE PROPERTY_ID = {PROPERTY_ID} AND YEAR = {YEAR};".format(
        CLIENT_ID=CLIENT_ID,
        PROPERTY_ID=PROPERTY_ID,
        YEAR=YEAR,
        DATABASE=DATABASE
    ))
    print INSERT_QUERY
    cursor.execute(INSERT_QUERY)
    conn.commit()
Only by committing will the database/table changes be persisted.
I was using a mysql-connector connection pool, trying to insert a new row into a table, and got the same problem. Version info: MySQL 8, Python 3.7.
The solution is to call connection.commit() at the end, even if you didn't explicitly start a transaction.
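A minimal sketch of that, assuming a hypothetical pool configuration and reusing the TEST table from above; the key line is the commit() after the execute:

from mysql.connector import pooling

# Hypothetical pool configuration; host/user/password/database are placeholders.
pool = pooling.MySQLConnectionPool(
    pool_name="mypool",
    pool_size=5,
    host="localhost",
    user="user",
    password="password",
    database="test",
)

conn = pool.get_connection()
try:
    cursor = conn.cursor()
    cursor.execute(
        "INSERT INTO TEST (PROPERTY_ID, YEAR) VALUES (%s, %s)",
        (1, 2015),
    )
    # Without this commit the insert is not persisted, because autocommit
    # is off by default in mysql-connector.
    conn.commit()
    cursor.close()
finally:
    conn.close()  # returns the connection to the pool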