Luigi considers the task finished even when the object is empty and no output is created:
import luigi
import luigi.contrib.azureblob as ab
import luigi.format as fm
import pandas as pd
import sqlalchemy
DB_ACCT = '*-*-*' # database credentials
STGE_ACCT = '-*-*-' # storage credentials
class MyTask(luigi.Task):
    ref_date = luigi.DateParameter()
    #finished = False

    def run(self):
        eng = sqlalchemy.create_engine(DB_ACCT)
        query = f"""SELECT * FROM tbl WHERE date_column = DATE '{self.ref_date}'"""
        with eng.connect() as conn, conn.begin():
            df = pd.read_sql(query, conn)
        with self.output().open('w') as f:
            df.to_parquet(f)

    def output(self):
        path_blob = f'{self.ref_date}.parquet'
        return ab.AzureBlobTarget('container-name', path_blob, STGE_ACCT, fm.Nop)
Is this problematic in terms of luigi's expected behavior?
Appreciate any help.
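For context, Luigi's default complete() only checks whether every target returned by output() exists, so once a blob is written (even an empty one) the task counts as done on later runs. A minimal sketch of one way to guard run(), assuming you want an empty query result to keep the task incomplete rather than produce an empty blob:

    def run(self):
        eng = sqlalchemy.create_engine(DB_ACCT)
        query = f"""SELECT * FROM tbl WHERE date_column = DATE '{self.ref_date}'"""
        with eng.connect() as conn, conn.begin():
            df = pd.read_sql(query, conn)

        # Fail loudly instead of writing an empty blob; the target is never
        # created, so complete() stays False and the task can be retried later.
        if df.empty:
            raise ValueError(f'No rows found for {self.ref_date}; not writing output')

        with self.output().open('w') as f:
            df.to_parquet(f)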
Related
I have a config file that holds different variables for different sources (Oracle and Mongo) across different environments, and the ConfigParser is defined in a module named config_utils.
I have the code below in a db_conn module in my Python framework. I would appreciate suggestions on how to improve the code to follow best practices.
Note: the ConnectorX module will be used whenever I want to execute SQL, and that module has no open or close connection methods.
db_conn.py
from pymongo import MongoClient
import config.config_utils as config
import reusables.dataframes.mongo_utils as mongo_utils

oracle_conn_str = oracle_host = ''
schema_name = table_name = obj_name = batch_name = pk = sur_key = oracle_audit_col_lst = ''
max_updated_date = ''
mongo_job_lst = mongo_raw_job_col_lst = []
mongo_db_name = mongo_collection_name = ''

#------ Oracle -----
def oracle():
    global oracle_conn_str, oracle_host
    global schema_name, table_name, obj_name, batch_name, pk, sur_key, oracle_audit_col_lst
    global max_updated_date
    oracle_conn_str = config.test_env['oracle_conn_str']
    # Similarly assigning values to other oracle related variables from config file

#------ Mongo ------
def mongo_conn_open():
    global mongo_collection_name, mongo_db_name, mongo_job_lst, mongo_raw_job_col_lst
    conn_str = config.test_env['mongo_con_string']
    mongo_client = MongoClient(conn_str)
    # Similarly assigning values to other mongo related variables from config file
    return mongo_client

def mongo_query():
    df = mongo_utils.mongo_df()
    return df

def mongo_conn_close(client):
    client.close()

def mongo():
    client = mongo_conn_open()
    df = mongo_query()
    mongo_conn_close(client)
    return df
mongo.py
import pandas as pd
import reusables.db.db_conn as db_conn

# Function to query in Mongo DB
def base_df():
    db = db_conn.mongo_db_name
    # mongo_query
    return df

# Function to do the df operations/manipulations in the base_df
def mongo_df():
    df = base_df()
    # df operations
    df.reset_index(inplace=True)
    return df
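For what it's worth, one commonly suggested direction for this kind of module (a sketch, not a prescribed best practice) is to replace the module-level globals with a small helper that owns the connection lifecycle. The names mongo_client and get_mongo_df below are hypothetical; the config key is taken from the snippet above:

from contextlib import contextmanager

import pandas as pd
from pymongo import MongoClient

import config.config_utils as config

@contextmanager
def mongo_client():
    # Build the client from the configured connection string and make sure it
    # is closed even if the caller raises, instead of tracking it in globals.
    client = MongoClient(config.test_env['mongo_con_string'])
    try:
        yield client
    finally:
        client.close()

def get_mongo_df(db_name, collection_name):
    # Hypothetical helper: fetch documents and return them as a DataFrame,
    # taking the db/collection names as arguments rather than globals.
    with mongo_client() as client:
        docs = list(client[db_name][collection_name].find())
    return pd.DataFrame(docs)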
I am getting this error when trying to send an SNS email via a Lambda function:
"errorMessage": "Connect timeout on endpoint URL: \"https://sns.us-west-1.amazonaws.com/\"",
"errorType": "ConnectTimeoutError"
I have all the policies set up, with SNS full access on the role tied to the function. Here is the full function:
import json
import psycopg2
import boto3
import time
import requests
import pandas as pd
import numpy as np
from datetime import datetime
import sys
import logging
import os
import csv
import smtplib
from base64 import b64decode
#bucket = 's3://data-lake-020192/'
credential = {
    'dbname': 'main',
    'host_url': 'test.us-west-1.redshift.amazonaws.com',
    'port': '5439',
    'user': '####',
    'password': '########'
}

redshift_role = {
    'dev': 'arn:aws:lambda:us-west-1:##########:function:test_function'
}

def lambda_handler(event, context):
    ## S3 CONNECTIVITY ##
    s3 = boto3.resource('s3')
    #client = boto3.client('s3')

    # TODO implement
    conn_string = "dbname='{}' port='{}' user='{}' password='{}' host='{}'"\
        .format(credential['dbname'], credential['port'], credential['user'], credential['password'], credential['host_url'])

    sql_query = """with
    tbl as (
        select
            case
                when (sa.parentid like '001i0000023STBY%' or sa.ultimate_parent_account__c like '001i0000023STBY%') --Parent OR Ultimate Parent is <Department of Defense>
                    then sa.id
                else
                    coalesce(sa.ultimate_parent_account__c, sa.parentid, sa.id) end as cust_id,
            (select name from salesforce.account where id=cust_id) as cust_name,
            sa.name as acct_name,
            sa.id as acct_id,
            sa.parentid,
            (select name from salesforce.account where id=sa.parentid) as par_name,
            (select name from salesforce.account where id=sa.ultimate_parent_account__c) as ult_par_name,
            so.id as opp_id,
            so.name as opp_name,
            so.stagename as stg_name,
            so.type as opp_type,
            so.Manager_Commit__c as mgr_commit,
            so.renewal_risk__c as opp_risk,
            so.isclosed as cls
        from
            salesforce.opportunity so
        join
            salesforce.account sa on
                so.accountid = sa.id
        join salesforce.user su on
            so.ownerid = su.id
        join salesforce.opportunitylineitem sol on
            so.id = sol.opportunityid
        join salesforce.product2 sp on
            sol.product2id = sp.id
        join salesforce.customasset__c sca on
            so.id = sca.opportunity__c
        where
            so.isdeleted = false
            and sa.isdeleted = false
            and sol.isdeleted = false
    )
    select * from
        (select
            tbl.acct_name as acct,
            '[' || 'Link' || '](' || concat('https://vectranetworks.lightning.force.com/', tbl.opp_id) || ')' as opp_link,
            tbl.ca_name,
            tbl.ca_pr_name,
            tbl.ca_mode,
            date(tbl.ca_last_seen) as ca_last_seen,
            tbl.ca_sw_version,
            tbl.ca_tot_hosts,
            tbl.ca_active_hosts,
            tbl.ca_x95_hosts_tot,
            tbl.ca_traffic,
            tbl.ca_uiconfig
        from
            tbl
        where
            tbl.stg_name like 'Closed Won%'
            and tbl.arr is not null
        group by
            tbl.acct_name,
            tbl.opp_id,
            tbl.ca_name,
            tbl.ca_pr_name,
            tbl.ca_mode,
            tbl.ca_last_seen,
            tbl.ca_sw_version,
            tbl.ca_tot_hosts,
            tbl.ca_active_hosts,
            tbl.ca_x95_hosts_tot,
            tbl.ca_traffic,
            tbl.ca_uiconfig) df
    WHERE ca_last_seen >= DATEADD(MONTH, -3, GETDATE())
    limit 5"""

    con = psycopg2.connect(conn_string)
    client2 = boto3.client('sns')

    with con.cursor() as cur:
        # Enter the query that you want to execute
        cur.execute(sql_query)
        for row in cur:
            df = pd.DataFrame.from_records(cur.fetchall(), columns=[desc[0] for desc in cur.description])
            df['Time_Stamp'] = pd.to_datetime('now', utc=True)
            df['ca_active_hosts'] = df['ca_active_hosts'].astype('Int64', errors='ignore')
            df['ca_active_hosts'].fillna(0, inplace=True)
            #print(df.iloc[0])
            #if (df.iloc[0]['ca_active_hosts'].notna()):
            if (df['ca_active_hosts'] >= 0).all():
                print('the file is present, going to send notification')
                response = client2.publish(
                    TopicArn='arn:aws:sns:us-west-1:##########:email-data-lake',
                    Message='Warning User active_hosts is ' + str(df['Time_Stamp']),
                    Subject='User Warning')
            else:
                print('the file is not present')
    #cur.close()
Is there anything else in the code or connection that I need to change? I feel I have exhausted everything I can find online, being new to SNS.
I imagine that your Lambda function does not have any internet connectivity.
Thus, a connection timeout indicates that the network interface associated with your Lambda function is unable to reach the service.
To fix this, create a VPC interface endpoint for SNS (sns.us-west-1.amazonaws.com) in the same subnet as the Lambda function's network interface.
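For illustration, a minimal sketch of creating such an endpoint with boto3; the VPC, subnet, and security group IDs are placeholders you would replace with your own:

import boto3

ec2 = boto3.client('ec2', region_name='us-west-1')

# Create an interface endpoint so the Lambda's subnet can reach SNS privately,
# without a NAT gateway or public internet access.
response = ec2.create_vpc_endpoint(
    VpcEndpointType='Interface',
    ServiceName='com.amazonaws.us-west-1.sns',
    VpcId='vpc-0123456789abcdef0',               # placeholder
    SubnetIds=['subnet-0123456789abcdef0'],      # subnet used by the Lambda's ENI (placeholder)
    SecurityGroupIds=['sg-0123456789abcdef0'],   # must allow HTTPS (443) from the Lambda (placeholder)
    PrivateDnsEnabled=True,                      # so sns.us-west-1.amazonaws.com resolves to the endpoint
)
print(response['VpcEndpoint']['VpcEndpointId'])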
I am working in a Google Cloud Function with the intention of putting the results in a DataFrame and then porting all of that into BigQuery. My function deploys without error, but when I look at the associated BigQuery table I see no data. Below is a view of my code:
# general setup, common imports
import json, requests, time, urllib.parse
import pandas as pd
from pandas import DataFrame
import datetime
import io
import os
from google.cloud import bigquery
from google.cloud.bigquery.client import Client
def crux_data():
    # Read the URLs for auditing
    url_list = open('pagespeedlist', 'r')
    url_list.read()

    results = []
    for x in url_list:
        url = x[0]
        pagespeed_results = urllib.request.urlopen('https://www.googleapis.com/pagespeedonline/v5/runPagespeed?url={}&strategy=mobile&key=API_KEY'\
            .format(url)).read().decode('UTF-8')
        pagespeed_results_json = json.loads(pagespeed_results)

        add_date = datetime.date.today()
        largest_contentful_paint = pagespeed_results_json['lighthouseResult']['audits']['largest-contentful-paint']['displayValue'].replace(u'\xa0', u'')  # Largest Contentful Paint
        first_input_delay = str(round(pagespeed_results_json['loadingExperience']['metrics']['FIRST_INPUT_DELAY_MS']['distributions'][2]['proportion'] * 1000, 1)) + 'ms'  # First Input Delay
        cumulative_layout_shift = pagespeed_results_json['lighthouseResult']['audits']['cumulative-layout-shift']['displayValue']  # CLS
        crux_lcp = pagespeed_results_json['loadingExperience']['metrics']['LARGEST_CONTENTFUL_PAINT_MS']['category']  # Largest Contentful Paint Score
        crux_fid = pagespeed_results_json['loadingExperience']['metrics']['FIRST_INPUT_DELAY_MS']['category']  # First Input Delay Score
        crux_cls = pagespeed_results_json['loadingExperience']['metrics']['CUMULATIVE_LAYOUT_SHIFT_SCORE']['category']  # CLS Score

        result_url = [url, date, largest_contentful_paint, first_input_delay, cumulative_layout_shift, lcp_score, fid_score, cls_score]
        results.append(result_url)

    # Convert to dataframe
    results_csv = DataFrame(results, columns=['URL', 'DATE', 'LCP', 'FID', 'CLS', 'LCP_SCORE', 'FID_SCORE', 'CLS_SCORE'])

    # Construct a BigQuery client object.
    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'credentials.json'
    client = Client()

    # TODO(developer): Set table_id to the ID of the table to create.
    table_id = "db.datatable.dataLoc"

    job_config = bigquery.LoadJobConfig()
    job = client.load_table_from_dataframe(
        results_csv, table_id, job_config=job_config
    )  # Make an API request.
    job.result()  # Wait for the job to complete.

    table = client.get_table(table_id)  # Make an API request.
    print(
        "Loaded {} rows and {} columns to {}".format(
            table.num_rows, len(table.schema), table_id
        )
    )
I do see the proper schema in the BigQuery table but no actual data. Is there something I am missing when loading a DataFrame to BigQuery?
Any help is much appreciated!
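One thing worth noting here (an observation, not a confirmed diagnosis): url_list.read() consumes the whole file, so the for x in url_list: loop never yields a line, results stays empty, and the load job creates the schema with zero rows; url = x[0] would also take only the first character of each line. A minimal sketch of the reading step under that assumption:

# Read one URL per line, skipping blanks, instead of exhausting the file up front.
with open('pagespeedlist', 'r') as url_file:
    urls = [line.strip() for line in url_file if line.strip()]

results = []
for url in urls:
    # ... call the PageSpeed API for `url` and append a row to `results` as before ...
    pass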
I'm having an issue using Python 3's concurrent.futures ProcessPoolExecutor and its map function.
My code is this:
from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search
import psycopg2
import psycopg2.extensions
import psycopg2.extras
from asq import query
import select
import concurrent.futures
import asyncio
class UpdateElastic:
    def __init__(self):
        conn = psycopg2.connect(
            "dbname=db user=mad password=hat host=blah",
            async_=True
        )
        self.wait(conn)
        cur = conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor)
        cur.execute("SELECT * FROM table",)
        self.wait(cur.connection)
        self.report_files = cur.fetchall()
        cur.execute("SELECT * FROM othertable",)
        self.wait(cur.connection)
        self.payment_events = cur.fetchall()
        cur.close()
        conn.close()
        self.esconn = Elasticsearch([{'host': 'elasticsearch.example.com', 'port': 1234}])
        # pass

    def wait(self, conn):
        while 1:
            state = conn.poll()
            if state == psycopg2.extensions.POLL_OK:
                break
            elif state == psycopg2.extensions.POLL_WRITE:
                select.select([], [conn.fileno()], [])
            elif state == psycopg2.extensions.POLL_READ:
                select.select([conn.fileno()], [], [])
            else:
                raise psycopg2.OperationalError("poll() returned %s" % state)

    def get_es_indices(self):
        indices = self.esconn.indices.get_alias("digital-sales-csv*")
        return list(indices.keys())

    def update_documents(self, index, scroll_id=None):
        print(index)
        # return index
        # documents = _get_es_documents(conn, index)
        # print(documents['_scroll_id'])
        # scroll_id = documents['_scroll_id']
        # for document in documents['hits']['hits']:
        #     ids = {
        #         "report_id": document['_source']['report_id'],
        #         "payment_id": document['_source']['payment_id'],
        #         "document_id": document['_id']
        #     }
        #     asyncio.run(_update_es_document(conn, index, report_files, payment_events, ids))
        # update_documents(index, conn, report_files, payment_events, scroll_id)

def main():
    print('main called')
    print('instantiating UpdateElastic')
    us = UpdateElastic()
    print('UpdateElastic instantiated')
    print('setting up ProcessPoolExecutor')
    blah = ['abc', 'def', 'ghi']
    with concurrent.futures.ProcessPoolExecutor(max_workers=5) as executor:
        print('calling executor.map')
        executor.map(us.update_documents, blah, timeout=10)

if __name__ == "__main__":
    main()
With this code, all I'm expecting it to do is print out the values of the array that I've passed, so:
'abc'
'def'
'ghi'
However, after printing "calling executor.map", it hangs.
When I change my constructor to:
class UpdateElastic:
    def __init__(self):
        # conn = psycopg2.connect(
        #     "dbname=db user=mad password=hat host=blah",
        #     async_=True
        # )
        # self.wait(conn)
        # cur = conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor)
        # cur.execute("SELECT * FROM table",)
        # self.wait(cur.connection)
        # self.report_files = cur.fetchall()
        # cur.execute("SELECT * FROM othertable",)
        # self.wait(cur.connection)
        # self.payment_events = cur.fetchall()
        # cur.close()
        # conn.close()
        # self.esconn = Elasticsearch([{'host':'elasticsearch.example.com','port':1234}])
        pass
(containing only a "pass" in the constructor), it actually prints out the values of the array, as expected.
I'm running this on Python 3.7.3, on macOS Mojave 10.14.2.
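For what it's worth, a pattern that often sidesteps this kind of hang is to hand the pool a plain module-level function and only picklable data, so the worker processes never have to deal with an object holding live psycopg2/Elasticsearch connections; a minimal sketch under that assumption (each worker builds its own client):

import concurrent.futures

from elasticsearch import Elasticsearch

def update_documents(index):
    # Hypothetical reorganization: the connection is created inside the worker,
    # not shared from a parent-process object captured by a bound method.
    esconn = Elasticsearch([{'host': 'elasticsearch.example.com', 'port': 1234}])
    print(index)
    # ... scroll and update documents for this index ...

def main():
    indices = ['abc', 'def', 'ghi']
    with concurrent.futures.ProcessPoolExecutor(max_workers=5) as executor:
        # Consume the iterator so worker-side exceptions surface here instead of being lost.
        for _ in executor.map(update_documents, indices):
            pass

if __name__ == "__main__":
    main()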
I want to call an Oracle function that returns an object by using cx_Oracle's cursor.callfunc(), but this is not working.
Here is my code:
import cx_Oracle
import json
import web
urls = (
    "/", "index",
    "/grid", "grid",
)

app = web.application(urls, globals(), web.profiler)
web.config.debug = True

connection = cx_Oracle.Connection("TEST_3D/limo1013@10.40.33.160:1521/sdetest")
typeObj = connection.gettype("MDSYS.SDO_GEOMETRY")

class index:
    def GET(self):
        return "hallo moritz "

class grid:
    def GET(self):
        web.header('Access-Control-Allow-Origin', '*')
        web.header('Access-Control-Allow-Credentials', 'true')
        web.header('Content-Type', 'application/json')

        cursor = connection.cursor()
        cursor.arraysize = 10000  # default = 50
        cursor.execute("""SELECT a.id AS building_nr, c.Geometry AS geometry, d.Classname FROM building a, THEMATIC_SURFACE b, SURFACE_GEOMETRY c, OBJECTCLASS d WHERE a.grid_id_400 = 4158 AND a.id = b.BUILDING_ID AND b.LOD2_MULTI_SURFACE_ID = c.ROOT_ID AND c.GEOMETRY IS NOT NULL AND b.OBJECTCLASS_ID = d.ID""")

        obj = cursor.fetchone()
        obj = obj[1]
        print obj

        cursor.callfunc("SDO2GEOJSON", cx.Oracle.OBJECT, [obj])

# Run the app
if __name__ == "__main__":
    app.run(web.profiler)
Error message:
at /grid
global name 'cx' is not defined
But I am sure that cx_Oracle is installed correctly. Furthermore, I use import cx_Oracle at the beginning and that works.
What is wrong?
Simple typo. In the line
cursor.callfunc("SDO2GEOJSON", cx.Oracle.OBJECT, [obj])
you should use cx_Oracle.OBJECT instead of cx.Oracle.OBJECT.
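In other words, the corrected call looks like this (the installed module is cx_Oracle, while a bare cx is undefined, hence the "global name 'cx' is not defined" error):

# The module name is cx_Oracle, so the object type constant is cx_Oracle.OBJECT.
result = cursor.callfunc("SDO2GEOJSON", cx_Oracle.OBJECT, [obj])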