I am running an AWS Glue job to execute stored procedures in an Oracle database. I want to be able to catch the SQL exception when a stored procedure fails. I am using 'from py4j.java_gateway import java_import' to set up the connection and execute SQL commands on the connection.
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
import boto3
############################################
### UPDATE THE STORE PROCEDURE NAME HERE ###
sp_names = [
'DEV_WF_POC1',
'DEV_WF_POC2'
]
############################################
#Set the connection name (Will be replaced for FT and PROD by powershell deployment script)
glue_connection_name = 'dw-dev-connection'
#Use systems args to return job name and pass to local variable
args = getResolvedOptions(sys.argv, ['JOB_NAME','WORKFLOW_NAME', 'WORKFLOW_RUN_ID'])
workflow_name = args['WORKFLOW_NAME']
workflow_run_id = args['WORKFLOW_RUN_ID']
glue_job_name = args['JOB_NAME']
#Create spark handler and update status of glue job
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(glue_job_name, args)
job.commit()
logger = glueContext.get_logger()
glue_client = boto3.client('glue')
#Extract connection details from Data Catalog
source_jdbc_conf = glueContext.extract_jdbc_conf(glue_connection_name)
#Import Python for Java Java.sql libs
from py4j.java_gateway import java_import
java_import(sc._gateway.jvm,"java.sql.Connection")
java_import(sc._gateway.jvm,"java.sql.DatabaseMetaData")
java_import(sc._gateway.jvm,"java.sql.DriverManager")
java_import(sc._gateway.jvm,"java.sql.SQLException")
#Extract the URL from the JDBC connection
oracleurl = source_jdbc_conf.get('url')
# Update connection string to expected Oracle format
oracleurl = oracleurl.replace("oracle://", "oracle:thin:@")
oracleurl = oracleurl + ':orcl'
#Create the connection to the Oracle database with java.sql
conn = sc._gateway.jvm.DriverManager.getConnection(oracleurl, source_jdbc_conf.get('user'), source_jdbc_conf.get('password'))
#Change autocommit to false to avoid Table lock error
conn.setAutoCommit(False);
# error dict
errs = {}
err = ''
try:
    for sp_name in sp_names:
        #Prepare call storeproc statement and execute
        cstmt = conn.prepareCall("{call reporting." + sp_name + "}");
        results = cstmt.execute();
        conn.commit();
# capture error
except Exception as e: # work on python 3.x
    ##errs['msg'] = str(sc._gateway.jvm.SQLException.getMessage())- doesn't work
    errs['error'] = str(e)
    errs['sp_name'] = sp_name
    errs['error_type'] = str(type(e)).replace("<class '","").replace("'>","")
if len(errs) != 0:
    stmt = conn.createStatement();
    sql = "insert into dev_workflow_errors (timestamp, workflow_id, workflow_name, job_name, sp_name, error_type, error) values (current_timestamp, '" + workflow_run_id + "', '" + workflow_name + "', '" + glue_job_name + "', '" + errs['sp_name'] + "', '" + errs['error_type'] + "', '" + errs['error'] + "')"
    rs = stmt.executeUpdate(sql);
    conn.commit();
    #sys.exit(1)
#sys.exit(1)
#Close down the connection
conn.close();
#Update Logger
logger.info("Finished")
I tried the Pythonic 'try' and 'except' approach, but for the base exception I just get the full 'py4j.protocol.Py4JJavaError' error message. Inside this message are the database-specific error messages I want to extract.
Can I use 'java_import(sc._gateway.jvm,"java.sql.SQLException")' in any way to extract database-specific errors from the execute function?
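One approach that should work (a sketch, assuming the exception raised by cstmt.execute() is py4j's Py4JJavaError): the error object exposes the underlying Java throwable as e.java_exception, so you can call the java.sql.SQLException methods on it directly instead of going through the imported class:

from py4j.protocol import Py4JJavaError

try:
    for sp_name in sp_names:
        cstmt = conn.prepareCall("{call reporting." + sp_name + "}")
        cstmt.execute()
        conn.commit()
except Py4JJavaError as e:
    # e.java_exception is the Java throwable thrown by the Oracle JDBC driver
    java_exc = e.java_exception
    errs['sp_name'] = sp_name
    errs['error_type'] = java_exc.getClass().getName()  # e.g. java.sql.SQLException
    errs['error'] = java_exc.getMessage()                # the ORA-xxxxx text from the database
    # if the throwable is a java.sql.SQLException you can also read:
    # java_exc.getErrorCode(), java_exc.getSQLState()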
Related
I am trying to extract data from SQL Server and load it into PostgreSQL. The SQL query runs fine in SQL Server, so the connection doesn't seem to be the problem.
The error message is:
Data extract error: ('42000', "[42000] [Microsoft][SQL Server Native Client 11.0][SQL Server]Incorrect syntax near ':'. (102) (SQLExecDirectW)")
# import needed libraries
from sqlalchemy import create_engine
import pyodbc
import pandas as pd
import os
# get password from environment var
pwd = os.environ['PGPASS']
uid = os.environ['PGUID']
# sql db details
driver = "{SQL Server Native Client 11.0}"
server = "DESKTOP-7CS8HNG"
database = "AdventureWorksDW2019;"
# extract data from sql server
def extract():
    try:
        src_conn = pyodbc.connect(
            'DRIVER=' + driver + ';SERVER=' + server + '\\SQLEXPRESS' + ';DATABASE=' + database +
            ';UID=' + uid + ';PWD=' + pwd)
        src_cursor = src_conn.cursor()
        # execute query
        src_cursor.execute(""" select t.name as table_name
            from sys.tables t where t.name in ('DimProduct','DimProductSubcategory',
            'DimProductSubcategory','DimProductCategory','DimSalesTerritory','FactInternetSales') """)
        src_tables = src_cursor.fetchall()
        for tbl in src_tables:
            # query and load save data to dataframe
            df = pd.read_sql_query(f'select * from {tbl[0]}', src_conn)
            load(df, tbl[0])
    except Exception as e:
        print("Data extract error: " + str(e))
    finally:
        src_conn.close()
# load data to postgres
def load(df, tbl):
    try:
        rows_imported = 0
        engine = create_engine(f'postgresql://{uid}:{pwd}@{server}:5432/AdventureWorks')
        print(f'importing rows {rows_imported} to {rows_imported + len(df)}... for table {tbl}')
        # save df to postgres
        df.to_sql(f'stg_{tbl}', engine, if_exists='replace', index=False)
        rows_imported += len(df)
        # add elapsed time to final print out
        print("Data imported successful")
    except Exception as e:
        print("Data load error: " + str(e))
try:
    # call extract function
    extract()
except Exception as e:
    print("Error while extracting data: " + str(e))
I have written code to pick specific values from an email body and store them in a dataframe. The next step is to store those values in an Oracle database. For that I am using SQLAlchemy, but I am not sure how I can pass those values to a stored procedure call like the one below:
call CORE_VALUATIONS.VALUATIONS.INSERTEQCLOSINGPRICE("SGEPSBSH",to_date('"&TEXT(2022-06-01,"DDMMMYYYY")&"','ddmonyyyy'),"111.9852",NULL,NULL);
from sqlalchemy.engine import create_engine
import datetime
today = datetime.date.today()
DIALECT = 'oracle'
SQL_DRIVER = 'cx_oracle'
USERNAME = 'robinhood' # enter your username
PASSWORD = 'XXXXXX' # enter your password
HOST = 'pv-prod-orc-01.XXXXX.com' # enter the oracle db host url
PORT = 1521 # enter the oracle port number
SERVICE = 'XXXX_APP.ec2.internal' # enter the oracle db service name
ENGINE_PATH_WIN_AUTH = DIALECT + '+' + SQL_DRIVER + '://' + USERNAME + ':' + PASSWORD + '@' + HOST + ':' + str(PORT) + '/?service_name=' + SERVICE
engine = create_engine(ENGINE_PATH_WIN_AUTH)
# test query
query = """
call CORE_VALUATIONS.VALUATIONS.INSERTEQCLOSINGPRICE("SGEPSBSH",to_date('"&TEXT(2022-06-01,"DDMMMYYYY")&"','ddmonyyyy'),"111.9852",NULL,NULL);
"""
con = engine.connect()
outpt = con.execute(query)
con.close()
How can I pass those values to the stored procedure?
Call .execute(sql_text, dict_of_param_values), e.g., something like
import sqlalchemy as sa
# …
sql_text = sa.text(
    "call CORE_VALUATIONS.VALUATIONS.INSERTEQCLOSINGPRICE(:param1, :param2, :param3, NULL, NULL);"
)
dict_of_param_values = dict(param1="SGEPSBSH", param2="2022-06-01", param3=111.9852)
with engine.begin() as conn:
    conn.execute(sql_text, dict_of_param_values)
    # (transaction will automatically commit when `with` block exits)
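As a follow-up usage note (an assumption about the cx_oracle driver, not something stated in the question): the date argument can be bound as a Python date instead of going through to_date(), since cx_Oracle sends datetime.date values as Oracle DATEs:

import datetime

dict_of_param_values = dict(
    param1="SGEPSBSH",
    param2=datetime.date(2022, 6, 1),  # bound as an Oracle DATE by the driver
    param3=111.9852,
)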
I want to collect and check what errors are occurring, so I am trying to upload logs to the database. I wrote the code to upload the log to MySQL by referring to this page: python logging to database.
However, I get the following error. Which part is wrong? Also, if there is another way to easily upload logs to MySQL, please let me know.
import logging
import time
import pymysql
user = 'test'
passw = '******'
host = 'db'
port = ****
database = '****'
db_tbl_log = 'log'
log_file_path = 'C:\\Users\\Desktop\\test_log.txt'
log_error_level = 'DEBUG' # LOG error level (file)
log_to_db = True # LOG to database?
class LogDBHandler(logging.Handler):
    '''
    Customized logging handler that puts logs to the database.
    pymssql required
    '''
    def __init__(self, sql_conn, sql_cursor, db_tbl_log):
        logging.Handler.__init__(self)
        self.sql_cursor = sql_cursor
        self.sql_conn = sql_conn
        self.db_tbl_log = db_tbl_log

    def emit(self, record):
        # Set current time
        tm = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(record.created))
        # Clear the log message so it can be put to db via sql (escape quotes)
        self.log_msg = record.msg
        self.log_msg = self.log_msg.strip()
        self.log_msg = self.log_msg.replace('\'', '\'\'')
        # Make the SQL insert
        sql = 'INSERT INTO ' + self.db_tbl_log + ' (log_level, ' + \
              'log_levelname, log, created_at, created_by) ' + \
              'VALUES (' + \
              '' + str(record.levelno) + ', ' + \
              '\'' + str(record.levelname) + '\', ' + \
              '\'' + str(self.log_msg) + '\', ' + \
              '(convert(datetime2(7), \'' + tm + '\')), ' + \
              '\'' + str(record.name) + '\')'
        try:
            self.sql_cursor.execute(sql)
            self.sql_conn.commit()
        # If error - print it out on screen. Since DB is not working - there's
        # no point making a log about it to the database :)
        except pymysql.Error as e:
            print("error: ", e)
            # print(sql)
            # print('CRITICAL DB ERROR! Logging to database not possible!')
# Main settings for the database logging use
if (log_to_db):
    # Make the connection to database for the logger
    log_conn = pymysql.connect(host=host,
                               port=port,
                               user=user,
                               password=passw,
                               database=database,
                               charset='utf8')
    log_cursor = log_conn.cursor()
    logdb = LogDBHandler(log_conn, log_cursor, db_tbl_log)
# Set logger
logging.basicConfig(filename=log_file_path)
# Set db handler for root logger
if (log_to_db):
    logging.getLogger('').addHandler(logdb)
# Register MY_LOGGER
log = logging.getLogger('MY_LOGGER')
log.setLevel(log_error_level)
# Example variable
test_var = 'This is test message'
# Log the variable contents as an error
log.error('This error occurred: %s' % test_var)
error: (1064, "You have an error in your SQL syntax; check the manual that corresponds to your MySQL server version for the right syntax to use near ''2021-02-22 16:52:06')), 'MY_LOGGER')' at line 1")
Don't format SQL statements yourself; you will miss a lot of cases (the immediate syntax error here comes from convert(datetime2(7), ...), which is SQL Server syntax that MySQL does not understand). Just pass the values as the second parameter:
sql = f'INSERT INTO {self.db_tbl_log} (log_level, log_levelname, log, created_at, created_by) VALUES (%s, %s, %s, %s, %s)'
self.sql_cursor.execute(sql, (record.levelno, record.levelname, self.log_msg, tm, record.name))
%s is a placeholder; pymysql will convert the given params to valid formats one by one.
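One more detail worth checking in emit() (a sketch, not part of the original answer): record.msg is the raw format string, so a call like log.error('failed: %s', exc) would be stored unformatted. Letting the handler format the record, combined with the parameterized insert above, looks roughly like this:

def emit(self, record):
    # let logging expand %s-style arguments and apply any Formatter set on the handler
    log_msg = self.format(record)
    tm = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(record.created))
    sql = f'INSERT INTO {self.db_tbl_log} (log_level, log_levelname, log, created_at, created_by) VALUES (%s, %s, %s, %s, %s)'
    try:
        self.sql_cursor.execute(sql, (record.levelno, record.levelname, log_msg, tm, record.name))
        self.sql_conn.commit()
    except pymysql.Error as e:
        print("error: ", e)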
I'm downloading data from an API and storing it in a SQLite db. I want to implement the process using multithreading. Can someone please help me with how to implement it?
I found a library but am getting an error. Below is the code.
import sqlite3
import os
import pandas as pd
from sodapy import Socrata
import concurrent.futures
dbPath = 'folder where db exists'
dbName = 'db file name'
## Setup connection & cursor with the DB
dbConn = sqlite3.connect(os.path.join(dbPath, dbName), check_same_thread=False)
## Setup the API and bring in the data
client = Socrata("health.data.ny.gov", None)
## Define all the countys to be used in threading
countys = [all 62 countys in New York]
varDict = {county: {} for county in countys}  # one separate dict per county
strDataList = ['test_date', 'LoadDate']
intDataList = ['new_positives', 'cumulative_number_of_positives', 'total_number_of_tests', 'cumulative_number_of_tests']
def getData(county):
    ## Check if table exists
    print("Processing ", county)
    varDict[county]['dbCurs'] = dbConn.cursor()
    varDict[county]['select'] = varDict[county]['dbCurs'].execute('SELECT name FROM sqlite_master WHERE type="table" AND name=?', (county,) )
    if not len(varDict[county]['select'].fetchall()):
        createTable(county)
    whereClause = 'county="'+county+'"'
    varDict[county]['results'] = client.get("xdss-u53e", where=whereClause)
    varDict[county]['data'] = pd.DataFrame.from_records(varDict[county]['results'])
    varDict[county]['data'].drop(['county'], axis=1, inplace=True)
    varDict[county]['data']['LoadDate'] = pd.to_datetime('now')
    varDict[county]['data'][strDataList] = varDict[county]['data'][strDataList].astype(str)
    varDict[county]['data']['test_date'] = varDict[county]['data']['test_date'].apply(lambda x: x[:10])
    varDict[county]['data'][intDataList] = varDict[county]['data'][intDataList].astype(int)
    varDict[county]['data'] = varDict[county]['data'].values.tolist()
    ## Insert values into SQLite
    varDict[county]['sqlQuery'] = 'INSERT INTO ['+county+'] VALUES (?,?,?,?,?,?)'
    varDict[county]['dbCurs'].executemany(varDict[county]['sqlQuery'], varDict[county]['data'])
    dbConn.commit()

# for i in dbCurs.execute('SELECT * FROM albany'):
#     print(i)

def createTable(county):
    sqlQuery = 'CREATE TABLE ['+county+'] ( [Test Date] TEXT, [New Positives] INTEGER NOT NULL, [Cumulative Number of Positives] INTEGER NOT NULL, [Total Number of Tests Performed] INTEGER NOT NULL, [Cumulative Number of Tests Performed] INTEGER NOT NULL, [Load date] TEXT NOT NULL, PRIMARY KEY([Test Date]))'
    varDict[county]['dbCurs'].execute(sqlQuery)
# for _ in countys:
# getData(_)
# x = countys[:5]
with concurrent.futures.ThreadPoolExecutor() as executor:
    # results = [executor.submit(getData, y) for y in x]
    executor.map(getData, countys)
getData is the function which brings in the data county by county and loads it into the db. countys is a list of all the counties. I am able to do it synchronously but would like to implement multithreading.
The for loop to do it synchronously (which works) is:
for _ in countys:
    getData(_)
The error message is
ProgrammingError: SQLite objects created in a thread can only be used in that same thread. The object was created in thread id 8016 and this is thread id 19844.
You might find this useful
sqlite3.connect(":memory:", check_same_thread=False)
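check_same_thread=False only disables the safety check; sqlite3 connections and cursors are still not meant to be shared across threads. A minimal sketch of an alternative, reusing the names from the question (and assuming each county can be loaded independently and that createTable is adjusted to take a cursor): open one connection per call to getData, so every worker thread owns its own SQLite objects.

def getData(county):
    # each thread creates and uses its own connection/cursor
    conn = sqlite3.connect(os.path.join(dbPath, dbName))
    curs = conn.cursor()
    if not curs.execute('SELECT name FROM sqlite_master WHERE type="table" AND name=?', (county,)).fetchall():
        createTable(curs, county)  # assumed variant of createTable that uses the passed-in cursor
    results = client.get("xdss-u53e", where='county="' + county + '"')
    df = pd.DataFrame.from_records(results)
    df.drop(['county'], axis=1, inplace=True)
    df['LoadDate'] = pd.to_datetime('now')
    df[strDataList] = df[strDataList].astype(str)
    df['test_date'] = df['test_date'].apply(lambda x: x[:10])
    df[intDataList] = df[intDataList].astype(int)
    curs.executemany('INSERT INTO [' + county + '] VALUES (?,?,?,?,?,?)', df.values.tolist())
    conn.commit()
    conn.close()

with concurrent.futures.ThreadPoolExecutor() as executor:
    executor.map(getData, countys)

Keep in mind that SQLite serializes writers, so the threads mostly speed up the API calls rather than the inserts.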
Using the script below, I am trying to fetch documents added in the last hour for multiple collections, but it's giving me a zero value.
Can someone look at the code below and help me fix it?
import pymongo
import sys
from datetime import datetime
from datetime import timedelta
from pymongo import MongoClient
# establish connectivity to Mongodb via ssl using pymongo module
#args = sys.argv
host = 'mongo-db-prd'
uname = 'superuser'
passwrd = 'Hayyo'
#print (args)
port = "27017"
print(uname)
print(passwrd)
uri = 'mongodb://' + uname + ":" + passwrd + "@" + host + ":" + port + '/?authSource=admin'
client = MongoClient(uri, ssl=True, ssl_ca_certs='./files/rds-combined-ca-bundle.pem')
# This will create hl7feeds docdb
print("connected client")
db = client.feeds # This command will create a DB
print(client.list_database_names()) # This command will print list of DBs
mycol = db[ "feeds_100"] # This command will create a collection in DB
docins=mycol.insert_one({"name" : "test"}) # This will insert a document in collection
dblist = client.list_database_names()
print(client.list_database_names())
# Lets create collections on docdb for all tenants
tlist1 = ["feeds_104","feeds_105","feeds_106"]
for each_val in tlist1:
    print (each_val)
    countvalue = db.getCollection('each_val').find({"row_created_date":{"$gt":datetime.utcnow() - timedelta(hours=1)}}).count();
    print (countvalue)
In the above query, using the db[collection-name] method I can get results.
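Building on that last observation, a sketch of the corrected loop: db.getCollection(...) is mongo shell syntax, so in PyMongo you index the database with the loop variable to get the actual collection. count_documents is used here because Cursor.count() is deprecated, and row_created_date is assumed to be stored as a BSON date.

from datetime import datetime, timedelta

one_hour_ago = datetime.utcnow() - timedelta(hours=1)
for each_val in tlist1:
    # db[each_val] resolves the collection name from the loop variable
    countvalue = db[each_val].count_documents({"row_created_date": {"$gt": one_hour_ago}})
    print(each_val, countvalue)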