Python script to query data for the last 1 hour

Using the script below, I am trying to fetch the documents added in the last hour for multiple collections, but it is returning a count of zero.
Can someone look at the code below and help me fix it?
import pymongo
import sys
from datetime import datetime
from datetime import timedelta
from pymongo import MongoClient
# establish connectivity to Mongodb via ssl using pymongo module
#args = sys.argv
host = 'mongo-db-prd'
uname = 'superuser'
passwrd = 'Hayyo'
#print (args)
port = "27017"
print(uname)
print(passwrd)
uri = 'mongodb://' + uname + ":" + passwrd + "@" + host + ":" + port + '/?authSource=admin'
client = MongoClient(uri, ssl=True, ssl_ca_certs='./files/rds-combined-ca-bundle.pem')
# This will create hl7feeds docdb
print("connected client")
db = client.feeds # This command will create a DB
print(client.list_database_names()) # This command will print list of DBs
print(client.list_database_names()) # This command will print list of DBs
mycol = db["feeds_100"] # This command will create a collection in DB
docins=mycol.insert_one({"name" : "test"}) # This will insert a document in collection
dblist = client.list_database_names()
print(client.list_database_names())
# Lets create collections on docdb for all tenants
tlist1 = ["feeds_104","feeds_105","feeds_106"]
for each_val in tlist1:
    print(each_val)
    countvalue = db.getCollection('each_val').find({"row_created_date":{"$gt":datetime.utcnow() - timedelta(hours=1)}}).count();
    print(countvalue)

In the above query, if I use the db[collection_name] method instead, I can get results.
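A minimal sketch of the loop using that db[each_val] lookup (an assumption on my part, not the original poster's fix): db.getCollection('each_val') is mongo-shell syntax and also passes the literal string 'each_val', whereas indexing the database with the loop variable resolves each tenant collection. count_documents assumes pymongo 3.7+; otherwise the .find(...).count() form from the question applies.

one_hour_ago = datetime.utcnow() - timedelta(hours=1)
for each_val in tlist1:
    # look up the collection by the *value* of each_val, not the literal name 'each_val'
    countvalue = db[each_val].count_documents(
        {"row_created_date": {"$gt": one_hour_ago}}
    )
    print(each_val, countvalue)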

Related

Extract java.sql.SQLException from execute call in Python - AWS Glue

I am running an AWS Glue job to execute stored procedures in an Oracle database. I want to be able to catch the SQL exception when a stored procedure fails. I am using 'from py4j.java_gateway import java_import' to set up the connection and execute SQL commands on the connection.
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
import boto3
############################################
### UPDATE THE STORED PROCEDURE NAMES HERE ###
sp_names = [
    'DEV_WF_POC1',
    'DEV_WF_POC2'
]
############################################
#Set the connection name (will be replaced for FT and PROD by the powershell deployment script)
glue_connection_name = 'dw-dev-connection'
#Use systems args to return job name and pass to local variable
args = getResolvedOptions(sys.argv, ['JOB_NAME','WORKFLOW_NAME', 'WORKFLOW_RUN_ID'])
workflow_name = args['WORKFLOW_NAME']
workflow_run_id = args['WORKFLOW_RUN_ID']
glue_job_name = args['JOB_NAME']
#Create spark handler and update status of glue job
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(glue_job_name, args)
job.commit()
logger = glueContext.get_logger()
glue_client = boto3.client('glue')
#Extract connection details from the Data Catalog
source_jdbc_conf = glueContext.extract_jdbc_conf(glue_connection_name)
#Import Python for Java Java.sql libs
from py4j.java_gateway import java_import
java_import(sc._gateway.jvm,"java.sql.Connection")
java_import(sc._gateway.jvm,"java.sql.DatabaseMetaData")
java_import(sc._gateway.jvm,"java.sql.DriverManager")
java_import(sc._gateway.jvm,"java.sql.SQLException")
#Extract the URL from the JDBC connection
oracleurl = source_jdbc_conf.get('url')
# Update connection string to expected Oracle format
oracleurl = oracleurl.replace("oracle://", "oracle:thin:@")
oracleurl = oracleurl + ':orcl'
#Create the connection to the Oracle database with java.sql
conn = sc._gateway.jvm.DriverManager.getConnection(oracleurl, source_jdbc_conf.get('user'), source_jdbc_conf.get('password'))
#Change autocommit to false to avoid Table lock error
conn.setAutoCommit(False);
# error dict
errs = {}
err = ''
try:
    for sp_name in sp_names:
        #Prepare call storeproc statement and execute
        cstmt = conn.prepareCall("{call reporting." + sp_name + "}");
        results = cstmt.execute();
        conn.commit();
# capture error
except Exception as e: # work on python 3.x
    ##errs['msg'] = str(sc._gateway.jvm.SQLException.getMessage())- doesn't work
    errs['error'] = str(e)
    errs['sp_name'] = sp_name
    errs['error_type'] = str(type(e)).replace("<class '","").replace("'>","")
if len(errs) != 0:
    stmt = conn.createStatement();
    sql = "insert into dev_workflow_errors (timestamp, workflow_id, workflow_name, job_name, sp_name, error_type, error) values (current_timestamp, '" + workflow_run_id + "', '" + workflow_name + "', '" + glue_job_name + "', '" + errs['sp_name'] + "', '" + errs['error_type'] + "', '" + errs['msg'] + "')"
    rs = stmt.executeUpdate(sql);
    conn.commit();
    #sys.exit(1)
#Close down the connection
conn.close();
#Update Logger
logger.info("Finished")
I tried the pythonic 'try' and 'except' approach, but for the base exception I just get the full 'py4j.protocol.Py4JJavaError' error message. This message contains the database-specific error messages I want to extract.
Can I use 'java_import(sc._gateway.jvm,"java.sql.SQLException")' in any way to extract database specific errors from the execute function?
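One hedged sketch (not from the original post): catch py4j.protocol.Py4JJavaError directly. It exposes the underlying Java throwable as java_exception, and for a java.sql.SQLException you can call its usual Java methods through py4j. This reuses conn, sp_names and errs from the code above.

from py4j.protocol import Py4JJavaError

try:
    for sp_name in sp_names:
        cstmt = conn.prepareCall("{call reporting." + sp_name + "}")
        cstmt.execute()
        conn.commit()
except Py4JJavaError as e:
    # e.java_exception is the Java Throwable raised on the JVM side
    java_exc = e.java_exception
    errs['sp_name'] = sp_name
    errs['error'] = java_exc.getMessage()               # database-specific message
    errs['error_type'] = java_exc.getClass().getName()  # e.g. java.sql.SQLException
    if errs['error_type'] == 'java.sql.SQLException':
        # SQLException-specific details
        errs['sql_state'] = java_exc.getSQLState()
        errs['vendor_code'] = str(java_exc.getErrorCode())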

How to parse variable values in stored proc from dataframe using python

I have written code to pick specific values from an email body and store them in a dataframe. The next step is to store those values in an Oracle database. For that I am using SQLAlchemy, but I am not sure how I can pass those values to a stored procedure like the one below:
call CORE_VALUATIONS.VALUATIONS.INSERTEQCLOSINGPRICE("SGEPSBSH",to_date('"&TEXT(2022-06-01,"DDMMMYYYY")&"','ddmonyyyy'),"111.9852",NULL,NULL);
from sqlalchemy.engine import create_engine
import datetime
today = datetime.date.today()
DIALECT = 'oracle'
SQL_DRIVER = 'cx_oracle'
USERNAME = 'robinhood' # enter your username
PASSWORD = 'XXXXXX' # enter your password
HOST = 'pv-prod-orc-01.XXXXX.com' # enter the oracle db host url
PORT = 1521 # enter the oracle port number
SERVICE = 'XXXX_APP.ec2.internal' # enter the oracle db service name
ENGINE_PATH_WIN_AUTH = DIALECT + '+' + SQL_DRIVER + '://' + USERNAME + ':' + PASSWORD + '@' + HOST + ':' + str(
    PORT) + '/?service_name=' + SERVICE
engine = create_engine(ENGINE_PATH_WIN_AUTH)
# test query
query = """
call CORE_VALUATIONS.VALUATIONS.INSERTEQCLOSINGPRICE("**SGEPSBSH**",to_date('"&TEXT(**2022-06-01**,"DDMMMYYYY")&"','ddmonyyyy'),"**111.9852**",NULL,NULL);
"""
con = engine.connect()
outpt = con.execute(query)
con.close()
How can I pass those values to the stored proc?
Call .execute(sql_text, dict_of_param_values), e.g., something like
import sqlalchemy as sa
# …
sql_text = sa.text(
    "call CORE_VALUATIONS.VALUATIONS.INSERTEQCLOSINGPRICE(:param1, :param2, :param3, NULL, NULL);"
)
dict_of_param_values = dict(param1="SGEPSBSH", param2="2022-06-01", param3=111.9852)
with engine.begin() as conn:
    conn.execute(sql_text, dict_of_param_values)
# (transaction will automatically commit when `with` block exits)
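If the stored procedure expects an Oracle DATE for the second argument, a hedged variant (an assumption, not part of the answer above) is to bind a Python datetime.date instead of a string, so cx_Oracle sends a real DATE and the to_date()/TEXT() spreadsheet-style formatting is unnecessary; the placeholders stay the same:

import datetime
dict_of_param_values = dict(
    param1="SGEPSBSH",
    param2=datetime.date(2022, 6, 1),  # bound as an Oracle DATE by cx_Oracle
    param3=111.9852,
)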

Python - Postgres query using sqlalchemy returns "Empty Dataframe"

I am trying to query some data from a Postgres database and add the results into an Excel file with the Python code below (I connect to the server through an SSH tunnel and to the database using SQLAlchemy):
from sshtunnel import SSHTunnelForwarder
from sqlalchemy.orm import sessionmaker
from sqlalchemy import create_engine
import pandas as pd
from pandas import DataFrame
import xlsxwriter
import openpyxl
with SSHTunnelForwarder(
        ('<server_ip>', 22),
        ssh_username="<server_username>",
        ssh_private_key='<private_key_path>',
        remote_bind_address=('localhost', 5432)) as server:
    server.start()
    print "server connected"
    #connect to DB
    local_port = str(server.local_bind_port)
    engine = create_engine('postgresql://<db_username>:<db_password>@localhost:' + local_port + '/<db_name>')
    Session = sessionmaker(bind=engine)
    s = Session()
    print 'Database session created'
    not_empty_query = False #flag empty queries
    arg_query = "SELECT * from portalpage where id not in (select entityid from sharepermissions where entitytype='PortalPage')"
    query = s.execute(arg_query)
    print(query)
    for row in query: #check if the query is empty
        if (row[0] > 0):
            not_empty_query = True
            break
    if not_empty_query == True: #if the query is not empty add the response into excel
        df = pd.DataFrame(pd.np.empty((0, 8)))
        df = DataFrame(query.fetchall())
        print(df)
        df.columns = query.keys()
        df.to_excel("out.xlsx", engine="openpyxl", sheet_name="Worksheet_Name")
    s.close()
It works for most of the queries that I tried to execute, however with the above query it returns the error below:
ValueError: Length mismatch: Expected axis has 0 elements, new values have 8 elements
While I was troubleshooting, I printed the df variable and got an "Empty DataFrame".
However, when I run the same query directly against my database, I get results.
I also noticed that in the response, on my database, some columns are empty (not sure if it makes any difference).
Please also find a screenshot of the code execution.
The above works if I remove the piece of code below:
for row in query: #check if the query is empty
    if (row[0] > 0):
        not_empty_query = True
        break
if not_empty_query == True:
However, if I remove this for loop, then for other queries (mainly queries that return empty results) I get the same error.
Please find an example below.
Any ideas?
Please try this. I found that the logic you are using to check whether the query returns any data is the problem. I have modified it to do that check first: if any rows are returned, it builds the dataframe and then exports to Excel. Please let me know if it works.
from sshtunnel import SSHTunnelForwarder
from sqlalchemy.orm import sessionmaker
from sqlalchemy import create_engine
import pandas as pd
from pandas import DataFrame
import xlsxwriter
import openpyxl
with SSHTunnelForwarder(
        ('<server_ip>', 22),
        ssh_username="<server_username>",
        ssh_private_key='<private_key_path>',
        remote_bind_address=('localhost', 5432)) as server:
    server.start()
    print "server connected"
    #connect to DB
    local_port = str(server.local_bind_port)
    engine = create_engine('postgresql://<db_username>:<db_password>@localhost:' + local_port + '/<db_name>')
    Session = sessionmaker(bind=engine)
    s = Session()
    print 'Database session created'
    arg_query = "SELECT * from portalpage where id not in (select entityid from sharepermissions where entitytype='PortalPage')"
    query = s.execute(arg_query)  # rows_count
    rows = query.fetchall()
    columns = query.keys()
    if len(rows) > 0:
        df = DataFrame(rows)
        df.columns = columns
        df.to_excel("out.xlsx", engine="openpyxl", sheet_name="Worksheet_Name")
    else:
        print "no data"
Try to create an empty data frame first.
if not_empty_query == True: #if the query is not empty add the response into excel
    df = pd.DataFrame(pd.np.empty((0, 8)))
    df = DataFrame(query.fetchall())
    print(df)
    df.columns = query.keys()
    df.to_excel("out.xlsx", engine="openpyxl", sheet_name="Worksheet_Name")
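As a side note (a sketch of mine, not part of the original answers): pandas can run the query itself. pd.read_sql with a SQLAlchemy engine returns an empty DataFrame that still carries the column names when there are no rows, so the length-mismatch error does not occur. Reusing engine and arg_query from above:

df = pd.read_sql(arg_query, engine)  # empty result -> empty DataFrame with column names
if df.empty:
    print("no data")
else:
    df.to_excel("out.xlsx", engine="openpyxl", sheet_name="Worksheet_Name")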

Not able to iterate mongo object in python

I am trying to get the records (books) that are present in the MongoDB collection "books". I am using pymongo and flask.
Below is my code. If I remove the query update line (query.update({'author': 'sachin'})), it works fine.
What is the problem with updating the query dict?
from pymongo import connection as pymongo_client
import urllib
def get_books(query, skip_val=0, limit=None):
    conn, client = _connect()
    if limit:
        result = client.books.find(query).skip(skip_val).limit(limit)
    else:
        result = client.books.find(query)
    conn.close()
    return result

def _connect():
    user = "root"
    password = "password"
    host = "172.17.1.14"
    port = "27017"
    db_name = "books"
    auth_database = "admin"
    if user and password and auth_database:
        uri = "mongodb://%s:%s@%s:%s/%s" % (
            user, urllib.quote_plus(password), host, port, auth_database)
    else:
        uri = "mongodb://%s:%s" % (host, port)
    conn = pymongo_client.Connection(uri, j=True)
    db = conn[db_name]
    return conn, db
query = {'project_name': 'cricket'}
books = get_books(query)
query.update({'author': 'sachin'})
for book in books:
    print book
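For what it's worth, a hedged sketch of a likely cause (my reading, not from the original post): pymongo cursors are lazy, so find() does not hit the database until the cursor is iterated. By then this code has already closed the connection, and because the cursor still holds a reference to the same query dict, the later query.update() also changes what will be sent. Copying the dict and materialising the results before closing avoids both issues:

def get_books(query, skip_val=0, limit=None):
    conn, client = _connect()
    try:
        # copy the filter so callers mutating their dict later don't change the query
        cursor = client.books.find(dict(query))
        if limit:
            cursor = cursor.skip(skip_val).limit(limit)
        # materialise the cursor while the connection is still open
        return list(cursor)
    finally:
        conn.close()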

Write Large Pandas DataFrames to SQL Server database

I have 74 relatively large Pandas DataFrames (about 34,600 rows and 8 columns) that I am trying to insert into a SQL Server database as quickly as possible. After doing some research, I learned that the good ole pandas.to_sql function is not good for such large inserts into a SQL Server database, which was the initial approach that I took (very slow - almost an hour for the application to complete, versus about 4 minutes when using a MySQL database).
This article, and many other StackOverflow posts, have been helpful in pointing me in the right direction; however, I've hit a roadblock:
I am trying to use SQLAlchemy's Core rather than the ORM, for the reasons explained in the link above. So I am converting the dataframe to a dictionary using pandas.to_dict and then doing an execute() and insert():
self._session_factory.engine.execute(
    TimeSeriesResultValues.__table__.insert(),
    data)
# 'data' is a list of dictionaries.
The problem is that the insert is not getting any values -- they appear as a bunch of empty parentheses and I get this error:
(pyodbc.IntegrityError) ('23000', "[23000] [FreeTDS][SQL Server]Cannot insert the value NULL into the column...
There are values in the list of dictionaries that I passed in, so I can't figure out why the values aren't showing up.
EDIT:
Here's the example I'm going off of:
def test_sqlalchemy_core(n=100000):
    init_sqlalchemy()
    t0 = time.time()
    engine.execute(
        Customer.__table__.insert(),
        [{"name": 'NAME ' + str(i)} for i in range(n)]
    )
    print("SQLAlchemy Core: Total time for " + str(n) +
          " records " + str(time.time() - t0) + " secs")
I've got some sad news for you: SQLAlchemy actually doesn't implement bulk imports for SQL Server; it's actually just going to do the same slow individual INSERT statements that to_sql is doing. I would say that your best bet is to try to script something up using the bcp command-line tool. Here is a script that I've used in the past, but no guarantees:
from subprocess import check_output, call
import pandas as pd
import numpy as np
import os
pad = 0.1
tablename = 'sandbox.max.pybcp_test'
overwrite=True
raise_exception = True
server = 'P01'
trusted_connection= True
username=None
password=None
delimiter='|'
df = pd.read_csv('D:/inputdata.csv', encoding='latin', error_bad_lines=False)
def get_column_def_sql(col):
    if col.dtype == object:
        width = col.str.len().max() * (1+pad)
        return '[{}] varchar({})'.format(col.name, int(width))
    elif np.issubdtype(col.dtype, float):
        return '[{}] float'.format(col.name)
    elif np.issubdtype(col.dtype, int):
        return '[{}] int'.format(col.name)
    else:
        if raise_exception:
            raise NotImplementedError('data type {} not implemented'.format(col.dtype))
        else:
            print('Warning: cast column {} as varchar; data type {} not implemented'.format(col, col.dtype))
            width = col.str.len().max() * (1+pad)
            return '[{}] varchar({})'.format(col.name, int(width))

def create_table(df, tablename, server, trusted_connection, username, password, pad):
    if trusted_connection:
        login_string = '-E'
    else:
        login_string = '-U {} -P {}'.format(username, password)
    col_defs = []
    for col in df:
        col_defs += [get_column_def_sql(df[col])]
    query_string = 'CREATE TABLE {}\n({})\nGO\nQUIT'.format(tablename, ',\n'.join(col_defs))
    if overwrite == True:
        query_string = "IF OBJECT_ID('{}', 'U') IS NOT NULL DROP TABLE {};".format(tablename, tablename) + query_string
    query_file = 'c:\\pybcp_tempqueryfile.sql'
    with open(query_file, 'w') as f:
        f.write(query_string)
    if trusted_connection:
        login_string = '-E'
    else:
        login_string = '-U {} -P {}'.format(username, password)
    o = call('sqlcmd -S {} {} -i {}'.format(server, login_string, query_file), shell=True)
    if o != 0:
        raise BaseException("Failed to create table")
    # o = call('del {}'.format(query_file), shell=True)

def call_bcp(df, tablename):
    if trusted_connection:
        login_string = '-T'
    else:
        login_string = '-U {} -P {}'.format(username, password)
    temp_file = 'c:\\pybcp_tempqueryfile.csv'
    #remove the delimiter and change the encoding of the data frame to latin so sql server can read it
    df.loc[:,df.dtypes == object] = df.loc[:,df.dtypes == object].apply(lambda col: col.str.replace(delimiter,'').str.encode('latin'))
    df.to_csv(temp_file, index = False, sep = '|', errors='ignore')
    o = call('bcp sandbox.max.pybcp_test2 in c:\pybcp_tempqueryfile.csv -S "localhost" -T -t^| -r\n -c')
This has just recently been updated as of SQLAlchemy 1.3.0, in case anyone else needs to know. It should make your dataframe.to_sql statement much faster.
https://docs.sqlalchemy.org/en/latest/changelog/migration_13.html#support-for-pyodbc-fast-executemany
engine = create_engine(
    "mssql+pyodbc://scott:tiger@mssql2017:1433/test?driver=ODBC+Driver+13+for+SQL+Server",
    fast_executemany=True)
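With fast_executemany enabled on the engine, a plain to_sql call batches the inserts; a minimal sketch, where df stands for one of the DataFrames from the question and the table name and chunk size are illustrative, not from the original answer:

# hypothetical target table; chunksize just bounds how many rows go per batch
df.to_sql("TimeSeriesResultValues", engine, if_exists="append", index=False, chunksize=10000)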
