Database Connection Timeout Issue - python

I have Python code that takes a set of inputs (possibly 250) and then looks up the corresponding values in two databases.
engine = create_engine(database_url1)
result = {}
for input in input_list:
    sql_query = f"""SELECT * FROM table1 where name = '{input}' limit 1;"""
    db_results = engine.execute(sql_query).fetchall()
    if len(db_results) <= 0:
        sql_query = f"""SELECT * FROM table2 where name = '{input}' limit 1;"""
        db_results = engine.execute(sql_query).fetchall()
    if len(db_results) <= 0:
        sql_query = f"""SELECT * FROM table3 where name = '{input}' limit 1;"""
        db_results = engine.execute(sql_query).fetchall()
    if len(db_results) <= 0:
        engine_database2 = create_engine(database_url2)
        sql_query = f"""SELECT * FROM table4 where name = '{input}' limit 1;"""
        db_results = engine_database2.execute(sql_query).fetchall()
I end up getting a timeout issue for a large set of inputs.
What is the best way/practice to approach this, so that I am not opening and closing multiple database connections and performance is better with a large set of inputs?
Should I handle the inputs in batches? Will it increase the performance?
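One way to address both concerns (not from the original thread, just a sketch assuming SQLAlchemy 1.4+) is to open each connection once and look up many names per round trip with an IN (...) filter. database_url1, database_url2, input_list and the table names below are taken from the question; the chunk size of 100 and the chunks() helper are arbitrary choices for illustration.
from sqlalchemy import bindparam, create_engine, text

engine1 = create_engine(database_url1)
engine2 = create_engine(database_url2)

def chunks(items, size):
    # yield successive slices of `items` with at most `size` elements
    for i in range(0, len(items), size):
        yield items[i:i + size]

results = {}
with engine1.connect() as conn1, engine2.connect() as conn2:
    # table1..table3 live on the first database, table4 on the second (as in the question)
    lookups = [(conn1, "table1"), (conn1, "table2"), (conn1, "table3"), (conn2, "table4")]
    for chunk in chunks(list(input_list), 100):
        remaining = set(chunk)
        for conn, table in lookups:
            if not remaining:
                break
            stmt = text(f"SELECT * FROM {table} WHERE name IN :names").bindparams(
                bindparam("names", expanding=True))
            for row in conn.execute(stmt, {"names": list(remaining)}):
                results[row._mapping["name"]] = row
                remaining.discard(row._mapping["name"])
With roughly 250 inputs this issues at most a few queries per table instead of up to four per input, and each connection is opened exactly once, which is usually enough to avoid the timeouts.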

Related

Why is Django ORM so slow?

1. This code - raw SQL - takes 2.6 sec:
all_feeds = Feed.objects.all()
for feed in all_feeds:
    q_sku = MainData.objects.raw(
        f'SELECT id as id, COUNT(DISTINCT sku) as "count" FROM imports_maindata WHERE feed_id={feed.id}')
    q_loc = MainData.objects.raw(
        f'SELECT id as id, COUNT(DISTINCT locale) AS "count" FROM imports_maindata WHERE feed_id={feed.id}')
    q_spec = MapSpecs.objects.raw(
        f'SELECT id as id, COUNT(DISTINCT f_feat_id) AS "count" FROM imports_mapspecs WHERE feed_id={feed.id}')
    q_mapped = MapSpecs.objects.raw(
        f'SELECT id as id, COUNT(DISTINCT ic_feat_id) AS "count" FROM imports_mapspecs WHERE feed_id={feed.id} AND ic_feat_id IS NOT NULL')
    q_date = MainData.objects.raw(
        f'SELECT id as id, MAX(last_update) as "last_date" FROM imports_maindata WHERE feed_id={feed.id}')
    print(q_sku[0].count, q_loc[0].count, q_spec[0].count, q_mapped[0].count, q_date[0].last_date)
2. While this one - ORM only - takes 3.1 sec:
f = Feed.objects.all()
for feed in f:
    prods_count = f.filter(maindata__feed_id=feed.id).values('maindata__sku').distinct().count()
    locales_count = f.filter(maindata__feed_id=feed.id).values_list('maindata__locale', flat=True).distinct()
    total_specs = f.filter(mapspecs__feed_id=feed.id).count()
    mapped_specs = f.filter(mapspecs__feed_id=feed.id, mapspecs__ic_feat_id__isnull=False).all().count()
    try:
        last_update = f.filter(maindata__feed_id=feed.id).values('maindata__last_update').distinct().order_by('-maindata__last_update').first()['maindata__last_update']
    except TypeError:
        pass
3. And this one, using the ORM but a different approach, is returned in 3.1-3.2 sec:
f = Feed.objects.all()
prods = f.annotate(num_prods=Count('maindata__sku', distinct=True))
locs = f.annotate(num_locs=Count('maindata__locale', distinct=True))
total_sp_count = f.annotate(num_books=Count('mapspecs__f_feat_id', distinct=True))
total_sp_mapped = f.filter(mapspecs__ic_feat_id__isnull=False).annotate(
    num_books=Count('mapspecs__ic_feat_id', distinct=True))
dates = f.annotate(num_books=Max('maindata__last_update'))
So how come the Django ORM is so inefficient and slow? The timings are for a low number of rows in the DB (below 50K)... So it's not only slower than raw SQL but also has a more confusing (and sometimes too vague) syntax. I guess some other Python frameworks should be considered...
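For what it's worth, all three variants above run several queries per feed inside a Python loop, so the timings mostly reflect an N+1 query pattern rather than raw ORM overhead. A hedged sketch (using the model and field names from the question) that collapses the work into a single annotated queryset:
from django.db.models import Count, Max

# One query computes every per-feed aggregate. distinct=True keeps the counts
# correct even though joining two reverse relations duplicates rows, and
# COUNT(DISTINCT ic_feat_id) ignores NULLs, matching the IS NOT NULL filter.
feeds = Feed.objects.annotate(
    num_prods=Count('maindata__sku', distinct=True),
    num_locs=Count('maindata__locale', distinct=True),
    num_specs=Count('mapspecs__f_feat_id', distinct=True),
    num_mapped=Count('mapspecs__ic_feat_id', distinct=True),
    last_date=Max('maindata__last_update'),
)
for feed in feeds:
    print(feed.num_prods, feed.num_locs, feed.num_specs, feed.num_mapped, feed.last_date)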

Batch downloading of table using cx_oracle

I need to download a large table from an Oracle database onto a Python server, using cx_Oracle to do so. However, RAM is limited on the Python server, so I need to do it in batches.
I already know how to fetch a whole table in general:
import cx_Oracle
import pandas as pd

usr = ''
pwd = ''
tns = '(Description = ...'
orcl = cx_Oracle.connect(usr, pwd, tns)
curs = orcl.cursor()
printHeader = True
tabletoget = 'BIGTABLE'
sql = "SELECT * FROM " + "SCHEMA." + tabletoget
curs.execute(sql)
data = pd.read_sql(sql, orcl)
data.to_csv(tabletoget + '.csv')
I'm not sure, though, how to load, say, a batch of 10,000 rows at a time, save each batch off to a CSV, and then rejoin them.
You can use cx_Oracle directly to perform this sort of batching:
curs.arraysize = 10000
curs.execute(sql)
while True:
    rows = curs.fetchmany()
    if rows:
        write_to_csv(rows)
    if len(rows) < curs.arraysize:
        break
If you are using Oracle Database 12c or higher you can also use the OFFSET and FETCH NEXT ROWS options, like this:
offset = 0
numRowsInBatch = 10000
while True:
    curs.execute("select * from tabletoget offset :offset rows fetch next :nrows rows only",
                 offset=offset, nrows=numRowsInBatch)
    rows = curs.fetchall()
    if rows:
        write_to_csv(rows)
    if len(rows) < numRowsInBatch:
        break
    offset += len(rows)
This option isn't as efficient as the first one and involves giving the database more work to do but it may be better for you depending on your circumstances.
None of these examples use pandas directly. I am not particularly familiar with that package, but if you (or someone else) can adapt this appropriately, hopefully this will help!
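As an aside that is not part of the original answer: pandas can also batch the fetch itself, because pandas.read_sql accepts a chunksize argument and then yields DataFrames instead of loading the whole table. A sketch reusing the orcl connection, sql string and tabletoget name from the question above:
import pandas as pd

first_chunk = True
for chunk in pd.read_sql(sql, orcl, chunksize=10000):
    # append each batch to a single CSV, writing the header only once
    chunk.to_csv(tabletoget + '.csv', mode='a', header=first_chunk, index=False)
    first_chunk = False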
You can achieve your result like this. Here I am loading data to df.
import cx_Oracle
import time
import pandas

user = "test"
pw = "test"
dsn = "localhost:port/TEST"
con = cx_Oracle.connect(user, pw, dsn)
start = time.time()
cur = con.cursor()
cur.arraysize = 10000
try:
    cur.execute("select * from test_table")
    names = [x[0] for x in cur.description]
    rows = cur.fetchall()
    df = pandas.DataFrame(rows, columns=names)
    print(df.shape)
    print(df.head())
finally:
    if cur is not None:
        cur.close()
elapsed = (time.time() - start)
print(elapsed, "seconds")

Retrieving Data from MySQL in batches via Python

I would like to run this process in batches, because of the volume of data.
Here's my code:
getconn = conexiones()
con = getconn.mysqlDWconnect()
with con:
    cur = con.cursor(mdb.cursors.DictCursor)
    cur.execute("SELECT id, date, product_id, sales FROM sales")
    rows = cur.fetchall()
How can I implement an index to fetch the data in batches?
First point: a Python db-api cursor is an iterator, so unless you really need to load a whole batch into memory at once, you can just start by using this feature, i.e. instead of:
cursor.execute("SELECT * FROM mytable")
rows = cursor.fetchall()
for row in rows:
do_something_with(row)
you could just:
cursor.execute("SELECT * FROM mytable")
for row in cursor:
do_something_with(row)
Then if your db connector's implementation still doesn't make proper use of this feature, it will be time to add LIMIT and OFFSET to the mix:
# py2 / py3 compat
try:
    # xrange is defined in py2 only
    xrange
except NameError:
    # py3 range is actually py2 xrange
    xrange = range

cursor.execute("SELECT count(*) FROM mytable")
count = cursor.fetchone()[0]

batch_size = 42  # whatever

for offset in xrange(0, count, batch_size):
    cursor.execute(
        "SELECT * FROM mytable LIMIT %s OFFSET %s",
        (batch_size, offset))
    for row in cursor:
        do_something_with(row)
You can use
SELECT id, date, product_id, sales FROM sales LIMIT X OFFSET Y;
where X is the size of the batch you need and Y is the current offset (X times the number of completed iterations, for example).
To expand on akalikin's answer, you can use a stepped iteration to split the query into chunks, and then use LIMIT and OFFSET to execute the query.
cur = con.cursor(mdb.cursors.DictCursor)
cur.execute("SELECT COUNT(*) AS total FROM sales")
count = cur.fetchone()['total']
for i in range(0, count, 5):
    cur2 = con.cursor(mdb.cursors.DictCursor)
    cur2.execute("SELECT id, date, product_id, sales FROM sales LIMIT %s OFFSET %s" % (5, i))
    rows = cur2.fetchall()
    print(rows)
Thank you, here's how I implement it with your suggestions:
control = True
index = 0
while control == True:
    getconn = conexiones()
    con = getconn.mysqlDWconnect()
    with con:
        cur = con.cursor(mdb.cursors.DictCursor)
        query = "SELECT id, date, product_id, sales FROM sales limit 10 OFFSET " + str(10 * index)
        cur.execute(query)
        rows = cur.fetchall()
        index = index + 1
        if len(rows) == 0:
            control = False
        for row in rows:
            dataset.append(row)
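A note that is not in the original thread: LIMIT/OFFSET forces the server to scan and discard all the skipped rows on every iteration, so on large tables keyset (seek) pagination over an indexed column is usually faster. A sketch assuming id is an indexed, monotonically increasing primary key of sales:
batch_size = 10
last_id = 0
dataset = []
cur = con.cursor(mdb.cursors.DictCursor)
while True:
    # fetch the next batch strictly after the last id already seen
    cur.execute(
        "SELECT id, date, product_id, sales FROM sales "
        "WHERE id > %s ORDER BY id LIMIT %s",
        (last_id, batch_size))
    rows = cur.fetchall()
    if not rows:
        break
    dataset.extend(rows)
    last_id = rows[-1]['id']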

python pyodbc is returning an empty list with this query

This code returns an empty list. When I run it in my application I get lots of records. I have used this basic script with other SQL queries and they work fine but not this one.
The database server is Sybase SQL Anywhere 12
import pyodbc
cnxn = pyodbc.connect('DSN=dmserver')
cursor = cnxn.cursor()
cursor.execute("""select
debtor_id
,name1
,assign_id
,(select
dateformat(dateadd(minute,user_gmt_offset,string(act_date,' ',act_time)),'MM/DD/YYYY HH::NN::SS')
from
dm.dbtract
where
item_no = (select max(item_no) from dm.dbtract as d2
where
left(comments,5) = 'AC11::'
and
act_date < today(*) - 1
and
d2.debtor_id = dbtract.debtor_id)
and
dbtr.debtor_id = dbtract.debtor_id) as act_code_date_time
,(select
list(phone,'~' ORDER by item_no asc)
from
dm.dbtrphon
where
status = 'A'
and
dbtrphon.debtor_id = dbtr.debtor_id) as Active_phone_list
from
dm.dbtr
where
(select
count(*)
from
dm.dbtract
where
left(comments,5) = 'AC11::'
and
act_date < today(*) - 1
and
dbtr.debtor_id = dbtract.debtor_id) > 0
""")
while 1:
    row = cursor.fetchone()
    if not row:
        break
    print row

Select records incrementally in MySQL and save to csv in Python

I need to query the database for some data analysis and I have more than 20 million records. I have limited access to the DB and my query times out after 8 minutes. So I am trying to break the query up into smaller portions and save the results to Excel for processing later.
This is what I have so far. How can I get Python to loop the query over every x records (e.g. 1,000,000) and store them in the same CSV until all (20M+) records are fetched?
import MySQLdb
import csv

db_main = MySQLdb.connect(host="localhost",
                          port=1234,
                          user="user1",
                          passwd="test123",
                          db="mainDB")
cur = db_main.cursor()
cur.execute("""SELECT a.user_id, b.last_name, b.first_name,
    FLOOR(DATEDIFF(CURRENT_DATE(), c.birth_date) / 365) age,
    DATEDIFF(b.left_date, b.join_date) workDays
    FROM users a
    INNER JOIN users_signup b ON a.user_id = b.user_id
    INNER JOIN users_personal c ON a.user_id = c.user_id
    INNER JOIN
    (
        SELECT DISTINCT d.user_id FROM users_signup d
        WHERE (user_id >= 1 AND user_id < 1000000)
        AND d.join_date >= '2013-01-01' AND d.join_date < '2014-01-01'
    )
    AS t ON a.user_id = t.user_id""")
result = cur.fetchall()
c = csv.writer(open("temp.csv", "wb"))
for row in result:
    c.writerow(row)
Your code should look like the below. You can tune its performance via the per_query variable.
c = csv.writer(open("temp.csv", "wb"))
offset = 0
per_query = 10000
while True:
    cur.execute("__the_query__ LIMIT %s OFFSET %s", (per_query, offset))
    rows = cur.fetchall()
    if len(rows) == 0:
        break  # escape the loop at the end of the data
    for row in rows:
        c.writerow(row)
    offset += per_query
Untested code but this should get you started...
SQL = """
SELECT a.user_id, b.last_name, b.first_name,
FLOOR(DATEDIFF(CURRENT_DATE(), c.birth_date) / 365) age,
DATEDIFF(b.left_date, b.join_date) workDays
FROM users a
INNER JOIN users_signup b ON a.user_id a = b.user_id
INNER JOIN users_personal c ON a.user_id a = c.user_id
INNER JOIN
(
SELECT distinct d.a.user_id FROM users_signup d
WHERE (user_id >=1 AND user_id <1000000)
AND d.join_date >= '2013-01-01' and d.join_date < '2014-01-01'
)
AS t ON a.user_id = t.user_id
OFFSET %s LIMIT %s
"""
BATCH_SIZE = 100000
with open("temp.csv","wb") as f:
writer = csv.writer(f)
cursor = db_main.cursor()
offset = 0
limit = BATCH_SIZE
while True:
cursor.execute(SQL, (offset, limit))
for row in cursor:
writer.writerow(row)
else:
# no more rows, we're done
break
offset += BATCH_SIZE
cursor.close()
Here is an example implementation that might help you:
from contextlib import contextmanager
import MySQLdb
import csv

connection_args = {"host": "localhost", "port": 1234, "user": "user1", "passwd": "test123", "db": "mainDB"}

@contextmanager
def get_cursor(**kwargs):
    ''' The context manager allows the cursor to be
    closed automatically.
    '''
    db = MySQLdb.connect(**kwargs)
    cursor = db.cursor()
    try:
        yield cursor
    finally:
        cursor.close()

# note the placeholders for the limits
query = """ SELECT a.user_id, b.last_name, b.first_name,
    FLOOR(DATEDIFF(CURRENT_DATE(), c.birth_date) / 365) age,
    DATEDIFF(b.left_date, b.join_date) workDays
    FROM users a
    INNER JOIN users_signup b ON a.user_id = b.user_id
    INNER JOIN users_personal c ON a.user_id = c.user_id
    INNER JOIN
    (
        SELECT DISTINCT d.user_id FROM users_signup d
        WHERE (user_id >= 1 AND user_id < 1000000)
        AND d.join_date >= '2013-01-01' AND d.join_date < '2014-01-01'
    ) AS t ON a.user_id = t.user_id LIMIT %s OFFSET %s """

csv_file = csv.writer(open("temp.csv", "wb"))

# One million at a time
STEP = 1000000
for step_nb in xrange(0, 20):
    with get_cursor(**connection_args) as cursor:
        cursor.execute(query, (STEP, step_nb * STEP))  # query the DB, one batch per step
        for row in cursor:  # use the cursor instead of fetching everything into memory
            csv_file.writerow(row)
Edited: I had misunderstood what the batch was (it was on user_id).
