How can I use concurrency to migrate database in Python? - python

I have a script used to migrate data from SQLite to Postgres. I just use a for loop to transfer tables one by one. Now, I want to experiment with transferring multiple tables concurrently using threads, multiprocessing, or asyncio to speed up the program and compare the runtimes between those approaches.
How do you do one of those ways?
Here is my script:
import psycopg2, sqlite3, sys
import time
import multiprocessing

# --- connection settings --------------------------------------------------
sqdb = "C://Users//duongnb//Desktop//Python//SqliteToPostgreFull//testmydb6.db"  # SQLite source file
sqlike = "table"      # substring that migrated table names must contain
pgdb = "testmydb11"   # target Postgres database
pguser = "postgres"
pgpswd = "1234"
pghost = "127.0.0.1"
pgport = "5432"

# Open the SQLite source once, only to discover which tables to migrate.
consq = sqlite3.connect(sqdb)
cursq = consq.cursor()

print()
# Use a bound parameter built from `sqlike` instead of the hard-coded
# "%table%" pattern (the original defined sqlike but never used it), and
# single quotes for the SQL string literal per the SQL standard.
cursq.execute(
    "SELECT name FROM sqlite_master WHERE type = 'table' AND name LIKE ?;",
    ("%" + sqlike + "%",),
)
tabgrab = cursq.fetchall()
tabnames = [item[0] for item in tabgrab]
print(tabgrab)
def copyTable(table):
    """Copy one table (schema + rows) from the SQLite db to Postgres.

    Designed to run inside a worker process, so it opens its OWN SQLite
    connection instead of reusing the module-level cursor: connections and
    cursors cannot be shared safely across processes (and are not
    picklable under the spawn start method).
    """
    print(table)
    consq = sqlite3.connect(sqdb)
    cursq = consq.cursor()
    conpg = None
    try:
        # Fetch the original CREATE TABLE statement for this table.
        cursq.execute(
            "SELECT sql FROM sqlite_master WHERE type='table' AND name = ?;",
            (table,),
        )
        create = cursq.fetchone()[0]
        # NOTE: identifiers cannot be bound parameters; `table` comes from
        # sqlite_master (not user input), so interpolation is tolerable here.
        cursq.execute("SELECT * FROM %s;" % table)
        rows = cursq.fetchall()
        # Guard against an empty table — the original crashed on rows[0].
        colcount = len(rows[0]) if rows else 0
        newholder = ",".join(["%s"] * colcount)

        conpg = psycopg2.connect(database=pgdb, user=pguser, password=pgpswd,
                                 host=pghost, port=pgport)
        curpg = conpg.cursor()
        curpg.execute("DROP TABLE IF EXISTS %s;" % table)
        # Postgres has no AUTOINCREMENT keyword; strip it from the DDL.
        create = create.replace("AUTOINCREMENT", "")
        curpg.execute(create)
        if rows:
            curpg.executemany(
                "INSERT INTO %s VALUES (%s);" % (table, newholder), rows)
        conpg.commit()
    except psycopg2.DatabaseError as e:
        print('Error %s' % e)
        sys.exit(1)
    finally:
        # Always release both connections, success or failure (the
        # original leaked conpg on error and closed consq in a way that
        # broke the shared module-level connection).
        if conpg:
            conpg.close()
        consq.close()
        print("Complete")
if __name__ == "__main__":
    start_time = time.time()
    # Start one worker process per table, keeping a handle on each so that
    # ALL of them can be joined (the original joined only the last `p`,
    # `len(tabnames)` times).
    processes = []
    for table in tabnames:
        # args must be a one-element tuple: (table) is just `table`, and a
        # bare string would be unpacked character by character.
        p = multiprocessing.Process(target=copyTable, args=(table,))
        p.start()
        processes.append(p)
    for p in processes:
        p.join()
    print("All processes finished.")
    duration = time.time() - start_time
    print(f"Duration {duration} seconds")

You should put the body of the `for table in tabnames` loop into a function, say copyTable. Then you're able to use the multiprocessing package to parallelize your code. It should look something like this:
processes = []
for table in tabnames:
    # args must be a one-element tuple — (table) is just `table`, and a
    # plain string would be unpacked character by character.
    p = multiprocessing.Process(target=copyTable, args=(table,))
    p.start()
    processes.append(p)
# Join every process, not just the last one started.
for p in processes:
    p.join()
print("All processes finished.")
But you can speed up your code even more if you use a COPY (https://www.postgresql.org/docs/current/sql-copy.html) instead of the many INSERT commands.
Instead of the multiprocessing module, you can also use the threading module, which works quite similarly. Then you have threads instead of processes. Because of the interpreter lock I would expect a worse performance with this.

Related

python multiproccess - send db connection

I would like to have 2 processes: one for collecting data and store it to sqlite database and one for using the data.
I'm trying to pass to the collecting process a handle to sqlite, but getting an error. Below is my simplified code
import sqlite3
import multiprocessing
import time


def do(db_handler):
    # Insert one row per second for twenty seconds via the cursor that was
    # handed over from the parent process.
    print("start inserting data")
    for i in range(20):
        time.sleep(1)
        db_handler.execute(
            "insert into temp_table (f1, f2) Values (?,?)", (i, i * 2))


if __name__ == "__main__":
    connection = sqlite3.connect("temp_db.db", check_same_thread=False)
    dBhandler = connection.cursor()
    # NOTE(review): handing a live cursor to a child process is exactly
    # what triggers the "Can't pickle sqlite3.Cursor" error asked about.
    p = multiprocessing.Process(target=do, args=(dBhandler,))
    p.start()

    for i in range(20):
        time.sleep(1)
        result = dBhandler.execute("SELECT * FROM tasks")
        rows = result.fetchall()
        print("i =", i)
        for row in rows:
            print(row)
        print("============")
I'm receiving an error: TypeError: Can't pickle sqlite3.cursor objects
Any idea how to solve it?

How to use Asyncio module in migrate database from Sqlite to Postgres?

I have a script to migrate a database from SQLite to Postgres. My original script works, but when I try to use asyncio to speed up the program, my new code runs even a few seconds slower than the original. The transfer speed of the tables is very slow. Can anyone suggest where I am going wrong?
My original code :
import psycopg2, sqlite3, sys
import time

start_time = time.time()

# Connection settings: SQLite source file and Postgres target.
sqdb = "D://Python//SqliteToPostgreFull//testmydb6.db"  # path of the SQLite db
sqlike = "table"
pgdb = "testmydb7"  # Postgres database name
pguser = "postgres"
pgpswd = "1234"
pghost = "127.0.0.1"
pgport = "5432"

consq = sqlite3.connect(sqdb)
cursq = consq.cursor()

print()
cursq.execute('SELECT name FROM sqlite_master WHERE type="table" AND name LIKE "%table%";')
tabgrab = cursq.fetchall()
tabnames = [item[0] for item in tabgrab]
print(tabgrab)

# Migrate the tables one after another.
for table in tabnames:
    print(table)
    cursq.execute("SELECT sql FROM sqlite_master WHERE type='table' AND name = ?;", (table,))
    create = cursq.fetchone()[0]
    cursq.execute("SELECT * FROM %s;" % table)
    rows = cursq.fetchall()
    # One "%s" placeholder per column for the executemany below.
    placeholders = ",".join(["%s"] * len(rows[0]))
    try:
        conpg = psycopg2.connect(database=pgdb, user=pguser, password=pgpswd,
                                 host=pghost, port=pgport)
        curpg = conpg.cursor()
        curpg.execute("DROP TABLE IF EXISTS %s;" % table)
        create = create.replace("AUTOINCREMENT", "")  # Postgres has no AUTOINCREMENT
        curpg.execute(create)
        curpg.executemany("INSERT INTO %s VALUES (%s);" % (table, placeholders), rows)
        conpg.commit()
        if conpg:
            conpg.close()
    except psycopg2.DatabaseError as e:
        print('Error %s' % e)
        sys.exit(1)
    finally:
        print("Complete")

consq.close()
duration = time.time() - start_time
print(f"Duration {duration} seconds")
My code with Asyncio module :
import psycopg2, sqlite3, sys
import time
import asyncio

# Connection settings.
sqdb = "D://Python//SqliteToPostgreFull//testmydb6.db"
sqlike = "table"
pgdb = "testmydb9"
pguser = "postgres"
pgpswd = "1234"
pghost = "127.0.0.1"
pgport = "5432"

consq = sqlite3.connect(sqdb)
cursq = consq.cursor()

print()
cursq.execute('''SELECT name FROM sqlite_master WHERE type="table" AND name LIKE "''' + sqlike + '''%";''')
tabgrab = cursq.fetchall()
tabnames = [item[0] for item in tabgrab]
print(tabgrab)


async def copyTable(table):
    # NOTE(review): although this is a coroutine, every call below is
    # blocking database work, so the tasks cannot actually overlap.
    cursq.execute("SELECT sql FROM sqlite_master WHERE type='table' AND name = ?;", (table,))
    create = cursq.fetchone()[0]
    cursq.execute("SELECT * FROM %s;" % table)
    rows = cursq.fetchall()
    # One "%s" placeholder per column.
    newholder = ("%s," * len(rows[0]))[:-1]
    try:
        conpg = psycopg2.connect(database=pgdb, user=pguser, password=pgpswd,
                                 host=pghost, port=pgport)
        curpg = conpg.cursor()
        curpg.execute("DROP TABLE IF EXISTS %s;" % table)
        create = create.replace("AUTOINCREMENT", "")
        curpg.execute(create)
        curpg.executemany("INSERT INTO %s VALUES (%s);" % (table, newholder), rows)
        conpg.commit()
        if conpg:
            conpg.close()
    except psycopg2.DatabaseError as e:
        print('Error %s' % e)
        sys.exit(1)
    finally:
        print("Complete")


async def main():
    # NOTE(review): `a` is rebound each iteration, so only the task
    # created last is awaited below.
    for table in tabnames:
        a = loop.create_task(copyTable(table,))
    await asyncio.wait([a])


if __name__ == "__main__":
    start_time = time.time()
    loop = asyncio.get_event_loop()
    loop.run_until_complete(main())
    loop.close()
    duration = time.time() - start_time
    print(f"Duration {duration} seconds")
If both databases are located on the same computer, asyncio won't speed up the process: there's no network overhead to parallelize. Quite the opposite: the overhead of using coroutines will make the program a bit slower.
Please, read this answer for detailed explanation.

Python multiprocessing hive hang in Linux

The code below worked in Windows but in Linux is hanging:
from impala.dbapi import connect
from multiprocessing import Pool

# Connection and cursor are created at module level, so every forked
# worker inherits a copy of the same handle.
conn = connect(host='172.16.12.12', port=10000, user='hive', password='hive', database='test', auth_mechanism='PLAIN')
cur = conn.cursor()


def test_hive(a):
    # Run a trivial query through the shared cursor and print the result.
    cur.execute('select {}'.format(a))
    tab_cc = cur.fetchall()
    tab_cc = tab_cc[0][0]
    print(a, tab_cc)


if __name__ == '__main__':
    pool = Pool(processes=8)
    alist = [1, 2, 3]
    for i in range(len(alist)):
        pool.apply_async(test_hive, str(i))
    pool.close()
    pool.join()
When I change alist=[1,2,3] to alist=[1] it works in Linux.
I see two possible causes for this behavior:
an exception raised in test_hive in the context of a forked subprocess
a deadlock caused by the fact that fork does not copy threads from the parent and/or the fact that mutexes are copied in the state they have when the fork call is executed
To check for exceptions, add return tab_cc to the end of your test_hive function and gather the results returned by the pool:
if __name__ == "__main__":
    pool = Pool(processes=8)
    alist = [1, 2, 3]
    results = []
    # Keep the AsyncResult handles so that exceptions raised inside the
    # workers can be surfaced (re-raised by .get()) in the parent.
    for i in range(len(alist)):
        results.append(pool.apply_async(test_hive, str(i)))
    pool.close()
    pool.join()
    for result in results:
        try:
            print(result.get())
        except Exception as e:
            print("{}: {}".format(type(e).__name__, e))
As for the threads, I did a quick search through the impala repo and it seems like they somehow play a role around the usage of thrift. I'm not sure if Python's threading module can actually see them, when originating from that library. You might try with print(multiprocessing.current_process(), threading.enumerate()), both at the module level (e.g. after cur = conn.cursor()) and at the beginning of the test_hive function and see if the _MainProcess(MainProcess, started) shows a longer list of active threads than all of the ForkProcess(ForkPoolWorker-<worker#>, started daemon).
As for a potential solution: I somewhat suspect the fact that you create conn and cur at the module level to be the culprit; all childs use a copy of those two.
Try and move these two lines to the beginning of test_hive, so that each process creates a connection and a cursor of its own:
# Per-process handles: created inside test_hive, each worker gets its own
# connection/cursor instead of a copy of the parent's.
conn = connect(host='172.16.12.12', port=10000, user='hive', password='hive', database='test',auth_mechanism='PLAIN')
cur = conn.cursor()
from impala.dbapi import connect
import time, datetime, sys, re
import psycopg2 as pg

today = datetime.date.today()

from multiprocessing import Pool


def test_hive(a):
    # Each worker now builds its own connection and cursor; sharing the
    # parent's cursor across forked processes is what caused the hang.
    conn = connect(host='172.16.12.12', port=10000, user='hive', password='hive', database='test', auth_mechanism='PLAIN')
    cur = conn.cursor()
    # print(a)
    cur.execute('select {}'.format(a))
    tab_cc = cur.fetchall()
    tab_cc = tab_cc[0][0]
    return tab_cc


if __name__ == '__main__':
    pool = Pool(processes=8)
    alist = [1, 2, 4, 4, 4, 4, 5, 3]
    results = []
    for i in range(len(alist)):
        results.append(pool.apply_async(test_hive, str(i)))
    pool.close()
    pool.join()
    # Surface each worker's return value (or the exception it raised).
    for result in results:
        try:
            print(result.get())
        except Exception as e:
            print("{}: {}".format(type(e).__name__, e))
Moving these two lines into test_hive worked:
# These are the two lines that were moved into test_hive so each worker
# process opens its own connection and cursor.
conn = connect(host='172.16.12.12', port=10000, user='hive', password='hive', database='test',auth_mechanism='PLAIN')
cur = conn.cursor()

Python multiprocessing - Is there a way to make sure python process stops after execution of the program

Using a pool with multiprocessing to hit an API and store the responses into MySQL for around 5k queries. The program never exits, even after complete execution. Found that this happens because the processes are still open after execution.
Tried using join(), terminate(). Nothing works.
def fetchAndStore(query):
conn = pymysql.connect(host=host,
user=user,
passwd=passwd,
db=db)
x = conn.cursor()
full_url = getFullUrl(redirection_service, query)
response = urllib2.urlopen(full_url)
html = response.read()
data = json.loads(html)
if data is not None:
store = json.dumps(data["RESPONSE"]["redirectionStore"])
else:
store = 'search.flipkart.com'
try:
stmt = "INSERT INTO golden_response(`store`) VALUES ('%s')" %( store)
x.execute(stmt)
conn.commit()
conn.close()
except Exception, e:
global failure
failure += 1
print e
return "Done"
#main
queries = db_connection()
pool = Pool(processes=20, maxtasksperchild=5)
results = []
for query in queries:
results.append(pool.apply_async(fetchAndStore, (query,)))
pool.close()
pool.join()
print "completed"
The process should exit and print completed, in the ideal situation.

Python: Read SQL Server Data using Concurrent Future Process Pool

I am extremely new to this and have never used any sort of parallel processing method. I want to read a huge amount of data (i.e. at least 2 million rows) from SQL Server and want to use parallel processing to speed up the reading. Below is my attempt at parallel processing using a concurrent futures process pool.
class DatabaseWorker(object):
    """Reads rows from SQL Server and fans them out over a process pool."""

    def __init__(self, connection_string, n, result_queue=[]):
        # NOTE(review): mutable default argument kept to preserve the
        # original behavior (the list is shared across instances).
        self.connection_string = connection_string
        self.query = "select distinct top %s * from dbo.KrishAnalyticsAllCalls" % (n)
        self.result_queue = result_queue

    def reading(self, x):
        # Identity function submitted to the executor once per row.
        return x

    def pooling(self):
        t1 = time.time()
        con = pyodbc.connect(self.connection_string)
        curs = con.cursor()
        curs.execute(self.query)
        with concurrent.futures.ProcessPoolExecutor(max_workers=8) as executor:
            print("Test1")
            # One future per fetched row, mapped back to the row itself.
            future_to_read = {executor.submit(self.reading, row): row for row in curs.fetchall()}
            print("Test2")
            for future in concurrent.futures.as_completed(future_to_read):
                print("Test3")
                read = future_to_read[future]
                try:
                    print("Test4")
                    self.result_queue.append(future.result())
                except:
                    print("Not working")
        print("\nTime take to grab this data is %s" % (time.time() - t1))
# Instantiating and calling pooling() runs the 2*10**7-row query and
# starts the process pool.
df = DatabaseWorker(r'driver={SQL Server}; server=SPROD_RPT01; database=Reporting;', 2*10**7)
df.pooling()
I am not getting any output with my current implementation. "Test1" prints and that's it. Nothing else happens. I understood the various examples provided by concurrent future documents but I am unable to implement it here. I will highly appreciate your help. Thank you.

Categories