How to do multiprocessing for a list of values? - python

I have a list in Python with 1000 values.
Each value has to be processed separately.
How do I run that processing independently?
The question here is about multiprocessing, i.e. handling each value in an independent process.
My current code:
import mysql.connector

for each in value_list:
    # Connect to the database
    mydb = mysql.connector.connect(
        host='localhost',
        database='College',
        user='root',
    )
    cs = mydb.cursor()
    # Update one row per name, using a parameterized query
    statement = "UPDATE STUDENT SET AGE = 23 WHERE Name = %s"
    cs.execute(statement, (each,))
    mydb.commit()
    # Disconnect from the database
    mydb.close()
The code above processes the values one by one. Since each update is independent of the others, how can I achieve this with multiprocessing?
Is there a way to use something like from joblib import Parallel, delayed?

Here's an example of how this could be done using multiprocessing.
from multiprocessing import Process, Queue
import mysql.connector as MYSQL

NPROCS = 5
value_list = []  # List of names to be updated

CONFIG = {
    'host': 'localhost',
    'user': 'root',
    #'passwd': 'secret',
    'database': 'College'
}

def process(queue):
    conn = None
    try:
        conn = MYSQL.connect(**CONFIG)
        # Consume names from the queue until the None sentinel arrives
        while param := queue.get():
            cursor = conn.cursor()
            sql = f'UPDATE STUDENT SET AGE=23 WHERE Name="{param}"'
            cursor.execute(sql)
            cursor.close()
    except Exception as e:
        print(e)
    finally:
        if conn:
            conn.commit()
            conn.close()

def main():
    queue = Queue()
    procs = [Process(target=process, args=(queue,)) for _ in range(NPROCS)]
    for p in procs:
        p.start()

    for each in value_list:
        queue.put(each)
    # One sentinel per worker so that every process exits its loop
    for _ in range(NPROCS):
        queue.put(None)

    for p in procs:
        p.join()

if __name__ == '__main__':
    main()
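For the joblib approach mentioned in the question, a minimal sketch could look like the following (my own illustration, not tested against the poster's database; it opens one short-lived connection per task, which is simple but not the most efficient option):

from joblib import Parallel, delayed
import mysql.connector

CONFIG = {'host': 'localhost', 'user': 'root', 'database': 'College'}

def update_one(name):
    # Each task opens its own connection, runs one parameterized UPDATE and commits
    conn = mysql.connector.connect(**CONFIG)
    cs = conn.cursor()
    cs.execute("UPDATE STUDENT SET AGE = 23 WHERE Name = %s", (name,))
    conn.commit()
    cs.close()
    conn.close()

value_list = []  # the 1000 names from the question

# joblib's default loky backend runs the tasks in separate worker processes
Parallel(n_jobs=5)(delayed(update_one)(name) for name in value_list)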

Related

Best practices when using a database and multiprocessing?

I'm trying to determine the correct way to establish a DB connection within a process.map() function. Do I establish the connection and close it within the function that's being run? Do I do this outside of process.map()? Below is some pseudo code that's close to what I currently have.
from multiprocessing import Pool, Manager, Process
import pandas as pd
import pyodbc

server = DB_SERVER
database = DATABASE
username = USERNAME
password = PASSWORD

def ref_worker(file):
    conn = pyodbc.connect('DRIVER=' + driver + ';SERVER=' + server + ';PORT=1433;DATABASE=' + database + ';UID=' + username + ';PWD=' + password)
    temp_data = pd.read_csv(file, dtype=str)
    # logic here
    cursor = conn.cursor()
    sql_string = "insert into db (column) values(field)"
    cursor.execute(sql_string, temp_data)
    cursor.commit()
    cursor.close()
    conn.close()

if __name__ == '__main__':
    driver = ''
    driver_names = [x for x in pyodbc.drivers() if x.endswith(' for SQL Server')]
    if driver_names:
        driver = driver_names[0]
    if driver:
        conn_str = 'DRIVER={}; ...'.format(driver)
    else:
        print('(No suitable driver found. Cannot connect.)')

    file_list = pd.read_csv('list_of_files.csv', header=None, dtype=str)

    pool = Pool(8)  # Create a multiprocessing Pool
    pool.map(ref_worker, file_list)  # process data_inputs iterable with pool
    pool.close()
    pool.join()
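One common pattern for this situation (shown here only as a sketch, not as the definitive answer) is to open a single connection per worker process via the Pool initializer, instead of once per mapped call or in the parent process. The name init_worker and the placeholder connection string, SQL and file list below are illustrative, not from the original post:

from multiprocessing import Pool
import pandas as pd
import pyodbc

conn = None  # one connection per worker process, created by the initializer

def init_worker(conn_str):
    # Runs once in each worker process; the connection is then reused for every file
    global conn
    conn = pyodbc.connect(conn_str)

def ref_worker(file):
    temp_data = pd.read_csv(file, dtype=str)
    # logic here
    cursor = conn.cursor()
    cursor.execute("insert into db (column) values (?)", (temp_data.iloc[0, 0],))  # placeholder SQL
    conn.commit()
    cursor.close()

if __name__ == '__main__':
    conn_str = 'DRIVER={ODBC Driver 17 for SQL Server};SERVER=...;DATABASE=...;UID=...;PWD=...'  # placeholder
    file_list = ['file1.csv', 'file2.csv']  # placeholder list of files
    with Pool(8, initializer=init_worker, initargs=(conn_str,)) as pool:
        pool.map(ref_worker, file_list)

Keeping the connection out of the parent also avoids any attempt to pickle a live connection object when the workers are started.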

How can I use concurrency to migrate a database in Python?

I have a script used to migrate data from SQLite to Postgres. I currently just use a for loop to transfer the tables one by one. Now I want to experiment with transferring multiple tables concurrently, using threads, multiprocessing, or asyncio, to speed up the program and compare the runtimes of those approaches.
How do you do it with one of those?
Here is my script:
import psycopg2, sqlite3, sys
import time
import multiprocessing

sqdb = "C://Users//duongnb//Desktop//Python//SqliteToPostgreFull//testmydb6.db"
sqlike = "table"
pgdb = "testmydb11"
pguser = "postgres"
pgpswd = "1234"
pghost = "127.0.0.1"
pgport = "5432"

consq = sqlite3.connect(sqdb)
cursq = consq.cursor()

tabnames = []
print()

cursq.execute('SELECT name FROM sqlite_master WHERE type="table" AND name LIKE "%table%";')
tabgrab = cursq.fetchall()
for item in tabgrab:
    tabnames.append(item[0])
print(tabgrab)

def copyTable(table):
    print(table)
    cursq.execute("SELECT sql FROM sqlite_master WHERE type='table' AND name = ?;", (table,))
    create = cursq.fetchone()[0]
    cursq.execute("SELECT * FROM %s;" % table)
    rows = cursq.fetchall()
    colcount = len(rows[0])
    pholder = '%s,' * colcount
    newholder = pholder[:-1]

    try:
        conpg = psycopg2.connect(database=pgdb, user=pguser, password=pgpswd,
                                 host=pghost, port=pgport)
        curpg = conpg.cursor()
        curpg.execute("DROP TABLE IF EXISTS %s;" % table)
        create = create.replace("AUTOINCREMENT", "")
        curpg.execute(create)
        curpg.executemany("INSERT INTO %s VALUES (%s);" % (table, newholder), rows)
        conpg.commit()
        if conpg:
            conpg.close()
    except psycopg2.DatabaseError as e:
        print('Error %s' % e)
        sys.exit(1)
    finally:
        print("Complete")
        consq.close()

if __name__ == "__main__":
    start_time = time.time()

    procs = []
    for table in tabnames:
        p = multiprocessing.Process(target=copyTable, args=(table,))  # note the trailing comma: args must be a tuple
        p.start()
        procs.append(p)
    for p in procs:
        p.join()
    print("All processes finished.")

    duration = time.time() - start_time
    print(f"Duration {duration} seconds")
You should put the body of the for table in tabnames loop into a function, say copyTable. Then you're able to use the multiprocessing package to parallelize your code. It should look something like this:

procs = []
for table in tabnames:
    p = multiprocessing.Process(target=copyTable, args=(table,))
    p.start()
    procs.append(p)
for p in procs:
    p.join()
print("All processes finished.")
But you can speed up your code even more if you use a COPY (https://www.postgresql.org/docs/current/sql-copy.html) instead of the many INSERT commands.
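As a rough illustration of the COPY idea (a sketch only; the helper name copy_rows and the use of an in-memory CSV buffer are my own choices, not part of the answer), the rows already fetched from SQLite could be loaded into Postgres in one round trip with psycopg2's copy_expert:

import csv
import io

def copy_rows(conpg, table, rows):
    # Serialize the already-fetched rows to an in-memory CSV and load them with a single COPY
    # (NULL/empty-string handling may need adjusting for real data)
    buf = io.StringIO()
    csv.writer(buf).writerows(rows)
    buf.seek(0)
    curpg = conpg.cursor()
    curpg.copy_expert('COPY "{}" FROM STDIN WITH (FORMAT csv)'.format(table), buf)
    conpg.commit()
    curpg.close()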
Instead of the multiprocessing module, you can also use the threading module, which works quite similarly; then you have threads instead of processes. Because of the global interpreter lock (GIL), I would expect worse performance with this.

Python multiprocessing hive hang in Linux

The code below worked on Windows, but on Linux it hangs:
from impala.dbapi import connect
from multiprocessing import Pool

conn = connect(host='172.16.12.12', port=10000, user='hive', password='hive', database='test', auth_mechanism='PLAIN')
cur = conn.cursor()

def test_hive(a):
    cur.execute('select {}'.format(a))
    tab_cc = cur.fetchall()
    tab_cc = tab_cc[0][0]
    print(a, tab_cc)

if __name__ == '__main__':
    pool = Pool(processes=8)
    alist = [1, 2, 3]
    for i in range(len(alist)):
        pool.apply_async(test_hive, str(i))
    pool.close()
    pool.join()
When I change alist=[1,2,3] to alist=[1] it works in Linux.
I see two possible causes for this behavior:

- an exception raised in test_hive in the context of a forked subprocess
- a deadlock caused by the fact that fork does not copy threads from the parent and/or the fact that mutexes are copied in the state they have when the fork call is executed
To check for exceptions, add return tab_cc to the end of your test_hive function and gather the results returned by the pool:
if __name__ == '__main__':
    pool = Pool(processes=8)
    alist = [1, 2, 3]
    results = []
    for i in range(len(alist)):
        results.append(pool.apply_async(test_hive, str(i)))
    pool.close()
    pool.join()
    for result in results:
        try:
            print(result.get())
        except Exception as e:
            print("{}: {}".format(type(e).__name__, e))
As for the threads, I did a quick search through the impala repo and it seems like they somehow play a role around the usage of thrift. I'm not sure if Python's threading module can actually see them, when originating from that library. You might try with print(multiprocessing.current_process(), threading.enumerate()), both at the module level (e.g. after cur = conn.cursor()) and at the beginning of the test_hive function and see if the _MainProcess(MainProcess, started) shows a longer list of active threads than all of the ForkProcess(ForkPoolWorker-<worker#>, started daemon).
As for a potential solution: I somewhat suspect the fact that you create conn and cur at the module level to be the culprit; all children use a copy of those two.
Try to move these two lines to the beginning of test_hive, so that each process creates a connection and a cursor of its own:
conn = connect(host='172.16.12.12', port=10000, user='hive', password='hive', database='test',auth_mechanism='PLAIN')
cur = conn.cursor()
from impala.dbapi import connect
import time, datetime, sys, re
import psycopg2 as pg
today = datetime.date.today()
from multiprocessing import Pool

def test_hive(a):
    conn = connect(host='172.16.12.12', port=10000, user='hive', password='hive', database='test', auth_mechanism='PLAIN')
    cur = conn.cursor()
    #print(a)
    cur.execute('select {}'.format(a))
    tab_cc = cur.fetchall()
    tab_cc = tab_cc[0][0]
    return tab_cc

if __name__ == '__main__':
    pool = Pool(processes=8)
    alist = [1, 2, 4, 4, 4, 4, 5, 3]
    results = []
    for i in range(len(alist)):
        results.append(pool.apply_async(test_hive, str(i)))
    pool.close()
    pool.join()
    for result in results:
        try:
            print(result.get())
        except Exception as e:
            print("{}: {}".format(type(e).__name__, e))
I moved these two lines into test_hive and it worked:
conn = connect(host='172.16.12.12', port=10000, user='hive', password='hive', database='test',auth_mechanism='PLAIN')
cur = conn.cursor()

cx_Oracle and contextlib: is this correct?

I am curious to know if this is the correct way of using cx_Oracle with contextlib and connection pooling using DRCP.
import cx_Oracle
import threading
import time

def get_connection():
    connection = cx_Oracle.connect(user='username', password='password', dsn='mydsn_name/service_name:pooled')
    return connection

def myfunc():
    with get_connection() as conn:
        cursor = conn.cursor()
        for _ in range(10):
            cursor.execute("select * from mytable")
            val = cursor.fetchone()
            time.sleep(60)
            print("Thread", threading.current_thread().name, "fetched sequence =", val)

results = []
for thread in range(0, 10):
    current_thread = threading.Thread(name=f'Thread {thread}', target=myfunc)
    results.append(current_thread)
    current_thread.start()
print('Started All Threads')

for thread in results:
    thread.join()
print("All done!")
I am not sure if I am doing the right thing here, and I have no idea how to confirm that the connection is being returned to the connection pool and that each thread is not opening a brand-new connection to the database, although the docs on cx_Oracle seem to indicate I am on the right path.
You'll get the most benefit if you also use a cx_Oracle connection pool at the same time as DRCP. You need to set cclass with DRCP, otherwise you will lose its benefits. You can then decide what level of session reuse (the 'purity') to use. Check the cx_Oracle tutorial. From solutions/connect_pool2.py:
pool = cx_Oracle.SessionPool("pythonhol", "welcome", "localhost/orclpdb:pooled",
                             min=2, max=5, increment=1, threaded=True)

def Query():
    con = pool.acquire(cclass="PYTHONHOL", purity=cx_Oracle.ATTR_PURITY_SELF)
    #con = pool.acquire(cclass="PYTHONHOL", purity=cx_Oracle.ATTR_PURITY_NEW)

    cur = con.cursor()
    for i in range(4):
        cur.execute("select myseq.nextval from dual")
        seqval, = cur.fetchone()
There are V$ views like V$CPOOL_STATS you can query to check whether DRCP is being used. Links to some resources are in https://oracle.github.io/node-oracledb/doc/api.html#drcp
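As a quick illustration of checking those V$ views (a sketch only; it assumes an account privileged enough to read V$CPOOL_STATS, and the credentials/DSN below are placeholders):

import cx_Oracle

# Placeholder credentials; use an account that is allowed to read the V$ views
admin = cx_Oracle.connect(user='system', password='oracle', dsn='localhost/orclpdb')
cur = admin.cursor()
cur.execute("select * from v$cpool_stats")
cols = [d[0] for d in cur.description]
for row in cur:
    print(dict(zip(cols, row)))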

Python multiprocessing. Finish all processes at the same time

My goal is to upload some data to a database. I'm using psycopg2, and it has a rule: every process must have its own database connection. In my case that means I must commit in the worker. The problem is that I can only commit once all processes have finished their SQL insert commands. What I need:
from multiprocessing import Process
import psycopg2

def worker1():
    conn = psycopg2.connect("dbname=mydb user=postgres")
    cursor = conn.cursor()
    cursor.execute(
        """ insert into "MyTable1"("Column1")
        values(%s)""", [1])
    #wait all processes
    conn.commit()

def worker2():
    conn = psycopg2.connect("dbname=mydb user=postgres")
    cursor = conn.cursor()
    cursor.execute(
        """ insert into "MyTable2"("Column1")
        values(%s)""", [1])
    #wait all processes
    conn.commit()

if __name__ == '__main__':
    p1 = Process(target=worker1)
    p2 = Process(target=worker2)
    p1.start()
    p2.start()
How can I make all processes wait until the SQL commands have finished? What is the correct way to do this?
This SQL insert is only an example; in the real task I need to insert millions of records.
You can use a pair of multiprocessing.Event objects to allow both workers to tell the other that they're done, as well as to force them to wait for the other to signal back:
from multiprocessing import Process, Event
import psycopg2

def worker1(my_e, other_e):
    conn = psycopg2.connect("dbname=mydb user=postgres")
    cursor = conn.cursor()
    cursor.execute(
        """ insert into "MyTable1"("Column1")
        values(%s)""", [1])
    # signal that this worker is done, then wait for the other one
    my_e.set()
    other_e.wait()
    conn.commit()

def worker2(my_e, other_e):
    conn = psycopg2.connect("dbname=mydb user=postgres")
    cursor = conn.cursor()
    cursor.execute(
        """ insert into "MyTable2"("Column1")
        values(%s)""", [1])
    # signal that this worker is done, then wait for the other one
    my_e.set()
    other_e.wait()
    conn.commit()

if __name__ == '__main__':
    e1 = Event()
    e2 = Event()

    p1 = Process(target=worker1, args=(e1, e2))
    p2 = Process(target=worker2, args=(e2, e1))
    p1.start()
    p2.start()
    p1.join()
    p2.join()
Just be careful about the possibility of a deadlock if one of the workers fails for some reason. You may want to set the optional timeout keyword argument of Event.wait to some fairly high value, and rollback if it expires. That or use a try/except in each worker that guarantees my_e gets set(), but also share a variable between the two workers that can be used to tell the other worker if a failure occurred.
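A rough sketch of that timeout-and-rollback idea for one of the workers (my own illustration, not code from the answer; the 300-second timeout is an arbitrary choice, and the rest of the script stays as above):

import psycopg2

def worker1(my_e, other_e):
    conn = psycopg2.connect("dbname=mydb user=postgres")
    cursor = conn.cursor()
    try:
        cursor.execute(
            """ insert into "MyTable1"("Column1")
            values(%s)""", [1])
    finally:
        # set the event even on failure so the other worker cannot wait forever
        my_e.set()
    # wait for the other worker, but give up eventually and roll back
    if other_e.wait(timeout=300):
        conn.commit()
    else:
        conn.rollback()
    conn.close()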
