I am trying to create an approach to run multiple queries from a list at the same time, and for that I am using the threading library. So far I have this code:
import time
from threading import Thread, Lock
queries = ["SELECT * FROM db1.trans", "SELECT * FROM db1.order", "SELECT * FROM db2.Store", "SELECT * FROM db2.Document", "SELECT * FROM db3.Sales"]
class DatabaseWorker(Thread):
    __lock = Lock()

    def __init__(self, query, result_queue):
        Thread.__init__(self)
        self.query = query
        self.result_queue = result_queue

    def run(self):
        result = None
        print("Connecting to database...")
        try:
            conn = connect(host=host, port=port)
            curs = conn.cursor()
            curs.execute(self.query)
            result = curs
            curs.close()
            conn.close()
        except Exception as e:
            print(str(e))
        self.result_queue.append(result)

delay = 1
result_queue = []

for query in queries:
    worker1 = DatabaseWorker(query, result_queue)
    worker1.start()
    while len(result_queue) < 2:
        time.sleep(delay)
    job_done = True
    worker1.join()
With the above approach the queries run sequentially. I know I can do it this way:
worker1 = DatabaseWorker(queries[0],result_queue)
worker2 = DatabaseWorker(queries[1],result_queue)
...
But I don't think that is the best way. Does anyone know how I can run all the queries from the list dynamically?
Thanks!
Python threading is not really parallel because of the Python GIL (Global Interpreter Lock).
For truly parallel operation you can use the Python multiprocessing module.
Example:
import multiprocessing

def runner(task):
    return f'Hi, i do {task}'

if __name__ == '__main__':
    list_tasks = ['1', '2', '3']
    with multiprocessing.Pool() as pool:
        result = pool.map(runner, list_tasks)
        print(result)
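Applied to the list of queries from the question, a minimal sketch could look like this (assuming connect, host and port come from your database driver and configuration, as in the original code); each worker opens its own connection, since connections cannot be shared between processes:

import multiprocessing

def run_query(query):
    # Each worker process opens its own connection.
    conn = connect(host=host, port=port)
    try:
        curs = conn.cursor()
        curs.execute(query)
        return curs.fetchall()  # return the rows, not the cursor object
    finally:
        conn.close()

if __name__ == '__main__':
    with multiprocessing.Pool(processes=len(queries)) as pool:
        results = pool.map(run_query, queries)
        print(results)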
I have a list that contains table names; let's say the size of the list is n. I also have n servers, so I have opened n cursors, one per server, which are kept in another list. Now for every table I want to call a certain function that takes these two lists as parameters.
templst = [T1, T2, T3, T4, T5]
curlst = [cur1, cur2, cur3, cur4, cur5]

for x in range(len(templst)):
    for y in range(len(curlst)):
        if x == y:
            print "extracting of table ", templst[x]
            extract_single(curlst[y], templst[x])
I think the above code doesn't run in parallel; it will not start each cursor at the same time.
I need to run extract_single in parallel for each cur_i and its corresponding T_i, where i goes from 1 to 5 in this example. How do I do that?
How do I use processes to run this in parallel?
You can use Thread to do this job. This is just an example:
import logging
from time import sleep
from threading import Thread, Lock

class DatabaseWorker(Thread):
    __lock = Lock()

    def __init__(self, db, query, result_queue):
        Thread.__init__(self)
        self.db = db
        self.query = query
        self.result_queue = result_queue

    def run(self):
        result = None
        logging.info("Connecting to database...")
        try:
            conn = connect(host=host, port=port, database=self.db)
            curs = conn.cursor()
            curs.execute(self.query)
            result = curs
            curs.close()
            conn.close()
        except Exception as e:
            logging.error("Unable to access database %s" % str(e))
        self.result_queue.append(result)

delay = 1
result_queue = []

worker1 = DatabaseWorker("db1", "select something from sometable",
                         result_queue)
worker2 = DatabaseWorker("db1", "select something from othertable",
                         result_queue)
worker1.start()
worker2.start()

# Wait for the job to be done
while len(result_queue) < 2:
    sleep(delay)
job_done = True

worker1.join()
worker2.join()
You can read more here:
https://www.oracle.com/technical-resources/articles/embedded/vasiliev-python-concurrency.html
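Applied to the lists from the question, a minimal sketch (assuming extract_single is safe to call from separate threads and each thread sticks to its own cursor) could be:

from threading import Thread

threads = []
for cur, table in zip(curlst, templst):
    # one thread per (cursor, table) pair; each thread uses only its own cursor
    t = Thread(target=extract_single, args=(cur, table))
    t.start()
    threads.append(t)

for t in threads:
    t.join()  # wait for all extractions to finish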
You can use asyncio. (Documentation can be found here: https://docs.python.org/3/library/asyncio.html)
import asyncio

def background(f):
    def wrapped(*args, **kwargs):
        return asyncio.get_event_loop().run_in_executor(None, f, *args, **kwargs)
    return wrapped

@background
def your_function(argument):
    # code
Now this function will be run in parallel whenever it is called, without putting the main program into a wait state. You can use it to parallelize a for loop as well: the loop itself is still sequential, but every iteration runs in parallel to the main program as soon as the interpreter gets there. In your case, wrap your function to run in the background by just adding the @background decorator:
@background
def extract_single(cur, table):
    # function definition
Then you can use your code without any modification and it will run in parallel as intended, just by adding the @background decorator to the function definition.
For example:
import time

@background
def your_function(argument):
    time.sleep(5)
    print('function finished for ' + str(argument))

for i in range(10):
    your_function(i)

print('loop finished')
This produces following output:
loop finished
function finished for 4
function finished for 8
function finished for 0
function finished for 3
function finished for 6
function finished for 2
function finished for 5
function finished for 7
function finished for 9
function finished for 1
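If the main program should also wait for all the background calls before exiting, one possible sketch (assuming the decorator above and the default event loop it relies on) is to keep the futures the decorated function returns and gather them:

import asyncio

futures = [your_function(i) for i in range(10)]    # each call returns a Future
loop = asyncio.get_event_loop()
loop.run_until_complete(asyncio.gather(*futures))  # block until all have finished
print('all background calls finished')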
I have a Python program that acts as a consumer for RabbitMQ. Once it receives a job from its queue, I want the program to split the job up using multiprocessing, but I'm running into issues with the logistics of multiprocessing.
I've simplified the code for readability.
My RabbitMQ consumer functionality:
connection = pika.BlockingConnection(pika.ConnectionParameters('localhost'))
channel = connection.channel()
channel.queue_declare(queue="JobReader", durable=True)
logging.info('Waiting for messages..')

def callback(ch, method, properties, body):
    job_info = json.loads(body)
    logging.info('Start Time: ' + time.strftime("%H:%M:%S"))
    split_jobs = split_job(job_info)
    process_manager.runProcesses(split_jobs)
    ch.basic_ack(delivery_tag=method.delivery_tag)
My multiprocessing functionality:
#!/usr/bin/python
import multiprocessing
import other_package

def worker_process(sub_job):
    other_package.run_job(sub_job)

def runProcesses(jobs):
    processes = []
    for sub_job in jobs:
        p = multiprocessing.Process(target=worker_process, args=(sub_job,))
        processes.append(p)
        p.start()
Naturally, I can't do if __name__ == '__main__': because it is within a function.
I'm not sure if there is a workaround for this with multiprocessing, or if I'm just approaching this the wrong way. Any help would be greatly appreciated.
You can refactor the multiprocessing piece so that you initialize its state from your main script:
import process_manager

...

def callback(ch, method, properties, body):
    job_info = json.loads(body)
    logging.info('Start Time: ' + time.strftime("%H:%M:%S"))
    split_jobs = split_job(job_info)
    manager.runProcesses(split_jobs)
    ch.basic_ack(delivery_tag=method.delivery_tag)

if __name__ == "__main__":
    manager = process_manager.get_manager()
    connection = pika.BlockingConnection(pika.ConnectionParameters('localhost'))
    channel = connection.channel()
    channel.queue_declare(queue="JobReader", durable=True)
    logging.info('Waiting for messages..')
Then process_manager looks like this:
import multiprocessing
import other_package

def worker_process(sub_job):
    other_package.run_job(sub_job)

_manager = None

def get_manager():  # Note that you don't have to use a singleton here
    global _manager
    if not _manager:
        _manager = Manager()
    return _manager

class Manager(object):
    def __init__(self):
        self._pool = multiprocessing.Pool()

    def runProcesses(self, jobs):
        self._pool.map_async(worker_process, jobs)
Note that I use a Pool instead of spawning a Process for every single job, because that probably won't scale well.
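As a side note, map_async returns an AsyncResult, which is handy if the callback should only acknowledge the message once all sub-jobs are done, or if worker exceptions need to surface in the parent. A minimal self-contained sketch:

import multiprocessing

def worker_process(sub_job):
    print('processing', sub_job)

if __name__ == '__main__':
    pool = multiprocessing.Pool()
    result = pool.map_async(worker_process, ['a', 'b', 'c'])
    result.get()   # blocks until all jobs finish; re-raises any worker exception here
    pool.close()
    pool.join()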
I was planning to change my project to use multiple processes so I can use more resources. Here's my database module code:
import pymysql
import threading

class tdb:
    def __init__(self):
        self.totalEffected = 0
        pass

    def start(self):
        self.conn = pymysql.connect(host='xxxx', port=3306, user='root', passwd='xxxx', db='xxxx', charset='utf8')

    def select(self, sql, args=None):
        cur = self.conn.cursor()
        cur.execute(sql, args)
        result = cur.fetchall()
        cur.close()
        return result

    def execute(self, sql, args=None):
        cur = self.conn.cursor()
        result = cur.execute(sql, args)
        cur.close()
        self.totalEffected += result
        return result

    # def __commit(self, callback):

    def __commitCallback(self, result):
        print('commit result:', result)
        self.conn.close()

    def errorc(self, *args):
        print('error')

    def end(self):
        # init()
        # p.apply_async(self.conn.commit, callback=self.__commitCallback, error_callback=self.errorc)
        if self.totalEffected != 0:
            thread = threading.Thread(target=self.t)
            thread.start()
        else:
            self.conn.close()
        # p.apply(self.conn.commit)
        # self.conn.close()
        # print('result:', result.get())

    def t(self):
        self.conn.commit()
        self.conn.close()
The only operation that really needs special handling is conn.commit(). I use a thread to do it so I can return immediately. I once used Pool.apply_async(), but the callback never fired, so I want to know how to make the other process call me back, so I don't have to spend time waiting for the result.
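For reference, here is a minimal self-contained sketch of how Pool.apply_async callbacks are meant to fire. It is a generic illustration, not tied to the database code above; note that everything passed to apply_async must be picklable, and a bound method on an open connection such as self.conn.commit generally is not, which can prevent the task from ever reaching a worker:

from multiprocessing import Pool

def do_commit(n):
    # stand-in for the real work; runs in a worker process
    return n * 2

def on_done(result):
    # runs in the parent process once the worker returns
    print('commit result:', result)

def on_error(exc):
    # runs in the parent process if the task raised
    print('error:', exc)

if __name__ == '__main__':
    with Pool(processes=1) as pool:
        res = pool.apply_async(do_commit, (21,), callback=on_done, error_callback=on_error)
        res.wait()  # the parent (and the pool) must stay alive for the callback to run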
I'm trying to start a data queue server under a managing process (so that it can later be turned into a service), and while the data queue server function works fine in the main process, it does not work in a process created using multiprocessing.Process.
The dataQueueServer and dataQueueClient code is based on the code from the multiprocessing module documentation here.
When run on its own, dataQueueServer works well. However, when run using a multiprocessing.Process's start() in mpqueue, it doesn't work (when tested with the client). I am using the dataQueueClient without changes to test both cases.
The code does reach the serve_forever in both cases, so I think the server is working, but something is blocking it from communicating back to the client in the mpqueue case.
I have placed the loop that runs the serve_forever() part under a thread, so that it can be stoppable.
Here is the code:
mpqueue # this is the "manager" process trying to spawn the server in a child process
import time
import multiprocessing
import threading

import dataQueueServer

class Printer():
    def __init__(self):
        self.lock = threading.Lock()

    def tsprint(self, text):
        with self.lock:
            print text

class QueueServer(multiprocessing.Process):
    def __init__(self, name = '', printer = None):
        multiprocessing.Process.__init__(self)
        self.name = name
        self.printer = printer
        self.ml = dataQueueServer.MainLoop(name = 'ml', printer = self.printer)

    def run(self):
        self.printer.tsprint(self.ml)
        self.ml.start()

    def stop(self):
        self.ml.stop()

if __name__ == '__main__':
    printer = Printer()
    qs = QueueServer(name = 'QueueServer', printer = printer)
    printer.tsprint(qs)
    printer.tsprint('starting')
    qs.start()
    printer.tsprint('started.')
    printer.tsprint('Press Ctrl-C to quit')
    try:
        while True:
            time.sleep(60)
    except KeyboardInterrupt:
        printer.tsprint('\nTrying to exit cleanly...')
        qs.stop()
        printer.tsprint('stopped')
dataQueueServer
import time
import threading
from multiprocessing.managers import BaseManager
from multiprocessing import Queue

HOST = ''
PORT = 50010
AUTHKEY = 'authkey'

## Define some helper functions for use by the main process loop

class Printer():
    def __init__(self):
        self.lock = threading.Lock()

    def tsprint(self, text):
        with self.lock:
            print text

class QueueManager(BaseManager):
    pass

class MainLoop(threading.Thread):
    """A thread based loop manager, allowing termination signals to be sent
    to the thread"""

    def __init__(self, name = '', printer = None):
        threading.Thread.__init__(self)
        self._stopEvent = threading.Event()
        self.daemon = True
        self.name = name
        if printer is None:
            self.printer = Printer()
        else:
            self.printer = printer
        ## create the queue
        self.queue = Queue()
        ## Add a function to the handler to return the queue to clients
        self.QM = QueueManager
        self.QM.register('get_queue', callable=lambda: self.queue)
        self.queue_manager = self.QM(address=(HOST, PORT), authkey=AUTHKEY)
        self.queue_server = self.queue_manager.get_server()

    def __del__(self):
        self.printer.tsprint('closing...')

    def run(self):
        self.printer.tsprint('{}: started serving'.format(self.name))
        self.queue_server.serve_forever()

    def stop(self):
        self.printer.tsprint('{}: stopping'.format(self.name))
        self._stopEvent.set()

    def stopped(self):
        return self._stopEvent.isSet()

def start():
    printer = Printer()
    ml = MainLoop(name = 'ml', printer = printer)
    ml.start()
    return ml

def stop(ml):
    ml.stop()

if __name__ == '__main__':
    ml = start()
    raw_input("\nhit return to stop")
    stop(ml)
And a client:
dataQueueClient
import datetime
from multiprocessing.managers import BaseManager

n = 0
N = 10**n

HOST = ''
PORT = 50010
AUTHKEY = 'authkey'

def now():
    return datetime.datetime.now()

def gen(n, func, *args, **kwargs):
    k = 0
    while k < n:
        yield func(*args, **kwargs)
        k += 1

class QueueManager(BaseManager):
    pass

QueueManager.register('get_queue')
m = QueueManager(address=(HOST, PORT), authkey=AUTHKEY)
m.connect()
queue = m.get_queue()

def load(msg, q):
    return q.put(msg)

def get(q):
    return q.get()

lgen = gen(N, load, msg = 'hello', q = queue)

t0 = now()
while True:
    try:
        lgen.next()
    except StopIteration:
        break
t1 = now()
print 'loaded %d items in ' % N, t1-t0

t0 = now()
while queue.qsize() > 0:
    queue.get()
t1 = now()
print 'got %d items in ' % N, t1-t0
So it seems like the solution is simple enough: Don't use serve_forever(), and use manager.start() instead.
According to Eli Bendersky, the BaseManager (and its extended version SyncManager) already spawns the server in a new process (and looking at the multiprocessing.managers code confirms this). The problem I have been experiencing stems from the form used in the example, in which the server is started under the main process.
I still don't understand why the current example doesn't work when run under a child process, but that's no longer an issue.
Here's the working (and much simplified from OP) code to manage multiple queue servers:
Server:
from multiprocessing import Queue
from multiprocessing.managers import SyncManager

HOST = ''
PORT0 = 5011
PORT1 = 5012
PORT2 = 5013
AUTHKEY = 'authkey'
name0 = 'qm0'
name1 = 'qm1'
name2 = 'qm2'
description = 'Queue Server'

def CreateQueueServer(HOST, PORT, AUTHKEY, name = None, description = None):
    q = Queue()

    class QueueManager(SyncManager):
        pass

    QueueManager.register('get_queue', callable = lambda: q)
    QueueManager.register('get_name', callable = lambda: name)
    QueueManager.register('get_description', callable = lambda: description)
    manager = QueueManager(address = (HOST, PORT), authkey = AUTHKEY)
    manager.start()  # This actually starts the server
    return manager

# Start three queue servers
qm0 = CreateQueueServer(HOST, PORT0, AUTHKEY, name0, description)
qm1 = CreateQueueServer(HOST, PORT1, AUTHKEY, name1, description)
qm2 = CreateQueueServer(HOST, PORT2, AUTHKEY, name2, description)

raw_input("return to end")
Client:
from multiprocessing.managers import SyncManager

HOST = ''
PORT0 = 5011
PORT1 = 5012
PORT2 = 5013
AUTHKEY = 'authkey'

def QueueServerClient(HOST, PORT, AUTHKEY):
    class QueueManager(SyncManager):
        pass

    QueueManager.register('get_queue')
    QueueManager.register('get_name')
    QueueManager.register('get_description')
    manager = QueueManager(address = (HOST, PORT), authkey = AUTHKEY)
    manager.connect()  # This starts the connected client
    return manager

# create three connected managers
qc0 = QueueServerClient(HOST, PORT0, AUTHKEY)
qc1 = QueueServerClient(HOST, PORT1, AUTHKEY)
qc2 = QueueServerClient(HOST, PORT2, AUTHKEY)

# Get the queue objects from the clients
q0 = qc0.get_queue()
q1 = qc1.get_queue()
q2 = qc2.get_queue()

# put stuff in the queues
q0.put('some stuff')
q1.put('other stuff')
q2.put({1:123, 2:'abc'})

# check their sizes
print 'q0 size', q0.qsize()
print 'q1 size', q1.qsize()
print 'q2 size', q2.qsize()

# pull some stuff and print it
print q0.get()
print q1.get()
print q2.get()
Adding an additional server to share a dictionary with the information of the running queue servers, so that consumers can easily tell what's available where, is easy enough with that model. One thing to note, though, is that the shared dictionary requires slightly different syntax than a normal dictionary: dictionary[0] = something will not work. You need to use the dictionary.update([(key, value), (otherkey, othervalue)]) and dictionary.get(key) syntax, which propagates to all other clients connected to this dictionary.
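A minimal sketch of that idea, following the same pattern (the server exposes a plain dict through a registered callable; names like get_dict and the port are illustrative):

from multiprocessing.managers import SyncManager

HOST = ''
PORT_DIR = 5010        # illustrative port for the "directory" server
AUTHKEY = 'authkey'

directory = {}         # maps a queue server name to its (host, port)

class DirectoryManager(SyncManager):
    pass

DirectoryManager.register('get_dict', callable=lambda: directory)

if __name__ == '__main__':
    manager = DirectoryManager(address=(HOST, PORT_DIR), authkey=AUTHKEY)
    manager.start()    # spawns the directory server, just like the queue servers above
    # A client that registers the same 'get_dict' typeid and connects would then use:
    #   d = client_manager.get_dict()
    #   d.update([('qm0', (HOST, 5011)), ('qm1', (HOST, 5012))])  # not d['qm0'] = ...
    #   print d.get('qm0')
    raw_input("return to end")  # keep the parent alive, as in the server example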
Here is a simple threading program that works fine:
import psycopg2
import threading
import time

class testit(threading.Thread):
    def __init__(self, currency):
        threading.Thread.__init__(self)
        self.currency = currency

    def run(self):
        global SQLConnection
        global cursor
        SQLString = "Select dval from ddata where dname ='%s' and ddate = '2009-07-17'" \
                    %self.currency
        z = time.time()
        while (time.time() - z) < 2:
            print SQLString

SQLConnection = psycopg2.connect(database = "db", user = "xxxx", password = "xxxx")
cursor = SQLConnection.cursor()

a = testit('EURCZK')
b = testit('EURPLN')
a.start()
b.start()
However, as soon as I try to access the PostgreSQL database in the thread with the following code, I always get a stop-sign crash:
import psycopg2
import threading
import time

class testit(threading.Thread):
    def __init__(self, currency):
        threading.Thread.__init__(self)
        self.currency = currency

    def run(self):
        global SQLConnection
        global cursor
        SQLString = "Select dval from ddata where dname ='%s'and ddate = '2009-07-17'" %self.currency
        z = time.time()
        while (time.time() - z) < 2:
            cursor.execute(SQLString)
            print cursor.fetchall()

SQLConnection = psycopg2.connect(database = "db", user = "xxxx", password = "xxxx")
cursor = SQLConnection.cursor()

a = testit('EURCZK')
b = testit('EURPLN')
a.start()
b.start()
The only difference between the two is in the while loop. I am fairly new to thread programming. Is the postgres library (psycopg2) not "thread safe"? All this is running on Windows XP. Anything I can do?
Thanks.
global SQLConnection
global cursor
It seems you're accessing globals from multiple threads? You should never do that unless those globals are thread-safe, or you provide the proper locking yourself.
You now have 2 threads accessing the same connection and the same cursor. They'll step on each other's toes. The psycopg2 connection might be thread-safe, but cursors are not.
Use one cursor (and probably one connection as well) per thread.
Bingo, it's working. Someone left an answer suggesting giving each thread its own connection, but then seems to have removed it. And yep, that solves it. So this code works:
import psycopg2
import threading
import time

class testit(threading.Thread):
    def __init__(self, currency):
        threading.Thread.__init__(self)
        self.currency = currency
        self.SQLConnection = psycopg2.connect(database = "db", user = "xxxx", password = "xxxx")
        self.cursor = self.SQLConnection.cursor()

    def run(self):
        SQLString = "Select dval from ddata where dname ='%s' and ddate = '2009-07-17'" \
                    %self.currency
        z = time.time()
        while (time.time() - z) < 2:
            self.cursor.execute(SQLString)
            print self.cursor.fetchall()

a = testit('EURCZK')
b = testit('EURPLN')
a.start()
b.start()