Python threads crashing when they access PostgreSQL

Here is a simple threading program which works fine:
import psycopg2
import threading
import time

class testit(threading.Thread):
    def __init__(self, currency):
        threading.Thread.__init__(self)
        self.currency = currency

    def run(self):
        global SQLConnection
        global cursor
        SQLString = "Select dval from ddata where dname = '%s' and ddate = '2009-07-17'" \
                    % self.currency
        z = time.time()
        while (time.time() - z) < 2:
            print SQLString

SQLConnection = psycopg2.connect(database="db", user="xxxx", password="xxxx")
cursor = SQLConnection.cursor()

a = testit('EURCZK')
b = testit('EURPLN')
a.start()
b.start()
However, as soon as I try to access the PostgreSQL database from inside the thread with the following code, I always get a stop-sign crash:
import psycopg2
import threading
import time

class testit(threading.Thread):
    def __init__(self, currency):
        threading.Thread.__init__(self)
        self.currency = currency

    def run(self):
        global SQLConnection
        global cursor
        SQLString = "Select dval from ddata where dname = '%s' and ddate = '2009-07-17'" \
                    % self.currency
        z = time.time()
        while (time.time() - z) < 2:
            cursor.execute(SQLString)
            print cursor.fetchall()

SQLConnection = psycopg2.connect(database="db", user="xxxx", password="xxxx")
cursor = SQLConnection.cursor()

a = testit('EURCZK')
b = testit('EURPLN')
a.start()
b.start()
The only difference between the two is in the while loop. I am fairly new to thread programming. Is the Postgres library (psycopg2) not thread-safe? All this is running on Windows XP. Is there anything I can do?
Thanks.

global SQLConnection
global cursor
It seems you're accessing globals from multiple threads? You should never do that unless those globals are thread-safe or you provide the proper locking yourself.
You now have two threads accessing the same connection and the same cursor. They'll step on each other's toes. The psycopg2 connection might be thread-safe, but cursors are not.
Use one cursor (and probably one connection as well) per thread.
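For illustration, here is a minimal sketch of that advice using psycopg2's ThreadedConnectionPool, so that each thread checks out its own connection and cursor for the duration of its work (the database name and credentials are the placeholders from the question):

import threading
import psycopg2
import psycopg2.pool

# shared pool; each thread borrows its own connection from it
pool = psycopg2.pool.ThreadedConnectionPool(minconn=2, maxconn=4,
                                            database="db", user="xxxx", password="xxxx")

def worker(currency):
    conn = pool.getconn()       # this thread's own connection
    try:
        cursor = conn.cursor()  # and its own cursor
        cursor.execute("Select dval from ddata where dname = %s and ddate = '2009-07-17'",
                       (currency,))
        print(cursor.fetchall())
        cursor.close()
    finally:
        pool.putconn(conn)      # hand the connection back to the pool

threads = [threading.Thread(target=worker, args=(c,)) for c in ('EURCZK', 'EURPLN')]
for t in threads:
    t.start()
for t in threads:
    t.join()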

Bingo, it's working. Someone left an answer suggesting I give each thread its own connection, but then seems to have removed it. And yep, that solves it. So this code works:
import psycopg2
import threading
import time

class testit(threading.Thread):
    def __init__(self, currency):
        threading.Thread.__init__(self)
        self.currency = currency
        self.SQLConnection = psycopg2.connect(database="db", user="xxxx", password="xxxx")
        self.cursor = self.SQLConnection.cursor()

    def run(self):
        SQLString = "Select dval from ddata where dname = '%s' and ddate = '2009-07-17'" \
                    % self.currency
        z = time.time()
        while (time.time() - z) < 2:
            self.cursor.execute(SQLString)
            print self.cursor.fetchall()

a = testit('EURCZK')
b = testit('EURPLN')
a.start()
b.start()

Related

Python - Dynamic processing in parallel based on a list

I am trying to create an approach to run multiple queries from a list at the same time, using the threading library. For that I have this code:
import time
from threading import Thread, Lock

queries = ["SELECT * FROM db1.trans", "SELECT * FROM db1.order", "SELECT * FROM db2.Store",
           "SELECT * FROM db2.Document", "SELECT * FROM db3.Sales"]

class DatabaseWorker(Thread):
    __lock = Lock()

    def __init__(self, query, result_queue):
        Thread.__init__(self)
        self.query = query
        self.result_queue = result_queue

    def run(self):
        result = None
        print("Connecting to database...")
        try:
            conn = connect(host=host, port=port)
            curs = conn.cursor()
            curs.execute(self.query)
            result = curs
            curs.close()
            conn.close()
        except Exception as e:
            print(str(e))
        self.result_queue.append(result)

delay = 1
result_queue = []
for query in queries:
    worker1 = DatabaseWorker(query, result_queue)
    worker1.start()

while len(result_queue) < 2:
    time.sleep(delay)
job_done = True
worker1.join()
Using the above approach I am running in sequential mode. I know I can do it this way:
worker1 = DatabaseWorker(queries[0],result_queue)
worker2 = DatabaseWorker(queries[1],result_queue)
...
But I think it is not the best way. Does anyone know how I can run all the queries from the list dynamically?
Thanks!
Python threading is not really parallel because of the Python GIL (Global Interpreter Lock). For truly parallel operation you can use the Python multiprocessing module. Example:
import multiprocessing

def runner(task):
    return f'Hi, i do {task}'

if __name__ == '__main__':
    list_tasks = ['1', '2', '3']
    with multiprocessing.Pool() as pool:
        result = pool.map(runner, list_tasks)
        print(result)
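If the goal is specifically to run the queries from the list above in parallel, the same pattern could look roughly like the sketch below. This is only a sketch: connect(), host and port are placeholders standing in for the asker's own database driver and settings, each worker process opens its own connection (connection objects cannot be shared between processes), and plain rows are returned rather than cursors so the results can be pickled back to the parent.

import multiprocessing

queries = ["SELECT * FROM db1.trans", "SELECT * FROM db1.order", "SELECT * FROM db2.Store",
           "SELECT * FROM db2.Document", "SELECT * FROM db3.Sales"]

def run_query(query):
    # connect/host/port are placeholders for the asker's database setup
    conn = connect(host=host, port=port)
    try:
        curs = conn.cursor()
        curs.execute(query)
        return curs.fetchall()  # plain rows, so the result can be pickled
    finally:
        conn.close()

if __name__ == '__main__':
    with multiprocessing.Pool(processes=len(queries)) as pool:
        results = pool.map(run_query, queries)
    print(results)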

How to do the Hollywood principle between processes in Python?

I was planning to change my project to multiple processes so I can use more resources. Here's my database module code:
import pymysql
import threading

class tdb:
    def __init__(self):
        self.totalEffected = 0

    def start(self):
        self.conn = pymysql.connect(host='xxxx', port=3306, user='root', passwd='xxxx', db='xxxx', charset='utf8')

    def select(self, sql, args=None):
        cur = self.conn.cursor()
        cur.execute(sql, args)
        result = cur.fetchall()
        cur.close()
        return result

    def execute(self, sql, args=None):
        cur = self.conn.cursor()
        result = cur.execute(sql, args)
        cur.close()
        self.totalEffected += result
        return result

    # def __commit(self, callback):

    def __commitCallback(self, result):
        print('commit result:', result)
        self.conn.close()

    def errorc(self, *args):
        print('error')

    def end(self):
        # init()
        # p.apply_async(self.conn.commit, callback=self.__commitCallback, error_callback=self.errorc)
        if self.totalEffected != 0:
            thread = threading.Thread(target=self.t)
            thread.start()
        else:
            self.conn.close()
        # p.apply(self.conn.commit)
        # self.conn.close()
        # print('result:', result.get())

    def t(self):
        self.conn.commit()
        self.conn.close()
The only operation that really needs handling is conn.commit(). I use a thread to do it, so end() can return immediately. I once used Pool.apply_async(), but it never called back, so I want to know how to make the other process call me, so I don't have to spend my time waiting to receive the result.
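For reference, a self-contained sketch of the callback flow Pool.apply_async provides (the "don't call us, we'll call you" part), assuming Python 3 for the error_callback argument. slow_commit, on_done and on_error are made-up names; note that whatever you submit, together with its arguments, has to be picklable, which an open database connection (and therefore a bound conn.commit) generally is not, so sending the commit to another process this way is unlikely to work as hoped, and a background thread as in the code above is a common workaround.

import multiprocessing
import time

def slow_commit(n):
    time.sleep(1)               # stand-in for the real conn.commit()
    return 'committed %d statements' % n

def on_done(result):
    # the pool calls this in the parent process when the task finishes
    print('commit result: %s' % result)

def on_error(exc):
    print('error: %r' % exc)

if __name__ == '__main__':
    pool = multiprocessing.Pool(processes=1)
    pool.apply_async(slow_commit, (3,), callback=on_done, error_callback=on_error)
    pool.close()
    pool.join()                 # keep the parent alive until the callback has fired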

Python multiprocessing RemoteManager under a multiprocessing.Process

I'm trying to start a data queue server under a managing process (so that it can later be turned into a service), and while the data queue server function works fine in the main process, it does not work in a process created using multiprocessing.Process.
The dataQueueServer and dataQueueClient code is based on the code from the multiprocessing module documentation here.
When run on its own, dataQueueServer works well. However, when run using a multiprocessing.Process's start() in mpqueue, it doesn't work (when tested with the client). I am using the dataQueueClient without changes to test both cases.
The code does reach the serve_forever in both cases, so I think the server is working, but something is blocking it from communicating back to the client in the mpqueue case.
I have placed the loop that runs the serve_forever() part under a thread, so that it can be stopped.
Here is the code:
mpqueue # this is the "manager" process trying to spawn the server in a child process
import time
import multiprocessing
import threading

import dataQueueServer

class Printer():
    def __init__(self):
        self.lock = threading.Lock()

    def tsprint(self, text):
        with self.lock:
            print text

class QueueServer(multiprocessing.Process):
    def __init__(self, name = '', printer = None):
        multiprocessing.Process.__init__(self)
        self.name = name
        self.printer = printer
        self.ml = dataQueueServer.MainLoop(name = 'ml', printer = self.printer)

    def run(self):
        self.printer.tsprint(self.ml)
        self.ml.start()

    def stop(self):
        self.ml.stop()

if __name__ == '__main__':
    printer = Printer()
    qs = QueueServer(name = 'QueueServer', printer = printer)
    printer.tsprint(qs)
    printer.tsprint('starting')
    qs.start()
    printer.tsprint('started.')
    printer.tsprint('Press Ctrl-C to quit')
    try:
        while True:
            time.sleep(60)
    except KeyboardInterrupt:
        printer.tsprint('\nTrying to exit cleanly...')
        qs.stop()
        printer.tsprint('stopped')
dataQueueServer
import time
import threading
from multiprocessing.managers import BaseManager
from multiprocessing import Queue

HOST = ''
PORT = 50010
AUTHKEY = 'authkey'

## Define some helper functions for use by the main process loop

class Printer():
    def __init__(self):
        self.lock = threading.Lock()

    def tsprint(self, text):
        with self.lock:
            print text

class QueueManager(BaseManager):
    pass

class MainLoop(threading.Thread):
    """A thread based loop manager, allowing termination signals to be sent
    to the thread"""

    def __init__(self, name = '', printer = None):
        threading.Thread.__init__(self)
        self._stopEvent = threading.Event()
        self.daemon = True
        self.name = name
        if printer is None:
            self.printer = Printer()
        else:
            self.printer = printer
        ## create the queue
        self.queue = Queue()
        ## Add a function to the handler to return the queue to clients
        self.QM = QueueManager
        self.QM.register('get_queue', callable=lambda: self.queue)
        self.queue_manager = self.QM(address=(HOST, PORT), authkey=AUTHKEY)
        self.queue_server = self.queue_manager.get_server()

    def __del__(self):
        self.printer.tsprint('closing...')

    def run(self):
        self.printer.tsprint('{}: started serving'.format(self.name))
        self.queue_server.serve_forever()

    def stop(self):
        self.printer.tsprint('{}: stopping'.format(self.name))
        self._stopEvent.set()

    def stopped(self):
        return self._stopEvent.isSet()

def start():
    printer = Printer()
    ml = MainLoop(name = 'ml', printer = printer)
    ml.start()
    return ml

def stop(ml):
    ml.stop()

if __name__ == '__main__':
    ml = start()
    raw_input("\nhit return to stop")
    stop(ml)
And a client:
dataQueueClient
import datetime
from multiprocessing.managers import BaseManager

n = 0
N = 10**n

HOST = ''
PORT = 50010
AUTHKEY = 'authkey'

def now():
    return datetime.datetime.now()

def gen(n, func, *args, **kwargs):
    k = 0
    while k < n:
        yield func(*args, **kwargs)
        k += 1

class QueueManager(BaseManager):
    pass

QueueManager.register('get_queue')
m = QueueManager(address=(HOST, PORT), authkey=AUTHKEY)
m.connect()
queue = m.get_queue()

def load(msg, q):
    return q.put(msg)

def get(q):
    return q.get()

lgen = gen(N, load, msg='hello', q=queue)

t0 = now()
while True:
    try:
        lgen.next()
    except StopIteration:
        break
t1 = now()
print 'loaded %d items in ' % N, t1 - t0

t0 = now()
while queue.qsize() > 0:
    queue.get()
t1 = now()
print 'got %d items in ' % N, t1 - t0
So it seems like the solution is simple enough: Don't use serve_forever(), and use manager.start() instead.
According to Eli Bendersky, the BaseManager (and its extended version SyncManager) already spawns the server in a new process (and looking at the multiprocessing.managers code confirms this). The problem I have been experiencing stems from the form used in the example, in which the server is started under the main process.
I still don't understand why the current example doesn't work when run under a child process, but that's no longer an issue.
Here's the working (and much simplified from the OP) code to manage multiple queue servers:
Server:
from multiprocessing import Queue
from multiprocessing.managers import SyncManager
HOST = ''
PORT0 = 5011
PORT1 = 5012
PORT2 = 5013
AUTHKEY = 'authkey'
name0 = 'qm0'
name1 = 'qm1'
name2 = 'qm2'
description = 'Queue Server'
def CreateQueueServer(HOST, PORT, AUTHKEY, name = None, description = None):
    q = Queue()

    class QueueManager(SyncManager):
        pass

    QueueManager.register('get_queue', callable = lambda: q)
    QueueManager.register('get_name', callable = lambda: name)
    QueueManager.register('get_description', callable = lambda: description)
    manager = QueueManager(address = (HOST, PORT), authkey = AUTHKEY)
    manager.start()  # This actually starts the server
    return manager

# Start three queue servers
qm0 = CreateQueueServer(HOST, PORT0, AUTHKEY, name0, description)
qm1 = CreateQueueServer(HOST, PORT1, AUTHKEY, name1, description)
qm2 = CreateQueueServer(HOST, PORT2, AUTHKEY, name2, description)
raw_input("return to end")
Client:
from multiprocessing.managers import SyncManager
HOST = ''
PORT0 = 5011
PORT1 = 5012
PORT2 = 5013
AUTHKEY = 'authkey'
def QueueServerClient(HOST, PORT, AUTHKEY):
    class QueueManager(SyncManager):
        pass

    QueueManager.register('get_queue')
    QueueManager.register('get_name')
    QueueManager.register('get_description')
    manager = QueueManager(address = (HOST, PORT), authkey = AUTHKEY)
    manager.connect()  # This starts the connected client
    return manager

# create three connected managers
qc0 = QueueServerClient(HOST, PORT0, AUTHKEY)
qc1 = QueueServerClient(HOST, PORT1, AUTHKEY)
qc2 = QueueServerClient(HOST, PORT2, AUTHKEY)
# Get the queue objects from the clients
q0 = qc0.get_queue()
q1 = qc1.get_queue()
q2 = qc2.get_queue()
# put stuff in the queues
q0.put('some stuff')
q1.put('other stuff')
q2.put({1:123, 2:'abc'})
# check their sizes
print 'q0 size', q0.qsize()
print 'q1 size', q1.qsize()
print 'q2 size', q2.qsize()
# pull some stuff and print it
print q0.get()
print q1.get()
print q2.get()
Adding an additional server that shares a dictionary with information about the running queue servers, so that consumers can easily tell what's available where, is easy enough using this model. One thing to note, though, is that the shared dictionary requires slightly different syntax than a normal dictionary: dictionary[0] = something will not work. You need to use dictionary.update([(key, value), (otherkey, othervalue)]) and dictionary.get(key), and the updates propagate to all other clients connected to that dictionary.
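For example, a sketch of such a directory server, following the same register-a-callable pattern as above (the registry contents and the port here are made up for illustration):

from multiprocessing.managers import SyncManager

registry = {}   # e.g. maps a queue server's name to its port

class DirectoryManager(SyncManager):
    pass

DirectoryManager.register('get_registry', callable = lambda: registry)
directory = DirectoryManager(address = ('', 5010), authkey = 'authkey')
directory.start()

A client that registers 'get_registry' on its own SyncManager subclass and connects can then do reg = manager.get_registry(), and read and write it with reg.update([(name0, PORT0)]) and reg.get(name0) rather than item assignment, as described above.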

Create DB connection and maintain on multiple processes (multiprocessing)

Similar to another post I made, this answers that post and creates a new question.
Recap: I need to update every record in a spatial database in which I have a data set of points that overlays a data set of polygons. For each point feature I want to assign a key relating it to the polygon feature it lies within. So if my point 'New York City' lies within the polygon USA, and the USA polygon has 'GID = 1', I will assign 'gid_fkey = 1' to my point New York City.
Okay, so this has been achieved using multiprocessing. I have noticed a 150% increase in speed using it, so it does work. But I think there is a bunch of unnecessary overhead, as one DB connection is required for each record.
So here is the code:
import multiprocessing, time, psycopg2
class Consumer(multiprocessing.Process):
    def __init__(self, task_queue, result_queue):
        multiprocessing.Process.__init__(self)
        self.task_queue = task_queue
        self.result_queue = result_queue

    def run(self):
        proc_name = self.name
        while True:
            next_task = self.task_queue.get()
            if next_task is None:
                print 'Tasks Complete'
                self.task_queue.task_done()
                break
            answer = next_task()
            self.task_queue.task_done()
            self.result_queue.put(answer)
        return

class Task(object):
    def __init__(self, a):
        self.a = a

    def __call__(self):
        pyConn = psycopg2.connect("dbname='geobase_1' host = 'localhost'")
        pyConn.set_isolation_level(0)
        pyCursor1 = pyConn.cursor()
        procQuery = 'UPDATE city SET gid_fkey = gid FROM country WHERE ST_within((SELECT the_geom FROM city WHERE city_id = %s), country.the_geom) AND city_id = %s' % (self.a, self.a)
        pyCursor1.execute(procQuery)
        print 'What is self?'
        print self.a
        return self.a

    def __str__(self):
        return 'ARC'

    def run(self):
        print 'IN'

if __name__ == '__main__':
    tasks = multiprocessing.JoinableQueue()
    results = multiprocessing.Queue()

    num_consumers = multiprocessing.cpu_count() * 2
    consumers = [Consumer(tasks, results) for i in xrange(num_consumers)]
    for w in consumers:
        w.start()

    pyConnX = psycopg2.connect("dbname='geobase_1' host = 'localhost'")
    pyConnX.set_isolation_level(0)
    pyCursorX = pyConnX.cursor()

    pyCursorX.execute('SELECT count(*) FROM cities WHERE gid_fkey IS NULL')
    temp = pyCursorX.fetchall()
    num_job = temp[0]
    num_jobs = num_job[0]

    pyCursorX.execute('SELECT city_id FROM city WHERE gid_fkey IS NULL')
    cityIdListTuple = pyCursorX.fetchall()

    cityIdList = []
    for x in cityIdListTuple:
        cityIdList.append(x[0])

    for i in xrange(num_jobs):
        tasks.put(Task(cityIdList[i - 1]))

    for i in xrange(num_consumers):
        tasks.put(None)

    while num_jobs:
        result = results.get()
        print result
        num_jobs -= 1
It looks to be between 0.3 and 1.5 seconds per connection, as I have measured with the 'time' module.
Is there a way to make one DB connection per process and then just feed the city_id info as a variable into a query on that open connection's cursor? This way I would make, say, four processes, each with its own DB connection, and then drop the city_id values in somehow for processing.
Try to isolate the creation of your connection in the Consumer constructor, then give it to the executed Task:
import multiprocessing, time, psycopg2
class Consumer(multiprocessing.Process):
    def __init__(self, task_queue, result_queue):
        multiprocessing.Process.__init__(self)
        self.task_queue = task_queue
        self.result_queue = result_queue
        self.pyConn = psycopg2.connect("dbname='geobase_1' host = 'localhost'")
        self.pyConn.set_isolation_level(0)

    def run(self):
        proc_name = self.name
        while True:
            next_task = self.task_queue.get()
            if next_task is None:
                print 'Tasks Complete'
                self.task_queue.task_done()
                break
            answer = next_task(connection=self.pyConn)
            self.task_queue.task_done()
            self.result_queue.put(answer)
        return

class Task(object):
    def __init__(self, a):
        self.a = a

    def __call__(self, connection=None):
        pyConn = connection
        pyCursor1 = pyConn.cursor()
        procQuery = 'UPDATE city SET gid_fkey = gid FROM country WHERE ST_within((SELECT the_geom FROM city WHERE city_id = %s), country.the_geom) AND city_id = %s' % (self.a, self.a)
        pyCursor1.execute(procQuery)
        print 'What is self?'
        print self.a
        return self.a

    def __str__(self):
        return 'ARC'

    def run(self):
        print 'IN'
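As a side note, psycopg2 can also bind the city_id values itself instead of having the query built with Python string interpolation; a possible variant of the two query lines in Task.__call__ above, with the same query but the parameters passed separately:

procQuery = ('UPDATE city SET gid_fkey = gid FROM country '
             'WHERE ST_within((SELECT the_geom FROM city WHERE city_id = %s), country.the_geom) '
             'AND city_id = %s')
pyCursor1.execute(procQuery, (self.a, self.a))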

sqlalchemy + mysql deadlocks

I insert rows with random data into a MySQL database. After inserting some rows, the greenlet hangs on connecting. The statistics-printing greenlet keeps running.
This happens with any number of workers (including one), both with the mysql-connector and MySQLdb drivers.
SQLite works fine.
This has no effect (as I understand, it is already fixed in new gevent):
def patch():
    from gevent import monkey
    monkey.patch_all()
    # fix https://bugs.launchpad.net/myconnpy/+bug/712037
    from mysql.connector.connection import MySQLConnection
    MySQLConnection.get_characterset_info = MySQLConnection.get_charset

patch()

from sqlalchemy import MetaData, Table, Column, Integer, String, create_engine
from gevent import spawn, sleep
from random import randrange
from time import time

class Stats(object):
    def __init__(self):
        self.inserts, self.faults = 0, 0

    def run(self):
        while True:
            sleep(1)
            print "%d %d %d" % (time(), self.inserts, self.faults)
            self.inserts, self.faults = 0, 0

class Victim(object):
    metadata = MetaData()
    Entry = Table(
        'entry', metadata,
        Column('id', Integer, primary_key=True),
        Column('junk', String(128), unique=True)
    )

    def __init__(self, cs, stats):
        self.e = create_engine(cs)
        self.metadata.drop_all(self.e)
        self.metadata.create_all(self.e)
        self.stats = stats

    def add(self, junk, i):
        print i, 'connecting'
        c = self.e.connect()
        print i, 'connected'
        t = c.begin()
        try:
            q = self.Entry.insert().values(junk=junk)
            c.execute(q)
            t.commit()
            self.stats.inserts += 1
        except Exception as e:
            print i, 'EXCEPTION: ', e
            t.rollback()
            self.stats.faults += 1
        print i, 'done'

def flood(victim, i):
    a, z, l = ord('a'), ord('z') + 1, 100
    while True:
        victim.add(''.join(chr(randrange(a, z)) for _ in xrange(l)), i)
        sleep(0)

def main(n_threads, cs):
    stats = Stats()
    victim = Victim(cs, stats)
    threads = [spawn(flood, victim, i) for i in xrange(n_threads)]
    threads.append(spawn(stats.run))
    [t.join() for t in threads]

#main(2, 'mysql://root:root@localhost/junk')
main(1, 'mysql+mysqlconnector://root:root@localhost/junk')
What is happening?
Retested; the error persists without gevent, so it is probably something with the server configuration.
I just forgot to release used connections, so connections were never checked back into the pool.
...
def add(self, junk, i):
    print i, 'connecting'
    c = self.e.connect()
    ...
    try:
        ...
    except Exception as e:
        ...
    finally:
        c.close()  # <-- this returns the connection into the pool
    print i, 'done'
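An alternative sketch, assuming a reasonably recent SQLAlchemy, is to let engine.begin() manage the transaction so there is no close() to forget: the block commits on success, rolls back on an exception, and returns the connection to the pool when it exits.

def add(self, junk, i):
    print i, 'connecting'
    try:
        # engine.begin() yields a connection, commits or rolls back,
        # and checks the connection back into the pool on exit
        with self.e.begin() as c:
            c.execute(self.Entry.insert().values(junk=junk))
        self.stats.inserts += 1
    except Exception as e:
        print i, 'EXCEPTION: ', e
        self.stats.faults += 1
    print i, 'done'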
