Create DB connection and maintain it across multiple processes (multiprocessing) - python

Similar to another post I made, this answers that post and creates a new question.
Recap: I need to update every record in a spatial database in which I have a data set of points that overlays a data set of polygons. For each point feature I want to assign a key to relate it to the polygon feature that it lies within. So if my point 'New York City' lies within polygon USA and for the USA polygon 'GID = 1' I will assign 'gid_fkey = 1' for my point New York City.
Okay, so this has been achieved using multiprocessing. I have noticed a 150% increase in speed using this, so it does work. But I think there is a bunch of unnecessary overhead, as one DB connection is required for each record.
So here is the code:
import multiprocessing, time, psycopg2

class Consumer(multiprocessing.Process):

    def __init__(self, task_queue, result_queue):
        multiprocessing.Process.__init__(self)
        self.task_queue = task_queue
        self.result_queue = result_queue

    def run(self):
        proc_name = self.name
        while True:
            next_task = self.task_queue.get()
            if next_task is None:
                print 'Tasks Complete'
                self.task_queue.task_done()
                break
            answer = next_task()
            self.task_queue.task_done()
            self.result_queue.put(answer)
        return

class Task(object):

    def __init__(self, a):
        self.a = a

    def __call__(self):
        pyConn = psycopg2.connect("dbname='geobase_1' host = 'localhost'")
        pyConn.set_isolation_level(0)
        pyCursor1 = pyConn.cursor()
        procQuery = 'UPDATE city SET gid_fkey = gid FROM country WHERE ST_within((SELECT the_geom FROM city WHERE city_id = %s), country.the_geom) AND city_id = %s' % (self.a, self.a)
        pyCursor1.execute(procQuery)
        print 'What is self?'
        print self.a
        return self.a

    def __str__(self):
        return 'ARC'

    def run(self):
        print 'IN'

if __name__ == '__main__':
    tasks = multiprocessing.JoinableQueue()
    results = multiprocessing.Queue()

    num_consumers = multiprocessing.cpu_count() * 2
    consumers = [Consumer(tasks, results) for i in xrange(num_consumers)]
    for w in consumers:
        w.start()

    pyConnX = psycopg2.connect("dbname='geobase_1' host = 'localhost'")
    pyConnX.set_isolation_level(0)
    pyCursorX = pyConnX.cursor()

    pyCursorX.execute('SELECT count(*) FROM cities WHERE gid_fkey IS NULL')
    temp = pyCursorX.fetchall()
    num_job = temp[0]
    num_jobs = num_job[0]

    pyCursorX.execute('SELECT city_id FROM city WHERE gid_fkey IS NULL')
    cityIdListTuple = pyCursorX.fetchall()

    cityIdList = []
    for x in cityIdListTuple:
        cityIdList.append(x[0])

    for i in xrange(num_jobs):
        tasks.put(Task(cityIdList[i - 1]))

    for i in xrange(num_consumers):
        tasks.put(None)

    while num_jobs:
        result = results.get()
        print result
        num_jobs -= 1
It looks to be between 0.3 and 1.5 seconds per connection, as I have measured it with the 'time' module.
Is there a way to make one DB connection per process and then just use the city_id info as a variable that I can feed into a query on that already-open cursor? This way I would make, say, four processes, each with its own DB connection, and then drop the city_id in somehow for processing.

Try to isolate the creation of your connection in the Consumer constructor, then pass it to the executed Task:
import multiprocessing, time, psycopg2

class Consumer(multiprocessing.Process):

    def __init__(self, task_queue, result_queue):
        multiprocessing.Process.__init__(self)
        self.task_queue = task_queue
        self.result_queue = result_queue
        self.pyConn = psycopg2.connect("dbname='geobase_1' host = 'localhost'")
        self.pyConn.set_isolation_level(0)

    def run(self):
        proc_name = self.name
        while True:
            next_task = self.task_queue.get()
            if next_task is None:
                print 'Tasks Complete'
                self.task_queue.task_done()
                break
            answer = next_task(connection=self.pyConn)
            self.task_queue.task_done()
            self.result_queue.put(answer)
        return

class Task(object):

    def __init__(self, a):
        self.a = a

    def __call__(self, connection=None):
        pyConn = connection
        pyCursor1 = pyConn.cursor()
        procQuery = 'UPDATE city SET gid_fkey = gid FROM country WHERE ST_within((SELECT the_geom FROM city WHERE city_id = %s), country.the_geom) AND city_id = %s' % (self.a, self.a)
        pyCursor1.execute(procQuery)
        print 'What is self?'
        print self.a
        return self.a

    def __str__(self):
        return 'ARC'

    def run(self):
        print 'IN'
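As a side note (an untested sketch, keeping the same table and column names): opening the connection inside run() guarantees it is created in the child process rather than inherited across the fork, and letting psycopg2 substitute the parameters avoids building the SQL with % string formatting:

import multiprocessing, psycopg2

class Consumer(multiprocessing.Process):

    def __init__(self, task_queue, result_queue):
        multiprocessing.Process.__init__(self)
        self.task_queue = task_queue
        self.result_queue = result_queue
        self.pyConn = None  # opened lazily in run(), i.e. in the child process

    def run(self):
        # one connection per worker process, created after the fork
        self.pyConn = psycopg2.connect("dbname='geobase_1' host='localhost'")
        self.pyConn.set_isolation_level(0)
        while True:
            next_task = self.task_queue.get()
            if next_task is None:
                self.task_queue.task_done()
                break
            answer = next_task(connection=self.pyConn)
            self.task_queue.task_done()
            self.result_queue.put(answer)
        self.pyConn.close()

class Task(object):

    def __init__(self, a):
        self.a = a

    def __call__(self, connection=None):
        cursor = connection.cursor()
        # let psycopg2 substitute the parameters instead of % formatting
        cursor.execute(
            'UPDATE city SET gid_fkey = gid FROM country '
            'WHERE ST_within((SELECT the_geom FROM city WHERE city_id = %s), country.the_geom) '
            'AND city_id = %s', (self.a, self.a))
        return self.a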

Related

How to use multiprocessing inside a class whose attributes are an object and a dict?

I have a class:
# myclass.py
import multiprocessing as mp
from tqdm import tqdm

class MyClass(object):

    def __init__(self, *args):
        self.env = args[0]
        self.mydict = args[1]

    def run(self):
        list_data = [1,2,3,4,5,6,7,8,9,10]
        pool = mp.Pool(3)
        for _ in tqdm(pool.imap_unordered(self.exefunction, list_data), total=len(list_data), desc='Main process'):
            pass
        pool.close()
        pool.join()

    def exefunction(self, number):
        print(number)
        print(self.env)
        print(self.mydict)

# myenv.py
from mysql.connector.pooling import MySQLConnectionPool

class MyEnv(object):

    def __init__(self, *args, **kwargs):
        self.database = args[0]
        self.host = args[1]
        self.port = args[2]
        self.user = args[3]
        self.pwd = args[4]
        self.sp = None
        self.param = None
        self.output_exception_msg = None
        self.output_sperror_msg = None
        self.pool = self.create_pool()

    def create_pool(self):
        self.pool = MySQLConnectionPool(pool_name=config.pool_name, pool_size=config.pool_size,
                                        user=self.user, password=self.pwd, host=self.host,
                                        port=self.port, database=self.database)
        return self.pool

# main.py
if __name__ == '__main__':
    list_dict = [{1: 3, 5: 10}]
    env = MyEnv(**dbconfig)
    build_class = MyClass(env, list_dict)
    build_class.run()
self.env is an object that was created from another class.
When I call the run function I get a TypeError: cannot serialize socket object.
I know the problem is that MyClass holds an object and a dict (when I pass integers for self.env and self.mydict it works). I don't have a solution to fix it.
UPDATE: I opened a connection pool to my DB (self.env contains that).
Each process runs in its own address space and therefore needs its own DB connection pool. So:
Class MyEnv should not explicitly call method create_pool from its __init__ method; this pool-creation needs to be postponed until later.
Each process in the multiprocessing pool will have its own instance of class MyEnv as a global variable that gets initialized when the pool is created. create_pool will be called on each of these instances as part of that initialization.
I obviously am not in a position to test this, but you should get the general idea, even if some minimal tweaking is necessary:
import multiprocessing as mp
from tqdm import tqdm

def init_pool(the_env):
    global env
    env = the_env
    env.create_pool()  # now create the pool

class MyClass(object):

    def __init__(self, *args):
        self.env = args[0]
        self.mydict = args[1]

    def run(self):
        list_data = [1,2,3,4,5,6,7,8,9,10]
        # initialize the env global variable for each process in the process pool:
        pool = mp.Pool(3, initializer=init_pool, initargs=(self.env,))
        for _ in tqdm(pool.imap_unordered(self.exefunction, list_data), total=len(list_data), desc='Main process'):
            pass
        pool.close()
        pool.join()

    def exefunction(self, number):
        print(number)
        print(env)  # access global
        print(self.mydict)

# myenv.py
from mysql.connector.pooling import MySQLConnectionPool

class MyEnv(object):

    def __init__(self, *args, **kwargs):
        self.database = args[0]
        self.host = args[1]
        self.port = args[2]
        self.user = args[3]
        self.pwd = args[4]
        self.sp = None
        self.param = None
        self.output_exception_msg = None
        self.output_sperror_msg = None
        # don't create the pool now:
        self.pool = None

    def create_pool(self):
        self.pool = MySQLConnectionPool(pool_name=config.pool_name, pool_size=config.pool_size,
                                        user=self.user, password=self.pwd, host=self.host,
                                        port=self.port, database=self.database)
        return self.pool

# main.py
if __name__ == '__main__':
    list_dict = [{1: 3, 5: 10}]
    env = MyEnv(**dbconfig)
    build_class = MyClass(env, list_dict)
    build_class.run()
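For what it's worth, the underlying reason the original version failed is that the MySQL connection pool holds open sockets, and socket objects cannot be pickled across the process boundary; deferring create_pool() means only plain configuration attributes get sent to the workers. A tiny illustration of that constraint (the exact error wording varies by Python version):

import pickle
import socket

s = socket.socket()
try:
    pickle.dumps(s)  # sockets (and DB connections built on them) can't be pickled
except Exception as e:
    print(e)         # e.g. "cannot pickle 'socket' object" / "cannot serialize socket object"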

Python 3.6 Object/Class Threading

I'm looking to create a "self contained threaded class" using Python 3.
At a high level, what I would like to do is spawn 50 asynchronous device objects from my "main" class and then just use their methods as needed. This is not difficult when just dealing with objects in a synchronous situation, but it gets cloudy quite quickly as we move to asynchronous processing. The primary idea is to keep the threading self-contained in the device class so my base (main.py) code stays streamlined/clean and free of any thread management.
I don't plan on any resource sharing in this case so I think I'm clear of any thread lock issues.
Here is some sample code for which I hope someone can provide hints or samples on making it a self-threaded class (meaning I don't want to manage threads at the main.py level):
Sample main.py
from deviceworker import Device

availableworkers = {'USA':'services.groupkt.com', 'IND':'services.groupkt.com'}

Activeworkers = []

for name, ip in availableworkers.items():
    Activeworkers.append(Device(name, ip))

for worker in Activeworkers:
    worker.checkcountry()  # asynchronous call - (we don't want to wait for a response)

# The idea is to keep this code as clean as possible.
Sample Object: deviceworker.py
import urllib.request
import urllib.parse
import json

class Device:

    def __init__(self, name, endpoint, preamble='state', port=80):
        self.name = name
        self.connected = False
        self.connection = HTTPConnection(endpoint, preamble, port)
        self.getStatus()

    def getStatus(self, check_for=None):
        self.urlresponse = json.loads(self.connection.GET('get/USA/all'))  # Use USA just to verify connection
        if check_for:
            pass
        self.connected = True

    def checkcountry(self):
        print(self.connection.GET('get/%s/all' % self.name))

class HTTPConnection:

    def __init__(self, endpoint, preamble=None, port=80):
        if preamble:  # specifying a version after the port and before the method
            self.url = 'http://%s:%s/%s/' % (endpoint, port, preamble)
        else:
            self.url = 'http://%s:%s/' % (endpoint, port)
        print('_init_ url=%s' % self.url)

    def GET(self, operation):
        #try:
        #print('%s%s' % (self.url, operation))
        with urllib.request.urlopen('%s%s' % (self.url, operation)) as f:
            return f.read().decode('utf-8')
        #except Exception as e:
        #raise Exception("GET Request Failed")
I've stripped most of the exception handling for simplicity. The sample above should work.
--- UPDATE ---
So I think I've sort of figured it out. Still not getting the parallelism I would expect from the documentation.
import threading
import urllib.request
import urllib.parse
import json
import time

class Device(threading.Thread):

    def __init__(self, name, endpoint, preamble='state', port=80):
        threading.Thread.__init__(self)
        self.name = name
        self.connected = False
        self.connection = HTTPConnection(endpoint, preamble, port)
        print('%s: __init__' % self.name)

    def run(self):
        self.getStatus()
        print('%s: hit run()' % self.name)

    def getStatus(self):
        self.urlresponse = json.loads(self.connection.GET('get/USA/all'))  # Use USA just to verify connection
        self.connected = True

    def checkcountry(self):
        if self.name == 'USA':
            self.waittime = 10
        else:
            self.waittime = 0
        print('%s: Getting Codes - wait time: %s' % (self.name, self.waittime))
        start_time = time.time()
        time.sleep(self.waittime)
        result = self.connection.GET('get/%s/all' % self.name)
        elapsed_time = time.time() - start_time
        print('%s: Got Codes - second: %s' % (self.name, elapsed_time))

class HTTPConnection:

    def __init__(self, endpoint, preamble=None, port=80):
        if preamble:  # specifying a version after the port and before the method
            self.url = 'http://%s:%s/%s/' % (endpoint, port, preamble)
        else:
            self.url = 'http://%s:%s/' % (endpoint, port)

    def GET(self, operation):
        with urllib.request.urlopen('%s%s' % (self.url, operation)) as f:
            return f.read().decode('utf-8')

DeviceList = {'USA':'services.groupkt.com', 'IND':'services.groupkt.com'}
ActiveDevices = []

for name, ip in DeviceList.items():
    print('main: creating object for: %s' % name)
    newDevice = Device(name, ip)
    ActiveDevices.append(newDevice)
    newDevice.start()

for device in ActiveDevices:
    print('main: calling checkcountry() for: %s' % device.name)
    device.checkcountry()
Here are the results:
main: creating object for: USA
USA: __init__
main: creating object for: IND
IND: __init__
main: calling checkcountry() for: USA
USA: Getting Codes - wait time: 10
USA: Got Codes - second: 10.167016744613647
main: calling checkcountry() for: IND
IND: Getting Codes - wait time: 0
IND: Got Codes - second: 0.11001110076904297
By adding the delay to the USA search I would have expected IND to finish first, but it appears the calls are serialized.
I'm running this on:
Python 3.6.0 (v3.6.0:41df79263a11, Dec 23 2016, 07:18:10) [MSC v.1900 32 bit (Intel)] on win32
Here is a custom thread sample with locking that worked great for me, better than using an event.
Try it in Colab.
import threading, time, inspect

i = 0
luk = threading.Lock()

class special_thread(threading.Thread):
    """This function starts a Thread class"""

    def __init__(self, execute, threadID, name, daemon, args=(), repetitive=False, kwargs=None, interval_sec=60):
        threading.Thread.__init__(self)
        self.daemon = daemon
        self.stopped = threading.Event()
        self.interval_sec = interval_sec
        self.execute = execute
        self.name = name
        if kwargs is None:
            kwargs = {}
        self.args = args
        self.kwargs = kwargs
        self.repetitive = repetitive
        self.threadID = threadID
        print(args)

    def stop(self):
        self.stopped.set()
        self.join()

    def run(self):
        if self.repetitive:
            while not self.stopped.wait(self.interval_sec):
                self.execute(*self.args, **self.kwargs)
        else:
            self.execute(*self.args, **self.kwargs)

def center(t_num):
    y = 0
    luk.acquire()
    caller = inspect.getouterframes(inspect.currentframe())[1][3]
    print(' {} is acquiring at {} '.format(caller, str(time.ctime())))
    y += t_num
    print("Inside %s()" % caller)
    print('thread number is ', t_num, y)
    time.sleep(2 * t_num)
    luk.release()
    print(' {} is releasing at {} '.format(caller, str(time.ctime())))

def target_uno():
    t_num = 1
    center(t_num)

def target_dos():
    t_num = 2
    center(t_num)

target_uno = special_thread(execute=target_uno, args=(), repetitive=True, interval_sec=1,
                            threadID=10004, name='target_uno', daemon=False)

target_dos = special_thread(execute=target_dos, args=(), repetitive=True, interval_sec=1,
                            threadID=10004, name='target_dos', daemon=False)

if __name__ == "__main__":
    target_uno.start()
    target_dos.start()
    time.sleep(20)
    target_uno.stop()
    target_dos.stop()
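As a side note on the update in the question: the calls look serialized because checkcountry() is invoked from the main thread, and only the code inside run() executes in the spawned thread. A minimal sketch of one way to get the overlap (a hypothetical Device variant reusing the question's names, with the HTTP GET stubbed out by a sleep):

import threading, time

class Device(threading.Thread):

    def __init__(self, name, waittime=0):
        threading.Thread.__init__(self)
        self.name = name
        self.waittime = waittime

    def run(self):
        # everything here runs in the spawned thread, so the two devices overlap
        start = time.time()
        time.sleep(self.waittime)  # stand-in for the HTTP GET
        print('%s finished after %.2fs' % (self.name, time.time() - start))

devices = [Device('USA', waittime=10), Device('IND', waittime=0)]
for d in devices:
    d.start()
for d in devices:
    d.join()  # IND prints first even though USA was started first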

Python : multiprocessing and Array of c_char_p

I'm launching 3 processes and I want them to put a string into a shared array, at the index corresponding to the process (i).
Look at the code below, the output generated is:
['test 0', None, None]
['test 1', 'test 1', None]
['test 2', 'test 2', 'test 2']
Why does 'test 0' get overwritten by 'test 1', and 'test 1' by 'test 2'?
What I want is (order is not important):
['test 0', None, None]
['test 0', 'test 1', None]
['test 0', 'test 1', 'test 2']
The code :
#!/usr/bin/env python
import multiprocessing
from multiprocessing import Value, Lock, Process, Array
import ctypes
from ctypes import c_int, c_char_p

class Consumer(multiprocessing.Process):

    def __init__(self, task_queue, result_queue, arr, lock):
        multiprocessing.Process.__init__(self)
        self.task_queue = task_queue
        self.result_queue = result_queue
        self.arr = arr
        self.lock = lock

    def run(self):
        proc_name = self.name
        while True:
            next_task = self.task_queue.get()
            if next_task is None:
                self.task_queue.task_done()
                break
            answer = next_task(arr=self.arr, lock=self.lock)
            self.task_queue.task_done()
            self.result_queue.put(answer)
        return

class Task(object):

    def __init__(self, i):
        self.i = i

    def __call__(self, arr=None, lock=None):
        with lock:
            arr[self.i] = "test %d" % self.i
            print arr[:]

    def __str__(self):
        return 'ARC'

    def run(self):
        print 'IN'

if __name__ == '__main__':
    tasks = multiprocessing.JoinableQueue()
    results = multiprocessing.Queue()
    arr = Array(ctypes.c_char_p, 3)
    lock = multiprocessing.Lock()

    num_consumers = multiprocessing.cpu_count() * 2
    consumers = [Consumer(tasks, results, arr, lock) for i in xrange(num_consumers)]
    for w in consumers:
        w.start()

    for i in xrange(3):
        tasks.put(Task(i))

    for i in xrange(num_consumers):
        tasks.put(None)
I'm running Python 2.7.3 (Ubuntu)
This problem seems similar to this one. There, J.F. Sebastian speculated that the assignment to arr[i] points arr[i] to a memory address that was only meaningful to the subprocess making the assignment. The other subprocesses retrieve garbage when looking at that address.
There are at least two ways to avoid this problem. One is to use a multiprocessing Manager list:
import multiprocessing as mp

class Consumer(mp.Process):

    def __init__(self, task_queue, result_queue, lock, lst):
        mp.Process.__init__(self)
        self.task_queue = task_queue
        self.result_queue = result_queue
        self.lock = lock
        self.lst = lst

    def run(self):
        proc_name = self.name
        while True:
            next_task = self.task_queue.get()
            if next_task is None:
                self.task_queue.task_done()
                break
            answer = next_task(lock=self.lock, lst=self.lst)
            self.task_queue.task_done()
            self.result_queue.put(answer)
        return

class Task(object):

    def __init__(self, i):
        self.i = i

    def __call__(self, lock, lst):
        with lock:
            lst[self.i] = "test {}".format(self.i)
            print([lst[i] for i in range(3)])

if __name__ == '__main__':
    tasks = mp.JoinableQueue()
    results = mp.Queue()
    manager = mp.Manager()
    lst = manager.list(['']*3)
    lock = mp.Lock()

    num_consumers = mp.cpu_count() * 2
    consumers = [Consumer(tasks, results, lock, lst) for i in xrange(num_consumers)]
    for w in consumers:
        w.start()

    for i in xrange(3):
        tasks.put(Task(i))

    for i in xrange(num_consumers):
        tasks.put(None)

    tasks.join()
Another way is to use a shared array with a fixed size such as mp.Array('c', 10).
import multiprocessing as mp

class Consumer(mp.Process):

    def __init__(self, task_queue, result_queue, arr, lock):
        mp.Process.__init__(self)
        self.task_queue = task_queue
        self.result_queue = result_queue
        self.arr = arr
        self.lock = lock

    def run(self):
        proc_name = self.name
        while True:
            next_task = self.task_queue.get()
            if next_task is None:
                self.task_queue.task_done()
                break
            answer = next_task(arr=self.arr, lock=self.lock)
            self.task_queue.task_done()
            self.result_queue.put(answer)
        return

class Task(object):

    def __init__(self, i):
        self.i = i

    def __call__(self, arr, lock):
        with lock:
            arr[self.i].value = "test {}".format(self.i)
            print([a.value for a in arr])

if __name__ == '__main__':
    tasks = mp.JoinableQueue()
    results = mp.Queue()
    arr = [mp.Array('c', 10) for i in range(3)]
    lock = mp.Lock()

    num_consumers = mp.cpu_count() * 2
    consumers = [Consumer(tasks, results, arr, lock) for i in xrange(num_consumers)]
    for w in consumers:
        w.start()

    for i in xrange(3):
        tasks.put(Task(i))

    for i in xrange(num_consumers):
        tasks.put(None)

    tasks.join()
I speculate that the reason why this works when mp.Array(ctypes.c_char_p, 3) does not, is because mp.Array('c', 10) has a fixed size so the memory address never changes, while mp.Array(ctypes.c_char_p, 3) has a variable size, so the memory address might change when arr[i] is assigned to a bigger string.
Perhaps this is what the docs are warning about when it states,
Although it is possible to store a pointer in shared memory remember
that this will refer to a location in the address space of a specific
process. However, the pointer is quite likely to be invalid in the
context of a second process and trying to dereference the pointer from
the second process may cause a crash.

Python multiprocessing RemoteManager under a multiprocessing.Process

I'm trying to start a data queue server under a managing process (so that it can later be turned into a service), and while the data queue server function works fine in the main process, it does not work in a process created using multiprocessing.Process.
The dataQueueServer and dataQueueClient code is based on the code from the multiprocessing module documentation here.
When run on its own, dataQueueServer works well. However, when run using a multiprocessing.Process's start() in mpqueue, it doesn't work (when tested with the client). I am using the dataQueueClient without changes to test both cases.
The code does reach the serve_forever in both cases, so I think the server is working, but something is blocking it from communicating back to the client in the mpqueue case.
I have placed the loop that runs the serve_forever() part under a thread, so that it can be stopped.
Here is the code:
mpqueue # this is the "manager" process trying to spawn the server in a child process
import time
import multiprocessing
import threading

import dataQueueServer

class Printer():

    def __init__(self):
        self.lock = threading.Lock()

    def tsprint(self, text):
        with self.lock:
            print text

class QueueServer(multiprocessing.Process):

    def __init__(self, name = '', printer = None):
        multiprocessing.Process.__init__(self)
        self.name = name
        self.printer = printer
        self.ml = dataQueueServer.MainLoop(name = 'ml', printer = self.printer)

    def run(self):
        self.printer.tsprint(self.ml)
        self.ml.start()

    def stop(self):
        self.ml.stop()

if __name__ == '__main__':
    printer = Printer()

    qs = QueueServer(name = 'QueueServer', printer = printer)
    printer.tsprint(qs)
    printer.tsprint('starting')
    qs.start()
    printer.tsprint('started.')

    printer.tsprint('Press Ctrl-C to quit')
    try:
        while True:
            time.sleep(60)
    except KeyboardInterrupt:
        printer.tsprint('\nTrying to exit cleanly...')
        qs.stop()
        printer.tsprint('stopped')
dataQueueServer
import time
import threading

from multiprocessing.managers import BaseManager
from multiprocessing import Queue

HOST = ''
PORT = 50010
AUTHKEY = 'authkey'

## Define some helper functions for use by the main process loop

class Printer():

    def __init__(self):
        self.lock = threading.Lock()

    def tsprint(self, text):
        with self.lock:
            print text

class QueueManager(BaseManager):
    pass

class MainLoop(threading.Thread):
    """A thread based loop manager, allowing termination signals to be sent
    to the thread"""

    def __init__(self, name = '', printer = None):
        threading.Thread.__init__(self)
        self._stopEvent = threading.Event()
        self.daemon = True
        self.name = name

        if printer is None:
            self.printer = Printer()
        else:
            self.printer = printer

        ## create the queue
        self.queue = Queue()

        ## Add a function to the handler to return the queue to clients
        self.QM = QueueManager
        self.QM.register('get_queue', callable=lambda: self.queue)

        self.queue_manager = self.QM(address=(HOST, PORT), authkey=AUTHKEY)
        self.queue_server = self.queue_manager.get_server()

    def __del__(self):
        self.printer.tsprint('closing...')

    def run(self):
        self.printer.tsprint('{}: started serving'.format(self.name))
        self.queue_server.serve_forever()

    def stop(self):
        self.printer.tsprint('{}: stopping'.format(self.name))
        self._stopEvent.set()

    def stopped(self):
        return self._stopEvent.isSet()

def start():
    printer = Printer()
    ml = MainLoop(name = 'ml', printer = printer)
    ml.start()
    return ml

def stop(ml):
    ml.stop()

if __name__ == '__main__':
    ml = start()
    raw_input("\nhit return to stop")
    stop(ml)
And a client:
dataQueueClient
import datetime
from multiprocessing.managers import BaseManager

n = 0
N = 10**n

HOST = ''
PORT = 50010
AUTHKEY = 'authkey'

def now():
    return datetime.datetime.now()

def gen(n, func, *args, **kwargs):
    k = 0
    while k < n:
        yield func(*args, **kwargs)
        k += 1

class QueueManager(BaseManager):
    pass

QueueManager.register('get_queue')
m = QueueManager(address=(HOST, PORT), authkey=AUTHKEY)
m.connect()
queue = m.get_queue()

def load(msg, q):
    return q.put(msg)

def get(q):
    return q.get()

lgen = gen(N, load, msg = 'hello', q = queue)

t0 = now()
while True:
    try:
        lgen.next()
    except StopIteration:
        break
t1 = now()
print 'loaded %d items in ' % N, t1-t0

t0 = now()
while queue.qsize() > 0:
    queue.get()
t1 = now()
print 'got %d items in ' % N, t1-t0
So it seems like the solution is simple enough: Don't use serve_forever(), and use manager.start() instead.
According to Eli Bendersky, the BaseManager (and its extended version SyncManager) already spawns the server in a new process (and looking at the multiprocessing.managers code confirms this). The problem I have been experiencing stems from the form used in the example, in which the server is started under the main process.
I still don't understand why the current example doesn't work when run under a child process, but that's no longer an issue.
Here's the working (and much simplified from OP) code to manage multiple queue servers:
Server:
from multiprocessing import Queue
from multiprocessing.managers import SyncManager

HOST = ''
PORT0 = 5011
PORT1 = 5012
PORT2 = 5013
AUTHKEY = 'authkey'

name0 = 'qm0'
name1 = 'qm1'
name2 = 'qm2'
description = 'Queue Server'

def CreateQueueServer(HOST, PORT, AUTHKEY, name = None, description = None):
    q = Queue()

    class QueueManager(SyncManager):
        pass

    QueueManager.register('get_queue', callable = lambda: q)
    QueueManager.register('get_name', callable = lambda: name)
    QueueManager.register('get_description', callable = lambda: description)
    manager = QueueManager(address = (HOST, PORT), authkey = AUTHKEY)
    manager.start()  # This actually starts the server
    return manager

# Start three queue servers
qm0 = CreateQueueServer(HOST, PORT0, AUTHKEY, name0, description)
qm1 = CreateQueueServer(HOST, PORT1, AUTHKEY, name1, description)
qm2 = CreateQueueServer(HOST, PORT2, AUTHKEY, name2, description)

raw_input("return to end")
Client:
from multiprocessing.managers import SyncManager

HOST = ''
PORT0 = 5011
PORT1 = 5012
PORT2 = 5013
AUTHKEY = 'authkey'

def QueueServerClient(HOST, PORT, AUTHKEY):
    class QueueManager(SyncManager):
        pass

    QueueManager.register('get_queue')
    QueueManager.register('get_name')
    QueueManager.register('get_description')
    manager = QueueManager(address = (HOST, PORT), authkey = AUTHKEY)
    manager.connect()  # This starts the connected client
    return manager

# create three connected managers
qc0 = QueueServerClient(HOST, PORT0, AUTHKEY)
qc1 = QueueServerClient(HOST, PORT1, AUTHKEY)
qc2 = QueueServerClient(HOST, PORT2, AUTHKEY)

# Get the queue objects from the clients
q0 = qc0.get_queue()
q1 = qc1.get_queue()
q2 = qc2.get_queue()

# put stuff in the queues
q0.put('some stuff')
q1.put('other stuff')
q2.put({1:123, 2:'abc'})

# check their sizes
print 'q0 size', q0.qsize()
print 'q1 size', q1.qsize()
print 'q2 size', q2.qsize()

# pull some stuff and print it
print q0.get()
print q1.get()
print q2.get()
Adding an additional server to share a dictionary with the information about the running queue servers, so that consumers can easily tell what's available where, is easy enough with that model. One thing to note, though, is that the shared dictionary requires slightly different syntax than a normal dictionary: dictionary[0] = something will not work. You need to use dictionary.update([(key, value), (otherkey, othervalue)]) and dictionary.get(key) syntax, which propagates to all other clients connected to this dictionary.
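A minimal sketch of that directory-server idea, assuming the same SyncManager pattern as above (the RegistryManager name, the get_registry typeid and the port are made up for illustration):

from multiprocessing.managers import SyncManager

# server side: expose one shared dict describing the running queue servers
registry = {}

class RegistryManager(SyncManager):
    pass

RegistryManager.register('get_registry', callable = lambda: registry)
manager = RegistryManager(address = ('', 5010), authkey = 'authkey')
manager.start()

# a connected client gets a proxy to the same dict
client = RegistryManager(address = ('', 5010), authkey = 'authkey')
client.connect()
d = client.get_registry()

# item assignment on the proxy is not supported; use update()/get()
d.update([('qm0', 5011), ('qm1', 5012)])
print d.get('qm0')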

Embarrassingly Parallel DB Update Using Python (PostGIS/PostgreSQL)

I need to update every record in a spatial database in which I have a data set of points that overlays a data set of polygons. For each point feature I want to assign a key to relate it to the polygon feature that it lies within. So if my point 'New York City' lies within polygon USA and for the USA polygon 'GID = 1' I will assign 'gid_fkey = 1' for my point New York City.
To do this I have created the following query.
procQuery = 'UPDATE city SET gid_fkey = gid FROM country WHERE ST_within((SELECT the_geom FROM city WHERE wp_id = %s), country.the_geom) AND city_id = %s' % (cityID, cityID)
At present I am getting the cityID info from another query that just selects all cityID where gid_fkey is NULL. Essentially I just need to loop through these and run the query shown earlier. As the query only relies on static information in the other table, in theory all of these processes can be run at once. I have implemented the threading procedure below, but I can't seem to make the migration to multiprocessing.
import psycopg2, pprint, threading, time, Queue

queue = Queue.Queue()

pyConn = psycopg2.connect("dbname='geobase_1' host='localhost'")
pyConn.set_isolation_level(0)
pyCursor1 = pyConn.cursor()

getGID = 'SELECT cityID FROM city'
pyCursor1.execute(getGID)
gidList = pyCursor1.fetchall()

class threadClass(threading.Thread):

    def __init__(self, queue):
        threading.Thread.__init__(self)
        self.queue = queue

    def run(self):
        while True:
            gid = self.queue.get()

            procQuery = 'UPDATE city SET gid_fkey = gid FROM country WHERE ST_within((SELECT the_geom FROM city WHERE wp_id = %s), country.the_geom) AND city_id = %s' % (gid[0], gid[0])

            pyCursor2 = pyConn.cursor()
            pyCursor2.execute(procQuery)

            print gid[0]
            print 'Done'
            self.queue.task_done()

def main():
    for i in range(4):
        t = threadClass(queue)
        t.setDaemon(True)
        t.start()

    for gid in gidList:
        queue.put(gid)

    queue.join()

main()
I'm not even sure if the multithreading is optimal but it is definitely faster than going through one by one.
The machine I will be using has four cores (Quad Core) and a minimal Linux OS with no GUI, PostgreSQL, PostGIS and Python if that makes a difference.
What do I need to change to get this painfully easy multiprocessing task enabled?
Okay this is an answer to my own post. Well done me =D
Produces about a 150% increase in speed on my system going from a single core thread to quad core multiprocessing.
import multiprocessing, time, psycopg2

class Consumer(multiprocessing.Process):

    def __init__(self, task_queue, result_queue):
        multiprocessing.Process.__init__(self)
        self.task_queue = task_queue
        self.result_queue = result_queue

    def run(self):
        proc_name = self.name
        while True:
            next_task = self.task_queue.get()
            if next_task is None:
                print 'Tasks Complete'
                self.task_queue.task_done()
                break
            answer = next_task()
            self.task_queue.task_done()
            self.result_queue.put(answer)
        return

class Task(object):

    def __init__(self, a):
        self.a = a

    def __call__(self):
        pyConn = psycopg2.connect("dbname='geobase_1' host = 'localhost'")
        pyConn.set_isolation_level(0)
        pyCursor1 = pyConn.cursor()
        procQuery = 'UPDATE city SET gid_fkey = gid FROM country WHERE ST_within((SELECT the_geom FROM city WHERE city_id = %s), country.the_geom) AND city_id = %s' % (self.a, self.a)
        pyCursor1.execute(procQuery)
        print 'What is self?'
        print self.a
        return self.a

    def __str__(self):
        return 'ARC'

    def run(self):
        print 'IN'

if __name__ == '__main__':
    tasks = multiprocessing.JoinableQueue()
    results = multiprocessing.Queue()

    num_consumers = multiprocessing.cpu_count() * 2
    consumers = [Consumer(tasks, results) for i in xrange(num_consumers)]
    for w in consumers:
        w.start()

    pyConnX = psycopg2.connect("dbname='geobase_1' host = 'localhost'")
    pyConnX.set_isolation_level(0)
    pyCursorX = pyConnX.cursor()

    pyCursorX.execute('SELECT count(*) FROM cities WHERE gid_fkey IS NULL')
    temp = pyCursorX.fetchall()
    num_job = temp[0]
    num_jobs = num_job[0]

    pyCursorX.execute('SELECT city_id FROM city WHERE gid_fkey IS NULL')
    cityIdListTuple = pyCursorX.fetchall()

    cityIdList = []
    for x in cityIdListTuple:
        cityIdList.append(x[0])

    for i in xrange(num_jobs):
        tasks.put(Task(cityIdList[i - 1]))

    for i in xrange(num_consumers):
        tasks.put(None)

    while num_jobs:
        result = results.get()
        print result
        num_jobs -= 1
Now I have another question which I have posted here:
Create DB connection and maintain on multiple processes (multiprocessing)
Hopefully we can get rid of some overhead and speed this baby up even more.
In plain SQL one could do something like:
UPDATE city ci
SET gid_fkey = co.gid
FROM country co
WHERE ST_within(ci.the_geom , co.the_geom)
AND ci.city_id = _some_parameter_
;
There could be a problem if a city would fit into more than one country (causing multiple updates to the same target row), but that is probably not the case in your data.
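Building on that, if the per-city parameter is not actually needed, the whole backfill can be expressed as one set-based statement and run from Python without any process pool at all (a sketch, assuming the same table and column names as above):

import psycopg2

conn = psycopg2.connect("dbname='geobase_1' host='localhost'")
cur = conn.cursor()

# one statement updates every city that is still missing its country key
cur.execute("""
    UPDATE city ci
    SET gid_fkey = co.gid
    FROM country co
    WHERE ST_within(ci.the_geom, co.the_geom)
      AND ci.gid_fkey IS NULL
""")

conn.commit()
cur.close()
conn.close()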
