Is there a Pool class for worker threads, similar to the multiprocessing module's Pool class?
For example, I like how easy it is to parallelize a map function:
def long_running_func(p):
c_func_no_gil(p)
p = multiprocessing.Pool(4)
xs = p.map(long_running_func, range(100))
However, I would like to do it without the overhead of creating new processes.
I know about the GIL. However, in my use case, the function will be an IO-bound C function for which the Python wrapper will release the GIL before the actual function call.
Do I have to write my own threading pool?
I just found out that there actually is a thread-based Pool interface in the multiprocessing module; however, it is somewhat hidden and not properly documented.
It can be imported via
from multiprocessing.pool import ThreadPool
It is implemented using a dummy Process class wrapping a Python thread. This thread-based Process class can be found in multiprocessing.dummy, which is mentioned briefly in the docs. This dummy module supposedly provides the whole multiprocessing interface based on threads.
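For example, the map call from the question could be written against the thread-backed pool like this (a minimal sketch; the sleep stands in for the GIL-releasing C call from the question):

from multiprocessing.pool import ThreadPool
import time

def long_running_func(p):
    time.sleep(0.1)  # stand-in for the GIL-releasing C call

pool = ThreadPool(4)  # 4 worker threads instead of 4 processes
xs = pool.map(long_running_func, range(100))
pool.close()
pool.join()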
In Python 3 you can use concurrent.futures.ThreadPoolExecutor, e.g.:
from concurrent.futures import ThreadPoolExecutor

executor = ThreadPoolExecutor(max_workers=10)
a = executor.submit(my_function)
See the docs for more info and examples.
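For a map-style workload like the one in the question, a minimal sketch (my_function here is a trivial stand-in for real work) could look like:

from concurrent.futures import ThreadPoolExecutor

def my_function(x):
    return x * 2  # stand-in for real work

with ThreadPoolExecutor(max_workers=10) as executor:
    # executor.map returns results in input order, like Pool.map
    results = list(executor.map(my_function, range(100)))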
Yes, and it seems to have (more or less) the same API.
import multiprocessing
import multiprocessing.pool  # needed to reach multiprocessing.pool.ThreadPool
def worker(lnk):
....
def start_process():
.....
....
if PROCESS:
pool = multiprocessing.Pool(processes=POOL_SIZE, initializer=start_process)
else:
pool = multiprocessing.pool.ThreadPool(processes=POOL_SIZE,
initializer=start_process)
pool.map(worker, inputs)
....
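A runnable sketch of the same idea (PROCESS, POOL_SIZE, and the trivial worker are placeholder values filled in for illustration):

import multiprocessing
import multiprocessing.pool

PROCESS = False  # flip to True for process-based workers
POOL_SIZE = 4

def worker(x):
    return x * x  # stand-in for real work

if __name__ == '__main__':
    if PROCESS:
        pool = multiprocessing.Pool(processes=POOL_SIZE)
    else:
        pool = multiprocessing.pool.ThreadPool(processes=POOL_SIZE)
    print(pool.map(worker, range(10)))
    pool.close()
    pool.join()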
For something very simple and lightweight (slightly modified from here):
from queue import Queue  # 'Queue' in Python 2
from threading import Thread

class Worker(Thread):
    """Thread executing tasks from a given tasks queue"""
    def __init__(self, tasks):
        Thread.__init__(self)
        self.tasks = tasks
        self.daemon = True
        self.start()

    def run(self):
        while True:
            func, args, kargs = self.tasks.get()
            try:
                func(*args, **kargs)
            except Exception as e:  # 'except Exception, e' is Python 2 syntax
                print(e)
            finally:
                self.tasks.task_done()
class ThreadPool:
"""Pool of threads consuming tasks from a queue"""
def __init__(self, num_threads):
self.tasks = Queue(num_threads)
for _ in range(num_threads):
Worker(self.tasks)
def add_task(self, func, *args, **kargs):
"""Add a task to the queue"""
self.tasks.put((func, args, kargs))
def wait_completion(self):
"""Wait for completion of all the tasks in the queue"""
self.tasks.join()
if __name__ == '__main__':
from random import randrange
from time import sleep
delays = [randrange(1, 10) for i in range(100)]
def wait_delay(d):
        print('sleeping for (%d)sec' % d)
sleep(d)
pool = ThreadPool(20)
for i, d in enumerate(delays):
pool.add_task(wait_delay, d)
pool.wait_completion()
To support callbacks on task completion you can just add the callback to the task tuple.
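A minimal sketch of that extension: carry the callback in the tuple and invoke it from Worker.run with the task's result (callback here is a hypothetical extra parameter, not part of the code above).

def add_task(self, func, callback, *args, **kargs):
    """Add a task and a completion callback to the queue"""
    self.tasks.put((func, callback, args, kargs))

# and in Worker.run, unpack and call it:
#   func, callback, args, kargs = self.tasks.get()
#   callback(func(*args, **kargs))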
To use a thread pool in Python you can import the Pool class from multiprocessing.dummy:
from multiprocessing.dummy import Pool as ThreadPool
and then use it like this:
pool = ThreadPool(threads)
results = pool.map(service, tasks)
pool.close()
pool.join()
return results
Here threads is the number of threads you want and tasks is the list of tasks to map onto the service function.
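Put together as a runnable sketch (service and tasks are the placeholder names from above):

from multiprocessing.dummy import Pool as ThreadPool

def service(task):
    return task * 2  # stand-in for the real per-task work

tasks = list(range(10))

pool = ThreadPool(4)
results = pool.map(service, tasks)
pool.close()
pool.join()
print(results)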
Yes, there is a threading pool similar to the multiprocessing Pool; however, it is somewhat hidden and not properly documented. You can import it the following way:
from multiprocessing.pool import ThreadPool
Here is a simple example:
# fragment from a test class; assumes BytesIO (from io) and a
# self.read_csv helper are available in the surrounding test suite
def test_multithread_stringio_read_csv(self):
    # see gh-11786
max_row_range = 10000
num_files = 100
bytes_to_df = [
'\n'.join(
['%d,%d,%d' % (i, i, i) for i in range(max_row_range)]
).encode() for j in range(num_files)]
files = [BytesIO(b) for b in bytes_to_df]
# read all files in many threads
pool = ThreadPool(8)
results = pool.map(self.read_csv, files)
first_result = results[0]
for result in results:
tm.assert_frame_equal(first_result, result)
Here's the result I finally ended up using. It's a modified version of the classes by dgorissen above.
File: threadpool.py
from queue import Queue, Empty
import threading
from threading import Thread
class Worker(Thread):
    """Thread executing tasks from a given tasks queue.
    The thread can be signaled to exit.
    """
    _TIMEOUT = 2
def __init__(self, tasks, th_num):
Thread.__init__(self)
self.tasks = tasks
self.daemon, self.th_num = True, th_num
self.done = threading.Event()
self.start()
def run(self):
while not self.done.is_set():
try:
func, args, kwargs = self.tasks.get(block=True,
timeout=self._TIMEOUT)
try:
func(*args, **kwargs)
except Exception as e:
print(e)
finally:
self.tasks.task_done()
            except Empty:
                pass
return
def signal_exit(self):
""" Signal to thread to exit """
self.done.set()
class ThreadPool:
"""Pool of threads consuming tasks from a queue"""
    def __init__(self, num_threads, tasks=None):
        # avoid a mutable default argument
        self.tasks = Queue(num_threads)
        self.workers = []
        self.done = False
        self._init_workers(num_threads)
        for task in (tasks or []):
            self.tasks.put(task)
def _init_workers(self, num_threads):
for i in range(num_threads):
self.workers.append(Worker(self.tasks, i))
def add_task(self, func, *args, **kwargs):
"""Add a task to the queue"""
self.tasks.put((func, args, kwargs))
def _close_all_threads(self):
""" Signal all threads to exit and lose the references to them """
        for worker in self.workers:
            worker.signal_exit()
self.workers = []
def wait_completion(self):
"""Wait for completion of all the tasks in the queue"""
self.tasks.join()
def __del__(self):
self._close_all_threads()
def create_task(func, *args, **kwargs):
return (func, args, kwargs)
To use the pool:
from random import randrange
from time import sleep
delays = [randrange(1, 10) for i in range(30)]
def wait_delay(d):
print('sleeping for (%d)sec' % d)
sleep(d)
pool = ThreadPool(20)
for i, d in enumerate(delays):
pool.add_task(wait_delay, d)
pool.wait_completion()
Another option is to submit the work to a thread pool queue via concurrent.futures (note that submit takes the callable first, then its arguments):
import concurrent.futures

with concurrent.futures.ThreadPoolExecutor(max_workers=cpus) as executor:
    for i in range(10):
        a = executor.submit(my_function, arg1, arg2)
The overhead of creating new processes is minimal, especially when it's just 4 of them. I doubt this is a performance hot spot of your application. Keep it simple, optimize where you have to and where profiling results point to.
There is no built-in thread-based pool. However, it can be very quick to implement a producer/consumer queue with the Queue class.
From:
https://docs.python.org/2/library/queue.html
from threading import Thread
from Queue import Queue
def worker():
while True:
item = q.get()
do_work(item)
q.task_done()
q = Queue()
for i in range(num_worker_threads):
t = Thread(target=worker)
t.daemon = True
t.start()
for item in source():
q.put(item)
q.join() # block until all tasks are done
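The snippet leaves do_work, source, and num_worker_threads undefined; a self-contained version with assumed stand-ins might look like:

from threading import Thread
from queue import Queue  # 'Queue' module in Python 2

num_worker_threads = 4

def do_work(item):
    print('processing', item)

def source():
    return range(20)

q = Queue()

def worker():
    while True:
        item = q.get()
        do_work(item)
        q.task_done()

for i in range(num_worker_threads):
    t = Thread(target=worker)
    t.daemon = True
    t.start()

for item in source():
    q.put(item)

q.join()  # block until all tasks are done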
If you don't mind executing others' code, here's mine:
Note: There is a lot of extra code you may want to remove [added for better clarification and to demonstrate how it works].
Note: Python naming conventions were used for method names and variable names instead of camelCase.
Working procedure:
The MultiThread class initiates the given number of thread instances, which share a lock, a work queue, an exit flag, and a results list.
Each SingleThread is started by MultiThread once it creates all the instances.
We can add work using MultiThread (it takes care of the locking).
SingleThreads process the work queue, using a lock in the middle.
Once your work is done, you can destroy all threads with the shared boolean value.
Here, work can be anything. It can automatically import (uncomment the import line) and process a module using the given arguments.
Results are appended to results and can be retrieved with get_results.
Code:
import threading
import queue
class SingleThread(threading.Thread):
def __init__(self, name, work_queue, lock, exit_flag, results):
threading.Thread.__init__(self)
self.name = name
self.work_queue = work_queue
self.lock = lock
self.exit_flag = exit_flag
self.results = results
def run(self):
# print("Coming %s with parameters %s", self.name, self.exit_flag)
while not self.exit_flag:
# print(self.exit_flag)
self.lock.acquire()
if not self.work_queue.empty():
work = self.work_queue.get()
module, operation, args, kwargs = work.module, work.operation, work.args, work.kwargs
self.lock.release()
print("Processing : " + operation + " with parameters " + str(args) + " and " + str(kwargs) + " by " + self.name + "\n")
# module = __import__(module_name)
result = str(getattr(module, operation)(*args, **kwargs))
print("Result : " + result + " for operation " + operation + " and input " + str(args) + " " + str(kwargs))
self.results.append(result)
else:
self.lock.release()
# process_work_queue(self.work_queue)
class MultiThread:
def __init__(self, no_of_threads):
self.exit_flag = bool_instance()
self.queue_lock = threading.Lock()
self.threads = []
self.work_queue = queue.Queue()
self.results = []
for index in range(0, no_of_threads):
thread = SingleThread("Thread" + str(index+1), self.work_queue, self.queue_lock, self.exit_flag, self.results)
thread.start()
self.threads.append(thread)
def add_work(self, work):
self.queue_lock.acquire()
        self.work_queue.put(work)  # use the public put(), not the internal _put()
self.queue_lock.release()
def destroy(self):
self.exit_flag.value = True
for thread in self.threads:
thread.join()
def get_results(self):
return self.results
class Work:
    def __init__(self, module, operation, args, kwargs=None):
        self.module = module
        self.operation = operation
        self.args = args
        self.kwargs = kwargs if kwargs is not None else {}
class SimpleOperations:
def sum(self, *args):
return sum([int(arg) for arg in args])
    @staticmethod
def mul(a, b, c=0):
return int(a) * int(b) + int(c)
class bool_instance:
def __init__(self, value=False):
self.value = value
def __setattr__(self, key, value):
if key != "value":
raise AttributeError("Only value can be set!")
if not isinstance(value, bool):
raise AttributeError("Only True/False can be set!")
self.__dict__[key] = value
# super.__setattr__(key, bool(value))
def __bool__(self):
return self.value
if __name__ == "__main__":
multi_thread = MultiThread(5)
multi_thread.add_work(Work(SimpleOperations(), "mul", [2, 3], {"c":4}))
while True:
data_input = input()
if data_input == "":
pass
elif data_input == "break":
break
else:
work = data_input.split()
multi_thread.add_work(Work(SimpleOperations(), work[0], work[1:], {}))
multi_thread.destroy()
print(multi_thread.get_results())
I am exploring Klein and Deferred. In the following example I am trying to increment a number using a child process and return it via a Future. I am able to receive the Future callback.
The problem is that deferred object never calls the cb() function and the request made to endpoint never returns. Please help me identify the problem.
Following is my server.py code
from klein import Klein
from twisted.internet.defer import inlineCallbacks, returnValue
import Process4
if __name__ == '__main__':
app = Klein()
    @app.route('/visit')
    @inlineCallbacks
    def get_num_visit(req):
try:
resp = yield Process4.get_visitor_num()
req.setResponseCode(200)
returnValue('Visited = {}'.format(resp))
except Exception as e:
req.setResponseCode(500)
returnValue('error {}'.format(e))
print('starting server')
app.run('0.0.0.0', 5005)
Following is Process4.py code
from multiprocessing import Process
from concurrent.futures import Future
from time import sleep
from twisted.internet.defer import Deferred
def foo(x):
result = x+1
sleep(3)
return result
class MyProcess(Process):
def __init__(self, target, args):
super().__init__()
self.target = target
self.args = args
self.f = Future()
self.visit = 0
def run(self):
r = foo(self.visit)
self.f.set_result(result=r)
def cb(result):
print('visitor number {}'.format(result))
return result
def eb(err):
print('error occurred {}'.format(err))
return err
def future_to_deferred(future):
d = Deferred()
def callback(f):
e = f.exception()
if e:
d.errback(e)
else:
d.callback(f.result())
future.add_done_callback(callback)
return d
def get_visitor_num():
p1 = MyProcess(target=foo, args=None)
d = future_to_deferred(p1.f)
p1.start()
d.addCallback(cb)
d.addErrback(eb)
sleep(1)
return d
Edit 1
Adding the callbacks before starting the process p1 solves the problem of cb() not being called, but the HTTP request made to the endpoint still does not return.
It turns out that setting the future result with self.f.set_result(result=r) in the run() method triggers the callback() method in the child process, where no thread is waiting for the result!
So to get the callback() function triggered in the MainProcess, I had to fetch the result from the child process via a multiprocessing Queue using a worker thread in the MainProcess, and then set the future result.
@notorious.no Thanks for the reply. One thing I noticed is that reactor.callFromThread switches the result from the worker thread to the MainThread in my modified code, whereas d.callback(f.result()) also works but returns the result from the worker thread.
Following is the modified working code
server.py
from klein import Klein
from twisted.internet.defer import inlineCallbacks, returnValue
import Process4
if __name__ == '__main__':
app = Klein()
visit_count = 0
    @app.route('/visit')
    @inlineCallbacks
def get_num_visit(req):
global visit_count
try:
resp = yield Process4.get_visitor_num(visit_count)
req.setResponseCode(200)
visit_count = resp
returnValue('Visited = {}'.format(resp))
except Exception as e:
req.setResponseCode(500)
returnValue('error {}'.format(e))
print('starting server')
app.run('0.0.0.0', 5005)
Process4.py
from multiprocessing import Process, Queue
from concurrent.futures import Future
from time import sleep
from twisted.internet.defer import Deferred
import threading
from twisted.internet import reactor
def foo(x, q):
result = x+1
sleep(3)
print('setting result, {}'.format(result))
q.put(result)
class MyProcess(Process):
def __init__(self, target, args):
super().__init__()
self.target = target
self.args = args
self.visit = 0
def run(self):
self.target(*self.args)
def future_to_deferred(future):
d = Deferred()
def callback(f):
e = f.exception()
print('inside callback {}'.format(threading.current_thread().name))
if e:
print('calling errback')
d.errback(e)
# reactor.callFromThread(d.errback, e)
else:
print('calling callback with result {}'.format(f.result()))
# d.callback(f.result())
reactor.callFromThread(d.callback, f.result())
future.add_done_callback(callback)
return d
def wait(q,f):
r = q.get(block=True)
f.set_result(r)
def get_visitor_num(x):
def cb(result):
print('inside cb visitor number {} {}'.format(result, threading.current_thread().name))
return result
def eb(err):
print('inside eb error occurred {}'.format(err))
return err
f = Future()
q = Queue()
p1 = MyProcess(target=foo, args=(x,q,))
wait_thread = threading.Thread(target=wait, args=(q,f,))
wait_thread.start()
defr = future_to_deferred(f)
defr.addCallback(cb)
defr.addErrback(eb)
p1.start()
print('returning deferred')
return defr
We are a couple of guys coding day-trading bots, but that's beside the point. The usual way of calling Process on a function gets nested and really obscure in big, otherwise well-structured classes.
The problem
Let's say a Process may never terminate and writes output to a queue, which we access in main(). Would this implementation be pythonic and follow PEP 20? Is it a good boilerplate?
from multiprocessing import Process
from multiprocessing import Queue
from time import sleep
class Worker(Process):
def __init__(self, q, *args, **kwargs):
super().__init__(*args, **kwargs)
self.q = q
def run(self):
# doing work put work in queue
self.q.put("hello foo")
class RestartingProcesses(Process):
def __init__(self, ps=None, *args, **kwargs):
super().__init__(*args, **kwargs)
self.ps_classes = {} if ps is None else ps
self.processes = []
def setup(self):
"""
Initiate all processes that is in the dict all inserts into the list
"""
for p, args in self.ps_classes.items():
p = p(args)
p.start()
self.processes += [p]
def run(self):
"""
Uses the process.__class__ to instantiate and get appropriate args
of process in the dict
"""
self.setup()
# should be while True:
        for i in range(8):
            # iterate over a copy: the loop body removes from self.processes
            for p in list(self.processes):
if not p.is_alive():
print("worker dead")
p.join()
self.processes.remove(p)
p = p.__class__(self.ps_classes[p.__class__])
p.start()
self.processes += [p]
sleep(1)
def main():
q = Queue()
processes_to_start_with_args = {Worker: q}
r = RestartingProcesses(processes_to_start_with_args)
r.start()
r.join()
r.terminate()
while not q.empty():
print(q.get())
if __name__ == "__main__":
main()
What are your experiences and have you solved it in another way?
I want to use Python's multiprocessing module in a class, which itself uses subprocesses so as not to block the main call.
The minimal example looks like this:
import multiprocessing as mp
class mpo():
def __init__(self):
cpu = mp.cpu_count()
self.Pool = mp.Pool(processes = 2)
self.alive = True
self.p = mp.Process(target = self.sub,args=())
    def worker():
        print('Alive')
    def sub(self):
        print(self.alive)
        for i in range(2):
            print(i)
            self.Pool.apply_async(self.worker, args=())
        print('done')
        self.Pool.close()
    #    self.Pool.join()
I commented the last line out, as it raises an AssertionError (can only join a child process).
When I do:
m =mpo()
m.p.start()
The output is
True
0
1
done
My main question is: why is the print statement in the worker never reached?
Update:
The updated code looks like this.
import multiprocessing as mp
class mpo():
def __init__(self):
cpu = mp.cpu_count()
self.alive = True
self.p = mp.Process(target = self.sub,args=())
self.result=[]
    def worker(self):
        self.result.append(1)
        print('Alive')
    def sub(self):
        print(self.alive)
        Pool = mp.Pool(processes=2)
        for i in range(2):
            print(i)
            Pool.apply_async(self.worker, args=())
        print('done')
        Pool.close()
        Pool.join()
The pool now doesn't have to be inherited, as it is created in the subprocess. Instead of the print statement, the result is appended to the calling object, and the pool is properly joined. Nevertheless, no result shows up.
I think this may be a simple example of what you are looking for:
import multiprocessing as mp
def worker(arg):
    # print('Alive' + str(arg))
    return "Alive and finished {0}".format(arg)
class mpo():
def __init__(self):
cpu = mp.cpu_count()
self.alive = True
self.pool = mp.Pool(processes = 2)
def sub(self,arguments):
self.results=self.pool.map_async(worker, arguments)
return self.results
if __name__ == "__main__":
    s = mpo()
    s.sub(range(10))
    print(s.results.get())
Additionally you can call
self.results.ready()
to find out whether the processes have finished their work. You do not have to put this inside of another process because the map_async call does not block the rest of your program.
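Continuing the example above, a non-blocking polling loop could look like this (a sketch; results is the AsyncResult that map_async returns):

import time

results = s.sub(range(10))
while not results.ready():
    time.sleep(0.1)  # do other useful work here instead of sleeping
print(results.get())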
EDIT:
Concerning your comment, I do not really see the value of putting the calculation in a separate process, because the function is already running in separate processes (in the pool). You only add complexity by nesting it in another subprocess, but it is possible:
import multiprocessing as mp
def worker(arg):
    # print('Alive' + str(arg))
    return "Alive and finished {0}".format(arg)
class mpo():
def __init__(self):
cpu = mp.cpu_count()
self.alive = True
self.pool = mp.Pool(processes = 2)
def sub(self,arguments):
self.results=self.pool.map_async(worker, arguments)
return self.results
def run_calculation(q):
s=mpo()
results=s.sub(range(10))
q.put(results.get())
if __name__ == "__main__":  # guard so spawn-based platforms can import this module safely
    queue = mp.Queue()
    proc = mp.Process(target=run_calculation, args=(queue,))
    proc.start()
    proc.join()
    print(queue.get())
I want to repeat a function at timed intervals. The issue I have is that the function runs another function in a separate thread and therefore doesn't seem to be working with my code.
From the example below, I want to repeat function1 every 60 seconds:
from multiprocessing import Process
from threading import Event
def function2(type):
print("Function2")
def function1():
print("Function1")
if __name__ == '__main__':
p = Process(target=function2, args=('type',))
p.daemon = True
p.start()
p.join()
function1()
To repeat the function I attempted to use the following code:
from threading import Thread

class TimedThread(Thread):
def __init__(self, event, wait_time, tasks):
Thread.__init__(self)
self.stopped = event
self.wait_time = wait_time
self.tasks = tasks
def run(self):
while not self.stopped.wait(0.5):
self.tasks()
stopFlag = Event()
thread = TimedThread(stopFlag, 60, function1)
thread.start()
Both snippets combined print "Function1" in a timed loop but also produce the following error:
AttributeError: Can't get attribute 'function2' on <module '__main__' (built-in)>
Any help would be greatly appreciated.
You can wrap your function1, like:
import time

def main():
    while True:
        time.sleep(60)
        function1()
or you can have it run in a separate thread:
import threading
import time

def main():
    while True:
        time.sleep(60)
        t = threading.Thread(target=function1)
        t.start()
It actually works for me, printing Function1 and Function2 over and over. Are these two snippets in the same file?
If you import function1 from a different module, then the if __name__ == '__main__' check will fail.
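One common fix (a sketch under that assumption; funcs.py is a hypothetical module name) is to move function2 into an importable module, so the spawned child process can find it instead of looking it up on __main__:

# funcs.py
def function2(type):
    print("Function2")

# main.py
from multiprocessing import Process
from funcs import function2

def function1():
    print("Function1")

if __name__ == '__main__':
    p = Process(target=function2, args=('type',))
    p.daemon = True
    p.start()
    p.join()
    function1()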
I managed to find an alternative, working solution. Instead of using processes, I achieved the desired results using threads. The differences between the two are well explained here.
from threading import Event, Thread
class TimedThread(Thread):
def __init__(self, event, wait_time):
Thread.__init__(self)
self.stopped = event
self.wait_time = wait_time
def run(self):
while not self.stopped.wait(self.wait_time):
self.function1()
def function2(self):
print("Function2 started from thread")
# Do something
def function1(self):
print("Function1 started from thread")
# Do something
temp_thread = Thread(target=self.function2)
temp_thread.start()
stopFlag = Event()
thread = TimedThread(stopFlag, 60)
thread.start()