Is there a Pool class for worker threads, similar to the multiprocessing module's Pool class?
For example, I like the easy way you can parallelize a map function:
def long_running_func(p):
c_func_no_gil(p)
p = multiprocessing.Pool(4)
xs = p.map(long_running_func, range(100))
However, I would like to do it without the overhead of creating new processes.
I know about the GIL. However, in my use case, the function will be an IO-bound C function for which the Python wrapper will release the GIL before the actual function call.
Do I have to write my own threading pool?
I just found out that there actually is a thread-based Pool interface in the multiprocessing module; however, it is somewhat hidden and not properly documented.
It can be imported via
from multiprocessing.pool import ThreadPool
It is implemented using a dummy Process class wrapping a Python thread. This thread-based Process class can be found in multiprocessing.dummy, which is mentioned briefly in the docs. This dummy module supposedly provides the whole multiprocessing interface based on threads.
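For instance, here is a minimal sketch of the question's example rewritten to use the thread-backed Pool (long_running_func is just a stand-in for the IO-bound wrapper from the question):

from multiprocessing.dummy import Pool  # thread-backed, same API as multiprocessing.Pool

def long_running_func(p):
    # stand-in for the GIL-releasing, IO-bound C call from the question
    return p * p

pool = Pool(4)  # 4 worker threads instead of 4 processes
xs = pool.map(long_running_func, range(100))
pool.close()
pool.join()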
In Python 3 you can use concurrent.futures.ThreadPoolExecutor, i.e.:
from concurrent.futures import ThreadPoolExecutor

executor = ThreadPoolExecutor(max_workers=10)
a = executor.submit(my_function)
See the docs for more info and examples.
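As a minimal, self-contained sketch of how this maps onto the question's use case (my_function here is a placeholder for the real IO-bound call):

from concurrent.futures import ThreadPoolExecutor

def my_function(p):
    return p * p  # placeholder for the real work

with ThreadPoolExecutor(max_workers=10) as executor:
    # executor.map mirrors Pool.map and yields results in input order
    xs = list(executor.map(my_function, range(100)))
    # submit() gives you a Future for a single call instead
    future = executor.submit(my_function, 42)
    print(future.result())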
Yes, and it seems to have (more or less) the same API.
import multiprocessing
import multiprocessing.pool  # needed so multiprocessing.pool.ThreadPool resolves below
def worker(lnk):
....
def start_process():
.....
....
if PROCESS:
pool = multiprocessing.Pool(processes=POOL_SIZE, initializer=start_process)
else:
pool = multiprocessing.pool.ThreadPool(processes=POOL_SIZE,
initializer=start_process)
pool.map(worker, inputs)
....
For something very simple and lightweight (slightly modified from here):
from queue import Queue  # Python 3; on Python 2 this was "from Queue import Queue"
from threading import Thread
class Worker(Thread):
"""Thread executing tasks from a given tasks queue"""
def __init__(self, tasks):
Thread.__init__(self)
self.tasks = tasks
self.daemon = True
self.start()
def run(self):
while True:
func, args, kargs = self.tasks.get()
try:
func(*args, **kargs)
            except Exception as e:
                print(e)
finally:
self.tasks.task_done()
class ThreadPool:
"""Pool of threads consuming tasks from a queue"""
def __init__(self, num_threads):
self.tasks = Queue(num_threads)
for _ in range(num_threads):
Worker(self.tasks)
def add_task(self, func, *args, **kargs):
"""Add a task to the queue"""
self.tasks.put((func, args, kargs))
def wait_completion(self):
"""Wait for completion of all the tasks in the queue"""
self.tasks.join()
if __name__ == '__main__':
from random import randrange
from time import sleep
delays = [randrange(1, 10) for i in range(100)]
def wait_delay(d):
        print('sleeping for (%d)sec' % d)
sleep(d)
pool = ThreadPool(20)
for i, d in enumerate(delays):
pool.add_task(wait_delay, d)
pool.wait_completion()
To support callbacks on task completion you can just add the callback to the task tuple.
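One possible shape of that change, as a sketch (the callback signature here is an assumption; adjust to taste):

# in ThreadPool
def add_task(self, func, callback, *args, **kargs):
    """Add a task and its completion callback to the queue"""
    self.tasks.put((func, callback, args, kargs))

# in Worker
def run(self):
    while True:
        func, callback, args, kargs = self.tasks.get()
        try:
            result = func(*args, **kargs)
            callback(result)  # invoke the callback with the task's result
        except Exception as e:
            print(e)
        finally:
            self.tasks.task_done()

Call sites then pass the callback explicitly, e.g. pool.add_task(wait_delay, on_done, d), where on_done is whatever completion handler you want.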
To use a thread pool in Python, you can use this import:
from multiprocessing.dummy import Pool as ThreadPool
and then use it like this:
pool = ThreadPool(threads)
results = pool.map(service, tasks)
pool.close()
pool.join()
return results
Here threads is the number of threads that you want, and tasks is the list of tasks that get mapped to service.
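Put together as a self-contained sketch (service and tasks here are toy stand-ins for whatever you actually run):

from multiprocessing.dummy import Pool as ThreadPool

def service(task):
    return task.upper()  # toy stand-in for the real work

tasks = ["a", "b", "c"]

pool = ThreadPool(4)
results = pool.map(service, tasks)
pool.close()    # no more tasks will be submitted
pool.join()     # wait for the workers to finish
print(results)  # ['A', 'B', 'C']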
Yes, there is a threading pool similar to the multiprocessing Pool; however, it is somewhat hidden and not properly documented. You can import it in the following way:
from multiprocessing.pool import ThreadPool
Here is a simple example:
def test_multithread_stringio_read_csv(self):
    # see gh-11786; self.read_csv and tm come from the pandas test suite,
    # and BytesIO is from io
max_row_range = 10000
num_files = 100
bytes_to_df = [
'\n'.join(
['%d,%d,%d' % (i, i, i) for i in range(max_row_range)]
).encode() for j in range(num_files)]
files = [BytesIO(b) for b in bytes_to_df]
# read all files in many threads
pool = ThreadPool(8)
results = pool.map(self.read_csv, files)
first_result = results[0]
for result in results:
tm.assert_frame_equal(first_result, result)
Here's the result I finally ended up using. It's a modified version of the classes by dgorissen above.
File: threadpool.py
from queue import Queue, Empty
import threading
from threading import Thread
class Worker(Thread):
    """Thread executing tasks from a given tasks queue. The thread can be
    signalled to exit."""
    _TIMEOUT = 2
def __init__(self, tasks, th_num):
Thread.__init__(self)
self.tasks = tasks
self.daemon, self.th_num = True, th_num
self.done = threading.Event()
self.start()
def run(self):
while not self.done.is_set():
try:
func, args, kwargs = self.tasks.get(block=True,
timeout=self._TIMEOUT)
try:
func(*args, **kwargs)
except Exception as e:
print(e)
finally:
self.tasks.task_done()
except Empty as e:
pass
return
def signal_exit(self):
""" Signal to thread to exit """
self.done.set()
class ThreadPool:
"""Pool of threads consuming tasks from a queue"""
    def __init__(self, num_threads, tasks=None):
        self.tasks = Queue(num_threads)
        self.workers = []
        self.done = False
        self._init_workers(num_threads)
        for task in (tasks or []):  # avoid a mutable default argument
            self.tasks.put(task)
def _init_workers(self, num_threads):
for i in range(num_threads):
self.workers.append(Worker(self.tasks, i))
def add_task(self, func, *args, **kwargs):
"""Add a task to the queue"""
self.tasks.put((func, args, kwargs))
def _close_all_threads(self):
""" Signal all threads to exit and lose the references to them """
        for worker in self.workers:
            worker.signal_exit()
self.workers = []
def wait_completion(self):
"""Wait for completion of all the tasks in the queue"""
self.tasks.join()
def __del__(self):
self._close_all_threads()
def create_task(func, *args, **kwargs):
return (func, args, kwargs)
To use the pool
from random import randrange
from time import sleep
delays = [randrange(1, 10) for i in range(30)]
def wait_delay(d):
print('sleeping for (%d)sec' % d)
sleep(d)
pool = ThreadPool(20)
for i, d in enumerate(delays):
pool.add_task(wait_delay, d)
pool.wait_completion()
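The create_task helper and the tasks= constructor argument above can also be used to pre-load the queue; a small sketch, assuming the classes live in threadpool.py as shown:

from threadpool import ThreadPool, create_task
from time import sleep

def wait_delay(d):
    print('sleeping for (%d)sec' % d)
    sleep(d)

# pre-load the queue with tasks built by create_task
initial_tasks = [create_task(wait_delay, d) for d in (1, 2, 3)]
pool = ThreadPool(4, tasks=initial_tasks)
pool.add_task(wait_delay, 5)  # more work can still be added later
pool.wait_completion()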
Another way is to add the work to a thread pool queue via concurrent.futures:
import concurrent.futures
with concurrent.futures.ThreadPoolExecutor(max_workers=cpus) as executor:
for i in range(10):
        a = executor.submit(fn, arg1, arg2, ....)  # submit takes the callable first, then its arguments
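A slightly fuller, hedged sketch of the same idea that also collects results as they finish (work and its argument are placeholders):

import concurrent.futures

def work(x):
    return x * x  # placeholder task

with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
    futures = [executor.submit(work, i) for i in range(10)]
    for future in concurrent.futures.as_completed(futures):
        print(future.result())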
The overhead of creating the new processes is minimal, especially when it's just 4 of them. I doubt this is a performance hot spot of your application. Keep it simple, optimize where you have to and where profiling results point to.
There is no built-in thread-based pool. However, it can be very quick to implement a producer/consumer queue with the Queue class.
From:
https://docs.python.org/2/library/queue.html
from threading import Thread
from Queue import Queue  # Python 2, matching the linked docs; on Python 3 use "from queue import Queue"
def worker():
while True:
item = q.get()
do_work(item)
q.task_done()
q = Queue()
for i in range(num_worker_threads):
t = Thread(target=worker)
t.daemon = True
t.start()
for item in source():
q.put(item)
q.join() # block until all tasks are done
If you don't mind executing someone else's code, here's mine:
Note: There is a lot of extra code you may want to remove [added for better clarification and to demonstrate how it works].
Note: Python naming conventions were used for method names and variable names instead of camelCase.
Working procedure:
The MultiThread class is initialized with the number of thread instances to create, sharing a lock, a work queue, an exit flag and a results list.
The SingleThread instances are started by MultiThread once it has created all of them.
We can add work using MultiThread (it takes care of locking).
SingleThreads process the work queue, using the lock in the middle.
Once your work is done, you can destroy all threads with the shared boolean value.
Here, work can be anything. It can automatically import (uncomment the import line) and process a module using the given arguments.
Results are appended to the results list and can be retrieved with get_results.
Code:
import threading
import queue
class SingleThread(threading.Thread):
def __init__(self, name, work_queue, lock, exit_flag, results):
threading.Thread.__init__(self)
self.name = name
self.work_queue = work_queue
self.lock = lock
self.exit_flag = exit_flag
self.results = results
def run(self):
# print("Coming %s with parameters %s", self.name, self.exit_flag)
while not self.exit_flag:
# print(self.exit_flag)
self.lock.acquire()
if not self.work_queue.empty():
work = self.work_queue.get()
module, operation, args, kwargs = work.module, work.operation, work.args, work.kwargs
self.lock.release()
print("Processing : " + operation + " with parameters " + str(args) + " and " + str(kwargs) + " by " + self.name + "\n")
# module = __import__(module_name)
result = str(getattr(module, operation)(*args, **kwargs))
print("Result : " + result + " for operation " + operation + " and input " + str(args) + " " + str(kwargs))
self.results.append(result)
else:
self.lock.release()
# process_work_queue(self.work_queue)
class MultiThread:
def __init__(self, no_of_threads):
self.exit_flag = bool_instance()
self.queue_lock = threading.Lock()
self.threads = []
self.work_queue = queue.Queue()
self.results = []
for index in range(0, no_of_threads):
thread = SingleThread("Thread" + str(index+1), self.work_queue, self.queue_lock, self.exit_flag, self.results)
thread.start()
self.threads.append(thread)
def add_work(self, work):
self.queue_lock.acquire()
        self.work_queue.put(work)  # use the public put(); Queue does its own locking
self.queue_lock.release()
def destroy(self):
self.exit_flag.value = True
for thread in self.threads:
thread.join()
def get_results(self):
return self.results
class Work:
def __init__(self, module, operation, args, kwargs={}):
self.module = module
self.operation = operation
self.args = args
self.kwargs = kwargs
class SimpleOperations:
def sum(self, *args):
return sum([int(arg) for arg in args])
    @staticmethod
def mul(a, b, c=0):
return int(a) * int(b) + int(c)
class bool_instance:
def __init__(self, value=False):
self.value = value
def __setattr__(self, key, value):
if key != "value":
raise AttributeError("Only value can be set!")
if not isinstance(value, bool):
raise AttributeError("Only True/False can be set!")
self.__dict__[key] = value
# super.__setattr__(key, bool(value))
def __bool__(self):
return self.value
if __name__ == "__main__":
multi_thread = MultiThread(5)
multi_thread.add_work(Work(SimpleOperations(), "mul", [2, 3], {"c":4}))
while True:
data_input = input()
if data_input == "":
pass
elif data_input == "break":
break
else:
work = data_input.split()
multi_thread.add_work(Work(SimpleOperations(), work[0], work[1:], {}))
multi_thread.destroy()
print(multi_thread.get_results())
I have a requirement to create child processes, receive results using a Future, and then kill some of them when required.
For this I have subclassed the multiprocessing.Process class and return a Future object from the start() method.
The problem is that I am not able to receive the result in the cb() function, as it never gets called.
Can this be done in some other way, or is there something I am missing in my current implementation?
Following is my current approach:
from multiprocessing import Process, Queue
from concurrent.futures import _base
import threading
from time import sleep
def foo(x,q):
print('result {}'.format(x*x))
result = x*x
sleep(5)
q.put(result)
class MyProcess(Process):
def __init__(self, target, args):
super().__init__()
self.target = target
self.args = args
self.f = _base.Future()
def run(self):
q = Queue()
worker_thread = threading.Thread(target=self.target, args=(self.args+ (q,)))
worker_thread.start()
r = q.get(block=True)
print('setting result {}'.format(r))
self.f.set_result(result=r)
print('done setting result')
def start(self):
f = _base.Future()
run_thread = threading.Thread(target=self.run)
run_thread.start()
return f
def cb(future):
print('received result in callback {}'.format(future))
def main():
p1 = MyProcess(target=foo, args=(2,))
f = p1.start()
f.add_done_callback(fn=cb)
sleep(10)
if __name__ == '__main__':
main()
print('Main thread dying')
In your start method you create a new Future which you then return. This is a different future than the one you set the result on; that future is just not used at all. Try:
def start(self):
run_thread = threading.Thread(target=self.run)
run_thread.start()
return self.f
However, there are more problems with your code. You override the start method of the process, replacing it with execution on a worker thread, thereby actually bypassing multiprocessing. Also, you shouldn't import the _base module; that is an implementation detail, as the leading underscore indicates. You should import concurrent.futures.Future instead (it's the same class, but through the public API).
This really uses multiprocessing:
from multiprocessing import Process, Queue
from concurrent.futures import Future
import threading
from time import sleep
def foo(x,q):
print('result {}'.format(x*x))
result = x*x
sleep(5)
q.put(result)
class MyProcess(Process):
def __init__(self, target, args):
super().__init__()
self.target = target
self.args = args
self.f = Future()
def run(self):
q = Queue()
worker_thread = threading.Thread(target=self.target, args=(self.args+ (q,)))
worker_thread.start()
r = q.get(block=True)
print('setting result {}'.format(r))
self.f.set_result(result=r)
print('done setting result')
def cb(future):
print('received result in callback {}: {}'.format(future, future.result()))
def main():
p1 = MyProcess(target=foo, args=(2,))
p1.f.add_done_callback(fn=cb)
p1.start()
p1.join()
sleep(10)
if __name__ == '__main__':
main()
print('Main thread dying')
And since you're already in a new process now, spawning a worker thread to execute your target function shouldn't really be necessary; you could just execute your target function directly instead. Also, should the target function raise an exception, you wouldn't know about it: your callback would only be called on success. If you fix that as well, then you're left with:
from multiprocessing import Process
from concurrent.futures import Future
import threading
from time import sleep
def foo(x):
print('result {}'.format(x*x))
result = x*x
sleep(5)
return result
class MyProcess(Process):
def __init__(self, target, args):
super().__init__()
self.target = target
self.args = args
self.f = Future()
def run(self):
try:
r = self.target(*self.args)
print('setting result {}'.format(r))
self.f.set_result(result=r)
print('done setting result')
except Exception as ex:
self.f.set_exception(ex)
def cb(future):
print('received result in callback {}: {}'.format(future, future.result()))
def main():
p1 = MyProcess(target=foo, args=(2,))
p1.f.add_done_callback(fn=cb)
p1.start()
p1.join()
sleep(10)
if __name__ == '__main__':
main()
print('Main thread dying')
This is basically what a ProcessPoolExecutor does.
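For comparison, here is a minimal sketch of the same round-trip done with ProcessPoolExecutor, which handles the Future plumbing for you (foo as in the examples above):

from concurrent.futures import ProcessPoolExecutor
from time import sleep

def foo(x):
    result = x * x
    sleep(5)
    return result

def cb(future):
    print('received result in callback {}: {}'.format(future, future.result()))

def main():
    with ProcessPoolExecutor(max_workers=1) as executor:
        f = executor.submit(foo, 2)
        f.add_done_callback(cb)  # runs in the parent once the child finishes

if __name__ == '__main__':
    main()
    print('Main thread dying')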
We are a couple of guys coding day-trading bots, but that's beside the point. The usual way of calling Process on functions gets nested and really obscure in already big, well-structured classes.
The problem
Let's say a Process may never terminate and writes output to a queue which we access in main(). Would this implementation be Pythonic and follow PEP 20? Is it good boilerplate?
from multiprocessing import Process
from multiprocessing import Queue
from time import sleep
class Worker(Process):
def __init__(self, q, *args, **kwargs):
super().__init__(*args, **kwargs)
self.q = q
def run(self):
# doing work put work in queue
self.q.put("hello foo")
class RestartingProcesses(Process):
def __init__(self, ps=None, *args, **kwargs):
super().__init__(*args, **kwargs)
self.ps_classes = {} if ps is None else ps
self.processes = []
def setup(self):
"""
        Start all processes that are in the dict and append them to the list
"""
for p, args in self.ps_classes.items():
p = p(args)
p.start()
self.processes += [p]
def run(self):
"""
        Uses process.__class__ to re-instantiate a dead process with the
        appropriate args from the dict
"""
self.setup()
# should be while True:
for i in range(8):
for p in self.processes:
if not p.is_alive():
print("worker dead")
p.join()
self.processes.remove(p)
p = p.__class__(self.ps_classes[p.__class__])
p.start()
self.processes += [p]
sleep(1)
def main():
q = Queue()
processes_to_start_with_args = {Worker: q}
r = RestartingProcesses(processes_to_start_with_args)
r.start()
r.join()
r.terminate()
while not q.empty():
print(q.get())
if __name__ == "__main__":
main()
What are your experiences and have you solved it in another way?
I want to have a function, in Python (3.x), which forces the script itself to terminate, like:
i_time_value = 10
mytimeout(i_time_value ) # Terminate the script if not in i_time_value seconds
for i in range(10):
print("go")
time.sleep(2)
Where "mytimeout" is the function I need : it terminate the script in "arg" seconds if the script is not terminated.
I have seen good solutions for put a timeout to a function here or here, but I don't want a timeout for a function but for the script.
Also :
I know that I can put my script in a function or using something like subprocess and use it with a
timeout, I tried it and it works, but I want something more simple.
It must be Unix & Windows compatible.
The function must be universal i.e. : it may be add to any script in
one line (except import)
I need a function not a 'how to put a timeout in a script'.
signal is not Windows compatible.
You can send some signals on Windows e.g.:
os.kill(os.getpid(), signal.CTRL_C_EVENT) # send Ctrl+C to itself
You could use threading.Timer to call a function at a later time:
from threading import Timer
def kill_yourself(delay):
t = Timer(delay, kill_yourself_now)
t.daemon = True # no need to kill yourself if we're already dead
t.start()
where kill_yourself_now() is:
import os
import signal
import sys
def kill_yourself_now():
sig = signal.CTRL_C_EVENT if sys.platform == 'win32' else signal.SIGINT
os.kill(os.getpid(), sig) # raise KeyboardInterrupt in the main thread
If your script starts other processes then see: how to kill child process(es) when parent dies? See also How to terminate a python subprocess launched with shell=True -- it demonstrates how to kill a process tree.
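Putting it together with the snippet from the question, a sketch (kill_yourself and kill_yourself_now as defined above; the timeout itself is still a single added line):

import time

i_time_value = 10
kill_yourself(i_time_value)  # terminate the script if it is still running in 10 seconds

for i in range(10):
    print("go")
    time.sleep(2)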
I would use something like this.
import sys
import time
import threading
def set_timeout(event):
event.set()
event = threading.Event()
i_time_value = 2
t = threading.Timer(i_time_value, set_timeout, [event])
t.start()
for i in range(10):
print("go")
if event.is_set():
print('Timed Out!')
sys.exit()
time.sleep(2)
A little bit of googling turned this answer up:
import multiprocessing as MP
from sys import exc_info
from time import clock  # Python 2; time.clock was removed in 3.8, use time.monotonic there
DEFAULT_TIMEOUT = 60
################################################################################
def timeout(limit=None):
if limit is None:
limit = DEFAULT_TIMEOUT
if limit <= 0:
raise ValueError()
def wrapper(function):
return _Timeout(function, limit)
return wrapper
class TimeoutError(Exception): pass
################################################################################
def _target(queue, function, *args, **kwargs):
try:
queue.put((True, function(*args, **kwargs)))
except:
queue.put((False, exc_info()[1]))
class _Timeout:
def __init__(self, function, limit):
self.__limit = limit
self.__function = function
self.__timeout = clock()
self.__process = MP.Process()
self.__queue = MP.Queue()
def __call__(self, *args, **kwargs):
self.cancel()
self.__queue = MP.Queue(1)
args = (self.__queue, self.__function) + args
self.__process = MP.Process(target=_target, args=args, kwargs=kwargs)
self.__process.daemon = True
self.__process.start()
self.__timeout = self.__limit + clock()
def cancel(self):
if self.__process.is_alive():
self.__process.terminate()
    @property
def ready(self):
if self.__queue.full():
return True
elif not self.__queue.empty():
return True
elif self.__timeout < clock():
self.cancel()
else:
return False
    @property
def value(self):
if self.ready is True:
flag, load = self.__queue.get()
if flag:
return load
raise load
raise TimeoutError()
def __get_limit(self):
return self.__limit
def __set_limit(self, value):
if value <= 0:
raise ValueError()
self.__limit = value
limit = property(__get_limit, __set_limit)
It might be Python 2.x, but it shouldn't be terribly hard to convert.
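A hedged usage sketch of the decorator above (it assumes a fork start method so the worker process can reuse the function, and on Python 3 you would also swap time.clock for time.monotonic):

from time import sleep

def slow_add(a, b):
    sleep(2)
    return a + b

if __name__ == '__main__':
    # wrap manually so the module-level name still points at the plain function,
    # which keeps it picklable for the worker process
    timed_add = timeout(limit=5)(slow_add)
    timed_add(1, 2)      # starts the worker process and returns immediately
    sleep(3)             # do other work, then check back
    if timed_add.ready:  # True once a result (or exception) has been queued
        print(timed_add.value)
    else:
        print('no result yet (or it timed out)')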
I have a little problem understanding Python multiprocessing. I wrote an application which analyzes downloaded web pages. I would like to fetch the raw HTML in a separate process with a specific timeout. I know I can set a timeout in urllib2, but it seems not to work correctly in some cases when using a SOCKS5 proxy.
So I wrote a little class:
class SubprocessManager(Logger):
def __init__(self, function):
self.request_queue = Queue()
self.return_queue = Queue()
self.worker = function
self.args = ()
self.kwargs = {'request_queue': self.request_queue,
'return_queue': self.return_queue}
self._run()
def _run(self):
self.subprocess = Process(target=self.worker, args=self.args, kwargs=self.kwargs)
self.subprocess.start()
def put_in_queue(self, data):
self.request_queue.put(data)
def get_from_queue(self):
result = None
try:
result = self.request_queue.get(timeout=10)
except Empty:
self.reset_process()
return result
def reset_process(self):
if self.subprocess.is_alive():
self.subprocess.terminate()
self._run()
Worker function:
def subprocess_fetch_www(*args, **kwargs):
request_queue = kwargs['request_queue']
return_queue = kwargs['return_queue']
while True:
request_data = request_queue.get()
if request_data:
return_data = fetch_request(*request_data)
return_queue.put(return_data)
And function that is called for each url from input list:
def fetch_html(url, max_retry=cfg.URLLIB_MAX_RETRY, to_xml=False, com_headers=False):
subprocess = Logger.SUBPROCESS
args = (url, max_retry, com_headers)
subprocess.put_in_queue(args)
result = subprocess.get_from_queue()
if result and to_xml:
return html2lxml(result)
return result
I need help fixing my code. I want my subprocess to run all the time, waiting for a job in request_queue. I want to recreate the subprocess only in case of a timeout. The worker should suspend execution once request_data is processed and return_data is put in the return queue.
How can I achieve that?
EDIT:
Well, it seems that the above code works as intended, if get_from_queue requests the result data from return_queue instead of request_queue... >_>'
Ok, I think I have a better understanding of what you want to do.
Have a look at this code. It's not OO but illustrates the idea.
from multiprocessing import Process, Queue, Pipe
from time import sleep
import random
proc = None
inq = None
outq = None
def createWorker():
global inq, outq, proc
inq = Queue()
outq = Queue()
proc = Process(target=worker, args=(inq,outq))
proc.start()
def worker(inq, outq):
print "Worker started"
while True:
url = inq.get()
secs = random.randint(1,5)
print "processing", url, " sleeping for", secs
sleep(secs)
outq.put(url + " done")
def callWithTimeout(arg):
global proc, inq, outq
inq.put(arg)
result = None
while result is None:
try:
result = outq.get(timeout=4)
except:
print "restarting worker process"
proc.terminate()
createWorker()
inq.put(arg)
return result
def main():
global proc, inq, outq
createWorker()
for arg in ["foo", "bar", "baz", "quux"]:
res = callWithTimeout(arg)
print "res =", res
proc.terminate()
main()
It uses two queues - one for sending messages to the worker process and one for receiving the results. You could also use pipes. Also, new queues are created when the worker process is restarted - this is to avoid a possible race condition.
Edit: Just saw your edit - looks like the same idea.
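For completeness, a hedged sketch of the Pipe variant mentioned above: same structure, but a single duplex pipe instead of two queues.

from multiprocessing import Process, Pipe

def worker(conn):
    while True:
        url = conn.recv()         # block until the parent sends a job
        conn.send(url + " done")  # reply over the same pipe

def call_with_timeout(proc, conn, arg, timeout=4):
    conn.send(arg)
    if conn.poll(timeout):        # wait up to `timeout` seconds for a reply
        return conn.recv()
    proc.terminate()              # no reply in time; caller should recreate the worker
    return None

if __name__ == "__main__":
    parent_conn, child_conn = Pipe()
    proc = Process(target=worker, args=(child_conn,))
    proc.start()
    print(call_with_timeout(proc, parent_conn, "foo"))
    proc.terminate()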