I am trying to write a module that needs to crawl some URLs concurrently/in parallel. Since this is expensive network-IO work rather than CPU-heavy work, I am using ThreadPoolExecutor.
In my code, multiple functions add tasks to the shared thread pool.
My issue is that the main thread finishes before all the future objects are done processing in the callback functions.
I am a beginner with futures and ThreadPoolExecutor. Any help would be appreciated.
import settings
from concurrent.futures import ThreadPoolExecutor
import concurrent.futures


class Test(Base):

    WORKER_THREADS = settings.WORKER_THREADS

    def __init__(self, urls):
        super(Test, self).__init__()
        self.urls = urls
        self.worker_pool = ThreadPoolExecutor(max_workers=Test.WORKER_THREADS)

    def add_to_worker_queue(self, task, callback, **kwargs):
        self.logger.info("Adding task %s to worker pool.", task.func_name)
        self.worker_pool.submit(task, **kwargs).add_done_callback(callback)
        return

    def load_url(self, url):
        response = self.make_requests(urls=url)  # make_requests is in the Base class (it just makes an HTTP request)
        # response is a generator, so to get the data out of it we need to iterate through it
        for res in response:
            return res

    def handle_response(self, response):
        # do some stuff with the response and add it to the worker queue again for further parallel processing
        self.add_to_worker_queue(some_task, callback_func, data=response)
        return

    def start(self):
        for url in self.urls:
            self.add_to_worker_queue(self.load_url, self.handle_response, url=[url])
        return

    def stop(self):
        self.worker_pool.shutdown(wait=True)
        return


if __name__ == "__main__":
    start_urls = [
        'http://stackoverflow.com/',
        'https://docs.python.org/3.3/library/concurrent.futures.html',
    ]
    test = Test(urls=start_urls)
    test.start()
    test.stop()
PS: I tried using the executor with a "with" statement, following this example: https://docs.python.org/3.3/library/concurrent.futures.html#threadpoolexecutor-example
But I submit tasks to the pool one at a time (and callbacks submit more), while the example above collects all the futures up front and waits for them to complete, which defeats my purpose.
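For reference, one way around this is sketched below (not the asker's code: it replaces the done-callbacks with a work loop driven by the main thread, and assumes a modified handle_response that returns follow-up (task, kwargs) pairs instead of submitting them itself):

import concurrent.futures

def crawl_all(pool, urls, load_url, handle_response):
    # wait for whichever future finishes first, let its handler describe
    # follow-up work, and repeat until nothing is in flight
    in_flight = {pool.submit(load_url, url=[u]) for u in urls}
    while in_flight:
        done, in_flight = concurrent.futures.wait(
            in_flight, return_when=concurrent.futures.FIRST_COMPLETED)
        for fut in done:
            # hypothetical contract: handle_response returns an iterable of
            # (task, kwargs) pairs for further processing, or None
            for task, kwargs in handle_response(fut.result()) or []:
                in_flight.add(pool.submit(task, **kwargs))

Because the main thread only leaves the loop once every submitted future has completed, it cannot exit early, and pool.shutdown(wait=True) afterwards is safe.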
Related
I have a script to traverse an AWS S3 bucket to do some aggregation at the file level.
from threading import Semaphore, Thread


class Spider:
    def __init__(self):
        self.sem = Semaphore(120)
        self.threads = list()

    def crawl(self, root_url):
        self.recursive_harvest_subroutine(root_url)
        for thread in self.threads:
            thread.join()

    def recursive_harvest_subroutine(self, url):
        children = get_direct_subdirs(url)
        self.sem.acquire()
        try:
            if len(children) == 0:
                queue_url_to_do_something_later(url)  # Done
            else:
                for child_url in children:
                    thread = Thread(target=self.recursive_harvest_subroutine, args=(child_url,))
                    self.threads.append(thread)
                    thread.start()
        finally:
            self.sem.release()
This used to run okay, until I encountered a bucket with several TB of data and hundreds of thousands of sub-directories. The number of Thread objects in self.threads grows very fast, and soon the server reports:
RuntimeError: can't start new thread
There is some extra processing I have to do in the script, so I can't just list every file in the bucket.
Currently I'm requiring a depth of at least 2 before the script goes parallel, but that's just a workaround. Any suggestion is appreciated.
The way the original piece of code worked was BFS, which created a lot of threads waiting in the queue. I changed it to DFS and everything works fine now. Pseudo code in case someone needs it in the future:
def __init__(self):
    self.sem = Semaphore(120)
    self.urls = list()
    self.mutex = Lock()

def crawl(self, root_url):
    self.recursive_harvest_subroutine(root_url)
    while not is_done():
        self.sem.acquire()
        url = self.urls.pop(0)
        thread = Thread(target=self.recursive_harvest_subroutine, args=(url,))
        thread.start()
        self.sem.release()

def recursive_harvest_subroutine(self, url):
    children = get_direct_subdirs(url)
    if len(children) == 0:
        queue_url_to_do_something_later(url)  # Done
    else:
        self.mutex.acquire()
        for child_url in children:
            self.urls.insert(0, child_url)
        self.mutex.release()
No join(), so I implemented my own is_done() check.
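The is_done() check isn't shown; one possible version (a hypothetical sketch, not the author's code) keeps a counter of URLs that have been queued but not yet fully processed:

from threading import Lock

in_flight = 0          # URLs queued or currently being processed
count_lock = Lock()

def url_queued(n=1):
    # call whenever URLs are inserted into self.urls (and once for the root URL)
    global in_flight
    with count_lock:
        in_flight += n

def url_finished():
    # call at the end of recursive_harvest_subroutine
    global in_flight
    with count_lock:
        in_flight -= 1

def is_done():
    with count_lock:
        return in_flight == 0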
A user visits http://example.com/url/ and invokes page_parser from views.py. page_parser creates an instance of the class Foo from script.py.
Each time http://example.com/url/ is visited I see the memory usage go up and up. I guess the garbage collector doesn't collect the instantiated Foo objects. Any ideas why that is?
Here is the code:
views.py:
from django.http import HttpResponse
from script import Foo
from script import urls


# When user visits http://example.com/url/ I run `page_parser`
def page_parser(request):
    Foo(urls)
    return HttpResponse("alldone")
script.py:
import requests
from queue import Queue
from threading import Thread


class Newthread(Thread):
    def __init__(self, queue, result):
        Thread.__init__(self)
        self.queue = queue
        self.result = result

    def run(self):
        while True:
            url = self.queue.get()
            data = requests.get(url)  # Download image at url
            self.result.append(data)
            self.queue.task_done()


class Foo:
    def __init__(self, urls):
        self.result = list()
        self.queue = Queue()
        self.startthreads()
        for url in urls:
            self.queue.put(url)
        self.queue.join()

    def startthreads(self):
        for x in range(3):
            worker = Newthread(queue=self.queue, result=self.result)
            worker.daemon = True
            worker.start()
urls = [
"https://static.pexels.com/photos/106399/pexels-photo-106399.jpeg",
"https://static.pexels.com/photos/164516/pexels-photo-164516.jpeg",
"https://static.pexels.com/photos/206172/pexels-photo-206172.jpeg",
"https://static.pexels.com/photos/32870/pexels-photo.jpg",
"https://static.pexels.com/photos/106399/pexels-photo-106399.jpeg",
"https://static.pexels.com/photos/164516/pexels-photo-164516.jpeg",
"https://static.pexels.com/photos/206172/pexels-photo-206172.jpeg",
"https://static.pexels.com/photos/32870/pexels-photo.jpg",
"https://static.pexels.com/photos/32870/pexels-photo.jpg",
"https://static.pexels.com/photos/106399/pexels-photo-106399.jpeg",
"https://static.pexels.com/photos/164516/pexels-photo-164516.jpeg",
"https://static.pexels.com/photos/206172/pexels-photo-206172.jpeg",
"https://static.pexels.com/photos/32870/pexels-photo.jpg"]
There are several moving parts involved, but what I think happens is the following:
WSGI processes are not killed after each request, so things may persist.
You create 3 new threads, but you never let them finish and join the main thread again, for example when the queue is empty.
Since the reference count of Foo.queue never reaches zero (the threads are still alive, waiting for new queue items), it cannot be garbage collected.
So you keep creating new threads and new Foo instances, and none of them can be freed.
I'm not an expert on queue.Queue, but my theory can be verified if you watch the number of threads in the WSGI process go up by 3 with each request (for example using top(1)).
As a side note, this is a side effect of your class design: you do everything in __init__, which should really only assign attributes.
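Acting on that diagnosis, a minimal sketch (an assumption-laden variant, not the original code) pushes one sentinel per worker after the URLs, so the threads exit, drop their reference to the queue, and Foo can be collected:

import requests
from queue import Queue
from threading import Thread

SENTINEL = None  # pushed once per worker to tell it to exit


def worker(queue, result):
    while True:
        url = queue.get()
        try:
            if url is SENTINEL:
                return                      # thread ends and no longer pins the queue
            result.append(requests.get(url))
        finally:
            queue.task_done()


class Foo:
    def __init__(self, urls, workers=3):
        self.result = []
        self.queue = Queue()
        threads = [Thread(target=worker, args=(self.queue, self.result)) for _ in range(workers)]
        for t in threads:
            t.start()
        for url in urls:
            self.queue.put(url)
        for _ in threads:
            self.queue.put(SENTINEL)        # one sentinel per worker
        self.queue.join()
        for t in threads:
            t.join()                        # every worker has exited; nothing keeps Foo alive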
I am playing around with concurrent.futures.
Currently my future calls time.sleep(secs).
It seems that Future.cancel() does less than I thought.
If the future is already executing, then time.sleep() does not get cancelled by it.
The same goes for the timeout parameter of wait(): it does not cancel my time.sleep().
How can I cancel a time.sleep() that is executing inside a concurrent.futures worker?
For testing I use ThreadPoolExecutor.
If you submit a function to a ThreadPoolExecutor, the executor will run the function in a thread and store its return value in the Future object. Since the number of concurrent threads is limited, you have the option to cancel the pending execution of a future, but once control in the worker thread has been passed to the callable, there's no way to stop execution.
Consider this code:
import concurrent.futures as f
import time

T = f.ThreadPoolExecutor(1)  # Run at most one function concurrently


def block5():
    time.sleep(5)
    return 1


q = T.submit(block5)
m = T.submit(block5)

print(q.cancel())  # Will fail (False), because q is already running
print(m.cancel())  # Will work (True), because q is blocking the only thread, so m is still queued
In general, whenever you want something to be cancellable, you yourself are responsible for making sure that it is.
There are some off-the-shelf options available, though. E.g. consider using asyncio, which also has an example using sleep. The concept sidesteps the issue: whenever a potentially blocking operation is about to be called, control is instead returned to an event loop running in the outermost context, together with a note that execution should be resumed whenever the result is available - or, in your case, after n seconds have passed.
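A minimal sketch of that idea (assuming Python 3.7+, not part of the original answer): asyncio.sleep() can be cancelled, unlike time.sleep():

import asyncio


async def block5():
    try:
        await asyncio.sleep(5)      # cancellable, unlike time.sleep(5)
        return 1
    except asyncio.CancelledError:
        print("sleep was cancelled")
        raise


async def main():
    task = asyncio.create_task(block5())
    await asyncio.sleep(1)          # let the task start sleeping
    task.cancel()                   # interrupts the awaited asyncio.sleep(5)
    try:
        await task
    except asyncio.CancelledError:
        print("task cancelled")


asyncio.run(main())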
I do not know much about concurrent.futures, but you can use this logic to break out of the sleep early: use a loop instead of a single time.sleep() or wait()

for i in range(sec):
    sleep(1)

and break out of the loop when you want to interrupt it (a sketch of one way to signal that follows below).
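For example, one way to make the wait interruptible (a sketch with a hypothetical cancel_event, not from the answer above) is a threading.Event whose wait() timeout stands in for sleep():

import threading

cancel_event = threading.Event()


def interruptible_sleep(secs):
    # Event.wait() returns True as soon as the event is set from another thread
    # (cancel_event.set()), or False once the timeout expires, so the "sleep"
    # can be cut short at any time
    return not cancel_event.wait(timeout=secs)

A worker submitted to the executor can call interruptible_sleep(5), and the main thread can call cancel_event.set() to wake it immediately.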
I figured it out.
Here is an example:
from concurrent.futures import ThreadPoolExecutor
import queue
import time


class Runner:
    def __init__(self):
        self.q = queue.Queue()
        self.exec = ThreadPoolExecutor(max_workers=2)

    def task(self):
        while True:
            try:
                self.q.get(block=True, timeout=1)
                break
            except queue.Empty:
                pass
            print('running')

    def run(self):
        self.exec.submit(self.task)

    def stop(self):
        self.q.put(None)
        self.exec.shutdown(wait=False, cancel_futures=True)


r = Runner()
r.run()
time.sleep(5)
r.stop()
As described in the documentation linked in the question, you can use a with statement to ensure threads are cleaned up promptly, as in the example below:
import concurrent.futures
import urllib.request

URLS = ['http://www.foxnews.com/',
        'http://www.cnn.com/',
        'http://europe.wsj.com/',
        'http://www.bbc.co.uk/',
        'http://some-made-up-domain.com/']


# Retrieve a single page and report the URL and contents
def load_url(url, timeout):
    with urllib.request.urlopen(url, timeout=timeout) as conn:
        return conn.read()


# We can use a with statement to ensure threads are cleaned up promptly
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
    # Start the load operations and mark each future with its URL
    future_to_url = {executor.submit(load_url, url, 60): url for url in URLS}
    for future in concurrent.futures.as_completed(future_to_url):
        url = future_to_url[future]
        try:
            data = future.result()
        except Exception as exc:
            print('%r generated an exception: %s' % (url, exc))
        else:
            print('%r page is %d bytes' % (url, len(data)))
I've faced this same problem recently. I had 2 tasks to run concurrently and one of them had to sleep from time to time. In the code below, suppose task2 is the one that sleeps.
from concurrent.futures import ThreadPoolExecutor
executor = ThreadPoolExecutor(max_workers=2)
executor.submit(task1)
executor.submit(task2)
executor.shutdown(wait=True)
In order to avoid the endless sleep I extracted task2 to run synchronously. I don't know whether it's good practice, but it's simple and fits my scenario perfectly.
from concurrent.futures import ThreadPoolExecutor
executor = ThreadPoolExecutor(max_workers=1)
executor.submit(task1)
task2()
executor.shutdown(wait=True)
Maybe it's useful to someone else.
In my Python Twisted application I need to receive data from the client, perform some database operations and, depending on the data, run some blocking code in a separate thread.
So far I have:
d = get_user(user_id)
d.addCallback(do_something_with_input_data, input_data)
d.addCallback(run_blocking_code)
d.addCallback(save_data_into_db)
d.addCallback(response_to_client)


@defer.inlineCallbacks
def get_user(self, user_id):
    user = yield get_user_from_db(user_id)
    defer.returnValue(user)


def do_something_with_input_data(user, input_data):
    # do smth...
    return results


@defer.inlineCallbacks
def run_blocking_code(results):
    threads.deferToThread(run_in_separate_thread, results)
    return results


@defer.inlineCallbacks
def save_data_into_db(results):
    yield save_in_db(results)
    defer.returnValue('OK')


def response_to_client(response):
    # send 'OK' to client
Is calling deferToThread() in run_blocking_code() a good approach? If so, how can I make save_data_into_db() wait until the thread ends?
I'd say the general concept is fine. I'd add in some errbacks, and you also need to tweak your run_blocking_code function:
from twisted.internet import defer, threads


@defer.inlineCallbacks
def run_blocking_code(results):
    # deferToThread returns a Deferred that fires with the thread's return value
    d = threads.deferToThread(run_in_separate_thread, results)
    # wait for it here (the yield is what makes inlineCallbacks pause the chain)
    value = yield d
    # now pass that value on to the next function in the callback chain
    defer.returnValue(value)
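The errbacks mentioned above could be hooked into the same chain; a minimal sketch (handle_error is a hypothetical helper, not from the original):

from twisted.python import log


def handle_error(failure):
    # log the failure and turn it into a value the client-facing callback can handle
    log.err(failure)
    return 'ERROR'


d = get_user(user_id)
d.addCallback(do_something_with_input_data, input_data)
d.addCallback(run_blocking_code)
d.addCallback(save_data_into_db)
d.addErrback(handle_error)   # catches a failure raised anywhere earlier in the chain
d.addCallback(response_to_client)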
I have a Python script with a line that makes a POST request, as shown below:
rsp = requests.post(img_url, data=img_json_data, headers=img_headers)
print rsp # just for debugging
But suppose I don't want my script to keep waiting for the response; instead I want to run the above lines asynchronously, in parallel with the rest of the code. What would be the easiest way to do so?
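For reference, a minimal fire-and-forget sketch using the standard library's concurrent.futures (not taken from the answers below; img_url, img_json_data and img_headers are the question's variables):

import requests
from concurrent.futures import ThreadPoolExecutor

executor = ThreadPoolExecutor(max_workers=1)

# submit() returns immediately; the request runs in a worker thread
future = executor.submit(requests.post, img_url, data=img_json_data, headers=img_headers)

# ... the rest of the script keeps running here ...

# later, if and when the response is actually needed:
rsp = future.result()
print(rsp)  # just for debugging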
This is a class that allows easy parallel execution on multiple workers.
Basically it creates worker threads that wait for jobs in a Queue.
Once you put a task in, they execute it and put the results in another Queue.
get_results() waits with join() until everything is done, then empties the results queue and returns the results as a dict keyed by task id.
from Queue import Queue
import logging
from threading import Thread

logger = logging.getLogger(__name__)


class Parallel(object):

    def __init__(self, thread_num=10):
        # create queues
        self.tasks_queue = Queue()
        self.results_queue = Queue()
        # create a threading pool
        self.pool = []
        for i in range(thread_num):
            worker = Worker(i, self.tasks_queue, self.results_queue)
            self.pool.append(worker)
            worker.start()
        logger.debug('Created %s workers', thread_num)

    def add_task(self, task_id, func, *args, **kwargs):
        """
        Add task to queue, they will be started as soon as added
        :param func: function to execute
        :param args: args to transmit
        :param kwargs: kwargs to transmit
        """
        logger.debug('Adding one task to queue (%s)', func.__name__)
        # add task to queue
        self.tasks_queue.put_nowait((task_id, func, args, kwargs))

    def get_results(self):
        logger.debug('Waiting for processes to end')
        self.tasks_queue.join()
        logger.debug('Processes terminated, fetching results')
        results = []
        while not self.results_queue.empty():
            results.append(self.results_queue.get())
        logger.debug('Results fetched, returning data')
        return dict(results)


class Worker(Thread):

    def __init__(self, thread_id, tasks, results):
        super(Worker, self).__init__()
        self.id = thread_id
        self.tasks = tasks
        self.results = results
        self.daemon = True

    def run(self):
        logger.debug('Worker %s launched', self.id)
        while True:
            task_id, func, args, kwargs = self.tasks.get()
            logger.debug('Worker %s start to work on %s', self.id, func.__name__)
            try:
                self.results.put_nowait((task_id, func(*args, **kwargs)))
            except Exception as err:
                logger.debug('Thread(%s): error with task %s\n%s', self.id, repr(func.__name__), err)
            finally:
                logger.debug('Worker %s finished work on %s', self.id, func.__name__)
                self.tasks.task_done()
import requests

# create parallel instance with 4 workers
parallel = Parallel(4)

# launch jobs
for i in range(20):
    parallel.add_task(i, requests.post, img_url, data=img_json_data, headers=img_headers)

# wait for all jobs to return data
print parallel.get_results()
You can use Celery for this. With Celery the processing will be asynchronous and you can check the status as well as the result. For further info click here.
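A minimal Celery sketch of that (hypothetical module and task names; assumes a Redis broker and result backend on localhost):

# tasks.py
from celery import Celery
import requests

app = Celery('tasks',
             broker='redis://localhost:6379/0',
             backend='redis://localhost:6379/0')


@app.task
def post_image(img_url, img_json_data, img_headers):
    rsp = requests.post(img_url, data=img_json_data, headers=img_headers)
    return rsp.status_code   # return something serializable, not the Response object


# caller: .delay() returns an AsyncResult immediately instead of blocking
# result = post_image.delay(img_url, img_json_data, img_headers)
# result.status and result.get() can be checked later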
You need to queue this task for asynchronous processing.
There are multiple options here:
celery, which has a larger learning curve for a newbie. Check here.
python-rq, which is relatively lightweight and a good go-to library (a minimal sketch follows below). Check here.
You can use any of the message brokers such as Redis, RabbitMQ etc.
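For illustration, a minimal python-rq sketch (assumes a Redis server on localhost and an `rq worker` process running; img_url, img_json_data and img_headers are the question's variables):

from redis import Redis
from rq import Queue
import requests

q = Queue(connection=Redis())

# enqueue() returns immediately; a separate `rq worker` process picks the job up
job = q.enqueue(requests.post, img_url, data=img_json_data, headers=img_headers)

# later, job.result holds the return value once the worker has finished (None until then)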