asynchronous post request in python

I have a Python script with a line that makes a POST request, as shown below:
rsp = requests.post(img_url, data=img_json_data, headers=img_headers)
print rsp # just for debugging
But suppose I don't want my script to wait for the response; instead, I want the lines above to run asynchronously, in parallel with the rest of the code. What would be the easiest way to do so?
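A minimal sketch of the simplest route (not from the question itself): fire the request from a background thread so the main script does not block. The wrapper name post_in_background is made up; img_url, img_json_data and img_headers are the variables from the question.
import threading
import requests

def post_in_background():
    # runs on the worker thread; the main script does not wait on it
    rsp = requests.post(img_url, data=img_json_data, headers=img_headers)
    print rsp  # just for debugging

worker = threading.Thread(target=post_in_background)
worker.start()
# ... the rest of the script continues immediately ...
worker.join()  # optional: wait for the request at a point where you actually need it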

This is a class that allows easy parallel execution on multiple workers.
Basically, it creates worker threads that wait for jobs in a Queue.
Once you put a task, they execute it and put the results in another Queue.
join() will wait until everything is done; then we empty the results queue and return the results as a dict.
from Queue import Queue
import logging
from threading import Thread

logger = logging.getLogger(__name__)


class Parallel(object):
    def __init__(self, thread_num=10):
        # create queues
        self.tasks_queue = Queue()
        self.results_queue = Queue()
        # create a threading pool
        self.pool = []
        for i in range(thread_num):
            worker = Worker(i, self.tasks_queue, self.results_queue)
            self.pool.append(worker)
            worker.start()
        logger.debug('Created %s workers', thread_num)

    def add_task(self, task_id, func, *args, **kwargs):
        """
        Add a task to the queue; it will be started as soon as a worker is free.
        :param task_id: identifier used as the key in the results dict
        :param func: function to execute
        :param args: args to transmit
        :param kwargs: kwargs to transmit
        """
        logger.debug('Adding one task to queue (%s)', func.__name__)
        # add task to queue
        self.tasks_queue.put_nowait((task_id, func, args, kwargs))

    def get_results(self):
        logger.debug('Waiting for tasks to end')
        self.tasks_queue.join()
        logger.debug('Tasks finished, fetching results')
        results = []
        while not self.results_queue.empty():
            results.append(self.results_queue.get())
        logger.debug('Results fetched, returning data')
        return dict(results)


class Worker(Thread):
    def __init__(self, thread_id, tasks, results):
        super(Worker, self).__init__()
        self.id = thread_id
        self.tasks = tasks
        self.results = results
        self.daemon = True

    def run(self):
        logger.debug('Worker %s launched', self.id)
        while True:
            task_id, func, args, kwargs = self.tasks.get()
            logger.debug('Worker %s starts to work on %s', self.id, func.__name__)
            try:
                self.results.put_nowait((task_id, func(*args, **kwargs)))
            except Exception as err:
                logger.debug('Thread(%s): error with task %s\n%s', self.id, repr(func.__name__), err)
            finally:
                logger.debug('Worker %s finished work on %s', self.id, func.__name__)
                self.tasks.task_done()


import requests

# create parallel instance with 4 workers
parallel = Parallel(4)
# launch jobs
for i in range(20):
    parallel.add_task(i, requests.post, img_url, data=img_json_data, headers=img_headers)
# wait for all jobs to return data
print parallel.get_results()

You can use Celery for this. With Celery the processing will be asynchronous, and you can check the status as well as the result. For further info, see the Celery documentation; a minimal sketch follows below.
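A sketch of what that could look like, assuming a Redis broker running locally (the task name post_image and the module layout are made up, not from the answer):
# tasks.py (hypothetical module); start a worker with:  celery -A tasks worker
from celery import Celery
import requests

app = Celery('tasks',
             broker='redis://localhost:6379/0',    # assumes a local Redis broker
             backend='redis://localhost:6379/0')

@app.task
def post_image(img_url, img_json_data, img_headers):
    # the blocking HTTP call happens in the Celery worker, not in your script
    return requests.post(img_url, data=img_json_data, headers=img_headers).status_code

# in the main script: .delay() returns an AsyncResult immediately
result = post_image.delay(img_url, img_json_data, img_headers)
print(result.status)  # check the task state later
print(result.get())   # or block for the result only when you actually need it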

You need to queue this task for asynchronous processing.
There are multiple options here:
celery, which has a larger learning curve for a newbie; check the Celery docs.
python-rq, which is relatively lightweight and a go-to library; check the RQ docs (a minimal sketch follows below).
Underneath, you can use any message broker such as Redis, RabbitMQ, etc.
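A minimal sketch of the python-rq route, assuming a local Redis server and a separate `rq worker` process are running (the variables reuse the names from the question):
from redis import Redis
from rq import Queue
import requests

q = Queue(connection=Redis())  # default queue on a local Redis server

# enqueue returns immediately; the rq worker process runs the call
job = q.enqueue(requests.post, img_url, data=img_json_data, headers=img_headers)

print(job.get_status())  # 'queued', 'started', 'finished' or 'failed'
print(job.result)        # the Response once the job has finished, None before that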

Related

Python Multiprocessing: Adding to Queue Within Child Process

I want to implement a file crawler that stores data in MongoDB. I would like to use multiprocessing as a way to 'hand off' blocking tasks such as unzipping files, crawling files, and uploading to Mongo. Certain tasks are reliant on other tasks (e.g., a file needs to be unzipped before the files inside can be crawled), so I would like the ability to complete the necessary task and add new ones to the same task queue.
Below is what I currently have:
import multiprocessing


class Worker(multiprocessing.Process):
    def __init__(self, task_queue: multiprocessing.Queue):
        super(Worker, self).__init__()
        self.task_queue = task_queue

    def run(self):
        for (function, *args) in iter(self.task_queue.get, None):
            print(f'Running: {function.__name__}({*args,})')
            # Run the provided function with its parameters in child process
            function(*args)
            self.task_queue.task_done()


def foo(task_queue: multiprocessing.Queue) -> None:
    print('foo')
    # Add new task to queue from this child process
    task_queue.put((bar, 1))


def bar(x: int) -> None:
    print(f'bar: {x}')


def main():
    # Start workers on separate processes
    workers = []
    manager = multiprocessing.Manager()
    task_queue = manager.Queue()
    for i in range(multiprocessing.cpu_count()):
        worker = Worker(task_queue)
        workers.append(worker)
        worker.start()

    # Run foo on child process using the queue as parameter
    task_queue.put((foo, task_queue))

    for _ in workers:
        task_queue.put(None)

    # Block until workers complete and join main process
    for worker in workers:
        worker.join()

    print('Program completed.')


if __name__ == '__main__':
    main()
Expected Behaviour:
Running: foo((<AutoProxy[Queue] object, typeid 'Queue' at 0x1b963548908>,))
foo
Running: bar((1,))
bar: 1
Program completed.
Actual Behaviour:
Running: foo((<AutoProxy[Queue] object, typeid 'Queue' at 0x1b963548908>,))
foo
Program completed.
I am quite new to multiprocessing so any help would be greatly appreciated.
As @FrankYellin noted, this is due to the fact that None is being put into task_queue before bar can be added.
Assuming that the queue will either be non-empty or waiting for a task to complete
during the program (which is true in my case), the join method on the queue can be used. According to the docs:
Blocks until all items in the queue have been gotten and processed. The count of unfinished tasks goes up whenever an item is added to the queue. The count goes down whenever a consumer thread calls task_done() to indicate that the item was retrieved and all work on it is complete. When the count of unfinished tasks drops to zero, join() unblocks.
Below is the updated code:
import multiprocessing


class Worker(multiprocessing.Process):
    def __init__(self, task_queue: multiprocessing.Queue):
        super(Worker, self).__init__()
        self.task_queue = task_queue

    def run(self):
        for (function, *args) in iter(self.task_queue.get, None):
            print(f'Running: {function.__name__}({*args,})')
            # Run the provided function with its parameters in child process
            function(*args)
            self.task_queue.task_done()  # <-- Notify queue that task is complete


def foo(task_queue: multiprocessing.Queue) -> None:
    print('foo')
    # Add new task to queue from this child process
    task_queue.put((bar, 1))


def bar(x: int) -> None:
    print(f'bar: {x}')


def main():
    # Start workers on separate processes
    workers = []
    manager = multiprocessing.Manager()
    task_queue = manager.Queue()
    for i in range(multiprocessing.cpu_count()):
        worker = Worker(task_queue)
        workers.append(worker)
        worker.start()

    # Run foo on child process using the queue as parameter
    task_queue.put((foo, task_queue))

    # Block until all items in queue are popped and completed
    task_queue.join()  # <---

    for _ in workers:
        task_queue.put(None)

    # Block until workers complete and join main process
    for worker in workers:
        worker.join()

    print('Program completed.')


if __name__ == '__main__':
    main()
This seems to work fine. I will update this if I discover anything new. Thank you all.

Python ThreadPoolExecutor wait for all futures to complete

I am trying to write a module which needs to crawl some URLs concurrently/in parallel. Since this is an expensive network-I/O operation rather than a CPU-heavy one, I am using ThreadPoolExecutor.
Now, in my code, multiple functions add tasks to the shared thread pool.
My issue is that the main thread moves on before all the future objects are done being processed in the callback functions.
I am a beginner with futures and ThreadPoolExecutor. Any help would be appreciated.
import settings
from concurrent.futures import ThreadPoolExecutor
import concurrent.futures


class Test(Base):
    WORKER_THREADS = settings.WORKER_THREADS

    def __init__(self, urls):
        super(Test, self).__init__()
        self.urls = urls
        self.worker_pool = ThreadPoolExecutor(max_workers=Test.WORKER_THREADS)

    def add_to_worker_queue(self, task, callback, **kwargs):
        self.logger.info("Adding task %s to worker pool.", task.func_name)
        self.worker_pool.submit(task, **kwargs).add_done_callback(callback)
        return

    def load_url(self, url):
        response = self.make_requests(urls=url)  # make_requests is in Base class (it just makes an HTTP request)
        # response is a generator, so to get the data out of it we need to iterate through it.
        for res in response:
            return res

    def handle_response(self, response):
        # do some stuff with response and add it again to the worker queue for further parallel processing
        self.add_to_worker_queue(some_task, callback_func, data=response)
        return

    def start(self):
        for url in self.urls:
            self.add_to_worker_queue(self.load_url, self.handle_response, url=[url])
        return

    def stop(self):
        self.worker_pool.shutdown(wait=True)
        return


if __name__ == "__main__":
    start_urls = ['http://stackoverflow.com/',
                  'https://docs.python.org/3.3/library/concurrent.futures.html']
    test = Test(urls=start_urls)
    test.start()
    test.stop()
PS: I tried using the executor with a "with" statement, following this example: https://docs.python.org/3.3/library/concurrent.futures.html#threadpoolexecutor-example
But since I submit tasks to the pool one by one, and the example above waits for the future objects to be completed, that defeats my purpose.
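One possible approach, sketched here as an assumption rather than an answer from the thread: have a single helper own every submission, track the outstanding futures, and keep waiting until callbacks stop adding new ones before shutting the pool down. The names submit_tracked and wait_for_all are made up for illustration.
import threading
import concurrent.futures
from concurrent.futures import ThreadPoolExecutor

pool = ThreadPoolExecutor(max_workers=4)
pending = set()              # all futures that have not been waited on yet
pending_lock = threading.Lock()

def submit_tracked(task, callback, **kwargs):
    future = pool.submit(task, **kwargs)
    with pending_lock:
        pending.add(future)
    future.add_done_callback(callback)
    return future

def wait_for_all():
    # Callbacks may submit new futures while we wait, so loop until the set
    # stops growing; only then is it safe to shut the pool down.
    while True:
        with pending_lock:
            batch = set(pending)
        if not batch:
            break
        concurrent.futures.wait(batch)
        with pending_lock:
            pending.difference_update(batch)
    pool.shutdown(wait=True)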

EC2 Spot Instance Termination & Python 2.7

I know that the termination notice is made available via the metadata URL and that I can do something similar to
if requests.get("http://169.254.169.254/latest/meta-data/spot/termination-time").status_code == 200
in order to determine if the notice has been posted. I run a Python service on my Spot Instances that:
Loops over long-polling SQS queues.
If it gets a message, it pauses polling and works on the payload.
Working on the payload can take 5-50 minutes.
Working on the payload involves spawning a thread pool of up to 50 threads to handle parallel uploading of files to S3; this is the majority of the time spent working on the payload.
Finally, it removes the message from the queue; rinse, repeat.
The work is idempotent, so if the same payload runs multiple times, I'm out the processing time/costs, but it will not negatively impact the application workflow.
I'm searching for an elegant way to also poll for the termination notice every five seconds in the background. As soon as the termination notice appears, I'd like to immediately release the message back to the SQS queue so another instance can pick it up as quickly as possible.
As a bonus, I'd like to shut down the work, kill off the thread pool, and have the service enter a stasis state. If I terminate the service, supervisord will simply start it back up again.
Even bigger bonus! Is there a Python module available that simplifies this and just works?
I wrote this code to demonstrate how a thread can be used to poll for the Spot instance termination notice. It first starts a polling thread, which is responsible for checking the HTTP endpoint.
Then we create a pool of fake workers (mimicking real work to be done) and start running the pool. Eventually the polling thread kicks in (about 15 seconds into execution, as implemented) and kills the whole thing.
To prevent the script from continuing to work after Supervisor restarts it, we would simply put a check at the beginning of __main__, and if the termination notice is there, sleep for 2.5 minutes, which is longer than the notice lasts before the instance is shut down.
#!/usr/bin/env python
import threading
import Queue
import random
import time
import sys
import os


class Instance_Termination_Poll(threading.Thread):
    """
    Sleep for 5 seconds and eventually pretend that we then receive the
    termination event
    if requests.get("http://169.254.169.254/latest/meta-data/spot/termination-time").status_code == 200
    """
    def run(self):
        print("Polling for termination")
        while True:
            for i in range(30):
                time.sleep(5)
                if i == 2:
                    print("Received Termination Poll!")
                    print("Pretend we returned the message to the queue.")
                    print("Now kill the entire program.")
                    os._exit(1)
            print("Well now, this is embarrassing!")


class ThreadPool:
    """
    Pool of threads consuming tasks from a queue
    """
    def __init__(self, num_threads):
        self.num_threads = num_threads
        self.errors = Queue.Queue()
        self.tasks = Queue.Queue(self.num_threads)
        for _ in range(num_threads):
            Worker(self.tasks, self.errors)

    def add_task(self, func, *args, **kargs):
        """
        Add a task to the queue
        """
        self.tasks.put((func, args, kargs))

    def wait_completion(self):
        """
        Wait for completion of all the tasks in the queue
        """
        try:
            while True:
                if self.tasks.empty() == False:
                    time.sleep(10)
                else:
                    break
        except KeyboardInterrupt:
            print "Ctrl-c received! Kill it all with Prejudice..."
            os._exit(1)
        self.tasks.join()


class Worker(threading.Thread):
    """
    Thread executing tasks from a given tasks queue
    """
    def __init__(self, tasks, error_queue):
        threading.Thread.__init__(self)
        self.tasks = tasks
        self.daemon = True
        self.errors = error_queue
        self.start()

    def run(self):
        while True:
            func, args, kargs = self.tasks.get()
            try:
                func(*args, **kargs)
            except Exception, e:
                print("Exception " + str(e))
                error = {'exception': e}
                self.errors.put(error)
            self.tasks.task_done()


def do_work(n):
    """
    Sleeps a random amount of time, then creates a little CPU usage to
    mimic some work taking place.
    """
    for z in range(100):
        time.sleep(random.randint(3, 10))
        print "Thread ID: {} working.".format(threading.current_thread())
        for x in range(30000):
            x*n
        print "Thread ID: {} done, sleeping.".format(threading.current_thread())


if __name__ == '__main__':
    num_threads = 30

    # Start up the termination polling thread
    term_poll = Instance_Termination_Poll()
    term_poll.start()

    # Create our threadpool
    pool = ThreadPool(num_threads)
    for y in range(num_threads*2):
        pool.add_task(do_work, n=y)

    # Wait for the threadpool to complete
    pool.wait_completion()

Self-joining thread pool: where's my race condition?

Since I use a similar pattern in my work a lot, I decided to write a class that abstracts very simple worker concurrency via job queue / threading. I know there are already things out there that solve this, but I also wanted to use this as an opportunity to hone my multithreading skills.
The main challenge I've given myself is that I want this to be able to let processes finish, even if they are not explicitly blocked by Queue.join(). "A process finishing" is defined by the input function returning a value (or None). The way I have attempted to accomplish this is by having each job create its own results queue rq, which is then checked by _wait_for_results in a non-daemon thread; this blocks the automatic exit of all the other daemonized threads until rq is filled by the worker in add_to_queue.
Here is the full class:
from Queue import Queue  # on Python 3: from queue import Queue
from threading import Thread


class EasyPool(object):
    def __init__(self, concurrency, always_finish=True):
        def add_to_queue(q):
            while True:
                func_data, rq = q.get()
                func, args, kwargs = func_data
                if not args:
                    args = []
                if not kwargs:
                    kwargs = {}
                result = func(*args, **kwargs)
                rq.put(result)
                q.task_done()
        self.rqs = []
        self.always_finish = always_finish
        self.q = Queue(maxsize=0)
        self.workers = []
        for i in range(concurrency):
            worker = Thread(target=add_to_queue, args=(self.q,))
            self.workers.append(worker)
            worker.setDaemon(True)
            worker.start()

    def _wait_for_results(self, rq):
        rq.not_empty.acquire()
        rq.not_empty.wait()
        rq.not_empty.notify()
        rq.not_empty.release()

    def add_job(self, func, *args, **kwargs):
        rq = Queue()
        if self.always_finish:
            blocker = Thread(target=self._wait_for_results, args=(rq,))
            blocker.setDaemon(False)
            blocker.start()
        to_add = []
        [ to_add.append(i) if i else to_add.append(None) for i in [func, args, kwargs] ]
        self.q.put((to_add, rq))
        return rq.get
When a job is created via the .add_job instance method, it immediately returns a promise-like object, which is a reference to the .get method of the results queue. The problem I'm facing is that there seems to be a race condition between this .get and the _wait_for_results method. I think the answer probably involves a Lock or a Condition, but I'm not really sure. Any help is much appreciated :)
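One way to sidestep the race entirely, offered as a hedged sketch rather than a fix from the thread: give each job its own threading.Event that the worker sets only after the result has been queued, and let the non-daemon blocker wait on that Event instead of on the Queue's internal not_empty condition. Event.wait() still returns if set() happened first, so no wake-up can be missed.
from Queue import Queue  # queue on Python 3
from threading import Thread, Event

class EasyPool(object):
    def __init__(self, concurrency, always_finish=True):
        def worker(q):
            while True:
                (func, args, kwargs), rq, done = q.get()
                rq.put(func(*(args or ()), **(kwargs or {})))
                done.set()        # signalled only after the result is queued
                q.task_done()
        self.always_finish = always_finish
        self.q = Queue()
        for _ in range(concurrency):
            t = Thread(target=worker, args=(self.q,))
            t.setDaemon(True)
            t.start()

    def add_job(self, func, *args, **kwargs):
        rq, done = Queue(), Event()
        if self.always_finish:
            # A non-daemon thread waiting on the Event keeps the process alive
            # until this job finishes, without touching Queue internals.
            blocker = Thread(target=done.wait)
            blocker.setDaemon(False)
            blocker.start()
        self.q.put(((func, args, kwargs), rq, done))
        return rq.get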

end daemon processes with multiprocessing module

I include an example usage of multiprocessing below. This is a process pool model. It is not as simple as it might be, but is relatively close in structure to the code I'm actually using. It also uses sqlalchemy, sorry.
My question is: I currently have a situation where a relatively long-running Python script executes a number of functions that each look like the code below, so the parent process is the same in all cases. In other words, multiple pools are created by one Python script. (I don't have to do it this way, I suppose, but the alternative is to use something like os.system and subprocess.) The problem is that these processes hang around and hold on to memory. The docs say these daemon processes are supposed to stick around until the parent process exits, but what about if the parent process then goes on to generate another pool of processes and doesn't exit immediately?
Calling terminate() works, but this doesn't seem terribly polite. Is there a good way to ask the processes to terminate nicely? I.e. clean up after yourself and go away now, I need to start up the next pool?
I also tried calling join() on the processes. According to the documentation this means wait for the processes to terminate. What if they don't plan to terminate? What actually happens is that the process hangs.
Thanks in advance.
Regards, Faheem.
import multiprocessing, time


class Worker(multiprocessing.Process):
    """Process executing tasks from a given tasks queue"""
    def __init__(self, queue, num):
        multiprocessing.Process.__init__(self)
        self.num = num
        self.queue = queue
        self.daemon = True

    def run(self):
        import traceback
        while True:
            func, args, kargs = self.queue.get()
            try:
                print "trying %s with args %s"%(func.__name__, args)
                func(*args, **kargs)
            except:
                traceback.print_exc()
            self.queue.task_done()


class ProcessPool:
    """Pool of processes consuming tasks from a queue"""
    def __init__(self, num_threads):
        self.queue = multiprocessing.JoinableQueue()
        self.workerlist = []
        self.num = num_threads
        for i in range(num_threads):
            self.workerlist.append(Worker(self.queue, i))

    def add_task(self, func, *args, **kargs):
        """Add a task to the queue"""
        self.queue.put((func, args, kargs))

    def start(self):
        for w in self.workerlist:
            w.start()

    def wait_completion(self):
        """Wait for completion of all the tasks in the queue"""
        self.queue.join()
        for worker in self.workerlist:
            print worker.__dict__
            #worker.terminate()  # <--- terminate used here
            worker.join()        # <--- join used here


start = time.time()

from sqlalchemy import *
from sqlalchemy.orm import *

dbuser = ''
password = ''
dbname = ''
dbstring = "postgres://%s:%s@localhost:5432/%s"%(dbuser, password, dbname)
db = create_engine(dbstring, echo=True)
m = MetaData(db)

def make_foo(i):
    t1 = Table('foo%s'%i, m, Column('a', Integer, primary_key=True))

conn = db.connect()
for i in range(10):
    conn.execute("DROP TABLE IF EXISTS foo%s"%i)
conn.close()

for i in range(10):
    make_foo(i)

m.create_all()

def do(i, dbstring):
    dbstring = "postgres://%s:%s@localhost:5432/%s"%(dbuser, password, dbname)
    db = create_engine(dbstring, echo=True)
    Session = scoped_session(sessionmaker())
    Session.configure(bind=db)
    Session.execute("ALTER TABLE foo%s SET ( autovacuum_enabled = false );"%i)
    Session.execute("ALTER TABLE foo%s SET ( autovacuum_enabled = true );"%i)
    Session.commit()

pool = ProcessPool(5)
for i in range(10):
    pool.add_task(do, i, dbstring)
pool.start()
pool.wait_completion()
My way of dealing with this was:
import multiprocessing
for prc in multiprocessing.active_children():
    prc.terminate()
I like this more because I don't have to pollute the worker function with an if clause.
You know multiprocessing already has classes for worker pools, right?
The standard way is to send your workers a quit signal:
queue.put(("QUIT", None, None))
Then check for it:
if func == "QUIT":
return
