Terminating a try block in Python after n seconds

I am trying to impose a TimeoutException on a try statement after n seconds. I found that the signal library handles this and would be perfect, but I'm running into an error that I'm having a hard time getting around. (This similar SO question is answered with the signal library.)
This is the boiled down code representing the problem:
import multiprocessing
from multiprocessing.dummy import Pool

def main():
    listOfLinks = []
    threadpool = Pool(2)
    info = threadpool.starmap(processRunSeveralTimesInParallel, zip(enumerate(listOfLinks)))
    threadpool.close()

def processRunSeveralTimesInParallel(listOfLinks):
    # The following is pseudo code representing what I would like to do:
    loongSequenceOfInstructions()
    for i in range(0, 10):
        try for n seconds:
            doSomething(i)
        except (after n seconds):
            handleException()
    return something
When implementing the above question's solution with the signal library, I get the following error:
  File "file.py", line 388, in main
    info = threadpool.starmap(processRunSeveralTimesInParallel,zip(enumerate(listOfLinks)))
  File "/Users/user/anaconda3/envs/proj/lib/python3.8/multiprocessing/pool.py", line 372, in starmap
    return self._map_async(func, iterable, starmapstar, chunksize).get()
  File "/Users/user/anaconda3/envs/proj/lib/python3.8/multiprocessing/pool.py", line 771, in get
    raise self._value
  File "/Users/user/anaconda3/envs/proj/lib/python3.8/multiprocessing/pool.py", line 125, in worker
    result = (True, func(*args, **kwds))
  File "/Users/user/anaconda3/envs/proj/lib/python3.8/multiprocessing/pool.py", line 51, in starmapstar
    return list(itertools.starmap(args[0], args[1]))
  File "file.py", line 193, in processRunSeveralTimesInParallel
    signal.signal(signal.SIGALRM, signal_handler)
  File "/Users/user/anaconda3/envs/proj/lib/python3.8/signal.py", line 47, in signal
    handler = _signal.signal(_enum_to_int(signalnum), _enum_to_int(handler))
ValueError: signal only works in main thread
Any idea how to cap the time just on a try block within a method run as a thread? Thank you!
Important information:
I am using the multiprocessing library to run several processes in parallel. From the error statement above, I suspect that the signal and multiprocessing libraries conflict.
The methods in the try statement are Selenium methods (find_element_by_xpath). However, there are no timeout arguments available for them.
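The signal-based recipe the linked question refers to is presumably along these lines (a sketch; do_something is a placeholder for the slow Selenium call). signal.signal refuses to install a handler outside the main thread, which is exactly the ValueError above, and signal.alarm is Unix-only:

import signal
import time

def do_something():            # placeholder for the slow Selenium call
    time.sleep(10)

def handler(signum, frame):
    raise TimeoutError('timed out')

# This is the call that raises "ValueError: signal only works in main thread"
# when it runs inside a pool worker.
signal.signal(signal.SIGALRM, handler)
signal.alarm(5)                # deliver SIGALRM after 5 seconds
try:
    do_something()
except TimeoutError:
    print('handled the timeout')
finally:
    signal.alarm(0)            # cancel any pending alarm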

Newly Updated Answer
If you are looking for a way of timing out without using signals, here is one way. First, since you are using threading, let's make it explicit and use the concurrent.futures module, which has a lot of flexibility.
When a "job" is submitted to the pool executor, a Future instance is returned immediately without blocking; blocking only occurs when result is called on that instance. The result call accepts a timeout value, and if the result is not available within the timeout period, a TimeoutError is raised. The idea is to pass the ThreadPoolExecutor instance to the worker thread so that the worker can run the critical piece of code, which must complete within a certain time period, in a worker thread of its own. A Future instance will be created for that timed code, and this time the result call will specify a timeout value.
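Distilled to its essence, the pattern is just this (a minimal sketch; do_something stands in for whatever code you want to cap at two seconds):

from concurrent.futures import ThreadPoolExecutor, TimeoutError

def do_something():
    ...  # the code you want to cap at 2 seconds

with ThreadPoolExecutor() as executor:
    future = executor.submit(do_something)
    try:
        future.result(timeout=2)  # raises concurrent.futures.TimeoutError after 2 seconds
    except TimeoutError:
        ...  # handle the timeout

The full example below applies this pattern to your listOfLinks processing: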
from concurrent.futures import ThreadPoolExecutor, TimeoutError
import time

def main():
    listOfLinks = ['a', 'b', 'c', 'd', 'e']
    futures = []
    """
    To prevent timeout errors due to lack of threads, you need at least one extra thread
    in addition to the ones being created here so that at least one time_critical thread
    can start. Of course, ideally you would like all the time_critical threads to be able to
    start without waiting. So, whereas the minimum number of max_workers would be 6 in this
    case, the ideal number would be 5 * 2 = 10.
    """
    with ThreadPoolExecutor(max_workers=10) as executor:
        # pass executor to our worker
        futures = [executor.submit(processRunSeveralTimesInParallel, tuple, executor) for tuple in enumerate(listOfLinks)]
        for future in futures:
            result = future.result()
            print('result is', result)

def processRunSeveralTimesInParallel(tuple, executor):
    link_number = tuple[0]
    link = tuple[1]
    # long running sequence of instructions up until this point and then
    # allow 2 seconds for this part:
    for i in range(10):
        future = executor.submit(time_critical, link, i)
        try:
            future.result(timeout=2)  # time_critical does not return a result other than None
        except TimeoutError:
            handle_exception(link, i)
    return link * link_number

def time_critical(link, trial_number):
    if link == 'd' and trial_number == 7:
        time.sleep(3)  # generate a TimeoutError

def handle_exception(link, trial_number):
    print(f'There was a timeout for link {link}, trial number {trial_number}.')

if __name__ == '__main__':
    main()
Prints:
result is
result is b
result is cc
There was a timeout for link d, trial number 7.
result is ddd
result is eeee
Using Threading and Multiprocessing
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor, TimeoutError
import os
import time

def main():
    listOfLinks = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j']
    futures = []
    cpu_count = os.cpu_count()
    with ThreadPoolExecutor(max_workers=cpu_count) as thread_executor, ProcessPoolExecutor(max_workers=cpu_count) as process_executor:
        # pass executor to our worker
        futures = [thread_executor.submit(processRunSeveralTimesInParallel, tuple, process_executor) for tuple in enumerate(listOfLinks)]
        for future in futures:
            result = future.result()
            print('result is', result)

def processRunSeveralTimesInParallel(tuple, executor):
    link_number = tuple[0]
    link = tuple[1]
    # long running sequence of instructions up until this point and then
    # allow 2 seconds for this part:
    for i in range(10):
        future = executor.submit(time_critical, link, i)
        try:
            future.result(timeout=2)  # time_critical does not return a result other than None
        except TimeoutError:
            handle_exception(link, i)
    return link * link_number

def time_critical(link, trial_number):
    if link == 'd' and trial_number == 7:
        time.sleep(3)  # generate a TimeoutError

def handle_exception(link, trial_number):
    print(f'There was a timeout for link {link}, trial number {trial_number}.')

if __name__ == '__main__':
    main()
Prints:
result is
result is b
result is cc
There was a timeout for link d, trial number 7.
result is ddd
result is eeee
result is fffff
result is gggggg
result is hhhhhhh
result is iiiiiiii
result is jjjjjjjjj
Multiprocessing Exclusively
from concurrent.futures import ProcessPoolExecutor
from multiprocessing import Process
import os
import time

def main():
    listOfLinks = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j']
    futures = []
    workers = os.cpu_count() // 2
    with ProcessPoolExecutor(max_workers=workers) as process_executor:
        # pass executor to our worker
        futures = [process_executor.submit(processRunSeveralTimesInParallel, tuple) for tuple in enumerate(listOfLinks)]
        for future in futures:
            result = future.result()
            print('result is', result)

def processRunSeveralTimesInParallel(tuple):
    link_number = tuple[0]
    link = tuple[1]
    # long running sequence of instructions up until this point and then
    # allow 2 seconds for this part:
    for i in range(10):
        p = Process(target=time_critical, args=(link, i))
        p.start()
        p.join(timeout=2)  # don't block for more than 2 seconds
        if p.exitcode is None:  # subprocess did not terminate
            p.terminate()  # we will terminate it
            handle_exception(link, i)
    return link * link_number

def time_critical(link, trial_number):
    if link == 'd' and trial_number == 7:
        time.sleep(3)  # generate a TimeoutError

def handle_exception(link, trial_number):
    print(f'There was a timeout for link {link}, trial number {trial_number}.')

if __name__ == '__main__':
    main()
Prints:
result is
result is b
result is cc
There was a timeout for link d, trial number 7.
result is ddd
result is eeee
result is fffff
result is gggggg
result is hhhhhhh
result is iiiiiiii
result is jjjjjjjjj

Related

Python multiprocessing finish the work correctly, but the processes still alive (Linux)

I use python multiprocessing to compute some sort of scores on DNA sequences from a large file.
For that I write and use the script below.
I use a Linux machine with 48 cpu in python 3.8 environment.
Th code work fine, and terminate the work correctly and print the processing time at the end.
Problem: when I use the htop command, I find that all 48 processes are still alive.
I don't know why, and I don't know what to add to my script to avoid this.
import csv
import sys
import datetime  # needed for datetime.timedelta below
import concurrent.futures
from itertools import combinations
import psutil
import time

nb_cpu = psutil.cpu_count(logical=False)

def fun_job(seq_1, seq_2):  # seq_i : (id, string)
    start = time.time()
    score_dist = compute_score_dist(seq_1[1], seq_2[1])
    end = time.time()
    return seq_1[0], seq_2[0], score_dist, end - start  # id seq1, id seq2, score, time

def help_fun_job(nested_pair):
    return fun_job(nested_pair[0], nested_pair[1])

def compute_using_multi_processing(list_comb_ids, dict_ids_seqs):
    start = time.perf_counter()
    with concurrent.futures.ProcessPoolExecutor(max_workers=nb_cpu) as executor:
        results = executor.map(help_fun_job,
                               [((pair_ids[0], dict_ids_seqs[pair_ids[0]]), (pair_ids[1], dict_ids_seqs[pair_ids[1]]))
                                for pair_ids in list_comb_ids])
        save_results_to_csv(results)
    finish = time.perf_counter()
    proccessing_time = str(datetime.timedelta(seconds=round(finish - start, 2)))
    print(f' Processing time Finished in {proccessing_time} hh:mm:ss')

def main():
    print("nb_cpu in this machine : ", nb_cpu)
    file_path = sys.argv[1]
    dict_ids_seqs = get_dict_ids_seqs(file_path)
    list_ids = list(dict_ids_seqs)  # This will convert the dict_keys to a list
    list_combined_ids = list(combinations(list_ids, 2))
    compute_using_multi_processing(list_combined_ids, dict_ids_seqs)

if __name__ == '__main__':
    main()
Thank you for your help.
Edit : add the complete code for fun_job (after #Booboo answer)
from Bio import Align

def fun_job(seq_1, seq_2):  # seq_i : (id, string)
    start = time.time()
    aligner = Align.PairwiseAligner()
    aligner.mode = 'global'
    score_dist = aligner.score(seq_1[1], seq_2[1])
    end = time.time()
    return seq_1[0], seq_2[0], score_dist, end - start  # id seq1, id seq2, score, time
When the with ... as executor: block exits, there is an implicit call to executor.shutdown(wait=True). This will wait for all pending futures to be done executing "and the resources associated with the executor have been freed", which presumably includes terminating the processes in the pool (if possible?). Why your program terminates (or at least why you say all the futures have completed executing) while the processes have not terminated is a bit of a mystery. But you haven't provided the code for fun_job, so who can say why this is so?
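For reference, the with block in compute_using_multi_processing is roughly equivalent to calling shutdown explicitly (a sketch reusing names from your script; pairs stands in for the list comprehension you build):

import concurrent.futures

executor = concurrent.futures.ProcessPoolExecutor(max_workers=nb_cpu)
try:
    results = executor.map(help_fun_job, pairs)  # pairs: the list of ((id, seq), (id, seq)) tuples
    save_results_to_csv(results)
finally:
    # Blocks until every pending future has finished executing,
    # then frees the resources associated with the executor.
    executor.shutdown(wait=True)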
One thing you might try is to switch to using the multiprocessing.pool.Pool class from the multiprocessing module. It supports a terminate method, which is called implicitly when its with block exits and which explicitly attempts to terminate all processes in the pool:
#import concurrent.futures
import multiprocessing
...  # etc.

def compute_using_multi_processing(list_comb_ids, dict_ids_seqs):
    start = time.perf_counter()
    with multiprocessing.Pool(processes=nb_cpu) as executor:
        results = executor.map(help_fun_job,
                               [((pair_ids[0], dict_ids_seqs[pair_ids[0]]), (pair_ids[1], dict_ids_seqs[pair_ids[1]]))
                                for pair_ids in list_comb_ids])
        save_results_to_csv(results)
    finish = time.perf_counter()
    proccessing_time = str(datetime.timedelta(seconds=round(finish - start, 2)))
    print(f' Processing time Finished in {proccessing_time} hh:mm:ss')

How do I get child process PIDs when using ProcessPoolExecuter?

I'm using ProcessPoolExecutor context manager to run several Kafka consumers in parallel. I need to store the process IDs of the child processes so that later, I can cleanly terminate those processes. I have such code:
class MultiProcessConsumer:
    ...

    def run_in_parallel(self):
        parallelism_factor = 5
        with ProcessPoolExecutor() as executor:
            processes = [executor.submit(self.consume) for _ in range(parallelism_factor)]
            # It would be nice if I could write [process.pid for process in processes] to a file here.

    def consume(self):
        while True:
            for message in self.kafka_consumer:
                do_stuff(message)
I know I can use os.getpid() in the consume method to get PIDs. But handling them properly (in case of constant shutting down or starting up of consumers) requires some extra work.
How would you propose that I get and store PIDs of the child processes in such a context?
os.getpid() seems to be the way to go. Just pass the PIDs back through a Queue or Pipe, perhaps together with a random UUID that you pass to the process beforehand so you can identify which PID belongs to which task.
from concurrent.futures import ProcessPoolExecutor
import os
import time
import uuid
#from multiprocessing import Process, Queue
import multiprocessing
import queue

# The Empty exception is in queue; multiprocessing borrows it from there.
# https://stackoverflow.com/questions/9908781/sharing-a-result-queue-among-several-processes
m = multiprocessing.Manager()
q = m.Queue()

def task(n, queue, uuid):
    my_pid = os.getpid()
    print("Executing our Task on Process {}".format(my_pid))
    queue.put((uuid, my_pid))
    time.sleep(n)
    return n * n

def main():
    with ProcessPoolExecutor(max_workers=3) as executor:
        some_dict = {}
        for i in range(10):
            print(i)
            u = uuid.uuid4()
            f = executor.submit(task, i, q, u)
            some_dict[u] = [f, None]  # PID not known here
            try:
                rcv_uuid, rcv_pid = q.get(block=True, timeout=1)
                some_dict[rcv_uuid][1] = rcv_pid  # store PID
            except queue.Empty as e:
                print('handle me', e)
            print('I am', rcv_uuid, 'and my PID is', rcv_pid)

if __name__ == '__main__':
    main()
Although the field is private, you could use ProcessPoolExecutor's self._processes. The code snippet below shows how to use this variable.
import os
from concurrent.futures import ProcessPoolExecutor
from concurrent.futures import wait

nb_processes = 100
executor = ProcessPoolExecutor(nb_processes)
futures = [executor.submit(os.getpid) for _ in range(nb_processes)]
wait(futures)

backends = list(map(lambda x: x.result(), futures))
assert len(set(backends)) == nb_processes
In the case above, an assertion error is raised. This is because a new task can reuse a forked process already in the pool, so you cannot learn all of the forked process IDs through the method mentioned above. Hence, you can do this instead:
import os
from concurrent.futures import ProcessPoolExecutor
from concurrent.futures import wait

nb_processes = 100
executor = ProcessPoolExecutor(nb_processes)
futures = [executor.submit(os.getpid) for _ in range(nb_processes)]
wait(futures)

backends = list(map(lambda x: x.result(), futures))
assert len(set(executor._processes.keys())) == nb_processes
print('all of PID are: %s.' % list(executor._processes.keys()))
If you don't want to break the encapsulation, you could inherit from ProcessPoolExecutor and create a new property for that, as sketched below.
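A hedged sketch of that subclassing idea (the class and property names are mine, and it still reads the private _processes mapping, so it may break between Python versions):

import os
from concurrent.futures import ProcessPoolExecutor

class PidReportingExecutor(ProcessPoolExecutor):
    @property
    def child_pids(self):
        # _processes maps pid -> multiprocessing.Process for the live workers
        return list(self._processes.keys())

if __name__ == '__main__':
    with PidReportingExecutor(max_workers=4) as executor:
        futures = [executor.submit(os.getpid) for _ in range(8)]
        print('worker PIDs:', executor.child_pids)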

Using concurrent.futures within a for statement

I store QueryText within a pandas dataframe. Once I've loaded all the queries into it, I want to conduct an analysis against each query. Currently, I have ~50k to evaluate, so doing it one by one will take a long time.
So, I wanted to implement concurrent.futures. How do I take the individual QueryText stored within fullAnalysis, pass it to concurrent.futures, and return the output as a variable?
Here is my entire code:
import pandas as pd
import time
import gensim
import sys
import warnings
from concurrent.futures import ThreadPoolExecutor
from concurrent.futures import as_completed

fullAnalysis = pd.DataFrame()

def fetch_data(jFile = 'ProcessingDetails.json'):
    print("Fetching data...please wait")
    #read JSON file for latest dictionary file name
    baselineDictionaryFileName = 'Dictionary/Dictionary_05-03-2020.json'
    #copy data to pandas dataframe
    labelled_data = pd.read_json(baselineDictionaryFileName)
    #Add two more columns to get the most similar text and score
    labelled_data['SimilarText'] = ''
    labelled_data['SimilarityScore'] = float()
    print("Data fetched from " + baselineDictionaryFileName + " and there are " + str(labelled_data.shape[0]) + " rows to be evaluated")
    return labelled_data

def calculateScore(inputFunc):
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    model = gensim.models.Word2Vec.load('w2v_model_bigdata')
    inp = inputFunc
    print(inp)
    out = dict()
    strEvaluation = inp.split("most_similar ",1)[1]
    #while inp != 'quit':
    split_inp = inp.split()
    try:
        if split_inp[0] == 'help':
            pass
        elif split_inp[0] == 'similarity' and len(split_inp) >= 3:
            pass
        elif split_inp[0] == 'most_similar' and len(split_inp) >= 2:
            for pair in model.most_similar(positive=[split_inp[1]]):
                out.update({pair[0]: pair[1]})
    except KeyError as ke:
        #print(str(ke) + "\n")
        inp = input()
    return out

def main():
    with ThreadPoolExecutor(max_workers=5) as executor:
        for i in range(len(fullAnalysis)):
            text = fullAnalysis['QueryText'][i]
            arg = 'most_similar' + ' ' + text
            #for item in executor.map(calculateScore, arg):
            output = executor.map(calculateScore, arg)
    return output

if __name__ == "__main__":
    fullAnalysis = fetch_data()
    results = main()
    print(f'results: {results}')
The Python Global Interpreter Lock or GIL allows only one thread to hold control of the Python interpreter. Since your function calculateScore might be cpu-bound and requires the interpreter to execute its byte code, you may be gaining little by using threading. If, on the other hand, it were doing mostly I/O operations, it would be giving up the GIL for most of its running time allowing other threads to run. But that does not seem to be the case here. You probably should be using the ProcessPoolExecutor from concurrent.futures (try it both ways and see):
def main():
    with ProcessPoolExecutor(max_workers=None) as executor:
        the_futures = {}
        for i in range(len(fullAnalysis)):
            text = fullAnalysis['QueryText'][i]
            arg = 'most_similar' + ' ' + text
            future = executor.submit(calculateScore, arg)
            the_futures[future] = i  # map future to request
        for future in as_completed(the_futures):  # results as they become available, not necessarily in the order of submission
            i = the_futures[future]  # the original index
            result = future.result()  # the result
If you omit the max_workers parameter (or specify a value of None) from the ProcessPoolExecutor constructor, the default will be the number of processors you have on your machine (not a bad default). There is no point in specifying a value larger than the number of processors you have.
If you do not need to tie the future back to the original request, then the_futures can just be a list to which the futures are appended. But simplest yet is not even to bother with the as_completed method:
def main():
    with ProcessPoolExecutor(max_workers=5) as executor:
        the_futures = []
        for i in range(len(fullAnalysis)):
            text = fullAnalysis['QueryText'][i]
            arg = 'most_similar' + ' ' + text
            future = executor.submit(calculateScore, arg)
            the_futures.append(future)
        # wait for the completion of all the results and return them all:
        results = [f.result() for f in the_futures]  # results in creation order
        return results
It should be mentioned that code that launches the ProcessPoolExecutor functions should be in a block governed by if __name__ == '__main__':. If it isn't, you will get into a recursive loop with each subprocess launching the ProcessPoolExecutor. But that does seem to be the case here. Perhaps you meant to use the ProcessPoolExecutor all along?
Also:
I don't know what the line ...
model = gensim.models.Word2Vec.load('w2v_model_bigdata')
... in function calculateScore does. It may be the one I/O-bound statement, but it appears to be something that does not vary from call to call. If that is the case and model is not being modified in the function, shouldn't this statement be moved out of the function and computed just once? Then this function would clearly run faster (and be clearly CPU-bound). One way to do that is sketched below.
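One hedged way to do that with ProcessPoolExecutor is its initializer argument (available since Python 3.7), so that each worker process loads the model exactly once; the names _load_model and _model below are mine:

import gensim
from concurrent.futures import ProcessPoolExecutor

_model = None  # one copy per worker process

def _load_model():
    # Runs once in each worker process when the pool starts it.
    global _model
    _model = gensim.models.Word2Vec.load('w2v_model_bigdata')

# calculateScore would then read the global _model instead of reloading the file:
# with ProcessPoolExecutor(max_workers=None, initializer=_load_model) as executor:
#     ...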
Also:
The exception block ...
except KeyError as ke:
    #print(str(ke) + "\n")
    inp = input()
... is puzzling. You are inputting a value that will never be used right before returning. If this is to pause execution, there is no error message being output.
With Booboo's assistance, I was able to update the code to include ProcessPoolExecutor. Here is my updated code. Overall, processing has been sped up by more than 60%.
I did run into a processing issue and found this topic, BrokenPoolProcess, which addresses the issue.
output = {}
thePool = {}

def main(labelled_data, dictionaryRevised):
    args = sys.argv[1:]
    with ProcessPoolExecutor(max_workers=None) as executor:
        for i in range(len(labelled_data)):
            text = labelled_data['QueryText'][i]
            arg = 'most_similar' + ' ' + text
            output = winprocess.submit(
                executor, calculateScore, arg
            )
            thePool[output] = i  # original index for future to request
        for output in as_completed(thePool):  # results as they become available, not necessarily in the order of submission
            i = thePool[output]  # the original index
            text = labelled_data['QueryText'][i]
            result = output.result()  # the result
            maximumKey = max(result.items(), key=operator.itemgetter(1))[0]
            maximumValue = result.get(maximumKey)
            labelled_data['SimilarText'][i] = maximumKey
            labelled_data['SimilarityScore'][i] = maximumValue
    return labelled_data, dictionaryRevised

if __name__ == "__main__":
    start = time.perf_counter()
    print("Starting to evaluate Query Text for labelling...")
    output_Labelled_Data, output_dictionary_revised = preProcessor()
    output, dictionary = main(output_Labelled_Data, output_dictionary_revised)
    finish = time.perf_counter()
    print(f'Finished in {round(finish-start, 2)} second(s)')

Python freezes during for loop

def get_price_history_data(ticker):
    pricelist = []
    try:
        pricedata = False
        tradingdays = 252
        Historical_Prices = pdr.get_data_yahoo(symbols=ticker, start=(datetime.today()-timedelta(tradingdays)), end=(datetime.today()))  #-timedelta(years4-1)))
        price_df = pd.DataFrame(Historical_Prices)
        pricelist = price_df['Adj Close']
        pricedata = True
    except:
        print(ticker, ' failed to get price data')
    return (pricelist, pricedata)

tickers = ['FB', 'V']
for ticker in tickers:
    [pricelist, pricedata] = get_price_history_data(ticker)
I have a list of a few thousand tickers that I run through this for loop. It outputs a single-column df and a boolean. Overall it works just fine and does what I need it to. However, it inconsistently freezes indefinitely with no error message and stops running, forcing me to close the program and re-run from the beginning.
I am looking for a way to skip an iteration of the for loop if a certain amount of time has passed. I have looked into time.sleep() and the continue statement but can't figure out how to apply them to this specific application. If it freezes, it freezes on the pdr.get_data_yahoo() section. Help would be appreciated.
I'm guessing that get_data_yahoo() freezes because it makes some kind of request to a server that never gets answered. It doesn't have a timeout option, so the most obvious alternative is to run it in another thread and give up on it if it takes too long. You can use concurrent.futures for that. Once you're happy with how the code below works, you can replace sleeps_for_a_while with get_price_history_data and (3, 1, 4, 0) with tickers (a sketch of that substitution follows the example).
from concurrent.futures import ThreadPoolExecutor, TimeoutError
from time import sleep

TIMEOUT = 2  # seconds

def sleeps_for_a_while(sleep_for):
    print('starting {}s sleep'.format(sleep_for))
    sleep(sleep_for)
    print('finished {}s sleep'.format(sleep_for))
    # return a value to break out of the while loop
    return sleep_for * 100

if __name__ == '__main__':
    # this only works with functions that return values
    results = []
    for sleep_secs in (3, 1, 4, 0):
        with ThreadPoolExecutor(max_workers=1) as executor:
            # a future represents something that will be done
            future = executor.submit(sleeps_for_a_while, sleep_secs)
            try:
                # result() raises an error if it times out
                results.append(future.result(TIMEOUT))
            except TimeoutError as e:
                print('Function timed out')
                results.append(None)
    print('Got results:', results)
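A sketch of that substitution (it assumes get_price_history_data and tickers from the question are defined in the same module; the results dict is mine):

results = {}
for ticker in tickers:
    with ThreadPoolExecutor(max_workers=1) as executor:
        future = executor.submit(get_price_history_data, ticker)
        try:
            pricelist, pricedata = future.result(TIMEOUT)
        except TimeoutError:
            print(ticker, 'timed out, skipping')
            pricelist, pricedata = [], False
    results[ticker] = (pricelist, pricedata)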

How would I go about using concurrent.futures and queues for a real-time scenario?

It is fairly easy to do parallel work with Python 3's concurrent.futures module as shown below.
with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
    future_to = {executor.submit(do_work, input, 60): input for input in dictionary}
    for future in concurrent.futures.as_completed(future_to):
        data = future.result()
It is also very handy to insert and retrieve items into a Queue.
q = queue.Queue()

for task in tasks:
    q.put(task)

while not q.empty():
    q.get()
I have a script running in the background listening for updates. Now, in theory, assume that as those updates arrive, I queue them and do work on them concurrently using the ThreadPoolExecutor.
Now, all of these components work in isolation and make sense individually, but how do I go about using them together? I am not sure whether it is possible to feed the ThreadPoolExecutor work from the queue in real time, as opposed to the data to work on being predetermined.
In a nutshell, all I want to do is receive updates of, say, 4 messages a second, shove them into a queue, and get concurrent.futures to work on them. If I can't, then I am stuck with a sequential approach, which is slow.
Let's take the canonical example in the Python documentation below:
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
    future_to_url = {executor.submit(load_url, url, 60): url for url in URLS}
    for future in concurrent.futures.as_completed(future_to_url):
        url = future_to_url[future]
        try:
            data = future.result()
        except Exception as exc:
            print('%r generated an exception: %s' % (url, exc))
        else:
            print('%r page is %d bytes' % (url, len(data)))
The list of URLS is fixed. Is it possible to feed this list in real time and get the workers to process the URLs as they come by, perhaps from a queue for management purposes? I am a bit confused about whether my approach is actually possible.
Here is the example from the Python docs, expanded to take its work from a queue. A change to note is that this code uses concurrent.futures.wait instead of concurrent.futures.as_completed, to allow new work to be started while waiting for other work to complete.
import concurrent.futures
import urllib.request
import time
import queue

q = queue.Queue()

URLS = ['http://www.foxnews.com/',
        'http://www.cnn.com/',
        'http://europe.wsj.com/',
        'http://www.bbc.co.uk/',
        'http://some-made-up-domain.com/']

def feed_the_workers(spacing):
    """ Simulate outside actors sending in work to do, request each url twice """
    for url in URLS + URLS:
        time.sleep(spacing)
        q.put(url)
    return "DONE FEEDING"

def load_url(url, timeout):
    """ Retrieve a single page and report the URL and contents """
    with urllib.request.urlopen(url, timeout=timeout) as conn:
        return conn.read()

# We can use a with statement to ensure threads are cleaned up promptly
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:

    # start a future for a thread which sends work in through the queue
    future_to_url = {
        executor.submit(feed_the_workers, 0.25): 'FEEDER DONE'}

    while future_to_url:
        # check for status of the futures which are currently working
        done, not_done = concurrent.futures.wait(
            future_to_url, timeout=0.25,
            return_when=concurrent.futures.FIRST_COMPLETED)

        # if there is incoming work, start a new future
        while not q.empty():
            # fetch a url from the queue
            url = q.get()
            # Start the load operation and mark the future with its URL
            future_to_url[executor.submit(load_url, url, 60)] = url

        # process any completed futures
        for future in done:
            url = future_to_url[future]
            try:
                data = future.result()
            except Exception as exc:
                print('%r generated an exception: %s' % (url, exc))
            else:
                if url == 'FEEDER DONE':
                    print(data)
                else:
                    print('%r page is %d bytes' % (url, len(data)))
            # remove the now completed future
            del future_to_url[future]
Output from fetching each url twice:
'http://www.foxnews.com/' page is 67574 bytes
'http://www.cnn.com/' page is 136975 bytes
'http://www.bbc.co.uk/' page is 193780 bytes
'http://some-made-up-domain.com/' page is 896 bytes
'http://www.foxnews.com/' page is 67574 bytes
'http://www.cnn.com/' page is 136975 bytes
DONE FEEDING
'http://www.bbc.co.uk/' page is 193605 bytes
'http://some-made-up-domain.com/' page is 896 bytes
'http://europe.wsj.com/' page is 874649 bytes
'http://europe.wsj.com/' page is 874649 bytes
At work I found a situation where I wanted to do parallel work on an unbounded stream of data. I created a small library inspired by the excellent answer already provided by Stephen Rauch.
I originally approached this problem by thinking about two separate threads, one that submits work to a queue and one that monitors the queue for any completed tasks and makes more room for new work to come in. This is similar to what Stephen Rauch proposed, where he consumes the stream using a feed_the_workers function that runs in a separate thread.
One of my colleagues helped me realize that you can get away with doing everything in a single thread if you define a buffered iterator that lets you control how many elements are taken out of the input stream each time you are ready to submit more work to the thread pool.
So we introduce the BufferedIter class
class BufferedIter(object):
    def __init__(self, iterator):
        self.iter = iterator

    def nextN(self, n):
        vals = []
        for _ in range(n):
            vals.append(next(self.iter))
        return vals
which allows us to define the stream processor in the following way
import logging
import queue
import signal
import sys
import time
from concurrent.futures import ThreadPoolExecutor, wait, ALL_COMPLETED

level = logging.DEBUG
log = logging.getLogger(__name__)
handler = logging.StreamHandler(sys.stdout)
handler.setFormatter(logging.Formatter('%(asctime)s %(message)s'))
handler.setLevel(level)
log.addHandler(handler)
log.setLevel(level)

WAIT_SLEEP = 1  # second, adjust this based on the timescale of your tasks

def stream_processor(input_stream, task, num_workers):

    # Use a queue to signal shutdown.
    shutting_down = queue.Queue()

    def shutdown(signum, frame):
        log.warning('Caught signal %d, shutting down gracefully ...' % signum)
        # Put an item in the shutting down queue to signal shutdown.
        shutting_down.put(None)

    # Register the signal handler
    signal.signal(signal.SIGTERM, shutdown)
    signal.signal(signal.SIGINT, shutdown)

    def is_shutting_down():
        return not shutting_down.empty()

    futures = dict()
    buffer = BufferedIter(input_stream)
    with ThreadPoolExecutor(num_workers) as executor:
        num_success = 0
        num_failure = 0
        while True:
            idle_workers = num_workers - len(futures)

            if not is_shutting_down():
                items = buffer.nextN(idle_workers)
                for data in items:
                    futures[executor.submit(task, data)] = data

            done, _ = wait(futures, timeout=WAIT_SLEEP, return_when=ALL_COMPLETED)
            for f in done:
                data = futures[f]
                try:
                    f.result(timeout=0)
                except Exception as exc:
                    log.error('future encountered an exception: %r, %s' % (data, exc))
                    num_failure += 1
                else:
                    log.info('future finished successfully: %r' % data)
                    num_success += 1
                del futures[f]

            if is_shutting_down() and len(futures) == 0:
                break

    log.info("num_success=%d, num_failure=%d" % (num_success, num_failure))
Below we show an example for how to use the stream processor
import itertools

def integers():
    """Simulate an infinite stream of work."""
    for i in itertools.count():
        yield i

def task(x):
    """The task we would like to perform in parallel.
    With some delay to simulate a time consuming job.
    With a baked in exception to simulate errors.
    """
    time.sleep(3)
    if x == 4:
        raise ValueError('bad luck')
    return x * x

stream_processor(integers(), task, num_workers=3)
The output for this example is shown below
2019-01-15 22:34:40,193 future finished successfully: 1
2019-01-15 22:34:40,193 future finished successfully: 0
2019-01-15 22:34:40,193 future finished successfully: 2
2019-01-15 22:34:43,201 future finished successfully: 5
2019-01-15 22:34:43,201 future encountered an exception: 4, bad luck
2019-01-15 22:34:43,202 future finished successfully: 3
2019-01-15 22:34:46,208 future finished successfully: 6
2019-01-15 22:34:46,209 future finished successfully: 7
2019-01-15 22:34:46,209 future finished successfully: 8
2019-01-15 22:34:49,215 future finished successfully: 11
2019-01-15 22:34:49,215 future finished successfully: 10
2019-01-15 22:34:49,215 future finished successfully: 9
^C <=== THIS IS WHEN I HIT Ctrl-C
2019-01-15 22:34:50,648 Caught signal 2, shutting down gracefully ...
2019-01-15 22:34:52,221 future finished successfully: 13
2019-01-15 22:34:52,222 future finished successfully: 14
2019-01-15 22:34:52,222 future finished successfully: 12
2019-01-15 22:34:52,222 num_success=14, num_failure=1
I really liked the interesting approach by @pedro above. However, when processing thousands of files, I noticed that at the end a StopIteration would be thrown and some files would always be skipped. I had to make a little modification, as follows. Very useful answer again.
class BufferedIter(object):
    def __init__(self, iterator):
        self.iter = iterator

    def nextN(self, n):
        vals = []
        try:
            for _ in range(n):
                vals.append(next(self.iter))
            return vals, False
        except StopIteration as e:
            return vals, True

-- Call as follows:

    ...
    if not is_shutting_down():
        items, is_finished = buffer.nextN(idle_workers)
        if is_finished:
            stop()
    ...

-- Where stop is a function that simply signals the shutdown:

def stop():
    shutting_down.put(None)
It is possible to gain the benefits of the executor without strictly having to use a Queue. New tasks are submitted from the main thread. The undone futures are tracked and waited on until all futures are done.
import concurrent.futures
import sys
import time

sys.setrecursionlimit(64)  # This is only for demonstration purposes to trigger a RecursionError. Do not set in practice.

def slow_factorial(n: int) -> int:
    time.sleep(0.01)
    if n == 0:
        return 1
    else:
        return n * slow_factorial(n-1)

initial_inputs = [0, 1, 5, 20, 200, 100, 50, 51, 55, 40, 44, 21, 222, 333, 202, 1000, 10, 9000, 9009, 99, 9999]

for executor_class in (concurrent.futures.ThreadPoolExecutor, concurrent.futures.ProcessPoolExecutor):
    for max_workers in (4, 8, 16, 32):
        start_time = time.monotonic()
        with executor_class(max_workers=max_workers) as executor:
            futures_to_n = {executor.submit(slow_factorial, n): n for n in initial_inputs}
            while futures_to_n:
                futures_done, futures_not_done = concurrent.futures.wait(futures_to_n, return_when=concurrent.futures.FIRST_COMPLETED)
                # Note: Length of futures_done is often > 1.
                for future in futures_done:
                    n = futures_to_n.pop(future)
                    try:
                        factorial_n = future.result()
                    except RecursionError:
                        n_smaller = int(n ** 0.9)
                        future = executor.submit(slow_factorial, n_smaller)
                        futures_to_n[future] = n_smaller
                        # print(f'Failed to compute factorial of {n}. Trying to compute factorial of a smaller number {n_smaller} instead.')
                    else:
                        # print(f'Factorial of {n} is {factorial_n}.')
                        pass
        used_time = time.monotonic() - start_time
        executor_type = executor_class.__name__.removesuffix('PoolExecutor').lower()
        print(f'Workflow took {used_time:.1f}s with {max_workers} {executor_type} workers.')
    print()
Output:
Workflow took 9.4s with 4 thread workers.
Workflow took 6.3s with 8 thread workers.
Workflow took 5.4s with 16 thread workers.
Workflow took 5.2s with 32 thread workers.
Workflow took 9.0s with 4 process workers.
Workflow took 5.9s with 8 process workers.
Workflow took 5.1s with 16 process workers.
Workflow took 4.9s with 32 process workers.
For more clarity, uncomment the two print statements. As per the output above, there is an asymptotic speed benefit with more workers.
