I am getting this error when using the "submit" functionality of ProcessPoolExecutor.
Exception has occurred: TypeError
'Future' object is not iterable
File "C:......\test3.py", line 28, in
for f in as_completed(res):
import time
import json
import os
import requests
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
from concurrent.futures import as_completed
BAN_API_URL = 'https://api-adresse.data.gouv.fr/search/'
def get_french_addresses(request):
print(f"Started task with pid: {os.getpid()} fetch addresses: {request['search_field']}")
query_params = {'q': request['search_field'], 'type': 'housenumber', 'autocomplete': 1}
response = requests.get(BAN_API_URL, params=query_params)
print(f"Finished task with pid: {os.getpid()} to address: {request['search_field']}")
return json.loads(response.text)
request_data = [
{'search_field': '17 rue saint maur'},
{'search_field': '35 boulevard voltaire'},
{'search_field': '32 rue rivoli'},
{'search_field': 'Route de la Croqueterie'},
]
if __name__ == '__main__':
start_time = time.time()
# Execute asynchronously with a pool of worker processes
with ProcessPoolExecutor() as executor:
res = executor.submit(get_french_addresses, request_data)
print(res)
for f in as_completed(res):
print(f.result())
end_time = time.time()
print(f'Total time to run multithreads: {end_time - start_time:.2f}s')
You are using submit, which passes all of the data to the function in one call; what you want is map, which passes it one item at a time, like so:
res = executor.map(get_french_addresses, request_data)
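For reference, a minimal sketch of the map-based loop (reusing the question's get_french_addresses and request_data); map yields results directly rather than Future objects, so as_completed is not needed:
from concurrent.futures import ProcessPoolExecutor

if __name__ == '__main__':
    with ProcessPoolExecutor() as executor:
        # map yields results in input order; each iteration blocks until that result is ready
        for result in executor.map(get_french_addresses, request_data):
            print(result)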
Or, if you need to keep using submit, you will have to split the data into individual items yourself:
res = []
with ProcessPoolExecutor() as executor:
for item in request_data:
res.append(executor.submit(get_french_addresses, item))
print(res)
for f in as_completed(res):
    print(f.result())
The simplest edit to avoid the error is to change
for f in as_completed(res):
to
for f in as_completed([res]):
However, this way it will be almost equivalent to a synchronous call (I say 'almost' because some code could still execute between submit and as_completed, but because of the GIL it should either be async itself or invoke some I/O).
If you want the function get_french_addresses to return data asynchronously (as it processes it), it must be rewritten to support that.
Related
I'm trying to implement multithreading in a very time-consuming program, and I've come across this SO answer:
https://stackoverflow.com/a/28463266/3451339, which basically offers this solution for multiple arrays:
from multiprocessing.dummy import Pool as ThreadPool
pool = ThreadPool(4)
results = pool.map(my_function, my_array)
# Close the pool and wait for the work to finish
pool.close()
pool.join()
and, passing multiple arrays:
results = pool.starmap(function, zip(list_a, list_b))
The following is the code I have so far, which must be refactored with threading. It iterates over 4 arrays, needs to pass arguments to the function at each iteration, and appends all results to a final container:
strategies = ['strategy_1', 'strategy_2']
budgets = [90,100,110,120,130,140,150,160]
formations=['343','352','433','442','451','532','541']
models = ['model_1', 'model_2', 'model_3']
all_teams = pd.DataFrame()
for strategy in strategies:
for budget in budgets:
for formation in formations:
for model in models:
team = function(strategy=strategy,
budget=budget,
curr_formation=formation,
model=model)
all_teams = all_teams.append(team, ignore_index=True, sort=False)\
.reset_index(drop=True)\
.copy()
Note: each function call makes API web requests.
What is the way to go with multithreading in this scenario?
Python has the multiprocessing module, which can run multiple tasks in parallel, and inside each process you can have multiple threads or asyncio code.
Here is a working example which uses 3 processes and multithreading:
import pandas as pd
import multiprocessing
from multiprocessing import Queue
from threading import Thread
strategies = ['strategy_1', 'strategy_2']
budgets = [90,100,110,120,130,140,150,160]
formations=['343','352','433','442','451','532','541']
models = ['model_1', 'model_2', 'model_3']
# shared Queue; if you want to reduce write locking, use 3 Queues (one per process)
Q = Queue()
# Retrieve asynchronously if you want to speed up the process
def function(q,strategy,budget,curr_formation,model):
q.put("Team")
def runTask(model,q):
for strategy in strategies:
for budget in budgets:
for formation in formations:
Thread(target=function,args=(q,strategy,budget,formation,model)).start()
def main():
p1 = multiprocessing.Process(target=runTask, args=('model_1',Q))
p2 = multiprocessing.Process(target=runTask, args=('model_2',Q))
p3 = multiprocessing.Process(target=runTask, args=('model_3',Q))
p1.start()
p2.start()
p3.start()
p1.join()
p2.join()
p3.join()
all = []
for i in range(0,Q.qsize()):
all.append(Q.get())
print(all)
print(len(all))
if __name__ == "__main__":
main()
A useful article: Multiprocessing in Python | Set 2
This can be one approach.
Note: thread vs. multiprocess. In this SO I have provided execution through map; that will not work here, as map has a limitation on the number.
Run your nested for loops and build a list of parameters ==> financial_options
for strategy in strategies:
for budget in budgets:
for formation in formations:
for model in models:
financial_options.append([strategy,budget,formation,model])
financial_options_len=len(financial_options)
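As an aside, the same list of combinations can be built in one line with itertools.product (a sketch equivalent to the nested loops above, assuming the same strategies, budgets, formations and models lists):
import itertools

financial_options = [list(combo) for combo in itertools.product(strategies, budgets, formations, models)]
financial_options_len = len(financial_options)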
Create a new function that will handle API calls
def access_url(url,parameter_list):
#response=requests.get(url) # request goes here
print(parameter_list)
time.sleep(2)
print("sleep done!")
return "Hello"#,parameter_list # return type
Now run the threading with these permutation parameters. The complete program will look like this:
import concurrent.futures
import requests # just in case needed
from bs4 import BeautifulSoup # just in case needed
import time
import pandas as pd
def access_url(url,parameter_list):
#response=requests.get(url) # request goes here
print(parameter_list)
time.sleep(2)
print("sleep done!")
return "Hello"#,parameter_list # return type
def multi_threading():
test_url="http://bla bla.com/"
base_url=test_url
THREAD_MULTI_PROCESSING= True
strategies = ['strategy_1', 'strategy_2']
budgets = [90,100,110,120,130,140,150,160]
formations=['343','352','433','442','451','532','541']
models = ['model_1', 'model_2', 'model_3']
all_teams = pd.DataFrame()
start = time.perf_counter() # start time for performance
financial_options=[]
decision_results=[]
for strategy in strategies:
for budget in budgets:
for formation in formations:
for model in models:
financial_options.append([strategy,budget,formation,model])
financial_options_len=len(financial_options)
print(f"Total options:{financial_options_len}")
future_list = []
THREAD_MULTI_PROCESSING_LOOP=True
if THREAD_MULTI_PROCESSING_LOOP:
with concurrent.futures.ThreadPoolExecutor() as executor: # Through executor
for each in range(financial_options_len):
future = executor.submit(access_url,test_url,financial_options[each]) # submit each option
future_list.append(future)
for f1 in concurrent.futures.as_completed(future_list):
r1=f1.result()
decision_results.append(r1)
end = time.perf_counter() # finish time for performance
print(f'Threads: Finished in {round(end - start,2)} second(s)')
df=pd.DataFrame(decision_results)
df.to_csv("multithread_for.csv")
return df,decision_results
df,results=multi_threading()
I'm using the ProcessPoolExecutor context manager to run several Kafka consumers in parallel. I need to store the process IDs of the child processes so that I can cleanly terminate those processes later. I have code like this:
class MultiProcessConsumer:
...
def run_in_parallel(self):
parallelism_factor = 5
with ProcessPoolExecutor() as executor:
processes = [executor.submit(self.consume) for _ in range(parallelism_factor)]
# It would be nice if I could write [process.pid for process in processes] to a file here.
def consume(self):
while True:
for message in self.kafka_consumer:
do_stuff(message)
I know I can use os.getpid() in the consume method to get PIDs. But handling them properly (with consumers constantly shutting down or starting up) requires some extra work.
How would you propose that I get and store PIDs of the child processes in such a context?
os.getpid() seems to be the way to go. Just pass the PIDs back through a Queue or Pipe, perhaps together with a random UUID that you pass to the task beforehand so you can tell which submission each PID belongs to.
from concurrent.futures import ProcessPoolExecutor
import os
import time
import uuid
#from multiprocessing import Process, Queue
import multiprocessing
import queue
# The Empty exception is in queue; multiprocessing borrows
# it from there
# https://stackoverflow.com/questions/9908781/sharing-a-result-queue-among-several-processes
m = multiprocessing.Manager()
q = m.Queue()
def task(n, queue, uuid):
my_pid = os.getpid()
print("Executing our Task on Process {}".format(my_pid))
queue.put((uuid, my_pid))
time.sleep(n)
return n * n
def main():
with ProcessPoolExecutor(max_workers = 3) as executor:
some_dict = {}
for i in range(10):
print(i)
u = uuid.uuid4()
f = executor.submit(task, i, q, u)
some_dict[u] = [f, None] # PID not known here
try:
rcv_uuid, rcv_pid = q.get(block=True, timeout=1)
some_dict[rcv_uuid][1] = rcv_pid # store PID
except queue.Empty as e:
print('handle me', e)
print('I am', rcv_uuid, 'and my PID is', rcv_pid)
if __name__ == '__main__':
main()
Although this field is private, you could use the ProcessPoolExecutor field self._processes. The code snippet below shows how to use this variable.
import os
from concurrent.futures import ProcessPoolExecutor
from concurrent.futures import wait
nb_processes = 100
executor = ProcessPoolExecutor(nb_processes)
futures = [executor.submit(os.getpid) for _ in range(nb_processes)]
wait(futures)
backends = list(map(lambda x: x.result(), futures))
assert len(set(backends)) == nb_processes
In the case above, an assertion error is raised. This is because a new task can reuse the forked processes in the pool, so you cannot learn all of the forked process IDs through the method you mentioned. Hence, you can do this instead:
import os
from concurrent.futures import ProcessPoolExecutor
from concurrent.futures import wait
nb_processes = 100
executor = ProcessPoolExecutor(nb_processes)
futures = [executor.submit(os.getpid) for _ in range(nb_processes)]
wait(futures)
backends = list(map(lambda x: x.result(), futures))
assert len(set(executor._processes.keys())) == nb_processes
print('all of PID are: %s.' % list(executor._processes.keys()))
If you don't want to break encapsulation, you could inherit from ProcessPoolExecutor and create a new property for that.
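For illustration, here is a minimal sketch of such a subclass (my own, not from the answer above; the PidTrackingProcessPoolExecutor name and worker_pids property are hypothetical):
import os
from concurrent.futures import ProcessPoolExecutor

class PidTrackingProcessPoolExecutor(ProcessPoolExecutor):
    @property
    def worker_pids(self):
        # _processes maps pid -> process object; it is only populated once
        # workers have actually been spawned (i.e. after tasks are submitted)
        return list(self._processes.keys())

if __name__ == '__main__':
    with PidTrackingProcessPoolExecutor(max_workers=4) as executor:
        futures = [executor.submit(os.getpid) for _ in range(4)]
        for f in futures:
            f.result()  # wait for the submitted tasks to finish
        print('worker PIDs:', executor.worker_pids)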
I have around 28K numbers in a list named "y", and I am running a for loop over an API to send messages, but this takes a lot of time (1.2797 seconds per call, to be exact).
Code:
import timeit
start = timeit.default_timer()
for i in y:
data = {'From': 'XXXX', 'To': str(i),
'Body': "ABC ABC" }
requests.post('https://xxxx:xx#api.xxx.com/v1/Accounts/xxx/Sms/send',data=data)
stop = timeit.default_timer()
print('Time: ', stop - start)
How can I reduce the time for this?
Asyncio and multithreading are two possible solutions to optimize your code, and both basically do the same thing under the hood:
Threaded
import timeit
import threading
import time
y = list(range(50))
def post_data(server, data, sleep_time=1.5):
time.sleep(sleep_time)
# request.post(server, data=data)
start = timeit.default_timer()
server = 'https://xxxx:xx#api.xxx.com/v1/Accounts/xxx/Sms/send'
threads = []
for i in y:
# If you don't need to wait for your threads, don't hold them in memory after
# they are done; just call threading.Thread(target, args).start() instead.
# This is especially important if you want to send a large number of messages.
threads.append(threading.Thread(target=post_data,
args=(server, {'From': 'XXXX', 'To': str(i), 'Body': "ABC ABC"})))
threads[-1].start()
for thread in threads:
# optional if you want to wait for completion of the concurrent posts
thread.join()
stop = timeit.default_timer()
print('Time: ', stop - start)
Asyncio
Referring to this answer.
import timeit
import asyncio
import time
from concurrent.futures import ThreadPoolExecutor
y = list(range(50))
_executor = ThreadPoolExecutor(len(y))
loop = asyncio.get_event_loop()
def post_data(server, data, sleep_time=1.5):
time.sleep(sleep_time)
# request.post(server, data=data)
async def post_data_async(server, data):
return await loop.run_in_executor(_executor, lambda: post_data(server, data))
async def run(y, server):
return await asyncio.gather(*[post_data_async(server, {'From': 'XXXX', 'To': str(i), 'Body': "ABC ABC"})
for i in y])
start = timeit.default_timer()
server = 'https://xxxx:xx#api.xxx.com/v1/Accounts/xxx/Sms/send'
loop.run_until_complete(run(y, server))
stop = timeit.default_timer()
print('Time: ', stop - start)
When using an API that does not support asyncio but would profit from concurrency, like your use-case, I'd tend towards using threading as it's easier to read IMHO. If your API/Library does support asyncio, go for it! It's great!
On my machine, with a list of 50 elements, the asyncio solution clocks in at 1.515 seconds of runtime while the threaded solution needs about 1.509 seconds, when executing 50 instances of time.sleep(1.5).
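If you prefer the concurrent.futures API over managing threads by hand, here is a hedged sketch of the same idea with ThreadPoolExecutor (the server URL and payload are the question's placeholders, and the sleep stands in for the real request as above):
import timeit
import time
from concurrent.futures import ThreadPoolExecutor

y = list(range(50))
server = 'https://xxxx:xx#api.xxx.com/v1/Accounts/xxx/Sms/send'

def post_data(server, data, sleep_time=1.5):
    time.sleep(sleep_time)
    # requests.post(server, data=data)

start = timeit.default_timer()
with ThreadPoolExecutor(max_workers=50) as executor:
    for i in y:
        executor.submit(post_data, server, {'From': 'XXXX', 'To': str(i), 'Body': "ABC ABC"})
    # leaving the with-block waits for all submitted posts to finish
stop = timeit.default_timer()
print('Time: ', stop - start)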
I have a situation where I call multiple requests in a scheduler job to check live user status for 1000 users at a time. But the server limits each API request to a maximum of 50 users per hit. So, using the following approach with a for loop, it takes around 66 seconds for 1000 users (i.e. for 20 API calls).
from apscheduler.schedulers.blocking import BlockingScheduler
sched = BlockingScheduler()
def shcdulerjob():
"""
"""
uidlist = todays_userslist() #Get around 1000 users from table
#-- DIVIDE LIST BY GIVEN SIZE (here 50)
split_list = lambda lst, sz: [lst[i:i+sz] for i in range(0, len(lst), sz)]
idlists = split_list(uidlist, 50) # SERVER MAX LIMIT - 50 ids/request
for idlist in idlists:
apiurl = some_server_url + "&ids="+str(idlist)
resp = requests.get(apiurl)
save_status(resp.json()) #-- Save status to db
if __name__ == "__main__":
sched.add_job(shcdulerjob, 'interval', minutes=10)
sched.start()
So,
Is there any workaround to optimize the time required for the API fetches?
Does Python's APScheduler provide any multiprocessing option to process such API requests in a single job?
You could try to apply Python's thread pool from the concurrent.futures module, if the server allows concurrent requests. That way you would parallelise the processing instead of the scheduling itself.
There are some good examples provided in the documentation (if you're using Python 2, there is a roughly equivalent module).
e.g.
import concurrent.futures
import multiprocessing
import requests
import time
import json
cpu_start_time = time.process_time()
clock_start_time = time.time()
queue = multiprocessing.Queue()
uri = "http://localhost:5000/data.json"
users = [str(user) for user in range(1, 50)]
with concurrent.futures.ThreadPoolExecutor(multiprocessing.cpu_count()) as executor:
for user_id, result in zip(
users
, executor.map(lambda x: requests.get(uri, params={'id': x}).content, users)
):
queue.put((user_id, result))
while not queue.empty():
user_id, rs = queue.get()
print("User ", user_id, json.loads(rs.decode()))
cpu_end_time = time.process_time()
clock_end_time = time.time()
print("Took {0:.03}s [{1:.03}s]".format(cpu_end_time-cpu_start_time, clock_end_time-clock_start_time))
If you want to use a process pool, just make sure you don't use shared resources (e.g. the queue), and write your data out independently.
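For instance, here is a minimal sketch of that process-pool variant (no shared queue; each worker returns its result and the parent collects them; the URL is the same placeholder as above):
import concurrent.futures
import multiprocessing
import requests

uri = "http://localhost:5000/data.json"

def fetch(user_id):
    # each worker process performs its own request and returns the payload
    return user_id, requests.get(uri, params={'id': user_id}).content

if __name__ == '__main__':
    users = [str(user) for user in range(1, 50)]
    with concurrent.futures.ProcessPoolExecutor(multiprocessing.cpu_count()) as executor:
        for user_id, payload in executor.map(fetch, users):
            # write each result out independently (here it is just printed)
            print("User", user_id, len(payload), "bytes")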
I am trying to execute different methods in a Pool object from the Python multiprocessing library. I've tried many approaches, but all of them get stuck when I call either of the methods .get() or .join(). I've googled a lot and none of the topics or tutorials worked for me. My code is next:
def get_profile(artist_id):
buckets = ['years_active', 'genre', 'images']
artist = Artist(artist_id)
return artist.get_profile(buckets=buckets)
def get_songs(artist_id):
from echonest.playlist import Playlist
return Playlist().static(artist_ids=[artist_id])
def get_similar(artist_id):
artist = Artist(artist_id)
return artist.get_similar(min_familiarity=0.5, buckets=['images'])
def get_news(artist_id):
artist = Artist(artist_id)
print "Executing get_news"
return artist.get_news(high_relevance='true')
def search_artist(request, artist_name, artist_id):
from multiprocessing import Pool, Process
requests = [
dict(func=get_profile, args=(artist_id,)),
dict(func=get_songs, args=(artist_id,)),
dict(func=get_similar, args=(artist_id,)),
dict(func=get_news, args=(artist_id,))
]
pool = Pool(processes=2)
for req in requests:
result = pool.apply_async(req['func'], req['args'])
pool.close()
pool.join()
print "HERE IT STOPS AND NOTHING HAPPENS."
output = [p.get() for p in results]
Hope someone could help because I've been stuck with this for too long. Thank you in advance.