Python Multiple requests - python

I have a situation to call multiple requests in a scheduler job to check live user status for 1000 users at a time. But server limits maximum up to 50 users in each hit of an API request. So using following approach with for loop its taking around 66 seconds for 1000 users (i.e for 20 API calls).
from apscheduler.schedulers.blocking import BlockingScheduler
sched = BlockingScheduler()
def shcdulerjob():
"""
"""
uidlist = todays_userslist() #Get around 1000 users from table
#-- DIVIDE LIST BY GIVEN SIZE (here 50)
split_list = lambda lst, sz: [lst[i:i+sz] for i in range(0, len(lst), sz)]
idlists = split_list(uidlist, 50) # SERVER MAX LIMIT - 50 ids/request
for idlist in idlists:
apiurl = some_server_url + "&ids="+str(idlist)
resp = requests.get(apiurl)
save_status(resp.json()) #-- Save status to db
if __name__ == "__main__":
sched.add_job(shcdulerjob, 'interval', minutes=10)
sched.start()
So,
Is there any workaround so that it should optimize the time required to fetch API?
Does Python- APScheduler provide any multiprocessing option to process such api requests in a single job?

You could try to apply python's Thread pool from the concurrent.futures module, if the server allows concurrent requests. That way you would parallelise the processing, instead of the scheduling itself
There are some good examples provided in the documentation here (If you're using python 2, there is a sort of an equivalent module
e.g.
import concurrent.futures
import multiprocessing
import requests
import time
import json
cpu_start_time = time.process_time()
clock_start_time = time.time()
queue = multiprocessing.Queue()
uri = "http://localhost:5000/data.json"
users = [str(user) for user in range(1, 50)]
with concurrent.futures.ThreadPoolExecutor(multiprocessing.cpu_count()) as executor:
for user_id, result in zip(
[str(user) for user in range(1, 50)]
, executor.map(lambda x: requests.get(uri, params={id: x}).content, users)
):
queue.put((user_id, result))
while not queue.empty():
user_id, rs = queue.get()
print("User ", user_id, json.loads(rs.decode()))
cpu_end_time = time.process_time()
clock_end_time = time.time()
print("Took {0:.03}s [{1:.03}s]".format(cpu_end_time-cpu_start_time, clock_end_time-clock_start_time))
If you want to use a Process pool, just make sure you don't use shared resources, e.g. queue, and write your data our independently

Related

Multithread or multiprocess

So, currently, I am using multiprocessing to run these 3 functions together.
As only tokens changes, is it recommended to switch to multi-threading? (if yes, will it really help in a performance like speed-up and I think memory will be for sure used less)
This is my code:
from database_function import *
from kiteconnect import KiteTicker
import pandas as pd
from datetime import datetime, timedelta
import schedule
import time
from multiprocessing import Process
def tick_A():
#credentials code here
tokens = [x[0] for x in db_fetchquery("SELECT zerodha FROM script ORDER BY id ASC LIMIT 50")] #FETCHING FIRST 50 SCRIPTS TOKEN
#print(tokens)
##### TO MAKE SURE THE TASK STARTS AFTER 8:59 ONLY ###########
t = datetime.today()
future = datetime(t.year,t.month,t.day,8,59)
if ((future-t).total_seconds()) < 0:
future = datetime(t.year,t.month,t.day,t.hour,t.minute,(t.second+2))
time.sleep((future-t).total_seconds())
##### TO MAKE SURE THE TASK STARTS AFTER 8:59 ONLY ###########
def on_ticks(ws, ticks):
global ltp
ltp = ticks[0]["last_price"]
for tick in ticks:
print(f"{tick['instrument_token']}A")
db_runquery(f'UPDATE SCRIPT SET ltp = {tick["last_price"]} WHERE zerodha = {tick["instrument_token"]}') #UPDATING LTP IN DATABASE
#print(f"{tick['last_price']}")
def on_connect(ws, response):
#print(f"response from connect :: {response}")
# Subscribe to a list of instrument_tokens (TOKENS FETCHED ABOVE WILL BE SUBSCRIBED HERE).
# logging.debug("on connect: {}".format(response))
ws.subscribe(tokens)
ws.set_mode(ws.MODE_LTP,tokens) # SETTING TOKEN TO TICK MODE (LTP / FULL / QUOTE)
kws.on_ticks = on_ticks
kws.on_connect = on_connect
kws.connect(threaded=True)
#####TO STOP THE TASK AFTER 15:32 #######
end_time = datetime(t.year,t.month,t.day,15,32)
while True:
schedule.run_pending()
#time.sleep(1)
if datetime.now() > end_time:
break
#####TO STOP THE TASK AFTER 15:32 #######
def tick_B():
everything remains the same only tokens value changes
tokens = [x[0] for x in db_fetchquery("SELECT zerodha FROM script ORDER BY id ASC OFFSET (50) ROWS FETCH NEXT (50) ROWS ONLY")]
def tick_C():
everything remains the same only tokens value changes
tokens = [x[0] for x in db_fetchquery("SELECT zerodha FROM script ORDER BY id ASC OFFSET (100) ROWS FETCH NEXT (50) ROWS ONLY")]
if __name__ == '__main__':
def runInParallel(*fns):
proc = []
for fn in fns:
p = Process(target=fn)
p.start()
proc.append(p)
for p in proc:
p.join()
runInParallel(tick_A , tick_B , tick_C)
So, currently, I am using multiprocessing to run these 3 functions together.
As only tokens changes, is it recommended to switch to multi-threading? (if yes, will it really help in a performance like speed-up and I think memory will be for sure used less)
most Python implementations do not have true multi-threading, because they use global lock (GIL). So only one thread runs at a time.
For I/O heavy applications it should not make difference. But if you need CPU heavy operations done in parallel (and I see that you use Panda - so the answer must be yes) - you will be better off staying with multi-process app.

Python multiprocessing finish the work correctly, but the processes still alive (Linux)

I use python multiprocessing to compute some sort of scores on DNA sequences from a large file.
For that I write and use the script below.
I use a Linux machine with 48 cpu in python 3.8 environment.
Th code work fine, and terminate the work correctly and print the processing time at the end.
Problem: when I use the htop command, I find that all 48 processes are still alive.
I don't know why, and I don't know what to add to my script to avoid this.
import csv
import sys
import concurrent.futures
from itertools import combinations
import psutil
import time
nb_cpu = psutil.cpu_count(logical=False)
def fun_job(seq_1, seq_2): # seq_i : (id, string)
start = time.time()
score_dist = compute_score_dist(seq_1[1], seq_2[1])
end = time.time()
return seq_1[0], seq_2[0], score_dist, end - start # id seq1, id seq2, score, time
def help_fun_job(nested_pair):
return fun_job(nested_pair[0], nested_pair[1])
def compute_using_multi_processing(list_comb_ids, dict_ids_seqs):
start = time.perf_counter()
with concurrent.futures.ProcessPoolExecutor(max_workers=nb_cpu) as executor:
results = executor.map(help_fun_job,
[((pair_ids[0], dict_ids_seqs[pair_ids[0]]), (pair_ids[1], dict_ids_seqs[pair_ids[1]]))
for pair_ids in list_comb_ids])
save_results_to_csv(results)
finish = time.perf_counter()
proccessing_time = str(datetime.timedelta(seconds=round(finish - start, 2)))
print(f' Processing time Finished in {proccessing_time} hh:mm:ss')
def main():
print("nb_cpu in this machine : ", nb_cpu)
file_path = sys.argv[1]
dict_ids_seqs = get_dict_ids_seqs(file_path)
list_ids = list(dict_ids_seqs) # This will convert the dict_keys to a list
list_combined_ids = list(combinations(list_ids, 2))
compute_using_multi_processing(list_combined_ids, dict_ids_seqs)
if __name__ == '__main__':
main()
Thank you for your help.
Edit : add the complete code for fun_job (after #Booboo answer)
from Bio import Align
def fun_job(seq_1, seq_2): # seq_i : (id, string)
start = time.time()
aligner = Align.PairwiseAligner()
aligner.mode = 'global'
score_dist = aligner.score(seq_1[1],seq_2[1])
end = time.time()
return seq_1[0], seq_2[0], score_dist, end - start # id seq1, id seq2, score, time
When the with ... as executor: block exits, there is an implicit call to executor.shutdown(wait=True). This will wait for all pending futures to to be done executing "and the resources associated with the executor have been freed", which presumably includes terminating the processes in the pool (if possible?). Why your program terminates (or does it?) or at least you say all the futures have completed executing, while the processes have not terminated is a bit of a mystery. But you haven't provided the code for fun_job, so who can say why this is so?
One thing you might try is to switch to using the multiprocessing.pool.Pool class from the multiprocessing module. It supports a terminate method, which is implicitly called when its context manager with block exits, that explicitly attempts to terminate all processes in the pool:
#import concurrent.futures
import multiprocessing
... # etc.
def compute_using_multi_processing(list_comb_ids, dict_ids_seqs):
start = time.perf_counter()
with multiprocessing.Pool(processes=nb_cpu) as executor:
results = executor.map(help_fun_job,
[((pair_ids[0], dict_ids_seqs[pair_ids[0]]), (pair_ids[1], dict_ids_seqs[pair_ids[1]]))
for pair_ids in list_comb_ids])
save_results_to_csv(results)
finish = time.perf_counter()
proccessing_time = str(datetime.timedelta(seconds=round(finish - start, 2)))
print(f' Processing time Finished in {proccessing_time} hh:mm:ss')

Python - multithread with multiple arrays passing args to function

I'm trying to implement multithreading to a very time consuming program, and I've come across this SO answer:
https://stackoverflow.com/a/28463266/3451339, which basically offers this solution for multiple arrays:
from multiprocessing.dummy import Pool as ThreadPool
pool = ThreadPool(4)
results = pool.map(my_function, my_array)
# Close the pool and wait for the work to finish
pool.close()
pool.join()
and, passing multiple arrays:
results = pool.starmap(function, zip(list_a, list_b))
The following is the code I have so far which must be refactored with threading. It iterates over 4 arrays, and needs to pass arguments to the function at each iteration and append all results to a final container:
strategies = ['strategy_1', 'strategy_2']
budgets = [90,100,110,120,130,140,150,160]
formations=['343','352','433','442','451','532','541']
models = ['model_1', 'model_2', 'model_3']
all_teams = pd.DataFrame()
for strategy in strategies:
for budget in budgets:
for formation in formations:
for model in models:
team = function(strategy=strategy,
budget=budget,
curr_formation=formation,
model=model)
all_teams = all_teams.append(team, ignore_index=True, sort=False)\
.reset_index(drop=True)\
.copy()
Note: Each function call makes api web requests.
What is the way to go with multithreading in this scenario?
Python has the multiprocessing module which can run multiple tasks in parallel and inside each process you can have multiple threads or async io code
Here is a working example which uses 3 Processes and Multithreading
import pandas as pd
import multiprocessing
from multiprocessing import Queue
from threading import Thread
strategies = ['strategy_1', 'strategy_2']
budgets = [90,100,110,120,130,140,150,160]
formations=['343','352','433','442','451','532','541']
models = ['model_1', 'model_2', 'model_3']
#shared Queue if you want to reduce write locking use 3 Queues
Q = Queue()
# Retrive async if you want to speed up the process
def function(q,strategy,budget,curr_formation,model):
q.put("Team")
def runTask(model,q):
for strategy in strategies:
for budget in budgets:
for formation in formations:
Thread(target=function,args=(q,strategy,budget,formation,model)).start()
def main():
p1 = multiprocessing.Process(target=runTask, args=('model_1',Q))
p2 = multiprocessing.Process(target=runTask, args=('model_2',Q))
p3 = multiprocessing.Process(target=runTask, args=('model_3',Q))
p1.start()
p2.start()
p3.start()
p1.join()
p2.join()
p3.join()
all = []
for i in range(0,Q.qsize()):
all.append(Q.get())
print(all)
print(len(all))
if __name__ == "__main__":
main()
A usefull article Multiprocessing in Python | Set 2
This can be one approach.
Note: Thread vs multiProcess. In this SO, I have provided execution through map, that will not work here as map has limitation on number.
Run your nested for loops and build a list of parameters ==> financial_options
for strategy in strategies:
for budget in budgets:
for formation in formations:
for model in models:
financial_options.append([strategy,budget,formation,model])
financial_options_len=len(financial_options)
Create a new function that will handle API calls
def access_url(url,parameter_list):
#response=requests.get(url) # request goes here
print(parameter_list)
time.sleep(2)
print("sleep done!")
return "Hello"#,parameter_list # return type
now run the threading with these permutation parameters. so complete program will look like this:
import concurrent.futures
import requests # just in case needed
from bs4 import BeautifulSoup # just in case needed
import time
import pandas as pd
def access_url(url,parameter_list):
#response=requests.get(url) # request goes here
print(parameter_list)
time.sleep(2)
print("sleep done!")
return "Hello"#,parameter_list # return type
def multi_threading():
test_url="http://bla bla.com/"
base_url=test_url
THREAD_MULTI_PROCESSING= True
strategies = ['strategy_1', 'strategy_2']
budgets = [90,100,110,120,130,140,150,160]
formations=['343','352','433','442','451','532','541']
models = ['model_1', 'model_2', 'model_3']
all_teams = pd.DataFrame()
start = time.perf_counter() # start time for performance
financial_options=[]
decision_results=[]
for strategy in strategies:
for budget in budgets:
for formation in formations:
for model in models:
financial_options.append([strategy,budget,formation,model])
financial_options_len=len(financial_options)
print(f"Total options:{financial_options_len}")
future_list = []
THREAD_MULTI_PROCESSING_LOOP=True
if THREAD_MULTI_PROCESSING_LOOP:
with concurrent.futures.ThreadPoolExecutor() as executor: # Through executor
for each in range(financial_options_len):
future = executor.submit(access_url,test_url,financial_options[each]) # submit each option
future_list.append(future)
for f1 in concurrent.futures.as_completed(future_list):
r1=f1.result()
decision_results.append(r1)
end = time.perf_counter() # finish time for performance
print(f'Threads: Finished in {round(end - start,2)} second(s)')
df=pd.DataFrame(decision_results)
df.to_csv("multithread_for.csv")
return df,decision_results
df,results=multi_threading()

Stop multiprocess pool when a condition is met and continue with program

I've been trying to wrap my head around multiprocessing using an old python bitcoin mining program. Although relatively useless for mining, I figured this would be a great way to explore multiprocessing. However, I've hit a wall when it comes to stopping the processes when one of them achieves the goal they are all working towards.
I want to kill all multiprocessing pools when one of them finds the solution. Then allow the program to continue. I have tried terminate() and join(). I've attempted to include an Event(). I've tried using Process instead of Pool with the direction of a similar issue here: Killing a multiprocessing process when condition is met. However, same problem. How can I stop all processes after a condition is met without exiting the program with something like sys.exit() that would kill the entire program?
I tried also apply_sync with the direction from this post: Python Multiprocess Pool. How to exit the script when one of the worker process determines no more work needs to be done? However, it did not solve the problem of needing to continue executing the final functions of the program. In fact, it actually slowed the program significantly.
For clarity, I've included the code I tried based on the above mentioned link here:
from multiprocessing import Pool
from hashlib import sha256
import time
def SHA256(text):
return sha256(text.encode("ascii")).hexdigest()
def solution_helper(args):
solution, nonce = do_job(args)
if solution:
print(f"\nNonce Found: {nonce}\n")
return True
else:
return False
class Mining():
def __init__(self, workers, initargs):
self.pool = Pool(processes=workers, initargs=initargs)
def callback(self, result):
if result:
print('Solution Found...Terminating Processes...')
self.pool.terminate()
def do_job(self):
for args in values:
start_nonce = args[0]
end_nonce = args[1]
prefix_str = '0'*difficulty
self.pool.apply_async(solution_helper, args=args, callback=self.callback)
start = time.time()
for nonce in range(start_nonce, end_nonce):
text = str(block_number) + transactions + previous_hash + str(nonce)
new_hash = SHA256(text)
if new_hash.startswith(prefix_str):
print(f"Hashing: {text}")
print(f"\nSuccessfully mined bitcoin with nonce value: {nonce}\n")
print(f"New hash: {new_hash}")
total_time = str((time.time()-start))
print(f"\nEnd mning... Mining took {total_time} seconds\n")
return new_hash, nonce
self.pool.close()
self.pool.join()
print('.Goodbye.')
block_number = 5
transactions = """
bill->steve->20,
jan->phillis->45
"""
previous_hash = '0000000b7c7723e4d3a8654c975fe4dd23d4d37f22d0ea7e5abde2225d1567dc6'
values = [(20000, 100000), (100000, 1000000), (1000000, 10000000), (10000000, 100000000)]
difficulty = 4
m = Mining(5, values)
m.do_job()
Here's the basic concept. It works great to start the processes, but I cannot figure out how to stop them:
from multiprocessing import Pool
from hashlib import sha256
import functools
MAX_NONCE = 1000000000
def SHA256(text):
return sha256(text.encode("ascii")).hexdigest()
def nonce(block_number, transactions, previous_hash, prefix_str):
import time
start = time.time()
for nonce in range(MAX_NONCE):
text = str(block_number) + transactions + previous_hash + str(nonce)
new_hash = SHA256(text)
if new_hash.startswith(prefix_str):
print(f"\nYay! Successfully mined bitcoins with nonce value:{nonce}")
total_time = str((time.time()-start))
print(f"\nend mining. Mining took: {total_time} seconds\n")
print(new_hash + "\n")
def mine(block_number, transactions, previous_hash, prefix_zeros):
from multiprocessing import Pool
with Pool(4) as p:
prefix_str = '0'*prefix_zeros
p.map(nonce(block_number, transactions, previous_hash, prefix_str), [20000, 40000, 60000, 80000, 100000])
if __name__=='__main__':
transactions="""
bill->steve->20,
jan->phillis->45
"""
difficulty=7
print("\nstart mining\n")
new_hash = mine(5, transactions, '0000000b7c7723e4d3a8654c975fe4dd23d4d37f22d0ea7e5abde2225d1567dc6', difficulty)
# Do some other things... Here is where I'd like to get to after the multiproccesses are killed
print(f"\nMission Complete...{new_hash}\n") <---This never gets a chance to happen

Python multi-threading method

I've heard that Python multi-threading is a bit tricky, and I am not sure what is the best way to go about implementing what I need. Let's say I have a function called IO_intensive_function that does some API call which may take a while to get a response.
Say the process of queuing jobs can look something like this:
import thread
for job_args in jobs:
thread.start_new_thread(IO_intense_function, (job_args))
Would the IO_intense_function now just execute its task in the background and allow me to queue in more jobs?
I also looked at this question, which seems like the approach is to just do the following:
from multiprocessing.dummy import Pool as ThreadPool
pool = ThreadPool(2)
results = pool.map(IO_intensive_function, jobs)
As I don't need those tasks to communicate with each other, the only goal is to send my API requests as fast as possible. Is this the most efficient way? Thanks.
Edit:
The way I am making the API request is through a Thrift service.
I had to create code to do something similar recently. I've tried to make it generic below. Note I'm a novice coder, so please forgive the inelegance. What you may find valuable, however, is some of the error processing I found it necessary to embed to capture disconnects, etc.
I also found it valuable to perform the json processing in a threaded manner. You have the threads working for you, so why go "serial" again for a processing step when you can extract the info in parallel.
It is possible I will have mis-coded in making it generic. Please don't hesitate to ask follow-ups and I will clarify.
import requests
from multiprocessing.dummy import Pool as ThreadPool
from src_code.config import Config
with open(Config.API_PATH + '/api_security_key.pem') as f:
my_key = f.read().rstrip("\n")
f.close()
base_url = "https://api.my_api_destination.com/v1"
headers = {"Authorization": "Bearer %s" % my_key}
itm = list()
itm.append(base_url)
itm.append(headers)
def call_API(call_var):
base_url = call_var[0]
headers = call_var[1]
call_specific_tag = call_var[2]
endpoint = f'/api_path/{call_specific_tag}'
connection_tries = 0
for i in range(3):
try:
dat = requests.get((base_url + endpoint), headers=headers).json()
except:
connection_tries += 1
print(f'Call for {api_specific_tag} failed after {i} attempt(s). Pausing for 240 seconds.')
time.sleep(240)
else:
break
tag = list()
vars_to_capture_01 = list()
vars_to_capture_02 = list()
connection_tries = 0
try:
if 'record_id' in dat:
vars_to_capture_01.append(dat['record_id'])
vars_to_capture_02.append(dat['second_item_of_interest'])
else:
vars_to_capture_01.append(call_specific_tag)
print(f'Call specific tag {call_specific_tag} is unavailable. Successful pull.')
vars_to_capture_02.append(-1)
except:
print(f'{call_specific_tag} is unavailable. Unsuccessful pull.')
vars_to_capture_01.append(call_specific_tag)
vars_to_capture_02.append(-1)
time.sleep(240)
pack = list()
pack.append(vars_to_capture_01)
pack.append(vars_to_capture_02)
return pack
vars_to_capture_01 = list()
vars_to_capture_02 = list()
i = 0
max_i = len(all_tags)
while i < max_i:
ind_rng = range(i, min((i + 10), (max_i)), 1)
itm_lst = (itm.copy())
call_var = [itm_lst + [all_tags[q]] for q in ind_rng]
#packed = call_API(call_var[0]) # for testing of function without pooling
pool = ThreadPool(len(call_var))
packed = pool.map(call_API, call_var)
pool.close()
pool.join()
for pack in packed:
try:
vars_to_capture_01.append(pack[0][0])
except:
print(f'Unpacking error for {all_tags[i]}.')
vars_to_capture_02.append(pack[1][0])
For network API request you can use asyncio. Have a look at this article https://realpython.com/python-concurrency/#asyncio-version for an example how to implement it.

Categories