Overwrite an appended list during a background scheduling process using apscheduler - python

Every time the program below runs, the output keeps adding to previous outputs because Object_List keeps appending in the background while apscheduler runs the job on an interval. What I need is a real-time, up-to-date list of the objects in the bucket. Right now I get an accumulated list that includes objects that are no longer in the bucket, because the list is never cleared. When I run the program manually, I get the expected results because the list is appended to once and the process completes. Is there a way to run this program in the background and have a fresh list built each time the program produces its output? The program uses an exclude list to filter out unwanted results.
import boto3
from plyer import notification
from apscheduler.schedulers.background import BackgroundScheduler
from apscheduler.schedulers.background import BlockingScheduler
from time import sleep
import datetime
import schedule
import time

Exclude_List = []
Object_List = []
FTP_File_List = []

file = open('ftp_exclude.txt', 'r')
excplist = file.readlines()
file.close()

for x in excplist:
    Exclude_List.append(x.strip())

def AWS_PROD_Check():
    print(f"AWS_PROD START: {datetime.datetime.now()}")
    session = boto3.Session(profile_name='My_Profile')
    s3 = session.resource('s3')
    my_bucket = s3.Bucket('my_bucket')
    objects = my_bucket.objects.filter(Prefix='My_folder/')
    for object in objects:
        Object_List.append(object.key)
    FTP_File_List = set(Object_List) - {x for y in Exclude_List for x in Object_List if y in x}
    FTP_File_List_Sorted = sorted(FTP_File_List)
    for x in FTP_File_List_Sorted:
        if '/My_directory/' in x and '.' in x:
            print(x)
    print(f"AWS_PROD END: {datetime.datetime.now()}")
    notification.notify(
        title='AWS_PROD Check',
        message='Report Generated',
        app_icon=None,
        timeout=20, )

AWS_PROD_Check()

sched = BackgroundScheduler()
sched.add_job(AWS_PROD_Check, 'interval', minutes=5)
sched.start()

while True:
    sleep(1)
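
One likely direction (a sketch, not a confirmed fix, reusing the names from the code above): make the object list local to AWS_PROD_Check, so each scheduled run rebuilds it from the bucket instead of extending the previous run's results.

# Sketch only: the list is created inside the job, so every scheduled run
# starts empty instead of appending to the module-level Object_List.
def AWS_PROD_Check():
    print(f"AWS_PROD START: {datetime.datetime.now()}")
    object_list = []  # local, rebuilt on every run
    session = boto3.Session(profile_name='My_Profile')
    s3 = session.resource('s3')
    my_bucket = s3.Bucket('my_bucket')
    for obj in my_bucket.objects.filter(Prefix='My_folder/'):
        object_list.append(obj.key)
    ftp_file_list = set(object_list) - {x for y in Exclude_List for x in object_list if y in x}
    for x in sorted(ftp_file_list):
        if '/My_directory/' in x and '.' in x:
            print(x)
    print(f"AWS_PROD END: {datetime.datetime.now()}")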

Related

Discord py - multiprocessing blocks tasks.loop

I am creating a discord bot with Python on Replit.
One function of the bot is to check whether the current time is equal to a given time, so I have a tasks.loop event that loops every second. Another function of the bot is a command that generates a graph from data taken from an API.
Both blocks of code run fine on their own, but sometimes after calling the graph command the tasks.loop stops: now is no longer printed every second after bot.pt_list is printed. The following is my code:
import datetime
from discord.ext import tasks
from multiprocessing import Pool
import requests

@tasks.loop(seconds = 1)
async def notif():
    now = datetime.datetime.now() + datetime.timedelta(hours = 8)
    now = now.strftime("%H:%M:%S")
    print(now)

bot.pt_list = []

@bot.command(name = 'graph')
async def graph(ctx):
    bot.rank = rank
    timestamp_url = "https://api.sekai.best/event/29/rankings/time?region=tw"
    timestamp_response = requests.get(timestamp_url)
    timestamp_data = timestamp_response.json()["data"]
    i = 1
    timestamp_filtered = []
    while i <= len(timestamp_data):
        timestamp_filtered.append(timestamp_data[i])
        i += 12
    timestamp_url = []
    if __name__ == '__main__':
        for timestamp in timestamp_filtered:
            timestamp_url.append("https://api.sekai.best/event/29/rankings?region=tw&timestamp=" + timestamp)
        with Pool(20) as p:
            bot.pt_list = p.map(pt, timestamp_url)
        print(bot.pt_list)

def pt(timestamp_url):
    pt_response = requests.get(timestamp_url)
    pt_data = pt_response.json()["data"]["eventRankings"]
    for data in pt_data:
        if data["rank"] == 1:
            return data["score"]
And below is the output:
# prints time every second
15:03:01
15:03:02
15:03:03
15:03:04
[414505, 6782930, 13229090, 19650440, 27690605, 34044730, 34807680, 38346228, 43531083, 48973205, 52643633, 56877023, 62323476, 67464731, 69565641, 74482140, 78791756, 84277236, 87191476, 91832031, 97207348, 102692443, 104280559, 106288572, 111710142, 112763082, 112827552, 113359257, 116211652, 117475362, 117529967, 117560102, 118293877, 118293877, 118430000, 118430000]
15:03:15
15:03:15
# printing stops
However, the tasks.loop does not get stopped every time; sometimes it works and keeps printing now after printing bot.pt_list. I'm relatively new to Python and I don't know what the issue is. Could someone help explain why this is happening and how to fix it? Thank you!
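
One commonly suggested direction (a sketch only, with hypothetical helpers fetch_scores and build_timestamp_urls standing in for the requests/Pool logic above): keep the blocking HTTP work off the event loop by running it in a worker thread, so the one-second tasks.loop keeps getting scheduled while the graph data downloads.

# Sketch, not the original code: the blocking fetch runs in a thread via
# run_in_executor, so the coroutine yields control back to the event loop.
import asyncio

def fetch_scores(urls):
    # synchronous, blocking work lives here (requests.get calls, etc.)
    return [pt(u) for u in urls]

@bot.command(name='graph')
async def graph(ctx):
    urls = build_timestamp_urls()  # hypothetical helper building the URL list
    loop = asyncio.get_running_loop()
    bot.pt_list = await loop.run_in_executor(None, fetch_scores, urls)
    print(bot.pt_list)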

Multithread or multiprocess

Currently I am using multiprocessing to run these 3 functions together.
Since only tokens changes between them, is it recommended to switch to multithreading? (If yes, will it actually give a speed-up? I assume memory use would certainly be lower.)
This is my code:
from database_function import *
from kiteconnect import KiteTicker
import pandas as pd
from datetime import datetime, timedelta
import schedule
import time
from multiprocessing import Process

def tick_A():
    # credentials code here
    tokens = [x[0] for x in db_fetchquery("SELECT zerodha FROM script ORDER BY id ASC LIMIT 50")]  # FETCHING FIRST 50 SCRIPTS TOKEN
    # print(tokens)

    ##### TO MAKE SURE THE TASK STARTS AFTER 8:59 ONLY ###########
    t = datetime.today()
    future = datetime(t.year, t.month, t.day, 8, 59)
    if ((future - t).total_seconds()) < 0:
        future = datetime(t.year, t.month, t.day, t.hour, t.minute, (t.second + 2))
    time.sleep((future - t).total_seconds())
    ##### TO MAKE SURE THE TASK STARTS AFTER 8:59 ONLY ###########

    def on_ticks(ws, ticks):
        global ltp
        ltp = ticks[0]["last_price"]
        for tick in ticks:
            print(f"{tick['instrument_token']}A")
            db_runquery(f'UPDATE SCRIPT SET ltp = {tick["last_price"]} WHERE zerodha = {tick["instrument_token"]}')  # UPDATING LTP IN DATABASE
            # print(f"{tick['last_price']}")

    def on_connect(ws, response):
        # print(f"response from connect :: {response}")
        # Subscribe to a list of instrument_tokens (TOKENS FETCHED ABOVE WILL BE SUBSCRIBED HERE).
        # logging.debug("on connect: {}".format(response))
        ws.subscribe(tokens)
        ws.set_mode(ws.MODE_LTP, tokens)  # SETTING TOKEN TO TICK MODE (LTP / FULL / QUOTE)

    kws.on_ticks = on_ticks
    kws.on_connect = on_connect
    kws.connect(threaded=True)

    ##### TO STOP THE TASK AFTER 15:32 #######
    end_time = datetime(t.year, t.month, t.day, 15, 32)
    while True:
        schedule.run_pending()
        # time.sleep(1)
        if datetime.now() > end_time:
            break
    ##### TO STOP THE TASK AFTER 15:32 #######

def tick_B():
    # everything remains the same, only the tokens value changes
    tokens = [x[0] for x in db_fetchquery("SELECT zerodha FROM script ORDER BY id ASC OFFSET (50) ROWS FETCH NEXT (50) ROWS ONLY")]

def tick_C():
    # everything remains the same, only the tokens value changes
    tokens = [x[0] for x in db_fetchquery("SELECT zerodha FROM script ORDER BY id ASC OFFSET (100) ROWS FETCH NEXT (50) ROWS ONLY")]

if __name__ == '__main__':
    def runInParallel(*fns):
        proc = []
        for fn in fns:
            p = Process(target=fn)
            p.start()
            proc.append(p)
        for p in proc:
            p.join()

    runInParallel(tick_A, tick_B, tick_C)
Most Python implementations do not have true multi-threading, because they use a global interpreter lock (GIL), so only one thread runs Python code at a time.
For I/O-heavy applications it should not make a difference, but if you need CPU-heavy operations done in parallel (and since you use pandas, the answer is probably yes) you are better off staying with a multi-process app.
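
For comparison, the thread-based equivalent of runInParallel is only a small change (a sketch, assuming tick_A, tick_B and tick_C stay as they are); because of the GIL it is only worth trying if the functions spend most of their time waiting on the websocket and database rather than in pandas:

# Sketch: same fan-out as runInParallel, but with threads instead of processes.
# Threads share one interpreter (and the GIL), so this helps only for I/O-bound work.
from threading import Thread

def run_in_threads(*fns):
    threads = [Thread(target=fn) for fn in fns]
    for t in threads:
        t.start()
    for t in threads:
        t.join()

if __name__ == '__main__':
    run_in_threads(tick_A, tick_B, tick_C)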

Python multiprocessing finish the work correctly, but the processes still alive (Linux)

I use Python multiprocessing to compute scores on DNA sequences from a large file.
For that I wrote and use the script below.
I use a Linux machine with 48 CPUs in a Python 3.8 environment.
The code works fine, finishes the work correctly and prints the processing time at the end.
Problem: when I use the htop command, I find that all 48 processes are still alive.
I don't know why, and I don't know what to add to my script to avoid this.
import csv
import sys
import datetime  # needed for datetime.timedelta below
import concurrent.futures
from itertools import combinations
import psutil
import time

nb_cpu = psutil.cpu_count(logical=False)

def fun_job(seq_1, seq_2):  # seq_i : (id, string)
    start = time.time()
    score_dist = compute_score_dist(seq_1[1], seq_2[1])
    end = time.time()
    return seq_1[0], seq_2[0], score_dist, end - start  # id seq1, id seq2, score, time

def help_fun_job(nested_pair):
    return fun_job(nested_pair[0], nested_pair[1])

def compute_using_multi_processing(list_comb_ids, dict_ids_seqs):
    start = time.perf_counter()
    with concurrent.futures.ProcessPoolExecutor(max_workers=nb_cpu) as executor:
        results = executor.map(help_fun_job,
                               [((pair_ids[0], dict_ids_seqs[pair_ids[0]]), (pair_ids[1], dict_ids_seqs[pair_ids[1]]))
                                for pair_ids in list_comb_ids])
        save_results_to_csv(results)
    finish = time.perf_counter()
    processing_time = str(datetime.timedelta(seconds=round(finish - start, 2)))
    print(f' Processing time Finished in {processing_time} hh:mm:ss')

def main():
    print("nb_cpu in this machine : ", nb_cpu)
    file_path = sys.argv[1]
    dict_ids_seqs = get_dict_ids_seqs(file_path)
    list_ids = list(dict_ids_seqs)  # This will convert the dict_keys to a list
    list_combined_ids = list(combinations(list_ids, 2))
    compute_using_multi_processing(list_combined_ids, dict_ids_seqs)

if __name__ == '__main__':
    main()
Thank you for your help.
Edit: added the complete code for fun_job (after @Booboo's answer)
from Bio import Align

def fun_job(seq_1, seq_2):  # seq_i : (id, string)
    start = time.time()
    aligner = Align.PairwiseAligner()
    aligner.mode = 'global'
    score_dist = aligner.score(seq_1[1], seq_2[1])
    end = time.time()
    return seq_1[0], seq_2[0], score_dist, end - start  # id seq1, id seq2, score, time
When the with ... as executor: block exits, there is an implicit call to executor.shutdown(wait=True). This waits for all pending futures to be done executing "and the resources associated with the executor have been freed", which presumably includes terminating the processes in the pool (if possible?). Why your program terminates (or does it?), or at least why all the futures have completed executing while the processes have not terminated, is a bit of a mystery. But you haven't provided the code for fun_job, so who can say why this is so?
One thing you might try is to switch to using the multiprocessing.pool.Pool class from the multiprocessing module. It supports a terminate method, which is implicitly called when its with context-manager block exits and which explicitly attempts to terminate all processes in the pool:
# import concurrent.futures
import multiprocessing
... # etc.

def compute_using_multi_processing(list_comb_ids, dict_ids_seqs):
    start = time.perf_counter()
    with multiprocessing.Pool(processes=nb_cpu) as executor:
        results = executor.map(help_fun_job,
                               [((pair_ids[0], dict_ids_seqs[pair_ids[0]]), (pair_ids[1], dict_ids_seqs[pair_ids[1]]))
                                for pair_ids in list_comb_ids])
        save_results_to_csv(results)
    finish = time.perf_counter()
    processing_time = str(datetime.timedelta(seconds=round(finish - start, 2)))
    print(f' Processing time Finished in {processing_time} hh:mm:ss')
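
As a quick sanity check (a sketch, not part of the answer above), multiprocessing.active_children() lists any worker processes that are still alive after the with block exits, mirroring what htop shows:

# Diagnostic sketch: any pool worker that has not exited yet will show up here.
import multiprocessing

leftovers = multiprocessing.active_children()
print(f"workers still alive: {len(leftovers)}")
for proc in leftovers:
    print(proc.pid, proc.is_alive())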

Get output as a list from time module

I have code that runs every 2 seconds and prints coordinate information each time. I want to collect these coordinates in a list but I cannot. How can I do that?
Code:
import time
import requests
import schedule

def executeSomething():
    r = requests.get('https://get.geojs.io/')
    ip_request = requests.get("https://get.geojs.io/v1/ip.json")
    ippAdd = ip_request.json()["ip"]
    url = 'https://get.geojs.io/v1/ip/geo/' + ippAdd + '.json'
    geo_request = requests.get(url)
    geo_data = geo_request.json()
    co = []
    co.append([float(geo_data["latitude"]), float(geo_data["longitude"])])
    print(co)

schedule.every(2).seconds.do(executeSomething)  # This code runs every 2 seconds
# schedule.every().hour.do(executeSomething())

while 1:
    schedule.run_pending()
    time.sleep(1)
Output:
[[39.9208, 32.8375]]
[[39.7856, 32.2174]]
But I want output like this:
[[39.9208, 32.8375], [39.7856, 32.2174]]
Edit:
I have another problem. When I change print(co) to return co, import this function into another script, and try to get the "co" list, I cannot get it.
import dynamic
d = dynamic.executeSomething()
print(d)
What am I doing wrong?
You're resetting the list on every run by including co=[] inside your function, since the scheduler calls the function each time.
Move co=[] above and outside of the function:
import time
import requests
import schedule

co = []

def executeSomething():
    r = requests.get('https://get.geojs.io/')
    ip_request = requests.get("https://get.geojs.io/v1/ip.json")
    ippAdd = ip_request.json()["ip"]
    url = 'https://get.geojs.io/v1/ip/geo/' + ippAdd + '.json'
    geo_request = requests.get(url)
    geo_data = geo_request.json()
    co.append([float(geo_data["latitude"]), float(geo_data["longitude"])])
    print(co)

schedule.every(2).seconds.do(executeSomething)  # This code runs every 2 seconds
# schedule.every().hour.do(executeSomething())

while 1:
    schedule.run_pending()
    time.sleep(1)
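
Regarding the edit about importing: schedule calls executeSomething itself, so any return value never reaches you, and the module-level while loop blocks the import. A sketch of one way around this, assuming the code above is saved as dynamic.py: guard the scheduling loop and let the other script read the shared module-level list.

# Sketch, assuming the answer's code lives in dynamic.py.
# The guard keeps "import dynamic" from blocking forever in the while loop.

# dynamic.py
if __name__ == '__main__':
    schedule.every(2).seconds.do(executeSomething)
    while 1:
        schedule.run_pending()
        time.sleep(1)

# other_script.py
import dynamic

dynamic.executeSomething()   # run it once (or schedule it here)
print(dynamic.co)            # the shared, module-level list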

Python Multiple requests

I have a situation where a scheduler job has to make multiple requests to check live user status for 1000 users at a time, but the server limits each API request to a maximum of 50 users. Using the following for-loop approach it takes around 66 seconds for 1000 users (i.e. for 20 API calls).
import requests
from apscheduler.schedulers.blocking import BlockingScheduler

sched = BlockingScheduler()

def shcdulerjob():
    """
    """
    uidlist = todays_userslist()  # Get around 1000 users from table
    # -- DIVIDE LIST BY GIVEN SIZE (here 50)
    split_list = lambda lst, sz: [lst[i:i+sz] for i in range(0, len(lst), sz)]
    idlists = split_list(uidlist, 50)  # SERVER MAX LIMIT - 50 ids/request
    for idlist in idlists:
        apiurl = some_server_url + "&ids=" + str(idlist)
        resp = requests.get(apiurl)
        save_status(resp.json())  # -- Save status to db

if __name__ == "__main__":
    sched.add_job(shcdulerjob, 'interval', minutes=10)
    sched.start()
So:
Is there any workaround to reduce the time required to fetch from the API?
Does Python APScheduler provide any multiprocessing option to process such API requests in a single job?
You could try to apply Python's thread pool from the concurrent.futures module, if the server allows concurrent requests. That way you parallelise the processing rather than the scheduling itself.
There are some good examples in the concurrent.futures documentation (if you're using Python 2, there is a roughly equivalent module).
e.g.
import concurrent.futures
import multiprocessing
import requests
import time
import json

cpu_start_time = time.process_time()
clock_start_time = time.time()

queue = multiprocessing.Queue()
uri = "http://localhost:5000/data.json"
users = [str(user) for user in range(1, 50)]

with concurrent.futures.ThreadPoolExecutor(multiprocessing.cpu_count()) as executor:
    for user_id, result in zip(
            [str(user) for user in range(1, 50)],
            executor.map(lambda x: requests.get(uri, params={"id": x}).content, users)
    ):
        queue.put((user_id, result))

while not queue.empty():
    user_id, rs = queue.get()
    print("User ", user_id, json.loads(rs.decode()))

cpu_end_time = time.process_time()
clock_end_time = time.time()

print("Took {0:.03}s [{1:.03}s]".format(cpu_end_time - cpu_start_time, clock_end_time - clock_start_time))
If you want to use a process pool instead, just make sure you don't use shared resources, e.g. the queue, and write your data out independently.
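
Applied to the scheduler job from the question, the same idea might look roughly like this (a sketch, assuming todays_userslist, some_server_url and save_status behave as in the original code):

# Sketch: fetch the 20 chunks concurrently inside the scheduled job.
# todays_userslist, some_server_url and save_status are the question's own helpers.
import concurrent.futures
import requests

def shcdulerjob():
    uidlist = todays_userslist()
    split_list = lambda lst, sz: [lst[i:i+sz] for i in range(0, len(lst), sz)]
    idlists = split_list(uidlist, 50)

    def fetch(idlist):
        return requests.get(some_server_url + "&ids=" + str(idlist)).json()

    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        for payload in executor.map(fetch, idlists):
            save_status(payload)  # results are yielded in input order as they complete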
