downloadStart = datetime.now()
while (True):
requestURL = transactionAPI.format(page = tempPage,limit = 5000)
response = requests.get(requestURL,headers=headers)
json_data = json.loads(response.content)
tempMomosTransactionHistory.extend(json_data["list"])
if(datetime.fromtimestamp(json_data["list"][-1]["crtime"]) < datetime(datetime.today().year,datetime.today().month,datetime.today().day - dateRange)):
break
tempPage += 1
downloadEnd = datetime.now()
Any suggestions please threading or something like that ?
Outputs here
downloadtime 0:00:02.056010
downloadtime 0:00:05.680806
downloadtime 0:00:05.447945
You need to improve it in two ways.
Optimise code within loop
Parallelize code execution
#1
By looking at your code I can see one improvement ie. create datetime.today object instead of doing 3 times. Check other methods like transactionAPI optimise further.
#2:
If you multi core CPU machine then you take advantage of machine by spanning thread per page. Refer to modified code of above.
import threading
def processRequest(tempPage):
requestURL = transactionAPI.format(page = tempPage,limit = 5000)
response = requests.get(requestURL,headers=headers)
json_data = json.loads(response.content)
tempMomosTransactionHistory.extend(json_data["list"])
downloadStart = datetime.now()
while (True):
#create thread per page
t1 = threading.Thread(target=processRequest, args=(tempPage, ))
t1.start()
#Fetch datetime today object once instaed 3 times
datetimetoday = datetime()
if(datetime.fromtimestamp(json_data["list"][-1]["crtime"]) < datetime(datetimetoday.year,datetimetoday.month,datetimetoday.day - dateRange)):
break
tempPage += 1
downloadEnd = datetime.now()
Related
I am creating a discord bot with Python on Replit.
One function of the bot is that it checks whether the current time is equal to a given time, so I have a tasks.loop event that loops every second. Another function of the bot is a command that generates a graph with data taken from an api.
Both blocks of codes run fine on their own. But sometimes after calling the graph command, it stops the tasks.loop: now is no longer printed every second after bot.pt_list is printed. The following is my code:
import datetime
from discord.ext import tasks
from multiprocessing import Pool
import requests
#tasks.loop(seconds = 1)
async def notif():
now = datetime.datetime.now() + datetime.timedelta(hours = 8)
now = now.strftime("%H:%M:%S")
print(now)
bot.pt_list = []
#bot.command(name = 'graph')
async def graph(ctx):
bot.rank = rank
timestamp_url = "https://api.sekai.best/event/29/rankings/time?region=tw"
timestamp_response = requests.get(timestamp_url)
timestamp_data = timestamp_response.json()["data"]
i = 1
timestamp_filtered = []
while i <= len(timestamp_data):
timestamp_filtered.append(timestamp_data[i])
i += 12
timestamp_url = []
if __name__ == '__main__':
for timestamp in timestamp_filtered:
timestamp_url.append("https://api.sekai.best/event/29/rankings?region=tw×tamp=" + timestamp)
with Pool(20) as p:
bot.pt_list = p.map(pt, timestamp_url)
print(bot.pt_list)
def pt(timestamp_url):
pt_response = requests.get(timestamp_url)
pt_data = pt_response.json()["data"]["eventRankings"]
for data in pt_data:
if data["rank"] == 1:
return data["score"]
And below is the output:
# prints time every second
15:03:01
15:03:02
15:03:03
15:03:04
[414505, 6782930, 13229090, 19650440, 27690605, 34044730, 34807680, 38346228, 43531083, 48973205, 52643633, 56877023, 62323476, 67464731, 69565641, 74482140, 78791756, 84277236, 87191476, 91832031, 97207348, 102692443, 104280559, 106288572, 111710142, 112763082, 112827552, 113359257, 116211652, 117475362, 117529967, 117560102, 118293877, 118293877, 118430000, 118430000]
15:03:15
15:03:15
# printing stops
However, the tasks.loop does not get stopped every time, sometimes it works and will continue to print now after printing bot.pt_list. I'm relatively new to Python and I don't know what the issue is, could someone help explain why this is happening and how to fix this? Thank you!
I store QuertyText within a pandas dataframe. Once I've loaded all the queries into I want to conduct an analysis again each query. Currently, I have ~50k to evaluate. So, doing it one by one, will take a long time.
So, I wanted to implement concurrent.futures. How do I take the individual QueryText stored within fullAnalysis as pass it to concurrent.futures and return the output as a variable?
Here is my entire code:
import pandas as pd
import time
import gensim
import sys
import warnings
from concurrent.futures import ThreadPoolExecutor
from concurrent.futures import as_completed
fullAnalysis = pd.DataFrame()
def fetch_data(jFile = 'ProcessingDetails.json'):
print("Fetching data...please wait")
#read JSON file for latest dictionary file name
baselineDictionaryFileName = 'Dictionary/Dictionary_05-03-2020.json'
#copy data to pandas dataframe
labelled_data = pd.read_json(baselineDictionaryFileName)
#Add two more columns to get the most similar text and score
labelled_data['SimilarText'] = ''
labelled_data['SimilarityScore'] = float()
print("Data fetched from " + baselineDictionaryFileName + " and there are " + str(labelled_data.shape[0]) + " rows to be evalauted")
return labelled_data
def calculateScore(inputFunc):
warnings.filterwarnings("ignore", category=DeprecationWarning)
model = gensim.models.Word2Vec.load('w2v_model_bigdata')
inp = inputFunc
print(inp)
out = dict()
strEvaluation = inp.split("most_similar ",1)[1]
#while inp != 'quit':
split_inp = inp.split()
try:
if split_inp[0] == 'help':
pass
elif split_inp[0] == 'similarity' and len(split_inp) >= 3:
pass
elif split_inp[0] == 'most_similar' and len(split_inp) >= 2:
for pair in model.most_similar(positive=[split_inp[1]]):
out.update({pair[0]: pair[1]})
except KeyError as ke:
#print(str(ke) + "\n")
inp = input()
return out
def main():
with ThreadPoolExecutor(max_workers=5) as executor:
for i in range(len(fullAnalysis)):
text = fullAnalysis['QueryText'][i]
arg = 'most_similar'+ ' ' + text
#for item in executor.map(calculateScore, arg):
output = executor.map(calculateScore, arg)
return output
if __name__ == "__main__":
fullAnalysis = fetch_data()
results = main()
print(f'results: {results}')
The Python Global Interpreter Lock or GIL allows only one thread to hold control of the Python interpreter. Since your function calculateScore might be cpu-bound and requires the interpreter to execute its byte code, you may be gaining little by using threading. If, on the other hand, it were doing mostly I/O operations, it would be giving up the GIL for most of its running time allowing other threads to run. But that does not seem to be the case here. You probably should be using the ProcessPoolExecutor from concurrent.futures (try it both ways and see):
def main():
with ProcessPoolExecutor(max_workers=None) as executor:
the_futures = {}
for i in range(len(fullAnalysis)):
text = fullAnalysis['QueryText'][i]
arg = 'most_similar'+ ' ' + text
future = executor.submit(calculateScore, arg)
the_futures[future] = i # map future to request
for future in as_completed(the_futures): # results as they become available not necessarily the order of submission
i = the_futures[future] # the original index
result = future.result() # the result
If you omit the max_workers parameter (or specify a value of None) from the ProcessPoolExecutor constructor, the default will be the number of processors you have on your machine (not a bad default). There is no point in specifying a value larger than the number of processors you have.
If you do not need to tie the future back to the original request, then the_futures can just be a list to which But simplest yest in not even to bother to use the as_completed method:
def main():
with ProcessPoolExecutor(max_workers=5) as executor:
the_futures = []
for i in range(len(fullAnalysis)):
text = fullAnalysis['QueryText'][i]
arg = 'most_similar'+ ' ' + text
future = executor.submit(calculateScore, arg)
the_futures.append(future)
# wait for the completion of all the results and return them all:
results = [f.result() for f in the_futures()] # results in creation order
return results
It should be mentioned that code that launches the ProcessPoolExecutor functions should be in a block governed by a if __name__ = '__main__':. If it isn't you will get into a recursive loop with each subprocess launching the ProcessPoolExecutor. But that seems to be the case here. Perhaps you meant to use the ProcessPoolExecutor all along?
Also:
I don't know what the line ...
model = gensim.models.Word2Vec.load('w2v_model_bigdata')
... in function calculateStore does. It may be the one i/o-bound statement. But this appears to be something that does not vary from call to call. If that is the case and model is not being modified in the function, shouldn't this statement be moved out of the function and computed just once? Then this function would clearly run faster (and be clearly cpu-bound).
Also:
The exception block ...
except KeyError as ke:
#print(str(ke) + "\n")
inp = input()
... is puzzling. You are inputting a value that will never be used right before returning. If this is to pause execution, there is no error message being output.
With Booboo assistance, I was able to update code to include ProcessPoolExecutor. Here is my updated code. Overall, processing has been speed up by more than 60%.
I did run into a processing issue and found this topic BrokenPoolProcess that addresses the issue.
output = {}
thePool = {}
def main(labelled_data, dictionaryRevised):
args = sys.argv[1:]
with ProcessPoolExecutor(max_workers=None) as executor:
for i in range(len(labelled_data)):
text = labelled_data['QueryText'][i]
arg = 'most_similar'+ ' '+ text
output = winprocess.submit(
executor, calculateScore, arg
)
thePool[output] = i #original index for future to request
for output in as_completed(thePool): # results as they become available not necessarily the order of submission
i = thePool[output] # the original index
text = labelled_data['QueryText'][i]
result = output.result() # the result
maximumKey = max(result.items(), key=operator.itemgetter(1))[0]
maximumValue = result.get(maximumKey)
labelled_data['SimilarText'][i] = maximumKey
labelled_data['SimilarityScore'][i] = maximumValue
return labelled_data, dictionaryRevised
if __name__ == "__main__":
start = time.perf_counter()
print("Starting to evaluate Query Text for labelling...")
output_Labelled_Data, output_dictionary_revised = preProcessor()
output,dictionary = main(output_Labelled_Data, output_dictionary_revised)
finish = time.perf_counter()
print(f'Finished in {round(finish-start, 2)} second(s)')
Im trying to make Instagram scraper with one Python lib. It goes well but its very slow. Im trying to speed it up by using multithreading but the other problem occurred.
This is the code without multithreading, it works good:
import threading
import instaloader
import time
L = instaloader.Instaloader()
def func1(name):
first = []
posts = instaloader.Profile.from_username(L.context, name).get_posts()
posts = list(posts)
for p in posts[0:5]:
first.append(p)
return first
def func2(name):
second = []
posts = instaloader.Profile.from_username(L.context, name).get_posts()
posts = list(posts)
for p in posts[5:10]:
second.append(p)
return second
t = time.time()
print(func1('eminem'))
print(func2('eminem'))
print(time.time()-t) # this is 47.43 seconds
But when I try to use multithreading, i see that time of execution of my code is much shorter, but I do not get the result, It does not work with 'return' statement. I need to use the return statement because this is only just a part of the code, so I cant use print.
This is the code with threads:
L = instaloader.Instaloader()
def func1(name):
first = []
posts = instaloader.Profile.from_username(L.context, name).get_posts()
posts = list(posts)
for p in posts[0:5]:
first.append(p)
return first
def func2(name):
second = []
posts = instaloader.Profile.from_username(L.context, name).get_posts()
posts = list(posts)
for p in posts[5:10]:
second.append(p)
return second
t = time.time()
t1 = threading.Thread(target = func1, args=('eminem',))
t2 = threading.Thread(target = func2, args=('eminem',))
t1.start()
t2.start()
t1.join()
t2.join()
print(time.time()-t) # this is 25.36 seconds
What am I doing wrong?
The easiest way in your case is to pass a shared data structure with distinguished keys to accumulate results from different functions:
Instead of using local lists first = [] ; second = [] - append result to shared structure like:
def func1(name, results):
...
for p in posts[0:5]:
results['func1'].append(p)
The same for func2 function.
results = {'func1': [], 'func2': []}
t1 = threading.Thread(target = func1, args=('eminem', results))
t2 = threading.Thread(target = func2, args=('eminem', results))
t1.start()
t2.start()
t1.join()
t2.join()
print(results)
Another option is using concurrent.futures.Executor.submit approach.
I've just released a module that could help you with your project and its scalability. Take a look at Akuanduba README file and see if it works for you.
I need to get all the elements on a page and iterate through them to search each element.
currently I am using, driver.find_elements_by_xpath('//*[#*]')
However, there can be a delay in completing the line of code above on larger pages. Is there a way to retrieve the results in increments of 100 elements? Or at least add a timeout?
Terminating driver.find_elements_by_xpath('//*[#*]') inside a multithread is the only why I currently think I can solve this.
I need to find all elements on a page that contain certain strings. For example. elem.get_attribute('outerHTML').find('type="submit"') != -1 … and so on and so forth … I also need their proximity to each other to compare index positions
Thanks!
import Globalz ###### globals import is an empty .py file
import threading
import time
import ctypes
def find_xpath():
for i in range(5):
print(i)
time.sleep(1)
Globalz.curr_value = 'DONE!'
### this is where the xpath retrieval goes (ABOVE loop is for example purposes only)
def stopwatch(info):
curr_time = 0
failed = False
Globalz.curr_value = ''
thread1 = threading.Thread(target=info['function'])
thread1.start()
while thread1.is_alive() is True:
if curr_time >= info['timeout']: failed = True; ctypes.pythonapi.PyThreadState_SetAsyncExc(ctypes.c_long(thread1.ident), ctypes.py_object(SystemExit))
curr_time += 1; time.sleep(1)
if failed is True: return info['failed_returns']
if failed is False: return Globalz.curr_value
betty = stopwatch({'function': find_xpath, 'timeout': 10, 'failed_returns': 'failed'})
print(betty)
If anyone is interested here is a solution. I've created a wrapper called stopwatch()
I have a Process pool in python that is starting processes as normal, however, I have just realized that these processes are not closed after the completion (I know that they completed as the last statement is a file write).
Below the code, with an example function ppp:
from multiprocessing import Pool
import itertools
def ppp(element):
window,day = element
print(window,day)
time.sleep(10)
if __name__ == '__main__': ##The line marked
print('START')
start_time = current_milli_time()
days = ['0808', '0810', '0812', '0813', '0814', '0817', '0818', '0827']
windows = [1000,2000,3000,4000,5000,10000,15000, 20000,30000,60000,120000,180000]
processes_args = list(itertools.product(windows, days))
pool = Pool(8)
results = pool.map(ppp, processes_args)
pool.close()
pool.join()
print('END', current_milli_time()-start_time)
I am working on Linux, Ubuntu 16.04. Everything was working fine before I added the line marked in the example. I am wondering if that behavior can be related to the missing of a return statement. Anyway, that is what looks like my 'htop':
As you can see, no process is closed, but all have completed their work.
I found that related question: Python Multiprocessing pool.close() and join() does not close processes, however, I have not understood if the solution to this problem is to use map_async instead of map.
EDIT: real function code:
def process_day(element):
window,day = element
noise = 0.2
print('Processing day:', day,', window:', window)
individual_files = glob.glob('datan/'+day+'/*[0-9].csv')
individual = readDataset(individual_files)
label_time = individual.loc[(individual['LABEL_O'] != -2) | (individual['LABEL_F'] != -2), 'TIME']
label_time = list(np.unique(list(label_time)))
individual = individual[individual['TIME'].isin(label_time)]
#Saving IDs for further processing
individual['ID'] = individual['COLLAR']
#Time variable in seconds for aggregation and merging
individual['TIME_S'] = individual['TIME'].copy()
noise_x = np.random.normal(0,noise,len(individual))
noise_y = np.random.normal(0,noise,len(individual))
noise_z = np.random.normal(0,noise,len(individual))
individual['X_AXIS'] = individual['X_AXIS'] + noise_x
individual['Y_AXIS'] = individual['Y_AXIS'] + noise_y
individual['Z_AXIS'] = individual['Z_AXIS'] + noise_z
#Time syncronization (applying milliseconds for time series processing)
print('Time syncronization:')
with progressbar.ProgressBar(max_value=len(individual.groupby('ID'))) as bar:
for baboon,df_baboon in individual.groupby('ID'):
times = list(df_baboon['TIME'].values)
d = Counter(times)
result = []
for timestamp in np.unique(times):
for i in range(0,d[timestamp]):
result.append(str(timestamp+i*1000/d[timestamp]))
individual.loc[individual['ID'] == baboon,'TIME'] = result
bar.update(1)
#Time series process
ts_process = time_series_processing(window, 'TIME_S', individual, 'COLLAR', ['COLLAR', 'TIME', 'X_AXIS','Y_AXIS','Z_AXIS'])
#Aggregation and tsfresh
ts_process.do_process()
individual = ts_process.get_processed_dataframe()
individual.to_csv('noise2/processed_data/'+str(window)+'/agg/'+str(day)+'.csv', index = False)
#NEtwork inference process
ni = network_inference_process(individual, 'TIME_S_mean')
#Inference
ni.do_process()
final = ni.get_processed_dataframe()
final.to_csv('noise2/processed_data/'+str(window)+'/net/'+str(day)+'.csv', index = False)
#Saving not aggregated ground truth
ground_truth = final[['ID_mean', 'TIME_S_mean', 'LABEL_O_values', 'LABEL_F_values']].copy()
#Neighbor features process
neighbors_features_f = ni.get_neighbor_features(final, 'TIME_S_mean', 'ID_mean')
neighbors_features_f = neighbors_features_f.drop(['LABEL_O_values_n', 'LABEL_F_values_n'], axis=1)
neighbors_features_f.to_csv('noise2/processed_data/'+str(window)+'/net/'+str(day)+'_neigh.csv', index = False)
# Final features dataframe
final_neigh = pd.merge(final, neighbors_features_f, how='left', left_on=['TIME_S_mean','ID_mean'], right_on = ['TIME_S_mean_n','BABOON_NODE_n'])
final_neigh.to_csv('noise2/processed_data/'+str(window)+'/complete/'+str(day)+'.csv', index = False)
return
So as you can see, the last statement is a write to file, and it is executed by all the processes, I do not actually think that the problem is inside this function.