Using concurrent.futures within a for statement - python

I store QueryText within a pandas dataframe. Once I've loaded all the queries, I want to conduct an analysis against each query. Currently, I have ~50k to evaluate, so doing it one by one will take a long time.
So, I wanted to implement concurrent.futures. How do I take the individual QueryText stored within fullAnalysis, pass it to concurrent.futures, and return the output as a variable?
Here is my entire code:
import pandas as pd
import time
import gensim
import sys
import warnings

from concurrent.futures import ThreadPoolExecutor
from concurrent.futures import as_completed

fullAnalysis = pd.DataFrame()

def fetch_data(jFile = 'ProcessingDetails.json'):
    print("Fetching data...please wait")

    #read JSON file for latest dictionary file name
    baselineDictionaryFileName = 'Dictionary/Dictionary_05-03-2020.json'

    #copy data to pandas dataframe
    labelled_data = pd.read_json(baselineDictionaryFileName)

    #Add two more columns to get the most similar text and score
    labelled_data['SimilarText'] = ''
    labelled_data['SimilarityScore'] = float()

    print("Data fetched from " + baselineDictionaryFileName + " and there are " + str(labelled_data.shape[0]) + " rows to be evalauted")

    return labelled_data

def calculateScore(inputFunc):
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    model = gensim.models.Word2Vec.load('w2v_model_bigdata')

    inp = inputFunc
    print(inp)

    out = dict()

    strEvaluation = inp.split("most_similar ",1)[1]

    #while inp != 'quit':
    split_inp = inp.split()
    try:
        if split_inp[0] == 'help':
            pass
        elif split_inp[0] == 'similarity' and len(split_inp) >= 3:
            pass
        elif split_inp[0] == 'most_similar' and len(split_inp) >= 2:
            for pair in model.most_similar(positive=[split_inp[1]]):
                out.update({pair[0]: pair[1]})
    except KeyError as ke:
        #print(str(ke) + "\n")
        inp = input()
    return out

def main():
    with ThreadPoolExecutor(max_workers=5) as executor:
        for i in range(len(fullAnalysis)):
            text = fullAnalysis['QueryText'][i]
            arg = 'most_similar'+ ' ' + text
            #for item in executor.map(calculateScore, arg):
            output = executor.map(calculateScore, arg)
    return output

if __name__ == "__main__":
    fullAnalysis = fetch_data()
    results = main()
    print(f'results: {results}')

The Python Global Interpreter Lock or GIL allows only one thread to hold control of the Python interpreter. Since your function calculateScore might be cpu-bound and requires the interpreter to execute its byte code, you may be gaining little by using threading. If, on the other hand, it were doing mostly I/O operations, it would be giving up the GIL for most of its running time allowing other threads to run. But that does not seem to be the case here. You probably should be using the ProcessPoolExecutor from concurrent.futures (try it both ways and see):
def main():
    with ProcessPoolExecutor(max_workers=None) as executor:
        the_futures = {}
        for i in range(len(fullAnalysis)):
            text = fullAnalysis['QueryText'][i]
            arg = 'most_similar'+ ' ' + text
            future = executor.submit(calculateScore, arg)
            the_futures[future] = i # map future to request
        for future in as_completed(the_futures): # results as they become available, not necessarily in the order of submission
            i = the_futures[future] # the original index
            result = future.result() # the result
If you omit the max_workers parameter (or specify a value of None) from the ProcessPoolExecutor constructor, the default will be the number of processors you have on your machine (not a bad default). There is no point in specifying a value larger than the number of processors you have.
If you do not need to tie the future back to the original request, then the_futures can just be a list to which the futures are appended. But simplest yet is to not even bother to use the as_completed method:
def main():
    with ProcessPoolExecutor(max_workers=5) as executor:
        the_futures = []
        for i in range(len(fullAnalysis)):
            text = fullAnalysis['QueryText'][i]
            arg = 'most_similar'+ ' ' + text
            future = executor.submit(calculateScore, arg)
            the_futures.append(future)
        # wait for the completion of all the results and return them all:
        results = [f.result() for f in the_futures] # results in creation order
    return results
It should be mentioned that code that launches the ProcessPoolExecutor functions should be in a block governed by if __name__ == '__main__':. If it isn't, you will get into a recursive loop, with each subprocess launching the ProcessPoolExecutor. But that does seem to be the case here. Perhaps you meant to use the ProcessPoolExecutor all along?
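A minimal, self-contained skeleton of that structure (the submitted task here is just a placeholder call to the built-in len, not your calculateScore):
from concurrent.futures import ProcessPoolExecutor

def main(rows):
    with ProcessPoolExecutor() as executor:
        futures = [executor.submit(len, row) for row in rows]  # placeholder task
        return [f.result() for f in futures]

if __name__ == '__main__':
    # the guard keeps each spawned worker process from re-executing the pool creation
    print(main(['a few', 'sample', 'rows']))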
Also:
I don't know what the line ...
model = gensim.models.Word2Vec.load('w2v_model_bigdata')
... in function calculateScore does. It may be the one I/O-bound statement. But this appears to be something that does not vary from call to call. If that is the case and model is not being modified in the function, shouldn't this statement be moved out of the function and computed just once? Then this function would clearly run faster (and be clearly cpu-bound).
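For example, with ProcessPoolExecutor (Python 3.7+) the load can be done once per worker process via the initializer argument. A minimal sketch, assuming the same model file name and that calculateScore only reads from model; the args list is hypothetical:
import gensim
from concurrent.futures import ProcessPoolExecutor

model = None  # set once in each worker process

def load_model():
    # runs exactly once per worker process, not once per submitted task
    global model
    model = gensim.models.Word2Vec.load('w2v_model_bigdata')

def calculateScore(arg):
    # model is already loaded here, so each call is pure computation
    ...

if __name__ == '__main__':
    args = []  # hypothetical: one 'most_similar <text>' string per dataframe row
    with ProcessPoolExecutor(initializer=load_model) as executor:
        futures = [executor.submit(calculateScore, a) for a in args]
        results = [f.result() for f in futures]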
Also:
The exception block ...
except KeyError as ke:
#print(str(ke) + "\n")
inp = input()
... is puzzling. You are inputting a value that will never be used right before returning. If this is to pause execution, there is no error message being output.
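If the intent is simply to skip query words that are not in the model's vocabulary, one option (an assumption about the intent, not something stated in the question) is to drop the input() call and just return whatever was collected, e.g. a trimmed sketch of the function:
def calculateScore(inputFunc):
    # model: a gensim Word2Vec model loaded once elsewhere (see the note above)
    out = dict()
    split_inp = inputFunc.split()
    try:
        if split_inp[0] == 'most_similar' and len(split_inp) >= 2:
            for pair in model.most_similar(positive=[split_inp[1]]):
                out[pair[0]] = pair[1]
    except KeyError:
        # the word is not in the Word2Vec vocabulary; just return the (possibly empty) dict
        pass
    return out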

With Booboo's assistance, I was able to update the code to include ProcessPoolExecutor. Here is my updated code. Overall, processing has been sped up by more than 60%.
I did run into a processing issue and found this topic BrokenPoolProcess that addresses the issue.
output = {}
thePool = {}

def main(labelled_data, dictionaryRevised):
    args = sys.argv[1:]

    with ProcessPoolExecutor(max_workers=None) as executor:
        for i in range(len(labelled_data)):
            text = labelled_data['QueryText'][i]
            arg = 'most_similar'+ ' '+ text
            output = winprocess.submit(
                executor, calculateScore, arg
            )
            thePool[output] = i #original index for future to request

        for output in as_completed(thePool): # results as they become available, not necessarily in the order of submission
            i = thePool[output] # the original index
            text = labelled_data['QueryText'][i]
            result = output.result() # the result

            maximumKey = max(result.items(), key=operator.itemgetter(1))[0]
            maximumValue = result.get(maximumKey)

            labelled_data['SimilarText'][i] = maximumKey
            labelled_data['SimilarityScore'][i] = maximumValue

    return labelled_data, dictionaryRevised

if __name__ == "__main__":
    start = time.perf_counter()

    print("Starting to evaluate Query Text for labelling...")

    output_Labelled_Data, output_dictionary_revised = preProcessor()

    output,dictionary = main(output_Labelled_Data, output_dictionary_revised)

    finish = time.perf_counter()
    print(f'Finished in {round(finish-start, 2)} second(s)')

Related

Python multiprocessing finish the work correctly, but the processes still alive (Linux)

I use Python multiprocessing to compute some sort of scores on DNA sequences from a large file.
For that I wrote and use the script below.
I use a Linux machine with 48 CPUs in a Python 3.8 environment.
The code works fine, terminates the work correctly and prints the processing time at the end.
Problem: when I use the htop command, I find that all 48 processes are still alive.
I don't know why, and I don't know what to add to my script to avoid this.
import csv
import sys
import datetime
import concurrent.futures
from itertools import combinations
import psutil
import time

nb_cpu = psutil.cpu_count(logical=False)

def fun_job(seq_1, seq_2): # seq_i : (id, string)
    start = time.time()
    score_dist = compute_score_dist(seq_1[1], seq_2[1])
    end = time.time()
    return seq_1[0], seq_2[0], score_dist, end - start # id seq1, id seq2, score, time

def help_fun_job(nested_pair):
    return fun_job(nested_pair[0], nested_pair[1])

def compute_using_multi_processing(list_comb_ids, dict_ids_seqs):
    start = time.perf_counter()

    with concurrent.futures.ProcessPoolExecutor(max_workers=nb_cpu) as executor:
        results = executor.map(help_fun_job,
                               [((pair_ids[0], dict_ids_seqs[pair_ids[0]]), (pair_ids[1], dict_ids_seqs[pair_ids[1]]))
                                for pair_ids in list_comb_ids])
        save_results_to_csv(results)

    finish = time.perf_counter()
    proccessing_time = str(datetime.timedelta(seconds=round(finish - start, 2)))
    print(f' Processing time Finished in {proccessing_time} hh:mm:ss')

def main():
    print("nb_cpu in this machine : ", nb_cpu)

    file_path = sys.argv[1]
    dict_ids_seqs = get_dict_ids_seqs(file_path)

    list_ids = list(dict_ids_seqs)  # This will convert the dict_keys to a list
    list_combined_ids = list(combinations(list_ids, 2))

    compute_using_multi_processing(list_combined_ids, dict_ids_seqs)

if __name__ == '__main__':
    main()
Thank you for your help.
Edit: added the complete code for fun_job (after @Booboo's answer)
from Bio import Align

def fun_job(seq_1, seq_2): # seq_i : (id, string)
    start = time.time()
    aligner = Align.PairwiseAligner()
    aligner.mode = 'global'
    score_dist = aligner.score(seq_1[1],seq_2[1])
    end = time.time()
    return seq_1[0], seq_2[0], score_dist, end - start # id seq1, id seq2, score, time
When the with ... as executor: block exits, there is an implicit call to executor.shutdown(wait=True). This will wait for all pending futures to be done executing "and the resources associated with the executor have been freed", which presumably includes terminating the processes in the pool (if possible?). Why your program terminates (or does it?), or at least why you say all the futures have completed executing while the processes have not terminated, is a bit of a mystery. But you haven't provided the code for fun_job, so who can say why this is so?
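As a quick sanity check, unrelated to your data, the standalone sketch below should show that on a healthy setup the worker processes are gone once the with block has exited (it uses psutil, which you already have installed):
import os
import time
import concurrent.futures
import psutil

def work(i):
    time.sleep(0.5)
    return os.getpid()

if __name__ == '__main__':
    me = psutil.Process()
    with concurrent.futures.ProcessPoolExecutor(max_workers=4) as executor:
        worker_pids = set(executor.map(work, range(8)))
        print('worker pids while pool is open:', sorted(worker_pids))
    # after the implicit executor.shutdown(wait=True) this should normally print an empty list
    time.sleep(1)
    print('children still alive:', [c.pid for c in me.children()])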
One thing you might try is to switch to using the multiprocessing.pool.Pool class from the multiprocessing module. It supports a terminate method, which is implicitly called when its context manager with block exits, that explicitly attempts to terminate all processes in the pool:
#import concurrent.futures
import multiprocessing
... # etc.

def compute_using_multi_processing(list_comb_ids, dict_ids_seqs):
    start = time.perf_counter()

    with multiprocessing.Pool(processes=nb_cpu) as executor:
        results = executor.map(help_fun_job,
                               [((pair_ids[0], dict_ids_seqs[pair_ids[0]]), (pair_ids[1], dict_ids_seqs[pair_ids[1]]))
                                for pair_ids in list_comb_ids])
        save_results_to_csv(results)

    finish = time.perf_counter()
    proccessing_time = str(datetime.timedelta(seconds=round(finish - start, 2)))
    print(f' Processing time Finished in {proccessing_time} hh:mm:ss')

How do I get the same queue into different multiprocessing files?

I see a lot of tutorials on how to use queues, but they always show them implemented in the same file. I'm trying to organize my code files well from the beginning because I anticipate the project to become very large. How do I get the queue that I initialize in my main file to import into the other function files?
Here is my main file:
import multiprocessing
import queue

from data_handler import data_handler
from get_info import get_memory_info
from get_info import get_cpu_info

if __name__ == '__main__':
    q = queue.Queue()

    getDataHandlerProcess = multiprocessing.Process(target=data_handler(q))
    getMemoryInfoProcess = multiprocessing.Process(target=get_memory_info(q))
    getCPUInfoProcess = multiprocessing.Process(target=get_cpu_info(q))

    getDataHandlerProcess.start()
    getMemoryInfoProcess.start()
    getCPUInfoProcess.start()

    print("DEBUG: All tasks successfully started.")
Here is my producer:
import psutil
import struct
import time

from data_frame import build_frame

def get_cpu_info(q):
    while True:
        cpu_string_data = bytes('', 'utf-8')
        cpu_times = psutil.cpu_percent(interval=0.0, percpu=True)
        for item in cpu_times:
            cpu_string_data = cpu_string_data + struct.pack('<d',item)
        cpu_frame = build_frame(cpu_string_data, 0, 0, -1, -1)
        q.put(cpu_frame)
        print(cpu_frame)
        time.sleep(1.000)

def get_memory_info(q):
    while True:
        memory_string_data = bytes('', 'utf-8')
        virtual_memory = psutil.virtual_memory()
        swap_memory = psutil.swap_memory()
        memory_info = list(virtual_memory+swap_memory)
        for item in memory_info:
            memory_string_data = memory_string_data + struct.pack('<d',item)
        memory_frame = build_frame(memory_string_data, 0, 1, -1, -1)
        q.put(memory_frame)
        print(memory_frame)
        time.sleep(1.000)

def get_disk_info(q):
    while True:
        disk_usage = psutil.disk_usage("/")
        disk_io_counters = psutil.disk_io_counters()
        time.sleep(1.000)
        print(disk_usage)
        print(disk_io_counters)

def get_network_info(q):
    while True:
        net_io_counters = psutil.net_io_counters()
        time.sleep(1.000)
        print(net_io_counters)
And here is my consumer:
def data_handler(q):
    while True:
        next_element = q.get()
        print(next_element)
        print('Item received at data handler queue.')
It is not entirely clear to me what you mean by "How do I get the queue that I initialize in my main file to import into the other function files?".
Normally you pass a queue as an argument to a function and use it within the function's scope, regardless of the file structure. Or you use any of the other variable-sharing techniques that apply to any other data type.
Your code seems to have a few errors, however. Firstly, you shouldn't be using queue.Queue with multiprocessing. The multiprocessing module has its own version of that class:
q = multiprocessing.Queue()
It is slower than queue.Queue, but it works for sharing data across processes.
Secondly, the proper way to create process objects is:
getDataHandlerProcess = multiprocessing.Process(target=data_handler, args = (q,))
Otherwise you are actually calling data_handler(q) in the main thread and trying to assign its return value to the target argument of multiprocessing.Process. Your data_handler function never returns, so the program probably hangs at this point before multiprocessing even begins. Edit: actually it probably goes into an infinite wait trying to get an element from an empty queue which will never be filled.
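Putting both fixes together, the main file would look roughly like this (a sketch; it keeps your module names data_handler and get_info as-is):
import multiprocessing

from data_handler import data_handler
from get_info import get_memory_info
from get_info import get_cpu_info

if __name__ == '__main__':
    # a multiprocessing.Queue can be shared with child processes
    q = multiprocessing.Queue()

    # pass the target function itself plus its arguments; do not call it here
    getDataHandlerProcess = multiprocessing.Process(target=data_handler, args=(q,))
    getMemoryInfoProcess = multiprocessing.Process(target=get_memory_info, args=(q,))
    getCPUInfoProcess = multiprocessing.Process(target=get_cpu_info, args=(q,))

    getDataHandlerProcess.start()
    getMemoryInfoProcess.start()
    getCPUInfoProcess.start()

    print("DEBUG: All tasks successfully started.")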

How to provide multiprocessing.Process unique variables

I have a list containing ID Numbers. I want to use each unique ID Number in an API call for its own process, while running the same corresponding functions and applying the same conditional statements, etc. in each process. I have tried to make sense of it but there is not a lot online about this procedure.
I thought to use a for loop, but I don't want every process running this for loop and picking up every item in the list. I just need each item to be associated with one process.
I was thinking something like this:
from multiprocessing import process
import requests, json

ID_NUMBERS = ["ID 1", "ID 2", "ID 3".... ETC]
BASE_URL = "www.api.com"
KEY = {"KEY": "12345"}

a = 0
for x in ID_NUMBERS:
    def[a]():
        while Active_live_data == True:
            # continuously loops over, requesting data from the website
            unique_api_call = "{}/livedata[{}]".format(BASE_URL, x)
            request_it = requests.get(unique_api_call, headers=KEY)
            show_it = (json.loads(request_it.content))
            #some extra conditional code...
    a += 1

processes = []
b = 0
for _ in range(len(ID_NUMBERS))
    p = multiprocessing.Process(target = b)
    p.start()
    processes.append(p)
    b += 1
Any help would be greatly appreciated!
Kindest regards,
Andrew
You can use the map function:
import multiprocessing as mp
num_cores = mp.cpu_count()
pool = mp.Pool(processes=num_cores)
results = pool.map(your_function, list_of_IDs)
This will execute the function your_function, each time with a different item from the list list_of_IDs, and the values returned by your_function will be stored in a list of values (results).
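One caveat worth adding (not part of the answer above): on Windows, and with the default spawn start method on recent macOS, the pool must be created under an if __name__ == '__main__': guard, and the pool should be closed when done. A self-contained sketch with a placeholder your_function:
import multiprocessing as mp

def your_function(id_number):
    # placeholder for the real API call
    return id_number.lower()

if __name__ == '__main__':
    list_of_IDs = ["ID 1", "ID 2", "ID 3"]
    # the with block closes the pool and terminates its workers on exit
    with mp.Pool(processes=mp.cpu_count()) as pool:
        results = pool.map(your_function, list_of_IDs)
    print(results)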
Same approach as @AlessiaM's answer, but using the high-level API in the concurrent.futures module.
import concurrent.futures as mp
import requests, json

BASE_URL = ''
KEY = {"KEY": "12345"}
ID_NUMBERS = ["ID 1", "ID 2", "ID 3"]

def job(id):
    unique_api_call = "{}/livedata[{}]".format(BASE_URL, id)
    request_it = requests.get(unique_api_call, headers=KEY)
    show_it = (json.loads(request_it.content))
    return show_it

# Default to as many workers as there are processors,
# but since your job is IO bound (vs CPU bound),
# you could increase this to an even bigger figure by giving the `max_workers` parameter
with mp.ProcessPoolExecutor() as pool:
    results = pool.map(job, ID_NUMBERS)

# Process results here
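Worth noting: since job is I/O bound (it mostly waits on the HTTP request), a ThreadPoolExecutor from the same module works with the identical API and avoids the process start-up and pickling cost. A minimal variant of the block above, reusing the same job and ID_NUMBERS:
import concurrent.futures as mp

# job and ID_NUMBERS as defined above
with mp.ThreadPoolExecutor(max_workers=10) as pool:
    results = pool.map(job, ID_NUMBERS)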

Multiprocessing in python - processes not closing after completing

I have a process Pool in Python that is starting processes as normal; however, I have just realized that these processes are not closed after completion (I know that they completed, as the last statement is a file write).
Below the code, with an example function ppp:
from multiprocessing import Pool
import itertools

def ppp(element):
    window,day = element
    print(window,day)
    time.sleep(10)

if __name__ == '__main__': ##The line marked
    print('START')
    start_time = current_milli_time()

    days = ['0808', '0810', '0812', '0813', '0814', '0817', '0818', '0827']
    windows = [1000,2000,3000,4000,5000,10000,15000, 20000,30000,60000,120000,180000]
    processes_args = list(itertools.product(windows, days))

    pool = Pool(8)
    results = pool.map(ppp, processes_args)
    pool.close()
    pool.join()

    print('END', current_milli_time()-start_time)
I am working on Linux, Ubuntu 16.04. Everything was working fine before I added the marked line in the example. I am wondering if that behavior can be related to the missing return statement. Anyway, this is what my 'htop' looks like:
As you can see, no process is closed, but all have completed their work.
I found this related question: Python Multiprocessing pool.close() and join() does not close processes. However, I have not understood whether the solution to this problem is to use map_async instead of map.
EDIT: real function code:
def process_day(element):
    window,day = element
    noise = 0.2
    print('Processing day:', day,', window:', window)
    individual_files = glob.glob('datan/'+day+'/*[0-9].csv')
    individual = readDataset(individual_files)
    label_time = individual.loc[(individual['LABEL_O'] != -2) | (individual['LABEL_F'] != -2), 'TIME']
    label_time = list(np.unique(list(label_time)))
    individual = individual[individual['TIME'].isin(label_time)]
    #Saving IDs for further processing
    individual['ID'] = individual['COLLAR']
    #Time variable in seconds for aggregation and merging
    individual['TIME_S'] = individual['TIME'].copy()
    noise_x = np.random.normal(0,noise,len(individual))
    noise_y = np.random.normal(0,noise,len(individual))
    noise_z = np.random.normal(0,noise,len(individual))
    individual['X_AXIS'] = individual['X_AXIS'] + noise_x
    individual['Y_AXIS'] = individual['Y_AXIS'] + noise_y
    individual['Z_AXIS'] = individual['Z_AXIS'] + noise_z
    #Time syncronization (applying milliseconds for time series processing)
    print('Time syncronization:')
    with progressbar.ProgressBar(max_value=len(individual.groupby('ID'))) as bar:
        for baboon,df_baboon in individual.groupby('ID'):
            times = list(df_baboon['TIME'].values)
            d = Counter(times)
            result = []
            for timestamp in np.unique(times):
                for i in range(0,d[timestamp]):
                    result.append(str(timestamp+i*1000/d[timestamp]))
            individual.loc[individual['ID'] == baboon,'TIME'] = result
            bar.update(1)
    #Time series process
    ts_process = time_series_processing(window, 'TIME_S', individual, 'COLLAR', ['COLLAR', 'TIME', 'X_AXIS','Y_AXIS','Z_AXIS'])
    #Aggregation and tsfresh
    ts_process.do_process()
    individual = ts_process.get_processed_dataframe()
    individual.to_csv('noise2/processed_data/'+str(window)+'/agg/'+str(day)+'.csv', index = False)
    #Network inference process
    ni = network_inference_process(individual, 'TIME_S_mean')
    #Inference
    ni.do_process()
    final = ni.get_processed_dataframe()
    final.to_csv('noise2/processed_data/'+str(window)+'/net/'+str(day)+'.csv', index = False)
    #Saving not aggregated ground truth
    ground_truth = final[['ID_mean', 'TIME_S_mean', 'LABEL_O_values', 'LABEL_F_values']].copy()
    #Neighbor features process
    neighbors_features_f = ni.get_neighbor_features(final, 'TIME_S_mean', 'ID_mean')
    neighbors_features_f = neighbors_features_f.drop(['LABEL_O_values_n', 'LABEL_F_values_n'], axis=1)
    neighbors_features_f.to_csv('noise2/processed_data/'+str(window)+'/net/'+str(day)+'_neigh.csv', index = False)
    # Final features dataframe
    final_neigh = pd.merge(final, neighbors_features_f, how='left', left_on=['TIME_S_mean','ID_mean'], right_on = ['TIME_S_mean_n','BABOON_NODE_n'])
    final_neigh.to_csv('noise2/processed_data/'+str(window)+'/complete/'+str(day)+'.csv', index = False)
    return
So as you can see, the last statement is a write to file and it is executed by all the processes; I do not actually think that the problem is inside this function.

How do I gather performance metrics for GDI and user Objects using python

I think this is my first question I have asked on here; normally I find all the answers I need (so thanks in advance).
OK, my problem: I have written a Python program that will, in threads, monitor a process and output the results to a csv file for later. This code is working great. I am using win32pdhutil for the counters and WMI Win32_PerfRawData_PerfProc_Process for the CPU %time. I have now been asked to monitor a WPF application and specifically monitor User objects and GDI objects.
This is where I have a problem: I can't seem to find any Python support for gathering metrics on these two counters. These two counters are easily available in the Task Manager, so I find it odd that there is very little information on them. I am specifically looking at gathering these to see if we have a memory leak. I don't want to install anything else on the system other than the Python that is already installed. Please can you help with finding a solution.
I am using Python 3.3.1, and this will be running on a Windows platform (mainly Win7 and Win8).
This is the code I am using to gather the data:
def gatherIt(self,whoIt,whatIt,type,wiggle,process_info2):
    #this is the data gathering function thing
    data=0.0
    data1="wobble"

    if type=="counter":
        #gather data according to the attributes
        try:
            data = win32pdhutil.FindPerformanceAttributesByName(whoIt, counter=whatIt)
        except:
            #a problem occurred with the process not being there....
            data1="N/A"
    elif type=="cpu":
        try:
            process_info={} #used in the gather CPU based on service
            for x in range(2):
                for procP in wiggle.Win32_PerfRawData_PerfProc_Process(name=whoIt):
                    n1 = int(procP.PercentProcessorTime)
                    d1 = int(procP.Timestamp_Sys100NS)
                    #need to get the process id to change per cpu look...
                    n0, d0 = process_info.get (whoIt, (0, 0))
                    try:
                        percent_processor_time = (float (n1 - n0) / float (d1 - d0)) *100.0
                        #print whoIt, percent_processor_time
                    except ZeroDivisionError:
                        percent_processor_time = 0.0
                    # pass back the n0 and d0
                    process_info[whoIt] = (n1, d1)
                #end for loop (this should take into account multiple cpu's)
            # end for range to allow for a current cpu time rather than cpu percent over sampleint
            if percent_processor_time==0.0:
                data=0.0
            else:
                data=percent_processor_time
        except:
            data1="N/A"
    else:
        #we have done something wrong so data =0
        data1="N/A"
    #endif

    if data == "[]":
        data=0.0
        data1="N/A"
    if data == "" :
        data=0.0
        data1="N/A"
    if data == " ":
        data=0.0
        data1="N/A"

    if data1!="wobble" and data==0.0:
        #we have not got the result we were expecting so add a n/a
        data=data1

    return data
Cheers.
Edited for the correct cpu timings issue, if anyone tried to run it :D
So after a long search I was able to mash something together that gets me the info needed.
import time
from ctypes import *
from ctypes.wintypes import *
import win32pdh

# with help from here http://coding.derkeiler.com/Archive/Python/comp.lang.python/2007-10/msg00717.html
# the following has been mashed together to get the info needed
def GetProcessID(name):
    object = "Process"
    items, instances = win32pdh.EnumObjectItems(None, None, object, win32pdh.PERF_DETAIL_WIZARD)
    val = None
    if name in instances :
        tenQuery = win32pdh.OpenQuery()
        tenarray = [ ]
        item = "ID Process"
        path = win32pdh.MakeCounterPath( ( None, object, name, None, 0, item ) )
        tenarray.append( win32pdh.AddCounter( tenQuery, path ) )
        win32pdh.CollectQueryData( tenQuery )
        time.sleep( 0.01 )
        win32pdh.CollectQueryData( tenQuery )
        for tencounter in tenarray:
            type, val = win32pdh.GetFormattedCounterValue( tencounter, win32pdh.PDH_FMT_LONG )
            win32pdh.RemoveCounter( tencounter )
        win32pdh.CloseQuery( tenQuery )
    return val

processIDs = GetProcessID('OUTLOOK') # Remember this is case sensitive

PQI = 0x400
#open a handle on to the process so that we can query it
OpenProcessHandle = windll.kernel32.OpenProcess(PQI, 0, processIDs)

# OK so now we have opened the process now we want to query it
GR_GDIOBJECTS, GR_USEROBJECTS = 0, 1

print(windll.user32.GetGuiResources(OpenProcessHandle, GR_GDIOBJECTS))
print(windll.user32.GetGuiResources(OpenProcessHandle, GR_USEROBJECTS))

#so we have what we want we now close the process handle
windll.kernel32.CloseHandle(OpenProcessHandle)
hope that helps
For GDI count, I think a simpler, cleaner monitoring script is as follows:
import time, psutil
from ctypes import *

def getPID(processName):
    for proc in psutil.process_iter():
        try:
            if processName.lower() in proc.name().lower():
                return proc.pid
        except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
            pass
    return None

def getGDIcount(PID):
    PH = windll.kernel32.OpenProcess(0x400, 0, PID)
    GDIcount = windll.user32.GetGuiResources(PH, 0)
    windll.kernel32.CloseHandle(PH)
    return GDIcount

PID = getPID('Outlook')

while True:
    GDIcount = getGDIcount(PID)
    print(f"{time.ctime()}, {GDIcount}")
    time.sleep(1)
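If you also need the USER object count (the question asks about both), the same GetGuiResources call can be reused; passing the flag 1 instead of 0 selects USER objects, as in the first answer above. A small extension of the script, reusing getPID from it:
from ctypes import windll

GR_GDIOBJECTS, GR_USEROBJECTS = 0, 1  # flags for GetGuiResources, as used in the first answer

def getGuiResourceCount(pid, flag):
    # PROCESS_QUERY_INFORMATION = 0x400, the same access right as used above
    handle = windll.kernel32.OpenProcess(0x400, 0, pid)
    count = windll.user32.GetGuiResources(handle, flag)
    windll.kernel32.CloseHandle(handle)
    return count

pid = getPID('Outlook')  # getPID as defined in the script above
print('GDI objects :', getGuiResourceCount(pid, GR_GDIOBJECTS))
print('USER objects:', getGuiResourceCount(pid, GR_USEROBJECTS))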
