Wrong threading.active_count() results when using ThreadPool - python

I am using multiprocessing.pool.ThreadPool with N threads (e.g. 5 threads) and I wanted to check the total number of active threads in my process. To do that I am using the method threading.active_count(). I know it's a different module, but I found no other method to count the number of active threads in the multiprocessing package.
The expected result is N+1 (the number of threads I started plus the main thread), but I always get a higher number.
For ThreadPool(2) I am getting 6 active threads
For ThreadPool(5) I am getting 9 active threads
For ThreadPool(10) I am getting 14 active threads
It's worth noting that threading.active_count() works fine when I create threads using the threading module. I also found that multiprocessing.pool.ThreadPool is not well documented.
Can someone help me?
Reproducible code is below:
import threading
from multiprocessing.pool import ThreadPool
import time
import requests
import os
urls_to_download = [
'https://picsum.photos/seed/1/1920/1080',
'https://picsum.photos/seed/2/1920/1080',
'https://picsum.photos/seed/3/1920/1080',
'https://picsum.photos/seed/4/1920/1080',
'https://picsum.photos/seed/5/1920/1080',
'https://picsum.photos/seed/6/1920/1080',
'https://picsum.photos/seed/7/1920/1080',
'https://picsum.photos/seed/8/1920/1080',
'https://picsum.photos/seed/9/1920/1080',
'https://picsum.photos/seed/10/1920/1080',
'https://picsum.photos/seed/11/1920/1080',
'https://picsum.photos/seed/12/1920/1080',
'https://picsum.photos/seed/13/1920/1080',
'https://picsum.photos/seed/14/1920/1080',
'https://picsum.photos/seed/15/1920/1080',
'https://picsum.photos/seed/16/1920/1080',
'https://picsum.photos/seed/17/1920/1080'
]
output_dir = 'downloaded_images'
def download(url):
    print(f'downloading {url}')
    img_data = requests.get(url).content
    img_name = url.split('/')[-3]
    img_name = f'{img_name}.jpg'
    print(f'Received data for {img_name}')
    print(f'Active Threads: {threading.active_count()}')
    with open(os.path.join(output_dir, img_name), 'wb') as img_file:
        img_file.write(img_data)

number_of_threads = 2
t1 = time.perf_counter()
with ThreadPool(number_of_threads) as pool:
    pool.map(download, urls_to_download)
t2 = time.perf_counter()
print(f'Finished in {t2-t1} seconds')
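For reference, threading.enumerate() can show where the extra threads come from. A minimal diagnostic sketch (separate from the script above):

import threading
from multiprocessing.pool import ThreadPool

def noop(_):
    pass

with ThreadPool(2) as pool:
    pool.map(noop, range(4))
    for t in threading.enumerate():
        print(t.name)

Besides MainThread and the two workers, this also lists the pool's internal handler threads (worker/task/result handlers), which seems to account for the counts above being higher than N+1.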

Related

Python multiprocessing finish the work correctly, but the processes still alive (Linux)

I use python multiprocessing to compute some sort of scores on DNA sequences from a large file.
For that I write and use the script below.
I use a Linux machine with 48 CPUs in a Python 3.8 environment.
The code works fine, terminates correctly, and prints the processing time at the end.
Problem: when I use the htop command, I find that all 48 processes are still alive.
I don't know why, and I don't know what to add to my script to avoid this.
import csv
import sys
import datetime  # needed for datetime.timedelta below
import concurrent.futures
from itertools import combinations
import psutil
import time

nb_cpu = psutil.cpu_count(logical=False)

# compute_score_dist, save_results_to_csv and get_dict_ids_seqs are defined
# elsewhere in the full script.

def fun_job(seq_1, seq_2):  # seq_i : (id, string)
    start = time.time()
    score_dist = compute_score_dist(seq_1[1], seq_2[1])
    end = time.time()
    return seq_1[0], seq_2[0], score_dist, end - start  # id seq1, id seq2, score, time

def help_fun_job(nested_pair):
    return fun_job(nested_pair[0], nested_pair[1])

def compute_using_multi_processing(list_comb_ids, dict_ids_seqs):
    start = time.perf_counter()
    with concurrent.futures.ProcessPoolExecutor(max_workers=nb_cpu) as executor:
        results = executor.map(help_fun_job,
                               [((pair_ids[0], dict_ids_seqs[pair_ids[0]]), (pair_ids[1], dict_ids_seqs[pair_ids[1]]))
                                for pair_ids in list_comb_ids])
        save_results_to_csv(results)
    finish = time.perf_counter()
    processing_time = str(datetime.timedelta(seconds=round(finish - start, 2)))
    print(f'Processing time: finished in {processing_time} hh:mm:ss')

def main():
    print("nb_cpu in this machine : ", nb_cpu)
    file_path = sys.argv[1]
    dict_ids_seqs = get_dict_ids_seqs(file_path)
    list_ids = list(dict_ids_seqs)  # This will convert the dict_keys to a list
    list_combined_ids = list(combinations(list_ids, 2))
    compute_using_multi_processing(list_combined_ids, dict_ids_seqs)

if __name__ == '__main__':
    main()
Thank you for your help.
Edit: added the complete code for fun_job (after @Booboo's answer)
from Bio import Align

def fun_job(seq_1, seq_2):  # seq_i : (id, string)
    start = time.time()
    aligner = Align.PairwiseAligner()
    aligner.mode = 'global'
    score_dist = aligner.score(seq_1[1], seq_2[1])
    end = time.time()
    return seq_1[0], seq_2[0], score_dist, end - start  # id seq1, id seq2, score, time
When the with ... as executor: block exits, there is an implicit call to executor.shutdown(wait=True). This waits for all pending futures to finish executing "and the resources associated with the executor have been freed", which presumably includes terminating the processes in the pool (if possible?). Why your program terminates (or does it?), or at least why you say all the futures have completed while the processes have not terminated, is a bit of a mystery. But you haven't provided the code for fun_job, so who can say why this is so?
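For illustration only, a minimal standalone sketch (not your code) of what the with block does implicitly on exit:

import concurrent.futures

def square(x):
    return x * x

if __name__ == '__main__':
    executor = concurrent.futures.ProcessPoolExecutor(max_workers=4)
    try:
        results = list(executor.map(square, range(10)))
    finally:
        # equivalent of leaving the with block: wait for pending work,
        # then free the executor's resources (its worker processes)
        executor.shutdown(wait=True)
    print(results)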
One thing you might try is to switch to the multiprocessing.pool.Pool class from the multiprocessing module. It supports a terminate method, which is implicitly called when its with block exits and which explicitly attempts to terminate all processes in the pool:
#import concurrent.futures
import multiprocessing
... # etc.

def compute_using_multi_processing(list_comb_ids, dict_ids_seqs):
    start = time.perf_counter()
    with multiprocessing.Pool(processes=nb_cpu) as executor:
        results = executor.map(help_fun_job,
                               [((pair_ids[0], dict_ids_seqs[pair_ids[0]]), (pair_ids[1], dict_ids_seqs[pair_ids[1]]))
                                for pair_ids in list_comb_ids])
        save_results_to_csv(results)
    finish = time.perf_counter()
    processing_time = str(datetime.timedelta(seconds=round(finish - start, 2)))
    print(f'Processing time: finished in {processing_time} hh:mm:ss')

Python - multithread with multiple arrays passing args to function

I'm trying to add multithreading to a very time-consuming program, and I've come across this SO answer:
https://stackoverflow.com/a/28463266/3451339, which basically offers this solution:
from multiprocessing.dummy import Pool as ThreadPool
pool = ThreadPool(4)
results = pool.map(my_function, my_array)
# Close the pool and wait for the work to finish
pool.close()
pool.join()
and, passing multiple arrays:
results = pool.starmap(function, zip(list_a, list_b))
The following is the code I have so far, which must be refactored to use threading. It iterates over 4 arrays, passes arguments to the function at each iteration, and appends all results to a final container:
strategies = ['strategy_1', 'strategy_2']
budgets = [90, 100, 110, 120, 130, 140, 150, 160]
formations = ['343', '352', '433', '442', '451', '532', '541']
models = ['model_1', 'model_2', 'model_3']

all_teams = pd.DataFrame()
for strategy in strategies:
    for budget in budgets:
        for formation in formations:
            for model in models:
                team = function(strategy=strategy,
                                budget=budget,
                                curr_formation=formation,
                                model=model)
                all_teams = all_teams.append(team, ignore_index=True, sort=False)\
                                     .reset_index(drop=True)\
                                     .copy()
Note: Each function call makes api web requests.
What is the way to go with multithreading in this scenario?
Python has the multiprocessing module, which can run multiple tasks in parallel, and inside each process you can have multiple threads or async IO code.
Here is a working example which uses 3 processes with multithreading inside each:
import pandas as pd
import multiprocessing
from multiprocessing import Queue
from threading import Thread

strategies = ['strategy_1', 'strategy_2']
budgets = [90, 100, 110, 120, 130, 140, 150, 160]
formations = ['343', '352', '433', '442', '451', '532', '541']
models = ['model_1', 'model_2', 'model_3']

# shared Queue; if you want to reduce write locking, use 3 Queues
Q = Queue()

# Retrieve asynchronously if you want to speed up the process
def function(q, strategy, budget, curr_formation, model):
    q.put("Team")

def runTask(model, q):
    for strategy in strategies:
        for budget in budgets:
            for formation in formations:
                Thread(target=function, args=(q, strategy, budget, formation, model)).start()

def main():
    p1 = multiprocessing.Process(target=runTask, args=('model_1', Q))
    p2 = multiprocessing.Process(target=runTask, args=('model_2', Q))
    p3 = multiprocessing.Process(target=runTask, args=('model_3', Q))
    p1.start()
    p2.start()
    p3.start()
    p1.join()
    p2.join()
    p3.join()
    all = []
    for i in range(0, Q.qsize()):
        all.append(Q.get())
    print(all)
    print(len(all))

if __name__ == "__main__":
    main()
A useful article: Multiprocessing in Python | Set 2
This can be one approach.
Note: threads vs. multiprocessing. In the SO answer referenced above I provided execution through map; that will not work here, as map is limited in the number of arguments it can pass to the function.
Run your nested for loops and build a list of parameters ==> financial_options
for strategy in strategies:
    for budget in budgets:
        for formation in formations:
            for model in models:
                financial_options.append([strategy, budget, formation, model])
financial_options_len = len(financial_options)
Create a new function that will handle API calls
def access_url(url, parameter_list):
    #response=requests.get(url) # request goes here
    print(parameter_list)
    time.sleep(2)
    print("sleep done!")
    return "Hello"  # ,parameter_list # return type
Now run the threading with these permutation parameters. The complete program will look like this:
import concurrent.futures
import requests  # just in case needed
from bs4 import BeautifulSoup  # just in case needed
import time
import pandas as pd

def access_url(url, parameter_list):
    #response=requests.get(url) # request goes here
    print(parameter_list)
    time.sleep(2)
    print("sleep done!")
    return "Hello"  # ,parameter_list # return type

def multi_threading():
    test_url = "http://bla bla.com/"
    base_url = test_url
    THREAD_MULTI_PROCESSING = True
    strategies = ['strategy_1', 'strategy_2']
    budgets = [90, 100, 110, 120, 130, 140, 150, 160]
    formations = ['343', '352', '433', '442', '451', '532', '541']
    models = ['model_1', 'model_2', 'model_3']
    all_teams = pd.DataFrame()
    start = time.perf_counter()  # start time for performance
    financial_options = []
    decision_results = []
    for strategy in strategies:
        for budget in budgets:
            for formation in formations:
                for model in models:
                    financial_options.append([strategy, budget, formation, model])
    financial_options_len = len(financial_options)
    print(f"Total options:{financial_options_len}")
    future_list = []
    THREAD_MULTI_PROCESSING_LOOP = True
    if THREAD_MULTI_PROCESSING_LOOP:
        with concurrent.futures.ThreadPoolExecutor() as executor:  # through executor
            for each in range(financial_options_len):
                future = executor.submit(access_url, test_url, financial_options[each])  # submit each option
                future_list.append(future)
            for f1 in concurrent.futures.as_completed(future_list):
                r1 = f1.result()
                decision_results.append(r1)
    end = time.perf_counter()  # finish time for performance
    print(f'Threads: Finished in {round(end - start, 2)} second(s)')
    df = pd.DataFrame(decision_results)
    df.to_csv("multithread_for.csv")
    return df, decision_results

df, results = multi_threading()
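Since the question explicitly mentions pool.starmap, here is a minimal sketch of that route as well (my own; run_one is a placeholder for the real API-calling function, which I assume returns something DataFrame-friendly):

from itertools import product
from multiprocessing.pool import ThreadPool
import pandas as pd

strategies = ['strategy_1', 'strategy_2']
budgets = [90, 100, 110, 120, 130, 140, 150, 160]
formations = ['343', '352', '433', '442', '451', '532', '541']
models = ['model_1', 'model_2', 'model_3']

def run_one(strategy, budget, formation, model):
    # placeholder for the question's function(); the real one makes API requests
    return {'strategy': strategy, 'budget': budget,
            'formation': formation, 'model': model}

if __name__ == '__main__':
    combos = list(product(strategies, budgets, formations, models))
    with ThreadPool(8) as pool:  # threads are fine for I/O-bound API calls
        teams = pool.starmap(run_one, combos)
    all_teams = pd.DataFrame(teams)
    print(len(all_teams))  # 2 * 8 * 7 * 3 = 336 combinations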

How do I get child process PIDs when using ProcessPoolExecutor?

I'm using ProcessPoolExecutor context manager to run several Kafka consumers in parallel. I need to store the process IDs of the child processes so that later, I can cleanly terminate those processes. I have such code:
class MultiProcessConsumer:
    ...

    def run_in_parallel(self):
        parallelism_factor = 5
        with ProcessPoolExecutor() as executor:
            processes = [executor.submit(self.consume) for _ in range(parallelism_factor)]
            # It would be nice if I could write [process.pid for process in processes] to a file here.

    def consume(self):
        while True:
            for message in self.kafka_consumer:
                do_stuff(message)
I know I can use os.getpid() in the consume method to get PIDs. But handling them properly (in the case of constantly shutting down or starting up consumers) requires some extra work.
How would you propose that I get and store PIDs of the child processes in such a context?
os.getpid() seems to be the way to go. Just pass the PIDs through a Queue or Pipe, in combination with maybe some random UUID that you pass to the process beforehand to identify it.
from concurrent.futures import ProcessPoolExecutor
import os
import time
import uuid
#from multiprocessing import Process, Queue
import multiprocessing
import queue

# The Empty exception is in queue; multiprocessing borrows it from there
# https://stackoverflow.com/questions/9908781/sharing-a-result-queue-among-several-processes
m = multiprocessing.Manager()
q = m.Queue()

def task(n, queue, uuid):
    my_pid = os.getpid()
    print("Executing our Task on Process {}".format(my_pid))
    queue.put((uuid, my_pid))
    time.sleep(n)
    return n * n

def main():
    with ProcessPoolExecutor(max_workers=3) as executor:
        some_dict = {}
        for i in range(10):
            print(i)
            u = uuid.uuid4()
            f = executor.submit(task, i, q, u)
            some_dict[u] = [f, None]  # PID not known here
            try:
                rcv_uuid, rcv_pid = q.get(block=True, timeout=1)
                some_dict[rcv_uuid][1] = rcv_pid  # store PID
            except queue.Empty as e:
                print('handle me', e)
            print('I am', rcv_uuid, 'and my PID is', rcv_pid)

if __name__ == '__main__':
    main()
Although this field is private, you could use the field self._processes of ProcessPoolExecutor. The code snippet below shows how to use this variable.
import os
from concurrent.futures import ProcessPoolExecutor
from concurrent.futures import wait

nb_processes = 100
executor = ProcessPoolExecutor(nb_processes)
futures = [executor.submit(os.getpid) for _ in range(nb_processes)]
wait(futures)

backends = list(map(lambda x: x.result(), futures))
assert len(set(backends)) == nb_processes
In the case above, an assertion error is raised. This is because a new task can reuse the forked processes in the pool, so you cannot learn all forked process IDs through the method you mentioned. Hence, you can do this instead:
import os
from concurrent.futures import ProcessPoolExecutor
from concurrent.futures import wait

nb_processes = 100
executor = ProcessPoolExecutor(nb_processes)
futures = [executor.submit(os.getpid) for _ in range(nb_processes)]
wait(futures)

backends = list(map(lambda x: x.result(), futures))
assert len(set(executor._processes.keys())) == nb_processes
print('all of PID are: %s.' % list(executor._processes.keys()))
If you don't want to break the encapsulation, you could inherit from ProcessPoolExecutor and create a new property for that.
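A minimal sketch of that idea (my own; it still relies on the private _processes mapping, just hidden behind a property):

import os
from concurrent.futures import ProcessPoolExecutor

class PidAwareProcessPoolExecutor(ProcessPoolExecutor):
    @property
    def worker_pids(self):
        # _processes maps pid -> process object for the workers currently alive
        return list(self._processes.keys())

if __name__ == '__main__':
    with PidAwareProcessPoolExecutor(max_workers=3) as executor:
        futures = [executor.submit(os.getpid) for _ in range(3)]
        for f in futures:
            f.result()  # make sure workers have actually been started
        print(executor.worker_pids)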

Python: How to run multiple files at the same time?

I'm trying to create a for-loop which automatically starts different Python files at the exact same time, but they always seem to run one after another.
import os
import multiprocessing
import p1, p2, p3

#first idea
path = "C:" + "\\Users\\Max\\\\Desktop\\\python\\tasks\\"
tasks = ['p1.py', 'p2.py', 'p3.py']
len = tasks.__len__()
ind = 0
for i in range(len):
    os.system('python' + ' ' + tasks[ind])
    ind += 1

#second idea
for x in ('p1', 'p2', 'p3'):
    p = multiprocessing.Process(target=lambda: __import__(x))
    p.start()
p1, p2, p3 are the files I'm trying to run at the same time, but they get executed one after another, so if the code in each is:
time.sleep(10)
print("hello")
I will have to wait 30 seconds for the program to be done, instead of the 10 seconds I want.
If you want to start the files in three separate interpreters, start them as subprocesses:
import subprocess

path = r"C:\Users\Max\Desktop\python\tasks"
tasks = ['1.py', '2.py', '3.py']

task_processes = [
    subprocess.Popen(r'python %s\%s' % (path, task), shell=True)
    for task in tasks
]
for task in task_processes:
    task.wait()
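A variation on the same idea (my own, not part of the original answer): passing the argument vector as a list together with sys.executable avoids the shell entirely and sidesteps quoting issues with paths that contain spaces:

import os
import subprocess
import sys

path = r"C:\Users\Max\Desktop\python\tasks"
tasks = ['1.py', '2.py', '3.py']

task_processes = [
    subprocess.Popen([sys.executable, os.path.join(path, task)])
    for task in tasks
]
for task in task_processes:
    task.wait()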
If you want to keep using multiprocessing, you can just encapsulate your system calls in a function:
import os
from multiprocessing import Process

path = "C:\\Users\\Max\\\\Desktop\\\python\\tasks\\"
tasks = ['1.py', '2.py', '3.py']

def foo(task):
    os.system('python ' + path + task)

if __name__ == '__main__':  # required on Windows, where child processes are spawned
    for task in tasks:
        p = Process(target=foo, args=(task,))
        p.start()
Based on OP's actual goal from a comment:
I'm trying to open different links at the same time in my browser with the webbrowser module. Essentially time.sleep(10) webbrowser.open("google.com") But the link is different in each file
we can instead use threads. I added the option for a different delay per URL, because otherwise there'd be no point in having each thread sleep on its own.
import webbrowser
import threading
import time

def delayed_open_url(delay, url):
    time.sleep(delay)
    webbrowser.open(url)

threads = []
for delay, url in [
    (3, "http://google.com"),
    (5, "http://example.com"),
    (11, "http://stackoverflow.com"),
]:
    thread = threading.Thread(target=delayed_open_url, args=(delay, url))
    thread.start()
    threads.append(thread)

for thread in threads:
    thread.join()  # Wait for each thread

# This code will be executed after each thread is done

Process start duration between Python3 and Python2

I observe a significant time delta for starting a series of processes between Python 3.5 and Python 2.7.
In the code below, if CRITICAL = 8, performance is almost identical in Py2 and Py3 (<1 s). But for 9+, performance in Py2 remains unchanged, whereas in Py3 it gets dramatically worse (~1 min!).
It seems linked to the size of the args I give to each process...
UPDATE: it's also linked to the location of the module. Indeed, if it's run from "C:\" (or another short path), then Py3 is similar to Py2. But if run from a very long path, performance in Py3 is heavily degraded, whereas it remains unchanged in Py2.
from __future__ import print_function
from multiprocessing import Process
import time
import itertools

def workerTask(inputs):
    for _ in itertools.product(*inputs):
        pass

if __name__ == '__main__':
    CRITICAL = 9  # OK for 8-, KO for 9+
    start = time.time()
    ARGS = [["123.4567{}".format(i) for i in range(CRITICAL)] for _ in range(10)]
    workerPool = [Process(target=workerTask, args=(ARGS,)) for _ in range(15)]
    for idx, w in enumerate(workerPool):
        print("...Starting process #{} after {}".format(idx + 1, time.time() - start))
        w.start()
    print("ALL PROCESSES STARTED in {}!".format(time.time() - start))
I've found an alternative, which seems a very modular way to organize multi-process work.
This way, in Py3, the time to launch N processes remains similar to Py2.
Instead of providing huge args to each process, I create a shared object, registered with a BaseManager, in which I store the huge data needed by the processes.
Furthermore, I can also store shared progress or any data computed by each process, to pick up and use later. I really like this solution.
Here is the code:
from __future__ import print_function
import time
import itertools
from multiprocessing import Process
from multiprocessing.managers import BaseManager

def workerTask(sharedSandbox):
    inputs = sharedSandbox.getARGS()
    for _ in itertools.product(*inputs):
        pass

class _SharedData(object):
    def __init__(self, data):
        self.__myARGS = data

    def getARGS(self):
        return self.__myARGS

class _GlobalManager(BaseManager):
    BaseManager.register('SharedData', _SharedData)

if __name__ == '__main__':
    CRITICAL = 9  # OK for 8-, KO for 9+
    start = time.time()
    manager = _GlobalManager()
    manager.start()
    ARGS = manager.SharedData([["123.4567{}".format(i) for i in range(CRITICAL)] for _ in range(10)])
    workerPool = [Process(target=workerTask, args=(ARGS,)) for _ in range(15)]
    for idx, w in enumerate(workerPool):
        print("...Starting process #{} after {}".format(idx + 1, time.time() - start))
        w.start()
    print("ALL PROCESSES STARTED in {}!".format(time.time() - start))
    while any([w.is_alive() for w in workerPool]):
        pass
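As a small follow-up (my suggestion, not part of the original answer): the busy-wait loop at the end can be replaced by joining the workers, which blocks without spinning a CPU core.

# drop-in replacement for the final while any(...) loop above
for w in workerPool:
    w.join()
print("ALL PROCESSES FINISHED in {}!".format(time.time() - start))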
