I'm trying to create a For-loop which automatically starts different python files at the exact same time, but they always seem to run one after one.
import os
import multiprocessing
import p1, p2, p3
#first idea
path = "C:" + "\\Users\\Max\\\\Desktop\\\python\\tasks\\"
tasks = ['p1.py', 'p2.py', 'p3.py']
len = tasks.__len__()
ind = 0
for i in range(len):
os.system('python' + ' ' + tasks[ind])
ind += 1
#second idea
for x in ('p1', 'p2', 'p3'):
p = multiprocessing.Process(target=lambda: __import__(x))
p.start()
p1, p2, p3 are the files I'm trying to run at the same time, but they get executed one after one, so if the code is:
time.sleep(10)
print("hello)
I will have to wait 30 seconds for the program to be done, instead of the 10 seconds I want.
If you want to start the files in three separate interpreters, start them as subprocesses:
import subprocess
path = r"C:\Users\Max\Desktop\python\tasks"
tasks = ['1.py', '2.py', '3.py']
task_processes = [
subprocess.Popen(r'python %s\%s' % (path, task), shell=True)
for task
in tasks
]
for task in task_processes:
task.wait()
If you want to keep using multiprocessing, you can just encapsulate your system calls in a function:
import os
from multiprocessing import Process
path = "C:\\Users\\Max\\\\Desktop\\\python\\tasks\\"
tasks = ['1.py', '2.py', '3.py']
def foo(task):
os.system('python ' + path + task)
for task in tasks:
p = Process(target=foo, args=(task,))
p.start()
Based on OP's actual goal from a comment:
I'm trying to open different links at the same time in my browser with the webbrowser module. Essentially time.sleep(10) webbrowser.open("google.com") But the link is different in each file
we can instead use threads. I added the option for a different delay per URL, because otherwise there'd be no point in having each thread sleep on its own.
import webbrowser
import threading
import time
def delayed_open_url(delay, url):
time.sleep(delay)
webbrowser.open(url)
threads = []
for delay, url in [
(3, "http://google.com"),
(5, "http://example.com"),
(11, "http://stackoverflow.com"),
]:
threads.append(
threading.Thread(target=delayed_open_url, args=(url,)).start()
)
for thread in threads:
thread.join() # Wait for each thread
# This code will be executed after each thread is done
Related
I am using the multiprocessing.pool.ThreadPool with N threads (e.g 5 threads) and I wanted to check the total number of active threads in my process. To do that I am using the method threading.active_count(). I know it's a different module, but I found no other method to count the number of active threads in the multiprocessing package,
The expected result is N+1 (the number of threads I started plus the main thread), but I always get a higher number.
For ThreadPool(2) I am getting 6 active threads
For ThreadPool(5) I am getting 9 active threads
For ThreadPool(10) I am getting 14 active threads
It's important to say that threading.active_count() works fine when creating threads using the threading module. And I found out that multiprocessing.pool.ThreadPool is not well documented.
Can someone help me?
A reproduceable code is described bellow
import threading
from multiprocessing.pool import ThreadPool
import time
import requests
import os
urls_to_download = [
'https://picsum.photos/seed/1/1920/1080',
'https://picsum.photos/seed/2/1920/1080',
'https://picsum.photos/seed/3/1920/1080',
'https://picsum.photos/seed/4/1920/1080',
'https://picsum.photos/seed/5/1920/1080',
'https://picsum.photos/seed/6/1920/1080',
'https://picsum.photos/seed/7/1920/1080',
'https://picsum.photos/seed/8/1920/1080',
'https://picsum.photos/seed/9/1920/1080',
'https://picsum.photos/seed/10/1920/1080',
'https://picsum.photos/seed/11/1920/1080',
'https://picsum.photos/seed/12/1920/1080',
'https://picsum.photos/seed/13/1920/1080',
'https://picsum.photos/seed/14/1920/1080',
'https://picsum.photos/seed/15/1920/1080',
'https://picsum.photos/seed/16/1920/1080',
'https://picsum.photos/seed/17/1920/1080'
]
output_dir = 'downloaded_images'
##
def download(url):
print(f'downloading {url}')
img_data = requests.get(url).content
img_name = url.split('/')[-3]
img_name = f'{img_name}.jpg'
print(f'Received data for {img_name}')
print(f'Active Threads: {threading.active_count()}')
with open(os.path.join(output_dir,img_name), 'wb') as img_file:
img_file.write(img_data)
number_of_threads = 2
t1 = time.perf_counter()
with ThreadPool(number_of_threads) as pool:
pool.map(download,urls_to_download)
t2 = time.perf_counter()
print(f'Finished in {t2-t1} seconds')
I use python multiprocessing to compute some sort of scores on DNA sequences from a large file.
For that I write and use the script below.
I use a Linux machine with 48 cpu in python 3.8 environment.
Th code work fine, and terminate the work correctly and print the processing time at the end.
Problem: when I use the htop command, I find that all 48 processes are still alive.
I don't know why, and I don't know what to add to my script to avoid this.
import csv
import sys
import concurrent.futures
from itertools import combinations
import psutil
import time
nb_cpu = psutil.cpu_count(logical=False)
def fun_job(seq_1, seq_2): # seq_i : (id, string)
start = time.time()
score_dist = compute_score_dist(seq_1[1], seq_2[1])
end = time.time()
return seq_1[0], seq_2[0], score_dist, end - start # id seq1, id seq2, score, time
def help_fun_job(nested_pair):
return fun_job(nested_pair[0], nested_pair[1])
def compute_using_multi_processing(list_comb_ids, dict_ids_seqs):
start = time.perf_counter()
with concurrent.futures.ProcessPoolExecutor(max_workers=nb_cpu) as executor:
results = executor.map(help_fun_job,
[((pair_ids[0], dict_ids_seqs[pair_ids[0]]), (pair_ids[1], dict_ids_seqs[pair_ids[1]]))
for pair_ids in list_comb_ids])
save_results_to_csv(results)
finish = time.perf_counter()
proccessing_time = str(datetime.timedelta(seconds=round(finish - start, 2)))
print(f' Processing time Finished in {proccessing_time} hh:mm:ss')
def main():
print("nb_cpu in this machine : ", nb_cpu)
file_path = sys.argv[1]
dict_ids_seqs = get_dict_ids_seqs(file_path)
list_ids = list(dict_ids_seqs) # This will convert the dict_keys to a list
list_combined_ids = list(combinations(list_ids, 2))
compute_using_multi_processing(list_combined_ids, dict_ids_seqs)
if __name__ == '__main__':
main()
Thank you for your help.
Edit : add the complete code for fun_job (after #Booboo answer)
from Bio import Align
def fun_job(seq_1, seq_2): # seq_i : (id, string)
start = time.time()
aligner = Align.PairwiseAligner()
aligner.mode = 'global'
score_dist = aligner.score(seq_1[1],seq_2[1])
end = time.time()
return seq_1[0], seq_2[0], score_dist, end - start # id seq1, id seq2, score, time
When the with ... as executor: block exits, there is an implicit call to executor.shutdown(wait=True). This will wait for all pending futures to to be done executing "and the resources associated with the executor have been freed", which presumably includes terminating the processes in the pool (if possible?). Why your program terminates (or does it?) or at least you say all the futures have completed executing, while the processes have not terminated is a bit of a mystery. But you haven't provided the code for fun_job, so who can say why this is so?
One thing you might try is to switch to using the multiprocessing.pool.Pool class from the multiprocessing module. It supports a terminate method, which is implicitly called when its context manager with block exits, that explicitly attempts to terminate all processes in the pool:
#import concurrent.futures
import multiprocessing
... # etc.
def compute_using_multi_processing(list_comb_ids, dict_ids_seqs):
start = time.perf_counter()
with multiprocessing.Pool(processes=nb_cpu) as executor:
results = executor.map(help_fun_job,
[((pair_ids[0], dict_ids_seqs[pair_ids[0]]), (pair_ids[1], dict_ids_seqs[pair_ids[1]]))
for pair_ids in list_comb_ids])
save_results_to_csv(results)
finish = time.perf_counter()
proccessing_time = str(datetime.timedelta(seconds=round(finish - start, 2)))
print(f' Processing time Finished in {proccessing_time} hh:mm:ss')
I'm using ProcessPoolExecutor context manager to run several Kafka consumers in parallel. I need to store the process IDs of the child processes so that later, I can cleanly terminate those processes. I have such code:
Class MultiProcessConsumer:
...
def run_in_parallel(self):
parallelism_factor = 5
with ProcessPoolExecutor() as executor:
processes = [executor.submit(self.consume) for _ in range(parallelism_factor)]
# It would be nice If I could write [process.pid for process in processes] to a file here.
def consume(self):
while True:
for message in self.kafka_consumer:
do_stuff(message)
I know I can use os.get_pid() in the consume method to get PIDs. But, handling them properly (in case of constant shutting down or starting up of consumers) requires some extra work.
How would you propose that I get and store PIDs of the child processes in such a context?
os.get_pid() seems to be the way to go. Just pass them through a Queue or Pipe in combination with maybe some random UUID that you pass to the process before to identify the PID.
from concurrent.futures import ProcessPoolExecutor
import os
import time
import uuid
#from multiprocessing import Process, Queue
import multiprocessing
import queue
#The Empty exception in in Queue, multiprocessing borrows
#it from there
# https://stackoverflow.com/questions/9908781/sharing-a-result-queue-among-several-processes
m = multiprocessing.Manager()
q = m.Queue()
def task(n, queue, uuid):
my_pid = os.getpid()
print("Executing our Task on Process {}".format(my_pid))
queue.put((uuid, my_pid))
time.sleep(n)
return n * n
def main():
with ProcessPoolExecutor(max_workers = 3) as executor:
some_dict = {}
for i in range(10):
print(i)
u = uuid.uuid4()
f = executor.submit(task, i, q, u)
some_dict[u] = [f, None] # PID not known here
try:
rcv_uuid, rcv_pid = q.get(block=True, timeout=1)
some_dict[rcv_uuid][1] = rcv_pid # store PID
except queue.Empty as e:
print('handle me', e)
print('I am', rcv_uuid, 'and my PID is', rcv_pid)
if __name__ == '__main__':
main()
Although this field is private, you could use the field in PoolProcessExecutor self._processes. The code snippet below shows how to use this variable.
import os
from concurrent.futures import ProcessPoolExecutor
from concurrent.futures import wait
nb_processes = 100
executor = ProcessPoolExecutor(nb_processes )
futures = [executor.submit(os.getpid) for _ in range(nb_processes )]
wait(futures)
backends = list(map(lambda x: x.result(), futures))
assert len(set(backends)) == nb_processes
In the case above, an assertion error is raised. This is because a new task can reuse the forked processes in the pool. You cannot know all forked process IDs through the method you memtioned. Hence, you can do as:
import os
from concurrent.futures import ProcessPoolExecutor
from concurrent.futures import wait
nb_processes = 100
executor = ProcessPoolExecutor(nb_processes )
futures = [executor.submit(os.getpid) for _ in range(nb_processes )]
wait(futures)
backends = list(map(lambda x: x.result(), futures))
assert len(set(executor._processes.keys())) == nb_processes
print('all of PID are: %s.' % list(executor._processes.keys()))
If you don't want to destroy the encapsulation, you could inhert the ProcessPoolExecutor and create a new property for that.
After running some compuations nicely in linear fashion with a moderator script (cf. below) calling an inner one performing the computation, I struggle
to bring it to execution when trying it with multiprocessing. It seems that each CPU core is running through this list set (testRegister) and launches a computation even if an other core already performed this task earlier (in the same session). How can I prevent this chaotic behaviour? It is my first time attempting calling multiple processors by Python.
Correction: The initial post did not show that the test is a string consisting calling "the inner script" with varying parameters m1 and m2 beside fixed arguments arg1 and arg2 belonging solely to this "inner script".
#!/usr/bin/env python3
import os
import subprocess as sub
import sys
import multiprocessing
fileRegister = []
testRegister = []
def fileCollector():
for file in os.listdir("."):
if file.endswith(".xyz"):
fileRegister.append(file)
fileRegister.sort()
return fileRegister
def testSetup():
data = fileRegister
while len(data) > 1:
for entry in fileRegister[1:]:
m0 = str(fileRegister[0])
m1 = str(entry)
test = str("python foo.py ") + str(m1) + str(" ") + str(m2) +\
str(" --arg1 --arg2") # formulate test condition
testRegister.append(test)
testRegister.sort()
del data[0]
return testRegister
def shortAnalysator():
for entry in testRegister:
print(str(entry))
sub.call(entry, shell=True)
del testRegister[0]
def polyAnalysator():
# apparently each CPU core works as if the register were not shared
# reference: https://docs.python.org/3.7/library/multiprocessing.html
if __name__ == '__main__':
jobs = []
for i in range(3): # safety marging to not consume all CPU
p = multiprocessing.Process(target=shortAnalysator)
jobs.append(p)
p.start()
fileCollector()
testSetup()
shortAnalysator() # proceeding expectably on one CPU (slow)
# polyAnalysator() # causing irritation
sys.exit()```
Your polyAnalysator is running the shortAnalysator three times. Try changing your polyAnalysator as follows, and add the f method. This uses the multiprocessing Pool:
from multiprocessing import Pool
def f(test):
sub.call(test, shell=True)
def polyAnalysator():
# apparently each CPU core works as if the register were not shared
# reference: https://docs.python.org/3.7/library/multiprocessing.html
with Pool(3) as p:
p.map(f, testRegister)
I observe a significant time delta for starting a serie of processes between Python 3.5 and Python 2.7.
In this below code, if CRITICAL = 8 : perf are almost identical in Py2 and Py3 (<1s). But for 9+, perf in Py2 remains unchanged whereas in Py3 it goes deeply worst (~1min!).
It seems linked to the size of args i give to process...
UPDATE : it's also linked to the location of module. Indeed, if it's run from "C:\" (or short path), then Py3 is similar to Py2. But if run from very long path, perf in Py3 are very downgraded, whereas it remains unchanged in Py2.
from __future__ import print_function
from multiprocessing import Process
import time
import itertools
def workerTask(inputs):
for _ in itertools.product(*inputs):
pass
if __name__ == '__main__':
CRITICAL = 9 # OK for 8-, KO for 9+
start = time.time()
ARGS = [["123.4567{}".format(i) for i in range(CRITICAL)] for _ in range(10)]
workerPool = [Process(target=workerTask, args=(ARGS,)) for _ in range(15)]
for idx, w in enumerate(workerPool):
print("...Starting process #{} after {}".format(idx + 1, time.time() - start))
w.start()
print("ALL PROCESSES STARTED in {}!".format(time.time() - start))
I've found an alternative, which seems very modular to "multi-process" works.
By this way, in Py3, time to launch N process remains similar to Py2.
Instead of providing huge args to each process, i create a shared object, linked to BaseManager, in which one i store huge data needed by process.
Furthemore, i can also store shared progress or any data computed by each process to continue after and use it. I really like this solution.
Here the code:
from __future__ import print_function
import time
import itertools
from multiprocessing import Process
from multiprocessing.managers import BaseManager
def workerTask(sharedSandbox):
inputs = sharedSandbox.getARGS()
for _ in itertools.product(*inputs):
pass
class _SharedData(object):
def __init__(self, data):
self.__myARGS = data
def getARGS(self):
return self.__myARGS
class _GlobalManager(BaseManager):
BaseManager.register('SharedData', _SharedData)
if __name__ == '__main__':
CRITICAL = 9 # OK for 8-, KO for 9+
start = time.time()
manager = _GlobalManager()
manager.start()
ARGS = manager.SharedData([["123.4567{}".format(i) for i in range(CRITICAL)] for _ in range(10)])
workerPool = [Process(target=workerTask, args=(ARGS,)) for _ in range(15)]
for idx, w in enumerate(workerPool):
print("...Starting process #{} after {}".format(idx + 1, time.time() - start))
w.start()
print("ALL PROCESSES STARTED in {}!".format(time.time() - start))
while any([w.is_alive() for w in workerPool]):
pass