Multiprocessing in Python 3: parallel processing and waiting on jobs

I have a piece of code that queries a DB and returns a set of IDs. For each ID, I need to run a related query to get a dataset. I would like to run the queries in parallel to speed up the processing. Once all the processes have run, I build a block of text, write it to a file, and move on to the next id.
How do I ensure that all the processes start at the same time, then wait for all of them to complete before moving on to the page =... and writeFile operations?
If run as is, I get the following error: Process object is not iterable (on line 9).
Here is what I have so far:
from helpers import *
import multiprocessing

idSet = getIDset(10)

for id in idSet:
    ds1 = multiprocessing.Process(target = getDS1(id))
    ds1list1, ds1Item1, ds1Item2 = (ds1)
    ds2 = multiprocessing.Process(target = getDS2(id))
    ds3 = multiprocessing.Process(target = getDS3(id))
    ds4 = multiprocessing.Process(target = getDS4(id))
    ds5 = multiprocessing.Process(target = getDS5(id))
    movefiles = multiprocessing.Process(moveFiles(srcPath = r'Z://', src = ds1Item2 , dstPath=r'E:/new_data_dump//'))

    ## is there a better way to get them to start in unison than this?
    ds1.start()
    ds2.start()
    ds3.start()
    ds4.start()
    ds5.start()

    ## how do I know all processes are finished before moving on?
    page = +ds1+'\n' \
           +ds2+'\n' \
           +ds3+'\n' \
           +ds4+'\n' \
           +ds5+'\n'

    writeFile(r'E:/new_data_dump/', filename+'.txt', page)

I usually keep my "processes" in a list.

plist = []
for i in range(0, 5):
    p = multiprocessing.Process(target=getDS2(id))
    plist.append(p)

for p in plist:
    p.start()

... do stuff ...

for p in plist:
    p.join()  # <---- this will wait for each process to finish before continuing

Also, I think you have an issue with creating your Process. target is supposed to be a function, not the result of calling a function, as it seems you have it (unless your function returns functions).
It should look like this:

p = Process(target=f, args=('bob',))

where target is the function and args is a tuple of arguments, passed like so:

def f(name):
    print(name)
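
Putting those two points together, here is a minimal sketch of how the original loop could be restructured. It assumes getIDset, getDS1..getDS5 and writeFile behave as in the question (each getDSx takes an id and returns printable data); the per-id filename is a placeholder, since filename is not defined in the snippet:

from helpers import *   # getIDset, getDS1..getDS5, writeFile, as in the question
import multiprocessing

def fetch_all(id):
    # Run the five dataset queries for one id in parallel and block until
    # every one of them has finished. apply_async returns an AsyncResult;
    # .get() waits for that particular worker and returns its value.
    with multiprocessing.Pool(processes=5) as pool:
        pending = [pool.apply_async(fn, (id,))
                   for fn in (getDS1, getDS2, getDS3, getDS4, getDS5)]
        return [job.get() for job in pending]   # waits for every job

if __name__ == '__main__':
    for id in getIDset(10):
        page = '\n'.join(str(part) for part in fetch_all(id)) + '\n'
        writeFile(r'E:/new_data_dump/', str(id) + '.txt', page)   # placeholder filename

Because the workers return their results, there is no need to read anything back off the Process objects themselves.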

Related

Return multiprocessing results, Queue or manager.list?

Note: this question is different from that question, notably in when the jobs are dispatched to the workers and when the results are gathered.
So I have this code:
import asyncio
import time
import multiprocessing as MP
from typing import List

mp_jobqueue = MP.Queue()
mp_mgr = MP.Manager()
mp_state = mp_mgr.dict()
mp_faileds = mp_mgr.list()

# the processing in process_data_worker is very CPU-intensive,
# thus totally not suitable for async.
workers: List[MP.Process] = []
for ident in range(0, WORKER_COUNT):
    print(ident, end=" ", flush=True)
    mp_state[ident] = None
    w = MP.Process(
        target=process_data_worker,
        args=(mp_jobqueue, mp_state, mp_faileds),
    )
    w.start()
    workers.append(w)

# fetch_data asynchronously fetches chunks of data,
# each chunk will be directly fed into the job queue to be processed
# by the workers
asyncio.run(fetch_data(mp_jobqueue))

# when we reach here, all data-fetching should have been finished
# and submitted to the workers' job queue

# wait until mp_jobqueue is empty AND all workers are IDLE
safed_workers = 0
while not mp_jobqueue.empty() or safed_workers < WORKER_COUNT:
    time.sleep(1.0)
    safed_workers = sum(1 for state in mp_state.values() if state == "IDLE")

# gather failed results
faileds = list(mp_faileds)

# close manager first to prevent GetOverlappedResult error
mp_mgr.shutdown()
mp_mgr.join()

# disband the workers
[mp_jobqueue.put("DIE") for _ in workers]
time.sleep(1.0)
mp_jobqueue.close()
[w.join() for w in workers]
So as you can see, I cannot use pool.map() to gather the "faileds".
This got me thinking, though:
Would it be better (performance-wise) to use another Queue for mp_faileds instead of the managed list it is now? I only need an object that can handle "add into bag" and "take out of bag until the bag is empty".
Edit: Just found out about multiprocessing.queues.SimpleQueue. The answers to this question, notably this particular answer, seem to hint that SimpleQueue might be even faster. Can someone confirm?
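For what it's worth, here is a minimal sketch of the Queue variant being considered, assuming process_data_worker and WORKER_COUNT as in the snippet above; whether it actually beats the manager.list should be measured rather than assumed:

import multiprocessing as MP
import queue

mp_faileds = MP.Queue()   # workers would call mp_faileds.put(item) instead of .append(item)

# ... start the workers and feed the job queue exactly as above ...

# Drain the failure queue. With a plain MP.Queue this should happen before
# join()ing the workers: a process that has put items on a queue will not
# exit until its buffered items have been flushed to the underlying pipe.
faileds = []
while True:
    try:
        faileds.append(mp_faileds.get_nowait())
    except queue.Empty:
        break

Note that multiprocessing.SimpleQueue only offers put(), get() and empty(), so draining it would need a sentinel value rather than catching queue.Empty.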

Multiprocessing gremlinpython queries

Goal:
Accelerate the random walk generation by using multiple processes.
Get the list of vertices ids from which I want random walks to be generated in an input queue
Start as many processes as possible with the correct parameters
Make them put the random walks into an output queue
Wait for completion
Read the output queue
What I am doing:
# Libraries imports
from multiprocessing import cpu_count, Process, Queue
import queue
import configparser
from gremlin_python.driver.driver_remote_connection import DriverRemoteConnection
from gremlin_python.process.anonymous_traversal import AnonymousTraversalSource, traversal
from gremlin_python.process.graph_traversal import __

# Function the processes are supposed to execute
def job(proc_id:int, siq:Queue, rwq:Queue, g:AnonymousTraversalSource, length:int):
    while True:
        try:
            # Get next element in ids queue
            start_id = siq.get_nowait()
        except queue.Empty:
            # If the ids queue is empty, then terminate
            break
        else:
            # Do a random walk of length <length> from the vertex with id <start_id>
            random_walk = g.V(start_id).repeat(
                __.local(__.both().sample(1))
            ).times(length).path().next()
            print(f"{proc_id}: rw obtained")
            # Transform the list of vertices into a comma-separated string of ids
            rwq.put(",".join(
                [str(v.id) for v in random_walk]
            ))
            print(f"{proc_id}: rw handled")

if __name__ == "__main__":
    # Get the parameters from the <config.ini> configuration file
    config = configparser.RawConfigParser()
    config.read("config.ini")
    jg_uri = config["JANUSGRAPH"]["URI"]
    file_random_walks = config["FILES"]["RANDOM_WALKS"]
    walks_nb_per_node = int(config["WALKS"]["NB_PER_NODE"])
    walks_length = int(config["WALKS"]["LENGTH"])

    # Connect to Janus Graph
    connection = DriverRemoteConnection(jg_uri, "g")
    g_main = traversal().withRemote(connection)

    # Instantiate the queues and populate the ids one
    start_ids_queue = Queue()
    random_walks_queue = Queue()
    for vertex in g_main.V().has("vertex_label", "<label>").fold().next():
        start_ids_queue.put(vertex.id)

    # Create and start the processes
    nb_processes = cpu_count()
    processes = []
    for i in range(nb_processes):
        p = Process(target=job, args=(
            i,
            start_ids_queue,
            random_walks_queue,
            g_main,
            walks_length
        ))
        processes.append(p)
        p.start()

    for p in processes:
        p.join()

    # Once the processes are terminated, read the random walks queue
    random_walks = []
    while not random_walks_queue.empty():
        random_walks.append(random_walks_queue.get())

    # Do something with the random walks
    ...
Issue:
Once the processes are started, nothing seems to happen. I never get the X: rw obtained / X: rw handled messages. With a bit more logging, I can see that the queries have been sent but never finish.
In the logs, when performing the first g_main.V().has("vertex_label", "<label>").fold().next() in the main process (when I populate the ids queue), I have the following message:
DEBUG:gremlinpython:submit with bytecode '[['V'], ['has', 'vertex_label', 'movie'], ['fold']]'
DEBUG:gremlinpython:message '[['V'], ['has', 'vertex_label', '<label>'], ['fold']]'
DEBUG:gremlinpython:processor='traversal', op='bytecode', args='{'gremlin': [['V'], ['has', 'vertex_label', '<label>'], ['fold']], 'aliases': {'g': 'g'}}'
DEBUG:asyncio:Using selector: EpollSelector
When the other processes send their queries, I have similar logs:
DEBUG:gremlinpython:submit with bytecode '[['V', 16456], ['repeat', [['local', [['both'], ['sample', 1]]]]], ['times', 10], ['path']]'
DEBUG:gremlinpython:message '[['V', 16456], ['repeat', [['local', [['both'], ['sample', 1]]]]], ['times', 10], ['path']]'
DEBUG:gremlinpython:processor='traversal', op='bytecode', args='{'gremlin': [['V', 16456], ['repeat', [['local', [['both'], ['sample', 1]]]]], ['times', 10], ['path']], 'aliases': {'g': 'g'}}'
DEBUG:asyncio:Using selector: EpollSelector
The issue does not seem to lie in the query that is sent, but in the indefinite wait that follows.
If you know of an issue with gremlinpython and multiprocessing, if there is a problem in my multiprocessing code, or if there is an explanation I may have overlooked, please let me know! Thanks a lot to everyone reading this!
Solutions:
The first partial solution that I found is to use multi-threading instead of multiprocessing:
import configparser
import logging
import threading

from gremlin_python.driver.driver_remote_connection import DriverRemoteConnection
from gremlin_python.process.anonymous_traversal import AnonymousTraversalSource, traversal
from gremlin_python.process.graph_traversal import __

class myThread(threading.Thread):
    def __init__(self, thread_id, g, length):
        threading.Thread.__init__(self)
        self.thread_id = thread_id
        self.thread_count = 0
        self.gtraversal = g
        self.walk_length = length

    def run(self):
        while True:
            start_ids_list_lock.acquire()
            try:
                start_id = start_ids_list.pop(0)
                start_ids_list_lock.release()
            except IndexError:
                start_ids_list_lock.release()
                break
            else:
                self.thread_count += 1
                random_walk = job(
                    vertex_id=start_id,
                    g=self.gtraversal,
                    length=self.walk_length
                )
                random_walks_list_lock.acquire()
                random_walks_list.append(random_walk)
                random_walks_list_lock.release()
                logging.info(f"Thread {self.thread_id}: {self.thread_count} done")

def job(vertex_id:int, g:AnonymousTraversalSource, length:int) -> str:
    random_walk = g.V(vertex_id).repeat(
        __.local(__.both().sample(1))
    ).times(length).path().next()
    return ",".join(str(v.id) for v in random_walk)

config = configparser.RawConfigParser()
config.read("config.ini")
jg_uri = config["JANUSGRAPH"]["URI"]
file_random_walks = config["FILES"]["RANDOM_WALKS"]
walks_length = int(config["WALKS"]["LENGTH"])

connection = DriverRemoteConnection(jg_uri, "g")
g_main = traversal().withRemote(connection)

threads = []
start_ids_list = []
random_walks_list = []
random_walks_list_lock = threading.Lock()
start_ids_list_lock = threading.Lock()

start_ids_list = [vertex.id for vertex in g_main.V().has("vertex_label", "<label>").fold().next()]
nb_vertices = len(start_ids_list)

nb_threads = 6
for i in range(nb_threads):
    thread = myThread(
        thread_id=i,
        g=g_main,
        length=walks_length
    )
    thread.start()
    threads.append(thread)

for t in threads:
    t.join()

# Do something with the random walks
...
This solution effectively works and improves the program's execution time. It isn't a full answer though, as it doesn't explain why multiprocessing does not behave as I expected.
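One possible explanation (an assumption on my part, not something confirmed by the logs): gremlinpython drives its websocket connection through an asyncio event loop, and a DriverRemoteConnection created in the parent process does not transfer cleanly into forked worker processes, so the children's queries are submitted but their responses never arrive. Under that assumption, a sketch of the corresponding workaround is to have each worker open its own connection instead of receiving g_main through Process(...):

# Hypothetical variant of job(): each worker builds its own traversal source
# instead of reusing the one created in the parent process.
def job(proc_id: int, siq: Queue, rwq: Queue, jg_uri: str, length: int):
    connection = DriverRemoteConnection(jg_uri, "g")   # per-process connection
    g = traversal().withRemote(connection)
    try:
        while True:
            try:
                start_id = siq.get_nowait()
            except queue.Empty:
                break
            random_walk = g.V(start_id).repeat(
                __.local(__.both().sample(1))
            ).times(length).path().next()
            rwq.put(",".join(str(v.id) for v in random_walk))
    finally:
        connection.close()

# ... and in the main block, pass the URI instead of the traversal source:
# p = Process(target=job, args=(i, start_ids_queue, random_walks_queue, jg_uri, walks_length))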

Python multiprocessing finish the work correctly, but the processes still alive (Linux)

I use python multiprocessing to compute some sort of scores on DNA sequences from a large file.
For that I wrote and use the script below.
I use a Linux machine with 48 CPUs in a Python 3.8 environment.
The code works fine, finishes the work correctly, and prints the processing time at the end.
Problem: when I use the htop command, I find that all 48 processes are still alive.
I don't know why, and I don't know what to add to my script to avoid this.
import csv
import sys
import datetime
import concurrent.futures
from itertools import combinations
import psutil
import time

nb_cpu = psutil.cpu_count(logical=False)

def fun_job(seq_1, seq_2):  # seq_i : (id, string)
    start = time.time()
    score_dist = compute_score_dist(seq_1[1], seq_2[1])
    end = time.time()
    return seq_1[0], seq_2[0], score_dist, end - start  # id seq1, id seq2, score, time

def help_fun_job(nested_pair):
    return fun_job(nested_pair[0], nested_pair[1])

def compute_using_multi_processing(list_comb_ids, dict_ids_seqs):
    start = time.perf_counter()
    with concurrent.futures.ProcessPoolExecutor(max_workers=nb_cpu) as executor:
        results = executor.map(help_fun_job,
                               [((pair_ids[0], dict_ids_seqs[pair_ids[0]]), (pair_ids[1], dict_ids_seqs[pair_ids[1]]))
                                for pair_ids in list_comb_ids])
        save_results_to_csv(results)
    finish = time.perf_counter()
    proccessing_time = str(datetime.timedelta(seconds=round(finish - start, 2)))
    print(f' Processing time Finished in {proccessing_time} hh:mm:ss')

def main():
    print("nb_cpu in this machine : ", nb_cpu)
    file_path = sys.argv[1]
    dict_ids_seqs = get_dict_ids_seqs(file_path)
    list_ids = list(dict_ids_seqs)  # This will convert the dict_keys to a list
    list_combined_ids = list(combinations(list_ids, 2))
    compute_using_multi_processing(list_combined_ids, dict_ids_seqs)

if __name__ == '__main__':
    main()
Thank you for your help.
Edit: added the complete code for fun_job (after @Booboo's answer)

from Bio import Align

def fun_job(seq_1, seq_2):  # seq_i : (id, string)
    start = time.time()
    aligner = Align.PairwiseAligner()
    aligner.mode = 'global'
    score_dist = aligner.score(seq_1[1], seq_2[1])
    end = time.time()
    return seq_1[0], seq_2[0], score_dist, end - start  # id seq1, id seq2, score, time
When the with ... as executor: block exits, there is an implicit call to executor.shutdown(wait=True). This will wait for all pending futures to finish executing "and the resources associated with the executor have been freed", which presumably includes terminating the processes in the pool (if possible?). Why your program terminates (or does it?), or at least all the futures complete executing, while the processes have not terminated is a bit of a mystery. But you haven't provided the code for fun_job, so who can say why this is so?
One thing you might try is to switch to using the multiprocessing.pool.Pool class from the multiprocessing module. It supports a terminate method, implicitly called when its with (context manager) block exits, that explicitly attempts to terminate all processes in the pool:
#import concurrent.futures
import multiprocessing
... # etc.

def compute_using_multi_processing(list_comb_ids, dict_ids_seqs):
    start = time.perf_counter()
    with multiprocessing.Pool(processes=nb_cpu) as executor:
        results = executor.map(help_fun_job,
                               [((pair_ids[0], dict_ids_seqs[pair_ids[0]]), (pair_ids[1], dict_ids_seqs[pair_ids[1]]))
                                for pair_ids in list_comb_ids])
        save_results_to_csv(results)
    finish = time.perf_counter()
    proccessing_time = str(datetime.timedelta(seconds=round(finish - start, 2)))
    print(f' Processing time Finished in {proccessing_time} hh:mm:ss')
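
If it is unclear whether the pool's workers have really gone away, a quick sanity check from the parent process (a small sketch, independent of which pool implementation is used) is multiprocessing.active_children(), which joins already-finished children and returns the ones still alive:

import multiprocessing

# After the with-block has exited, any worker still running shows up here;
# an empty list means every child process has terminated.
still_alive = multiprocessing.active_children()
print(f"{len(still_alive)} child process(es) still alive: {still_alive}")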

Python: How to run multiple files at the same time?

I'm trying to create a for-loop that automatically starts different Python files at the exact same time, but they always seem to run one after another.
import os
import multiprocessing
import p1, p2, p3

# first idea
path = "C:" + "\\Users\\Max\\Desktop\\python\\tasks\\"
tasks = ['p1.py', 'p2.py', 'p3.py']
len = tasks.__len__()
ind = 0
for i in range(len):
    os.system('python' + ' ' + tasks[ind])
    ind += 1

# second idea
for x in ('p1', 'p2', 'p3'):
    p = multiprocessing.Process(target=lambda: __import__(x))
    p.start()
p1, p2, p3 are the files I'm trying to run at the same time, but they get executed one after another, so if each file's code is:

time.sleep(10)
print("hello")
I will have to wait 30 seconds for the program to be done, instead of the 10 seconds I want.
If you want to start the files in three separate interpreters, start them as subprocesses:
import subprocess

path = r"C:\Users\Max\Desktop\python\tasks"
tasks = ['1.py', '2.py', '3.py']

task_processes = [
    subprocess.Popen(r'python %s\%s' % (path, task), shell=True)
    for task in tasks
]

for task in task_processes:
    task.wait()
If you want to keep using multiprocessing, you can just encapsulate your system calls in a function:
import os
from multiprocessing import Process

path = "C:\\Users\\Max\\Desktop\\python\\tasks\\"
tasks = ['1.py', '2.py', '3.py']

def foo(task):
    os.system('python ' + path + task)

for task in tasks:
    p = Process(target=foo, args=(task,))
    p.start()
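
If the parent script should also block until every task has finished (the "waiting on jobs" part of this thread), a small variation on the snippet above keeps references to the processes and joins them:

processes = [Process(target=foo, args=(task,)) for task in tasks]
for p in processes:
    p.start()
for p in processes:
    p.join()  # blocks until that script's process has exited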
Based on OP's actual goal from a comment:
I'm trying to open different links at the same time in my browser with the webbrowser module. Essentially time.sleep(10) webbrowser.open("google.com") But the link is different in each file
we can instead use threads. I added the option for a different delay per URL, because otherwise there'd be no point in having each thread sleep on its own.
import webbrowser
import threading
import time

def delayed_open_url(delay, url):
    time.sleep(delay)
    webbrowser.open(url)

threads = []
for delay, url in [
    (3, "http://google.com"),
    (5, "http://example.com"),
    (11, "http://stackoverflow.com"),
]:
    thread = threading.Thread(target=delayed_open_url, args=(delay, url))
    thread.start()
    threads.append(thread)

for thread in threads:
    thread.join()  # Wait for each thread

# This code will be executed after each thread is done

disorder in multiprocessing subprocess

After running some computations nicely in a linear fashion with a moderator script (cf. below) calling an inner one that performs the computation, I struggle to bring it to execution when trying it with multiprocessing. It seems that each CPU core runs through this list set (testRegister) and launches a computation even if another core already performed this task earlier (in the same session). How can I prevent this chaotic behaviour? It is my first time attempting to use multiple processors with Python.
Correction: the initial post did not show that each test is a string that calls "the inner script" with varying parameters m1 and m2 besides fixed arguments arg1 and arg2 belonging solely to this "inner script".
#!/usr/bin/env python3
import os
import subprocess as sub
import sys
import multiprocessing

fileRegister = []
testRegister = []

def fileCollector():
    for file in os.listdir("."):
        if file.endswith(".xyz"):
            fileRegister.append(file)
    fileRegister.sort()
    return fileRegister

def testSetup():
    data = fileRegister
    while len(data) > 1:
        for entry in fileRegister[1:]:
            m1 = str(fileRegister[0])
            m2 = str(entry)
            test = str("python foo.py ") + str(m1) + str(" ") + str(m2) +\
                str(" --arg1 --arg2")  # formulate test condition
            testRegister.append(test)
        testRegister.sort()
        del data[0]
    return testRegister

def shortAnalysator():
    for entry in testRegister:
        print(str(entry))
        sub.call(entry, shell=True)
        del testRegister[0]

def polyAnalysator():
    # apparently each CPU core works as if the register were not shared
    # reference: https://docs.python.org/3.7/library/multiprocessing.html
    if __name__ == '__main__':
        jobs = []
        for i in range(3):  # safety margin to not consume all CPU
            p = multiprocessing.Process(target=shortAnalysator)
            jobs.append(p)
            p.start()

fileCollector()
testSetup()
shortAnalysator()  # proceeding expectably on one CPU (slow)
# polyAnalysator()  # causing irritation
sys.exit()
Your polyAnalysator is running the shortAnalysator three times. Try changing your polyAnalysator as follows, adding the f function. This uses the multiprocessing Pool:
from multiprocessing import Pool

def f(test):
    sub.call(test, shell=True)

def polyAnalysator():
    # apparently each CPU core works as if the register were not shared
    # reference: https://docs.python.org/3.7/library/multiprocessing.html
    with Pool(3) as p:
        p.map(f, testRegister)
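
A brief usage note (a sketch, assuming the rest of the module stays as in the question): with the Pool version the __main__ guard belongs at module level, and the registers must be filled before the pool fans the tests out:

if __name__ == '__main__':
    fileCollector()
    testSetup()
    polyAnalysator()  # each test string is run exactly once by the pool workers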
