I need to have multiprocessing pools available to multiple methods in multiple files as global variables.
I have tried this using a setup with 5 files:
main.py
pool_settings.py
population_calculation.py
individual_calculation.py
inside_of_individual_calculation.py
Here is the code of all files:
pool_settings.py
numberOfIndividuals = 10
workersPerIndividuals = 5

import multiprocessing as mp

def init():
    global pool
    pool = mp.Pool(numberOfIndividuals)
    global listOfPools
    listOfPools = []
    for i in range(0,numberOfIndividuals):
        listOfPools.append(mp.Pool(workersPerIndividuals))
population_calculation.py
import multiprocessing as mp
import pool_settings
import individual_calculation as ic

def calculatePopulation():
    results_population = pool_settings.pool.map_async(ic.calculateIndividual, range(0,pool_settings.numberOfIndividuals)).get()
    print(results_population)
individual_calculation.py
import multiprocessing as mp
import pool_settings
import numpy as np
import inside_of_individual_calculation as ioic

def calculateIndividual(individual):
    data = []
    for i in range(0,pool_settings.workersPerIndividuals):
        data.append(np.random.rand(100))
    results_individual = pool_settings.listOfPools[mp.current_process()._identity[0]-1].map_async(ioic.calculateInsideIndividual,data).get()
    return sum(results_individual)
inside_of_individual_calculation.py
import numpy as np

def calculateInsideIndividual(individualRow):
    return individualRow.sum()
main.py
import pool_settings
import population_calculation
pool_settings.init()
population_calculation.calculatePopulation()
When I run main.py (python main.py) I get the following error:
"AttributeError: module 'pool_settings' has no attribute 'listOfPools'"
I have tried multiple ways and I always get the same error. How can I set up a multiprocessing pool as a global variable so that it is accessible to multiple methods in multiple files?
Thanks a lot,
Joe
P.S.: I also tried a setup in which each process of a multiprocessing pool would spin up another multiprocessing pool of its own, but that didn't work either.
You may change pool_settings.py like this:
numberOfIndividuals = 10
workersPerIndividuals = 5

import multiprocessing as mp

def init():
    pool = mp.Pool(numberOfIndividuals)
    global listOfPools
    listOfPools = []
    for i in range(0,numberOfIndividuals):
        listOfPools.append(mp.Pool(workersPerIndividuals))
    return pool, listOfPools

pool, listOfPools = init()
That will allow you to use either from pool_settings import pool, or import pool_settings followed by pool_settings.pool.
pool and listOfPools will be initialized automatically on the first import of pool_settings (instead of you having to call pool_settings.init() manually).
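With this version, main.py no longer needs to call pool_settings.init(). A minimal sketch of how main.py could then look (assuming the other files stay as in the question):
main.py
import pool_settings
import population_calculation

# pool_settings.pool and pool_settings.listOfPools already exist here,
# because init() ran when pool_settings was first imported.
population_calculation.calculatePopulation()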
I'm implementing this with SimpleXMLRPCServer & Redis.
This is some example code of how I create the workers (the workers are created from the client).
Now I want to implement fault tolerance, so that if the main cluster fails (e.g. Ctrl+C or any exception), I can iterate over my workers dictionary and select one of them as the new leader to become the master (cluster).
This is the cluster main:
if __name__ == '__main__':
    #-------------------- Start Redis & XMLRPC Server --------------------#
    print("Starting Redis & XMLRPC Server...")
    REDIS_SERVER = redis.Redis()
    server = SimpleXMLRPCServer(('localhost', 9000), allow_none=True)
    server.register_function(add_worker, "add_worker")
    # ... other functions
    server.register_function(missatge_worker_eliminat, "missatge_worker_eliminat")
    try:
        print('Control-C to exit')
        server.serve_forever()
    except KeyboardInterrupt:
        print('Exiting, but there is another cluster open now!')
from audioop import add
from decimal import MIN_EMIN
from glob import glob
from xmlrpc.server import SimpleXMLRPCServer
import multiprocessing as mp
import xmlrpc.client
from multiprocessing import Process
import json
import redis
import logging
import pandas as pd
import dask.dataframe as dd
import uuid
import os
import time, threading
import asyncio
def add_worker():
    global WORKERS
    global WORKER_ID
    global processos
    proc = Process(target=start_worker, args=(REDIS_SERVER,))
    proc.start()
    WORKERS[WORKER_ID] = proc
    WORKER_ID += 1
    processos.append(proc)
    return "[!] Worker with ID: {id} succesfully added.".format(id=WORKER_ID)
Here is some pseudocode for what I'm doing:
import multiprocessing as mp
from multiprocessing import Manager
from tqdm import tqdm

def loop(arg):
    # do stuff
    # ...
    results.append(result_of_stuff)

if __name__ == '__main__':
    manager = Manager()
    results = manager.list()
    with mp.get_context('spawn').Pool(4) as pool:
        list(tqdm(pool.imap(loop, ls), total=len(ls)))
    # do stuff with `results`
    # ...
So the issue here is that loop doesn't know about results. I have one working way to do this, and that is by using "fork" instead of "spawn". But I need to use "spawn" for reasons beyond the scope of my question.
So what is the minimal change I need to make for this to work? And I really want to keep tqdm, hence the use of imap.
PS: I'm on Linux
You can use functools.partial to add the extra parameters:
import multiprocessing as mp
import os
from functools import partial
from multiprocessing import Manager
from tqdm import tqdm

def loop(results, arg):
    results.append(len(arg))

def main():
    ctx = mp.get_context("spawn")
    manager = Manager()
    l = manager.list()
    partial_loop = partial(loop, l)
    ls = os.listdir("/tmp")
    with ctx.Pool() as pool:
        results = list(tqdm(pool.imap(partial_loop, ls), total=len(ls)))
    print(f"Sum: {sum(l)}")

if __name__ == "__main__":
    main()
There is some overhead with this approach as it will spawn a child process to host the Manager server.
Since you will process the results in the main process anyway, I would do something like this instead (but this depends on your circumstances, of course):
import multiprocessing as mp
import os
from tqdm import tqdm

def loop(arg):
    return len(arg)

def main():
    ctx = mp.get_context("spawn")
    ls = os.listdir("/tmp")
    with ctx.Pool() as pool:
        results = list(tqdm(pool.imap(loop, ls), total=len(ls)))
    print(f"Sum: {sum(results)}")

if __name__ == "__main__":
    main()
I know you have already accepted an answer, but let me add my "two cents":
The other way of solving your issue is to initialize each process in your pool with the global variable results, as originally intended. The problem was that when using spawn, newly created processes do not inherit the address space of the main process (which includes the definition of results); instead, execution starts from the top of the program. But the code that creates results never gets executed in the children because of the if __name__ == '__main__' check. That is a good thing, though, because you do not want a separate instance of this list in each process anyway.
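As a quick illustration of that point, here is a minimal, hypothetical demo (not part of the original question) showing that a global created under the __main__ guard simply does not exist inside a spawned worker:
# spawn_demo.py (hypothetical file name)
import multiprocessing as mp

def show():
    # A spawned child re-imports this module from the top, and the
    # __main__ guard below is skipped there, so 'results' is undefined.
    try:
        print('child sees results:', results)
    except NameError:
        print("child: 'results' is not defined")

if __name__ == '__main__':
    results = []  # only exists in the parent process
    with mp.get_context('spawn').Pool(1) as pool:
        pool.apply(show)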
So how do we share the same instance of the global variable results across all processes? This is accomplished by using a pool initializer, as shown below. Also, if you want an accurate progress bar, you should really use imap_unordered instead of imap, so that the progress bar is updated in task-completion order rather than in the order in which tasks were submitted. For example, if the first task submitted happens to be the last task to complete, then using imap would result in the progress bar not moving until all the tasks had completed, at which point it would shoot to 100% all at once.
But note: the documentation for imap_unordered only states that the results will be returned in arbitrary order, not in completion order. It does, however, seem that when a chunksize argument of 1 is used (the default if not explicitly specified), the results are returned in completion order. If you do not want to rely on this, then use apply_async instead, specifying a callback function that will update the progress bar. See the last code example.
import multiprocessing as mp
from multiprocessing import Manager
from tqdm import tqdm

def init_pool(the_results):
    global results
    results = the_results

def loop(arg):
    import time
    # do stuff
    # ...
    time.sleep(1)
    results.append(arg ** 2)

if __name__ == '__main__':
    manager = Manager()
    results = manager.list()
    ls = list(range(1, 10))
    with mp.get_context('spawn').Pool(4, initializer=init_pool, initargs=(results,)) as pool:
        list(tqdm(pool.imap_unordered(loop, ls), total=len(ls)))
    print(results)
Update: Another (Better) Way
import multiprocessing as mp
from tqdm import tqdm

def loop(arg):
    import time
    # do stuff
    # ...
    time.sleep(1)
    return arg ** 2

if __name__ == '__main__':
    results = []
    ls = list(range(1, 10))
    with mp.get_context('spawn').Pool(4) as pool:
        with tqdm(total=len(ls)) as pbar:
            for v in pool.imap_unordered(loop, ls):
                results.append(v)
                pbar.update(1)
    print(results)
Update: The Safest Way
import multiprocessing as mp
from tqdm import tqdm

def loop(arg):
    import time
    # do stuff
    # ...
    time.sleep(1)
    return arg ** 2

def my_callback(v):
    results.append(v)
    pbar.update(1)

if __name__ == '__main__':
    results = []
    ls = list(range(1, 10))
    with mp.get_context('spawn').Pool(4) as pool:
        with tqdm(total=len(ls)) as pbar:
            for arg in ls:
                pool.apply_async(loop, args=(arg,), callback=my_callback)
            pool.close()
            pool.join()
    print(results)
Below is my code for running two Python scripts in parallel using multiprocessing:
defs.py
import os

def pro(process):
    #print(process)
    os.system('python {}'.format(process))
Multiprocessing.py
import os
from multiprocessing import Pool
import multiprocessing as mp
import defs
import datetime
import pandas as pd

processes = ('python_code1.py','python_code2.py')

if __name__ == '__main__':
    pool = Pool(processes=4)
    start = datetime.datetime.now()
    print('Start:',start)
    pool.map(defs.pro, processes)
    end = datetime.datetime.now()
    print('End :',end)
    total = end-start
    print('Total :', end-start)
This code runs perfectly fine, but my requirement is that I need to run the Python scripts 'python_code1.py' and 'python_code2.py' from two different directories.
So I made the changes below in Multiprocessing.py:
path1 = r'C:\Users\code1\python_code1.py'
path2 = r'C:\Users\code2\python_code2.py'
processes = (path1,path2)
But this is not working for me.
My Multiprocessing.py and defs.py are kept in C:\Users\Multiprocessing\.
Well, here is an elegant solution using asyncio. It is used as a foundation for multiple Python asynchronous frameworks that provide high-performance network and web servers, database connection libraries, distributed task queues, etc. Plus, it has both high-level and low-level APIs to accommodate any kind of problem, and you might find the syntax easier, as I do:
import os
import asyncio

def background(f):
    def wrapped(*args, **kwargs):
        return asyncio.get_event_loop().run_in_executor(None, f, *args, **kwargs)
    return wrapped

@background
def pro(process):
    #print(process)
    os.system('python {}'.format(process))

processes = (r'C:\Users\code1\python_code1.py',r'C:\Users\code2\python_code2.py')
for process in processes:
    pro(process)
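If you also need the script to block until every command has finished, one option (a sketch building on the background decorator above, not part of the original answer) is to replace the plain loop with one that keeps the futures returned by run_in_executor and then waits on them:
import asyncio

# keep the asyncio futures returned by the decorated pro() calls...
futures = [pro(process) for process in processes]
# ...and block until all of them have completed
asyncio.get_event_loop().run_until_complete(asyncio.gather(*futures))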
There is a detailed answer on parallelizing a for loop that you might find useful.
I need help with how to modify/fix my code so that I can control what is occurring in a process. I have looked around and read that I need to either make a global variable which the process can read, or use an event function to trigger the process. The problem, though, is that I don't know how to implement either of them in a class function. I thought that if I followed the pyimagesearch code it would work, but it appears that it only works with the threading module and not the multiprocessing module.
import RPi.GPIO as GPIO
from RPi.GPIO import LOW,OUT,HIGH,BCM
import multiprocessing as mp
import time

class TestClass():
    def __init__(self,PinOne=22,PinTwo=27):
        self.PinOne = PinOne
        self.PinTwo = PinTwo
        self.RunningSys = True
        GPIO.setmode(BCM)
        GPIO.setup(PinOne,OUT)
        GPIO.output(PinOne,LOW)
        GPIO.setup(PinTwo,OUT)
        GPIO.output(PinTwo,LOW)

    def Testloop(self):
        while self.RunningSys:
            GPIO.output(PinOne,HIGH)
            GPIO.output(PinTwo,HIGH)
            time.sleep(1)
            GPIO.output(PinOne,LOW)
            GPIO.output(PinTwo,LOW)
        GPIO.output(PinOne,LOW)
        GPIO.output(PinTwo,LOW)

    def StopPr(self):
        self.RunningSys = False

    def MProc(self):
        MPGP = mp.process(target=TestClass().Testloop())
        MPGP.start()
        MPGP.join()
In a separate script:
from testfile import TestClass
import time

TestClass().MProc()
time.sleep(4)
TestClass().StopPr()
I'm new to Python and having trouble with the Python queue. I'm initializing the queue in my __init__ constructor, but when I run my Python app it crashes. I've included a snippet of my code; is there a better way to do it?
import os, sys
import time

if(sys.hexversion < 0x03000000):
    import Queue
else:
    import queue as Queue

class Appwindow():
    def __init__(self):
        self.myQueue = Queue.Queue()

    def displayMeth(self, stuff):
        if self.displayed:
            self.myQueue.put(stuff)
try:
    from queue import Queue
except ImportError:
    from Queue import Queue

# shiny fancy Atomic Message Queue for concurrency
q = Queue()
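For example, here is a short sketch of how the class from the question could use that compatibility import (the self.displayed flag is an assumption here, since the original snippet never sets it, which is one likely cause of the crash):
try:
    from queue import Queue   # Python 3
except ImportError:
    from Queue import Queue   # Python 2

class Appwindow():
    def __init__(self):
        self.displayed = True  # assumed flag; it must exist before displayMeth runs
        self.myQueue = Queue()

    def displayMeth(self, stuff):
        if self.displayed:
            self.myQueue.put(stuff)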