How to use a multiprocessing Pool to update a global dictionary - Python

I'm trying to use a Python multiprocessing pool to update a global dictionary named globalDict. I expect globalDict to end up as {'0': 0, '1': 1, '2': 2}, but after the code runs the dictionary is still empty. Please help me fix this issue; the code is below:
from multiprocessing import Pool
import time

def f(x):
    global globalDict  # update this dictionary every time the function is called
    globalDict.setdefault(str(x), x)
    return globalDict

def init_pool(dictX):
    # function to initialize the global dictionary
    global globalDict
    globalDict = dictX

if __name__ == '__main__':
    start = time.time()
    globalDict = {}
    pool = Pool(initializer=init_pool, initargs=(globalDict,))  # initialize the global dictionary
    pool.map(f, range(3))  # use the pool to call f()
    pool.close()
    pool.join()
    stop = time.time()
    print('Done in {:4f}'.format(stop-start))

The problem is that each pool worker runs in its own process with its own copy of globalDict, so updates made in the workers never reach the dictionary in the main process. A solution is to use a managed dictionary. There is also no need to return the dictionary from the worker function, f:
from multiprocessing import Pool, Manager
import time

def f(x):
    globalDict.setdefault(str(x), x)

def init_pool(dictX):
    # function to initialize the global dictionary
    global globalDict
    globalDict = dictX

if __name__ == '__main__':
    start = time.time()
    with Manager() as manager:
        globalDict = manager.dict()
        pool = Pool(initializer=init_pool, initargs=(globalDict,))  # initialize the global dictionary
        pool.map(f, range(3))  # use the pool to call f()
        pool.close()
        pool.join()
        stop = time.time()
        print('Done in {:4f}'.format(stop-start))
        print(globalDict)
Prints:
Done in 0.606996
{'0': 0, '2': 2, '1': 1}
If you want to end up with a "regular" dictionary that no longer depends on the SyncManager instance returned by the call to Manager(), then after the call to map completes, add the following statement:
regular_dict = {k: v for k, v in globalDict.items()}
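Note that this conversion has to happen while the with Manager() as manager: block is still active; once the manager shuts down, the proxy can no longer be read.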
Or, if you want to get clever, you can create your own managed dictionary type (we will call it Dict) that only supports the one method we need, setdefault, and dispatches that method call to an underlying dict that we will be able to retrieve when our call to map completes:
from multiprocessing import Pool
from multiprocessing.managers import BaseManager
import time

class DictManager(BaseManager):
    pass

class Dict:
    def __init__(self):
        self._dict = {}

    def setdefault(self, *args):
        return self._dict.setdefault(*args)

    def get_underlying_dict(self):
        return self._dict

def f(x):
    globalDict.setdefault(str(x), x)

def init_pool(dictX):
    # function to initialize the global dictionary
    global globalDict
    globalDict = dictX

if __name__ == '__main__':
    start = time.time()
    DictManager.register('Dict', Dict)
    with DictManager() as manager:
        globalDict = manager.Dict()
        pool = Pool(initializer=init_pool, initargs=(globalDict,))  # initialize the global dictionary
        pool.map(f, range(3))  # use the pool to call f()
        pool.close()
        pool.join()
        stop = time.time()
        print('Done in {:4f}'.format(stop-start))
        regular_dict = globalDict.get_underlying_dict()
        print(regular_dict)
Prints:
Done in 0.460001
{'0': 0, '1': 1, '2': 2}
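Note that get_underlying_dict is invoked through the proxy, so the dict it returns is pickled in the manager's process and sent back; what the main process ends up holding in regular_dict is therefore an ordinary dict, not another proxy.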

Try this; you will get output like {0: '0', 1: '1', 2: '2'}:
from multiprocessing import Pool
import time

def f(x):
    global globalDict  # update this dictionary every time the function is called
    globalDict.setdefault(str(x), x)
    return globalDict

def init_pool(dictX):
    # function to initialize the global dictionary
    global globalDict
    globalDict = dictX

if __name__ == '__main__':
    start = time.time()
    globalDict = {}
    pool = Pool(initializer=init_pool, initargs=(globalDict,))  # initialize the global dictionary
    result = pool.map(f, range(3))  # use the pool to call f()
    pool.close()
    pool.join()
    stop = time.time()
    print('Done in {:4f}'.format(stop-start))
    for i in result:
        res = {val: key for key, val in i.items()}
        print(res)

Related

How do I use multiprocessing.Value (or other shared resources) on Mac with Spawn?

Example:
import multiprocessing as mp

counter = mp.Value('i', 0)

def test_fun(i):
    global counter
    with counter.get_lock():
        counter.value += 1

def main():
    global counter
    with mp.Pool(4) as p:
        result = p.map(test_fun, range(4))
    print(counter.value)

if __name__ == "__main__":
    main()
The expected output is 4, since the value is shared, but on Mac it prints 0.
It works on Linux, or when using fork, but I'd like it to work with spawn.
Just move the counter declaration into your main function, then set it as a global variable inside the children's initializer, the same way it's done on Windows:
import multiprocessing as mp

def setup_fn(value):
    global counter
    counter = value

def test_fun(i):
    global counter
    with counter.get_lock():
        counter.value += 1

def main():
    counter = mp.Value('i', 0)
    with mp.Pool(4, initializer=setup_fn, initargs=(counter,)) as p:
        result = p.map(test_fun, range(4))
    print(counter.value)

if __name__ == "__main__":
    main()
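This works because, under the spawn start method, each worker re-imports the main module, so a Value created at module level becomes a brand-new, independent object in every child (which is why the original version printed 0). Passing the parent's Value through initargs is what actually hands the shared counter to the workers.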

How to get function return value from a multiprocessing.Process?

I have two functions and I want to run them simultaneously. So, I tried:
from multiprocessing import Process

def func1():
    for i in range(0, 100000000):
        pass
    return 'abc'

def func2():
    for i in range(0, 100000000):
        pass
    return 'xyz'

if __name__ == '__main__':
    p1 = Process(target=func1()).start()
    p2 = Process(target=func2()).start()
Now, how can I get the return value of, for example, func1()?
I could call print(func1()) to get the returned value, but that executes the function again and takes a long time again. How do I get the result without causing the function to run again?
Here's how it could be done using a multiprocessing.Pool as I suggested in a comment.
import multiprocessing

def func1():
    for i in range(0, 100000000):
        pass
    return 'abc'

def func2():
    for i in range(0, 100000000):
        pass
    return 'xyz'

if __name__ == '__main__':
    funcs = func1, func2
    with multiprocessing.Pool() as pool:
        results = [pool.apply_async(func) for func in funcs]
        pool.close()
        pool.join()
        results = [result.get() for result in results]
    print(f'{results}')  # -> ['abc', 'xyz']
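Note that result.get() will also re-raise any exception that occurred inside the worker, so a failure in func1 or func2 is not silently swallowed.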
You can also use the concurrent.futures module:
import concurrent.futures

_pool = concurrent.futures.ThreadPoolExecutor()

def func1():
    for i in range(0, 100000000):
        pass
    return 'abc'

def func2():
    for i in range(0, 100000000):
        pass
    return 'xyz'

if __name__ == '__main__':
    p1 = _pool.submit(func1)
    p2 = _pool.submit(func2)
    print(p1.result(), p2.result())
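If you specifically want to keep using multiprocessing.Process rather than a pool, a common pattern is to pass a Queue into each process and have the function put its result on it. The following is only a minimal sketch of that idea (it is not taken from the answers above):

from multiprocessing import Process, Queue

def func1(q):
    for i in range(0, 100000000):
        pass
    q.put('abc')  # send the result back to the parent process

def func2(q):
    for i in range(0, 100000000):
        pass
    q.put('xyz')

if __name__ == '__main__':
    q1, q2 = Queue(), Queue()
    p1 = Process(target=func1, args=(q1,))
    p2 = Process(target=func2, args=(q2,))
    p1.start()
    p2.start()
    result1 = q1.get()  # blocks until func1 has put its result
    result2 = q2.get()
    p1.join()
    p2.join()
    print(result1, result2)  # -> abc xyz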

Updating a member variable of an object while using a multiprocessing pool

I have a class B which is composed of another class A.
In class B I am using a multiprocessing pool to call a method of class A. This method updates a member variable of A (which is a dict).
When I print out this member variable it doesn't seem to have been updated. Here is the code describing the issue:
import multiprocessing as mp

class A():
    def __init__(self):
        self.aDict = {'key': 0}

    def set_lock(self, lock):
        self.lock = lock

    def do_work(self, item):
        print("Doing work for item: {}".format(item))
        self.aDict['key'] += 1
        return [1, 2, 3]  # return some list

class B():
    def __init__(self):
        self.objA = A()

    def run_with_mp(self):
        items = ['item1', 'item2']
        with mp.Pool(processes=mp.cpu_count()) as pool:
            result = pool.map_async(self.objA.do_work, items)
            result.wait()
            pool.terminate()
        print(self.objA.aDict)

    def run(self):
        items = ['item1', 'item2']
        for item in items:
            self.objA.do_work(item)
        print(self.objA.aDict)

if __name__ == "__main__":
    b = B()
    b.run_with_mp()  # prints {'key': 0}
    b.run()  # prints {'key': 2}
b.run_with_mp() prints {'key': 0} while b.run() prints {'key': 2}. I thought the multiprocessing pool version would do the same, since the object self.objA is in scope for the whole of class B, where the multiprocessing pool runs.
I think each worker in the pool sees its own copy of self.objA, different from the one in the main program flow. Is there a way to make all the workers update a common variable?
You are close to the explanation: indeed, each spawned process holds its own area of memory, which means they are independent. When you run do_work, each process updates its own version of aDict because that variable is not shared. If you want to share a variable, the easiest way is to use a Manager, for example:
import multiprocessing as mp

class A():
    def __init__(self):
        self.aDict = mp.Manager().dict({'key': 0})

    def set_lock(self, lock):
        self.lock = lock

    def do_work(self, item):
        print("Doing work for item: {}".format(item))
        self.aDict['key'] += 1
        return [1, 2, 3]  # return some list

class B():
    def __init__(self):
        self.objA = A()

    def run_with_mp(self):
        items = ['item1', 'item2']
        with mp.Pool(processes=mp.cpu_count()) as pool:
            result = pool.map_async(self.objA.do_work, items)
            result.wait()
            pool.terminate()
        print(self.objA.aDict)

    def run(self):
        items = ['item1', 'item2']
        for item in items:
            self.objA.do_work(item)
        print(self.objA.aDict)

if __name__ == "__main__":
    b = B()
    b.run_with_mp()  # prints {'key': 2}
    b.run()  # prints {'key': 4}
I modified your example to share the aDict variable, so every process updates that property (in both the run_with_mp and run methods). Consider reading more in the docs.
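A short note on why this works: the DictProxy returned by mp.Manager().dict() is picklable, so when self.objA is sent to each worker, every copy of the object reconnects to the same manager process, and all the workers end up mutating the same underlying dictionary.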

How to use multiprocessing in a class?

I want to use multiprocessing to do the following:
class myClass:
    def proc(self):
        # processing random numbers
        return a

    def gen_data(self):
        with Pool(cpu_count()) as q:
            data = q.map(self.proc, [_ for i in range(cpu_count())])  # What is the correct approach?
        return data
Try this. Pool.map passes each item of the iterable as an argument to the worker function, so proc needs to accept it:
def proc(self, i):
    # processing random numbers
    return a

def gen_data(self):
    with Pool(cpu_count()) as q:
        data = q.map(self.proc, [i for i in range(cpu_count())])
    return data
Since you don't have to pass an argument to the processes, there's no reason to use map; just call apply_async() as many times as needed.
Here's what I mean:
from multiprocessing import cpu_count
from multiprocessing.pool import Pool
from random import randint

class MyClass:
    def proc(self):
        # processing random numbers
        return randint(1, 10)

    def gen_data(self, num_procs):
        with Pool() as pool:  # The default pool size will be the number of cpus.
            results = [pool.apply_async(self.proc) for _ in range(num_procs)]
            pool.close()
            pool.join()  # Wait until all worker processes exit.
        return [result.get() for result in results]  # Gather results.

if __name__ == '__main__':
    obj = MyClass()
    print(obj.gen_data(8))
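One thing worth knowing: passing self.proc to the pool works because, in Python 3, bound methods are picklable as long as the instance itself is, so each worker receives a copy of the MyClass instance along with the method to call.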

Better way to get results from multiple threads

What I want to do is be able to call a function from multiple threads and get their results.
I have the following code (it is an example; the actual code doesn't simply convert str to int):
from threading import Thread
import time
import Queue

# an example - the actual code connects to a server
def str_to_int(arg, queue):
    result = 0
    result = int(arg)
    # sleep to check that they happen at once
    time.sleep(10)
    queue.put(result)

def combine():
    q1 = Queue.Queue()
    q2 = Queue.Queue()
    q3 = Queue.Queue()
    t1 = Thread(target=str_to_int, args=("111", q1))
    t2 = Thread(target=str_to_int, args=("222", q2))
    t3 = Thread(target=str_to_int, args=("333", q3))
    t1.start()
    t2.start()
    t3.start()
    t1.join()
    t2.join()
    t3.join()
    return (q1.get(), q2.get(), q3.get())

print combine()
This code works, and I get the expected results:
>>>
(111, 222, 333)
However, there must be a better way to do this.
I plan on having many more threads than 3, but even if I were staying with only 3, it seems very ugly.
EDIT: I need to be able to know which result came from which thread (i.e., from which parameters/arguments I gave the function).
Here is some advice:
Queues are thread-safe, so use one queue to pass results.
You can create all the threads in a loop and use that single queue to pass results; you don't need an explicit variable for each thread.
So here's what your code might look like:
def str_to_int(arg, queue):
    result = int(arg)
    queue.put({arg: result})

def combine():
    arguments = ('111', '222', '333')
    q = Queue.Queue()
    threads = []
    for argument in arguments:
        t = Thread(target=str_to_int, args=(argument, q))
        t.start()
        threads.append(t)
    for t in threads:
        t.join()
    return [q.get() for _ in xrange(len(arguments))]
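Since every item placed on the queue is a single-key dict keyed by the original argument, you can still tell which result belongs to which input, even though items arrive in completion order rather than in the order the threads were started.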
I need to be able to know which result came from which thread (i.e., from which parameters/arguments I gave the function).
I use a function like this to keep track of which result came from which task:
from threading import Thread
import typing

def run_together(tasks: typing.Dict[typing.Hashable, tuple],
                 except_errors: tuple = None) -> dict:
    """
    :param tasks: a dict of task keys matched to a tuple of a callable and its arguments,
        e.g. tasks = {'task1': (add, 1, 2), 'task2': (add, 3, 4)}
    :param except_errors: a tuple of errors that should be caught. Defaults to all errors
    :return: a dictionary of results with the same keys as the tasks parameter,
        e.g. results = {'task1': 3, 'task2': 7}
    """
    # catch all exceptions by default
    if not except_errors:
        except_errors = (Exception,)

    threads = []
    results = dict()

    def save_results(f, key):
        def wrapped(*args, **kwargs):
            try:
                result = f(*args, **kwargs)
            except except_errors as e:
                result = e
            results[key] = result
        return wrapped

    for key, (func, *args) in tasks.items():
        thread = Thread(
            target=save_results(func, key),
            args=args
        )
        thread.start()
        threads.append(thread)

    for t in threads:
        t.join()

    return results
The tasks parameter is a dictionary mapping keys to a tuple of a callable and its arguments. You can modify the save_results wrapper to collect results into a list instead if you want (see the sketch after the example below).
So you can do:
def add(first, second):
    return first + second

tasks = {f'result:{n}': (add, n, 1) for n in range(4)}

results = run_together(tasks)
print(results)
which gives:
{'result:0': 1, 'result:1': 2, 'result:2': 3, 'result:3': 4}
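For completeness, here is one way the wrapper could be changed to collect results into a list instead of a dict, as mentioned above. This is only a sketch under the same assumptions as the answer (the error handling is omitted for brevity); run_together_list is a hypothetical name, not part of the original code:

from threading import Thread
import typing

def run_together_list(tasks: typing.List[tuple]) -> list:
    """Run each (callable, *args) tuple in its own thread and return
    the results in the same order as the input list."""
    threads = []
    results = [None] * len(tasks)  # pre-sized so each thread writes only its own slot

    def save_result(f, index):
        def wrapped(*args):
            results[index] = f(*args)
        return wrapped

    for index, (func, *args) in enumerate(tasks):
        thread = Thread(target=save_result(func, index), args=args)
        thread.start()
        threads.append(thread)

    for t in threads:
        t.join()
    return results

For example, with the add function above, run_together_list([(add, n, 1) for n in range(4)]) returns [1, 2, 3, 4].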
