Pass by reference between processes - Python

I have an object:
from multiprocessing import Pool
import time

class ASYNC(object):
    def __init__(self, THREADS=[]):
        print('do')
        pool = Pool(processes=len(THREADS))
        self.THREAD_POOL = {}
        thread_index = 0
        for thread_ in THREADS:
            self.THREAD_POOL[thread_index] = {
                'thread': thread_['thread'],
                'args': thread_['args'],
                'callback': thread_['callback']
            }
            self.THREAD_POOL[thread_index]['running'] = True
            pool.apply_async(self.run, [thread_index], callback=thread_['callback'])
            thread_index += 1

    def run(self, thread_index):
        print('enter')
        while(self.THREAD_POOL[thread_index]['running']):
            print("loop")
            self.THREAD_POOL[thread_index]['thread'](self.THREAD_POOL[thread_index])  #HERE
            time.sleep(1)
        self.THREAD_POOL[thread_index]['running'] = False

    def wait_for_finish(self):
        for pool in self.THREAD_POOL:
            while(self.THREAD_POOL[pool]['running']):
                print("sleep" + str(self.THREAD_POOL[pool]['running']))
                time.sleep(1)

def x(pool):  #HERE
    print(str(pool))
    if(pool['args'][0] >= 15):
        pool['running'] = False
    pool['args'][0] += 1

def y(str):
    print("done")

A = ASYNC([{'thread': x, 'args': [10], 'callback': y}])
print("start")
A.wait_for_finish()
I am having issues passing self.THREAD_POOL[thread_index] as a reference to def x(pool).
I need x(pool) to change the value of the variable in the object.
If I check the value in wait_for_finish, the object has not changed.
Passing an object by reference (tested and works as expected):
x = {"1": "one", "2": "two"}

def test(a):
    a["1"] = "ONE"

test(x)
print(x["1"])  # outputs ONE as expected
This means that dictionaries in Python are passed by reference, so why does my code behave as if it were passing by value?
SOLUTION
# DevShark
from multiprocessing import Process, Value, Array

def f(n, a):
    n.value = 3.1415927
    for i in range(len(a)):
        a[i] = -a[i]

if __name__ == '__main__':
    num = Value('d', 0.0)
    arr = Array('i', range(10))

    p = Process(target=f, args=(num, arr))
    p.start()
    p.join()

    print(num.value)
    print(arr[:])
According to the documentation, you should not do this unless absolutely needed, so I decided not to use it: https://docs.python.org/2/library/multiprocessing.html#multiprocessing.JoinableQueue
Instead I will be doing:
from multiprocessing import Pool
import time

class ASYNC(object):
    def __init__(self, THREADS=[]):
        print('do')
        pool = Pool(processes=len(THREADS))
        self.THREAD_POOL = {}
        thread_index = 0
        for thread_ in THREADS:
            self.THREAD_POOL[thread_index] = {
                'thread': thread_['thread'],
                'args': thread_['args'],
                'callback': thread_['callback']
            }
            self.THREAD_POOL[thread_index]['running'] = True
            pool.apply_async(self.run, [thread_index], callback=thread_['callback'])
            thread_index += 1

    def run(self, thread_index):
        print('enter')
        while(self.THREAD_POOL[thread_index]['running']):
            print("loop")
            self.THREAD_POOL[thread_index]['thread'](thread_index)
            time.sleep(1)
        self.THREAD_POOL[thread_index]['running'] = False

    def wait_for_finish(self):
        for pool in self.THREAD_POOL:
            while(self.THREAD_POOL[pool]['running']):
                print("sleep" + str(self.THREAD_POOL[pool]['running']))
                time.sleep(1)

def x(index):
    global A
    pool = A.THREAD_POOL[index]
    print(str(pool))
    if(pool['args'][0] >= 15):
        pool['running'] = False
    pool['args'][0] += 1

def y(str):
    print("done")

A = ASYNC([{'thread': x, 'args': [10], 'callback': y}])
print("start")
A.wait_for_finish()

You are running your function in a different process. That's how multiprocessing works, so no matter what you do to the object, the modifications will not be seen by the other processes.
To share data between processes, see the docs, as someone noted in a comment:
Data can be stored in a shared memory map using Value or Array.
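As a rough sketch of that idea (my own illustration, not the original class; the names worker, running and counter are made up): a multiprocessing.Value lives in shared memory, so a flag or counter changed in the child process is visible to the parent after join().
from multiprocessing import Process, Value
import time

def worker(running, counter):
    # Keep incrementing the shared counter; clear the shared flag at 15.
    while running.value:
        with counter.get_lock():
            counter.value += 1
            if counter.value >= 15:
                running.value = 0
        time.sleep(0.1)

if __name__ == '__main__':
    running = Value('i', 1)   # shared int used as a boolean flag
    counter = Value('i', 10)  # shared int, playing the role of args[0]
    p = Process(target=worker, args=(running, counter))
    p.start()
    p.join()
    print(counter.value, running.value)  # prints 15 0 -- the parent sees both changes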

Related

Shared dict of numpy arrays?

I want to store a dict with many numpy arrays and share it across processes.
import ctypes
import multiprocessing
from typing import Dict, Any

import numpy as np

dict_of_np: Dict[Any, np.ndarray] = multiprocessing.Manager().dict()

def get_numpy(key):
    if key not in dict_of_np:
        shared_array = multiprocessing.Array(ctypes.c_int32, 5)
        shared_np = np.frombuffer(shared_array.get_obj(), dtype=np.int32)
        dict_of_np[key] = shared_np
    return dict_of_np[key]

if __name__ == "__main__":
    a = get_numpy("5")
    a[1] = 5
    print(a)  # prints [0 5 0 0 0]
    b = get_numpy("5")
    print(b)  # prints [0 0 0 0 0]
I followed the instructions in Use numpy array in shared memory for multiprocessing to create the numpy arrays using a buffer, but when I try to save the resulting numpy array in a dict, it doesn't work. As you can see above, changes to a numpy array don't get saved when accessing the dict again using the key.
How can I share a dict of numpy arrays? I need both the dict and the arrays to be shared and use the same memory.
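As a side note (my own minimal sketch, not from the question): the writes vanish because a Manager dict is a proxy living in a separate manager process, so storing an array pickles it, and every lookup returns a fresh, independent copy rather than a view onto the shared buffer.
import multiprocessing
import numpy as np

if __name__ == "__main__":
    d = multiprocessing.Manager().dict()
    d["5"] = np.zeros(5, dtype=np.int32)  # pickled and stored in the manager process
    a = d["5"]                            # each lookup unpickles a brand-new copy
    a[1] = 5
    print(a)       # [0 5 0 0 0]
    print(d["5"])  # [0 0 0 0 0] -- the stored value never saw the write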
Based on our discussion from this question, I may have come up with a solution: by using a thread in the main process to handle the instantiation of multiprocessing.shared_memory.SharedMemory objects, you can ensure that a reference to the shared memory object sticks around and the underlying memory isn't deleted too early. This only solves the problem specific to Windows, where the file is deleted once no more references to it exist. It does not solve the problem of each open instance having to be held onto for as long as the underlying memoryview is needed.
This manager thread "listens" for messages on an input multiprocessing.Queue, and creates / returns data about shared memory objects. A lock is used to make sure the response is read by the correct process (otherwise responses may get mixed up).
All shared memory objects are first created by the main process and held onto until explicitly deleted, so that other processes may access them.
example:
import multiprocessing
from multiprocessing import shared_memory, Queue, Process, Lock
from threading import Thread
import numpy as np

class Exit_Flag: pass

class SHMController:
    def __init__(self):
        self._shm_objects = {}
        self.mq = Queue()    # message input queue
        self.rq = Queue()    # response output queue
        self.lock = Lock()   # only let one child talk to you at a time
        self._processing_thread = Thread(target=self.process_messages)

    def start(self):  # to be called after all child processes are started
        self._processing_thread.start()

    def stop(self):
        self.mq.put(Exit_Flag())

    def __enter__(self):
        self.start()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.stop()

    def process_messages(self):
        while True:
            message_obj = self.mq.get()
            if isinstance(message_obj, Exit_Flag):
                break
            elif isinstance(message_obj, str):
                message = message_obj
                response = self.handle_message(message)
                self.rq.put(response)
        self.mq.close()
        self.rq.close()

    def handle_message(self, message):
        method, arg = message.split(':', 1)
        if method == "exists":
            if arg in self._shm_objects:  # if shm.name exists or not
                return "ok:true"
            else:
                return "ok:false"
        if method == "size":
            if arg in self._shm_objects:
                return f"ok:{len(self._shm_objects[arg].buf)}"
            else:
                return "ko:-1"
        if method == "create":
            args = arg.split(",")  # name, size or just size
            if len(args) == 1:
                name = None
                size = int(args[0])
            elif len(args) == 2:
                name = args[0]
                size = int(args[1])
            if name in self._shm_objects:
                return f"ko:'{name}' already created"
            else:
                try:
                    shm = shared_memory.SharedMemory(name=name, create=True, size=size)
                except FileExistsError:
                    return f"ko:'{name}' already exists"
                self._shm_objects[shm.name] = shm
                return f"ok:{shm.name}"
        if method == "destroy":
            if arg in self._shm_objects:
                self._shm_objects[arg].close()
                self._shm_objects[arg].unlink()
                del self._shm_objects[arg]
                return f"ok:'{arg}' destroyed"
            else:
                return f"ko:'{arg}' does not exist"

def create(mq, rq, lock):
    # helper functions here could make access less verbose
    with lock:
        mq.put("create:key123,8")
        response = rq.get()
    print(response)
    if response[:2] == "ok":
        name = response.split(':')[1]
        with lock:
            mq.put(f"size:{name}")
            response = rq.get()
        print(response)
        if response[:2] == "ok":
            size = int(response.split(":")[1])
            shm = shared_memory.SharedMemory(name=name, create=False, size=size)
        else:
            print("Oh no....")
            return
    else:
        print("Uh oh....")
        return
    arr = np.ndarray((2,), buffer=shm.buf, dtype=np.int32)
    arr[:] = (1, 2)
    print(arr)
    shm.close()

def modify(mq, rq, lock):
    while True:  # until the shm exists
        with lock:
            mq.put("exists:key123")
            response = rq.get()
        if response == "ok:true":
            print("key:exists")
            break
    with lock:
        mq.put("size:key123")
        response = rq.get()
    print(response)
    if response[:2] == "ok":
        size = int(response.split(":")[1])
        shm = shared_memory.SharedMemory(name="key123", create=False, size=size)
    else:
        print("Oh no....")
        return
    arr = np.ndarray((2,), buffer=shm.buf, dtype=np.int32)
    arr[0] += 5
    print(arr)
    shm.close()

def delete(mq, rq, lock):
    pass  # TODO make a test for this?

if __name__ == "__main__":
    multiprocessing.set_start_method("spawn")  # because I'm mixing threads and processes
    with SHMController() as controller:
        mq, rq, lock = controller.mq, controller.rq, controller.lock

        create_task = Process(target=create, args=(mq, rq, lock))
        create_task.start()
        create_task.join()

        modify_task = Process(target=modify, args=(mq, rq, lock))
        modify_task.start()
        modify_task.join()
        print("finished")
To make sure each shm stays alive as long as the array that uses it, you must keep a reference to that specific shm object. Keeping a reference alongside the array is fairly straightforward: attach it as an attribute of a custom array subclass (copied from the numpy guide to subclassing):
class SHMArray(np.ndarray):
    # copied from https://numpy.org/doc/stable/user/basics.subclassing.html#slightly-more-realistic-example-attribute-added-to-existing-array
    def __new__(cls, input_array, shm=None):
        obj = np.asarray(input_array).view(cls)
        obj.shm = shm
        return obj

    def __array_finalize__(self, obj):
        if obj is None: return
        self.shm = getattr(obj, 'shm', None)

# example
shm = shared_memory.SharedMemory(name=name)
np_array = SHMArray(np.ndarray(shape, buffer=shm.buf, dtype=np.int32), shm)
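For example (a sketch under the same assumptions as the code above, reusing shared_memory, np, SHMArray and the "key123" block created by the controller; the attach helper is a name of my own choosing), an attach function can return an SHMArray so the SharedMemory handle lives exactly as long as the array that wraps it:
def attach(name, size):
    # Open an existing block and let the returned array keep the handle alive via .shm
    shm = shared_memory.SharedMemory(name=name, create=False, size=size)
    return SHMArray(np.ndarray((size // 4,), buffer=shm.buf, dtype=np.int32), shm)

arr = attach("key123", 8)  # assumes the controller already created "key123" as above
arr[0] += 5                # no separate shm variable to hold on to; arr.shm does it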

Use generator to iterate through data from a multiprocess

I would like to perform the following using multiprocessing instead of subprocess.Popen, because I cannot pass objects with Popen. I know my simple example below does not use/pass objects, but that is what I want to do.
Sample code is:
main.py
import subprocess

class ProcReader():
    def __init__(self, python_file):
        self.proc = subprocess.Popen(['python', python_file], stdout=subprocess.PIPE)

    def __iter__(self):
        return self

    def __next__(self):
        while True:
            line = self.proc.stdout.readline()
            if not line:
                raise StopIteration
            return line

if __name__ == "__main__":
    r1 = ProcReader("test1.py")
    r2 = ProcReader("test2.py")
    r3 = ProcReader("test3.py")

    for l1, l2, l3 in zip(r1, r2, r3):
        d1 = l1.decode('utf-8').strip().split(",")
        d2 = l2.decode('utf-8').strip().split(",")
        d3 = l3.decode('utf-8').strip().split(",")
        print(f"{d1[0]}:{d1[1]},{d2[0]}:{d2[1]},{d3[0]}:{d3[1]}")
test#.py
for x in range(10):
    print("test1,{}".format(x))
My sample code is in Python 3, but I would like an equivalent, using multiprocessing, in Python 2.7. Should the equivalent also read from stdout, or should it use a queue and just have a worker reading from the queue?
Update---------
My example using multiprocessing:
import time
from multiprocessing import Process, Queue

def writer1(queue):
    for x in range(10):
        time.sleep(1)
        queue.put("test1,{}".format(x))

def writer2(queue):
    for x in range(10):
        time.sleep(2)
        queue.put("test2,{}".format(x))

def writer3(queue):
    for x in range(10):
        queue.put("test3,{}".format(x))

if __name__ == '__main__':
    q1 = Queue()
    q2 = Queue()
    q3 = Queue()

    writer_1 = Process(target=writer1, args=((q1),))
    writer_1.daemon = True
    writer_1.start()

    writer_2 = Process(target=writer2, args=((q2),))
    writer_2.daemon = True
    writer_2.start()

    writer_3 = Process(target=writer3, args=((q3),))
    writer_3.daemon = True
    writer_3.start()

    while True:
        msg1 = q1.get()
        msg2 = q2.get()
        msg3 = q3.get()
        if msg1 and msg2 and msg3:
            d1 = msg1.strip().split(",")
            d2 = msg2.strip().split(",")
            d3 = msg3.strip().split(",")
            print("{}:{},{}:{},{}:{}".format(d1[0], d1[1],
                                             d2[0], d2[1],
                                             d3[0], d3[1]))
        else:
            break
I didn't realize q1.get() waits until something is there; I added sleep to verify this. Also, how do I check that the process is done writing? It seems to just wait at the end.
To adapt your second example for my comment about sentinel objects, maybe you're looking for something like
import os
import time
from multiprocessing import Process, Queue

def writer(queue):
    value = os.getpid()
    for x in range(10):
        time.sleep(0.1)
        queue.put("{},{}".format(value, x))
    queue.put(None)

def spawn_process():
    q = Queue()
    p = Process(target=writer, args=(q,))
    p.daemon = True
    p.start()
    return (p, q)

if __name__ == "__main__":
    processes_and_queues = [spawn_process() for x in range(3)]
    processes, queues = zip(*processes_and_queues)

    live_queues = list(queues)
    while live_queues:
        messages = []
        for queue in live_queues:
            message = queue.get()
            if message is None:
                live_queues.remove(queue)
            messages.append(message)
        if len(messages) == len(processes):
            print(messages)
It outputs (e.g.)
['51748,0', '51749,0', '51750,0']
['51748,1', '51749,1', '51750,1']
['51748,2', '51749,2', '51750,2']
['51748,3', '51749,3', '51750,3']
['51748,4', '51749,4', '51750,4']
['51748,5', '51749,5', '51750,5']
['51748,6', '51749,6', '51750,6']
['51748,7', '51749,7', '51750,7']
['51748,8', '51749,8', '51750,8']
['51748,9', '51749,9', '51750,9']
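Since the original code exposed an iterator, a small variant of the above (a sketch reusing spawn_process and the None sentinel; read_queue is just an illustrative name) is to wrap each queue in a generator and zip them, which keeps the ProcReader-style loop:
def read_queue(queue):
    # Yield messages until the writer's None sentinel arrives.
    while True:
        message = queue.get()
        if message is None:
            return
        yield message

if __name__ == "__main__":
    processes, queues = zip(*[spawn_process() for x in range(3)])
    for l1, l2, l3 in zip(*(read_queue(q) for q in queues)):
        print("{} {} {}".format(l1, l2, l3))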

Semaphore in Python

I'm trying to implement mutual exclusion using a semaphore in Python. The two processes (proc1, proc2) are supposed to be two independent, concurrent processes. They do exactly the same thing: store n in array[n], then increment n.
The purpose of the program is to show that by using a semaphore we can ensure the array is filled properly, as [0,1,2,3,4,5,6,7,8,9], without skipping any index. However, my code seems to produce [0,1,0,0,0,0,0,0,0,0]. I haven't used threads in Python before, so I don't know what's going on.
import threading
import time

n = 0
array = [0]*10
sem = threading.Semaphore()

def proc1():
    global n, array
    while True:
        sem.acquire()
        array[n] = n
        n += 1
        sem.release()
        time.sleep(0.25)

def proc2():
    global n, array
    while True:
        sem.acquire()
        array[n] = n
        n += 1
        sem.release()
        time.sleep(0.25)

t = threading.Thread(target=proc1)
t.start()
t2 = threading.Thread(target=proc2)
t2.start()
print(array)
The problem was that the OP tried to print the result before the threads were done; they should have waited for join().
import threading
import time

n = 0
array = [0]*10
sem = threading.Semaphore()

def proc(num):
    global n
    while True:
        sem.acquire()
        i = n        # claim the next index while still holding the semaphore
        n = n + 1
        sem.release()
        if i > 9:
            break
        array[i] = i
        print("Thread {}: {}".format(num, array))
        time.sleep(0.25)

t1 = threading.Thread(target=proc, args=[1])
t2 = threading.Thread(target=proc, args=[2])
t1.start()
t2.start()
t1.join()
t2.join()
A different take on a Semaphore pattern, handling the "tasks" within the Semaphore class itself:
import logging
from threading import Thread

logger = logging.getLogger(__name__)

class Task:
    # Minimal task holder assumed by the runner below (not part of the original snippet).
    _next_id = 0
    def __init__(self, func, args):
        self.func = func
        self.args = args
        Task._next_id += 1
        self.task_id = Task._next_id

class Semaphore:
    def __init__(self, max_threads):
        self.active_threads = 0
        self.max_threads = max_threads
        self.tasks = []

    def add_task(self, func, args):
        self.tasks.append(
            Task(
                func=func,
                args=args
            )
        )

    def run_task(self, task: Task):
        _func = task.func
        _args = task.args
        self.active_threads += 1
        _func(*_args)
        self.active_threads -= 1

    def run(self, blocking=False):
        if blocking:
            self._run()
        else:
            t = Thread(target=self._run)
            t.start()

    def _run(self):
        while True:
            if self.active_threads < self.max_threads:
                task = self.tasks.pop()
                logger.info(f'starting task: {task.task_id}')
                t = Thread(
                    target=self.run_task,
                    args=(task,))
                t.start()
                if len(self.tasks) == 0:
                    break
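A usage sketch for the runner above (the work function and its timings are made up for illustration):
import time

def work(name, seconds):
    print("{} starting".format(name))
    time.sleep(seconds)
    print("{} done".format(name))

sem = Semaphore(max_threads=2)       # at most two tasks run at once
for i in range(5):
    sem.add_task(work, ("task-{}".format(i), 1))
sem.run(blocking=True)               # or blocking=False to run in a background thread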

Multiprocessing Class in Subprocess

I want to use python's multiprocessing module in a class, which itself uses subprocesses to not block the main call.
The minimal example looks like this:
import multiprocessing as mp

class mpo():
    def __init__(self):
        cpu = mp.cpu_count()
        self.Pool = mp.Pool(processes=2)
        self.alive = True
        self.p = mp.Process(target=self.sub, args=())

    def worker():
        print 'Alive'

    def sub(self):
        print self.alive
        for i in range(2):
            print i
            self.Pool.apply_async(self.worker, args=())
        print 'done'
        self.Pool.close()
        # self.Pool.join()
I commented out the last line, as it raises an AssertionError (can only join a child process).
When I do:
m = mpo()
m.p.start()
The output is
True
0
1
done
My main question is: why is the print statement in the worker never reached?
Update:
The updated code looks like this.
import multiprocessing as mp

class mpo():
    def __init__(self):
        cpu = mp.cpu_count()
        self.alive = True
        self.p = mp.Process(target=self.sub, args=())
        self.result = []

    def worker(self):
        self.result.append(1)
        print 'Alive'

    def sub(self):
        print self.alive
        Pool = mp.Pool(processes=2)
        for i in range(2):
            print i
            Pool.apply_async(self.worker, args=())
        print 'done'
        Pool.close()
        Pool.join()
The pool now doesn't have to be inherited, as it is created in the subprocess. Instead of the print statement, the result is appended to the calling object, and the pool is properly joined. Nevertheless, no result shows up.
so I think this may correspond to a simple example of what you are looking for:
import multiprocessing as mp

def worker(arg):
    #print 'Alive'+str(arg)
    return "Alive and finished {0}".format(arg)

class mpo():
    def __init__(self):
        cpu = mp.cpu_count()
        self.alive = True
        self.pool = mp.Pool(processes=2)

    def sub(self, arguments):
        self.results = self.pool.map_async(worker, arguments)
        return self.results

if __name__ == "__main__":
    s = mpo()
    s.sub(range(10))
    print s.results.get()
Additionally you can call
self.results.ready()
to find out whether the processes have finished their work. You do not have to put this inside of another process because the map_async call does not block the rest of your program.
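For example (assuming the mpo class from the answer above), you can poll ready() instead of blocking on get():
import time

if __name__ == "__main__":
    s = mpo()
    r = s.sub(range(10))
    while not r.ready():     # keep doing other work while the pool is busy
        print("still working...")
        time.sleep(0.5)
    print(r.get())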
EDIT:
Concerning your comment, I do not really see the value of putting the calculation in a separate process, because the function is already running in separate processes (in the pool). You only add complexity by nesting it in another subprocess, but it is possible:
import multiprocessing as mp

def worker(arg):
    #print 'Alive'+str(arg)
    return "Alive and finished {0}".format(arg)

class mpo():
    def __init__(self):
        cpu = mp.cpu_count()
        self.alive = True
        self.pool = mp.Pool(processes=2)

    def sub(self, arguments):
        self.results = self.pool.map_async(worker, arguments)
        return self.results

def run_calculation(q):
    s = mpo()
    results = s.sub(range(10))
    q.put(results.get())

queue = mp.Queue()
proc = mp.Process(target=run_calculation, args=(queue,))
proc.start()
proc.join()
queue.get()

Python: multiprocessing and Array of c_char_p

I'm launching 3 processes and I want them to put a string into a shared array, at the index corresponding to the process (i).
Look at the code below, the output generated is:
['test 0', None, None]
['test 1', 'test 1', None]
['test 2', 'test 2', 'test 2']
Why does 'test 0' get overwritten by 'test 1', and 'test 1' by 'test 2'?
What I want is (order is not important):
['test 0', None, None]
['test 0', 'test 1', None]
['test 0', 'test 1', 'test 2']
The code:
#!/usr/bin/env python
import multiprocessing
from multiprocessing import Value, Lock, Process, Array
import ctypes
from ctypes import c_int, c_char_p

class Consumer(multiprocessing.Process):
    def __init__(self, task_queue, result_queue, arr, lock):
        multiprocessing.Process.__init__(self)
        self.task_queue = task_queue
        self.result_queue = result_queue
        self.arr = arr
        self.lock = lock

    def run(self):
        proc_name = self.name
        while True:
            next_task = self.task_queue.get()
            if next_task is None:
                self.task_queue.task_done()
                break
            answer = next_task(arr=self.arr, lock=self.lock)
            self.task_queue.task_done()
            self.result_queue.put(answer)
        return

class Task(object):
    def __init__(self, i):
        self.i = i

    def __call__(self, arr=None, lock=None):
        with lock:
            arr[self.i] = "test %d" % self.i
            print arr[:]

    def __str__(self):
        return 'ARC'

    def run(self):
        print 'IN'

if __name__ == '__main__':
    tasks = multiprocessing.JoinableQueue()
    results = multiprocessing.Queue()
    arr = Array(ctypes.c_char_p, 3)
    lock = multiprocessing.Lock()

    num_consumers = multiprocessing.cpu_count() * 2
    consumers = [Consumer(tasks, results, arr, lock) for i in xrange(num_consumers)]
    for w in consumers:
        w.start()

    for i in xrange(3):
        tasks.put(Task(i))

    for i in xrange(num_consumers):
        tasks.put(None)
I'm running Python 2.7.3 (Ubuntu)
This problem seems similar to this one. There, J.F. Sebastian speculated that the assignment to arr[i] points arr[i] to a memory address that was only meaningful to the subprocess making the assignment. The other subprocesses retrieve garbage when looking at that address.
There are at least two ways to avoid this problem. One is to use a multiprocessing Manager list:
import multiprocessing as mp

class Consumer(mp.Process):
    def __init__(self, task_queue, result_queue, lock, lst):
        mp.Process.__init__(self)
        self.task_queue = task_queue
        self.result_queue = result_queue
        self.lock = lock
        self.lst = lst

    def run(self):
        proc_name = self.name
        while True:
            next_task = self.task_queue.get()
            if next_task is None:
                self.task_queue.task_done()
                break
            answer = next_task(lock=self.lock, lst=self.lst)
            self.task_queue.task_done()
            self.result_queue.put(answer)
        return

class Task(object):
    def __init__(self, i):
        self.i = i

    def __call__(self, lock, lst):
        with lock:
            lst[self.i] = "test {}".format(self.i)
            print([lst[i] for i in range(3)])

if __name__ == '__main__':
    tasks = mp.JoinableQueue()
    results = mp.Queue()
    manager = mp.Manager()
    lst = manager.list(['']*3)
    lock = mp.Lock()

    num_consumers = mp.cpu_count() * 2
    consumers = [Consumer(tasks, results, lock, lst) for i in xrange(num_consumers)]
    for w in consumers:
        w.start()

    for i in xrange(3):
        tasks.put(Task(i))

    for i in xrange(num_consumers):
        tasks.put(None)

    tasks.join()
Another way is to use a shared array with a fixed size such as mp.Array('c', 10).
import multiprocessing as mp

class Consumer(mp.Process):
    def __init__(self, task_queue, result_queue, arr, lock):
        mp.Process.__init__(self)
        self.task_queue = task_queue
        self.result_queue = result_queue
        self.arr = arr
        self.lock = lock

    def run(self):
        proc_name = self.name
        while True:
            next_task = self.task_queue.get()
            if next_task is None:
                self.task_queue.task_done()
                break
            answer = next_task(arr=self.arr, lock=self.lock)
            self.task_queue.task_done()
            self.result_queue.put(answer)
        return

class Task(object):
    def __init__(self, i):
        self.i = i

    def __call__(self, arr, lock):
        with lock:
            arr[self.i].value = "test {}".format(self.i)
            print([a.value for a in arr])

if __name__ == '__main__':
    tasks = mp.JoinableQueue()
    results = mp.Queue()
    arr = [mp.Array('c', 10) for i in range(3)]
    lock = mp.Lock()

    num_consumers = mp.cpu_count() * 2
    consumers = [Consumer(tasks, results, arr, lock) for i in xrange(num_consumers)]
    for w in consumers:
        w.start()

    for i in xrange(3):
        tasks.put(Task(i))

    for i in xrange(num_consumers):
        tasks.put(None)

    tasks.join()
I speculate that the reason why this works when mp.Array(ctypes.c_char_p, 3) does not, is because mp.Array('c', 10) has a fixed size so the memory address never changes, while mp.Array(ctypes.c_char_p, 3) has a variable size, so the memory address might change when arr[i] is assigned to a bigger string.
Perhaps this is what the docs are warning about when it states,
Although it is possible to store a pointer in shared memory remember
that this will refer to a location in the address space of a specific
process. However, the pointer is quite likely to be invalid in the
context of a second process and trying to dereference the pointer from
the second process may cause a crash.
