In below code I expect print('q.count' , q.count) to be 2 as count is a varialble initialised once using q = QueueFun() and then incremented in the read_queue method, instead print('q.count' , q.count) prints 0. What is the correct method of sharing a counter between multiprocessesing Processes ?
Complete code:
from multiprocessing import Process, Queue, Pool, Lock
class QueueFun():
def __init__(self):
self.count = 0
self.lock = Lock()
def write_queue(self, work_tasks, max_size):
for i in range(0, max_size):
print("Writing to queue")
work_tasks.put(1)
def read_queue(self, work_tasks, max_size):
while self.count != max_size:
self.lock.acquire()
self.count += 1
self.lock.release()
print('self.count' , self.count)
print('')
print('Reading from queue')
work_tasks.get()
if __name__ == '__main__':
q = QueueFun()
max_size = 1
work_tasks = Queue()
write_processes = []
for i in range(0,2):
write_processes.append(Process(target=q.write_queue,
args=(work_tasks,max_size)))
for p in write_processes:
p.start()
read_processes = []
for i in range(0, 2):
read_processes.append(Process(target=q.read_queue,
args=(work_tasks,max_size)))
for p in read_processes:
p.start()
for p in read_processes:
p.join()
for p in write_processes:
p.join()
print('q.count' , q.count)
Unlike threads, different processes have different address
spaces: they do not share memory with each other. Writing
to a variable in one process will not change an (unshared)
variable in another process.
In the original example, the count was 0 at the end, because
the main process never changed it (no matter what the other
spawned processes did).
Probably better to communicate between processes with Queue.
If it's really necessary, Value or Array could be used:
17.2.1.5. Sharing state between processes
As mentioned above, when doing concurrent programming it is usually
best to avoid using shared state as far as possible. This is
particularly true when using multiple processes.
However, if you really do need to use some shared data then
multiprocessing provides a couple of ways of doing so.
Shared memory Data can be stored in a shared memory map using Value or
Array.
...
These shared objects will be process and thread-safe.
multiprocessing.Value
Operations like += which involve a read and write are not atomic.
A slightly modified version of the question's code:
from multiprocessing import Process, Queue, Value
class QueueFun():
def __init__(self):
self.readCount = Value('i', 0)
self.writeCount = Value('i', 0)
def write_queue(self, work_tasks, MAX_SIZE):
with self.writeCount.get_lock():
if self.writeCount != MAX_SIZE:
self.writeCount.value += 1
work_tasks.put(1)
def read_queue(self, work_tasks, MAX_SIZE):
with self.readCount.get_lock():
if self.readCount.value != MAX_SIZE:
self.readCount.value += 1
work_tasks.get()
if __name__ == '__main__':
q = QueueFun()
MAX_SIZE = 2
work_tasks = Queue()
write_processes = []
for i in range(MAX_SIZE):
write_processes.append(Process(target=q.write_queue,
args=(work_tasks,MAX_SIZE)))
for p in write_processes: p.start()
read_processes = []
for i in range(MAX_SIZE):
read_processes.append(Process(target=q.read_queue,
args=(work_tasks,MAX_SIZE)))
for p in read_processes: p.start()
for p in read_processes: p.join()
for p in write_processes: p.join()
print('q.writeCount.value' , q.writeCount.value)
print('q.readCount.value' , q.readCount.value)
Note: printing to standard output from multiple processes,
can result in output getting mixed up (not synchronized).
Related
I'm wondering if there can be a sort of deadlock in the following code. I have to read each element of a database (about 1 million items), process it, then collect the results in a unique file.
I've parallelized the execution with multiprocessing using two Queue's and three types of processes:
Reader: Main process which reads the database and adds the read items in a task_queue
Worker: Pool of processes. Each worker gets an item from task_queue, processes the item, saves the results in an intermediate file stored in item_name/item_name.txt and puts the item_name in a completed_queue
Writer: Process which gets an item_name from completed_queue, gets the intermediate result from item_name/item_name.txt and writes it in results.txt
from multiprocessing import Pool, Process, Queue
class Computation():
def __init__(self,K):
self.task_queue = Queue()
self.completed_queue = Queue()
self.n_cpus = K
def reader(self,):
with open(db, "r") as db:
... # Read an item
self.task_queue.put(item)
def worker(self,):
while True:
item = self.task_queue.get(True)
if item == "STOP":
break
self.process_item(item)
def writer_process(self,):
while True:
f = self.completed_queue.get(True)
if f == "DONE":
break
self.write_f(f)
def run(self,):
pool = Pool(n_cpus, self.worker, args=())
writer = Process(target=self.writer_process, args=())
writer.start()
self.reader()
pool.close()
pool.join()
self.completed_queue.put("DONE")
writer.join()
The code works, but it seems that sometimes the writer or the pool stops working (or they are very slow). Is a deadlock possible in this scenario?
There are a couple of issues with your code. First, by using the queues as you are, you are in effect creating your own process pool and have no need for using the multiprocessing.Pool class at all. You are using a pool initializer as an actual pool worker and it's a bit of a misuse of this class; you would be better off to just use regular Process instances (my opinion, anyway).
Second, although it is well and good that you are putting message DONE to the writer_process to signal it to terminate, you have not done similarly for the self.n_cpus worker processes, which are looking for 'STOP' messages, and therefore the reader function needs to put self.n_cpus STOP messages in the task queue:
from multiprocessing import Process, Queue
class Computation():
def __init__(self, K):
self.task_queue = Queue()
self.completed_queue = Queue()
self.n_cpus = K
def reader(self,):
with open(db, "r") as db:
... # Read an item
self.task_queue.put(item)
# signal to the worker processes to terminate:
for _ in range(self.n_cpus):
self.task_queue.put('STOP')
def worker(self,):
while True:
item = self.task_queue.get(True)
if item == "STOP":
break
self.process_item(item)
def writer_process(self,):
while True:
f = self.completed_queue.get(True)
if f == "DONE":
break
self.write_f(f)
def run(self):
processes = [Process(target=self.worker) for _ in range(self.n_cpus)]
for p in processes:
p.start()
writer = Process(target=self.writer_process, args=())
writer.start()
self.reader()
for p in processes:
p.join()
self.completed_queue.put("DONE")
writer.join()
Personally, instead of using 'STOP' and 'DONE' as the sentinel messages, I would use None instead, assuming that is not a valid actual message. I have tested the above code where reader just processed strings in a list and self.process_item(item) simply appended ' done' to the each of those strings and put the modified string on the completed_queue and replaced self.write_f in the writer_process with a print call. I did not see any problems with the code as is.
Update to use a Managed Queue
Disclaimer: I have had no experience using mpi4py and have no idea how the queue proxies would get distributed across different computers. The above code may not be sufficient as suggested by the following article, How to share mutliprocessing queue object between multiple computers. However, that code is creating instances of Queue.Queue (that code is Python 2 code) and not the proxies that are returned by the multiprocessing.SyncManager. The documentation on this is very poor. Try the above change to see if it works better (it will be slower).
Because the proxy returned by manager.Queue(), I have had to rearrange the code a bit; the queues are now being passed explicitly as arguments to the process functions:
from multiprocessing import Process, Manager
class Computation():
def __init__(self, K):
self.n_cpus = K
def reader(self, task_queue):
with open(db, "r") as db:
... # Read an item
# signal to the worker processes to terminate:
for _ in range(self.n_cpus):
task_queue.put('STOP')
def worker(self, task_queue, completed_queue):
while True:
item = task_queue.get(True)
if item == "STOP":
break
self.process_item(item)
def writer_process(self, completed_queue):
while True:
f = completed_queue.get(True)
if f == "DONE":
break
self.write_f(f)
def run(self):
with Manager() as manager:
task_queue = manager.Queue()
completed_queue = manager.Queue()
processes = [Process(target=self.worker, args=(task_queue, completed_queue)) for _ in range(self.n_cpus)]
for p in processes:
p.start()
writer = Process(target=self.writer_process, args=(completed_queue,))
writer.start()
self.reader(task_queue)
for p in processes:
p.join()
completed_queue.put("DONE")
writer.join()
I've read the documentation here, and seems that to make sure that the Value does not hang we need to use a lock. I did just that but it still gets stuck:
from multiprocessing import Process, Value, freeze_support, Lock
nb_threads = 3
nbloops = 10
v = Value('i', 0)
def run_process(lock):
global nbloops
i = 0
while i < nbloops:
# do stuff
i += 1
with lock:
v.value += 1
# wait for all the processes to finish doing something
while v.value % nb_threads != 0:
pass
if __name__ == '__main__':
freeze_support()
processes = []
lock = Lock()
for i in range(0, 3):
processes.append( Process( target=run_process, args=(lock,) ) )
for process in processes:
process.start()
for process in processes:
process.join()
I've tried accessing the value using lock but it still blocks:
val = -1
while val % nb_threads != 0:
with lock:
val = v.value
How can I fix this? Thanks
Your code has a race condition; you do not guarantee that all three processes break free from the while v.value % nb_threads != 0 loop before allowing them to move on. This allows one or two of the processes to move on to the next iteration of the while i < nbloops loop, increment v.value, and then prevent the remaining process/processes from ever breaking out of their own while v.value % nb_threads != 0 loop. The kind of synchronization you're trying to do there is best handled by a Barrier, rather than looping and repeatedly checking the value.
Also, multiprocessing.Value also has a built-in synchronization by default, and you can explicitly access the Lock it uses for that by calling Value.get_lock, so there is no need to explicitly a Lock of your own to each process. Putting together, you have:
from multiprocessing import Process, Value, freeze_support, Lock, Barrier
nb_threads = 3
nbloops = 10
v = Value('i', 0)
def run_process(barrier):
global nbloops
i = 0
while i < nbloops:
# do stuff
i += 1
with v.get_lock():
v.value += 1
# wait for all the processes to finish doing something
out = barrier.wait()
if __name__ == '__main__':
freeze_support()
processes = []
b = Barrier(nb_threads)
for i in range(0, nb_threads):
processes.append( Process( target=run_process, args=(b,) ) )
for process in processes:
process.start()
for process in processes:
process.join()
The Barrier guarantees that no process can move on to the next iteration of the loop until all of them have called Barrier.wait(), at which point all three are simultaneously able to progress. The Barrier object supports re-use, so it can safely be called on each iteration.
I'm looking for shorter ways to prepare my dataset for a machine-learning task. I found that the multiprocessing library might helpful. However, because I'm a newbie in multiprocessing, I couldn't find a proper way.
I first wrote some codes like below:
class DatasetReader:
def __init__(self):
self.data_list = Read_Data_from_file
self.data = []
def _ready_data(self, ex, idx):
# Some complex functions that takes several minutes
def _dataset_creator(self, queue):
for idx, ex in enumerate(self.data_list):
queue.put(self._ready_data(ex, idx))
def _dataset_consumer(self, queue):
total_mem = 0.0
t = tqdm(range(self.num_data), total=self.num_data, desc='Building Dataset ', bar_format='{desc}:{percentage:3.0f}% ({n_fmt}/{total_fmt}) [{elapsed}<{remaining},{rate_fmt}{postfix}]')
for idx in t:
ins = queue.get()
self.data.append(ins)
gc.collect()
def _build_dataset(self):
queue = Queue()
creator = Process(target=self._dataset_creator, args=(queue,))
consumer = Process(target=self._dataset_consumer, args=(queue,))
creator.start()
consumer.start()
queue.close()
queue.join_thread()
creator.join()
consumer.join()
However, in my opinion, because the _dataset_creator processes data (here _ready_data) in serial manner, this would not be helpful for reducing time consumption.
So, I modified the code to generate multiple processes that process one datum:
class DatasetReader:
def __init__(self):
self.data_list = Read_Data_from_file
self.data = []
def _ready_data(self, ex, idx):
# Some complex functions that takes several minutes
def _dataset_creator(self, ex, idx, queue):
queue.put(self._ready_data(ex, idx))
def _dataset_consumer(self, queue):
total_mem = 0.0
t = tqdm(range(self.num_data), total=self.num_data, desc='Building Dataset ', bar_format='{desc}:{percentage:3.0f}% ({n_fmt}/{total_fmt}) [{elapsed}<{remaining},{rate_fmt}{postfix}]')
for idx in t:
ins = queue.get()
self.data.append(ins)
gc.collect()
def _build_dataset(self):
queue = Queue()
for idx, ex in enumerate(self.data_list):
p = Process(target=self._dataset_creator, args=(ex, idx, queue,))
p.start()
consumer = Process(target=self._dataset_consumer, args=(queue,))
consumer.start()
queue.close()
queue.join_thread()
consumer.join()
However, this returns me errors:
Process Process-18:
Traceback ~~~
RuntimeError: can't start new thread
Traceback ~~~
OSError: [Errno 12] Cannot allocate memory
Could you help me to process complex data in a parallel way?
EDIT 1:
Thanks to #tdelaney, I can reduce the time consumption by generating self.num_worker processes (16 in my experiment):
def _dataset_creator(self, pid, queue):
for idx, ex in list(enumerate(self.data_list))[pid::self.num_worker]:
queue.put(self._ready_data(ex, idx))
def _dataset_consumer(self, queue):
t = tqdm(range(self.num_data), total=self.num_data, desc='Building Dataset ', bar_format='{desc}:{percentage:3.0f}% ({n_fmt}/{total_fmt}) [{elapsed}<{remaining},{rate_fmt}{postfix}]')
for _ in t:
ins = queue.get()
self.data[ins['idx']] = ins
def _build_dataset(self):
queue = Queue()
procs = []
for pid in range(self.num_worker):
p = Process(target=self._dataset_creator, args=(pid, queue,))
procs.append(p)
p.start()
consumer = Process(target=self._dataset_consumer, args=(queue,))
consumer.start()
queue.close()
queue.join_thread()
for p in procs:
p.join()
consumer.join()
I'm trying to sketch out what a solution with a multiprocessing pool would look like. I got rid of the consumer process completely because it looks like the parent process is just waiting anyway (and needs the data eventually) so it can be the consumer. So, I set up a pool and use imap_unordered to handle passing the data to the worker.
I guessed that the data processing doesn't really need the DatasetReader at all and moved it out to its own function. On Windows, either the entire DataReader object is serialized to the subprocess (including data you don't want) or the child version of the object is incomplete and may crash when you try to use it.
Either way, changes made to a DatasetReader object in the child processes aren't seen in the parent. This can be unexpected if the parent is dependent on updated state in that object. Its best to severely bracket what's happening in subprocesses, in my opinion.
from multiprocessing import Pool, get_start_method, cpu_count
# moved out of class (assuming it is not class dependent) so that
# the entire DatasetReader object isn't pickled and sent to
# the child on spawning systems like Microsoft Windows
def _ready_data(idx_ex):
idx, ex = idx_ex
# Some complex functions that take several minutes
result = complex_functions(ex)
return (idx, result)
class DatasetReader:
def __init__(self):
self.data_list = Read_Data_from_file
self.data = [None] * len(data_list)
def _ready_data_fork(self, idx):
# on forking system, call worker with object data
return _ready_data((idx, self.data_list[idx]))
def run(self):
t = tqdm(range(self.num_data), total=self.num_data, desc='Building Dataset ',
bar_format='{desc}:{percentage:3.0f}% ({n_fmt}/{total_fmt}) '
'[{elapsed}<{remaining},{rate_fmt}{postfix}]')
pool = Pool(min(cpu_count, len(self.data_list)))
if get_start_method() == 'fork':
# on forking system, self.data_list is in child process and
# we only pass the index
result_iter = pool.imap_unordered(self._ready_data_fork,
(idx for idx in range(len(data_list))),
chunksize=1)
else:
# on spawning system, we need to pass the data
result_iter = pool.imap_unordered(_ready_data,
enumerate(self.data_list,
chunksize=1)
for idx, result in result_iter:
next(t)
self.data[idx] = result
pool.join()
Suppose I have the following multiprocessing structure:
import multiprocessing as mp
def worker(working_queue, output_queue):
while True:
if working_queue.empty() == True:
break
else:
picked = working_queue.get()
res_item = "Number " + str(picked)
output_queue.put(res_item)
return
if __name__ == '__main__':
static_input = xrange(100)
working_q = mp.Queue()
output_q = mp.Queue()
results_bank = []
for i in static_input:
working_q.put(i)
processes = [mp.Process(target=worker,args=(working_q, output_q)) for i in range(2)]
for proc in processes:
proc.start()
for proc in processes:
proc.join()
results_bank = []
while True:
if output_q.empty() == True:
break
results_bank.append(output_q.get_nowait())
if len(results_bank) == len(static_input):
print "Good run"
else:
print "Bad run"
My question: How would I 'batch' write my results to a single file while the working_queue is still 'working' (or at least, not finished)?
Note: My actual data structure is not sensitive to unordered results relative to inputs (despite my example using integers).
Also, I think that batch/set writing from the output queue is best practice rather than from the growing results bank object. However, I am open to solutions relying on either approach. I am new to multiprocessing so unsure of best practice or most efficient solution(s) to this question.
If you wish to use mp.Processes and mp.Queues, here is a way to process the results in batches. The main idea is in the writer function, below:
import itertools as IT
import multiprocessing as mp
SENTINEL = None
static_len = 100
def worker(working_queue, output_queue):
for picked in iter(working_queue.get, SENTINEL):
res_item = "Number {:2d}".format(picked)
output_queue.put(res_item)
def writer(output_queue, threshold=10):
result_length = 0
items = iter(output_queue.get, SENTINEL)
for batch in iter(lambda: list(IT.islice(items, threshold)), []):
print('\n'.join(batch))
result_length += len(batch)
state = 'Good run' if result_length == static_len else 'Bad run'
print(state)
if __name__ == '__main__':
num_workers = 2
static_input = range(static_len)
working_q = mp.Queue()
output_q = mp.Queue()
writer_proc = mp.Process(target=writer, args=(output_q,))
writer_proc.start()
for i in static_input:
working_q.put(i)
processes = [mp.Process(target=worker, args=(working_q, output_q))
for i in range(num_workers)]
for proc in processes:
proc.start()
# Put SENTINELs in the Queue to tell the workers to exit their for-loop
working_q.put(SENTINEL)
for proc in processes:
proc.join()
output_q.put(SENTINEL)
writer_proc.join()
When passed two arguments, iter expects a callable and a sentinel:
iter(callable, sentinel). The callable (i.e. a function) gets called repeatedly until it returns a value equal to the sentinel. So
items = iter(output_queue.get, SENTINEL)
defines items to be an iterable which, when iterated over, will return items from output_queue
until output_queue.get() returns SENTINEL.
The for-loop:
for batch in iter(lambda: list(IT.islice(items, threshold)), []):
calls the lambda function repeatedly until an empty list is returned. When called, the lambda function returns a list of up to threshold number of items from the iterable items. Thus, this is an idiom for "grouping by n items without padding". See this post for more on this idiom.
Note that it is not a good practice to test working_q.empty(). It could lead to a race condition. For example, suppose we have the 2 worker processes on these lines when the working_q has only 1 item left in it:
def worker(working_queue, output_queue):
while True:
if working_queue.empty() == True: <-- Process-1
break
else:
picked = working_queue.get() <-- Process-2
res_item = "Number " + str(picked)
output_queue.put(res_item)
return
Suppose Process-1 calls working_queue.empty() while there is still one item in the queue. So it returns False. Then Process-2 calls working_queue.get() and obtains the last item. Then Process-1 gets to line picked = working_queue.get() and hangs because there are no more items in the queue.
Therefore, use sentinels (as shown above) to concretely signal when a for-loop
or while-loop should stop instead of checking queue.empty().
There is no operation like "batch q.get". But it is a good practice to put/pop a batch of items instead of items one by one.
Which is exactly what multiprocessing.Pool.map is doing with its parameter chunksize :)
For writing output as soon as possible there is Pool.imap_unordered which returns an iterable instead of list.
def work(item):
return "Number " + str(item)
import multiprocessing
static_input = range(100)
chunksize = 10
with multiprocessing.Pool() as pool:
for out in pool.imap_unordered(work, static_input, chunksize):
print(out)
So I've got this code for Producers and Consumers;
import threading
import time
import random
N = 8
buffer = N * [None]
free = threading.Semaphore(N)
items = threading.Semaphore(0)
def prod():
n = 0
i = 0
while True:
time.sleep(random.random())
free.acquire()
buffer[i] = n
i = (i + 1) % N
n += 1
items.release()
def cons():
i = 0
while True:
time.sleep(random.random())
items.acquire()
print(buffer[i])
i = (i + 1) % N
free.release()
def main():
p = threading.Thread(target=prod, args=[])
c = threading.Thread(target=cons, args=[])
p.start()
c.start()
p.join()
c.join()
main()
But I want to be able to have three threads each for the producer and consumer. Can someone suggest a way I could do this using a third semaphore? Thanks.
Assuming this is not a homework about semaphores and you want a real solution, you should use the Queue object, which can handle all of this by itself. If I understood it correctly, you want three producers and three consumers that share one buffer that can have at maximum 8 items. If that's the case, the code can be simplified to something like this:
import threading
import Queue
def prod(queue):
n = 0
while True:
time.sleep(random.random())
queue.put(n)
n += 1
def cons(queue):
while True:
time.sleep(random.random())
n = queue.get()
print n
def main():
N = 8
queue = Queue.Queue(N)
threads = []
for i in range(3):
threads.append(threading.Thread(target=cons, args=[queue])))
threads.append(threading.Thread(target=prod, args=[queue])))
for thread in threads:
thread.start()
for thread in threads:
thread.join() # this will never really finish, because the threads run forever
If you are interested how is the queue implemented internally, you can see the source code here.