I want to implement a file crawler that stores data to a Mongo. I would like to use multiprocessing as a way to 'hand off' blocking tasks such as unzipping files, file crawling and uploading to Mongo. There are certain tasks that are reliant on other tasks (i.e., a file needs to be unzipped before files can be crawled), so I would like the ability to complete the necessary task and add new ones to the same task queue.
Below is what I currently have:
import multiprocessing
class Worker(multiprocessing.Process):
def __init__(self, task_queue: multiprocessing.Queue):
super(Worker, self).__init__()
self.task_queue = task_queue
def run(self):
for (function, *args) in iter(self.task_queue.get, None):
print(f'Running: {function.__name__}({*args,})')
# Run the provided function with its parameters in child process
function(*args)
self.task_queue.task_done()
def foo(task_queue: multiprocessing.Queue) -> None:
print('foo')
# Add new task to queue from this child process
task_queue.put((bar, 1))
def bar(x: int) -> None:
print(f'bar: {x}')
def main():
# Start workers on separate processes
workers = []
manager = multiprocessing.Manager()
task_queue = manager.Queue()
for i in range(multiprocessing.cpu_count()):
worker = Worker(task_queue)
workers.append(worker)
worker.start()
# Run foo on child process using the queue as parameter
task_queue.put((foo, task_queue))
for _ in workers:
task_queue.put(None)
# Block until workers complete and join main process
for worker in workers:
worker.join()
print('Program completed.')
if __name__ == '__main__':
main()
Expected Behaviour:
Running: foo((<AutoProxy[Queue] object, typeid 'Queue' at 0x1b963548908>,))
foo
Running: bar((1,))
bar: 1
Program completed.
Actual Behaviour:
Running: foo((<AutoProxy[Queue] object, typeid 'Queue' at 0x1b963548908>,))
foo
Program completed.
I am quite new to multiprocessing so any help would be greatly appreciated.
As #FrankYellin noted, this is due to the fact that None is being put into task_queue before bar can be added.
Assuming that the queue will either be non-empty or waiting for a task to complete
during the program (which is true in my case), the join method on the queue can be used. According to the docs:
Blocks until all items in the queue have been gotten and processed.
The count of unfinished tasks goes up whenever an item is added to the
queue. The count goes down whenever a consumer thread calls
task_done() to indicate that the item was retrieved and all work on it
is complete. When the count of unfinished tasks drops to zero, join()
unblocks.
Below is the updated code:
import multiprocessing
class Worker(multiprocessing.Process):
def __init__(self, task_queue: multiprocessing.Queue):
super(Worker, self).__init__()
self.task_queue = task_queue
def run(self):
for (function, *args) in iter(self.task_queue.get, None):
print(f'Running: {function.__name__}({*args,})')
# Run the provided function with its parameters in child process
function(*args)
self.task_queue.task_done() # <-- Notify queue that task is complete
def foo(task_queue: multiprocessing.Queue) -> None:
print('foo')
# Add new task to queue from this child process
task_queue.put((bar, 1))
def bar(x: int) -> None:
print(f'bar: {x}')
def main():
# Start workers on separate processes
workers = []
manager = multiprocessing.Manager()
task_queue = manager.Queue()
for i in range(multiprocessing.cpu_count()):
worker = Worker(task_queue)
workers.append(worker)
worker.start()
# Run foo on child process using the queue as parameter
task_queue.put((foo, task_queue))
# Block until all items in queue are popped and completed
task_queue.join() # <---
for _ in workers:
task_queue.put(None)
# Block until workers complete and join main process
for worker in workers:
worker.join()
print('Program completed.')
if __name__ == '__main__':
main()
This seems to work fine. I will update this if I discover anything new. Thank you all.
Related
I want to run multiple threads in parallel. Each thread picks up a task from a task queue and executes that task.
from threading import Thread
from Queue import Queue
import time
class link(object):
def __init__(self, i):
self.name = str(i)
def run_jobs_in_parallel(consumer_func, jobs, results, thread_count,
async_run=False):
def consume_from_queue(jobs, results):
while not jobs.empty():
job = jobs.get()
try:
results.append(consumer_func(job))
except Exception as e:
print str(e)
results.append(False)
finally:
jobs.task_done()
#start worker threads
if jobs.qsize() < thread_count:
thread_count = jobs.qsize()
for tc in range(1,thread_count+1):
worker = Thread(
target=consume_from_queue,
name="worker_{0}".format(str(tc)),
args=(jobs,results,))
worker.start()
if not async_run:
jobs.join()
def create_link(link):
print str(link.name)
time.sleep(10)
return True
def consumer_func(link):
return create_link(link)
# create_link takes a while to execute
jobs = Queue()
results = list()
for i in range(0,10):
jobs.put(link(i))
run_jobs_in_parallel(consumer_func, jobs, results, 25, async_run=False)
Now what is happening is, let say we have 10 link objects in jobs queue, while the threads are running in parallel, multiple threads are executing same task. How can I prevent this from happening?
Note - the above sample code does not have the problem describe above, but i have exactly same code except create_link method does some complex stuff.
I think what you need is a lock object (docs,tutorial+examples). If you create an instance of such an object you can 'lock' some parts of your code, ensuring that only one thread executes this part at a time.
I guess in your case you want to lock the line job = jobs.get().
First you have to create the lock in a scope where all threads have access to it. (You don't want a lock for every thread but a single lock for all your threads. That means creating the lock within your thread just before acquiring it won't work)
import threading
lock = threading.Lock()
then you can use it on your line like:
lock.acquire()
job = jobs.get()
lock.release()
or
with lock:
job = jobs.get()
The first thread to reach acquire() will lock the lock. other threads that try to acquire() the lock will pause until the lock gets unlocked again by calling release().
I am trying to learn multiprocessing with queue.
What I want to do is figure out when/how to "add more items to the queue" when the script is in motion.
The below script is the baseline I am working from:
import multiprocessing
class MyFancyClass:
def __init__(self, name):
self.name = name
def do_something(self):
proc_name = multiprocessing.current_process().name
print('Doing something fancy in {} for {}!'.format(
proc_name, self.name))
def worker(q):
obj = q.get()
obj.do_something()
if __name__ == '__main__':
queue = multiprocessing.Queue()
p = multiprocessing.Process(target=worker, args=(queue,))
p.start()
queue.put(MyFancyClass('Fancy Dan'))
queue.put(MyFancyClass('Frankie'))
print(queue.qsize())
# Wait for the worker to finish
queue.close()
queue.join_thread()
p.join()
on line 26, the Fancy Dan inject works, but the Frankie piece doesn't. I am able to confirm that Frankie does make it into the queue. I need a spot where I can "Check for more items" and insert them into the queue as needed. If no more items exist, then close the queue when the existing items are clear.
How do I do this?
Thanks!
Let's make it clear:
the target function worker(q) will be called just once in the above scheme. At that first call the function will suspend waiting the result from blocking operation q.get(). It gets the instance MyFancyClass('Fancy Dan') from the queue, invokes its do_something method and get finished.
MyFancyClass('Frankie') will be put into the queue but won't go to the Process cause the process' target function is done.
one of the ways is to read from the queue and wait for a signal/marked item which signals that queue usage is stopped. Let's say None value.
import multiprocessing
class MyFancyClass:
def __init__(self, name):
self.name = name
def do_something(self):
proc_name = multiprocessing.current_process().name
print('Doing something fancy in {} for {}!'.format(proc_name, self.name))
def worker(q):
while True:
obj = q.get()
if obj is None:
break
obj.do_something()
if __name__ == '__main__':
queue = multiprocessing.Queue()
p = multiprocessing.Process(target=worker, args=(queue,))
p.start()
queue.put(MyFancyClass('Fancy Dan'))
queue.put(MyFancyClass('Frankie'))
# print(queue.qsize())
queue.put(None)
# Wait for the worker to finish
queue.close()
queue.join_thread()
p.join()
The output:
Doing something fancy in Process-1 for Fancy Dan!
Doing something fancy in Process-1 for Frankie!
One way you could do this is by changing worker to
def worker(q):
while not q.empty():
obj = q.get()
obj.do_something()
The problem with your original code is that worker returns after doing work on one item on the queue. You need some sort of looping logic.
This solution is imperfect because empty() is not reliable. Also will fail if the queue becomes empty before adding more items to it (the process will just return).
I would suggest using a Process Pool Executor.
Submit is pretty close to what you're looking for.
I want to call a multiprocessing.pool.map inside a process.
When initialized inside the run() function, it works. When initialized at instantiation, it does not.
I cannot figure the reason for this behavior ? What happens in the process ?
I am on python 3.6
from multiprocessing import Pool, Process, Queue
def DummyPrinter(key):
print(key)
class Consumer(Process):
def __init__(self, task_queue):
Process.__init__(self)
self.task_queue = task_queue
self.p = Pool(1)
def run(self):
p = Pool(8)
while True:
next_task = self.task_queue.get()
if next_task is None:
break
p.map(DummyPrinter, next_task) # Works
#self.p.map(DummyPrinter, next_task) # Does not Work
return
if __name__ == '__main__':
task_queue = Queue()
Consumer(task_queue).start()
task_queue.put(range(5))
task_queue.put(None)
multiprocessing.Pool cannot be shared by multiple processes because it relies on pipes and threads for its functioning.
The __init__ method gets executed in the parent process whereas the run logic belongs to the child process.
I usually recommend against sub-classing the Process object as it's quite counter intuitive.
A logic like the following would better show the actual division of responsibilities.
def function(task_queue):
"""This runs in the child process."""
p = Pool(8)
while True:
next_task = self.task_queue.get()
if next_task is None:
break
p.map(DummyPrinter, next_task) # Works
def main():
"""This runs in the parent process."""
task_queue = Queue()
process = Process(target=function, args=[task_queue])
process.start()
I'm making remote API calls using threads, using no join so that the program could make the next API call without waiting for the last to complete.
Like so:
def run_single_thread_no_join(function, args):
thread = Thread(target=function, args=(args,))
thread.start()
return
The problem was I needed to know when all API calls were completed. So I moved to code that's using a cue & join.
Threads seem to run in serial now.
I can't seem to figure out how to get the join to work so that threads execute in parallel.
What am I doing wrong?
def run_que_block(methods_list, num_worker_threads=10):
'''
Runs methods on threads. Stores method returns in a list. Then outputs that list
after all methods in the list have been completed.
:param methods_list: example ((method name, args), (method_2, args), (method_3, args)
:param num_worker_threads: The number of threads to use in the block.
:return: The full list of returns from each method.
'''
method_returns = []
# log = StandardLogger(logger_name='run_que_block')
# lock to serialize console output
lock = threading.Lock()
def _output(item):
# Make sure the whole print completes or threads can mix up output in one line.
with lock:
if item:
print(item)
msg = threading.current_thread().name, item
# log.log_debug(msg)
return
# The worker thread pulls an item from the queue and processes it
def _worker():
while True:
item = q.get()
if item is None:
break
method_returns.append(item)
_output(item)
q.task_done()
# Create the queue and thread pool.
q = Queue()
threads = []
# starts worker threads.
for i in range(num_worker_threads):
t = threading.Thread(target=_worker)
t.daemon = True # thread dies when main thread (only non-daemon thread) exits.
t.start()
threads.append(t)
for method in methods_list:
q.put(method[0](*method[1]))
# block until all tasks are done
q.join()
# stop workers
for i in range(num_worker_threads):
q.put(None)
for t in threads:
t.join()
return method_returns
You're doing all the work in the main thread:
for method in methods_list:
q.put(method[0](*method[1]))
Assuming each entry in methods_list is a callable and a sequence of arguments for it, you did all the work in the main thread, then put the result from each function call in the queue, which doesn't allow any parallelization aside from printing (which is generally not a big enough cost to justify thread/queue overhead).
Presumably, you want the threads to do the work for each function, so change that loop to:
for method in methods_list:
q.put(method) # Don't call it, queue it to be called in worker
and change the _worker function so it calls the function that does the work in the thread:
def _worker():
while True:
method, args = q.get() # Extract and unpack callable and arguments
item = method(*args) # Call callable with provided args and store result
if item is None:
break
method_returns.append(item)
_output(item)
q.task_done()
The following code takes an initial string ('a', 'b', or 'c'), and the two thread types pass it back and forth, appending 'W' and 'H' to it repeatedly, marking that the Worker thread or the Http thread last handled the string.
The code is a simple test to try and eventually accomplish the following. The http thread pool will pull web pages, and the worker thread will add info to a db, and then give the http thread more urls to pull. They just go back and forth. I want both thread pools and queues to stay alive unless BOTH are empty simultaneously. (there are cases where one pool will temporarily run out of things to do, and I don't want it to join because it's companion thread pool will probably be adding more work to it's queue soon.)
In the following code, the http thread pool runs out of things to do almost immediately, and then joins. But you'll notice that the threads keep functioning.
Why does it do this
And how do I make it so neither queues can join until BOTH are simultaneously empty?
from queue import Queue
import threading
import time
class http(threading.Thread):
def __init__(self, queue, out_queue):
threading.Thread.__init__(self)
self.queue = queue
self.out_queue = out_queue
def run(self):
while True:
row = self.queue.get()
print(row)
self.out_queue.put(row+'H')
self.queue.task_done()
class worker(threading.Thread):
def __init__(self, queue, out_queue):
threading.Thread.__init__(self)
self.queue = queue
self.out_queue = out_queue
def run(self):
while True:
time.sleep(1)
row = self.out_queue.get()
self.queue.put(row+'W')
self.out_queue.task_done()
URL_THREAD_COUNT = 3
rows = [chr(x) for x in range(97, 100)]
def main():
queue = Queue()
out_queue = Queue()
#spawn a pool of threads, and pass them queue instance
for i in range(URL_THREAD_COUNT):
t = http(queue, out_queue)
t.daemon = True
t.start()
#populate queue with data
for row in rows:
queue.put(row)
#spawn worker thread
dt = worker(queue, out_queue)
dt.daemon = True
dt.start()
#time.sleep(5)
# wait for queues
queue.join()
print('EXIT http')
out_queue.join()
print('EXIT worker')
start = time.time()
main()
print("Elapsed Time: %s" % (time.time() - start))
"joining" a queue waits until the queue is empty. If worker finishes processing some out_queue messages before the other threads can add more messages, the outer out_queue.join thinks you are done. You may want to add a control message that tells the threads when their work is done so that they can exit, and call thread.join() for them all instead. That will mean keeping a list of threads created in the for loop instead of just abandoning them.