Python threading - with limited threads, iterate over n number of Items - python

Here is an example read from IBM python threading tutorial. I was going through this URL (http://www.ibm.com/developerworks/aix/library/au-threadingpython/)
#!/usr/bin/env python
import Queue
import threading
import urllib2
import time
hosts = ["http://yahoo.com", "http://google.com", "http://amazon.com",
"http://ibm.com", "http://apple.com"]
queue = Queue.Queue()
class ThreadUrl(threading.Thread):
"""Threaded Url Grab"""
def __init__(self, queue):
threading.Thread.__init__(self)
self.queue = queue
def run(self):
while True:
#grabs host from queue
host = self.queue.get()
#grabs urls of hosts and prints first 1024 bytes of page
url = urllib2.urlopen(host)
print url.read(1024)
#signals to queue job is done
self.queue.task_done()
start = time.time()
def main():
#spawn a pool of threads, and pass them queue instance
for i in range(5):
t = ThreadUrl(queue)
t.setDaemon(True)
t.start()
#populate queue with data
for host in hosts:
queue.put(host)
#wait on the queue until everything has been processed
queue.join()
main()
print "Elapsed Time: %s" % (time.time() - start)
The example here works perfectly. I have been looking for a slightly different modification. Here there are known number of URL's , like for example 5. used range(5) in for loop to iterate over the URL's and process it.
What if, i want to use only '5' threads to process 1000 URL's? so when a thread completes, the completed URL should be removed from queue and new URL needs to be added to queue. But all these should happen by using the same thread.
I can check ,
if self.queue.task_done():
return host
This is the only way i can check if the URL is processed successfully or not. Once returned , i should remove URL from the queue. and add a new URL to queue. How to implement this using queue ?
Thanks,

That code will already do what you describe. If you put 1000 items into the queue instead of 5, they will be processed by those same 5 threads - each one will take an item from the queue, process it, then take a new one as long as there are items left in the queue.

Related

ZeroMQ: load balance many workers and one master

Suppose I have one master process that divides up data to be processed in parallel. Lets say there are 1000 chunks of data and 100 nodes on which to run the computations.
Is there some way to do REQ/REP to keep all the workers busy? I've tried to use the load balancer pattern in the guide but with a single client, sock.recv() is going to block until it receives its response from the worker.
Here is the code, slightly modified from the zmq guide for a load balancer. Is starts up one client, 10 workers, and a load balancer/broker in the middle. How can I get all those workers working at the same time???
from __future__ import print_function
from multiprocessing import Process
import zmq
import time
import uuid
import random
def client_task():
"""Basic request-reply client using REQ socket."""
socket = zmq.Context().socket(zmq.REQ)
socket.identity = str(uuid.uuid4())
socket.connect("ipc://frontend.ipc")
# Send request, get reply
for i in range(100):
print("SENDING: ", i)
socket.send('WORK')
msg = socket.recv()
print(msg)
def worker_task():
"""Worker task, using a REQ socket to do load-balancing."""
socket = zmq.Context().socket(zmq.REQ)
socket.identity = str(uuid.uuid4())
socket.connect("ipc://backend.ipc")
# Tell broker we're ready for work
socket.send(b"READY")
while True:
address, empty, request = socket.recv_multipart()
time.sleep(random.randint(1, 4))
socket.send_multipart([address, b"", b"OK : " + str(socket.identity)])
def broker():
context = zmq.Context()
frontend = context.socket(zmq.ROUTER)
frontend.bind("ipc://frontend.ipc")
backend = context.socket(zmq.ROUTER)
backend.bind("ipc://backend.ipc")
# Initialize main loop state
workers = []
poller = zmq.Poller()
# Only poll for requests from backend until workers are available
poller.register(backend, zmq.POLLIN)
while True:
sockets = dict(poller.poll())
if backend in sockets:
# Handle worker activity on the backend
request = backend.recv_multipart()
worker, empty, client = request[:3]
if not workers:
# Poll for clients now that a worker is available
poller.register(frontend, zmq.POLLIN)
workers.append(worker)
if client != b"READY" and len(request) > 3:
# If client reply, send rest back to frontend
empty, reply = request[3:]
frontend.send_multipart([client, b"", reply])
if frontend in sockets:
# Get next client request, route to last-used worker
client, empty, request = frontend.recv_multipart()
worker = workers.pop(0)
backend.send_multipart([worker, b"", client, b"", request])
if not workers:
# Don't poll clients if no workers are available
poller.unregister(frontend)
# Clean up
backend.close()
frontend.close()
context.term()
def main():
NUM_CLIENTS = 1
NUM_WORKERS = 10
# Start background tasks
def start(task, *args):
process = Process(target=task, args=args)
process.start()
start(broker)
for i in range(NUM_CLIENTS):
start(client_task)
for i in range(NUM_WORKERS):
start(worker_task)
# Process(target=broker).start()
if __name__ == "__main__":
main()
I guess there is different ways to do this :
-you can, for example, use the threading module to launch all your requests from your single client, with something like:
result_list = [] # Add the result to a list for the example
rlock = threading.RLock()
def client_thread(client_url, request, i):
context = zmq.Context.instance()
socket = context.socket(zmq.REQ)
socket.setsockopt_string(zmq.IDENTITY, '{}'.format(i))
socket.connect(client_url)
socket.send(request.encode())
reply = socket.recv()
with rlock:
result_list.append((i, reply))
return
def client_task():
# tasks = list with all your tasks
url_client = "ipc://frontend.ipc"
threads = []
for i in range(len(tasks)):
thread = threading.Thread(target=client_thread,
args=(url_client, tasks[i], i,))
thread.start()
threads.append(thread)
-you can take benefit of an evented library like asyncio (there is a submodule zmq.asyncio and an other library aiozmq, the last one offers a higher level of abstraction). In this case you will send your requests to the workers, sequentially too, but without blocking for each response (and so not keeping the main loop busy) and get the results when they came back to the main loop. This could look like this:
import asyncio
import zmq.asyncio
async def client_async(request, context, i, client_url):
"""Basic client sending a request (REQ) to a ROUTER (the broker)"""
socket = context.socket(zmq.REQ)
socket.setsockopt_string(zmq.IDENTITY, '{}'.format(i))
socket.connect(client_url)
await socket.send(request.encode())
reply = await socket.recv()
socket.close()
return reply
async def run(loop):
# tasks = list full of tasks
url_client = "ipc://frontend.ipc"
asyncio_tasks = []
ctx = zmq.asyncio.Context()
for i in range(len(tasks)):
task = asyncio.ensure_future(client_async(tasks[i], ctx, i, url_client))
asyncio_tasks.append(task)
responses = await asyncio.gather(*asyncio_tasks)
return responses
zmq.asyncio.install()
loop = asyncio.get_event_loop()
results = loop.run_until_complete(run(loop))
I didn't tested theses two snippets but they are both coming (with modifications to fit the question) from code i have using zmq in a similar configuration than your question.

Multiple threads needing to access a single resource

I'm currently working on a program where multiple threads need to access a single array list. The array functions as a "buffer". One or more threads write into this list and one or more other threads read and remove from this list. My first question is, are array's in Python thread safe? If not, what is a standard approach of dealing with situation?
Try using Threading.lock if there is only one resource.
You should use the queue lib.
here is a good article explaining about threading and queues.
import Queue
import threading
import urllib2
import time
from BeautifulSoup import BeautifulSoup
hosts = ["http://yahoo.com", "http://google.com", "http://amazon.com",
"http://ibm.com", "http://apple.com"]
queue = Queue.Queue()
out_queue = Queue.Queue()
class ThreadUrl(threading.Thread):
"""Threaded Url Grab"""
def __init__(self, queue, out_queue):
threading.Thread.__init__(self)
self.queue = queue
self.out_queue = out_queue
def run(self):
while True:
#grabs host from queue
host = self.queue.get()
#grabs urls of hosts and then grabs chunk of webpage
url = urllib2.urlopen(host)
chunk = url.read()
#place chunk into out queue
self.out_queue.put(chunk)
#signals to queue job is done
self.queue.task_done()
class DatamineThread(threading.Thread):
"""Threaded Url Grab"""
def __init__(self, out_queue):
threading.Thread.__init__(self)
self.out_queue = out_queue
def run(self):
while True:
#grabs host from queue
chunk = self.out_queue.get()
#parse the chunk
soup = BeautifulSoup(chunk)
print soup.findAll(['title'])
#signals to queue job is done
self.out_queue.task_done()
start = time.time()
def main():
#spawn a pool of threads, and pass them queue instance
for i in range(5):
t = ThreadUrl(queue, out_queue)
t.setDaemon(True)
t.start()
#populate queue with data
for host in hosts:
queue.put(host)
for i in range(5):
dt = DatamineThread(out_queue)
dt.setDaemon(True)
dt.start()
#wait on the queue until everything has been processed
queue.join()
out_queue.join()
main()
print "Elapsed Time: %s" % (time.time() - start)
You need Locks like ATOzTOA mentioned. You create them by
lock = threading.Lock()
and the threads acquire them if they enter a critical section. After finishing the section, they release the lock. The pythonic way to write this is
with lock:
do_something(buffer)

Threading in python using queue

I wanted to use threading in python to download lot of webpages and went through the following code which uses queues in one of the website.
it puts a infinite while loop. Does each of thread run continuously with out ending till all of them are complete? Am I missing something.
#!/usr/bin/env python
import Queue
import threading
import urllib2
import time
hosts = ["http://yahoo.com", "http://google.com", "http://amazon.com",
"http://ibm.com", "http://apple.com"]
queue = Queue.Queue()
class ThreadUrl(threading.Thread):
"""Threaded Url Grab"""
def __init__(self, queue):
threading.Thread.__init__(self)
self.queue = queue
def run(self):
while True:
#grabs host from queue
host = self.queue.get()
#grabs urls of hosts and prints first 1024 bytes of page
url = urllib2.urlopen(host)
print url.read(1024)
#signals to queue job is done
self.queue.task_done()
start = time.time()
def main():
#spawn a pool of threads, and pass them queue instance
for i in range(5):
t = ThreadUrl(queue)
t.setDaemon(True)
t.start()
#populate queue with data
for host in hosts:
queue.put(host)
#wait on the queue until everything has been processed
queue.join()
main()
print "Elapsed Time: %s" % (time.time() - start)
Setting the thread's to be daemon threads causes them to exit when the main is done. But, yes you are correct in that your threads will run continuously for as long as there is something in the queue else it will block.
The documentation explains this detail Queue docs
The python Threading documentation explains the daemon part as well.
The entire Python program exits when no alive non-daemon threads are left.
So, when the queue is emptied and the queue.join resumes when the interpreter exits the threads will then die.
EDIT: Correction on default behavior for Queue
Your script works fine for me, so I assume you are asking what is going on so you can understand it better. Yes, your subclass puts each thread in an infinite loop, waiting on something to be put in the queue. When something is found, it grabs it and does its thing. Then, the critical part, it notifies the queue that it's done with queue.task_done, and resumes waiting for another item in the queue.
While all this is going on with the worker threads, the main thread is waiting (join) until all the tasks in the queue are done, which will be when the threads have sent the queue.task_done flag the same number of times as messages in the queue . At that point the main thread finishes and exits. Since these are deamon threads, they close down too.
This is cool stuff, threads and queues. It's one of the really good parts of Python. You will hear all kinds of stuff about how threading in Python is screwed up with the GIL and such. But if you know where to use them (like in this case with network I/O), they will really speed things up for you. The general rule is if you are I/O bound, try and test threads; if you are cpu bound, threads are probably not a good idea, maybe try processes instead.
good luck,
Mike
I don't think Queue is necessary in this case. Using only Thread:
import threading, urllib2, time
hosts = ["http://yahoo.com", "http://google.com", "http://amazon.com",
"http://ibm.com", "http://apple.com"]
class ThreadUrl(threading.Thread):
"""Threaded Url Grab"""
def __init__(self, host):
threading.Thread.__init__(self)
self.host = host
def run(self):
#grabs urls of hosts and prints first 1024 bytes of page
url = urllib2.urlopen(self.host)
print url.read(1024)
start = time.time()
def main():
#spawn a pool of threads
for i in range(len(hosts)):
t = ThreadUrl(hosts[i])
t.start()
main()
print "Elapsed Time: %s" % (time.time() - start)

Understanding Threading in python: How to tell `run()` to return processed data?

I read up about threading in the IBM developer sources and found the following example.
In general I understand what happens here, except for one important thing. The work seems to be done in the run() function. In this example run() only prints a line and signals to the queue, that the job is done.
What if I had to return some processed data? I thought about caching it in a global variable, and to access this one later, but this seems not the right way to go.
Any advice?
Perhaps I should clearify: My intuition tells me to add return processed_data to run() right after self.queue.task_done(), but I can't figure out where to catch that return, since it is not obvious to me where run() is called.
#!/usr/bin/env python
import Queue
import threading
import urllib2
import time
hosts = ["http://yahoo.com", "http://google.com", "http://amazon.com",
"http://ibm.com", "http://apple.com"]
queue = Queue.Queue()
class ThreadUrl(threading.Thread):
"""Threaded Url Grab"""
def __init__(self, queue):
threading.Thread.__init__(self)
self.queue = queue
def run(self):
while True:
#grabs host from queue
host = self.queue.get()
#grabs urls of hosts and prints first 1024 bytes of page
url = urllib2.urlopen(host)
print url.read(1024)
#signals to queue job is done
self.queue.task_done()
start = time.time()
def main():
#spawn a pool of threads, and pass them queue instance
for i in range(5):
t = ThreadUrl(queue)
t.setDaemon(True)
t.start()
#populate queue with data
for host in hosts:
queue.put(host)
#wait on the queue until everything has been processed
queue.join()
main()
print "Elapsed Time: %s" % (time.time() - start)
You can't return a value from run, and in any case there is normally more than one item to process in each thread, so you don't want to return at all after processing one value (see the while loop in each thread).
I would either use another queue to return the results:
queue = Queue.Queue()
out_queue = Queue.Queue()
class ThreadUrl(threading.Thread):
...
def run(self):
while True:
#grabs host from queue
host = self.queue.get()
#grabs urls of hosts and saves first 1024 bytes of page
url = urllib2.urlopen(host)
out_queue.put(url.read(1024))
#signals to queue job is done
self.queue.task_done()
...
def main():
...
#populate queue with data
for host in hosts:
queue.put(host)
#don't have to wait until everything has been processed if we don't want to
for _ in range(len(hosts)):
first_1k = out_queue.get()
print first_1k
or store the result in the same queue:
class WorkItem(object):
def __init__(self, host):
self.host = host
class ThreadUrl(threading.Thread):
...
def run(self):
while True:
#grabs host from queue
work_item = self.queue.get()
host = work_item.host
#grabs urls of hosts and saves first 1024 bytes of page
url = urllib2.urlopen(host)
work_item.first_1k = url.read(1024)
#signals to queue job is done
self.queue.task_done()
...
def main():
...
#populate queue with data
work_items = [WorkItem(host) for host in hosts]
for item in work_items:
queue.put(item)
#wait on the queue until everything has been processed
queue.join()
for item in work_items:
print item.first_1k
the problem with using the queue method is : the order in which the threads may complete is random . Hence the queue item may not necessarily reflect the result of that specific position .
In this example , if google.com gets done before yahoo.com , then the queue has google data before yahoo data, so when retrieving it , the results are incorrect.

How does this while loop exit?

So, how does this code exit the while statement when the thread is started? (Please do not consider indentation)
class ThreadUrl(threading.Thread):
"""Threaded Url Grab"""
def __init__(self, queue, out_queue):
threading.Thread.__init__(self)
self.queue = queue
self.out_queue = out_queue
def run(self):
while True:
#grabs host from queue
host = self.queue.get()
#grabs urls of hosts and then grabs chunk of webpage
url = urllib2.urlopen(host)
chunk = url.read()
#place chunk into out queue
self.out_queue.put(chunk)
#signals to queue job is done
self.queue.task_done()
** EDIT *
The code that starts the thread:
def main():
#spawn a pool of threads, and pass them queue instance
for i in range(5):
t = ThreadUrl(queue)
t.setDaemon(True)
t.start()
queue.join()
It doesn't have to exit the while statement for the code to terminate. All that is happening here is that the thread has consumed everything in the queue at which point queue.join() returns.
As soon as the call to queue.join() in the main code returns the main code will exit and because you marked the thread as a daemon the entire application will exit and your background thread will be killed.
The quick answer: it doesn't, unless an exception is raised anywhere, which depends on the functions/methods called in run.
Of course, there is the possibility, that your thread is suspended/stopped from another thread, which effectively terminates your while loop.
Your code will only breaks if an exception occurs during the execution of the content of the while True loop.... not the better way to exit from a thread, but it could work.
If you want to exit properly from your thread, try to replace the while True with something like while self.continue_loop:
class ThreadUrl(threading.Thread):
"""Threaded Url Grab"""
def __init__(self, queue, out_queue):
threading.Thread.__init__(self)
self.queue = queue
self.out_queue = out_queue
self.continue_loop = True
def run(self):
while self.continue_loop:
#grabs host from queue
host = self.queue.get()
#grabs urls of hosts and then grabs chunk of webpage
url = urllib2.urlopen(host)
chunk = url.read()
#place chunk into out queue
self.out_queue.put(chunk)
#signals to queue job is done
self.queue.task_done()
And to start/stop the threads :
def main():
#spawn a pool of threads, and pass them queue instance
threads = []
for i in range(5):
t = ThreadUrl(queue, out_queue)
t.setDaemon(True)
t.start()
threads.append(t)
for t in threads:
t.continue_loop = False
t.join()
queue.join()
You could pass in block=False or timeout=5 to your self.queue.get() method. This will raise an Queue.Empty exception if no items remains in the queue. Otherwise AFAIK, the self.queue.get() will block the whole loop so even additional break attempts further on would not be reached.
def run(self):
while True:
#grabs host from queue
try:
host = self.queue.get(block=False)
except Queue.Empty, ex:
break
#grabs urls of hosts and then grabs chunk of webpage
url = urllib2.urlopen(host)
chunk = url.read()
#place chunk into out queue
self.out_queue.put(chunk)
#signals to queue job is done
self.queue.task_done()
Another approach would be to put a "Stop" flag in the queue after all your other items have been added. Then in the thread put a check for this stop flag and break if found.
Eg.
host = self.queue.get()
if host == 'STOP':
#Still need to signal that the task is done, else your queue join() will wait forever
self.queue.task_done()
break

Categories