why is queue empty for the file to be downloaded - python

Below is the code that I have that downloads various URLS into each separate thread, I was in attempt to make some changes before I implement the thread pool but with this change the queue is coming to be empty and download is not beginning.
import Queue
import urllib2
import os
import utils as _fdUtils
import signal
import sys
import time
import threading
class ThreadedFetch(threading.Thread):
""" docstring for ThreadedFetch
"""
def __init__(self, queue, out_queue):
super(ThreadedFetch, self).__init__()
self.queueItems = queue.get()
self.__url = self.queueItems[0]
self.__saveTo = self.queueItems[1]
self.outQueue = out_queue
def run(self):
fileName = self.__url.split('/')[-1]
path = os.path.join(DESKTOP_PATH, fileName)
file_size = int(_fdUtils.getUrlSizeInBytes(self.__url))
while not STOP_REQUEST.isSet():
urlFh = urllib2.urlopen(self.__url)
_log.info("Download: %s" , fileName)
with open(path, 'wb') as fh:
file_size_dl = 0
block_sz = 8192
while True:
buffer = urlFh.read(block_sz)
if not buffer:
break
file_size_dl += len(buffer)
fh.write(buffer)
status = r"%10d [%3.2f%%]" % (file_size_dl, file_size_dl * 100. / file_size)
status = status + chr(8)*(len(status)+1)
sys.stdout.write('%s\r' % status)
time.sleep(.05)
sys.stdout.flush()
if file_size_dl == file_size:
_log.info("Download Completed %s%% for file %s, saved to %s",
file_size_dl * 100. / file_size, fileName, DESKTOP_PATH)
below is the main function that does the call and initiation.
def main(appName):
args = _fdUtils.getParser()
urls_saveTo = {}
# spawn a pool of threads, and pass them queue instance
# each url will be downloaded concurrently
for i in range(len(args.urls)):
t = ThreadedFetch(queue, out_queue)
t.daemon = True
t.start()
try:
for url in args.urls:
urls_saveTo[url] = args.saveTo
# urls_saveTo = {urls[0]: args.saveTo, urls[1]: args.saveTo, urls[2]: args.saveTo}
# populate queue with data
for item, value in urls_saveTo.iteritems():
queue.put([item, value])
# wait on the queue until everything has been processed
queue.join()
print '*** Done'
except (KeyboardInterrupt, SystemExit):
lgr.critical('! Received keyboard interrupt, quitting threads.')

You create the queue and then the first thread which immediately tries to fetch an item from the still empty queue. The ThreadedFetch.__init__() method isn't run asynchronously, just the run() method when you call start() on a thread object.
Store the queue in the __init__() and move the get() into the run() method. That way you can create all the threads and they are blocking in their own thread, giving you the chance to put items into the queue in the main thread.
class ThreadedFetch(threading.Thread):
def __init__(self, queue, out_queue):
super(ThreadedFetch, self).__init__()
self.queue = queue
self.outQueue = out_queue
def run(self):
url, save_to = self.queue.get()
# ...
For this example the queue is unnecessary by the way as every thread gets exactly one item from the queue. You could pass that item directly to the thread when creating the thread object:
class ThreadedFetch(threading.Thread):
def __init__(self, url, save_to, out_queue):
super(ThreadedFetch, self).__init__()
self.url = url
self.save_to = save_to
self.outQueue = out_queue
def run(self):
# ...
And when the ThreadedFetch class really just consists of the __init__() and run() method you may consider moving the run() method into a function and start that asynchronously.
def fetch(url, save_to, out_queue):
# ...
# ...
def main():
# ...
thread = Thread(target=fetch, args=(url, save_to, out_queue))
thread.daemon = True
thread.start()

Related

How To Mark An Item From A Queue List [Python]

I have a list of strings in Queue
queue = ["First","Second","Third","Fourth","",etc]
Snippet Of The Code :
import thread
from thread import Threading
import Queue
import time
class MainThread(threading.Thread):
def __init__(self,queue):
threading.Thread.__init__(self)
self.queue = queue
def run(self):
Details = self.queue.get()
Trail = Details
#..........#
# Now Mark If Trail Can Be Use Again Or Not
self.queue.put(Trail) # Now Put Them Back No Matter What
self.queue.task_done()
queue = Queue.Queue(maxsize=0)
while not #...........#
for i in range(TotalThreads):
try:
t = ThreadingPower(queue)
t.setDaemon(False)
t.start()
except:
time.sleep(5)
My question : if the string "First" from queue can't be used again,how can i mark it for a specific time,so the thread can continue grabbing other and ignore it until the specific time is over
Edit :
Currently the solution i have is
queue = [["First",1],["Second",1],["Third",1],["Fourth",1],etc]
class MainThread(threading.Thread):
def __init__(self,queue):
threading.Thread.__init__(self)
self.queue = queue
def run(self):
Details = self.queue.get()
Trail,State = Details
if State == 1:
#................#
if #.............#:
State = 0
self.queue.put((Trail,State)) # Now Put Them Back No Matter What
self.queue.task_done()
queue = Queue.Queue(maxsize=0)
while not #...........#
for i in range(TotalThreads):
try:
t = ThreadingPower(queue)
t.setDaemon(False)
t.start()
except:
time.sleep(5)
However this only works if i want to disable them permanently,i need a solution where i can disable them for a specific time

How To Abort Threads that Pull Items from a Queue Using Ctrl+C In Python

I've implemented some threaded application using python. During runtime i want to catch the CTRL+C sigcall and exit the program. To do that I've registered a function called exit_gracefully which also takes care of stopping the threads in a more controlled way. However, it does not seem to work. It seems the handler is never called
Here's the example I'm working with:
import Queue
import threading
import signal
import sys
import time
queue = Queue.Queue()
workers = list()
def callback(id, item):
print("{}: {}".format(id, item))
time.sleep(1)
def exit_gracefully(signum, frame):
print("Ctrl+C was pressed. Shutting threads down ...")
print("Stopping workers ...")
for worker in workers:
worker.stop()
sys.exit(1)
class ThreadedTask(threading.Thread):
def __init__(self, id, queue, callbacks):
threading.Thread.__init__(self)
self._stop_event = threading.Event()
self.id = str(id)
self.queue = queue
self.callbacks = callbacks
self._stopped = False
def run(self):
while not self.stopped():
item = self.queue.get()
for callback in self.callbacks:
callback(self.id, item)
self.queue.task_done()
def stop(self):
self._stop_event.set()
self._stopped = True
def stopped(self):
return self._stop_event.is_set() or self._stopped
def main(input_file, thread_count, callbacks):
print("Initializing queue ...")
queue = Queue.Queue()
print("Parsing '{}' ...".format(input_file))
with open(input_file) as f:
for line in f:
queue.put(line.replace("\n", ""))
print("Initializing {} threads ...".format(thread_count))
for id in range(thread_count):
worker = ThreadedTask(id, queue, callbacks)
worker.setDaemon(True)
workers.append(worker)
print("Starting {} threads ...".format(thread_count))
for worker in workers:
worker.start()
queue.join()
if __name__ == '__main__':
signal.signal(signal.SIGINT, exit_gracefully)
print("Starting main ...")
input_file = "list.txt"
thread_count = 10
callbacks = [
callback
]
main(input_file, thread_count, callbacks)
If you want to try the example above you may generate some test-data first:
seq 1 10000 > list.txt
Any help is appreciated!
Here's a solution that seems to work.
One issue is that Queue.get() will ignore SIGINT unless a timeout is set. That's documented here: https://bugs.python.org/issue1360.
Another issue is that Queue.join() also seems to ignore SIGINT. I worked around that by polling the queue in a loop to see if it's empty.
These issues appear to have been fixed in Python 3.
I also added a shared event that's used in the SIGINT handler to tell all the threads to shut down.
import Queue
import signal
import sys
import threading
import time
def callback(id, item):
print '{}: {}'.format(id, item)
time.sleep(1)
class ThreadedTask(threading.Thread):
def __init__(self, id, queue, run_event, callbacks):
super(ThreadedTask, self).__init__()
self.id = id
self.queue = queue
self.run_event = run_event
self.callbacks = callbacks
def run(self):
queue = self.queue
while not self.run_event.is_set():
try:
item = queue.get(timeout=0.1)
except Queue.Empty:
pass
else:
for callback in self.callbacks:
callback(self.id, item)
queue.task_done()
def main():
queue = Queue.Queue()
run_event = threading.Event()
workers = []
def stop():
run_event.set()
for worker in workers:
# Allow worker threads to shut down completely
worker.join()
def sigint_handler(signum, frame):
print '\nShutting down...'
stop()
sys.exit(0)
signal.signal(signal.SIGINT, sigint_handler)
callbacks = [callback]
for id in range(1, 11):
worker = ThreadedTask(id, queue, run_event, callbacks)
workers.append(worker)
for worker in workers:
worker.start()
with open('list.txt') as fp:
for line in fp:
line = line.strip()
queue.put(line)
while not queue.empty():
time.sleep(0.1)
# Update: Added this to gracefully shut down threads after all
# items are consumed from the queue.
stop()
if __name__ == '__main__':
main()

How to inherit a thread class with infinite loop

Okay, suppose I've got a working class that inherits Thread:
from threading import Thread
import time
class DoStuffClass(Thread):
def __init__(self, queue):
self. queue = queue
self.isstart = False
def startthread(self, isstart):
self.isstart = isstart
if isstart:
Thread.__init__(self)
else:
print 'Thread not started!'
def run(self):
while self.isstart:
time.sleep(1)
if self.queue.full():
y = self.queue.get() #y goes nowhere, it's just to free up the queue
self.queue.put('stream data')
I've tried calling it in another file and it's working successfully:
from Queue import Queue
import dostuff
q = Queue(maxsize=1)
letsdostuff= dostuff.DoStuffClass()
letsdostuff.startthread(True)
letsdostuff.start()
val = ''
i=0
while (True):
val = q.get()
print "Outputting: %s" % val
Right now, I can get the value of the class output thru the queue.
My question: Suppose I want to create another class (ProcessStuff) that inherits the DoStuffClass so that I can grab the output of DoStuffClass through a queue object (or any other method), process it, and pass it to ProcessStuff's queue so that codes calling the ProcessStuff can get its value through queuing. How do I do that?
It sound like you don't really want ProcessStuff to inherit from DoStuffClass, instead you want ProcessStuff to consume from the DoStuffClass queue internally. So rather than use inheritance, just have ProcessStuff keep a reference to a DoStuffClass instance internally, along with an internal Queue object to get the values that DoStuffClass produces:
class ProcessStuff(Thread):
def __init__(self, queue):
super(ProcessStuff, self).__init__()
self.queue = queue
self._do_queue = Queue() # internal Queue for DoStuffClass
self._do_stuff = dostuff.DoStuffClass(self._do_queue)
def run(self):
self._do_stuff.startthread(True)
self._do_stuff.start()
while True:
val = self._do_queue.get() # Grab value from DoStuffClass
# process it
processed_val = "processed {}".format(val)
self.queue.put(processed_val)
q = Queue(maxsize=1)
letsprocessstuff = ProcessStuff(q)
letsprocessstuff.start()
while (True):
val = q.get()
print "Outputting: %s" % val
Output:
Outputting: processed stream data
Outputting: processed stream data
Outputting: processed stream data
Outputting: processed stream data

Limiting Threads within Python Threading, Queue

Im using the following code to multithread urlib2. However what is the best way to limit the number of threads that it consumes ??
class ApiMultiThreadHelper:
def __init__(self,api_calls):
self.q = Queue.Queue()
self.api_datastore = {}
self.api_calls = api_calls
self.userpass = '#####'
def query_api(self,q,api_query):
self.q.put(self.issue_request(api_query))
def issue_request(self,api_query):
self.api_datastore.update({api_query:{}})
for lookup in ["call1","call2"]:
query = api_query+lookup
request = urllib2.Request(query)
request.add_header("Authorization", "Basic %s" % self.userpass)
f = urllib2.urlopen(request)
response = f.read()
f.close()
self.api_datastore[api_query].update({lookup:response})
return True
def go(self):
threads = []
for i in self.api_calls:
t = threading.Thread(target=self.query_api, args = (self.q,i))
t.start()
threads.append(t)
for t in threads:
t.join()
You should use a thread pool. Here's my implementation I've made years ago (Python 3.x friendly):
import traceback
from threading import Thread
try:
import queue as Queue # Python3.x
except ImportError:
import Queue
class ThreadPool(object):
def __init__(self, no=10):
self.alive = True
self.tasks = Queue.Queue()
self.threads = []
for _ in range(no):
t = Thread(target=self.worker)
t.start()
self.threads.append(t)
def worker(self):
while self.alive:
try:
fn, args, kwargs = self.tasks.get(timeout=0.5)
except Queue.Empty:
continue
except ValueError:
self.tasks.task_done()
continue
try:
fn(*args, **kwargs)
except Exception:
# might wanna add some better error handling
traceback.print_exc()
self.tasks.task_done()
def add_job(self, fn, args=[], kwargs={}):
self.tasks.put((fn, args, kwargs))
def join(self):
self.tasks.join()
def deactivate(self):
self.alive = False
for t in self.threads:
t.join()
You can also find a similar class in multiprocessing.pool module (don't ask me why it is there). You can then refactor your code like this:
def go(self):
tp = ThreadPool(20) # <-- 20 thread workers
for i in self.api_calls:
tp.add_job(self.query_api, args=(self.q, i))
tp.join()
tp.deactivate()
Number of threads is now defined a priori.

python threadpool problem (wait for something)

I wrote simple web site crowler with threadpool. The problem is: then crawler is get all over site it must finish, but in real it wait for something in the end,and script dont finished, why this happend?
from Queue import Queue
from threading import Thread
import sys
from urllib import urlopen
from BeautifulSoup import BeautifulSoup, SoupStrainer
import re
from Queue import Queue, Empty
from threading import Thread
visited = set()
queue = Queue()
class Worker(Thread):
"""Thread executing tasks from a given tasks queue"""
def __init__(self, tasks):
Thread.__init__(self)
self.tasks = tasks
self.daemon = True
self.start()
def run(self):
while True:
func, args, kargs = self.tasks.get()
print "startcall in thread",self
print args
try: func(*args, **kargs)
except Exception, e: print e
print "stopcall in thread",self
self.tasks.task_done()
class ThreadPool:
"""Pool of threads consuming tasks from a queue"""
def __init__(self, num_threads):
self.tasks = Queue(num_threads)
for _ in range(num_threads): Worker(self.tasks)
def add_task(self, func, *args, **kargs):
"""Add a task to the queue"""
self.tasks.put((func, args, kargs))
def wait_completion(self):
"""Wait for completion of all the tasks in the queue"""
self.tasks.join()
def process(pool,host,url):
try:
print "get url",url
#content = urlopen(url).read().decode(charset)
content = urlopen(url).read()
except UnicodeDecodeError:
return
for link in BeautifulSoup(content, parseOnlyThese=SoupStrainer('a')):
#print "link",link
try:
href = link['href']
except KeyError:
continue
if not href.startswith('http://'):
href = 'http://%s%s' % (host, href)
if not href.startswith('http://%s%s' % (host, '/')):
continue
if href not in visited:
visited.add(href)
pool.add_task(process,pool,host,href)
print href
def start(host,charset):
pool = ThreadPool(7)
pool.add_task(process,pool,host,'http://%s/' % (host))
pool.wait_completion()
start('simplesite.com','utf8')
The problem I see is that you never quit the while in run. So, it will block forever. You need to break that loop when the jobs are done.
You could try to :
1) insert
if not func: break
after task.get(...) in run.
2) append
pool.add_task(None, None, None)
at the end of process.
This is a way for process to notify the pool that he has no more task to process.

Categories