How to kill finished threads in Python

My multi-threading script raises this error:
thread.error: can't start new thread
when it reaches 460 threads:
threading.active_count() = 460
I assume the old threads keep stacking up, since the script doesn't kill them. This is my code:
import threading
import Queue
import time
import os
import csv

def main(worker):
    # Do work
    print worker
    return

def threader():
    while True:
        worker = q.get()
        main(worker)
        q.task_done()

def main_threader(workers):
    global q
    global city
    q = Queue.Queue()
    for x in range(20):
        t = threading.Thread(target=threader)
        t.daemon = True
        print "\n\nthreading.active_count() = " + str(threading.active_count()) + "\n\n"
        t.start()
    for worker in workers:
        q.put(worker)
    q.join()
How do I kill the old threads when their job is done? (Is return not enough?)

Your threader function never exits, so your threads never die. Since you're just processing one fixed set of work and never adding items after you start working, you could set the threads up to exit when the queue is empty.
See the following altered version of your code and the comments I added:
def threader(q):
    # let the thread die when all work is done
    while not q.empty():
        worker = q.get()
        main(worker)
        q.task_done()

def main_threader(workers):
    # you don't want global variables
    # global q
    # global city
    q = Queue.Queue()
    # make sure you fill the queue *before* starting the worker threads
    for worker in workers:
        q.put(worker)
    for x in range(20):
        t = threading.Thread(target=threader, args=[q])
        t.daemon = True
        print "\n\nthreading.active_count() = " + str(threading.active_count()) + "\n\n"
        t.start()
    q.join()
Notice that I removed global q and instead pass q to the thread function. You don't want threads created by a previous call to end up sharing a q with new threads (edit: although q.join() prevents this anyway, it's still better to avoid globals).
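Because the queue here is filled before any worker starts, the q.empty() check is safe; if work could still be added while workers run, that check would be racy. A common alternative in that situation (a sketch of mine, not part of the answer above) is to feed each thread a sentinel value, the same pattern shown in the fixed-thread-count question further down this page:

def threader(q):
    while True:
        worker = q.get()
        if worker is None:  # sentinel: no more work, let this thread die
            q.task_done()
            return
        main(worker)
        q.task_done()

# after queueing the real work, queue one sentinel per thread:
for x in range(20):
    q.put(None)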

Related

threads not running parallel in python script

I am new to Python and threading. I am trying to run multiple threads at a time. Here is my basic code:
import threading
import time

threads = []
print "hello"

class myThread(threading.Thread):
    def __init__(self, i):
        threading.Thread.__init__(self)
        print "i = ", i
        for j in range(0, i):
            print "j = ", j
            time.sleep(5)

for i in range(1, 4):
    thread = myThread(i)
    thread.start()
While one thread is waiting on time.sleep(5), I want another thread to start. In short, all the threads should run in parallel.
You might have some misunderstanding of how to subclass threading.Thread. First of all, the __init__() method is what represents a constructor in Python: it is executed every time you create an instance. So in your case, when thread = myThread(i) executes, all the work happens in the constructor, and the call blocks until __init__() finishes.
Then you should move your activity into run(), so that when start() is called, the thread will start to run. For example:
import threading
import time

threads = []
print "hello"

class myThread(threading.Thread):
    def __init__(self, i):
        threading.Thread.__init__(self)
        self.i = i

    def run(self):
        print "i = ", self.i
        for j in range(0, self.i):
            print "j = ", j
            time.sleep(5)

for i in range(1, 4):
    thread = myThread(i)
    thread.start()
P.S. Because of the GIL in CPython, you might not be able to take full advantage of all your processors if the task is CPU-bound.
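For a CPU-bound task, a process pool sidesteps the GIL; here is a minimal sketch using the stdlib multiprocessing module (my addition, not part of the original answer; doWork is a placeholder for the real work):

import multiprocessing

def doWork(i):
    # placeholder for a real CPU-bound task
    return sum(x * x for x in range(i * 1000000))

if __name__ == '__main__':
    # each worker is a separate process with its own interpreter and GIL
    pool = multiprocessing.Pool(processes=4)
    print pool.map(doWork, range(1, 4))
    pool.close()
    pool.join()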
Here is an example of how you could use threading based on your code:
import threading
import time

threads = []
print "hello"

def doWork(i):
    print "i = ", i
    for j in range(0, i):
        print "j = ", j
        time.sleep(5)

for i in range(1, 4):
    thread = threading.Thread(target=doWork, args=(i,))
    threads.append(thread)
    thread.start()

# you need to wait for the threads to finish
for thread in threads:
    thread.join()
print "Finished"
import threading
import subprocess

def obj_func(simid):
    workingdir = './' + str(simid)  # the working directory for the simulation
    cmd = './run_delwaq.sh'  # cmd is a bash command that launches the external executable
    subprocess.Popen(cmd, cwd=workingdir).wait()

def example_subprocess_files():
    num_threads = 4
    jobs = []
    # Launch the threads and give them access to the objective function
    for i in range(num_threads):
        workertask = threading.Thread(target=obj_func(i))
        jobs.append(workertask)
    for j in jobs:
        j.start()
    for j in jobs:
        j.join()
    print('All the work finished!')

if __name__ == '__main__':
    example_subprocess_files()
This doesn't work in my case, where the task is not printing but a CPU-intensive one. The threads are executed serially.
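The serial execution here most likely comes from target=obj_func(i): that expression calls obj_func(i) immediately in the main thread, one domain after another, and passes its return value (None) to the Thread as the target. Pass the function object and its arguments separately instead, as in the earlier example:

# inside example_subprocess_files(): let the Thread make the call
workertask = threading.Thread(target=obj_func, args=(i,))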

Can I assume my threads are done when threading.active_count() returns 1?

Given the following class:
from abc import ABCMeta, abstractmethod
from time import sleep
import threading
from threading import active_count, Thread

class ScraperPool(metaclass=ABCMeta):
    Queue = []
    ResultList = []

    def __init__(self, Queue, MaxNumWorkers=0, ItemsPerWorker=50):
        # Initialize attributes
        self.MaxNumWorkers = MaxNumWorkers
        self.ItemsPerWorker = ItemsPerWorker
        self.Queue = Queue  # For testing purposes.

    def initWorkerPool(self, PrintIDs=True):
        for w in range(self.NumWorkers()):
            Thread(target=self.worker, args=(w + 1, PrintIDs,)).start()
            sleep(1)  # Explicitly wait one second for this worker to start.

    def run(self):
        self.initWorkerPool()
        # Wait until all workers (i.e. threads) are done.
        while active_count() > 1:
            print("Active threads: " + str(active_count()))
            sleep(5)
        self.HandleResults()

    def worker(self, id, printID):
        if printID:
            print("Starting worker " + str(id) + ".")
        while len(self.Queue) > 0:
            self.scraperMethod()
        if printID:
            print("Worker " + str(id) + " is quitting.")
        # Todo: kill this thread.
        return

    def NumWorkers(self):
        return 1  # Simplified for testing purposes.

    @abstractmethod
    def scraperMethod(self):
        pass

class TestScraper(ScraperPool):
    def scraperMethod(self):
        # print("I am scraping.")
        # print("Scraping. Threads#: " + str(active_count()))
        temp_item = self.Queue[-1]
        self.Queue.pop()
        self.ResultList.append(temp_item)

    def HandleResults(self):
        print(self.ResultList)

ScraperPool.register(TestScraper)

scraper = TestScraper(Queue=["Jaap", "Piet"])
scraper.run()
print(threading.active_count())
# print(scraper.ResultList)
When all the threads are done, there's still one active thread: threading.active_count() on the last line gives me that number.
The active thread is <_MainThread(MainThread, started 12960)>, as printed with threading.enumerate().
Can I assume that all my threads are done when active_count() == 1?
Or can, for instance, imported modules start additional threads, so that my threads are actually done even when active_count() > 1? That same condition drives the loop I'm using in the run method.
You can assume that your threads are done when active_count() reaches 1. The problem is, if any other module creates a thread, you'll never get to 1. You should manage your threads explicitly.
Example: You can put the threads in a list and join them one at a time. The relevant changes to your code are:
def __init__(self, Queue, MaxNumWorkers=0, ItemsPerWorker=50):
    # Initialize attributes
    self.MaxNumWorkers = MaxNumWorkers
    self.ItemsPerWorker = ItemsPerWorker
    self.Queue = Queue  # For testing purposes.
    self.WorkerThreads = []

def initWorkerPool(self, PrintIDs=True):
    for w in range(self.NumWorkers()):
        thread = Thread(target=self.worker, args=(w + 1, PrintIDs,))
        self.WorkerThreads.append(thread)
        thread.start()
        sleep(1)  # Explicitly wait one second for this worker to start.

def run(self):
    self.initWorkerPool()
    # Wait until all workers (i.e. threads) are done. Joining in order,
    # so some threads further down the list may finish first, but we
    # will get to all of them eventually.
    while self.WorkerThreads:
        self.WorkerThreads.pop(0).join()
    self.HandleResults()
According to the docs, active_count() includes the main thread, so if you're at 1 you're most likely done; but if anything else in your program is a source of new threads, you may be done before active_count() hits 1.
I would recommend implementing an explicit join method on your ScraperPool: keep track of your workers and explicitly join them to the main thread when needed, instead of checking whether you're done with active_count() calls.
Also, keep the GIL in mind...
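Such an explicit join method could look like this (a minimal sketch, assuming the WorkerThreads list from the answer above):

def join(self, timeout=None):
    # Block until every worker thread this pool started has finished.
    for thread in self.WorkerThreads:
        thread.join(timeout)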

Python Freeing Up Threads

I am using Thread from the threading module for the first time, and the threads don't seem to free themselves up after their function runs. I am attempting to have a max of 5 threads running at once. Since one thread creates the next there will be some overlap, but I'm seeing 2000+ threads running at once before I get the exception "can't start new thread".
from threading import Thread
import string

URLS = ['LONG LIST OF URLS HERE']
currentThread = 0
LASTTHREAD = len(URLS) - 1
MAXTHREADS = 5
threads = [None] * (LASTTHREAD + 1)

def getURL(threadName, currentThread):
    print('Thread Name = ' + threadName)
    print('URL = ' + str(URLS[currentThread]))
    if currentThread < LASTTHREAD:
        currentThread = currentThread + 1
        thisThread = currentThread
        try:
            threads[thisThread] = Thread(target=getURL, args=('thread' + str(thisThread), currentThread,))
            threads[thisThread].start()
            threads[thisThread].join()
        except Exception, e:
            print "Error: unable to start thread"
            print str(e)

for i in range(0, MAXTHREADS):
    currentThread = currentThread + 1
    try:
        threads[i] = Thread(target=getURL, args=('thread' + str(i), currentThread,))
        threads[i].start()
        threads[i].join()
    except Exception, e:
        print "Error: unable to start thread"
        print str(e)
I'm open to any other cleanup I can do here as well, since I'm pretty new to Python and entirely new to threading. I'm just trying to get the threading set up properly at this point. Eventually this will scrape the URLs.
I'd suggest looking into a thread pool, and having the threads take tasks from a suitable shared data structure (e.g. a queue) rather than starting new threads all the time.
Depending on what it is you actually want to do, if you're using CPython (if you don't know what I mean by CPython, you are using it) you might not get any performance improvement from using threads, due to the global interpreter lock. So you might be better off looking into the multiprocessing module.
from Queue import Queue
from threading import Thread

def worker():
    while True:
        item = q.get()
        do_work(item)
        q.task_done()

def do_work(url):
    print "Processing URL:" + url

q = Queue()
for i in range(5):
    t = Thread(target=worker)
    t.daemon = True
    t.start()

for item in ['url_' + str(i) for i in range(2000)]:
    q.put(item)

q.join()  # block until all tasks are done
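On Python 3 (or Python 2 with the futures backport), concurrent.futures gives you the same fixed-size thread pool with less bookkeeping; a minimal sketch (my addition, written in Python 3 syntax):

from concurrent.futures import ThreadPoolExecutor

def do_work(url):
    print("Processing URL: " + url)

# the with-block joins all worker threads on exit, so no explicit
# daemon flags or queue management are needed
with ThreadPoolExecutor(max_workers=5) as executor:
    executor.map(do_work, ['url_' + str(i) for i in range(2000)])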

How to handle multiple jobs in a queue with fixed number of threads in Python

In the program below I have put 5 jobs into the queue, but have created only 3 threads. When I run the program, only 3 jobs are completed. How am I supposed to complete all 5 jobs with only 3 threads? Is there a way to make a thread that has completed its job take the next job?
import time
import Queue
import threading

class worker(threading.Thread):
    def __init__(self, qu):
        threading.Thread.__init__(self)
        self.que = qu

    def run(self):
        print "Going to sleep.."
        time.sleep(self.que.get())
        print "Slept .."
        self.que.task_done()

q = Queue.Queue()
for j in range(3):
    work = worker(q)
    work.setDaemon(True)
    work.start()

for i in range(5):
    q.put(1)
q.join()
print "done!!"
You need to have your worker threads run in a loop. You can use a sentinel value (like None or a custom class) to tell the workers to shut down after you've put all your actual work items in the queue:
import time
import Queue
import threading

class worker(threading.Thread):
    def __init__(self, qu):
        threading.Thread.__init__(self)
        self.que = qu

    def run(self):
        # This calls self.que.get() until None is returned,
        # at which point the loop breaks.
        for item in iter(self.que.get, None):
            print "Going to sleep.."
            time.sleep(item)
            print "Slept .."
            self.que.task_done()
        self.que.task_done()  # mark the None sentinel itself as done

q = Queue.Queue()
for j in range(3):
    work = worker(q)
    work.setDaemon(True)
    work.start()

for i in range(5):
    q.put(1)
for i in range(3):  # Shut down all the workers
    q.put(None)
q.join()
print "done!!"
Another option would be to use a multiprocessing.dummy.Pool, which is a thread pool that Python manages for you:
import time
from multiprocessing.dummy import Pool

def run(i):
    print "Going to sleep..."
    time.sleep(i)
    print "Slept .."

p = Pool(3)  # 3 threads in the pool
p.map(run, range(5))  # calls run(i) for each element i in range(5)
p.close()
p.join()
print "done!!"

How to close Threads in Python?

I have an issue with too many unfinished threads. I think the queue's .join() just closes the queue, not the threads that use it.
In my script I need to check 280k domains: for each domain, get the list of its MX records and obtain the IPv6 addresses of those servers, if they have any.
I used threads, and thanks to them the script is many times faster. But there is a problem: although the queue gets join()ed, the number of alive threads keeps growing until an error occurs saying that no new thread can be created (an OS limitation?).
How can I terminate/close/stop/reset the threads after each for-loop iteration, when I retrieve a new domain from the database?
Thread Class definition...
class MX_getAAAA_thread(threading.Thread):
    def __init__(self, queue, id_domain):
        threading.Thread.__init__(self)
        self.queue = queue
        self.id_domain = id_domain

    def run(self):
        while True:
            self.mx = self.queue.get()
            res = dns.resolver.Resolver()
            res.lifetime = 1.5
            res.timeout = 0.5
            try:
                answers = res.query(self.mx, 'AAAA')
                ip_mx = str(answers[0])
            except:
                ip_mx = "N/A"
            lock.acquire()
            sql = "INSERT INTO mx (id_domain,mx,ip_mx) VALUES (" + str(id_domain) + ",'" + str(self.mx) + "','" + str(ip_mx) + "')"
            try:
                cursor.execute(sql)
                db.commit()
            except:
                db.rollback()
            print "MX", '>>', ip_mx, ' :: ', str(self.mx)
            lock.release()
            self.queue.task_done()
The Thread class in use...
(The main for-loop is not shown here; this is just part of its body.)
try:
    answers = resolver.query(domain, 'MX')
    qMX = Queue.Queue()
    for i in range(len(answers)):
        t = MX_getAAAA_thread(qMX, id_domain)
        t.setDaemon(True)
        threads.append(t)
        t.start()
    for mx in answers:
        qMX.put(mx.exchange)
    qMX.join()
except NoAnswer as e:
    print "MX - Error: No Answer"
except Timeout as etime:
    print "MX - Error: dns.exception.Timeout"

print "end of script"
I tried to:
for thread in threads:
    thread.join()
after the queue was done, but thread.join() never stops waiting, despite the fact that there is no need to wait: once queue.join() returns, there is nothing left for the threads to do.
What I often do when my thread involves an infinite loop like this is to change the condition to something I can control from the outside, for example like this:
def run(self):
    self.keepRunning = True
    while self.keepRunning:
        pass  # do stuff
That way, I can set the keepRunning property to False from the outside to gracefully terminate the thread the next time it checks the loop condition.
Btw., since you seem to spawn exactly one thread for each item you put into the queue, you don't even need to have the threads loop at all, although I would argue that you should always enforce a maximum limit on the number of threads created this way (i.e. for i in range(min(len(answers), MAX_THREAD_COUNT)):).
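The stdlib's threading.Event is a thread-safe way to express the same flag; a minimal sketch (my variant, not part of the original answer):

import threading

class StoppableThread(threading.Thread):
    # hypothetical example class, not from the question's code
    def __init__(self):
        threading.Thread.__init__(self)
        self._stop_event = threading.Event()

    def run(self):
        while not self._stop_event.is_set():
            # do one unit of work here, then wait briefly so the stop
            # flag is re-checked without busy-spinning
            self._stop_event.wait(0.1)

    def stop(self):
        self._stop_event.set()  # the thread exits on its next loop check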
Alternative
In your case, instead of terminating the threads in each for-loop iteration, you could just reuse them. From what I gather from your thread's source, all that makes a thread unique to an iteration is the id_domain property you set on its creation. You could, however, provide that through the queue as well, so the threads are completely independent and you can reuse them.
This could look like this:
qMX = Queue.Queue()
threads = []
for i in range(MAX_THREAD_COUNT):
    t = MX_getAAAA_thread(qMX)
    t.daemon = True
    threads.append(t)
    t.start()

for id_domain in enumerateIdDomains():
    answers = resolver.query(id_domain, 'MX')
    for mx in answers:
        qMX.put((id_domain, mx.exchange))  # insert a tuple
    qMX.join()

for thread in threads:
    thread.keepRunning = False
Of course, you would need to change your thread a bit then:
class MX_getAAAA_thread(threading.Thread):
    def __init__(self, queue):
        threading.Thread.__init__(self)
        self.queue = queue

    def run(self):
        self.keepRunning = True
        while self.keepRunning:
            id_domain, mx = self.queue.get()
            # do stuff
I do not see why you need a Queue in the first place; after all, in your design every thread just processes one task. You should be able to pass that task to the thread on creation. This way you do not need a Queue, and you get rid of the while-loop:
class MX_getAAAA_thread(threading.Thread):
    def __init__(self, id_domain, mx):
        threading.Thread.__init__(self)
        self.id_domain = id_domain
        self.mx = mx
Then you can get rid of the while-loop inside the run method:
def run(self):
    res = dns.resolver.Resolver()
    res.lifetime = 1.5
    res.timeout = 0.5
    try:
        answers = res.query(self.mx, 'AAAA')
        ip_mx = str(answers[0])
    except:
        ip_mx = "N/A"
    with lock:
        sql = "INSERT INTO mx (id_domain,mx,ip_mx) VALUES (" + str(self.id_domain) + ",'" + str(self.mx) + "','" + str(ip_mx) + "')"
        try:
            cursor.execute(sql)
            db.commit()
        except:
            db.rollback()
        print "MX", '>>', ip_mx, ' :: ', str(self.mx)
Create one thread for each task:
for mx in answers:
    t = MX_getAAAA_thread(id_domain, mx)
    t.setDaemon(True)
    threads.append(t)
    t.start()
and join them:
for thread in threads:
    thread.join()
Joining the threads will do the trick, but the joins in your case block indefinitely, because your threads never exit the run loop. You need to exit the run method so that the threads can be joined.
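If you keep the queue-based design, another way to let those joins return is the sentinel pattern shown earlier on this page: put one None per worker after the real work, and have run() exit when it sees one. A minimal sketch (assuming the question's qMX and threads list):

# inside MX_getAAAA_thread.run(), exit on the sentinel:
#     self.mx = self.queue.get()
#     if self.mx is None:
#         self.queue.task_done()
#         return
for _ in threads:  # one sentinel per worker
    qMX.put(None)
for thread in threads:  # now each join() can return
    thread.join()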
