Cannot exit from multithreading for loop - python

I cannot break out of the following function when an error occurs.
def run(self, max_workers=10):
    outputs = {}
    q = queue.Queue()
    for key, ground_truth in self.ground_truths.items():
        q.put((key, ground_truth))
    count = {}
    count['total_finish'] = 0
    start_time = time.time()

    def worker():
        while True:
            try:
                key, value = self.pred_on_one_image(q.get())
                outputs[key] = value
                count['total_finish'] += 1
            except:
                os._exit()
            finally:
                q.task_done()

    for i in range(max_workers):
        t = Thread(target=worker)
        t.daemon = True
        t.start()
    q.join()
    return outputs
I tried return, q.put(None), and sys.exit(), but none of them work; I have to press Ctrl+C manually to stop it.

quit() and exit() usually work for me.

Use q.get(block=False) so that an Empty exception is raised when the queue is empty. Otherwise, get() waits until an item becomes available. The default value of block is True, so your workers were blocked on q.get() and no exception was ever raised.
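To illustrate, here is a minimal standalone sketch (not the original class) of a worker loop that uses q.get(block=False) and treats queue.Empty as the signal to stop; run_all and fn are placeholder names, not from the question.

import queue
import threading

# Minimal sketch: pre-load the queue, then let each worker exit as soon as
# a non-blocking get() raises queue.Empty.
def run_all(items, fn, max_workers=10):
    q = queue.Queue()
    for key, payload in items:
        q.put((key, payload))
    outputs = {}

    def worker():
        while True:
            try:
                key, payload = q.get(block=False)
            except queue.Empty:
                return                      # queue drained: let the thread exit
            try:
                outputs[key] = fn(payload)
            except Exception as exc:
                outputs[key] = exc          # record the failure instead of hanging q.join()
            finally:
                q.task_done()

    threads = [threading.Thread(target=worker, daemon=True) for _ in range(max_workers)]
    for t in threads:
        t.start()
    q.join()
    return outputs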

Related

How to use multiprocessing apply_async pool in a while loop correctly

I need to use a pool to asynchronously parse results coming from an extraction method and send those results to a write queue.
I have tried the following, but it seems to just run iteratively... one process after the other:
process_pool = Pool(processes=30, maxtasksperchild=1)
while True:
    filepath = read_queue.get(True)
    if filepath is None:
        break
    res = process_pool.apply_async(func=process.run, args=(filepath, final_path), callback=write_queue.put)
    results.append(res)
for result in results:
    result.wait()
process_pool.close()
process_pool.join()
I have also tried just waiting on each result, but that does the same thing as the above:
process_pool = Pool(processes=30, maxtasksperchild=1)
while True:
    filepath = read_queue.get(True)
    if filepath is None:
        break
    res = process_pool.apply_async(func=process.run, args=(filepath, final_path), callback=write_queue.put)
    res.wait()
process_pool.close()
process_pool.join()
I also tried just scheduling processes and letting the pool block itself if it's out of workers to spawn:
process_pool = Pool(processes=30, maxtasksperchild=1)
while True:
    filepath = read_queue.get(True)
    if filepath is None:
        break
    process_pool.apply_async(func=process.run, args=(filepath, final_path), callback=write_queue.put)
process_pool.close()
process_pool.join()
This doesn't work either; it just churns through the items without actually running any function, and I'm not sure why. It seems I have to do something with the AsyncResult for the pool to actually schedule the task.
I need it to work like this:
When there is a result waiting in the queue, spawn a new process in the pool with that specific argument from the queue.
On callback, put that processed result in the write queue.
However, I can't get it to work asynchronously. It only works iteratively, because I have to do something with the result (.get, .wait, whatever) to actually get the task to schedule properly.
# write.py
def write(p_list):
    outfile = Path('outfile.txt.bz2')
    for data in p_list:
        if Path.exists(outfile):
            mode = 'ab'
        else:
            mode = 'wb'
        with bz2.open(filename=outfile, mode=mode, compresslevel=9) as output:
            temp = (str(data) + '\n').encode('utf-8')
            output.write(temp)
    print('JSON files written', flush=True)

class Write(Process):
    def __init__(self, write_queue: Queue):
        Process.__init__(self)
        self.write_queue = write_queue

    def run(self):
        while True:
            try:
                p_list = self.write_queue.get(True, 900)
            except Empty:
                continue
            if p_list is None:
                break
            write(p_list)
-
# process.py
def parse(data: int):
    global json_list
    time.sleep(.1)  # simulate parsing the json
    json_list.append(data)

def read(data: int):
    time.sleep(.1)
    parse(data)

def run(data: int):
    global json_list
    json_list = []
    read(data)
    return json_list

if __name__ == '__main__':
    global output_path, json_list
-
# main.py
if __name__ == '__main__':
    read_queue = Queue()
    write_queue = Queue()
    write = Write(write_queue=write_queue)
    write.daemon = True
    write.start()
    for i in range(0, 1000000):
        read_queue.put(i)
    read_queue.put(None)
    process_pool = Pool(processes=30, maxtasksperchild=1)
    while True:
        data = read_queue.get(True)
        if data is None:
            break
        res = process_pool.apply_async(func=process.run, args=(data,), callback=write_queue.put)
    write_queue.put(None)
    process_pool.close()
    process_pool.join()
    write.join()
    print('process done')
So, the problem is that there is no problem. I'm just stupid: with maxtasksperchild=1 the tasks are scheduled so quickly that it looks like nothing is happening (or maybe I'm the only one who thought that).
Here's a reasonable way to use an asynchronous process pool inside a while loop with maxtasksperchild=1:
import multiprocessing
import time

num_processes = 30  # pool size; pick whatever suits your machine

# func and callback live at module level so the pool workers can import them.
def func(elem):
    time.sleep(0.5)
    return elem

def callback(elem):
    # do something with processed data
    pass

if __name__ == '__main__':
    queue = multiprocessing.Queue()
    for i in range(0, 10000):
        queue.put(i)
    queue.put(None)  # sentinel so the while loop below can stop

    process_pool = multiprocessing.Pool(processes=num_processes, maxtasksperchild=1)
    results = []
    while True:
        data = queue.get(True)
        if data is None:
            break
        res = process_pool.apply_async(func=func, args=(data,), callback=callback)
        results.append(res)

    flag = False
    for i, res in enumerate(results):
        try:
            # get() raises multiprocessing.TimeoutError on timeout; wait() would not
            res.get(600)
            # do some logging
            results[i] = None
        except multiprocessing.TimeoutError:
            flag = True
            # do some logging

    process_pool.close()
    if flag:
        process_pool.terminate()
    process_pool.join()
    # done!

Queue doesn't process all elements when there are many threads

I have noticed that when I have many threads pulling elements from a queue, fewer elements get processed than the number I put into the queue. This is sporadic, but it seems to happen roughly half the time when I run the following code.
#!/bin/env python
from threading import Thread
import httplib, sys
from Queue import Queue
import time
import random

concurrent = 500
num_jobs = 500
results = {}

def doWork():
    while True:
        result = None
        try:
            result = curl(q.get())
        except Exception as e:
            print "Error when trying to get from queue: {0}".format(str(e))
        if results.has_key(result):
            results[result] += 1
        else:
            results[result] = 1
        try:
            q.task_done()
        except:
            print "Called task_done when all tasks were done"

def curl(ourl):
    result = 'all good'
    try:
        time.sleep(random.random() * 2)
    except Exception as e:
        result = "error: %s" % str(e)
    except:
        result = str(sys.exc_info()[0])
    finally:
        return result or "None"

print "\nRunning {0} jobs on {1} threads...".format(num_jobs, concurrent)
q = Queue()
for i in range(concurrent):
    t = Thread(target=doWork)
    t.daemon = True
    t.start()
for x in range(num_jobs):
    q.put("something")
try:
    q.join()
except KeyboardInterrupt:
    sys.exit(1)

total_responses = 0
for result in results:
    num_responses = results[result]
    print "{0}: {1} time(s)".format(result, num_responses)
    total_responses += num_responses
print "Number of elements processed: {0}".format(total_responses)
Tim Peters hit the nail on the head in the comments. The issue is that the results dict is updated from multiple threads and isn't protected by any sort of mutex. That allows something like this to happen:
thread A gets result: "all good"
thread A checks results[result]
thread A sees no such key
thread A suspends # <-- before counting its result
thread B gets result: "all good"
thread B checks results[result]
thread B sees no such key
thread B sets results['all good'] = 1
thread C ...
thread C sets results['all good'] = 2
thread D ...
thread A resumes # <-- and remembers it needs to count its result still
thread A sets results['all good'] = 1 # resetting previous work!
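One direct fix is to guard the shared dict with a mutex. A minimal sketch (the lock and the count_result helper are my additions, not from the original answer):

import threading

results = {}
results_lock = threading.Lock()

def count_result(result):
    # Holding the lock makes the check-and-update atomic, so the
    # interleaving shown above can no longer lose increments.
    with results_lock:
        results[result] = results.get(result, 0) + 1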
A more typical workflow might have a results queue that the main thread is listening on.
workq = queue.Queue()
resultsq = queue.Queue()

make_work(into=workq)
do_work(source=workq, respond_on=resultsq)
# do_work would do respond_on.put_nowait(result) instead of
# return result

results = {}
while True:
    try:
        result = resultsq.get(timeout=5)
    except queue.Empty:
        break  # maybe? You'd probably want to retry a few times
    results[result] = results.get(result, 0) + 1
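For concreteness, here is a runnable sketch of that results-queue pattern; the None sentinel handling and the counts are illustrative additions, not from the original post. Workers only ever put results on resultsq, and the main thread does all the counting, so no shared dict needs locking.

import queue
import threading

NUM_WORKERS = 50
NUM_JOBS = 500

workq = queue.Queue()
resultsq = queue.Queue()

def do_work():
    while True:
        job = workq.get()
        if job is None:                     # sentinel: no more work
            workq.task_done()
            return
        resultsq.put_nowait("all good")     # report on the queue instead of returning
        workq.task_done()

threads = [threading.Thread(target=do_work, daemon=True) for _ in range(NUM_WORKERS)]
for t in threads:
    t.start()

for _ in range(NUM_JOBS):
    workq.put("something")
for _ in range(NUM_WORKERS):
    workq.put(None)                         # one sentinel per worker
workq.join()

results = {}
while True:
    try:
        result = resultsq.get_nowait()
    except queue.Empty:
        break
    results[result] = results.get(result, 0) + 1

print(results)                              # {'all good': 500}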

Multi-processing... Running 2 scripts on 2 terminals

I have 2 scripts that each start multiple processes. Right now I'm opening two different terminals and running each script with python to start them both. How can I achieve this with a single command, or by running a single script?
Start.py (script 1)
# globals
my_queue = multiprocessing.Manager().Queue()  # queue to store our values
stop_event = multiprocessing.Event()  # flag which signals processes to stop
my_pool = None

def my_function(foo):
    print("starting %s" % foo)
    try:
        addnews.var(foo)
    except Exception, e:
        print str(e)

MAX_PROCESSES = 50
my_pool = multiprocessing.Pool(MAX_PROCESSES)
x = Var.objects.order_by('name').values('link')
for t in x:
    t = t.values()[0]
    my_pool.apply_async(my_function, args=(t,))
my_pool.close()
my_pool.join()
Start1.py (script 2)
# globals
MAX_PROCESSES = 50
my_queue = multiprocessing.Manager().Queue()  # queue to store our values
stop_event = multiprocessing.Event()  # flag which signals processes to stop
my_pool = None

def my_function(var):
    var.run_main(var)
    stop_event.set()

def var_scanner():
    # Since `t` could have unlimited size we'll put all `t` values in the queue
    while not stop_event.is_set():  # forever scan `values` for new items
        y = Var.objects.order_by('foo').values('foo__foo')
        for t in y:
            t = t.values()[0]
            my_queue.put(t)

try:
    var_scanner_process = multiprocessing.Process(target=var_scanner)
    var_scanner_process.start()
    my_pool = multiprocessing.Pool(MAX_PROCESSES)
    #while not stop_event.is_set():
    try:  # if queue isn't empty, get value from queue and create new process
        var = my_queue.get_nowait()  # getting value from queue
        p = multiprocessing.Process(target=my_function, args=(var,))
        p.start()
    except Queue.Empty:
        print "No more items in queue"
        time.sleep(1)
    #stop_event.set()
except KeyboardInterrupt as stop_test_exception:
    print(" CTRL+C pressed. Stopping test....")
    stop_event.set()
You can run the first script in the background in the same terminal by appending the shell's & operator to the command:
python start.py &
python start1.py
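If you'd rather not rely on the shell, a small launcher script can start both from a single command. A sketch assuming the two files are named start.py and start1.py and live in the current directory:

# launcher.py - filenames are assumptions; adjust paths as needed
import subprocess
import sys

procs = [
    subprocess.Popen([sys.executable, 'start.py']),
    subprocess.Popen([sys.executable, 'start1.py']),
]

# Wait for both scripts to finish.
for p in procs:
    p.wait()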

logging seems to have a memory leak for multi-threaded usage

I'm running into a memory leak in Python. I suspect it's related to using the logging module from multiple threads, but I can't find the cause.
Version 1 (with memory leak, multi-threaded calls)
campaign_id_queue = Queue.Queue()
campaign_worker = {}  # it has data inside, key is ID, value is Class object
for campaign_id, worker in campaign_worker.iteritems():
    campaign_id_queue.put(campaign_id)

thread_list = []
for n in range(THREAD_NUM):  # defined already
    thread_list.append(Thread(target=parallel_run, args=(campaign_id_queue, now, n, logger)))
for thread in thread_list:
    thread.daemon = True
    thread.start()
campaign_id_queue.join()

# another file
def parallel_run(campaign_id_queue, now, n, logger):
    while True:
        try:
            campaign_id = campaign_id_queue.get()
        except Queue.Empty:
            logger.warning('Queue empty')
        else:
            try:
                if worker.open_clients(logger) < 0:
                    logger.error('error here')
                    continue
                worker.run(now, logger)
            except Exception, e:
                logger.exception(e)
            finally:
                campaign_id_queue.task_done()
Version 2 (without memory leak, single-threaded calls)
campaign_worker = {}  # it has data inside, key is ID, value is Class object
for campaign_id, worker in campaign_worker.iteritems():
    if worker.open_clients(logger) < 0:
        logger.error('error here')
        continue
    worker.run(now, logger)
It turned out to be related to threads not being terminated after use, not to the logging module. It's solved now; thanks for the attention.
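For reference, a common pattern to make sure worker threads actually finish after the queue is drained is to give each worker a sentinel and join it, instead of leaving daemon threads blocked on get() forever. A minimal Python 2 sketch with placeholder data (not the original code):

# Minimal sketch: send each worker a sentinel so the threads exit and can be
# joined, instead of blocking forever on q.get().
import Queue
from threading import Thread

THREAD_NUM = 4
work_items = ['campaign_1', 'campaign_2', 'campaign_3']  # placeholder data

def parallel_run(q):
    while True:
        item = q.get()
        try:
            if item is None:        # sentinel: this worker is done
                return
            # ... process item, logging as needed ...
        finally:
            q.task_done()

q = Queue.Queue()
for item in work_items:
    q.put(item)

threads = [Thread(target=parallel_run, args=(q,)) for _ in range(THREAD_NUM)]
for t in threads:
    t.start()
for _ in threads:
    q.put(None)                     # one sentinel per worker
q.join()
for t in threads:
    t.join()                        # all threads have exited; nothing lingers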

Start one thread at a time with a queue

I am writing a server for some library (Python).
I want the server, while it is running its main loop, to open one thread to do something else.
I am controlling this thread with a queue: until there is a return value in the queue, I don't want the server to open another thread.
try:
    # we have a return in the queue
    returnValue = threadQueue.get(False)
    startAnotherThread = True
except Empty:
    print "Waiting for return value from thread....."
If there is a return value in the queue, startAnotherThread tells an if statement to open another thread.
I don't know why it's not working; maybe someone has an idea?
Solved:
Init before the server loop:
# thread queue
threadQueue = Queue()
# thread numbering
threadNumber = 0
# thread start
threadCanStart = True
Inside the server loop:
if threadCanStart:
    myThread(threadNumber, threadQueue).start()
    threadNumber += 1
    threadCanStart = False

try:
    # we have a return in the queue
    returnValue = threadQueue.get(False)
    print "Queue return: ", returnValue
    threadCanStart = True
except Empty:
    print "Waiting for thread....."
