# Version 1
main_df = pd.read_csv('Million_rows.csv')

def myfunction(*args, start, end):
    for i in range(start, end):
        if condition1:
            for item in mainTreeSearch:
                ...
                lock.acquire()
                ### write to main_df
                lock.release()
                noLuck = False
                break
        if noLuck and Acondition:
            lock.acquire()
            ### write to main_df
            lock.release()
        elif ...:
            ...  # various asymmetric decision trees

t1 = Thread(target=myfunction, args=args, kwargs={'start': 0, 'end': 250})
t2 = Thread(target=myfunction, args=args, kwargs={'start': 250, 'end': 500})
t3 = Thread(target=myfunction, args=args, kwargs={'start': 500, 'end': 750})
t4 = Thread(target=myfunction, args=args, kwargs={'start': 750, 'end': 1000})
My problem is that I don't know how to feed the threads the rest of the rows; I have tried Queue, unsuccessfully.
# Version 2
def myfunction(*args, q):
    while True:
        q.get()
        # ...same search as above... without locking
        q.task_done()

q = Queue(maxsize=0)
num_threads = 5
threads = []
for i in range(num_threads):
    worker = Thread(target=myfunction, args=args)
    worker.setDaemon(True)
    threads.append(worker)
    worker.start()

for x in range(1000):
    # time.sleep(.005)
    q.put(x)
q.join()
In Version 2, without the sleep, either one thread hogs all the data or random crashes happen.
In Version 1, should I use the threading notify() mechanism, and if so, how is it implemented?
I reformatted it to this and it works as expected
from Queue import Queue
import threading

q = Queue()

def myfuntion(q):
    while True:
        val = q.get()
        print('\n' + str(threading.currentThread()))
        print('\n' + str(val))
        q.task_done()

num_threads = 5
threads = []
for i in range(num_threads):
    worker = threading.Thread(target=myfuntion, args=(q,))
    worker.setDaemon(True)
    threads.append(worker)
    worker.start()

for x in range(1000):
    q.put(x)
q.join()
Check it out. I think the way you are passing the parameters is wrong.
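For reference, here is a Python 3 sketch of the same fix (not from the answer above; extra_arg is a placeholder for whatever shared parameters the original args* carried). The point is that the queue has to appear in the args tuple alongside any other parameters:

from queue import Queue
from threading import Thread

def myfunction(extra_arg, q):
    while True:
        row = q.get()      # blocks until a row index is available
        # ... same search as above, writing results for this row ...
        print(extra_arg, row)
        q.task_done()

q = Queue()
extra_arg = 'shared state'  # placeholder for whatever args* carried
num_threads = 5
for _ in range(num_threads):
    worker = Thread(target=myfunction, args=(extra_arg, q), daemon=True)
    worker.start()

for x in range(1000):
    q.put(x)
q.join()  # blocks until every queued row has been marked task_done()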
I want to implement the producer-consumer pattern using multiprocessing.pool.Pool.
Since a JoinableQueue cannot be used in a Pool (it raises RuntimeError: JoinableQueue objects should only be shared between processes through inheritance), I have to use multiprocessing.Manager(), inspired by this answer.
The question is: the program may now hang when there are more consumer jobs than producer jobs.
import queue
import random
from multiprocessing import Manager, Pool

def consumer(q):
    while True:
        try:
            res = q.get(block=False)
            if res is None:
                break
            print(f'Consume {res}')
        except queue.Empty:
            pass

def producer(q, food):
    for i in range(2):
        res = f'{food} {i}'
        print(f'Produce {res}')
        q.put(res)
    q.put(None)  # sentinel

if __name__ == "__main__":
    with Pool() as pool:
        jobs = 2
        foods = ['apple', 'banana', 'melon', 'salad']
        q = Manager().Queue()

        [pool.apply_async(func=consumer, args=(q, )) for _ in range(jobs + 1)]  # would hang
        # would only not hang when the number of consumer jobs is less than or
        # equal to the number of producer jobs:
        # [pool.apply_async(func=consumer, args=(q, )) for _ in range(jobs)]

        [
            pool.apply_async(func=producer, args=(q, random.choice(foods)))
            for _ in range(jobs)
        ]

        pool.close()
        pool.join()
Seems like those extra consumers can't get the sentinel and just wait there forever.
So what's the elegant way to implement the producer-consumer pattern in multiprocessing.pool.Pool?
Or is it only possible with multiprocessing.Process + JoinableQueue?
You can use a multiprocessing.JoinableQueue by having your process pool workers access it as a global variable that gets initialized using a pool initializer:
import multiprocessing

def init_pool(input_q, output_q):
    global in_q, out_q
    in_q = input_q
    out_q = output_q

def worker():
    print(type(in_q))

# required by Windows
if __name__ == '__main__':
    in_q = multiprocessing.JoinableQueue()
    out_q = multiprocessing.JoinableQueue()
    pool = multiprocessing.Pool(2, initializer=init_pool, initargs=(in_q, out_q))
    pool.apply(worker)
Prints:
<class 'multiprocessing.queues.JoinableQueue'>
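Building on that, here is a minimal sketch (not from the original answer) of a long-running consumer that uses the shared in_q; the item values and pool size are illustrative only:

import multiprocessing

def init_pool(input_q):
    global in_q
    in_q = input_q

def consumer():
    while True:
        item = in_q.get()          # blocks until an item is available
        print(f'Consume {item}')
        in_q.task_done()

if __name__ == '__main__':
    in_q = multiprocessing.JoinableQueue()
    pool = multiprocessing.Pool(2, initializer=init_pool, initargs=(in_q,))
    for i in range(10):
        in_q.put(i)
    for _ in range(2):
        pool.apply_async(consumer)   # two long-running consumers
    in_q.join()                      # returns once task_done() was called for every item
    pool.terminate()                 # the consumers loop forever, so terminate instead of close/join
    pool.join()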
Seems like using multiprocessing.Process + JoinableQueue is a more elegant way.
import queue
import random
from multiprocessing import JoinableQueue, Process

def consumer(q: JoinableQueue):
    while True:
        try:
            res = q.get(block=False)
            print(f'Consume {res}')
            q.task_done()
        except queue.Empty:
            pass

def producer(q: JoinableQueue, food):
    for i in range(2):
        res = f'{food} {i}'
        print(f'Produce {res}')
        q.put(res)
    q.join()

if __name__ == "__main__":
    foods = ['apple', 'banana', 'melon', 'salad']
    jobs = 2
    q = JoinableQueue()

    producers = [
        Process(target=producer, args=(q, random.choice(foods)))
        for _ in range(jobs)
    ]
    # daemon=True is important here
    consumers = [
        Process(target=consumer, args=(q, ), daemon=True)
        for _ in range(jobs * 2)
    ]
    # + order here doesn't matter
    for p in consumers + producers:
        p.start()
    for p in producers:
        p.join()
In funkid's self-answer, when using JoinableQueue you no longer have to send/receive a trailing None sentinel in the producer/consumer. All producers wait until every item in the queue has been consumed (after the final task_done()), and are then joined by the main process.
As pointed out by @Koby, daemon=True is important because it allows the consumers (blocked on the empty queue) to be killed when the main process terminates.
Modified program:
import random
from multiprocessing import JoinableQueue, Process

def consumer(q: JoinableQueue):
    while True:
        res = q.get()
        print(f'Consume {res}')
        q.task_done()

def producer(q: JoinableQueue, food):
    for i in range(2):
        res = f'{food} {i}'
        print(f'Produce {res}')
        q.put(res)
    q.join()

if __name__ == "__main__":
    foods = ['apple', 'banana', 'melon', 'salad']
    jobs = 2
    q = JoinableQueue()

    producers = [
        Process(target=producer, args=(q, random.choice(foods)))
        for _ in range(jobs)
    ]
    # daemon=True is important here
    consumers = [
        Process(target=consumer, args=(q, ), daemon=True)
        for _ in range(jobs * 2)
    ]
    # + order here doesn't matter
    for p in consumers + producers:
        p.start()
    for p in producers:
        p.join()
I'm trying to implement mutual exclusion using a semaphore in Python. The two procedures (proc1, proc2) are supposed to act as two independent, concurrent processes. They do exactly the same thing: store n in array[n], then increment n.
The purpose of the program is to show that by using a semaphore we can ensure that the array is filled properly, [0,1,2,3,4,5,6,7,8,9], without skipping any index. However, my code seems to produce [0,1,0,0,0,0,0,0,0,0]. I haven't used threads in Python before, so I don't know what's going on.
import threading
import time

n = 0
array = [0] * 10
sem = threading.Semaphore()

def proc1():
    global n, array
    while True:
        sem.acquire()
        array[n] = n
        n += 1
        sem.release()
        time.sleep(0.25)

def proc2():
    global n, array
    while True:
        sem.acquire()
        array[n] = n
        n += 1
        sem.release()
        time.sleep(0.25)

t = threading.Thread(target=proc1)
t.start()
t2 = threading.Thread(target=proc2)
t2.start()
print(array)
The problem was that the OP tried to print the result before the threads were done. They should have waited for join().
import threading
import time

n = 0
array = [0] * 10
sem = threading.Semaphore()

def proc(num):
    global n
    while True:
        sem.acquire()
        n = n + 1
        sem.release()
        if n > 9:
            break
        array[n] = n
        print("Thread {}: {}".format(num, array))
        time.sleep(0.25)

t1 = threading.Thread(target=proc, args=[1])
t2 = threading.Thread(target=proc, args=[2])
t1.start()
t2.start()
t1.join()
t2.join()
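For reference, a more minimal variant of the fix (a sketch, not from the answer above) keeps the check, the write and the increment inside the critical section, and joins both threads before printing:

import threading
import time

n = 0
array = [0] * 10
sem = threading.Semaphore()

def proc():
    global n
    while True:
        with sem:              # mutual exclusion around the shared index
            if n > 9:
                break
            array[n] = n
            n += 1
        time.sleep(0.25)

t1 = threading.Thread(target=proc)
t2 = threading.Thread(target=proc)
t1.start()
t2.start()
t1.join()     # wait for both threads before reading the result
t2.join()
print(array)  # [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]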
Different take on a Semaphore pattern, handing the "tasks" within the Sempahore itself
from dataclasses import dataclass
from threading import Thread
from typing import Callable
import logging

logger = logging.getLogger(__name__)

@dataclass
class Task:
    # minimal Task container; the original snippet referenced Task without defining it
    func: Callable
    args: tuple
    task_id: int = 0

class Semaphore:
    def __init__(self, max_threads):
        self.active_threads = 0
        self.max_threads = max_threads
        self.tasks = []

    def add_task(self, func, args):
        self.tasks.append(
            Task(
                func=func,
                args=args,
                task_id=len(self.tasks)
            )
        )

    def run_task(self, task: Task):
        _func = task.func
        _args = task.args
        self.active_threads += 1
        _func(*_args)
        self.active_threads -= 1

    def run(self, blocking=False):
        if blocking:
            self._run()
        else:
            t = Thread(target=self._run)
            t.start()

    def _run(self):
        while True:
            if self.active_threads < self.max_threads:
                task = self.tasks.pop()
                logger.info(f'starting task: {task.task_id}')
                t = Thread(
                    target=self.run_task,
                    args=(task,))
                t.start()
            if len(self.tasks) == 0:
                break
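A small usage sketch under the same assumptions (the fetch function and URLs are placeholders, not from the original post):

import time

def fetch(url):
    # placeholder workload for the sketch
    time.sleep(0.5)
    print(f'done: {url}')

sem = Semaphore(max_threads=3)
for i in range(10):
    sem.add_task(fetch, (f'https://example.com/{i}',))
sem.run(blocking=True)  # or blocking=False to dispatch from a background thread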
So I've got this code for producers and consumers:
import threading
import time
import random

N = 8
buffer = N * [None]
free = threading.Semaphore(N)
items = threading.Semaphore(0)

def prod():
    n = 0
    i = 0
    while True:
        time.sleep(random.random())
        free.acquire()
        buffer[i] = n
        i = (i + 1) % N
        n += 1
        items.release()

def cons():
    i = 0
    while True:
        time.sleep(random.random())
        items.acquire()
        print(buffer[i])
        i = (i + 1) % N
        free.release()

def main():
    p = threading.Thread(target=prod, args=[])
    c = threading.Thread(target=cons, args=[])
    p.start()
    c.start()
    p.join()
    c.join()

main()
But I want to be able to have three threads each for the producer and consumer. Can someone suggest a way I could do this using a third semaphore? Thanks.
Assuming this is not homework about semaphores and you want a real solution, you should use the Queue object, which can handle all of this by itself. If I understood it correctly, you want three producers and three consumers that share one buffer which can hold at most 8 items. If that's the case, the code can be simplified to something like this:
import threading
import time
import random
import Queue

def prod(queue):
    n = 0
    while True:
        time.sleep(random.random())
        queue.put(n)
        n += 1

def cons(queue):
    while True:
        time.sleep(random.random())
        n = queue.get()
        print n

def main():
    N = 8
    queue = Queue.Queue(N)
    threads = []
    for i in range(3):
        threads.append(threading.Thread(target=cons, args=[queue]))
        threads.append(threading.Thread(target=prod, args=[queue]))
    for thread in threads:
        thread.start()
    for thread in threads:
        thread.join()  # this will never really finish, because the threads run forever

main()
If you are interested in how the queue is implemented internally, you can see the source code here.
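If the exercise does require a third semaphore, the classic approach (a sketch, not taken from the answer above) is to use it as a mutex protecting the shared buffer indices, so that three producers and three consumers can run concurrently:

import threading
import time
import random

N = 8
buffer = N * [None]
free = threading.Semaphore(N)    # free slots in the buffer
items = threading.Semaphore(0)   # filled slots in the buffer
mutex = threading.Semaphore(1)   # the third semaphore: protects the shared indices

in_idx = 0   # next slot to write
out_idx = 0  # next slot to read
counter = 0  # next value to produce

def prod():
    global in_idx, counter
    while True:
        time.sleep(random.random())
        free.acquire()
        with mutex:
            buffer[in_idx] = counter
            counter += 1
            in_idx = (in_idx + 1) % N
        items.release()

def cons():
    global out_idx
    while True:
        time.sleep(random.random())
        items.acquire()
        with mutex:
            value = buffer[out_idx]
            out_idx = (out_idx + 1) % N
        free.release()
        print(value)

threads = [threading.Thread(target=prod) for _ in range(3)]
threads += [threading.Thread(target=cons) for _ in range(3)]
for t in threads:
    t.start()
for t in threads:
    t.join()  # runs forever, like the original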
I'm going through the "Little Book of Semaphores" right now, and I'm having a problem with the first Barrier problem. In the code below, I'm trying to have 3 threads rendezvous before continuing. This part works fine: I always get 3 "before"s pushed to the queue. However, I don't always get 3 "after"s pushed to the queue. Sometimes I do, but not always. What am I doing wrong?
import threading
import random
import Queue
import time

num_loops = 1

class myThread(threading.Thread):
    def __init__(self, id, count, n, q, locks):
        threading.Thread.__init__(self)
        self.id = id
        self.q = q
        self.n = n
        self.locks = locks
        self.count = count
        return

    def run(self):
        time.sleep(random.random()/100)
        self.q.put("before")
        with self.locks['mutex']:
            self.count[0] += 1
            if self.count[0] == self.n:
                locks['barrier'].release()
        locks['barrier'].acquire()
        locks['barrier'].release()
        time.sleep(random.random()/100)
        self.q.put("after")

if __name__ == '__main__':
    total = 10
    incorrect = 0
    num_threads = 3

    for _ in range(total):
        q = Queue.Queue()
        locks = {'mutex': threading.Semaphore(1),
                 'barrier': threading.Semaphore(0),
                 }
        threads = []
        count = [0]
        for i in range(num_threads):
            t = myThread(i, count, num_threads, q, locks)
            t.start()
            threads.append(t)
        for i in threads:
            t.join()
            print "join"

        one_loop = ['before']*num_threads + ['after']*num_threads
        total_loop = one_loop * num_loops
        result = []
        while not q.empty():
            result.append(q.get())
        print result
        if result != total_loop:
            incorrect += 1

    print "%s out of %s is wrong" % (incorrect, total)
I found the problem: you do not join all the threads. The lines:

for i in threads:
    t.join()
    print "join"

should be:

for i in threads:
    i.join()  # changed line
    print "join"

Joining t first just waits for the last thread created; in the remaining iterations it is a no-op.
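As an aside (not part of the original answer): on Python 3, threading.Barrier implements this rendezvous directly. A minimal sketch:

import threading

barrier = threading.Barrier(3)   # rendezvous point for 3 threads

def worker(worker_id):
    print('before', worker_id)
    barrier.wait()               # blocks until all 3 threads have arrived
    print('after', worker_id)

threads = [threading.Thread(target=worker, args=(i,)) for i in range(3)]
for t in threads:
    t.start()
for t in threads:
    t.join()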
I'm facing problems with the following example code:
from multiprocessing import Lock, Process, Queue, current_process

def worker(work_queue, done_queue):
    for item in iter(work_queue.get, 'STOP'):
        print("adding ", item, "to done queue")
        # this works: done_queue.put(item*10)
        done_queue.put(item*1000)  # this doesn't!
    return True

def main():
    workers = 4
    work_queue = Queue()
    done_queue = Queue()
    processes = []

    for x in range(10):
        work_queue.put("hi" + str(x))

    for w in range(workers):
        p = Process(target=worker, args=(work_queue, done_queue))
        p.start()
        processes.append(p)
        work_queue.put('STOP')

    for p in processes:
        p.join()

    done_queue.put('STOP')

    for item in iter(done_queue.get, 'STOP'):
        print(item)

if __name__ == '__main__':
    main()
When the done queue becomes big enough (a limit of about 64k, I think), the whole thing freezes without any further notice.
What is the general approach for such a situation when the queue becomes too big? Is there some way to remove elements on the fly once they are processed? The Python docs recommend removing the p.join(), but in a real application I cannot estimate when the processes have finished. Is there a simple solution for this problem besides looping infinitely and using .get_nowait()?
This works for me with 3.4.0alpha4, 3.3, 3.2, 3.1 and 2.6. It tracebacks with 2.7 and 3.0. I pylint'd it, BTW.
#!/usr/local/cpython-3.3/bin/python
'''SSCCE for a queue deadlock'''

import sys
import multiprocessing

def worker(workerno, work_queue, done_queue):
    '''Worker function'''
    #reps = 10     # this worked for the OP
    #reps = 1000   # this worked for me
    reps = 10000   # this didn't

    for item in iter(work_queue.get, 'STOP'):
        print("adding", item, "to done queue")
        #this works: done_queue.put(item*10)
        for thing in item * reps:
            #print('workerno: {}, adding thing {}'.format(workerno, thing))
            done_queue.put(thing)

    done_queue.put('STOP')
    print('workerno: {0}, exited loop'.format(workerno))
    return True

def main():
    '''main function'''
    workers = 4
    work_queue = multiprocessing.Queue(maxsize=0)
    done_queue = multiprocessing.Queue(maxsize=0)
    processes = []

    for integer in range(10):
        work_queue.put("hi" + str(integer))

    for workerno in range(workers):
        dummy = workerno
        process = multiprocessing.Process(target=worker, args=(workerno, work_queue, done_queue))
        process.start()
        processes.append(process)
        work_queue.put('STOP')

    itemno = 0
    stops = 0
    while True:
        item = done_queue.get()
        itemno += 1
        sys.stdout.write('itemno {0}\r'.format(itemno))
        if item == 'STOP':
            stops += 1
            if stops == workers:
                break
    print('exited done_queue empty loop')

    for workerno, process in enumerate(processes):
        print('attempting process.join() of workerno {0}'.format(workerno))
        process.join()

    done_queue.put('STOP')

if __name__ == '__main__':
    main()
HTH
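A note on why this version avoids the freeze: as the multiprocessing documentation warns, a child process that has put items on a queue will not terminate until all of those items have been flushed to the underlying pipe, so calling p.join() while done_queue is still full deadlocks both sides. The program above therefore drains done_queue first, counting one STOP sentinel per worker, and only joins the processes afterwards, when no child can still be blocked on a full queue.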