I'm busy trying to hunt down an issue I'm having in the Google Pub/Sub library (https://github.com/googleapis/python-pubsub/issues/273), and I think it's related to the memory being used by the ThreadPoolExecutor.
To test this, I created a very simple threaded daemon application to see what the memory usage looks like while it runs:
from concurrent import futures
from random import seed
from random import randint
import time
import psutil
import os
import threading
import sys

PROCESS = psutil.Process(os.getpid())


def get_mem_usage():
    return PROCESS.memory_info().rss // 1024


def check_for_even(my_number: int) -> str:
    thread_name = threading.current_thread().getName()
    print(f"{thread_name} - Checking for even: {my_number}")
    if is_even(my_number):
        return my_number
    return None


def get_number() -> int:
    return randint(1, 10)


def is_even(random_number) -> int:
    if random_number % 2 == 0:
        return True
    return None


def got_number(future: futures.Future):
    thread_name = threading.current_thread().getName()
    x = [n for n in range(int(1e6))]
    result = future.result()
    if result:
        print(f"{thread_name} - Got even number: {result}")
    else:
        print(f"{thread_name} - Number was not even")
    thread_name = None
    del thread_name
    x = None
    del x
    result = None
    del result


def main():
    seed(int(time.time()))
    MAX_WORKERS = 5
    with futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        future_list = []
        while True:
            try:
                print(f"Memory: {get_mem_usage()}")
                number = get_number()
                future = executor.submit(check_for_even, number)
                future.add_done_callback(got_number)
                future_list.append(future)
            except KeyboardInterrupt:
                executor._threads.clear()
                futures.thread._threads_queues.clear()
                raise
            print(f"Active thread count: {threading.active_count()}")
            if threading.active_count() - 1 >= MAX_WORKERS:
                for future in future_list:
                    if future.done():
                        future.cancel()
                        future.result()
                        future_list.remove(future)
            time.sleep(0.001)
            print(f"Future list count: {len(future_list)}")


if __name__ == '__main__':
    main()
When I run this, memory just keeps increasing for the most part. Some memory is reclaimed on occasion (it seems like almost never), but the rate at which it grows far outstrips the rate at which it is freed, which means that in a long-running production application the box will eventually run out of memory and the application will grind to a halt and crash.
Obviously, this is not ideal.
Am I doing something incorrect in the example above, or is there something I could be doing better to manage and free up memory for the application (using gc.collect() doesn't do much)?
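For reference, one direction I'm considering (a minimal sketch, not my real code; MAX_IN_FLIGHT and submit_throttled are names I made up for the example) is to bound the number of futures that can exist at once with a semaphore, so that completed futures, and whatever data they still reference, can actually be dropped:

import threading
from concurrent import futures

MAX_IN_FLIGHT = 100  # assumed cap on outstanding futures, tune as needed
slots = threading.BoundedSemaphore(MAX_IN_FLIGHT)


def submit_throttled(executor, fn, *args):
    slots.acquire()  # block the producer while too many futures are outstanding
    future = executor.submit(fn, *args)
    future.add_done_callback(lambda f: slots.release())  # free the slot when done
    return future


with futures.ThreadPoolExecutor(max_workers=5) as executor:
    for number in range(1000):
        submit_throttled(executor, print, number)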
Related
I'm writing a program which starts one thread to generate "work" and add it to a queue every N seconds. Then, I have a thread pool which processes items in the queue.
The program below works perfectly fine until I comment out/delete line #97 (the time.sleep(0.5) in the main function). Once I do that, it raises a RuntimeError when attempting to gracefully stop the program (by sending a SIGINT or SIGTERM to the main process). It even works fine with an extremely small sleep like 0.1s, but fails with no sleep at all.
I tried researching "reentrancy" but it went a bit over my head unfortunately.
Can anyone help me to understand this?
Code:
import random
import signal
import threading
import time
from concurrent.futures import Future, ThreadPoolExecutor
from datetime import datetime
from queue import Empty, Queue, SimpleQueue
from typing import Any


class UniqueQueue:
    """
    A thread safe queue which can only ever contain unique items.
    """

    def __init__(self) -> None:
        self._q = Queue()
        self._items = []
        self._l = threading.Lock()

    def get(self, block: bool = False, timeout: float | None = None) -> Any:
        with self._l:
            try:
                item = self._q.get(block=block, timeout=timeout)
            except Empty:
                raise
            else:
                self._items.pop(0)
                return item

    def put(self, item: Any, block: bool = False, timeout: float | None = None) -> None:
        with self._l:
            if item in self._items:
                return None
            self._items.append(item)
            self._q.put(item, block=block, timeout=timeout)

    def size(self) -> int:
        return self._q.qsize()

    def empty(self) -> bool:
        return self._q.empty()


def stop_app(sig_num, sig_frame) -> None:
    # global stop_app_event
    print("Signal received to stop the app")
    stop_app_event.set()


def work_generator(q: UniqueQueue) -> None:
    last_execution = time.time()
    is_first_execution = True
    while not stop_app_event.is_set():
        elapsed_seconds = int(time.time() - last_execution)
        if elapsed_seconds <= 10 and not is_first_execution:
            time.sleep(0.5)
            continue
        last_execution = time.time()
        is_first_execution = False
        print("Generating work...")
        for _ in range(100):
            q.put({"n": random.randint(0, 500)})


def print_work(w) -> None:
    print(f"{datetime.now()}: {w}")


def main():
    # Create a work queue
    work_queue = UniqueQueue()

    # Create a thread to generate the work and add to the queue
    t = threading.Thread(target=work_generator, args=(work_queue,))
    t.start()

    # Create a thread pool, get work from the queue, and submit to the pool for processing
    pool = ThreadPoolExecutor(max_workers=20)
    futures: list[Future] = []
    while True:
        print("Processing work...")
        if stop_app_event.is_set():
            print("stop_app_event is set:", stop_app_event.is_set())
            for future in futures:
                future.cancel()
            break
        print("Queue Size:", work_queue.size())
        try:
            while not work_queue.empty():
                work = work_queue.get()
                future = pool.submit(print_work, work)
                futures.append(future)
        except Empty:
            pass
        time.sleep(0.5)

    print("Stopping the work generator thread...")
    t.join(timeout=10)
    print("Work generator stopped")

    print("Stopping the thread pool...")
    pool.shutdown(wait=True)
    print("Thread pool stopped")


if __name__ == "__main__":
    stop_app_event = threading.Event()
    signal.signal(signalnum=signal.SIGINT, handler=stop_app)
    signal.signal(signalnum=signal.SIGTERM, handler=stop_app)
    main()
It's because you called print() in the signal handler, stop_app().
In C, a signal handler is executed in a background thread, but in Python it is executed in the main thread (see the reference). In your case, while one print() call was executing, another print() was made from the handler, so the term 'reentrant' fits perfectly, and the current I/O stack prohibits a reentrant call (see the implementation if you are interested).
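If you want to see the failure in isolation, a minimal sketch like the one below (just an illustration I put together, not your code) will usually reproduce it: the handler prints while the main thread is itself in the middle of a print(), and depending on timing the buffered stdout raises the reentrant-call RuntimeError.

import signal


def handler(signum, frame):
    # this runs on the main thread, possibly while the print() below is
    # still writing, which is exactly the reentrant call being rejected
    print("signal received")


signal.signal(signal.SIGINT, handler)

while True:
    # keep stdout busy so an incoming SIGINT is likely to land mid-print()
    print("main thread is printing " * 20)

Press Ctrl+C a few times while it runs; depending on timing you should eventually see the RuntimeError instead of a clean message.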
You can remedy this by using os.write() and sys.stdout like the following.
import sys
import os

...

def stop_app(sig_num, sig_frame):
    os.write(sys.stdout.fileno(), b"Signal received to stop the app\n")
    stop_app_event.set()
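Another option (only a sketch of the general idea, reusing your stop_app_event) is to do nothing in the handler except set the event, and move the printing into ordinary code that checks the event, for example at the top of the while loop in main():

def stop_app(sig_num, sig_frame):
    stop_app_event.set()  # keep the handler minimal; no I/O here

# ...then, somewhere in main(), report it from normal code:
if stop_app_event.is_set():
    print("Signal received to stop the app")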
The following code is taken from the book "Fluent Python". The comments were put there by me, from different sources.
import sys
from time import perf_counter
from typing import NamedTuple
from multiprocessing import Process, SimpleQueue, cpu_count
from multiprocessing import queues

from primes import is_prime, NUMBERS


class PrimeResult(NamedTuple):
    n: int
    prime: bool
    elapsed: float


JobQueue = queues.SimpleQueue[int]
ResultQueue = queues.SimpleQueue[PrimeResult]


def check(n: int) -> PrimeResult:
    t0 = perf_counter()
    res = is_prime(n)
    return PrimeResult(n, res, perf_counter() - t0)


def worker(jobs: JobQueue, results: ResultQueue) -> None:
    # SimpleQueue.get(block=True, timeout=None)
    # Remove and return an item from the queue. If optional args block is true
    # and timeout is None (the default), block if necessary until an item is
    # available. If timeout is a positive number, it blocks at most timeout
    # seconds and raises the Empty exception if no item was available within
    # that time. Otherwise (block is false), return an item if one is immediately
    # available, else raise the Empty exception (timeout is ignored in that case).
    while n := jobs.get():
        print(n)
        results.put(check(n))
    # the following line will tell main to increase procs_done by 1
    results.put(PrimeResult(0, False, 0.0))


def start_jobs(procs: int, jobs: JobQueue, results: ResultQueue) -> None:
    for n in NUMBERS:
        jobs.put(n)
    for _ in range(procs):
        proc = Process(target=worker, args=(jobs, results))
        proc.start()
        # zero will evaluate to False in the while loop of the worker function
        jobs.put(0)


def report(procs: int, results: ResultQueue) -> int:
    checked = 0
    procs_done = 0
    while procs_done < procs:
        n, prime, elapsed = results.get()
        if n == 0:
            procs_done += 1
        else:
            checked += 1
            label = "P" if prime else " "
            print(f"{n:16} {label} {elapsed:9.6f}s")
    return checked


def main() -> None:
    if len(sys.argv) < 2:
        procs = cpu_count()
    else:
        procs = int(sys.argv[1])

    print(f"Checking {len(NUMBERS)} numbers with {procs} processes:")
    t0 = perf_counter()
    jobs: JobQueue = SimpleQueue()
    results: ResultQueue = SimpleQueue()
    start_jobs(procs, jobs, results)
    checked = report(procs, results)
    elapsed = perf_counter() - t0
    print(f"{checked} checks in {elapsed:.2f}s")


if __name__ == "__main__":
    main()
If I now change the import to from queue import SimpleQueue, the same code does not work, even though the Python documentation says:
The [multiprocessing.]Queue, SimpleQueue and JoinableQueue types are
multi-producer, multi-consumer FIFO queues modelled on the queue.Queue
class in the standard library. They differ in that Queue lacks the
task_done() and join() methods introduced into Python 2.5’s
queue.Queue class.
The problem seems to be that the processes never get the zero-valued task, and I'm not sure why, so they can never exit the loop.
After much debugging, I figured out that the queues were somehow not being shared: the results queue seemed to be reset, and so did the jobs queue, but with an increasing number of zeros.
Then it dawned on me that I should be using something intended for threads, which by definition share the same memory, and that was it!
The queue module is meant for threads, whereas, if we specifically want queues for processes, we must use the multiprocessing module.
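As a quick way to convince yourself of this, here is a small sketch I wrote while checking (not from the book; it assumes the fork start method available on Linux/macOS, since a queue.SimpleQueue cannot even be pickled for a spawned child): put an item on each kind of queue from a child process and see which one the parent observes.

import multiprocessing
import queue


def child(mp_q, thread_q):
    mp_q.put("from child")      # goes through a pipe, visible to the parent
    thread_q.put("from child")  # only changes the child's in-memory copy


if __name__ == "__main__":
    multiprocessing.set_start_method("fork")  # assumption: Linux/macOS fork
    mp_q = multiprocessing.SimpleQueue()
    thread_q = queue.SimpleQueue()
    p = multiprocessing.Process(target=child, args=(mp_q, thread_q))
    p.start()
    p.join()
    print(mp_q.get())        # "from child"
    print(thread_q.empty())  # True: the parent's queue never saw the item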
Below is the code that worked for me:
import sys
from time import perf_counter
from typing import NamedTuple
from multiprocessing import cpu_count  # Process, SimpleQueue
from threading import Thread

# below is for multithreading
from queue import (
    Queue as SimpleQueue,
    Empty,
    Full,
)

# The Queue, SimpleQueue and JoinableQueue types are multi-producer, multi-consumer
# FIFO queues modelled on the queue.Queue class in the standard library.
# They differ in that Queue lacks the task_done() and join() methods introduced into
# Python 2.5’s queue.Queue class.
# from multiprocessing import queues

from primes import is_prime, NUMBERS


class PrimeResult(NamedTuple):
    n: int
    prime: bool
    elapsed: float


JobQueue = SimpleQueue[int]
ResultQueue = SimpleQueue[PrimeResult]


def check(n: int) -> PrimeResult:
    t0 = perf_counter()
    res = is_prime(n)
    print(f"inside check with {n}")
    return PrimeResult(n, res, perf_counter() - t0)


def worker(jobs: JobQueue, results: ResultQueue) -> None:
    # SimpleQueue.get(block=True, timeout=None)
    # Remove and return an item from the queue. If optional args block is true
    # and timeout is None (the default), block if necessary until an item is
    # available. If timeout is a positive number, it blocks at most timeout
    # seconds and raises the Empty exception if no item was available within
    # that time. Otherwise (block is false), return an item if one is immediately
    # available, else raise the Empty exception (timeout is ignored in that case).
    try:
        n = jobs.get()
        while n != 0:
            print(f"\nGetting element {n} in jobs queue")
            # Queue.put(item, block=True, timeout=None)
            # Put item into the queue. If optional args block is true and timeout is
            # None (the default), block if necessary until a free slot is available.
            # If timeout is a positive number, it blocks at most timeout seconds and
            # raises the Full exception if no free slot was available within that time.
            # Otherwise (block is false), put an item on the queue if a free slot is
            # immediately available, else raise the Full exception (timeout is ignored
            # in that case).
            results.put(check(n))
            jobs.task_done()
            print(f"exited check with {n}")
            n = jobs.get()
            print(f"exited get with new {n}")
    except (Empty, Full) as e:
        print(f"exception: {repr(e)}")
    # the following line will tell main to increase procs_done by 1
    print(f"exited while with n = {n}")
    results.put(PrimeResult(0, False, 0.0))
    print("after results.put(PrimeResult(0, False, 0.0))")

def start_jobs(procs: int, jobs: JobQueue, results: ResultQueue) -> None:
    for n in NUMBERS:
        # we're putting all the numbers on the queue.
        jobs.put(n)
        print(f"putting element {n} in jobs queue")
    for _ in range(procs):
        # proc = Process(target=worker, args=(jobs, results))
        # proc.start()
        # start()
        # Start the process’s activity.
        # This must be called at most once per process object. It arranges for the
        # object’s run() method to be invoked in a separate process.
        # run()
        # Method representing the process’s activity.
        # You may override this method in a subclass.
        # The standard run() method invokes the callable object passed to the
        # object’s constructor as the target argument, if any, with sequential and
        # keyword arguments taken from the args and kwargs arguments, respectively.
        thrd = Thread(target=worker, args=(jobs, results))
        thrd.start()
        # zero will evaluate to False in the while loop of the worker function.
        # The jobs queue already has all the numbers. Now, we're putting the poison
        # pill to make the worker function finish, thus ending the corresponding
        # worker.
        jobs.put(0)
        print(f"putting element {0} in jobs queue")


def report(procs: int, results: ResultQueue) -> int:
    checked = 0
    procs_done = 0
    while procs_done < procs:
        n, prime, elapsed = results.get()
        if n == 0:
            procs_done += 1
        else:
            checked += 1
            label = "P" if prime else " "
            print(f"{n:16} {label} {elapsed:9.6f}s")
        print(f"\tprocs_done = {procs_done}")
    return checked


def main() -> None:
    # sys.argv - The list of command line arguments passed to a Python script.
    # argv[0] is the script name (it is operating system dependent whether this
    # is a full pathname or not). If the command was executed using the -c
    # command line option to the interpreter, argv[0] is set to the string '-c'.
    # If no script name was passed to the Python interpreter, argv[0] is the empty
    # string. To loop over the standard input, or the list of files given on the
    # command line, see the fileinput module.
    if len(sys.argv) < 2:
        procs = cpu_count()
    else:
        procs = int(sys.argv[1])

    print(f"Checking {len(NUMBERS)} numbers with {procs} processes:")
    t0 = perf_counter()
    jobs: JobQueue = SimpleQueue()
    results: ResultQueue = SimpleQueue()
    start_jobs(procs, jobs, results)
    checked = report(procs, results)
    elapsed = perf_counter() - t0
    print(f"{checked} checks in {elapsed:.2f}s")


if __name__ == "__main__":
    main()
In my program, there is a section where I use multiple threads to simulate a distributed environment. All threads try to crack a password. As you can see, all threads call the same target function func with different arguments. This function returns a result whenever a trial that cracks the password is found.
def func(self, inp):
    trial = 0
    while (crackPwd(inp, trial) != True):
        trial += 1
    return inp

threads = []
for inp in range(inpAmount):
    thr = threading.Thread(target=func, args=(inp))
    threads.append(thr)
    thr.start()

for thr in threads:
    thr.join()
However, what I want is to stop the other threads after one of the threads cracks the password. In other words, I want to continue with the program flow as soon as one thread returns a result from func(). I tried to find a solution, but none of the ones I found seems to match my problem. Right now I get results from all threads and lose a lot of time waiting for every thread to finish. I would appreciate your help.
Could you use an instance of the threading Event class?
By mocking the crackPwd function mentioned in your code so that each call sleeps for a random 1-10 seconds and only "succeeds" when sleep_time happens to be 10 (a 10% probability per call), I tested with:
import time
import random
import threading


def crackPwd(inp, trial):
    sleep_time = random.randint(1, 10)
    time.sleep(sleep_time)
    return sleep_time


def func(inp):
    trial = 0
    while (crackPwd(inp, trial) != 10) and (not pwd_cracked.is_set()):
        trial += 1
    pwd_cracked.set()
    return inp


pwd_cracked = threading.Event()  # shared event, created once before the threads start
threads = []
for inp in range(10):
    thr = threading.Thread(target=func, args=(inp, ))
    threads.append(thr)
    thr.start()

for thr in threads:
    thr.join()
So for your original code:
def func(self, inp):
    trial = 0
    while (crackPwd(inp, trial) != True) and (not pwd_cracked.is_set()):
        trial += 1
    pwd_cracked.set()
    return inp


pwd_cracked = threading.Event()  # shared event, created once before the threads start
threads = []
for inp in range(inpAmount):
    thr = threading.Thread(target=func, args=(inp, ))
    threads.append(thr)
    thr.start()

for thr in threads:
    thr.join()
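If you would rather not manage the threads by hand, the same idea also fits concurrent.futures (this is only a sketch, assuming crackPwd returns a truthy value on success and that inpAmount is defined as in your code): the Event still stops the losing workers, and as_completed() lets you grab the first real result without waiting for everyone.

import threading
from concurrent.futures import ThreadPoolExecutor, as_completed

pwd_cracked = threading.Event()


def func(inp):
    trial = 0
    while not pwd_cracked.is_set():
        if crackPwd(inp, trial):   # assumed to return a truthy value on success
            pwd_cracked.set()
            return inp             # this worker actually cracked it
        trial += 1
    return None                    # another worker got there first


with ThreadPoolExecutor(max_workers=inpAmount) as executor:
    future_list = [executor.submit(func, inp) for inp in range(inpAmount)]
    for future in as_completed(future_list):
        result = future.result()
        if result is not None:
            print("Cracked by input:", result)
            break
    # leaving the with-block waits for the rest, but they exit quickly
    # once the event has been set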
I have spent a few days trying to understand Redlock, and I have seen that acquiring a lock takes around 1 second, which seems a bit too long just to take a lock in my opinion, but I could be wrong.
I have created a small script:
redis_test.py
import serialized_redis
from pottery import Redlock

redis_connection = serialized_redis.MsgpackSerializedRedis(host='localhost', port=6379, db=0)


def lock(argument):
    return Redlock(key=argument, auto_release_time=120 * 1000)
main.py
#!/usr/bin/python3
# -*- coding: utf-8 -*-

import math
import random
import sys
import time
from threading import Thread

from loguru import logger

from lib.redis_test import lock, redis_connection


class StopWatch:
    def __init__(self):
        self.start()

    def start(self):
        self._startTime = time.time()

    def getStartTime(self):
        return self._startTime

    def elapsed(self, prec=3):
        prec = 3 if prec is None or not isinstance(prec, int) else prec
        diff = time.time() - self._startTime
        return self.round(diff, prec)

    def round(self, n, p=0):
        m = 10 ** p
        return math.floor(n * m + 0.5) / m


def algorithm_with_lock(random_number):
    print("Keys inside Redis", redis_connection.keys())
    ourWatch = StopWatch()
    ourWatch.start()
    if not redis_connection.exists(f'redlock:{random_number}'):
        # print("Time taken before redis_connection.exists", ourWatch.elapsed()) -> 0.0 seconds
        with lock(f'{random_number}'):
            print("Time taken before redis_connection.exists", ourWatch.elapsed())  # 1.002 seconds
            time.sleep(5)
            redis_connection.set("Hello_world", random.randint(1, 5))
            return True
    else:
        return False


def main():
    while True:
        chosen_number = f"number_{random.randint(1, 3)}"
        response = algorithm_with_lock(chosen_number)
        if response:
            logger.info(f"Yay, finished my job! -> {chosen_number}")
            sys.exit()
        else:
            logger.debug(f"Trying new number! -> {chosen_number}")
            time.sleep(1)


for i in range(1):
    Thread(
        target=main,
    ).start()
    time.sleep(.1)
The issue with that is that it takes too long to actually lock a Redis key, which means multiple threads can try to lock the same key and end up stuck waiting on the same lock. My guess is that it should not take 1 second to actually acquire the lock, but I could be wrong, so here I am: what could be the reason for the long locking time, and is there a chance I'm using it incorrectly?
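A minimal way to isolate where that second goes (just a quick sketch reusing the lock() helper from redis_test.py above; the key name timing_test is arbitrary) would be to time the construction, acquire and release separately from everything else in algorithm_with_lock:

import time
from lib.redis_test import lock

t0 = time.perf_counter()
rl = lock("timing_test")  # arbitrary key, only used for this measurement
t1 = time.perf_counter()
with rl:                  # acquiring the Redlock
    t2 = time.perf_counter()
t3 = time.perf_counter()  # lock released when the with-block exits

print(f"construct: {t1 - t0:.3f}s  acquire: {t2 - t1:.3f}s  release: {t3 - t2:.3f}s")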
I am getting into multithreading in Python and came up with this simple test:
(By the way, this implementation might be very bad; I just wrote it down quickly for testing purposes. But if there is something terribly wrong, I would be thankful if you could point it out.)
#!/usr/bin/python2.7

import threading
import timeit

lst = range(0, 100000)
lstres = []
lstlock = threading.Lock()
lstreslock = threading.Lock()


def add_five(x):
    return x + 5


def worker_thread(args):
    print "started"
    while len(lst) > 0:
        lstlock.acquire()
        try:
            x = lst.pop(0)
        except IndexError:
            lstlock.release()
            return
        lstlock.release()
        x = add_five(x)
        lstreslock.acquire()
        lstres.append(x)
        lstreslock.release()


def test():
    try:
        t1 = threading.Thread(target=worker_thread, args=(1,))
        #t2 = threading.Thread(target=worker_thread, args=(2,))
        #t3 = threading.Thread(target=worker_thread, args=(3,))
        #t4 = threading.Thread(target=worker_thread, args=(4,))
        t1.start()
        #t2.start()
        #t3.start()
        #t4.start()
        t1.join()
        #t2.join()
        #t3.join()
        #t4.join()
    except:
        print "Error"
    print len(lstres)


if __name__ == "__main__":
    t = timeit.Timer(test)
    print t.timeit(2)
Despite the terrible example, I see the following: one thread is faster than four.
With one thread I get 13.46 seconds, and with 4 threads 25.47 seconds.
Is the access to the list by 4 threads a bottleneck, thus causing slower times, or did I do something wrong?
In your case, the Global Interpreter Lock isn't actually the problem.
Threading doesn't make things faster by default. In your case, the code is CPU bound: no thread is ever waiting for I/O (which would allow another to use the CPU). If you have code which needs 100% of the CPU, threading will only make it faster if a lot of the work is independent, which yours isn't: most of your code is holding locks, so no other thread can proceed.
Which brings us to the cause of the slowdown: switching threads and fighting for locks costs time. That's what eats the extra ~12 seconds in your case.
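To illustrate the "independent work" point, here is a rough sketch (Python 3 rather than the 2.7 of your example, and not a cure for the GIL: pure-Python CPU work still won't run in parallel, but it removes the per-item lock traffic): give each thread its own chunk and its own result slot so nothing is shared or locked while the work runs.

import threading

lst = list(range(100000))
num_threads = 4
results = [None] * num_threads  # one private slot per thread, no locking needed


def worker(idx, chunk):
    # each thread processes its own chunk and writes only to its own slot
    results[idx] = [x + 5 for x in chunk]


chunks = [lst[i::num_threads] for i in range(num_threads)]
threads = [threading.Thread(target=worker, args=(i, chunk))
           for i, chunk in enumerate(chunks)]
for t in threads:
    t.start()
for t in threads:
    t.join()

flat = [x for chunk in results for x in chunk]
print(len(flat))  # 100000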