Add delay between workers in concurrent.futures and shutdown - python

Looking to add delay between Threadpoolexecutor workers
checked = 0
session_number = 0.00
def parseCombo(file_name, config_info, threads2):
try:
global checked
global session_number
### some non related code here
if config_info["Parsing"] == "First":
system("title "+ config_info["name"] + " [+] Start Parsing...")
with concurrent.futures.ThreadPoolExecutor(max_workers=threads2) as executor:
for number in numbers:
number = number.strip()
executor.submit(call_master, number, config_info)
time.sleep(20)
So basically, with those threads I'm making some requests to API that limit 2 requests per second.
The max workers i can set is 2, if I raise the API block all requests.
Each task take 60 seconds to complete, what I want to do, is set max_workers to 10, then the executor start with 2 tasks "workers" and wait 3 seconds to start the next 2 and so on, without waiting for the first 2 to finish.
Next thing is to shutdown the executor when one the task return value.
def beta(url, number, config_info):
try:
## some non related code here
global checked
global session_number
session_number = session_number
checked +=1
if '999' in text:
track = re.findall('\999[0-9]+\.?[0-9]*', text)
num = float(track[0].replace("999", ""))
session_number += num
log(number, track[0], config_info, url, text)
Is there a way to shutdown executor from def beta?
If possible, need it shutdown when this line is triggered
log(number, track[0], config_info, url, text)

Related

Multiprocessing function not writing to file or printing

I'm working on a Raspberry Pi (3 B+) making a data collection device and I'm
trying to spawn a process to record the data coming in and write it to a file. I have a function for the writing that works fine when I call it directly.
When I call it using the multiprocess approach however, nothing seems to happen. I can see in task monitors in Linux that the process does in fact get spawned but no file gets written, and when I try to pass a flag to it to shut down it doesn't work, meaning I end up terminating the process and nothing seems to have happened.
I've been over this every which way and can't see what I'm doing wrong; does anyone else? In case it's relevant, these are functions inside a parent class, and one of the functions is meant to spawn another as a thread.
Code I'm using:
from datetime import datetime, timedelta
import csv
from drivers.IMU_SEN0 import IMU_SEN0
import multiprocessing, os
class IMU_data_logger:
_output_filename = ''
_csv_headers = []
_accelerometer_headers = ['Accelerometer X','Accelerometer Y','Accelerometer Z']
_gyroscope_headers = ['Gyroscope X','Gyroscope Y','Gyroscope Z']
_magnetometer_headers = ['Bearing']
_log_accelerometer = False
_log_gyroscope= False
_log_magnetometer = False
IMU = None
_writer=[]
_run_underway = False
_process=[]
_stop_value = 0
def __init__(self,output_filename='/home/pi/blah.csv',log_accelerometer = True,log_gyroscope= True,log_magnetometer = True):
"""data logging device
NOTE! Multiple instances of this class should not use the same IMU devices simultaneously!"""
self._output_filename = output_filename
self._log_accelerometer = log_accelerometer
self._log_gyroscope = log_gyroscope
self._log_magnetometer = log_magnetometer
def __del__(self):
# TODO Update this
if self._run_underway: # If there's still a run underway, end it first
self.end_recording()
def _set_up(self):
self.IMU = IMU_SEN0(self._log_accelerometer,self._log_gyroscope,self._log_magnetometer)
self._set_up_headers()
def _set_up_headers(self):
"""Set up the headers of the CSV file based on the header substrings at top and the input flags on what will be measured"""
self._csv_headers = []
if self._log_accelerometer is not None:
self._csv_headers+= self._accelerometer_headers
if self._log_gyroscope is not None:
self._csv_headers+= self._gyroscope_headers
if self._log_magnetometer is not None:
self._csv_headers+= self._magnetometer_headers
def _record_data(self,frequency,stop_value):
self._set_up() #Run setup in thread
"""Record data function, which takes a recording frequency, in herz, as an input"""
previous_read_time=datetime.now()-timedelta(1,0,0)
self._run_underway = True # Note that a run is now going
Period = 1/frequency # Period, in seconds, of a recording based on the input frequency
print("Writing output data to",self._output_filename)
with open(self._output_filename,'w',newline='') as outcsv:
self._writer = csv.writer(outcsv)
self._writer.writerow(self._csv_headers) # Write headers to file
while stop_value.value==0: # While a run continues
if datetime.now()-previous_read_time>=timedelta(0,1,0): # If we've waited a period, collect the data; otherwise keep looping
print("run underway value",self._run_underway)
if datetime.now()-previous_read_time>=timedelta(0,Period,0): # If we've waited a period, collect the data; otherwise keep looping
previous_read_time = datetime.now() # Update previous readtime
next_row = []
if self._log_accelerometer:
# Get values in m/s^2
axes = self.IMU.read_accelerometer_values()
next_row += [axes['x'],axes['y'],axes['z']]
if self._log_gyroscope:
# Read gyro values
gyro = self.IMU.read_gyroscope_values()
next_row += [gyro['x'],gyro['y'],gyro['z']]
if self._log_magnetometer:
# Read magnetometer value
b= self.IMU.read_magnetometer_bearing()
next_row += b
self._writer.writerow(next_row)
# Close the csv when done
outcsv.close()
def start_recording(self,frequency_in_hz):
# Create recording process
self._stop_value = multiprocessing.Value('i',0)
self._process = multiprocessing.Process(target=self._record_data,args=(frequency_in_hz,self._stop_value))
# Start recording process
self._process.start()
print(datetime.now().strftime("%H:%M:%S.%f"),"Data logging process spawned")
print("Logging Accelerometer:",self._log_accelerometer)
print("Logging Gyroscope:",self._log_gyroscope)
print("Logging Magnetometer:",self._log_magnetometer)
print("ID of data logging process: {}".format(self._process.pid))
def end_recording(self,terminate_wait = 2):
"""Function to end the recording multithread that's been spawned.
Args: terminate_wait: This is the time, in seconds, to wait after attempting to shut down the process before terminating it."""
# Get process id
id = self._process.pid
# Set stop event for process
self._stop_value.value = 1
self._process.join(terminate_wait) # Wait two seconds for the process to terminate
if self._process.is_alive(): # If it's still alive after waiting
self._process.terminate()
print(datetime.now().strftime("%H:%M:%S.%f"),"Process",id,"needed to be terminated.")
else:
print(datetime.now().strftime("%H:%M:%S.%f"),"Process",id,"successfully ended itself.")
====================================================================
ANSWER: For anyone following up here, it turns out the problem was my use of the VS Code debugger which apparently doesn't work with multiprocessing and was somehow preventing the success of the spawned process. Many thanks to Tomasz Swider below for helping me work through issues and, eventually, find my idiocy. The help was very deeply appreciated!!
I can see few thing wrong in your code:
First thing
stop_value == 0 will not work as the multiprocess.Value('i', 0) != 0, change that line to
while stop_value.value == 0
Second, you never update previous_read_time so it will write the readings as fast as it can, you will run out of disk quick
Third, try use time.sleep() the thing you are doing is called busy looping and it is bad, it is wasting CPU cycles needlessly.
Four, terminating with self._stop_value = 1 probably will not work there must be other way to set that value maybe self._stop_value.value = 1.
Well here is a pice of example code based on the code that you have provided that is working just fine:
import csv
import multiprocessing
import time
from datetime import datetime, timedelta
from random import randint
class IMU(object):
#staticmethod
def read_accelerometer_values():
return dict(x=randint(0, 100), y=randint(0, 100), z=randint(0, 10))
class Foo(object):
def __init__(self, output_filename):
self._output_filename = output_filename
self._csv_headers = ['xxxx','y','z']
self._log_accelerometer = True
self.IMU = IMU()
def _record_data(self, frequency, stop_value):
#self._set_up() # Run setup functions for the data collection device and store it in the self.IMU variable
"""Record data function, which takes a recording frequency, in herz, as an input"""
previous_read_time = datetime.now() - timedelta(1, 0, 0)
self._run_underway = True # Note that a run is now going
Period = 1 / frequency # Period, in seconds, of a recording based on the input frequency
print("Writing output data to", self._output_filename)
with open(self._output_filename, 'w', newline='') as outcsv:
self._writer = csv.writer(outcsv)
self._writer.writerow(self._csv_headers) # Write headers to file
while stop_value.value == 0: # While a run continues
if datetime.now() - previous_read_time >= timedelta(0, 1,
0): # If we've waited a period, collect the data; otherwise keep looping
print("run underway value", self._run_underway)
if datetime.now() - previous_read_time >= timedelta(0, Period,
0): # If we've waited a period, collect the data; otherwise keep looping
next_row = []
if self._log_accelerometer:
# Get values in m/s^2
axes = self.IMU.read_accelerometer_values()
next_row += [axes['x'], axes['y'], axes['z']]
previous_read_time = datetime.now()
self._writer.writerow(next_row)
# Close the csv when done
outcsv.close()
def start_recording(self, frequency_in_hz):
# Create recording process
self._stop_value = multiprocessing.Value('i', 0)
self._process = multiprocessing.Process(target=self._record_data, args=(frequency_in_hz, self._stop_value))
# Start recording process
self._process.start()
print(datetime.now().strftime("%H:%M:%S.%f"), "Data logging process spawned")
print("ID of data logging process: {}".format(self._process.pid))
def end_recording(self, terminate_wait=2):
"""Function to end the recording multithread that's been spawned.
Args: terminate_wait: This is the time, in seconds, to wait after attempting to shut down the process before terminating it."""
# Get process id
id = self._process.pid
# Set stop event for process
self._stop_value.value = 1
self._process.join(terminate_wait) # Wait two seconds for the process to terminate
if self._process.is_alive(): # If it's still alive after waiting
self._process.terminate()
print(datetime.now().strftime("%H:%M:%S.%f"), "Process", id, "needed to be terminated.")
else:
print(datetime.now().strftime("%H:%M:%S.%f"), "Process", id, "successfully ended itself.")
if __name__ == '__main__':
foo = Foo('/tmp/foometer.csv')
foo.start_recording(20)
time.sleep(5)
print('Ending recording')
foo.end_recording()

Python Tornado rate limiting AsyncHttpClient fetch

Currently using an API that rate limits me to 3000 requests per 10 seconds. I have 10,000 urls that are fetched using Tornado due to it's asynchronous IO nature.
How do I go about implementing a rate limit to reflect the API limit?
from tornado import ioloop, httpclient
i = 0
def handle_request(response):
print(response.code)
global i
i -= 1
if i == 0:
ioloop.IOLoop.instance().stop()
http_client = httpclient.AsyncHTTPClient()
for url in open('urls.txt'):
i += 1
http_client.fetch(url.strip(), handle_request, method='HEAD')
ioloop.IOLoop.instance().start()
You can check where does the value of i lies in the interval of 3000 requests. For example, if i is in between 3000 and 6000, you can set the timeout of 10 seconds on every request until 6000. After 6000, just double the timeout. And so on.
http_client = AsyncHTTPClient()
timeout = 10
interval = 3000
for url in open('urls.txt'):
i += 1
if i <= interval:
# i is less than 3000
# just fetch the request without any timeout
http_client.fetch(url.strip(), handle_request, method='GET')
continue # skip the rest of the loop
if i % interval == 1:
# i is now 3001, or 6001, or so on ...
timeout += timeout # double the timeout for next 3000 calls
loop = ioloop.IOLoop.current()
loop.call_later(timeout, callback=functools.partial(http_client.fetch, url.strip(), handle_request, method='GET'))
Note: I only tested this code with small number of requests. It might be possible that the value of i would change because you're subtracting i in handle_request function. If that's the case, you should maintain another variable similar to i and perform subtraction on that.

How would I go about using concurrent.futures and queues for a real-time scenario?

It is fairly easy to do parallel work with Python 3's concurrent.futures module as shown below.
with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
future_to = {executor.submit(do_work, input, 60): input for input in dictionary}
for future in concurrent.futures.as_completed(future_to):
data = future.result()
It is also very handy to insert and retrieve items into a Queue.
q = queue.Queue()
for task in tasks:
q.put(task)
while not q.empty():
q.get()
I have a script running in background listening for updates. Now, in theory assume that, as those updates arrive, I would queue them and do work on them concurrently using the ThreadPoolExecutor.
Now, individually, all of these components work in isolation, and make sense, but how do I go about using them together? I am not aware if it is possible to feed the ThreadPoolExecutor work from the queue in real time unless the data to work from is predetermined?
In a nutshell, all I want to do is, receive updates of say 4 messages a second, shove them in a queue, and get my concurrent.futures to work on them. If I don't, then I am stuck with a sequential approach which is slow.
Let's take the canonical example in the Python documentation below:
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
future_to_url = {executor.submit(load_url, url, 60): url for url in URLS}
for future in concurrent.futures.as_completed(future_to_url):
url = future_to_url[future]
try:
data = future.result()
except Exception as exc:
print('%r generated an exception: %s' % (url, exc))
else:
print('%r page is %d bytes' % (url, len(data)))
The list of URLS is fixed. Is it possible to feed this list in real-time and get the worker to process it as they come by, perhaps from a queue for management purposes? I am a bit confused on whether my approach is actually possible?
The example from the Python docs, expanded to take its work from a queue. A change to note, is that this code uses concurrent.futures.wait instead of concurrent.futures.as_completed to allow new work to be started while waiting for other work to complete.
import concurrent.futures
import urllib.request
import time
import queue
q = queue.Queue()
URLS = ['http://www.foxnews.com/',
'http://www.cnn.com/',
'http://europe.wsj.com/',
'http://www.bbc.co.uk/',
'http://some-made-up-domain.com/']
def feed_the_workers(spacing):
""" Simulate outside actors sending in work to do, request each url twice """
for url in URLS + URLS:
time.sleep(spacing)
q.put(url)
return "DONE FEEDING"
def load_url(url, timeout):
""" Retrieve a single page and report the URL and contents """
with urllib.request.urlopen(url, timeout=timeout) as conn:
return conn.read()
# We can use a with statement to ensure threads are cleaned up promptly
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
# start a future for a thread which sends work in through the queue
future_to_url = {
executor.submit(feed_the_workers, 0.25): 'FEEDER DONE'}
while future_to_url:
# check for status of the futures which are currently working
done, not_done = concurrent.futures.wait(
future_to_url, timeout=0.25,
return_when=concurrent.futures.FIRST_COMPLETED)
# if there is incoming work, start a new future
while not q.empty():
# fetch a url from the queue
url = q.get()
# Start the load operation and mark the future with its URL
future_to_url[executor.submit(load_url, url, 60)] = url
# process any completed futures
for future in done:
url = future_to_url[future]
try:
data = future.result()
except Exception as exc:
print('%r generated an exception: %s' % (url, exc))
else:
if url == 'FEEDER DONE':
print(data)
else:
print('%r page is %d bytes' % (url, len(data)))
# remove the now completed future
del future_to_url[future]
Output from fetching each url twice:
'http://www.foxnews.com/' page is 67574 bytes
'http://www.cnn.com/' page is 136975 bytes
'http://www.bbc.co.uk/' page is 193780 bytes
'http://some-made-up-domain.com/' page is 896 bytes
'http://www.foxnews.com/' page is 67574 bytes
'http://www.cnn.com/' page is 136975 bytes
DONE FEEDING
'http://www.bbc.co.uk/' page is 193605 bytes
'http://some-made-up-domain.com/' page is 896 bytes
'http://europe.wsj.com/' page is 874649 bytes
'http://europe.wsj.com/' page is 874649 bytes
At work I found a situation where I wanted to do parallel work on an unbounded stream of data. I created a small library inspired by the excellent answer already provided by Stephen Rauch.
I originally approached this problem by thinking about two separate threads, one that submits work to a queue and one that monitors the queue for any completed tasks and makes more room for new work to come in. This is similar to what Stephen Rauch proposed, where he consumes the stream using a feed_the_workers function that runs in a separate thread.
Talking to one of my colleagues, he helped me realize that you can get away with doing everything in a single thread if you define a buffered iterator that allows you to control how many elements are let out of the input stream every time you are ready to submit more work to the thread pool.
So we introduce the BufferedIter class
class BufferedIter(object):
def __init__(self, iterator):
self.iter = iterator
def nextN(self, n):
vals = []
for _ in range(n):
vals.append(next(self.iter))
return vals
which allows us to define the stream processor in the following way
import logging
import queue
import signal
import sys
import time
from concurrent.futures import ThreadPoolExecutor, wait, ALL_COMPLETED
level = logging.DEBUG
log = logging.getLogger(__name__)
handler = logging.StreamHandler(sys.stdout)
handler.setFormatter(logging.Formatter('%(asctime)s %(message)s'))
handler.setLevel(level)
log.addHandler(handler)
log.setLevel(level)
WAIT_SLEEP = 1 # second, adjust this based on the timescale of your tasks
def stream_processor(input_stream, task, num_workers):
# Use a queue to signal shutdown.
shutting_down = queue.Queue()
def shutdown(signum, frame):
log.warning('Caught signal %d, shutting down gracefully ...' % signum)
# Put an item in the shutting down queue to signal shutdown.
shutting_down.put(None)
# Register the signal handler
signal.signal(signal.SIGTERM, shutdown)
signal.signal(signal.SIGINT, shutdown)
def is_shutting_down():
return not shutting_down.empty()
futures = dict()
buffer = BufferedIter(input_stream)
with ThreadPoolExecutor(num_workers) as executor:
num_success = 0
num_failure = 0
while True:
idle_workers = num_workers - len(futures)
if not is_shutting_down():
items = buffer.nextN(idle_workers)
for data in items:
futures[executor.submit(task, data)] = data
done, _ = wait(futures, timeout=WAIT_SLEEP, return_when=ALL_COMPLETED)
for f in done:
data = futures[f]
try:
f.result(timeout=0)
except Exception as exc:
log.error('future encountered an exception: %r, %s' % (data, exc))
num_failure += 1
else:
log.info('future finished successfully: %r' % data)
num_success += 1
del futures[f]
if is_shutting_down() and len(futures) == 0:
break
log.info("num_success=%d, num_failure=%d" % (num_success, num_failure))
Below we show an example for how to use the stream processor
import itertools
def integers():
"""Simulate an infinite stream of work."""
for i in itertools.count():
yield i
def task(x):
"""The task we would like to perform in parallel.
With some delay to simulate a time consuming job.
With a baked in exception to simulate errors.
"""
time.sleep(3)
if x == 4:
raise ValueError('bad luck')
return x * x
stream_processor(integers(), task, num_workers=3)
The output for this example is shown below
2019-01-15 22:34:40,193 future finished successfully: 1
2019-01-15 22:34:40,193 future finished successfully: 0
2019-01-15 22:34:40,193 future finished successfully: 2
2019-01-15 22:34:43,201 future finished successfully: 5
2019-01-15 22:34:43,201 future encountered an exception: 4, bad luck
2019-01-15 22:34:43,202 future finished successfully: 3
2019-01-15 22:34:46,208 future finished successfully: 6
2019-01-15 22:34:46,209 future finished successfully: 7
2019-01-15 22:34:46,209 future finished successfully: 8
2019-01-15 22:34:49,215 future finished successfully: 11
2019-01-15 22:34:49,215 future finished successfully: 10
2019-01-15 22:34:49,215 future finished successfully: 9
^C <=== THIS IS WHEN I HIT Ctrl-C
2019-01-15 22:34:50,648 Caught signal 2, shutting down gracefully ...
2019-01-15 22:34:52,221 future finished successfully: 13
2019-01-15 22:34:52,222 future finished successfully: 14
2019-01-15 22:34:52,222 future finished successfully: 12
2019-01-15 22:34:52,222 num_success=14, num_failure=1
I really liked the interesting approach by #pedro above. However, when processing thousands of files, I noticed that at the end a StopIteration would be thrown and some files would always be skipped. I had to make a little modification to as follows. Very useful answer again.
class BufferedIter(object):
def __init__(self, iterator):
self.iter = iterator
def nextN(self, n):
vals = []
try:
for _ in range(n):
vals.append(next(self.iter))
return vals, False
except StopIteration as e:
return vals, True
-- Call as follows
...
if not is_shutting_down():
items, is_finished = buffer.nextN(idle_workers)
if is_finished:
stop()
...
-- Where stop is a function that simply tells to shutdown
def stop():
shutting_down.put(None)
It is possible to gain the benefits of the executor without strictly having to use a Queue. New tasks are submitted from the main thread. The undone futures are tracked and waited on until all futures are done.
import concurrent.futures
import sys
import time
sys.setrecursionlimit(64) # This is only for demonstration purposes to trigger a RecursionError. Do not set in practice.
def slow_factorial(n: int) -> int:
time.sleep(0.01)
if n == 0:
return 1
else:
return n * slow_factorial(n-1)
initial_inputs = [0, 1, 5, 20, 200, 100, 50, 51, 55, 40, 44, 21, 222, 333, 202, 1000, 10, 9000, 9009, 99, 9999]
for executor_class in (concurrent.futures.ThreadPoolExecutor, concurrent.futures.ProcessPoolExecutor):
for max_workers in (4, 8, 16, 32):
start_time = time.monotonic()
with executor_class(max_workers=max_workers) as executor:
futures_to_n = {executor.submit(slow_factorial, n): n for n in initial_inputs}
while futures_to_n:
futures_done, futures_not_done = concurrent.futures.wait(futures_to_n, return_when=concurrent.futures.FIRST_COMPLETED)
# Note: Length of futures_done is often > 1.
for future in futures_done:
n = futures_to_n.pop(future)
try:
factorial_n = future.result()
except RecursionError:
n_smaller = int(n ** 0.9)
future = executor.submit(slow_factorial, n_smaller)
futures_to_n[future] = n_smaller
# print(f'Failed to compute factorial of {n}. Trying to compute factorial of a smaller number {n_smaller} instead.')
else:
# print(f'Factorial of {n} is {factorial_n}.')
pass
used_time = time.monotonic() - start_time
executor_type = executor_class.__name__.removesuffix('PoolExecutor').lower()
print(f'Workflow took {used_time:.1f}s with {max_workers} {executor_type} workers.')
print()
Output:
Workflow took 9.4s with 4 thread workers.
Workflow took 6.3s with 8 thread workers.
Workflow took 5.4s with 16 thread workers.
Workflow took 5.2s with 32 thread workers.
Workflow took 9.0s with 4 process workers.
Workflow took 5.9s with 8 process workers.
Workflow took 5.1s with 16 process workers.
Workflow took 4.9s with 32 process workers.
For more clarity, uncomment the two print statements. As per the output above, there is an asymptotic speed benefit with more workers.

Redis: # of channels degrading latency. How to prevent degradation?

pub.py
import redis
import datetime
import time
import json
import sys
import threading
import gevent
from gevent import monkey
monkey.patch_all()
def main(chan):
redis_host = '10.235.13.29'
r = redis.client.StrictRedis(host=redis_host, port=6379)
while True:
def getpkg():
package = {'time': time.time(),
'signature' : 'content'
}
return package
#test 2: complex data
now = json.dumps(getpkg())
# send it
r.publish(chan, now)
print 'Sending {0}'.format(now)
print 'data type is %s' % type(now)
time.sleep(1)
def zerg_rush(n):
for x in range(n):
t = threading.Thread(target=main, args=(x,))
t.setDaemon(True)
t.start()
if __name__ == '__main__':
num_of_chan = 10
zerg_rush(num_of_chan)
cnt = 0
stop_cnt = 21
while True:
print 'Waiting'
cnt += 1
if cnt == stop_cnt:
sys.exit(0)
time.sleep(30)
sub.py
import redis
import threading
import time
import json
import gevent
from gevent import monkey
monkey.patch_all()
def callback(ind):
redis_host = '10.235.13.29'
r = redis.client.StrictRedis(host=redis_host, port=6379)
sub = r.pubsub()
sub.subscribe(str(ind))
start = False
avg = 0
tot = 0
sum = 0
while True:
for m in sub.listen():
if not start:
start = True
continue
got_time = time.time()
decoded = json.loads(m['data'])
sent_time = float(decoded['time'])
dur = got_time - sent_time
tot += 1
sum += dur
avg = sum / tot
print decoded #'Recieved: {0}'.format(m['data'])
file_name = 'logs/sub_%s' % ind
f = open(file_name, 'a')
f.write('processing no. %s' % tot)
f.write('it took %s' % dur)
f.write('current avg: %s\n' % avg)
f.close()
def zerg_rush(n):
for x in range(n):
t = threading.Thread(target=callback, args=(x,))
t.setDaemon(True)
t.start()
def main():
num_of_chan = 10
zerg_rush(num_of_chan)
while True:
print 'Waiting'
time.sleep(30)
if __name__ == '__main__':
main()
I am testing redis pubsub to replace the use of rsh to communicate with remote boxes.
One of the things I have tested for was the number of channels affecting latency of publish and pubsub.listen().
Test: One publisher and one subscriber per channel (publisher publish every one second). Incremented the number of channels from and observed the latency (The duration from the moment publisher publish a message to the moment subscriber got the message via listen)
num of chan--------------avg latency in seconds
10:----------------------------------0.004453
50:----------------------------------0.005246
100:---------------------------------0.0155
200:---------------------------------0.0221
300:---------------------------------0.0621
Note: tested on 2 CPU + 4GB RAM + 1 NICsĀ RHEL6.4 VM.
What can I do to maintain low latency with high number of channels?
Redis is single-threaded so increasing more cpus wont help. maybe more RAM? if so, how much more?
Anything I can do code-wise or bottleneck is in Redis itself?
Maybe the limitation comes from the way my test codes are written with threading?
EDIT:
Redis Cluster vs ZeroMQ in Pub/Sub, for horizontally scaled distributed systems
Accepted answer says "You want to minimize latency, I guess. The number of channels is irrelevant. The key factors are the number of publishers and number of subscribers, message size, number of messages per second per publisher, number of messages received by each subscriber, roughly. ZeroMQ can do several million small messages per second from one node to another; your bottleneck will be the network long before it's the software. Most high-volume pubsub architectures therefore use something like PGM multicast, which ZeroMQ supports."
From my testings, i dont know if this is true. (The claim that the number of channels is irrelevant)
For example, i did a testing.
1) One channel. 100 publishers publishing to a channel with 1 subscriber listening. Publisher publishing one second at a time. latency was 0.00965 seconds
2) Same testing except 1000 publishers. latency was 0.00808 seconds
Now during my channel testing:
300 channels with 1 pub - 1 sub resulted in 0.0621 and this is only 600 connections which is less than above testing yet significantly slow in latency

Twisted execute 10 threads in same time and wait for result

Writing a program that verify list of emails syntax and MX records, as blocking programming is time consuming, I want do this async or by threads, this my code:
with open(file_path) as f:
# check the status of file, if away then file pointer will be at the last index
if (importState.status == ImportStateFile.STATUS_AWAY):
f.seek(importState.fileIndex, 0)
while True:
# the number of emails to process is configurable 10 or 20
emails = list(islice(f, app.config['NUMBER_EMAILS_TO_PROCESS']))
if len(emails) == 0:
break;
importState.fileIndex = importState.fileIndex + len(''.join(emails))
for email in emails:
email = email.strip('''<>;,'\r\n ''').lower()
d = threads.deferToThread(check_email, email)
d.addCallback(save_email_status, email, importState)
# set the number of emails processed
yield set_nbrs_emails_process(importState)
# do an insert of all emails
yield reactor.callFromThread(db.session.commit)
# set file status as success
yield finalize_import_file_state(importState)
reactor.callFromThread(reactor.stop)
Check email function:
def check_email(email):
pipe = subprocess.Popen(["./check_email", '--email=%s' % email], stdout=subprocess.PIPE)
status = pipe.stdout.read()
try:
status = int(status)
except ValueError:
status = -1
return status
what I need is to process 10 emails in same time and wait for result.
I'm not sure why there are threads involved in your example code. You don't need threads to interact with email with Twisted, nor to do so concurrently.
If you have an asynchronous function that returns a Deferred, you can just call it ten times and the ten different streams of work will proceed in parallel:
for i in range(10):
async_check_email_returning_deferred()
If you want to know when all ten results are available, you can use gatherResults:
from twisted.internet.defer import gatherResults
...
email_results = []
for i in range(10):
email_results.append(async_check_mail_returning_deferred())
all_results = gatherResults(email_results)
all_results is a Deferred that will fire when all of the Deferreds in email_results have fired (or when the first of them fires with a Failure).

Categories