I'm developing a B/S (browser/server) Kafka monitoring tool. The program listens to a Kafka topic and continuously outputs the new messages from that topic. What is the best approach for sending those messages continuously to the browser side?
The program uses Flask, so currently I'm using stream_with_context to send new messages to the browser side. This works for now, but I wonder whether this is the right scenario for stream_with_context, since most use cases seem to be downloading and video streaming. Or maybe I should use a WebSocket?
@read_controller.route('/v1/listenkafka/<string:kafkaId>', methods=['GET'])
def start_stream(kafkaId):
    """Stream new messages from the Kafka topic registered under ``kafkaId``.

    Reads the stored settings for ``kafkaId`` (``ip``, ``port`` and ``topic``
    keys), builds a ``kafkaserver`` from them and returns a streaming
    ``Response`` fed by its ``consume_topic`` generator.

    Returns:
        The streaming ``Response`` on success, or a JSON-encoded error
        message with status 400 if the lookup or the setup raises.
    """
    try:
        # SECURITY: eval() executes whatever text is stored under kafkaId.
        # If that text can ever be influenced by a client, switch to a data
        # parser (json.loads / ast.literal_eval) that matches the stored
        # format.
        mykafka_json = eval(my_storage.get(kafkaId))
        mykafka = kafkaserver(ip=mykafka_json['ip'], id=kafkaId, port=mykafka_json['port'])
        return Response(stream_with_context(mykafka.consume_topic(mykafka_json['topic'])))
    except Exception as e:
        # Only failures during setup land here; errors raised later, while
        # the response is being streamed, happen after this function returned.
        print(f"{e}")
        return jsonify(f"{e}"), 400
# Generator that listens to a Kafka topic and feeds each message to the stream.
def consume_topic(self, topic, groupid='test-consumer-group'):
    """Yield the value of every message that arrives on ``topic``.

    A ``KafkaConsumer`` is opened against ``self.ip:self.port``. For each
    message received, the entry stored under ``self.id`` is re-read; when its
    ``flag`` is truthy the entry is deleted and the generator ends, otherwise
    the message value is yielded. The flag is therefore only checked when a
    new message arrives.

    Args:
        topic: Name of the Kafka topic to consume.
        groupid: Consumer group id passed to ``KafkaConsumer``.

    Yields:
        ``messages.value`` for each consumed message.
    """
    consumer = KafkaConsumer(topic,
                             group_id=groupid,
                             bootstrap_servers=[f"{self.ip}:{self.port}"])
    print(f"Topic: {topic}#{self.ip}:{self.port} starts steaming at {datetime.now()}")
    try:
        for messages in consumer:
            # SECURITY: eval() executes the stored text; prefer a data parser
            # (json.loads / ast.literal_eval) if the stored format allows it.
            mykafka_json = eval(my_storage.get(self.id))
            print(mykafka_json)
            if mykafka_json['flag']:
                my_storage.delete(self.id)
                return
            print(messages.value)
            yield messages.value
    finally:
        # Runs on normal exit, on the stop flag, and when the generator is
        # closed because the client went away: release the consumer so
        # connections do not pile up.
        print(f"Topic-{topic} finish at {datetime.now()}")
        consumer.close()
So, should I use stream_with_context in this scenario, or should I switch to a WebSocket?
Thanks
OK, now I understand.
stream_with_context actually returns ALL the content from the beginning each time the front end makes a request.
So it is a tool for downloading, not for continuously pushing new data from the server to the client.
In the end I chose Flask-SocketIO. It is a better choice than a raw WebSocket, but you need to study the samples to understand how it works... The docs miss some details...
Related
I need help with the python web framework, Quart. I want to build a python server that returns 202 as soon as a client requests some time consuming I/O task, and call the client back to return value of that task as soon as the task is done.
For that purpose, I add task requested by client to the background task using app.add_background_task(task) and that gave me a successful result as it returns 202 immediately. But I'm not sure how I can approach the return value of background task and call the client back to give that value.
I'm reading https://quart.palletsprojects.com/en/latest/how_to_guides/server_sent_events.html this article. But I'm not sure how to handle it.
async def background_task(timeout=10):
    """Simulate a slow I/O job.

    Prints a start timestamp, sleeps for ``timeout`` seconds without blocking
    the event loop, prints a completion timestamp and returns a fixed
    confirmation string.
    """
    stamp_format = "%d/%m/%Y %H:%M:%S"

    started_at = datetime.now().strftime(stamp_format)
    print("background task started at", started_at)

    await asyncio.sleep(timeout)

    finished_at = datetime.now().strftime(stamp_format)
    print("background task completed at", finished_at)
    return "requested task done"
@app.route("/", methods=["GET"])
async def main_route():
    """Accept the request immediately and run the slow job in the background.

    Schedules ``background_task`` with a 10 second timeout via
    ``app.add_background_task`` and answers with HTTP 202 straight away.
    The value returned by ``background_task`` is not captured here, so this
    handler has no way to hand the result back to the client.
    """
    print("Hello from main route")
    app.add_background_task(background_task, 10)
    return "request accepted", 202
To push information to the client, you'll need Websockets or some other mechanism - it'll require server and client-side implementations
A simpler solution is to poll the server from the client to determine if the task is complete or not. i.e. send requests repeatedly to the server until you get confirmation of what you expect, or your max number of attempts is exceeded (or a request just times out entirely)
I'm writing an asyncio application to monitor crypto market prices and trade/order events, but for an unknown reason some streams stop receiving data after a few hours. I'm not familiar with the asyncio package and I would appreciate help in finding a solution.
Basically, the code below establishes websocket connections with a crypto exchange to listen to the streams of six symbols (ETH/USD, BTC/USD, BNB/USD, ...) and to trade events from two accounts (user1, user2). The application uses the ccxtpro library. The public method watch_ohlcv gets price streams, while the private methods watchMyTrades and watchOrders get new order and trade events at account level.
The problem is that one or several streams are interrupted after a few hours, and the response object becomes empty or None. I would like to detect and restart these streams after they stop working. How can I do that?
# tasks.py
@app.task(bind=True, name='Start websocket loops')
def start_ws_loops(self):
    """Task entry point that runs the websocket loops.

    ``self`` is the bound task instance (the task is declared with
    ``bind=True``); it is not used here. The call returns only when
    ``ws_loops()`` returns.
    """
    ws_loops()
# methods.py
def ws_loops():
    """Open one stream per configured method/account/symbol and run them all.

    Builds a list of stream descriptions in ``main``, creates one client per
    description in ``clients_loop`` and polls each client in ``method_loop``.
    Everything runs on a fresh asyncio event loop created at the bottom of
    this function; the call blocks until ``main`` completes.
    """

    async def method_loop(client, exid, wallet, method, private, args):
        """Await ``method`` on ``client`` over and over until it raises."""
        # NOTE(review): exchange, account and symbol are assigned but not read
        # again in this coroutine.
        exchange = Exchange.objects.get(exid=exid)
        if private:
            account = args['account']
        else:
            symbol = args['symbol']
        while True:
            try:
                if private:
                    # Private methods are called without arguments.
                    response = await getattr(client, method)()
                    if method == 'watchMyTrades':
                        do_stuff(response)
                    elif method == 'watchOrders':
                        do_stuff(response)
                else:
                    # Public methods receive the args dict as keyword arguments.
                    response = await getattr(client, method)(**args)
                    if method == 'watch_ohlcv':
                        do_stuff(response)
                # await asyncio.sleep(3)
            except Exception as e:
                # NOTE(review): any exception ends this stream for good; there
                # is no retry or reconnect after the break.
                print(str(e))
                break
        await client.close()

    async def clients_loop(loop, dic):
        """Create the client for one stream description and run its loop."""
        exid = dic['exid']
        wallet = dic['wallet']
        method = dic['method']
        private = dic['private']
        args = dic['args']
        exchange = Exchange.objects.get(exid=exid)
        parameters = {'enableRateLimit': True, 'asyncio_loop': loop, 'newUpdates': True}
        if private:
            log.info('Initialize private instance')
            account = args['account']
            client = exchange.get_ccxt_client_pro(parameters, wallet=wallet, account=account)
        else:
            log.info('Initialize public instance')
            client = exchange.get_ccxt_client_pro(parameters, wallet=wallet)
        mloop = method_loop(client, exid, wallet, method, private, args)
        # Waits for the single method_loop coroutine to finish.
        await gather(mloop)
        # NOTE(review): method_loop already awaits client.close() before it
        # returns, so this is a second close on the same client.
        await client.close()

    async def main(loop):
        """Describe every stream to open and run all of them concurrently."""
        lst = []
        private = ['watchMyTrades', 'watchOrders']
        public = ['watch_ohlcv']
        for exid in ['binance']:
            for wallet in ['spot', 'future']:
                # Private
                # One entry per private method and account.
                for method in private:
                    for account in ['user1', 'user2']:
                        lst.append(dict(exid=exid,
                                        wallet=wallet,
                                        method=method,
                                        private=True,
                                        args=dict(account=account)
                                        ))
                # Public
                # One entry per public method and symbol.
                for method in public:
                    for symbol in ['ETH/USD', 'BTC/USD', 'BNB/USD']:
                        lst.append(dict(exid=exid,
                                        wallet=wallet,
                                        method=method,
                                        private=False,
                                        args=dict(symbol=symbol,
                                                  timeframe='5m',
                                                  limit=1
                                                  )
                                        ))
        loops = [clients_loop(loop, dic) for dic in lst]
        await gather(*loops)

    # Run everything on a dedicated event loop.
    loop = asyncio.new_event_loop()
    loop.run_until_complete(main(loop))
let me share with you my experience since I am dealing with the same problem.
CCXT is not expected to get stalled streams after some time running it.
Unfortunately, practice and theory are different, and error 1006 happens quite often. I am using Binance, OKX, Bitmex and BTSE (BTSE is not supported by CCXT), and my code runs on an AWS server, so I should not have any connection issues. Binance and OKX are the worst as far as error 1006 is concerned. Honestly, after researching it on Google, I have only understood that 1006 is a NetworkError, and I know CCXT tries to resubscribe to the channel automatically. None of the other explanations I found online convinced me. If somebody could give me more info about this error I would appreciate it.
In any case, every time an exception is raised, I put it in an exception_list as a dictionary containing info such as the time in milliseconds, method, exchange, description, etc. The exception_list is then passed to a handle_exception method. In this case, if the list contains two 1006 exceptions within X time, handle_exception reports that we are not in sync with market data and trading must stop. I cancel all my limit orders and I emit a beep (calling for human intervention).
As for your second question:
restart these streams after they stops working, how can I do that
remember that you are Running Tasks Concurrently
If return_exceptions is False (default), the first raised exception is
immediately propagated to the task that awaits on gather(). Other
awaitables in the aws sequence won’t be cancelled and will continue to
run.
Here you can find info about restarting an individual task in a gather().
In your case, since you are using a single exchange (Binance) and unsubscribe is not implemented in CCXT, you will have to close the connection and restart all the tasks. You can still use the example in the link above to automate it. If you are using more than one exchange, you can design your code in a way that lets you close and restart only the exchange that failed.
Another option for you would be defining the tasks with more granularity in the main so that every task is related to a single and well defined exchange/user/method/symbol and every task subscribes a single channel. This will result in a more verbose and less elegant code but it will help you catching the exception and eventually restart only a specific coroutine.
I am obviously assuming that after error 1006 the channel status is unsubscribed
final thought:
never leave a robot unattended
Professional market makers with a team of engineers working in London do not go to the pub while their algos ( usually co-located within the exchange ) execute thousands of trades.
I hope this can help you or, at least, get you in the right directions for handling exceptions and restart tasks
You need to use callbacks.
For example:
# Illustrative fragment: it uses ``await`` and ``self``, so it belongs inside
# an async method of the surrounding class.
ws = self.ws = await websockets.connect(END_POINTS, compression=None) # step 1
await self.ws.send(SEND_YOUR_SUBSCRIPTION_MESSAGES) # step 2
# Receive forever and hand every non-empty response to the callback.
while True:
    response = await self.ws.recv()
    if response:
        await handler(response)
In the last line, await handler(response), you are sending the response to handler().
This handler() is the callback, it is the function that actually consumes your data that you receive from the exchange server.
In this handler(), what you can do is you check if the response is your desired data (bid/ask price etc) or it throws an exception like ConnectionClosedError, in which case you restart the websocket by doing STEP 1 and STEP 2 from within your handler.
So basically in the callback method, you need to either process the data
or restart the websocket and pass the handler to it again to receive the responses.
Hope this helps. I could not share the complete code as i need to clean it for sensitive business logic.
I am using GCP with its Cloud Functions to execute web scrapers on a frequent basis. Also locally, my script is working without any problems.
I have a setup.py file in which I am initializing the connection to a Kafka Producer. This looks like this:
# Module-level Kafka producer, configured entirely from environment variables.
# os.environ.get() returns None for any variable that is not set, and every
# value that is set arrives as a string.
# NOTE(review): the variable names contain dots; confirm the deployment
# environment really accepts and exports names like "BOOTSTRAP.SERVERS".
p = Producer(
    {
        "bootstrap.servers": os.environ.get("BOOTSTRAP.SERVERS"),
        "security.protocol": os.environ.get("SECURITY.PROTOCOL"),
        "sasl.mechanisms": os.environ.get("SASL.MECHANISMS"),
        "sasl.username": os.environ.get("SASL.USERNAME"),
        "sasl.password": os.environ.get("SASL.PASSWORD"),
        "session.timeout.ms": os.environ.get("SESSION.TIMEOUT.MS")
    }
)
def delivery_report(err, msg):
    """Delivery callback: report the outcome of one produced message.

    The producer invokes it from poll() or flush(). Prints a marker line,
    then either the failure (``err``) or the topic and partition the message
    was delivered to, and returns the string "DONE.".
    """
    print("Got here!")
    if err is None:
        print("Message delivered to {} [{}]".format(msg.topic(), msg.partition()))
    else:
        print("Message delivery failed: {}".format(err))
    return "DONE."
I am importing this setup in main.py in which my scraping functions are defined. This looks similar to this:
from setup import p, delivery_report
def scraper():
    """Scrape, then publish the scraped record to Kafka and wait for delivery.

    ``produce()`` only places the message in the producer's local queue.
    Without a ``flush()`` the function can return before anything is sent and
    before ``delivery_report`` is ever invoked, so the queue is flushed before
    leaving.
    """
    try:
        # I won't insert my whole scraper here since it's working fine ...
        print(scraped_data_as_dict)
        p.produce(topic, json.dumps(scraped_data_as_dict), callback=delivery_report)
        p.poll(0)
        # Block until the queued message is delivered (or fails) so the
        # delivery callback runs before the function ends.
        p.flush()
    except Exception:
        # Do sth else
        # Placeholder from the original snippet: handle or log the failure
        # here. Re-raise so errors are not silently swallowed.
        raise
The point here is: I am printing my scraped data to the console, but nothing happens with the producer. It's not even logging a failed-delivery message (delivery_report) to the console. It's as if my script is ignoring the producer command. Also, there are no error reports in the log of the Cloud Function. What am I doing wrong, since the function is doing everything except the important part? What do I have to be aware of when connecting Kafka with Cloud Functions?
I am new to rabbitmq and trying to figure out how I can make a client request a server with information about memory and CPU utilization with this tutorial (https://www.rabbitmq.com/tutorials/tutorial-six-python.html).
So the client requests for CPU and memory ( I believe I will need two queues) and the server respond with the values.
Is there any way to simply create a client.py and a server.py for this case using the Pika library in Python?
I would recommend you to follow the first RabbitMQ tutorials if you haven't already. The RPC example builds on concepts covered on previous examples (direct queues, exclusive queues, acknowledgements, etc.).
The RPC solution proposed on the tutorial requires at least two queues, depending on how many clients you want to use:
One direct queue (rpc_queue), used to send requests from the client to the server.
One exclusive queue per client, used to receive responses.
The request/response cycle:
The client sends a message to the rpc_queue. Each message includes a reply_to property, with the name of the client exclusive queue the server should reply to, and a correlation_id property, which is just an unique id used to track the request.
The server waits for messages on the rpc_queue. When a message arrives, it prepares the response, adds the correlation_id to the new message, and sends it to the queue defined in the reply_to message property.
The client waits on its exclusive queue until it finds a message with the correlation_id that was originally generated.
Jumping straight to your problem, the first thing to do is to define the message format you'll want to use on your responses. You can use JSON, msgpack or any other serialization library. For example, if using JSON, one message could look something like this:
{
"cpu": 1.2,
"memory": 0.3
}
Then, on your server.py:
def on_request(channel, method, props, body):
    """Answer one RPC request with the current CPU and memory usage.

    Publishes a JSON body ``{"cpu": ..., "memory": ...}`` to the queue named
    in ``props.reply_to``, echoing ``props.correlation_id``, and then
    acknowledges the request. ``body`` is ignored: the request carries no
    payload.

    A request without ``reply_to`` cannot be answered. It is acknowledged and
    dropped so that the broker does not redeliver it forever.
    """
    if not props.reply_to:
        import logging

        logging.getLogger(__name__).warning(
            "Dropping RPC request without reply_to (correlation_id=%r)",
            props.correlation_id)
        channel.basic_ack(delivery_tag=method.delivery_tag)
        return

    response = {'cpu': current_cpu_usage(),
                'memory': current_memory_usage()}
    properties = pika.BasicProperties(correlation_id=props.correlation_id)
    channel.basic_publish(exchange='',
                          routing_key=props.reply_to,
                          properties=properties,
                          body=json.dumps(response))
    channel.basic_ack(delivery_tag=method.delivery_tag)
# ...
And on your client.py:
class ResponseTimeout(Exception):
    """Raised by ``Client.call`` when no reply arrives within the timeout."""


class Client:
    """RPC client that asks the server for its CPU and memory usage."""

    # similar constructor as `FibonacciRpcClient` from tutorial...

    def on_response(self, channel, method, props, body):
        """Store the decoded reply if it answers the pending request.

        Replies whose ``correlation_id`` differs from the one generated by
        the last ``call()`` are ignored.
        """
        if self.correlation_id == props.correlation_id:
            self.response = json.loads(body.decode())

    def call(self, timeout=2):
        """Send one request and wait for the matching reply.

        Args:
            timeout: Maximum number of seconds to wait for the reply.

        Returns:
            The decoded JSON reply.

        Raises:
            ResponseTimeout: If no matching reply arrived in time.
        """
        self.response = None
        self.correlation_id = str(uuid.uuid4())
        self.channel.basic_publish(exchange='',
                                   routing_key='rpc_queue',
                                   properties=pika.BasicProperties(
                                       reply_to=self.callback_queue,
                                       correlation_id=self.correlation_id),
                                   body='')

        # A monotonic clock keeps the timeout correct across wall-clock jumps.
        deadline = time.monotonic() + timeout
        while self.response is None:
            remaining = deadline - time.monotonic()
            if remaining < 0:
                raise ResponseTimeout()
            # Let pika block for up to the remaining time instead of
            # returning at once, which made this loop spin at full CPU.
            self.connection.process_data_events(time_limit=remaining)
        return self.response
As you see, the code is pretty much the same as the original FibonacciRpcClient. The main differences are:
We use JSON as data format for our messages.
Our client call() method doesn't require a body argument (there's nothing to send to the server)
We take care of response timeouts (if the server is down, or if it doesn't reply to our messages)
Still, there're a lot of things to improve here:
No error handling: For example, if the client "forgets" to send a reply_to queue, our server is gonna crash, and will crash again on restart (the broken message will be requeued infinitely as long as it isn't acknowledged by our server)
We don't handle broken connections (no reconnection mechanism)
...
You may also consider replacing the RPC approach with a publish/subscribe pattern; in this way, the server simply broadcasts its CPU/memory state every X time interval, and one or more clients receive the updates.
I've implemented a Server Sent Event API in my Django app to stream realtime updates from my backend to the browser. The backend is a Redis pubsub. My Django view looks like this:
def event_stream(request):
    """
    Server-sent-events view: stream worker events to the browser.

    Wraps an ``events.Listener`` (configured from settings and the request's
    Last-Event-ID header) in a ``text/event-stream`` response.
    """
    pubsub_url = settings.EVENTS_PUBSUB_URL
    listener_options = {
        'channels': [settings.EVENTS_PUBSUB_CHANNEL],
        'buffer_key': settings.EVENTS_BUFFER_KEY,
        'last_event_id': request.META.get('HTTP_LAST_EVENT_ID'),
    }
    listener = events.Listener(pubsub_url, **listener_options)
    return http.HttpResponse(listener, mimetype='text/event-stream')
And the events.Listener class that I'm returning as an iterator looks like this:
class Listener(object):
    """Iterable that replays buffered events, then streams live pubsub ones.

    The constructor accepts either a ``redis.StrictRedis`` connection or a
    Redis URL and subscribes to ``channels`` immediately.
    """

    def __init__(self, rcon_or_url, channels, buffer_key=None,
                 last_event_id=None):
        if isinstance(rcon_or_url, redis.StrictRedis):
            self.rcon = rcon_or_url
        elif isinstance(rcon_or_url, basestring):
            connection_kwargs = utils.parse_redis_url(rcon_or_url)
            self.rcon = redis.StrictRedis(**connection_kwargs)
        self.channels = channels
        self.buffer_key = buffer_key
        self.last_event_id = last_event_id
        self.pubsub = self.rcon.pubsub()
        self.pubsub.subscribe(channels)

    def __iter__(self):
        # Replay phase: only when a buffer key was configured.
        if self.buffer_key:
            buffered_events = self.rcon.lrange(self.buffer_key, 0, -1)
            if self.last_event_id:
                # The buffer is ordered newest first. Keep only what sits in
                # front of the event the client saw last; if that event is no
                # longer buffered, keep everything.
                cutoff = len(buffered_events)
                for position, raw in enumerate(buffered_events):
                    if json.loads(raw)['id'] == self.last_event_id:
                        cutoff = position
                        break
                buffered_events = buffered_events[:cutoff]
            # Emit the oldest event first.
            for raw in list(buffered_events)[::-1]:
                yield to_sse({'data': raw})

        # Live phase: forward real messages, skip every other pubsub event.
        try:
            for event in self.pubsub.listen():
                if event['type'] != 'message':
                    continue
                yield to_sse(event)
        finally:
            logging.info('Closing pubsub')
            self.pubsub.close()
            self.rcon.connection_pool.disconnect()
I'm able to successfully stream events out to the browser with this setup. However, it seems that the disconnect calls in the listener's "finally" don't ever actually get called. I assume that they're still camped out waiting for messages to come from the pubsub. As clients disconnect and reconnect, I can see the number of connections to my Redis instance climbing and never going down. Once it gets to around 1000, Redis starts freaking out and consuming all the available CPU.
I would like to be able to detect when the client is no longer listening and close the Redis connection(s) at that time.
Things I've tried or thought about:
A connection pool. But as the redis-py README states, "It is not safe to pass PubSub or Pipeline objects between threads."
A middleware to handle the connections, or maybe just disconnections. This won't work because a middleware's process_response() method gets called too early (before http headers are even sent to the client). I need something called when the client disconnects while I'm in the middle of streaming content to them.
The request_finished and got_request_exception signals. The first, like process_response() in a middleware, seems to fire too soon. The second doesn't get called when a client disconnects mid-stream.
Final wrinkle: In production I'm using Gevent so I can get away with keeping a lot of connections open at once. However, this connection leak issue occurs whether I'm using plain old 'manage.py runserver', or Gevent monkeypatched runserver, or Gunicorn's gevent workers.
UPDATE: As of Django 1.5, you'll need to return a StreamingHttpResponse instance if you want to lazily stream things out as I'm doing in this question/answer.
ORIGINAL ANSWER BELOW
After a lot of banging on things and reading framework code, I've found what I think is the right answer to this question.
According to the WSGI PEP, if your application returns an iterator with a close() method, it should be called by the WSGI server once the response has finished. Django supports this too. That's a natural place to do the Redis connection cleanup that I need.
There's a bug in Python's wsgiref implementation, and by extension in Django's 'runserver', that causes close() to be skipped if the client disconnects from the server mid-stream. I've submitted a patch.
Even if the server honors close(), it won't be called until a write to the client actually fails. If your iterator is blocked waiting on the pubsub and not sending anything, close() won't be called. I've worked around this by sending a no-op message into the pubsub each time a client connects. That way when a browser does a normal reconnect, the now-defunct threads will try to write to their closed connections, throw an exception, then get cleaned up when the server calls close(). The SSE spec says that any line beginning with a colon is a comment that should be ignored, so I'm just sending ":\n" as my no-op message to flush out stale clients.
Here's the new code. First the Django view:
def event_stream(request):
    """
    Server-sent-events view: stream worker events to the browser.

    Returns an ``events.SSEResponse`` configured from settings and from the
    request's Last-Event-ID header.
    """
    pubsub_url = settings.EVENTS_PUBSUB_URL
    response_options = {
        'channels': [settings.EVENTS_PUBSUB_CHANNEL],
        'buffer_key': settings.EVENTS_BUFFER_KEY,
        'last_event_id': request.META.get('HTTP_LAST_EVENT_ID'),
    }
    return events.SSEResponse(pubsub_url, **response_options)
And the Listener class that does the work, along with a helper function to format the SSEs and an HTTPResponse subclass that lets the view be a little cleaner:
class Listener(object):
    """Iterable that replays buffered events, then streams live pubsub ones.

    On construction it subscribes to the channels and publishes a hidden
    ``_flush`` message on each of them. ``close()`` releases the pubsub and
    the Redis connection pool.
    """

    def __init__(self,
                 rcon_or_url=settings.EVENTS_PUBSUB_URL,
                 channels=None,
                 buffer_key=settings.EVENTS_BUFFER_KEY,
                 last_event_id=None):
        if isinstance(rcon_or_url, redis.StrictRedis):
            self.rcon = rcon_or_url
        elif isinstance(rcon_or_url, basestring):
            connection_kwargs = utils.parse_redis_url(rcon_or_url)
            self.rcon = redis.StrictRedis(**connection_kwargs)
        self.channels = ([settings.EVENTS_PUBSUB_CHANNEL]
                         if channels is None else channels)
        self.buffer_key = buffer_key
        self.last_event_id = last_event_id
        self.pubsub = self.rcon.pubsub()
        self.pubsub.subscribe(self.channels)
        # Push a throwaway message through every channel so that listeners
        # whose client has gone away attempt a write. These pings are sent
        # with buffer_key=None because they never need to be replayed.
        for channel in self.channels:
            Sender(self.rcon, channel, None).publish('_flush', tags=['hidden'])

    def __iter__(self):
        # Replay phase: only when a buffer key was configured.
        if self.buffer_key:
            buffered_events = self.rcon.lrange(self.buffer_key, 0, -1)
            if self.last_event_id:
                # The buffer is ordered newest first. Keep only what sits in
                # front of the event the client saw last; if that event is no
                # longer buffered, keep everything.
                cutoff = len(buffered_events)
                for position, raw in enumerate(buffered_events):
                    if json.loads(raw)['id'] == self.last_event_id:
                        cutoff = position
                        break
                buffered_events = buffered_events[:cutoff]
            # Emit the oldest event first.
            for raw in list(buffered_events)[::-1]:
                yield to_sse({'data': raw})

        # Live phase: forward real messages, skip every other pubsub event.
        for event in self.pubsub.listen():
            if event['type'] != 'message':
                continue
            yield to_sse(event)

    def close(self):
        """Close the pubsub and drop the Redis connections."""
        self.pubsub.close()
        self.rcon.connection_pool.disconnect()
class SSEResponse(HttpResponse):
    """``text/event-stream`` response whose body is a ``Listener``."""

    def __init__(self, rcon_or_url, channels, buffer_key=None,
                 last_event_id=None, *args, **kwargs):
        listener = Listener(rcon_or_url, channels, buffer_key, last_event_id)
        self.listener = listener
        super(SSEResponse, self).__init__(listener,
                                          mimetype='text/event-stream',
                                          *args, **kwargs)

    def close(self):
        """
        Release the listener's Redis resources.

        The WSGI server calls this at the end of the request, even if the
        client disconnects midstream. Unless you're using Django's
        runserver, in which case you should expect to see Redis connections
        build up until http://bugs.python.org/issue16220 is fixed.
        """
        self.listener.close()
def to_sse(msg):
    """
    Format a Redis pubsub message as a server-sent event string.

    ``msg['data']`` must be a JSON document with ``time``, ``message``,
    ``tags`` and ``title`` keys and, optionally, ``id`` and ``name``.
    ``id`` and ``name`` may be any JSON scalar; they are formatted as text
    rather than concatenated, so a numeric id does not raise ``TypeError``.

    Returns ":\n" (an SSE comment line) for the internal ``_flush`` ping,
    otherwise the optional ``id:`` and ``name:`` lines followed by a
    ``data:`` line holding the JSON payload and a blank line.
    """
    data = json.loads(msg['data'])

    # According to the SSE spec, lines beginning with a colon should be
    # ignored. We can use that as a way to force zombie listeners to try
    # pushing something down the socket and clean up their redis connections
    # when they get an error.
    # See http://dev.w3.org/html5/eventsource/#event-stream-interpretation
    if data['message'] == '_flush':
        return ":\n"  # Administering colonic!

    out = ''
    if 'id' in data:
        out += 'id: %s\n' % (data['id'],)
    if 'name' in data:
        out += 'name: %s\n' % (data['name'],)

    payload = json.dumps({
        'time': data['time'],
        'message': data['message'],
        'tags': data['tags'],
        'title': data['title'],
    })
    out += 'data: ' + payload + '\n\n'
    return out