So, I have a Celery system set up where I dynamically create a cloud VM instance for each task; once the task completes, the VM instance deletes itself. To accomplish this, I create a new queue and assign the worker on the newly created instance to that queue, so that tasks can be sent to specific instances. This works with 1 or 2 simultaneous tasks, but if I try more than that, Celery's result.get() just waits indefinitely. I am using Celery version 4.2.1 (windowlicker).
Here is my Celery config.py file:
"""A module that configures Celery"""
from os import environ
from utils.loggerFactory import make_logger
LOGGER = make_logger(__name__)
LOGGER.info('Celery initalizing...')
REDIS_BACKEND_HOST = None
if 'RedisDNS' in environ:
REDIS_BACKEND_HOST = environ['RedisDNS']
LOGGER.info('Set Redis instance hostname to {}'.format(REDIS_BACKEND_HOST))
else:
LOGGER.warning('Couldn\'t fetch RedisDNS, defaulting to localhost...')
REDIS_BACKEND_HOST = 'localhost'
BROKER_URL = 'redis://{}'.format(REDIS_BACKEND_HOST)
CELERY_RESULT_BACKEND = 'redis://{}'.format(REDIS_BACKEND_HOST)
CELERY_TRACK_STARTED = True
CELERY_TASK_CREATE_MISSING_QUEUES = True
CELERY_TASK_IGNORE_RESULT = False
LOGGER.info('Init complete')
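For context, a module like this is normally pulled into the app with config_from_object; a minimal sketch of that wiring (the 'config' module path and the run_task body are assumptions, only the task signature matches the calling code below):

from celery import Celery

APP = Celery('tasks')
APP.config_from_object('config')  # picks up BROKER_URL, CELERY_RESULT_BACKEND, etc.

@APP.task
def run_task(data, task_info, service_account_file_data):
    # Placeholder body; the real task does the actual work on the VM.
    return {'status': 'done'}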
Here is the main code for executing tasks:
if ENV != 'development':
    # Create a new compute instance
    try:
        created_instance_name = create_worker_compute_instance(
            task_info['computeInstanceType'])
    except Exception as exc:
        LOGGER.error(
            '[{}] Couldn\'t create compute instance: {}'.format(request_id, str(exc)))
        try:
            LOGGER.info(
                '[{}] Saving exception into redis...'.format(request_id))
            result = json.loads(REDIS_CLIENT.get(request_id))
            result['response'] = generate_response(
                'Error: Couldn\'t create compute instance: {}'.format(str(exc)), None, 500)
            result['code'] = 500
            result['canDel'] = True
            REDIS_CLIENT.set(request_id, json.dumps(result))
        except Exception as exc:
            LOGGER.error(
                '[{}] Couldn\'t save exception into redis: {}'.format(request_id, str(exc)))
            report_exception(ENV, exc)
        report_exception(ENV, exc)
        return

    celery_queue_name = 'queue-{}'.format(created_instance_name)
    LOGGER.info('[{}] Adding new Celery queue {}'.format(
        request_id, celery_queue_name))
    try:
        APP.control.add_consumer(celery_queue_name, reply=False, destination=[
            'worker1#{}'.format(created_instance_name)])
    except Exception as exc:
        LOGGER.error('[{}] Couldn\'t add queue {}: {}'.format(
            request_id, celery_queue_name, str(exc)))
        try:
            LOGGER.info('[{}] Saving exception into redis...'.format(request_id))
            result = json.loads(REDIS_CLIENT.get(request_id))
            result['response'] = generate_response(
                'Error: Couldn\'t add queue {}: {}'.format(celery_queue_name, str(exc)), None, 500)
            result['code'] = 500
            result['canDel'] = True
            REDIS_CLIENT.set(request_id, json.dumps(result))
        except Exception as exc:
            LOGGER.error(
                '[{}] Couldn\'t save exception into redis: {}'.format(request_id, str(exc)))
            report_exception(ENV, exc)
        report_exception(ENV, exc)
        return
    LOGGER.info('[{}] Queue added'.format(request_id))
else:
    celery_queue_name = 'celery'

# Execute the task
LOGGER.info('[{}] Executing task...'.format(request_id))
async_result = run_task.apply_async(
    args=(data, task_info, SERVICE_ACCOUNT_FILE_DATA), queue=celery_queue_name)
LOGGER.info('[{}] Waiting for task to complete...'.format(request_id))
task_result = None
try:
    task_result = async_result.get()
except Exception as exc:
    LOGGER.error(
        '[{}] Couldn\'t execute task {}: {}'.format(request_id, task, str(exc)))
    try:
        LOGGER.info('[{}] Saving exception into redis...'.format(request_id))
        result = json.loads(REDIS_CLIENT.get(request_id))
        result['response'] = generate_response('Error: Couldn\'t execute task {}: {}'.format(
            task, str(exc)), None, 500)
        result['code'] = 500
        result['canDel'] = True
        REDIS_CLIENT.set(request_id, json.dumps(result))
    except Exception as exc:
        LOGGER.error(
            '[{}] Couldn\'t save exception into redis: {}'.format(request_id, str(exc)))
        report_exception(ENV, exc)
    report_exception(ENV, exc)
    return

LOGGER.info('[{}] Task executed successfully'.format(request_id))
task_result['message'] = 'Ok, task {} executed successfully'.format(
    task)
try:
    LOGGER.info('[{}] Saving result into redis...'.format(request_id))
    result = json.loads(REDIS_CLIENT.get(request_id))
    result['response'] = generate_response(
        None, task_result, 0)
    result['code'] = 200
    result['canDel'] = True
    REDIS_CLIENT.set(request_id, json.dumps(result))
except Exception as exc:
    LOGGER.error(
        '[{}] Couldn\'t save result into redis: {}'.format(request_id, str(exc)))
    report_exception(ENV, exc)
    return
Edit:
Here is a small diagram for a broad overview of the system:
Ok, it seems that the issue is with APP.control.add_consumer(celery_queue_name, reply=False, destination=['worker1#{}'.format(created_instance_name)]). Even though that command returns successfully, the worker still hasn't been added to the queue.
I managed to fix the issue by including the queue name in the worker startup command with the -Q parameter.
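For reference, a sketch of that startup command (the app module, node name, and instance name here are illustrative, not taken from my actual setup):

celery -A tasks worker -n worker1@<created-instance-name> -Q queue-<created-instance-name> -l info

Since the worker starts already subscribed to its instance-specific queue, nothing depends on add_consumer succeeding afterwards.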
Related
I tried to connect to over 500 machines over SSH with Python Paramiko, using multi-threading with setDaemon(True) and join() with a timeout to handle stuck connections. After connecting to about 50-70 machines and closing the connections properly, the Paramiko transport connection gets stuck on some machines:
<paramiko.Transport at 0x34d41130 (cipher aes128-ctr, 128 bits) (active; 0 open channel(s))>
which causes the system error "can't start new thread" once the stuck threads exceed 500.
How can I close a Paramiko transport connection after its thread has timed out?
# Imports inferred from the usage below (t is the time module)
import inspect
import time as t
import paramiko
from threading import Thread

def run_sub_list_thread(self):
    try:
        threads = []
        for machin_ip in node_sub_list:
            thread = Thread(target=self.foo, args=[machin_ip])
            thread.setDaemon(True)
            threads.append(thread)
            thread.start()
        timeout = 60 * 20 / len(threads)
        for thread in threads:
            thread.join(timeout=timeout)
    except Exception as e:
        print(repr(e))
    return 'yes', 'Success'

def _connect(self, machin_ip):
    method_name = inspect.stack()[0][3]
    _ssh_client = paramiko.SSHClient()
    _ssh_client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
    try:
        try:
            _ssh_client.connect(
                machin_ip, port=22, username=password[0], password=''
            )
            _invoked_shell = (
                _ssh_client.invoke_shell()
            )
            self._invoked_shell = _invoked_shell
            t.sleep(2)
        except paramiko.ssh_exception.SSHException as e:
            exception_message = "Exception::{0}::{1}".format(method_name, e)
            print(exception_message)
            return "error", exception_message, None, None
        except Exception as e:
            exception_message = "Exception::{0}::{1}".format(method_name, e)
            print(exception_message)
            return "error", exception_message, None, None
        # Some code here to connect machine properly
        self._ssh_client = _ssh_client
        self._working_username = _working_username
        self._working_password = _working_password
        return _status, _invoked_shell, _working_username, _working_password
    except Exception as e:
        exception_message = "Exception::{0}::{1}".format(method_name, e)
        print(exception_message)
        return "error", exception_message, None, None

def _disconnect(self):
    method_name = inspect.stack()[0][3]
    class_name = self.__class__.__name__
    try:
        self._invoked_shell.close()
        self._ssh_client.close()
        return 'yes', 'Connection Closed'
    except Exception as e:
        exception_message = f"Exception::{class_name}::{method_name}::{repr(e)}"
        print(exception_message)
        return 'error', exception_message

def foo(self, machin_ip):
    method_name = inspect.stack()[0][3]
    class_name = self.__class__.__name__
    try:
        self._connect(machin_ip)
        # sleep instead of run commands and processing commands output
        t.sleep(300)
        self._disconnect()
        return 'yes', 'Done'
    except Exception as e:
        exception_message = f"Exception::{class_name}::{method_name}::{repr(e)}"
        print(exception_message)
        return 'error', exception_message
This code runs on Windows Server 2016. I get the error "can't start new thread" after the stuck paramiko.Transport threads reach over 500:
<paramiko.Transport at 0x34d41130 (cipher aes128-ctr, 128 bits) (active; 0 open channel(s))>
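One approach worth trying (a sketch under assumptions: node_sub_list, the credentials, and the timeout values are illustrative, and the clients dict is a hypothetical addition to your class) is to keep a reference to each SSHClient so the parent thread can force-close it once join() times out; SSHClient.close() shuts down the underlying Transport and its internal thread instead of leaking it:

import paramiko
from threading import Thread

clients = {}  # hypothetical bookkeeping: machine IP -> SSHClient

def worker(machin_ip):
    client = paramiko.SSHClient()
    client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
    clients[machin_ip] = client
    # timeout/banner_timeout/auth_timeout bound the connect phase itself
    client.connect(machin_ip, port=22, username='user', password='',
                   timeout=30, banner_timeout=30, auth_timeout=30)
    # ... run commands here ...
    client.close()

threads = {ip: Thread(target=worker, args=[ip], daemon=True) for ip in node_sub_list}
for th in threads.values():
    th.start()
for ip, th in threads.items():
    th.join(timeout=60)
    if th.is_alive():
        # The thread is stuck: closing its client from here tears down
        # the paramiko.Transport thread instead of leaving it active.
        try:
            clients[ip].close()
        except Exception as e:
            print('Force-close failed for', ip, repr(e))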
I'm using Redis Sentinel with three nodes: one master and two slaves.
I have gone through this, but it expects that the error occurs infrequently and can simply be retried; my approach might be wrong here.
Here is how I handle the reconfiguration of the nodes:
import redis

# Initialize on system boot
slave_node_1: redis.Redis = None
slave_node_2: redis.Redis = None
master_node: redis.Redis = None

# Handling the reconfiguration of the nodes.
def reconfig_redis_nodes():
    sentinel = redis.Sentinel([
        (REDIS_HOST_0, SENTINEL_PORT),
        (REDIS_HOST_1, SENTINEL_PORT),
        (REDIS_HOST_2, SENTINEL_PORT)
    ], sentinel_kwargs={'password': REDIS_SENTINEL_PASSWORD})
    host, port = sentinel.discover_master(REDIS_MASTER_NAME)
    globals()['master_node'] = redis.Redis(host=host, port=port, db=REDIS_DB, username=REDIS_USER, password=REDIS_PASSWORD, decode_responses=True)
    slave_nodes = sentinel.discover_slaves(REDIS_MASTER_NAME)
    try:
        host, port = slave_nodes[0]
        globals()['slave_node_1'] = redis.Redis(host=host, port=port, db=REDIS_DB, username=REDIS_USER, password=REDIS_PASSWORD, decode_responses=True)
    except IndexError:
        pass
    try:
        host, port = slave_nodes[1]
        globals()['slave_node_2'] = redis.Redis(host=host, port=port, db=REDIS_DB, username=REDIS_USER, password=REDIS_PASSWORD, decode_responses=True)
    except IndexError:
        pass

# Decorator to handle config change
def handle_redis_failover_master_switch(func):
    def inner(*args, **kwargs):
        retries = 0
        max_retry = 5
        while True:
            try:
                return func(*args, **kwargs)
            except Exception as e:
                reconfig_redis_nodes()
                retries += 1
                if retries > max_retry:
                    logger.critical(str(e))
                    raise Exception(e)
    return inner

# And this is the Redis method I am using to set the lock.
@handle_redis_failover_master_switch
def setnx(key: str, value: str, ttl_secs: int = 10):
    return master_node.set(key, value, nx=True, ex=ttl_secs)
When I manually called these functions from the shell, they worked fine. But once deployed (at about 10 requests per second), Redis started throwing "Redis is loading the dataset in memory".
What is the cause of the issue here, and how can I handle it gracefully?
Is it a bad idea to use Sentinel for a locking system?
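If the error is redis-py's BusyLoadingError (raised while a node that just restarted or resynced is still loading its dataset into memory), one way to handle it gracefully is to back off briefly before rediscovering the topology. A minimal sketch of a revised decorator, assuming the reconfig_redis_nodes() above (the delay values are illustrative):

import time
from redis.exceptions import BusyLoadingError, ConnectionError

def handle_redis_failover_master_switch(func):
    def inner(*args, **kwargs):
        max_retry = 5
        for attempt in range(max_retry):
            try:
                return func(*args, **kwargs)
            except BusyLoadingError:
                # The node is reachable but still loading its dataset:
                # back off briefly instead of failing the request.
                time.sleep(0.5 * (attempt + 1))
            except ConnectionError:
                # The topology may have changed: rediscover master/replicas.
                reconfig_redis_nodes()
        # Last attempt, unguarded, so the original error propagates.
        return func(*args, **kwargs)
    return inner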
I am attempting to port some old Java code to Python.
I am using pymqi to connect to a queue manager and query for all message flow statistics topics using the topic string: $SYS/Broker/+/StatisticsAccounting/Archive/#
When using the existing Java program, messages are read from the topic without issue.
When using the new Python code, it is able to connect and query the topic without issue, but always gives the message
Reason 2033: FAILED: MQRC_NO_MSG_AVAILABLE
Stats messages are published by the broker for each message flow every 10 minutes, and I have left the new code running for over 30 minutes without ever receiving a message.
I've also tried setting
get_opts['WaitInterval'] = pymqi.CMQC.MQWI_UNLIMITED
and sitting around for 20 minutes rather than using a loop, but no luck.
Is there any IIB server config that might be impacting the messages that I am able to see, or are there other options I should be using within the client?
import pymqi

queue_manager = 'MYQM'
channel = 'MYAPP.SVRCONN'
host = 'MYHOST'
port = 'MYPORT'
topic_string = '$SYS/Broker/+/StatisticsAccounting/Archive/#'
conn_info = '%s(%s)' % (host, port)
user = ""
password = ""

qmgr = pymqi.QueueManager(None)
qmgr.connect_tcp_client(queue_manager, pymqi.CD(), channel, conn_info, user, password)

sub_desc = pymqi.SD()
sub_desc['Options'] = pymqi.CMQC.MQSO_CREATE + pymqi.CMQC.MQSO_RESUME + pymqi.CMQC.MQSO_MANAGED
sub_desc.set_vs('SubName', 'apptest')
sub_desc.set_vs('ObjectString', topic_string)

sub = pymqi.Subscription(qmgr)
sub.sub(sub_desc=sub_desc)

get_opts = pymqi.GMO(Options=pymqi.CMQC.MQGMO_WAIT)
get_opts['WaitInterval'] = 10000

md = pymqi.md()
keep_running = True

while keep_running:
    try:
        # Reset the MsgId, CorrelId & GroupId so that we can reuse
        # the same 'md' object again.
        md.MsgId = pymqi.CMQC.MQMI_NONE
        md.CorrelId = pymqi.CMQC.MQCI_NONE
        md.GroupId = pymqi.CMQC.MQGI_NONE
        message = sub.get(None, md, get_opts)
        print('Have message from Queue')
        print(message)
    except pymqi.MQMIError as e:
        if e.comp == pymqi.CMQC.MQCC_FAILED and e.reason == pymqi.CMQC.MQRC_NO_MSG_AVAILABLE:
            print("no message?")
            print(e)
        else:
            # Some other error condition.
            raise
    except (UnicodeDecodeError, ValueError) as e:
        print('Message is not valid json')
        print(e)
        print(message)
        continue
    except KeyboardInterrupt:
        print('Have received a keyboard interrupt')
        keep_running = False

sub.close(sub_close_options=0, close_sub_queue=True)
qmgr.disconnect()
What I require is a simple hack for running a function synchronously if Celery is not active.
What I tried is:
is_celery_working returns False although Celery and Redis are both running (started with celery -A project worker -l debug and redis-server respectively). Also, get_celery_worker_status always returns an error status.
I am using Celery with Django.
from project.celery import app

def is_celery_working():
    result = app.control.broadcast('ping', reply=True, limit=1)
    return bool(result)  # True if at least one result

def sync_async(func):
    if is_celery_working():
        return func.delay
    else:
        return func

sync_async(some_func)(*its_args, **its_kwrgs)

def get_celery_worker_status():
    error_key = 'error'
    try:
        from celery.task.control import inspect
        insp = inspect()
        d = insp.stats()
        if not d:
            d = {error_key: 'No running Celery workers were found.'}
    except IOError as e:
        from errno import errorcode
        msg = "Error connecting to the backend: " + str(e)
        if len(e.args) > 0 and errorcode.get(e.args[0]) == 'ECONNREFUSED':
            msg += ' Check that the RabbitMQ server is running.'
        d = {error_key: msg}
    except ImportError as e:
        d = {error_key: str(e)}
    return d

def sync_async(func):
    status = get_celery_worker_status()
    if 'error' not in status:
        return func.delay
    else:
        return func

sync_async(some_func)(*its_args, **its_kwrgs)
Your simple is_celery_working function looks correct. If you're getting False, you may want to increase your timeout to 5 or 10 seconds using the optional timeout parameter.
def is_celery_working():
    result = app.control.broadcast('ping', reply=True, limit=1, timeout=5.0)
    return bool(result)  # True if at least one result
def sync_async(func, *args, **kwargs):
    try:
        # Return the AsyncResult so the caller can .get() it if needed
        return func.delay(*args, **kwargs)
    except Exception as error:
        print('Celery not active', error)
        # Fall back to calling the task synchronously
        return func(*args, **kwargs)
This just raises an error if the Redis server is not working. That worked fine for me, as I am assuming that if Redis is not working then Celery is stopped too.
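Usage would then look like this (some_func is assumed to be a Celery task; you get an AsyncResult back when a worker is up, or the plain return value on the synchronous fallback):

result = sync_async(some_func, 1, 2, key='value')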
I am struggling to solve my issue; I hope someone from the community can help me here.
Our requirement is locked and can't be changed, as the producer publishing to the queues is controlled by a different team.
The producer, which is written in Java, declares three queues (TASK, RESPONSE, TASK_RESPONSE) and listens on them with the help of the Spring framework.
A hashmap is sent to the TASK and TASK_RESPONSE queues from the Java AMQP client (the producer).
We need to consume these hashmaps and send the responses as follows:
If the queue TASK is processed, the response needs to be sent on the RESPONSE queue incrementally.
If the queue TASK_RESPONSE is processed, the response needs to be sent on the TASK_RESPONSE queue incrementally (RPC mode).
Now, we need to consume and publish this in Python, since we need to do some background processing on the tasks.
I tried to work with Celery and Dramatiq but was not able to figure out how it could be done with them, so I tried writing it myself (with the help of tutorials available online).
The problem is that I am able to consume the messages but not able to reply_to the RESPONSE queue. Here is my code.
from collections import OrderedDict
from concurrent.futures import ThreadPoolExecutor
import pika
import datetime
import logging
import json
from logging import StreamHandler
from time import sleep
from random import randint
from pika import SelectConnection
from settings import *

logging.basicConfig(handlers=[StreamHandler()], level=logging.INFO, format=logging.BASIC_FORMAT)
_logger = logging.getLogger(__name__)


class QueueConsumer(object):
    """The consumer class to manage connections to the AMQP server/queue"""

    def __init__(self, queue, logger, parameters, thread_id=0):
        self.channel = None
        self.connection = None
        self.queue_name_task = queue['task']
        self.queue_name_response = queue['response']
        self.logger = logger
        self.consumer_id = 'Consumer Thread: %d' % (thread_id,)
        self.parameters = pika.ConnectionParameters(**parameters)

    def consume(self):
        try:
            self.connection = SelectConnection(parameters=self.parameters, on_open_callback=self._on_connected)
            self.connection.ioloop.start()
        except Exception as e:
            self.logger.error('{} {}'.format(self.consumer_id, str(e)))
            self.connection.close()
            self.connection.ioloop.start()

    def _on_connected(self, connection):
        connection.channel(on_open_callback=self._on_channel_open)

    def _on_channel_open(self, channel):
        self.channel = channel
        try:
            # Declare Task Queue
            self.channel.queue_declare(queue=self.queue_name_task,
                                       exclusive=False,
                                       durable=True,
                                       auto_delete=False,
                                       callback=self._on_queue_declared)
            self.logger.info("{} Opened Channel....".format(self.consumer_id))
            # Declare Task Response Queue
            self.channel.queue_declare(queue=self.queue_name_response,
                                       exclusive=False,
                                       durable=True,
                                       auto_delete=False)
            self.logger.info("{} Opened Channel....".format(self.consumer_id))
        except Exception as e:
            self.logger.error('{} {}'.format(self.consumer_id, str(e)))

    def _on_queue_declared(self, frame):
        self.logger.debug('{} ... declaring queue'.format(self.consumer_id))
        self.channel.basic_qos(prefetch_count=1)
        try:
            self.channel.basic_consume(queue=self.queue_name_task,
                                       on_message_callback=self.handle_delivery,
                                       auto_ack=True)
            self.logger.info("{} Declared queue...".format(self.consumer_id))
        except Exception as e:
            self.logger.error('{} crashing:--> {}'.format(self.consumer_id, str(e)))

    def handle_delivery(self, channel, method, header, body):
        try:
            start_time = datetime.datetime.now()
            _logger.info("Received...")
            _logger.info("Content: %s" % body)
            req = json.loads(self.decode(body))
            # Do something
            sleep(randint(10, 20))
            time_taken = datetime.datetime.now() - start_time
            log_msg = "[{}] Time Taken: {}.{}".format(req['bar']['baz'], time_taken.seconds, time_taken.microseconds)
            _logger.info(log_msg)
            # Publish the result to another queue.
            try:
                self.channel.basic_publish(exchange='',
                                           routing_key=self.queue_name_response,
                                           properties=pika.BasicProperties(),
                                           body=log_msg)
                _logger.info("Message Published...\t(%s)" % self.queue_name_response)
            except Exception as e:
                self.logger.error('{} Message publishing failed:--> {}'.format(self.consumer_id, str(e)))
        except Exception as err:
            _logger.exception(err)

    def decode(self, body):
        try:
            _body = body.decode('utf-8')
        except AttributeError:
            _body = body
        return _body


if __name__ == "__main__":
    pika_parameters = OrderedDict([
        ('host', TF_BROKER_HOST),
        ('port', TF_BROKER_PORT),
        ('virtual_host', TF_BROKER_VHOST)
    ])
    queue = {'task': TF_IAAS_TASK_QUEUE, 'response': TF_IAAS_REPLY_QUEUE}
    try:
        with ThreadPoolExecutor(max_workers=TF_IAAS_THREAD_SIZE, thread_name_prefix=TF_IAAS_THREAD_PREFIX) as executor:
            start = 1
            for thread_id in range(start, (TF_IAAS_THREAD_SIZE + start)):
                executor.submit(QueueConsumer(queue, _logger, pika_parameters, thread_id).consume)
    except Exception as err:
        _logger.exception(err)
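For the TASK_RESPONSE (RPC) case, a minimal sketch of replying via the reply_to header that the Java producer sets (this handler is a hypothetical variant of handle_delivery above, not code from my project; it assumes the producer populates reply_to and correlation_id on each request):

def handle_rpc_delivery(self, channel, method, header, body):
    result = self.decode(body)  # stand-in for the real processing
    # Publish the reply to the queue named in the request's reply_to
    # property, echoing correlation_id so the producer can match it.
    channel.basic_publish(
        exchange='',
        routing_key=header.reply_to,
        properties=pika.BasicProperties(correlation_id=header.correlation_id),
        body=result)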
Publish Messages On RabbitMQ
import pika
import json
import random
import datetime
from faker import Faker
from random import randint

fake = Faker('en_US')

if __name__ == '__main__':
    try:
        connection = pika.BlockingConnection(pika.ConnectionParameters(
            host='localhost'))
        channel = connection.channel()
        channel.queue_declare(queue='tf_task', durable=True)
        started_at = datetime.datetime.now()
        properties = pika.BasicProperties(delivery_mode=2)
        for i in range(0, 10000):
            body = {
                'foo': randint(i, i + 100),
                'bar': {
                    'baz': fake.name(),
                    'poo': float(random.randrange(155 + i, 389 + i)) / 100
                }
            }
            channel.basic_publish(exchange='',
                                  routing_key='tf_task',
                                  body=json.dumps(body),
                                  properties=properties)
            if i % 10000 == 0:
                duration = datetime.datetime.now() - started_at
                print(i, duration.total_seconds())
        print(" [x] Sent 'Hello World!'")
        connection.close()
        now = datetime.datetime.now()
        duration = now - started_at
        print(duration.total_seconds())
    except Exception as e:
        print(e)