Paramiko Transport Stuck Thread with Multi-Threading - python

I applied connect try to connect over 500 machine SSH with Python Paramiko using multi-threading with setDeamon True and join with time out to handle stuck connections after connect about 50 -70 machine and close connections properly the paramiko transport connection stuck with some machines
<paramiko.Transport at 0x34d41130 (cipher aes128-ctr, 128 bits) (active; 0 open channel(s))>
which causes system error can't start new threads after stuck threads reach over 500 thread
how can I close paramiko transport connection after thread timed out
'''python
def run_sub_list_thread(self):
try:
threads = []
for machin_ip in node_sub_list:
thread = Thread(target=self.foo, args=[machin_ip])
thread.setDaemon(True)
threads.append(thread)
thread.start()
timeout = 60 * 20 / len(threads)
for thread in threads:
thread.join(timeout=timeout)
except Exception as e:
print(repr(e))
return 'yes', 'Success'
def _connect(self, machin_ip):
_ssh_client = paramiko.SSHClient()
_ssh_client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
try:
_ssh_client.connect(
machin_ip, port=22, username=password[0], password=''
)
_invoked_shell = (
_ssh_client.invoke_shell()
)
self._invoked_shell = _invoked_shell
t.sleep(2)
except paramiko.ssh_exception.SSHException as e:
exception_message = "Exception::{0}::{1}".format(method_name, e)
print(exception_message)
return "error", exception_message, None, None
except Exception as e:
exception_message = "Exception::{0}::{1}".format(method_name, e)
print(exception_message)
return "error", exception_message, None, None
#Some code here to connect machine properly
self._ssh_client = _ssh_client
self._working_username = _working_username
self._working_password = _working_password
return _status, _invoked_shell, _working_username, _working_password
except Exception as e:
exception_message = "Exception::{0}::{1}".format(method_name, e)
print(exception_message)
return "error", exception_message, None, None
def _disconnect(self):
method_name = inspect.stack()[0][3]
class_name = self.__class__.__name__
try:
self._invoked_shell.close()
self._ssh_client.close()
return 'yes', 'Connection Closed'
except Exception as e:
exception_message = f"Exception::{class_name}::{method_name}::{repr(e)}"
print(exception_message)
return 'error', exception_message
def foo(self,machin_ip):
method_name = inspect.stack()[0][3]
class_name = self.__class__.__name__
try:
self._connect(machin_ip)
# sleep instead of run commands and processing commands output
t.sleep(300)
self._disconnect()
return 'yes', 'Done'
except Exception as e:
exception_message = f"Exception::{class_name}::{method_name}::{repr(e)}"
print(exception_message)
return 'error', exception_message
'''
this code run on windows server 2016
Get error can't start new thread after stuck paramiko.transport threads reached over 500 thread
<paramiko.Transport at 0x34d41130 (cipher aes128-ctr, 128 bits) (active; 0 open channel(s))>

Related

Redis is loading the dataset in memory

I'm using Redis sentinel with three nodes. One is the master and the other two are slaves.
I have gone through this, but it expect that the error comes less frequently and can be re-tried, but my approach might be wrong here.
Here I am handling the reconfiguration of the nodes.
import redis
# Initialize on system boot
slave_node_1: redis.Redis = None
slave_node_2: redis.Redis = None
master_node: redis.Redis = None
# Handling the reconfiguration of the nodes.
def reconfig_redis_nodes():
Sentinel = redis.Sentinel([
(REDIS_HOST_0, SENTINEL_PORT),
(REDIS_HOST_1, SENTINEL_PORT),
(REDIS_HOST_2, SENTINEL_PORT)
], sentinel_kwargs={'password': REDIS_SENTINEL_PASSWORD})
host, port = Sentinel.discover_master(REDIS_MASTER_NAME)
globals()['master_node'] = redis.Redis(host=host, port=port, db=REDIS_DB, username=REDIS_USER, password=REDIS_PASSWORD, decode_responses=True)
slave_nodes = Sentinel.discover_slaves(REDIS_MASTER_NAME)
try:
host, port = slave_nodes[0]
globals()['slave_node_1'] = redis.Redis(host=host, port=port, db=REDIS_DB, username=REDIS_USER, password=REDIS_PASSWORD, decode_responses=True)
except IndexError as e:
pass
try:
host, port = slave_nodes[1]
globals()['slave_node_2'] = redis.Redis(host=host, port=port, db=REDIS_DB, username=REDIS_USER, password=REDIS_PASSWORD, decode_responses=True)
except IndexError as e:
pass
# Decorator to handle config change
def handle_redis_failover_master_switch(func):
def inner(*args, **kwargs):
retries = 0
max_retry = 5
while True:
try:
return func(*args, **kwargs)
except Exception as e:
reconfig_redis_nodes()
retries += 1
if retries > max_retry:
logger.critical(str(e))
raise Exception(e)
return inner
# And this is the Redis method I am using to set lock.
#handle_redis_failover_master_switch
def setnx(key: str, value: str, ttl_secs: int = 10):
return master_node.set(key, value, nx=True, ex=ttl_secs)
When I manually called these functions from the shell, they worked fine. But once the deployed (10 requests per second) Redis is throwing Redis is loading the dataset in memory
What is the cause of the issue here and how can I handle it gracefully?
Is it a bad idea for using sentinel for locking system?

IBM Stats subscription topic always returns Reason 2033: FAILED: MQRC_NO_MSG_AVAILABLE

I am attempting to port some old java code to python.
I am using pymqi to connect to a queue manager and query for all messageflow statistics topics using the topic string: $SYS/Broker/+/StatisticsAccounting/Archive/#
When using the existing java program messages are read from the topic without issue.
When using the new python code it is able to connect and query the topic without issue but always gives the message
Reason 2033: FAILED: MQRC_NO_MSG_AVAILABLE
Stats messages are published by the broker for each messageflow every 10 minutes, and I have left the new code running for over 30minutes, never having received a message.
I've also tried setting
get_opts['WaitInterval'] = pymqi.CMQC.MQWI_UNLIMITED
and sitting around for 20minutes rather than using a loop, but no luck.
Is there any IIB server config that might be impacting the messages that I am able to see, or are there other options I should be using within the client?
import pymqi
queue_manager = 'MYQM'
channel = 'MYAPP.SVRCONN'
host = 'MYHOST'
port = 'MYPORT'
topic_string = '$SYS/Broker/+/StatisticsAccounting/Archive/#'
conn_info = '%s(%s)' % (host, port)
user = ""
password = ""
qmgr = pymqi.QueueManager(None)
qmgr.connect_tcp_client(queue_manager, pymqi.CD(), channel, conn_info, user, password)
sub_desc = pymqi.SD()
sub_desc['Options'] = pymqi.CMQC.MQSO_CREATE + pymqi.CMQC.MQSO_RESUME + pymqi.CMQC.MQSO_MANAGED
sub_desc.set_vs('SubName', 'apptest')
sub_desc.set_vs('ObjectString', topic_string)
sub = pymqi.Subscription(qmgr)
sub.sub(sub_desc=sub_desc)
get_opts = pymqi.GMO(Options=pymqi.CMQC.MQGMO_WAIT)
get_opts['WaitInterval'] = 10000
md = pymqi.md()
keep_running = True
while keep_running:
try:
# Reset the MsgId, CorrelId & GroupId so that we can reuse
# the same 'md' object again.
md.MsgId = pymqi.CMQC.MQMI_NONE
md.CorrelId = pymqi.CMQC.MQCI_NONE
md.GroupId = pymqi.CMQC.MQGI_NONE
message = sub.get(None, md, get_opts)
print('Have message from Queue')
print(message)
except pymqi.MQMIError as e:
if e.comp == pymqi.CMQC.MQCC_FAILED and e.reason == pymqi.CMQC.MQRC_NO_MSG_AVAILABLE:
print("no message?")
print(e)
pass
else:
# Some other error condition.
raise
except (UnicodeDecodeError, ValueError) as e:
print('Message is not valid json')
print(e)
print(message)
continue
except KeyboardInterrupt:
print('Have received a keyboard interrupt')
keep_running = False
sub.close(sub_close_options=0,close_sub_queue=True)
qmgr.disconnect()

Doing ssh tunnel with Python script. Error: "Could not resolve hostname when trying to open ssh tunnel"

Hello I'm working on simple python ssh tunnel scrpit but I allways receive Could not resolve hostname error, but it works if run it manually. this is my code:
#!/usr/bin/env python
import subprocess
import time
import tempfile
class TunnelSSH():
def __init__(self, ssh_user: str, ssh_password: str, ssh_host: str, ssh_port: int,
local_tunnel_port:int, remote_tunnel_host:str, remote_tunnel_port:int):
self.ssh_user = ssh_user
self.ssh_password = ssh_password
self.ssh_host = ssh_host
self.ssh_port = ssh_port
self.local_tunnel_port = local_tunnel_port
self.remote_tunnel_port = remote_tunnel_port
self.remote_tunnel_host = remote_tunnel_host
_socket_file = tempfile.NamedTemporaryFile()
_socket_file.close()
self.socket = _socket_file.name
self.connected = False
def start(self):
ssh_conection = ['ssh', '-CN',
f'"{self.ssh_user}:{self.ssh_password}"#{self.ssh_host} -p {self.ssh_port}',
f'-L {self.local_tunnel_port}:{self.remote_tunnel_host}:{self.remote_tunnel_port}',
f'-S {self.socket}',
'-o ExitOnForwardFailure=True'
]
if not self.connected:
status = subprocess.call(ssh_conection)
self._check_connection(status)
time.sleep(self.retry_sleep)
else:
raise Exception('Tunnel is open')
def stop(self):
if self.connected:
if self._send_control_command('exit') != 0:
raise Exception('SSH tunnel failed to exit')
self.connected = False
def _check_connection(self, status) -> None:
"""Check connection status and set connected to True is tunnel is open"""
if status != 0:
raise Exception(f'SSH tunnel failed status: {status}')
if self._send_control_command('check'):
raise Exception(f'SSH tunnel failed to check')
self.connected = True
def _send_control_command(self, ctl_cmd:str):
call = ['ssh',f'-S {self.socket}',f'-O {self.ctl_cmd}', f'-l {self.ssh_user}', f'{self.ssh_host}']
return subprocess.check_call(call)
if __name__ == "__main__":
tunnel = TunnelSSH(ssh_user='...',
ssh_password='...',
ssh_host='...',
ssh_port=...,
local_tunnel_port=...,
remote_tunnel_host='...',
remote_tunnel_port=...
)
retry = 10 # times
wait_for_retry = 5 #s
for i in range(retry):
print(f'Connection attempt: {i}')
try:
tunnel.start()
except Exception as err:
tunnel.stop()
print(err)
time.sleep(wait_for_retry)
print(f'Connected: {tunnel.connected}')
subprocess.call expects a list of arguments. When ssh_conection is formed, several arguments are slapped together, so e.g. this part gets quoted into a single argument:
'"{self.ssh_user}:{self.ssh_password}"#{self.ssh_host} -p {self.ssh_port}'
Fix: properly split the arguments:
...
ssh_conection = ['ssh', '-CN',
f'{self.ssh_user}:{self.ssh_password}#{self.ssh_host}', # will be quoted automatically
'-p', f'{self.ssh_port}',
'-L', f'{self.local_tunnel_port}:{self.remote_tunnel_host}:{self.remote_tunnel_port}',
'-S', f'{self.socket}',
'-o', 'ExitOnForwardFailure=True'
]
...
What hinted the problem: IP addresses are used directly. 'cannot be resolved' on an IP address says that it is interpreted as a symbolic name, which makes spotting this easier.

Consuming and replying to separate queues | Pika implementation

I am struggling to solve my issue, hope anyone from the community can help me here.
Our requirement is locked and can't be changed as the producer publishing the queues is controlled by a different team.
Producer which is written in JAVA declares three queues (TASK, RESPONSE, TASK_RESPONSE) and listens on them with the help of spring framework.
A hashmap is sent to the TASK and TASK_RESPONSE queue from the java AMQP client (Producer).
We need to consume these hashmaps and send the responses as follows.
If the queue TASK is processed, the response needs to be sent on RESPONSE queue incrementally.
If the queue TASK_RESPONSE is processed, the response needs to be sent on TASK_RESPONSE queue incrementally (RPC mode).
Now, we need to consume and publish this in python since we need to do some background processing on the tasks.
I tried to work with celery and dramatiq, but was not able to figure out how it can be done with them, so I tried writing myself (with the help of tutorials available online)
Problem is, I am able to consume the messages but not able to reply_to the RESPONSE queue. Here is my code.
from collections import OrderedDict
from concurrent.futures import ThreadPoolExecutor
import pika
import datetime
import logging
import json
from logging import StreamHandler
from time import sleep
from random import randint
from pika import SelectConnection
from settings import *
logging.basicConfig(handlers=[StreamHandler()], level=logging.INFO, format=logging.BASIC_FORMAT)
_logger = logging.getLogger(__name__)
class QueueConsumer(object):
"""The consumer class to manage connections to the AMQP server/queue"""
def __init__(self, queue, logger, parameters, thread_id=0):
self.channel = None
self.connection = None
self.queue_name_task = queue['task']
self.queue_name_response = queue['response']
self.logger = logger
self.consumer_id = 'Consumer Thread: %d' % (thread_id,)
self.parameters = pika.ConnectionParameters(**parameters)
def consume(self):
try:
self.connection = SelectConnection(parameters=self.parameters, on_open_callback=self._on_connected)
self.connection.ioloop.start()
except Exception as e:
self.logger.error('{} {}'.format(self.consumer_id, str(e)))
self.connection.close()
self.connection.ioloop.start()
def _on_connected(self, connection):
connection.channel(on_open_callback=self._on_channel_open)
def _on_channel_open(self, channel):
self.channel = channel
try:
# Declare Task Queue
self.channel.queue_declare(queue=self.queue_name_task,
exclusive=False,
durable=True,
auto_delete=False,
callback=self._on_queue_declared)
self.logger.info("{} Opened Channel....".format(self.consumer_id))
# Declare Task Response Queue
self.channel.queue_declare(queue=self.queue_name_response,
exclusive=False,
durable=True,
auto_delete=False)
self.logger.info("{} Opened Channel....".format(self.consumer_id))
except Exception as e:
self.logger.error('{} {}'.format(self.consumer_id, str(e)))
def _on_queue_declared(self, frame):
self.logger.debug('{} ... declaring queue'.format(self.consumer_id))
self.channel.basic_qos(prefetch_count=1)
try:
self.channel.basic_consume(queue=self.queue_name_task,
on_message_callback=self.handle_delivery,
auto_ack=True)
self.logger.info("{} Declared queue...".format(self.consumer_id))
except Exception as e:
self.logger.error('{} crashing:--> {}'.format(self.consumer_id, str(e)))
def handle_delivery(self, channel, method, header, body):
try:
start_time = datetime.datetime.now()
_logger.info("Received...")
_logger.info("Content: %s" % body)
req = json.loads(self.decode(body))
# Do something
sleep(randint(10, 20))
time_taken = datetime.datetime.now() - start_time
log_msg = "[{}] Time Taken: {}.{}".format(req['bar']['baz'], time_taken.seconds, time_taken.microseconds)
_logger.info(log_msg)
# Publish the result to another queue.
try:
self.channel.basic_publish(exchange='',
routing_key=self.queue_name_response,
properties=pika.BasicProperties(),
body=log_msg)
_logger.info("Message Published...\t(%s)" % self.queue_name_response)
except Exception as e:
self.logger.error('{} Message publishing failed:--> {}'.format(self.consumer_id, str(e)))
except Exception as err:
_logger.exception(err)
def decode(self, body):
try:
_body = body.decode('utf-8')
except AttributeError:
_body = body
return _body
if __name__ == "__main__":
pika_parameters = OrderedDict([
('host', TF_BROKER_HOST),
('port', TF_BROKER_PORT),
('virtual_host', TF_BROKER_VHOST)
])
queue = {'task': TF_IAAS_TASK_QUEUE, 'response': TF_IAAS_REPLY_QUEUE}
try:
with ThreadPoolExecutor(max_workers=TF_IAAS_THREAD_SIZE, thread_name_prefix=TF_IAAS_THREAD_PREFIX) as executor:
start = 1
for thread_id in range(start, (TF_IAAS_THREAD_SIZE + start)):
executor.submit(QueueConsumer(queue, _logger, pika_parameters, thread_id).consume)
except Exception as err:
_logger.exception(err)
Publish Messages On RabbitMQ
import pika
import json
import random
import datetime
from faker import Faker
from random import randint
fake = Faker('en_US')
if __name__ == '__main__':
try:
connection = pika.BlockingConnection(pika.ConnectionParameters(
host='localhost'))
channel = connection.channel()
channel.queue_declare(queue='tf_task', durable=True)
started_at = datetime.datetime.now()
properties = pika.BasicProperties(delivery_mode=2)
for i in range(0, 10000):
body = {
'foo': randint(i, i+100),
'bar': {
'baz': fake.name(),
'poo': float(random.randrange(155+i, 389+i))/100
}
}
channel.basic_publish(exchange='',
routing_key='tf_task',
body=json.dumps(body),
properties=properties)
if i%10000 == 0:
duration = datetime.datetime.now() - started_at
print(i, duration.total_seconds())
print(" [x] Sent 'Hello World!'")
connection.close()
now = datetime.datetime.now()
duration = now - started_at
print(duration.total_seconds())
except Exception as e:
print(e)

Having issues getting back results for Celery tasks

So, I have a Celery system set up, where I dynamically create a cloud VM instance for each task, once the task completes the VM instance will delete itself. To accomplish this I am creating a new queue and assigning the worker on the newly created instance to that queue so that tasks can be sent to specific instances. This works with 1 or 2 simultaneous tasks, but if I try more than that, then celery's result.get method just waits indefinitely. I am using Celery version 4.2.1 (windowlicker).
Here is my Celery config.py file:
"""A module that configures Celery"""
from os import environ
from utils.loggerFactory import make_logger
LOGGER = make_logger(__name__)
LOGGER.info('Celery initalizing...')
REDIS_BACKEND_HOST = None
if 'RedisDNS' in environ:
REDIS_BACKEND_HOST = environ['RedisDNS']
LOGGER.info('Set Redis instance hostname to {}'.format(REDIS_BACKEND_HOST))
else:
LOGGER.warning('Couldn\'t fetch RedisDNS, defaulting to localhost...')
REDIS_BACKEND_HOST = 'localhost'
BROKER_URL = 'redis://{}'.format(REDIS_BACKEND_HOST)
CELERY_RESULT_BACKEND = 'redis://{}'.format(REDIS_BACKEND_HOST)
CELERY_TRACK_STARTED = True
CELERY_TASK_CREATE_MISSING_QUEUES = True
CELERY_TASK_IGNORE_RESULT = False
LOGGER.info('Init complete')
Here is the main code for executing tasks:
if ENV != 'development':
# Create a new compute instance
try:
created_instance_name = create_worker_compute_instance(
task_info['computeInstanceType'])
except Exception as exc:
LOGGER.error(
'[{}] Couldn\'t create compute instance: {}'.format(request_id, str(exc)))
try:
LOGGER.info(
'[{}] Saving exception into redis...'.format(request_id))
result = json.loads(REDIS_CLIENT.get(request_id))
result['response'] = generate_response(
'Error: Couldn\'t create compute instance: {}'.format(str(exc)), None, 500)
result['code'] = 500
result['canDel'] = True
REDIS_CLIENT.set(request_id, json.dumps(result))
except Exception as exc:
LOGGER.error(
'[{}] Couldn\'t save exception into redis: {}'.format(request_id, str(exc)))
report_exception(ENV, exc)
report_exception(ENV, exc)
return
celery_queue_name = 'queue-{}'.format(created_instance_name)
LOGGER.info('[{}] Adding new Celery queue {}'.format(
request_id, celery_queue_name))
try:
APP.control.add_consumer(celery_queue_name, reply=False, destination=[
'worker1#{}'.format(created_instance_name)])
except Exception as exc:
LOGGER.error('[{}] Couldn\'t add queue {}: {}'.format(
request_id, celery_queue_name, str(exc)))
try:
LOGGER.info('[{}] Saving exception into redis...'.format(request_id))
result = json.loads(REDIS_CLIENT.get(request_id))
result['response'] = generate_response(
'Error: Couldn\'t add queue {}: {}'.format(celery_queue_name, str(exc)), None, 500)
result['code'] = 500
result['canDel'] = True
REDIS_CLIENT.set(request_id, json.dumps(result))
except Exception as exc:
LOGGER.error(
'[{}] Couldn\'t save exception into redis: {}'.format(request_id, str(exc)))
report_exception(ENV, exc)
report_exception(ENV, exc)
return
LOGGER.info('[{}] Queue added'.format(request_id))
else:
celery_queue_name = 'celery'
# Execute the task
LOGGER.info('[{}] Executing task...'.format(request_id))
async_result = run_task.apply_async(
args=(data, task_info, SERVICE_ACCOUNT_FILE_DATA), queue=celery_queue_name)
LOGGER.info('[{}] Waiting for task to complete...'.format(request_id))
task_result = None
try:
task_result = async_result.get()
except Exception as exc:
LOGGER.error(
'[{}] Couldn\'t execute task {}: {}'.format(request_id, task, str(exc)))
try:
LOGGER.info('[{}] Saving exception into redis...'.format(request_id))
result = json.loads(REDIS_CLIENT.get(request_id))
result['response'] = generate_response('Error: Couldn\'t execute task {}: {}'.format(
task, str(exc)), None, 500)
result['code'] = 500
result['canDel'] = True
REDIS_CLIENT.set(request_id, json.dumps(result))
except Exception as exc:
LOGGER.error(
'[{}] Couldn\'t save exception into redis: {}'.format(request_id, str(exc)))
report_exception(ENV, exc)
report_exception(ENV, exc)
return
LOGGER.info('[{}] Task executed successfully'.format(request_id))
task_result['message'] = 'Ok, task {} executed successfully'.format(
task)
try:
LOGGER.info('[{}] Saving result into redis...'.format(request_id))
result = json.loads(REDIS_CLIENT.get(request_id))
result['response'] = generate_response(
None, task_result, 0)
result['code'] = 200
result['canDel'] = True
REDIS_CLIENT.set(request_id, json.dumps(result))
except Exception as exc:
LOGGER.error(
'[{}] Couldn\'t save result into redis: {}'.format(request_id, str(exc)))
report_exception(ENV, exc)
return
Edit:
Here is small diagram for a broad overview of the system:
Ok, it seems that the issue is with APP.control.add_consumer(celery_queue_name, reply=False, destination=['worker1#{}'.format(created_instance_name)]). Even though that command returns successfully, the worker still hasn't been added to the queue.
I managed to fix the issue by including the queue name in the worker startup command with the -Q parameter.

Categories