Using the Ansible 2 Python API I'm able to run playbooks and handle results with a custom callback handler (thanks to this question). Everything works well, but now I'd like to implement a simple retry loop for the PlaybookExecutor.
All my callback handler does is stuff any failed tasks into an array; if I see that the array isn't empty, I count the run as a failure and try again.
I have another python module that uses this script to kick off the playbook. The call to run_playbook is nested in a try/except block and I'd like an exception to bubble up so I can properly handle the failure.
I'd like to give my playbook 3 attempts at running and if all fail then raise an exception.
Here is my code:
#! /usr/bin/python
# -*- coding: utf-8 -*-
from __future__ import print_function
import logging
import os
from collections import namedtuple
from ansible import constants as C
from ansible.parsing.dataloader import DataLoader
from ansible.vars.manager import VariableManager
from ansible.inventory.manager import InventoryManager
from ansible.executor.playbook_executor import PlaybookExecutor
from ansible.plugins.callback import CallbackBase


class ResultsCallback(CallbackBase):
    """ A callback plugin used for performing an action as results come in """

    def __init__(self):
        super(ResultsCallback, self).__init__()
        # Store all failed results
        self.failed = []

    def v2_runner_on_failed(self, result, ignore_errors=False):
        if ignore_errors:
            self._display.display("...ignoring", color=C.COLOR_SKIP)
        host = result._host
        self.failed.append(result.task_name)


def create_inventory_file(hostnames):
    inv_file = 'ansible_hosts.{0}'.format(os.getppid())
    logging.info('\nCreating Ansible host file: {0}/{1}'.format(os.path.expanduser('~'), inv_file))
    with open(os.path.join(os.path.expanduser('~'), inv_file), 'w') as host_file:
        # If ec2, stuff the hosts into an '[ec2]' group.
        # Otherwise don't use a group header.
        if 'ec2' in hostnames[0]:
            host_file.write('[ec2]\n')
        for host in hostnames:
            host_file.write('{0}\n'.format(host))
    return os.path.join(os.path.expanduser('~'), inv_file)


def run_playbook(hostnames, playbook, playbook_arguments, host_file=False):
    # If the user passes in the optional host_file argument, just use that one.
    if not host_file:
        host_file = create_inventory_file(hostnames)
    if not os.path.isfile(host_file):
        logging.critical('Host file does not exist. Make sure the absolute path is correct.\nInventory: {0}'.format(host_file))
        raise RuntimeError('Host file does not exist')

    loader = DataLoader()
    inventory = InventoryManager(loader=loader, sources=host_file)
    variable_manager = VariableManager(loader=loader, inventory=inventory)

    # Add extra variables to use in the playbook like so:
    # variable_manager.extra_vars = {'name': 'value'}
    if playbook_arguments:
        variable_manager.extra_vars = playbook_arguments

    Options = namedtuple('Options', ['listtags', 'listtasks', 'listhosts', 'syntax', 'connection', 'module_path', 'forks', 'remote_user', 'become', 'become_method', 'become_user', 'verbosity', 'check', 'diff', 'ask_sudo_pass'])
    if 'superuser' in playbook_arguments:
        remote_user = playbook_arguments['superuser']
    else:
        remote_user = 'ec2-user'
    options = Options(listtags=None, listtasks=None, listhosts=None, syntax=None, connection='smart', module_path=None, forks=100, remote_user=remote_user, become=None, become_method='sudo', become_user='root', verbosity=None, check=False, diff=False, ask_sudo_pass=None)

    pbex = PlaybookExecutor(playbooks=[playbook], inventory=inventory, variable_manager=variable_manager, loader=loader, options=options, passwords={})
    callback = ResultsCallback()
    pbex._tqm._stdout_callback = callback

    logging.info('Provisioning cluster with Ansible...')
    attempts = 3
    for i in range(attempts):
        try:
            pbex.run()
            failed = callback.failed
            if failed:
                logging.critical('Playbook failed!')
                raise RuntimeError('{0} tasks failed'.format(len(failed)))
            break
        except Exception:
            if i < attempts - 1:
                logging.critical('Attempting to re-try playbook')
                continue
            else:
                raise

    logging.info('\nRemoving Ansible inventory file {0}'.format(host_file))
    try:
        os.remove(host_file)
    except OSError:
        pass
However, when I test the above code using a playbook that is guaranteed to fail, it fails with the following traceback:
Creating Ansible host file: /home/someuser/ansible_hosts.18600
Provisioning cluster with Ansible...
Playbook failed!
Attempting to re-try playbook
Exception during setup; tearing down all created instances
Traceback (most recent call last):
File "./manage_aws.py", line 486, in cmd_ec2_create
manage_ansible.run_playbook(hostnames, playbook, playbook_arguments)
File "/home/someuser/manage_ansible.py", line 88, in run_playbook
break
File "/usr/local/lib/python2.7/dist-packages/ansible/executor/playbook_executor.py", line 159, in run
result = self._tqm.run(play=play)
File "/usr/local/lib/python2.7/dist-packages/ansible/executor/task_queue_manager.py", line 296, in run
strategy.cleanup()
File "/usr/local/lib/python2.7/dist-packages/ansible/plugins/strategy/__init__.py", line 223, in cleanup
self._final_q.put(_sentinel)
File "/usr/lib/python2.7/multiprocessing/queues.py", line 100, in put
assert not self._closed
AssertionError
You'll notice that the exception is properly caught inside the calling script manage_aws.py ("Exception during setup; tearing down all created instances") and we go on to tear down the instances. That's great, but I'd like to properly re-try the playbook before deciding to do so.
I'm no Python master, so if anyone has any tips or has accomplished something similar, I would very much appreciate your advice.
Thanks in advance!
I found a solution, and while it isn't as graceful as I was hoping, it works.
The problem I was having seemed related to re-running the same PlaybookExecutor object without letting the spawned threads clean up properly.
What I did to fix it was simply initialize a new PlaybookExecutor object when I notice that the first one failed. The current implementation allows for only one retry, which is fine, but I'll most likely adapt it to do more if necessary.
Here is my adapted retry logic:
pbex = PlaybookExecutor(playbooks=[playbook], inventory=inventory, variable_manager=variable_manager, loader=loader, options=options, passwords={})
callback = ResultsCallback()
pbex._tqm._stdout_callback = callback

logging.info('Provisioning cluster with Ansible...')
pbex.run()
failed = callback.failed
if failed:
    logging.critical('Playbook failed! Attempting retry...')
    pbex_retry = PlaybookExecutor(playbooks=[playbook], inventory=inventory, variable_manager=variable_manager, loader=loader, options=options, passwords={})
    callback_retry = ResultsCallback()
    pbex_retry._tqm._stdout_callback = callback_retry

    pbex_retry.run()
    failed_retry = callback_retry.failed
    if failed_retry:
        logging.critical('Playbook failed again! Failed on task:\n{0}'.format(failed_retry[0]))
        remove_inventory_file(host_file)
        raise RuntimeError('Playbook failed to successfully configure the cluster.')

remove_inventory_file(host_file)
Super simple solution, but it is a shame my initial attempt didn't work out as intended. Maybe I'll revisit it and try to properly clean up the executor when it fails.
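If I ever need more than one retry, the same idea extends naturally: build a fresh PlaybookExecutor (and callback) for every attempt so the spawned worker queues are never reused. A rough, untested sketch using the same inventory, variable_manager, loader and options objects created earlier:

def run_with_retries(playbook, inventory, variable_manager, loader, options, attempts=3):
    """Run the playbook up to `attempts` times, creating a new executor each time."""
    for attempt in range(1, attempts + 1):
        # A fresh executor (and TaskQueueManager) per attempt, so we never re-run
        # an executor whose worker queues have already been closed.
        pbex = PlaybookExecutor(playbooks=[playbook], inventory=inventory,
                                variable_manager=variable_manager, loader=loader,
                                options=options, passwords={})
        callback = ResultsCallback()
        pbex._tqm._stdout_callback = callback

        pbex.run()
        if not callback.failed:
            return
        logging.critical('Attempt {0}/{1} failed ({2} tasks failed)'.format(
            attempt, attempts, len(callback.failed)))
    raise RuntimeError('Playbook failed after {0} attempts'.format(attempts))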
The following code throws an ApiException 410 (resource too old) on the second watch.stream() call:
# python3 -m venv venv
# source venv/bin/activate
# pip install 'kubernetes==23.3.0'
from kubernetes import client, config, watch

config.load_kube_config(context='my-eks-context')
v1 = client.CoreV1Api()
watcher = watch.Watch()
namespace = 'kube-system'
last_resource_version = 0

# this watch times out in 5s to have a fast way to simulate a watch that needs to be retried
for i in watcher.stream(v1.list_namespaced_pod, namespace, resource_version=last_resource_version, timeout_seconds=5):
    print(i['object'].metadata.resource_version)
    last_resource_version = i['object'].metadata.resource_version

# we retry the watch starting from the last resource version known,
# but this will raise a kubernetes.client.exceptions.ApiException: (410)
# Reason: Expired: too old resource version: 379140622 (380367990)
for i in watcher.stream(v1.list_namespaced_pod, namespace, resource_version=last_resource_version, timeout_seconds=5):
    print('second loop', i['object'].metadata.resource_version)
    last_resource_version = i['object'].metadata.resource_version
The kubernetes documentation states that:
If a client watch is disconnected then that client can start a new watch from the last returned resourceVersion
which is what I intended in the above code, but it always gives the following exception:
Traceback (most recent call last):
File "main.py", line 24, in <module>
File "/Users/rubelagu/git/python-kubernetes-client/venv/lib/python3.8/site-packages/kubernetes/watch/watch.py", line 182, in stream
raise client.rest.ApiException(
kubernetes.client.exceptions.ApiException: (410)
Reason: Expired: too old resource version: 379164133 (380432814)
What am I doing wrong?
It seems that in the initial response to the watch (from an EKS 1.21 cluster) the events can be returned in any order.
I did two subsequent watches two seconds apart and they contained the same 30 events in completely different orderings.
So it is not guaranteed that the last resource version you see is actually the latest one, and it is not guaranteed that you can resume from that resourceVersion/resource_version. You are also not allowed to sort/collate those events by resourceVersion, since the kubernetes documentation for Resource Version Semantics explicitly says:
Resource versions must be treated as opaque [...] You must not assume resource versions are numeric or collatable.
You must account for that by catching the resource too old exception and retrying without specifying a resource version, see below for an example:
from kubernetes import client, config, watch
from kubernetes.client.exceptions import ApiException

config.load_kube_config(context='eks-prod')
v1 = client.CoreV1Api()
# v1 = config.new_client_from_config(context="eks-prod").CoreV1Api()
watcher = watch.Watch()
namespace = 'my-namespace'

def list_pods(resource_version=None):
    print('start watch from resource version: ', str(resource_version))
    # Keep the starting version around in case the stream yields no events.
    last_resource_version = resource_version
    try:
        for i in watcher.stream(v1.list_namespaced_pod, namespace, resource_version=resource_version, timeout_seconds=2):
            print(i['object'].metadata.resource_version)
            last_resource_version = i['object'].metadata.resource_version
    except ApiException as e:
        if e.status == 410:  # Resource too old
            return list_pods(resource_version=None)
        else:
            raise
    return last_resource_version

last_resource_version = list_pods()
print('last_resource_version', last_resource_version)
list_pods(last_resource_version)
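If you want a watcher that keeps running rather than the two one-off calls above, the helper can simply be looped. This is just an illustration of the intended usage:

# Keep watching indefinitely, resuming from the last seen resourceVersion;
# list_pods() itself falls back to a fresh watch whenever it hits a 410.
last_resource_version = None
while True:
    last_resource_version = list_pods(last_resource_version)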
from os import getenv, listdir, path
from kubernetes import client, config
from kubernetes.client.rest import ApiException
from kubernetes.stream import stream
import constants, logging
from pprint import pprint


def listdir_fullpath(directory):
    return [path.join(directory, file) for file in listdir(directory)]


def active_context(kubeConfig, cluster):
    config.load_kube_config(config_file=kubeConfig, context=cluster)


def kube_exec(command, apiInstance, podName, namespace, container):
    response = None
    execCommand = [
        '/bin/bash',
        '-c',
        command]
    try:
        response = apiInstance.read_namespaced_pod(name=podName,
                                                   namespace=namespace)
    except ApiException as e:
        if e.status != 404:
            print(f"Unknown error: {e}")
            exit(1)
    if not response:
        print("Pod does not exist")
        exit(1)
    try:
        response = stream(apiInstance.connect_get_namespaced_pod_exec,
                          podName,
                          namespace,
                          container=container,
                          command=execCommand,
                          stderr=True,
                          stdin=False,
                          stdout=True,
                          tty=False,
                          _preload_content=True)
    except Exception as e:
        print("error in executing cmd")
        exit(1)
    pprint(response)


if __name__ == '__main__':
    configPath = constants.CONFIGFILE
    kubeConfigList = listdir_fullpath(configPath)
    kubeConfig = ':'.join(kubeConfigList)
    active_context(kubeConfig, "ort.us-west-2.k8s.company-foo.net")
    apiInstance = client.CoreV1Api()
    kube_exec("whoami", apiInstance, "podname-foo", "namespace-foo", "container-foo")
I run this code, and the response I get from running whoami is 'java\n'.
How can I run as root? Also, I can't find a good doc for this client anywhere (the docs on the git repo are pretty horrible); if you can link me to any it would be awesome.
EDIT: I just tried on a couple of different pods and containers; it looks like some of them default to root. I would still like to be able to choose my user when I run a command, so the question is still relevant.
some of them default to root, would still like to be able to choose my user when I run a command so question is still relevant
You have influence over the UID (not the user directly, as far as I know) when you launch the Pod, but from that point forward there is no equivalent to docker exec -u in kubernetes -- you can attach to the Pod, running as whatever UID it was launched as, but you cannot change the UID.
I would hypothesize that's a security concern in locked-down clusters, since one would not want someone with kubectl access to be able to elevate privileges.
If you need to run as root in your container, then you should change the value of securityContext: runAsUser: 0 and then drop privileges for running your main process. That way new commands (spawned by your exec command) will run as root, just as your initial command: does.
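If the Pod is created with the same Python client, that securityContext setting looks roughly like the sketch below; the pod name, image, and namespace are placeholders rather than anything from your cluster:

from kubernetes import client, config

config.load_kube_config()

# Hypothetical pod spec: run the container's process as UID 0 (root), so
# anything you later spawn via connect_get_namespaced_pod_exec also runs as root.
pod = client.V1Pod(
    metadata=client.V1ObjectMeta(name="root-shell-demo"),
    spec=client.V1PodSpec(
        containers=[
            client.V1Container(
                name="main",
                image="ubuntu:22.04",  # placeholder image
                command=["sleep", "infinity"],
                security_context=client.V1SecurityContext(run_as_user=0),
            )
        ]
    ),
)

client.CoreV1Api().create_namespaced_pod(namespace="namespace-foo", body=pod)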
I develop locally on Windows 10, which is a problem for using the RQ task queue, which only works on Linux systems because it requires the ability to fork processes. I'm trying to extend the flask-base project (https://github.com/hack4impact/flask-base/tree/master/app), which can use RQ. I came across https://github.com/michaelbrooks/rq-win, and I love the idea of this repo (if I can get it working it will really simplify my life, since I develop on Windows 10 64-bit).
After installing this library, I can queue a job in my views by running something like:
@login_required
@main.route('/selected')
def selected():
    messages = 'abcde'
    j = get_queue().enqueue(render_png, messages, result_ttl=5000)
    return j.get_id()
This returns a job_code correctly.
I changed the code in manage.py to:
from redis import Redis
from rq import Connection, Queue
from rq_win import WindowsWorker


@manager.command
def run_worker():
    """Initializes a slim rq task queue."""
    listen = ['default']
    REDIS_URL = 'redis://localhost:6379'

    conn = Redis.from_url(REDIS_URL)
    with Connection(conn):
        # worker = Worker(map(Queue, listen))
        worker = WindowsWorker(map(Queue, listen))
        worker.work()
When I try to run it with:
$ python -u manage.py run_worker
09:40:44
09:40:44 *** Listening on default...
09:40:58 default: app.main.views.render_png('{"abcde"}') (8c1b6186-39a5-4daf-9c45-f60e4241cd1f)
...\lib\site-packages\rq\job.py:161: DeprecationWarning: job.status is deprecated. Use job.set_status() instead
  DeprecationWarning
09:40:58 ValueError: Unknown type <class 'redis.client.StrictPipeline'>
Traceback (most recent call last):
File "...\lib\site-packages\rq_win\worker.py", line 87, in perform_job
queue.enqueue_dependents(job, pipeline=pipeline)
File "...\lib\site-packages\rq\queue.py", line 322, in enqueue_dependents
for job_id in pipe.smembers(dependents_key)]
File "...\lib\site-packages\rq\queue.py", line 322, in <listcomp>
for job_id in pipe.smembers(dependents_key)]
File "...\lib\site-packages\rq\compat\__init__.py", line 62, in as_text
raise ValueError('Unknown type %r' % type(v))
ValueError: Unknown type <class 'redis.client.StrictPipeline'>
So in summary, I think the jobs are being queued correctly within redis. However, when the worker process tries to grab a job off of the queue to process, this error occurs. How can I fix this?
So after some digging, it looks like the root of the error is here, where the job_id being sent to the as_text function is, somehow, a StrictPipeline object. However, I have been unable to replicate the error locally; can you post more of your code? Also, I would try re-installing the redis, rq, and rq-win modules, and possibly try importing rq.compat.
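For reference, as_text in rq.compat is (roughly) just a bytes/str decoding helper, which is why handing it a pipeline object blows up. A simplified sketch of its behaviour, not the exact library source:

def as_text(v):
    # Roughly what rq.compat.as_text does: decode bytes, pass str through.
    if v is None:
        return None
    if isinstance(v, bytes):
        return v.decode('utf-8')
    if isinstance(v, str):
        return v
    # Anything else (e.g. a redis pipeline object) ends up here, which matches
    # the "Unknown type <class 'redis.client.StrictPipeline'>" error above.
    raise ValueError('Unknown type %r' % type(v))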
I'm studying vCenter 6.5 and the community samples help a lot, but in this particular situation I can't figure out what's going on. The script:
from __future__ import with_statement
import atexit

from tools import cli
from pyVim import connect
from pyVmomi import vim, vmodl


def get_args():
    # *Boring args parsing works*
    return args


def main():
    args = get_args()
    try:
        service_instance = connect.SmartConnectNoSSL(host=args.host,
                                                     user=args.user,
                                                     pwd=args.password,
                                                     port=int(args.port))
        atexit.register(connect.Disconnect, service_instance)
        content = service_instance.RetrieveContent()
        vm = content.searchIndex.FindByUuid(None, args.vm_uuid, True)
        creds = vim.vm.guest.NamePasswordAuthentication(
            username=args.vm_user, password=args.vm_pwd
        )
        try:
            pm = content.guestOperationsManager.processManager
            ps = vim.vm.guest.ProcessManager.ProgramSpec(
                programPath=args.path_to_program,
                arguments=args.program_arguments
            )
            res = pm.StartProgramInGuest(vm, creds, ps)
            if res > 0:
                print "Program executed, PID is %d" % res
        except IOError as e:
            print e
    except vmodl.MethodFault as error:
        print "Caught vmodl fault : " + error.msg
        return -1
    return 0


# Start program
if __name__ == "__main__":
    main()
When I execute it in the console, it successfully connects to the target virtual machine and prints:
Program executed, PID is 2036
In Task Manager I see a process with the mentioned PID; it was created by the correct user, but there is no GUI for the process (calc.exe). A right-click does not allow me to "Expand" the process.
I suppose that this process was created with special parameters, maybe in a different session.
In addition, I tried to run a batch file to check whether it actually executes, but the answer is no, the batch file does not execute.
Any help, advice, or clues would be awesome.
P.S. I tried other scripts and successfully transferred a file to the VM.
P.P.S. Sorry for my English.
Update: All such processes start in session 0.
Have you tried interactiveSession?
https://github.com/vmware/pyvmomi/blob/master/docs/vim/vm/guest/GuestAuthentication.rst
This boolean argument is passed to NamePasswordAuthentication and means the following:
This is set to true if the client wants an interactive session in the guest.
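In your script that would just mean adding the flag when you build the credentials, for example:

# interactiveSession=True asks for an interactive (desktop) session in the
# guest, which should avoid the session-0 behaviour you are seeing.
creds = vim.vm.guest.NamePasswordAuthentication(
    username=args.vm_user,
    password=args.vm_pwd,
    interactiveSession=True
)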
This is my first post here, so if there are any questions or if something is unclear, don't hesitate to ask.
I am trying to use a dynamic host file so I can build multiple vagrant machines without having to manage the host file first. This is what I found online:
#!/usr/bin/env python
# Adapted from Mark Mandel's implementation
# https://github.com/ansible/ansible/blob/devel/plugins/inventory/vagrant.py
import argparse
import json
import paramiko
import subprocess
import sys


def parse_args():
    parser = argparse.ArgumentParser(description="Vagrant inventory script")
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument('--list', action='store_true')
    group.add_argument('--host')
    return parser.parse_args()


def list_running_hosts():
    cmd = "vagrant status --machine-readable"
    status = subprocess.check_output(cmd.split()).rstrip()
    hosts = []
    for line in status.split('\n'):
        (_, host, key, value) = line.split(',')
        if key == 'state' and value == 'running':
            hosts.append(host)
    return hosts


def get_host_details(host):
    cmd = "vagrant ssh-config {}".format(host)
    p = subprocess.Popen(cmd.split(), stdout=subprocess.PIPE)
    config = paramiko.SSHConfig()
    config.parse(p.stdout)
    c = config.lookup(host)
    return {'ansible_ssh_host': c['hostname'],
            'ansible_ssh_port': c['port'],
            'ansible_ssh_user': c['user'],
            'ansible_ssh_private_key_file': c['identityfile'][0]}


def main():
    args = parse_args()
    if args.list:
        hosts = list_running_hosts()
        json.dump({'vagrant': hosts}, sys.stdout)
    else:
        details = get_host_details(args.host)
        json.dump(details, sys.stdout)


if __name__ == '__main__':
    main()
However, when I run this I get the following error:
ERROR! The file inventory/vagrant.py is marked as executable, but failed to execute correctly. If this is not supposed to be an executable script, correct this with `chmod -x inventory/vagrant.py`.
ERROR! Inventory script (inventory/vagrant.py) had an execution error: Traceback (most recent call last):
File "/home/sebas/Desktop/playbooks/inventory/vagrant.py", line 52, in <module>
main()
File "/home/sebas/Desktop/playbooks/inventory/vagrant.py", line 45, in main
hosts = list_running_hosts()
File "/home/sebas/Desktop/playbooks/inventory/vagrant.py", line 24, in list_running_hosts
(_, host, key, value) = line.split(',')
ValueError: too many values to unpack
ERROR! inventory/vagrant.py:4: Expected key=value host variable assignment, got: argparse
Does anybody know what I did wrong? Thank you guys in advance!
I guess the problem is that the vagrant status command works only inside a directory with a Vagrantfile, or if the ID of a target machine is specified.
To get the state of all active Vagrant environments on the system, vagrant global-status should be used instead. But global-status has a drawback: it uses a cache and does not actively verify the state of machines.
So to reliably determine the state, we first need to get the IDs of all VMs with vagrant global-status and then check those IDs with vagrant status ID; see the sketch below for how list_running_hosts() could be adapted.
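Something like the sketch below (assuming Python 3; the parsing of the plain-text global-status table, in particular the 7-character hex machine ID, is an assumption that may need adjusting for your Vagrant version):

import re
import subprocess

def list_running_hosts():
    """List the names of running Vagrant machines across all environments."""
    # Step 1: collect machine IDs from `vagrant global-status`
    # (columns: id, name, provider, state, directory).
    out = subprocess.check_output(["vagrant", "global-status"]).decode()
    machines = []
    for line in out.splitlines():
        fields = line.split()
        if len(fields) >= 5 and re.fullmatch(r"[0-9a-f]{7}", fields[0]):
            machines.append((fields[0], fields[1]))  # (id, name)

    # Step 2: global-status is cached, so verify each machine's real state
    # with `vagrant status <id> --machine-readable`.
    hosts = []
    for machine_id, name in machines:
        status = subprocess.check_output(
            ["vagrant", "status", machine_id, "--machine-readable"]).decode()
        for line in status.splitlines():
            parts = line.split(",")
            if len(parts) >= 4 and parts[2] == "state" and parts[3] == "running":
                hosts.append(name)
                break
    return hosts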