Merge pull request #26929 from steverweber/keepalive

Keepalive fixes
This commit is contained in:
Mike Place 2015-09-14 14:00:50 -06:00
commit 66d8f51d31
3 changed files with 33 additions and 41 deletions

View File

@ -34,5 +34,8 @@ EX_USAGE = 64
# The Salt specific exit codes are defined below:
# keepalive exit code is a hint that the process should be restarted
SALT_KEEPALIVE = 99
# SALT_BUILD_FAIL is used when salt fails to build something, like a container
SALT_BUILD_FAIL = 101

View File

@ -1649,7 +1649,13 @@ class Minion(MinionBase):
ping_interval = self.opts.get('ping_interval', 0) * 60
if ping_interval > 0:
def ping_master():
self._fire_master('ping', 'minion_ping')
if not self._fire_master('ping', 'minion_ping'):
if not self.opts.get('auth_safemode', True):
log.error('** Master Ping failed. Attempting to restart minion**')
delay = self.opts.get('random_reauth_delay', 5)
log.info('delaying random_reauth_delay {0}s'.format(delay))
# regular sys.exit raises an exception -- which isn't sufficient in a thread
os._exit(salt.defaults.exitcodes.SALT_KEEPALIVE)
self.periodic_callbacks['ping'] = tornado.ioloop.PeriodicCallback(ping_master, ping_interval * 1000, io_loop=self.io_loop)
self.periodic_callbacks['cleanup'] = tornado.ioloop.PeriodicCallback(self._fallback_cleanups, loop_interval * 1000, io_loop=self.io_loop)

View File

@ -47,7 +47,7 @@ def salt_master():
master.start()
def minion_process(queue):
def minion_process():
'''
Start a minion process
'''
@ -58,8 +58,8 @@ def minion_process(queue):
'''
Have the minion suicide if the parent process is gone
NOTE: there is a small race issue where the parent PID could be replaced
with another process with the same PID!
NOTE: small race issue where the parent PID could be replaced
with another process with the same PID!
'''
while True:
time.sleep(5)
@ -69,39 +69,29 @@ def minion_process(queue):
except OSError:
# forcibly exit, regular sys.exit raises an exception-- which
# isn't sufficient in a thread
os._exit(999)
os._exit(salt.defaults.exitcodes.EX_GENERIC)
if not salt.utils.is_windows():
thread = threading.Thread(target=suicide_when_without_parent, args=(os.getppid(),))
thread.start()
minion = salt.cli.daemons.Minion()
restart = False
minion = None
try:
minion = salt.cli.daemons.Minion()
minion.start()
except (Exception, SaltClientError, SaltReqTimeoutError, SaltSystemExit) as exc:
log.error('Minion failed to start: ', exc_info=True)
restart = True
except SystemExit as exc:
restart = False
if restart is True:
except (SaltClientError, SaltReqTimeoutError, SaltSystemExit) as exc:
log.warn('** Restarting minion **')
delay = 60
if minion is not None:
if hasattr(minion, 'config'):
delay = minion.config.get('random_reauth_delay', 60)
random_delay = randint(1, delay)
log.info('Sleeping random_reauth_delay of {0} seconds'.format(random_delay))
# perform delay after minion resources have been cleaned
queue.put(random_delay)
else:
queue.put(0)
if minion is not None and hasattr(minion, 'config'):
delay = minion.config.get('random_reauth_delay', 60)
delay = randint(1, delay)
log.info('waiting random_reauth_delay {0}s'.format(delay))
time.sleep(delay)
exit(salt.defaults.exitcodes.SALT_KEEPALIVE)
def salt_minion():
'''
Start the salt minion.
Start the salt minion in a subprocess.
Auto restart minion on error.
'''
import salt.cli.daemons
import multiprocessing
@ -122,28 +112,21 @@ def salt_minion():
# keep one minion subprocess running
while True:
try:
queue = multiprocessing.Queue()
process = multiprocessing.Process(target=minion_process)
process.start()
except Exception:
# This breaks in containers
# if multiprocessing does not work
minion = salt.cli.daemons.Minion()
minion.start()
return
process = multiprocessing.Process(target=minion_process, args=(queue,))
process.start()
break
try:
process.join()
try:
restart_delay = queue.get(block=False)
except Exception:
if process.exitcode == 0:
# Minion process ended naturally, Ctrl+C or --version
break
restart_delay = 60
if restart_delay == 0:
# Minion process ended naturally, Ctrl+C, --version, etc.
if not process.exitcode == salt.defaults.exitcodes.SALT_KEEPALIVE:
break
# delay restart to reduce flooding and allow network resources to close
time.sleep(restart_delay)
# on top of the random_reauth_delay already performed
# delay extra to reduce flooding and free resources
# NOTE: values are static but should be fine.
time.sleep(2 + randint(1, 10))
except KeyboardInterrupt:
break
# need to reset logging because new minion objects