Merge pull request #14460 from steverweber/fix_restarts

fix issues with keepalive minion
This commit is contained in:
Thomas S Hatch 2014-07-25 21:46:01 -06:00
commit aeda9730ca
4 changed files with 152 additions and 101 deletions

View File

@ -133,31 +133,16 @@
# Unless your master is under unusually heavy load, this should be left at the default.
#auth_timeout: 60
# Number of consecutive SaltReqTimeoutError that are acceptable when trying to authenticate.
#auth_tries: 1
# If authentication fails due to SaltReqTimeoutError, continue without ending minion.
#auth_safemode: True
# If the minion hits an error that is recoverable, restart the minion.
#restart_on_error: False
# If authentication fails due to SaltReqTimeoutError during a ping_interval,
# cause the sub minion process to restart.
#auth_safemode: False
# Ping Master to ensure connection is alive (minutes).
# TODO: perhaps could update the scheduler to raise Exception in main thread after /mine_interval (60 minutes)/ fails
#ping_interval: 0
# To auto recover Minions if Master changes IP address (DDNS)
#
# auth_tries: 10
# auth_safemode: False
# ping_interval: 90
# restart_on_error: True
#
# Minions won't know the master is missing until a ping fails. After the ping fails,
# the minion will attempt authentication, which will likely fail and cause a restart.
# When the minion restarts it will resolve the master's IP and attempt to reconnect.
#ping_interval: 90
# If you don't have any problems with syn-floods, don't bother with the
# three recon_* settings described below, just leave the defaults!

View File

@ -7,8 +7,6 @@ Make me some salt!
import os
import sys
import warnings
import time
from random import randint
# All salt related deprecation warnings should be shown once each!
warnings.filterwarnings(
@ -43,7 +41,7 @@ try:
except ImportError as exc:
if exc.args[0] != 'No module named _msgpack':
raise
from salt.exceptions import SaltSystemExit, MasterExit, SaltClientError
from salt.exceptions import SaltSystemExit, MasterExit
# Let's instantiate logger using salt.log.setup.logging.getLogger() so pylint
@ -157,65 +155,64 @@ class Minion(parsers.MinionOptionParser):
super(YourSubClass, self).prepare()
'''
if not hasattr(self, 'config'):
self.parse_args()
self.parse_args()
try:
if self.config['verify_env']:
confd = self.config.get('default_include')
if confd:
# If 'default_include' is specified in config, then use it
if '*' in confd:
# Value is of the form "minion.d/*.conf"
confd = os.path.dirname(confd)
if not os.path.isabs(confd):
# If configured 'default_include' is not an absolute
# path, consider it relative to folder of 'conf_file'
# (/etc/salt by default)
confd = os.path.join(
os.path.dirname(self.config['conf_file']), confd
)
else:
try:
if self.config['verify_env']:
confd = self.config.get('default_include')
if confd:
# If 'default_include' is specified in config, then use it
if '*' in confd:
# Value is of the form "minion.d/*.conf"
confd = os.path.dirname(confd)
if not os.path.isabs(confd):
# If configured 'default_include' is not an absolute
# path, consider it relative to folder of 'conf_file'
# (/etc/salt by default)
confd = os.path.join(
os.path.dirname(self.config['conf_file']), 'minion.d'
os.path.dirname(self.config['conf_file']), confd
)
v_dirs = [
self.config['pki_dir'],
self.config['cachedir'],
self.config['sock_dir'],
self.config['extension_modules'],
confd,
]
if self.config.get('transport') == 'raet':
v_dirs.append(os.path.join(self.config['pki_dir'], 'accepted'))
v_dirs.append(os.path.join(self.config['pki_dir'], 'pending'))
v_dirs.append(os.path.join(self.config['pki_dir'], 'rejected'))
v_dirs.append(os.path.join(self.config['cachedir'], 'raet'))
verify_env(
v_dirs,
self.config['user'],
permissive=self.config['permissive_pki_access'],
pki_dir=self.config['pki_dir'],
else:
confd = os.path.join(
os.path.dirname(self.config['conf_file']), 'minion.d'
)
logfile = self.config['log_file']
if logfile is not None and not logfile.startswith(('tcp://',
'udp://',
'file://')):
# Logfile is not using Syslog, verify
current_umask = os.umask(0077)
verify_files([logfile], self.config['user'])
os.umask(current_umask)
except OSError as err:
logger.exception('Failed to prepare salt environment')
sys.exit(err.errno)
self.setup_logfile_logger()
logger.info(
'Setting up the Salt Minion "{0}"'.format(
self.config['id']
v_dirs = [
self.config['pki_dir'],
self.config['cachedir'],
self.config['sock_dir'],
self.config['extension_modules'],
confd,
]
if self.config.get('transport') == 'raet':
v_dirs.append(os.path.join(self.config['pki_dir'], 'accepted'))
v_dirs.append(os.path.join(self.config['pki_dir'], 'pending'))
v_dirs.append(os.path.join(self.config['pki_dir'], 'rejected'))
v_dirs.append(os.path.join(self.config['cachedir'], 'raet'))
verify_env(
v_dirs,
self.config['user'],
permissive=self.config['permissive_pki_access'],
pki_dir=self.config['pki_dir'],
)
logfile = self.config['log_file']
if logfile is not None and not logfile.startswith(('tcp://',
'udp://',
'file://')):
# Logfile is not using Syslog, verify
current_umask = os.umask(0077)
verify_files([logfile], self.config['user'])
os.umask(current_umask)
except OSError as err:
logger.exception('Failed to prepare salt environment')
sys.exit(err.errno)
self.setup_logfile_logger()
logger.info(
'Setting up the Salt Minion "{0}"'.format(
self.config['id']
)
migrations.migrate_paths(self.config)
)
migrations.migrate_paths(self.config)
if self.config['transport'].lower() == 'zeromq':
# Late import so logging works correctly
import salt.minion
@ -248,29 +245,18 @@ class Minion(parsers.MinionOptionParser):
NOTE: Run any required code before calling `super()`.
'''
reconnect = True
while reconnect:
reconnect = False
try:
self.prepare()
if check_user(self.config['user']):
self.minion.tune_in()
except (KeyboardInterrupt, SaltSystemExit) as exc:
logger.warn('Stopping the Salt Minion')
if isinstance(exc, KeyboardInterrupt):
logger.warn('Exiting on Ctrl-c')
else:
logger.error(str(exc))
except SaltClientError as exc:
logger.error(exc)
if self.config.get('restart_on_error'):
logger.warn('** Restarting minion **')
s = randint(0, self.config.get('random_reauth_delay', 10))
logger.info('Sleeping random_reauth_delay of {0} seconds'.format(s))
time.sleep(s)
reconnect = True
finally:
self.shutdown()
try:
self.prepare()
if check_user(self.config['user']):
self.minion.tune_in()
except (KeyboardInterrupt, SaltSystemExit) as exc:
logger.warn('Stopping the Salt Minion')
if isinstance(exc, KeyboardInterrupt):
logger.warn('Exiting on Ctrl-c')
else:
logger.error(str(exc))
finally:
self.shutdown()
def shutdown(self):
'''

View File

@ -9,6 +9,10 @@ import os
import sys
import traceback
import logging
import multiprocessing
import threading
import time
from random import randint
# Import salt libs
import salt
@ -58,8 +62,84 @@ def salt_minion():
if '' in sys.path:
sys.path.remove('')
minion = salt.Minion()
minion.start()
if '--disable-keepalive' in sys.argv:
sys.argv.remove('--disable-keepalive')
minion = salt.Minion()
minion.start()
return
if '-d' in sys.argv or '--daemon' in sys.argv:
# disable daemonize on sub processes
if '-d' in sys.argv:
sys.argv.remove('-d')
if '--daemon' in sys.argv:
sys.argv.remove('--daemon')
# daemonize current process
salt.utils.daemonize()
# run minion in a new process so its simple to cleanup resource
def minion_process(q):
    '''
    Run one salt minion in the current process and report the outcome on ``q``.

    q
        Queue-like object with a ``put`` method. ``0`` is put when
        ``minion.start()`` returns without raising; otherwise a random
        number of seconds (at least 1) is put, which the caller can use as
        the delay before spawning a replacement process.
    '''
    # Have the minion kill itself if the parent process is gone.
    # There is a small race where the parent PID could be reused by
    # another process with the same PID.
    def suicide_when_without_parent(parent_pid):
        while True:
            time.sleep(5)
            try:
                # check pid alive (Unix only trick!)
                os.kill(parent_pid, 0)
            except OSError:
                # sys.exit() would only raise SystemExit inside this
                # watchdog thread and leave the process running;
                # os._exit() terminates the whole process.
                os._exit(999)

    if not salt.utils.is_windows():
        t = threading.Thread(target=suicide_when_without_parent, args=(os.getppid(),))
        t.start()

    minion = None
    try:
        minion = salt.Minion()
        minion.start()
        q.put(0)
    except Exception as err:
        log.error(err)
        log.warn('** Restarting minion **')
        delay = 60
        # Only a constructed minion can carry a config to read the delay from.
        if minion is not None and hasattr(minion, 'config'):
            delay = minion.config.get('random_reauth_delay', 60)
        # randint(1, n) raises ValueError for n < 1, so clamp the upper bound.
        random_delay = randint(1, max(1, delay))
        log.info('Sleeping random_reauth_delay of {0} seconds'.format(random_delay))
        # The delay itself is performed by the consumer of q, after this
        # process (and the minion's resources) have been cleaned up.
        q.put(random_delay)
# keep one minion subprocess running
while True:
    # Each minion runs in its own subprocess; the queue carries back either
    # 0 (clean stop) or the number of seconds to wait before restarting.
    q = multiprocessing.Queue()
    p = multiprocessing.Process(target=minion_process, args=(q,))
    p.start()
    try:
        p.join()
        try:
            restart_delay = q.get(block=False)
        except Exception:
            # Nothing was reported on the queue.
            if p.exitcode == 0:
                # Minion process ended naturally
                break
            restart_delay = 60
        if restart_delay == 0:
            # minion closed on normal behaviour like Ctrl+C
            break
        # delay restart to reduce flooding and allow network resources to close
        time.sleep(restart_delay)
    except KeyboardInterrupt:
        break
    # need to reset logging because new minion objects
    # cause extra log handlers to accumulate
    rlogger = logging.getLogger()
    # Iterate over a copy: removeHandler() mutates rlogger.handlers, and
    # removing while iterating the live list skips every other handler.
    for h in list(rlogger.handlers):
        rlogger.removeHandler(h)
    logging.basicConfig()
def salt_syndic():

View File

@ -46,7 +46,7 @@ class MinionTest(integration.ShellCase, integration.ShellCaseCommonTestsMixIn):
ret = self.run_script(
self._call_binary_,
'--config-dir {0} --pid-file {1} -l debug'.format(
'--disable-keepalive --config-dir {0} --pid-file {1} -l debug'.format(
config_dir,
pid_path
),