From bfcaab9ef48eb185471b48ded6dc3f335061e73d Mon Sep 17 00:00:00 2001
From: Mike Place
Date: Thu, 10 Sep 2015 13:15:30 -0600
Subject: [PATCH] Better try and error handling for prep_jid

Implement upper bounds for retries on FS errors when writing job files to cache.
Each retry increments recurse_count and hands it to the recursive prep_jid
call, so the limit of five attempts is actually enforced.
Raise exceptions up to clients and daemons for logging and user notification.

Handle exception outlined in #25581.
---
 salt/client/mixins.py         | 23 +++++++++++++----------
 salt/exceptions.py            |  6 ++++++
 salt/master.py                |  7 +++++--
 salt/returners/local_cache.py | 28 ++++++++++++++++++++--------
 4 files changed, 44 insertions(+), 20 deletions(-)

diff --git a/salt/client/mixins.py b/salt/client/mixins.py
index 1fb4ba0732..e951d00942 100644
--- a/salt/client/mixins.py
+++ b/salt/client/mixins.py
@@ -328,16 +328,19 @@ class SyncClientMixin(object):
             data['success'] = False
 
         namespaced_event.fire_event(data, 'ret')
-        salt.utils.job.store_job(
-            self.opts,
-            {'id': self.opts['id'],
-             'tgt': self.opts['id'],
-             'jid': data['jid'],
-             'return': data,
-             },
-            event=None,
-            mminion=self.mminion,
-            )
+        try:
+            salt.utils.job.store_job(
+                self.opts,
+                {'id': self.opts['id'],
+                 'tgt': self.opts['id'],
+                 'jid': data['jid'],
+                 'return': data,
+                 },
+                event=None,
+                mminion=self.mminion,
+                )
+        except salt.exceptions.SaltCacheError:
+            log.error('Could not store job cache info. Job details for this run may be unavailable.')
         # if we fired an event, make sure to delete the event object.
         # This will ensure that we call destroy, which will do the 0MQ linger
         log.info('Runner completed: {0}'.format(data['jid']))
diff --git a/salt/exceptions.py b/salt/exceptions.py
index 944f658347..3e40e2798c 100644
--- a/salt/exceptions.py
+++ b/salt/exceptions.py
@@ -151,6 +151,12 @@ class SaltClientTimeout(SaltException):
         self.jid = jid
 
 
+class SaltCacheError(SaltException):
+    '''
+    Thrown when a problem was encountered trying to read or write from the salt cache
+    '''
+
+
 class SaltReqTimeoutError(SaltException):
     '''
     Thrown when a salt master request call fails to return within the timeout
diff --git a/salt/master.py b/salt/master.py
index d1da12ab85..17e183e984 100644
--- a/salt/master.py
+++ b/salt/master.py
@@ -1272,8 +1272,11 @@ class AESFuncs(object):
 
         :param dict load: The minion payload
         '''
-        salt.utils.job.store_job(
-            self.opts, load, event=self.event, mminion=self.mminion)
+        try:
+            salt.utils.job.store_job(
+                self.opts, load, event=self.event, mminion=self.mminion)
+        except salt.exceptions.SaltCacheError:
+            log.error('Could not store job information for load: {0}'.format(load))
 
     def _syndic_return(self, load):
         '''
diff --git a/salt/returners/local_cache.py b/salt/returners/local_cache.py
index af43e08601..655d24c806 100644
--- a/salt/returners/local_cache.py
+++ b/salt/returners/local_cache.py
@@ -17,6 +17,7 @@ import hashlib
 import salt.payload
 import salt.utils
 import salt.utils.jid
+import salt.exceptions
 
 log = logging.getLogger(__name__)
 
@@ -99,12 +100,16 @@ def _format_jid_instance(jid, job):
 
 
 #TODO: add to returner docs-- this is a new one
-def prep_jid(nocache=False, passed_jid=None):
+def prep_jid(nocache=False, passed_jid=None, recurse_count=0):
     '''
     Return a job id and prepare the job id directory
     This is the function responsible for making sure jids don't collide (unless its passed a jid)
     So do what you have to do to make sure that stays the case
     '''
+    if recurse_count >= 5:
+        err = 'prep_jid could not store a jid after {0} tries.'.format(recurse_count)
+        log.error(err)
+        raise salt.exceptions.SaltCacheError(err)
     if passed_jid is None:  # this can be a None of an empty string
         jid = salt.utils.jid.gen_jid()
     else:
@@ -117,15 +122,22 @@ def prep_jid(nocache=False, passed_jid=None):
     try:
         os.makedirs(jid_dir_)
     except OSError:
-        # TODO: some sort of sleep or something? Spinning is generally bad practice
+        time.sleep(0.1)
         if passed_jid is None:
-            return prep_jid(nocache=nocache)
+            recurse_count += 1  # count this attempt toward the retry cap
+            return prep_jid(nocache=nocache, recurse_count=recurse_count)
 
-    with salt.utils.fopen(os.path.join(jid_dir_, 'jid'), 'wb+') as fn_:
-        fn_.write(jid)
-    if nocache:
-        with salt.utils.fopen(os.path.join(jid_dir_, 'nocache'), 'wb+') as fn_:
-            fn_.write('')
+    try:
+        with salt.utils.fopen(os.path.join(jid_dir_, 'jid'), 'wb+') as fn_:
+            fn_.write(jid)
+        if nocache:
+            with salt.utils.fopen(os.path.join(jid_dir_, 'nocache'), 'wb+') as fn_:
+                fn_.write('')
+    except IOError:
+        log.warning('Could not write out jid file for job {0}. Retrying.'.format(jid))
+        time.sleep(0.1)
+        recurse_count += 1  # count this attempt toward the retry cap
+        return prep_jid(passed_jid=jid, nocache=nocache, recurse_count=recurse_count)
 
     return jid
 