Better try and error handling for prep_jid

Implement upper bounds for retries on FS errors when writing job files to cache. Raise exceptions up to clients and daemons for logging and user notification. Handle exception outlined in #25581.
2024-11-07 17:09:03 +00:00 · 2015-09-10 13:15:30 -06:00 · 2015-09-10 13:15:30 -06:00 · bfcaab9ef4
commit bfcaab9ef4
parent a563af29d3
4 changed files with 43 additions and 19 deletions
--- a/salt/client/mixins.py
+++ b/salt/client/mixins.py
@ -328,16 +328,19 @@ class SyncClientMixin(object):
            data['success'] = False

        namespaced_event.fire_event(data, 'ret')
-        salt.utils.job.store_job(
-            self.opts,
-            {'id': self.opts['id'],
-             'tgt': self.opts['id'],
-             'jid': data['jid'],
-             'return': data,
-             },
-            event=None,
-            mminion=self.mminion,
-            )
+        try:
+            salt.utils.job.store_job(
+                self.opts,
+                {'id': self.opts['id'],
+                 'tgt': self.opts['id'],
+                 'jid': data['jid'],
+                 'return': data,
+                 },
+                event=None,
+                mminion=self.mminion,
+                )
+        except salt.exceptions.SaltCacheError:
+            log.error('Could not store job cache info. Job details for this run may be unavailable.')
        # if we fired an event, make sure to delete the event object.
        # This will ensure that we call destroy, which will do the 0MQ linger
        log.info('Runner completed: {0}'.format(data['jid']))
--- a/salt/exceptions.py
+++ b/salt/exceptions.py
@ -151,6 +151,12 @@ class SaltClientTimeout(SaltException):
        self.jid = jid


+class SaltCacheError(SaltException):
+    '''
+    Thrown when a problem was encountered trying to read or write from the salt cache
+    '''
+
+
 class SaltReqTimeoutError(SaltException):
    '''
    Thrown when a salt master request call fails to return within the timeout
--- a/salt/master.py
+++ b/salt/master.py
@ -1272,8 +1272,11 @@ class AESFuncs(object):

        :param dict load: The minion payload
        '''
-        salt.utils.job.store_job(
-            self.opts, load, event=self.event, mminion=self.mminion)
+        try:
+            salt.utils.job.store_job(
+                self.opts, load, event=self.event, mminion=self.mminion)
+        except salt.exceptions.SaltCacheError:
+            log.error('Could not store job information for load: {0}'.format(load))

    def _syndic_return(self, load):
        '''
--- a/salt/returners/local_cache.py
+++ b/salt/returners/local_cache.py
@ -17,6 +17,7 @@ import hashlib
 import salt.payload
 import salt.utils
 import salt.utils.jid
+import salt.exceptions

 log = logging.getLogger(__name__)

@ -99,12 +100,16 @@ def _format_jid_instance(jid, job):


 #TODO: add to returner docs-- this is a new one
-def prep_jid(nocache=False, passed_jid=None):
+def prep_jid(nocache=False, passed_jid=None, recurse_count=0):
    '''
    Return a job id and prepare the job id directory
    This is the function responsible for making sure jids don't collide (unless its passed a jid)
    So do what you have to do to make sure that stays the case
    '''
+    if recurse_count >= 5:
+        err = 'prep_jid could not store a jid after {0} tries.'.format(recurse_count)
+        log.error(err)
+        raise salt.exceptions.SaltCacheError(err)
    if passed_jid is None:  # this can be a None of an empty string
        jid = salt.utils.jid.gen_jid()
    else:
@ -117,15 +122,22 @@ def prep_jid(nocache=False, passed_jid=None):
    try:
        os.makedirs(jid_dir_)
    except OSError:
-        # TODO: some sort of sleep or something? Spinning is generally bad practice
+        time.sleep(0.1)
        if passed_jid is None:
+            recurse_count += 1
-            return prep_jid(nocache=nocache)
+            return prep_jid(nocache=nocache, recurse_count=recurse_count)

-    with salt.utils.fopen(os.path.join(jid_dir_, 'jid'), 'wb+') as fn_:
-        fn_.write(jid)
-    if nocache:
-        with salt.utils.fopen(os.path.join(jid_dir_, 'nocache'), 'wb+') as fn_:
-            fn_.write('')
+    try:
+        with salt.utils.fopen(os.path.join(jid_dir_, 'jid'), 'wb+') as fn_:
+            fn_.write(jid)
+        if nocache:
+            with salt.utils.fopen(os.path.join(jid_dir_, 'nocache'), 'wb+') as fn_:
+                fn_.write('')
+    except IOError:
+        log.warn('Could not write out jid file for job {0}. Retrying.'.format(jid))
+        time.sleep(0.1)
+        recurse_count += 1
+        return prep_jid(passed_jid=jid, nocache=nocache, recurse_count=recurse_count)

    return jid