Merge pull request #27025 from cachedout/issue_25581

Better try and error handling for prep_jid
2024-11-07 17:09:03 +00:00 · 2015-09-11 08:40:10 +01:00 · 2015-09-11 08:40:10 +01:00 · 843c28b435
commit 843c28b435
parent b9baa0b39a ecc09d9b93
4 changed files with 44 additions and 19 deletions
--- a/salt/client/mixins.py
+++ b/salt/client/mixins.py
@ -328,16 +328,19 @@ class SyncClientMixin(object):
            data['success'] = False

        namespaced_event.fire_event(data, 'ret')
-        salt.utils.job.store_job(
-            self.opts,
-            {'id': self.opts['id'],
-             'tgt': self.opts['id'],
-             'jid': data['jid'],
-             'return': data,
-             },
-            event=None,
-            mminion=self.mminion,
-            )
+        try:
+            salt.utils.job.store_job(
+                self.opts,
+                {'id': self.opts['id'],
+                 'tgt': self.opts['id'],
+                 'jid': data['jid'],
+                 'return': data,
+                 },
+                event=None,
+                mminion=self.mminion,
+                )
+        except salt.exceptions.SaltCacheError:
+            log.error('Could not store job cache info. Job details for this run may be unavailable.')
        # if we fired an event, make sure to delete the event object.
        # This will ensure that we call destroy, which will do the 0MQ linger
        log.info('Runner completed: {0}'.format(data['jid']))
--- a/salt/exceptions.py
+++ b/salt/exceptions.py
@ -151,6 +151,12 @@ class SaltClientTimeout(SaltException):
        self.jid = jid


+class SaltCacheError(SaltException):
+    '''
+    Thrown when a problem is encountered while reading from or writing to the salt cache
+    '''
+
+
 class SaltReqTimeoutError(SaltException):
    '''
    Thrown when a salt master request call fails to return within the timeout
--- a/salt/master.py
+++ b/salt/master.py
@ -1272,8 +1272,11 @@ class AESFuncs(object):

        :param dict load: The minion payload
        '''
-        salt.utils.job.store_job(
-            self.opts, load, event=self.event, mminion=self.mminion)
+        try:
+            salt.utils.job.store_job(
+                self.opts, load, event=self.event, mminion=self.mminion)
+        except salt.exceptions.SaltCacheError:
+            log.error('Could not store job information for load: {0}'.format(load))

    def _syndic_return(self, load):
        '''
--- a/salt/returners/local_cache.py
+++ b/salt/returners/local_cache.py
@ -12,11 +12,13 @@ import os
 import shutil
 import datetime
 import hashlib
+import time

 # Import salt libs
 import salt.payload
 import salt.utils
 import salt.utils.jid
+import salt.exceptions

 log = logging.getLogger(__name__)

@ -99,12 +101,16 @@ def _format_jid_instance(jid, job):


 #TODO: add to returner docs-- this is a new one
-def prep_jid(nocache=False, passed_jid=None):
+def prep_jid(nocache=False, passed_jid=None, recurse_count=0):
    '''
    Return a job id and prepare the job id directory
    This is the function responsible for making sure jids don't collide (unless its passed a jid)
    So do what you have to do to make sure that stays the case
    '''
+    if recurse_count >= 5:
+        err = 'prep_jid could not store a jid after {0} tries.'.format(recurse_count)
+        log.error(err)
+        raise salt.exceptions.SaltCacheError(err)
    if passed_jid is None:  # this can be a None of an empty string
        jid = salt.utils.jid.gen_jid()
    else:
@ -117,15 +123,22 @@ def prep_jid(nocache=False, passed_jid=None):
    try:
        os.makedirs(jid_dir_)
    except OSError:
-        # TODO: some sort of sleep or something? Spinning is generally bad practice
+        time.sleep(0.1)
        if passed_jid is None:
+            recurse_count += 1
-            return prep_jid(nocache=nocache)
+            return prep_jid(nocache=nocache, recurse_count=recurse_count)

-    with salt.utils.fopen(os.path.join(jid_dir_, 'jid'), 'wb+') as fn_:
-        fn_.write(jid)
-    if nocache:
-        with salt.utils.fopen(os.path.join(jid_dir_, 'nocache'), 'wb+') as fn_:
-            fn_.write('')
+    try:
+        with salt.utils.fopen(os.path.join(jid_dir_, 'jid'), 'wb+') as fn_:
+            fn_.write(jid)
+        if nocache:
+            with salt.utils.fopen(os.path.join(jid_dir_, 'nocache'), 'wb+') as fn_:
+                fn_.write('')
+    except IOError:
+        log.warn('Could not write out jid file for job {0}. Retrying.'.format(jid))
+        time.sleep(0.1)
+        recurse_count += 1
+        return prep_jid(passed_jid=jid, nocache=nocache, recurse_count=recurse_count)

    return jid