Better try and error handling for prep_jid

Implement upper bounds for retries on FS errors when writing job files to cache.

Raise exceptions up to clients and daemons for logging and user notification.

Handle exception outlined in #25581.
This commit is contained in:
Mike Place 2015-09-10 13:15:30 -06:00
parent a563af29d3
commit bfcaab9ef4
4 changed files with 43 additions and 19 deletions

View File

@ -328,16 +328,19 @@ class SyncClientMixin(object):
data['success'] = False
namespaced_event.fire_event(data, 'ret')
salt.utils.job.store_job(
self.opts,
{'id': self.opts['id'],
'tgt': self.opts['id'],
'jid': data['jid'],
'return': data,
},
event=None,
mminion=self.mminion,
)
try:
salt.utils.job.store_job(
self.opts,
{'id': self.opts['id'],
'tgt': self.opts['id'],
'jid': data['jid'],
'return': data,
},
event=None,
mminion=self.mminion,
)
except salt.exceptions.SaltCacheError:
log.error('Could not store job cache info. Job details for this run may be unavailable.')
# if we fired an event, make sure to delete the event object.
# This will ensure that we call destroy, which will do the 0MQ linger
log.info('Runner completed: {0}'.format(data['jid']))

View File

@ -151,6 +151,12 @@ class SaltClientTimeout(SaltException):
self.jid = jid
class SaltCacheError(SaltException):
    '''
    Thrown when a problem was encountered trying to read from or write to the
    salt cache
    '''
class SaltReqTimeoutError(SaltException):
'''
Thrown when a salt master request call fails to return within the timeout

View File

@ -1272,8 +1272,11 @@ class AESFuncs(object):
:param dict load: The minion payload
'''
salt.utils.job.store_job(
self.opts, load, event=self.event, mminion=self.mminion)
try:
salt.utils.job.store_job(
self.opts, load, event=self.event, mminion=self.mminion)
except salt.exception.SaltCacheError:
log.error('Could not store job information for load: {0}'.format(load))
def _syndic_return(self, load):
'''

View File

@ -17,6 +17,7 @@ import hashlib
import salt.payload
import salt.utils
import salt.utils.jid
import salt.exceptions
log = logging.getLogger(__name__)
@ -99,12 +100,16 @@ def _format_jid_instance(jid, job):
#TODO: add to returner docs-- this is a new one
def prep_jid(nocache=False, passed_jid=None):
def prep_jid(nocache=False, passed_jid=None, recurse_count=0):
'''
Return a job id and prepare the job id directory
This is the function responsible for making sure jids don't collide (unless it's passed a jid)
So do what you have to do to make sure that stays the case
'''
if recurse_count >= 5:
err = 'prep_jid could not store a jid after {0} tries.'.format(recurse_count)
log.error(err)
raise salt.exceptions.SaltCacheError(err)
if passed_jid is None: # this can be a None or an empty string
jid = salt.utils.jid.gen_jid()
else:
@ -117,15 +122,22 @@ def prep_jid(nocache=False, passed_jid=None):
try:
os.makedirs(jid_dir_)
except OSError:
# TODO: some sort of sleep or something? Spinning is generally bad practice
time.sleep(0.1)
if passed_jid is None:
recurse_count += recurse_count
return prep_jid(nocache=nocache)
with salt.utils.fopen(os.path.join(jid_dir_, 'jid'), 'wb+') as fn_:
fn_.write(jid)
if nocache:
with salt.utils.fopen(os.path.join(jid_dir_, 'nocache'), 'wb+') as fn_:
fn_.write('')
try:
with salt.utils.fopen(os.path.join(jid_dir_, 'jid'), 'wb+') as fn_:
fn_.write(jid)
if nocache:
with salt.utils.fopen(os.path.join(jid_dir_, 'nocache'), 'wb+') as fn_:
fn_.write('')
except IOError:
log.warn('Could not write out jid file for job {0}. Retrying.'.format(jid))
time.sleep(0.1)
recurse_count += recurse_count
return prep_jid(passed_jid = jid, nocache=nocache)
return jid