mirror of
https://github.com/valitydev/salt.git
synced 2024-11-07 17:09:03 +00:00
Better try and error handling for prep_jid
Implement upper bounds for retries on FS errors when writing job files to cache. Raise exceptions up to clients and daemons for logging and user notification. Handle exception outlined in #25581.
This commit is contained in:
parent
a563af29d3
commit
bfcaab9ef4
@ -328,16 +328,19 @@ class SyncClientMixin(object):
|
||||
data['success'] = False
|
||||
|
||||
namespaced_event.fire_event(data, 'ret')
|
||||
salt.utils.job.store_job(
|
||||
self.opts,
|
||||
{'id': self.opts['id'],
|
||||
'tgt': self.opts['id'],
|
||||
'jid': data['jid'],
|
||||
'return': data,
|
||||
},
|
||||
event=None,
|
||||
mminion=self.mminion,
|
||||
)
|
||||
try:
|
||||
salt.utils.job.store_job(
|
||||
self.opts,
|
||||
{'id': self.opts['id'],
|
||||
'tgt': self.opts['id'],
|
||||
'jid': data['jid'],
|
||||
'return': data,
|
||||
},
|
||||
event=None,
|
||||
mminion=self.mminion,
|
||||
)
|
||||
except salt.exceptions.SaltCacheError:
|
||||
log.error('Could not store job cache info. Job details for this run may be unavailable.')
|
||||
# if we fired an event, make sure to delete the event object.
|
||||
# This will ensure that we call destroy, which will do the 0MQ linger
|
||||
log.info('Runner completed: {0}'.format(data['jid']))
|
||||
|
@ -151,6 +151,12 @@ class SaltClientTimeout(SaltException):
|
||||
self.jid = jid
|
||||
|
||||
|
||||
class SaltCacheError(SaltException):
    '''
    Raised when reading from or writing to the salt cache fails
    '''
|
||||
|
||||
|
||||
class SaltReqTimeoutError(SaltException):
|
||||
'''
|
||||
Thrown when a salt master request call fails to return within the timeout
|
||||
|
@ -1272,8 +1272,11 @@ class AESFuncs(object):
|
||||
|
||||
:param dict load: The minion payload
|
||||
'''
|
||||
salt.utils.job.store_job(
|
||||
self.opts, load, event=self.event, mminion=self.mminion)
|
||||
try:
|
||||
salt.utils.job.store_job(
|
||||
self.opts, load, event=self.event, mminion=self.mminion)
|
||||
except salt.exception.SaltCacheError:
|
||||
log.error('Could not store job information for load: {0}'.format(load))
|
||||
|
||||
def _syndic_return(self, load):
|
||||
'''
|
||||
|
@ -17,6 +17,7 @@ import hashlib
|
||||
import salt.payload
|
||||
import salt.utils
|
||||
import salt.utils.jid
|
||||
import salt.exceptions
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
@ -99,12 +100,16 @@ def _format_jid_instance(jid, job):
|
||||
|
||||
|
||||
#TODO: add to returner docs-- this is a new one
|
||||
def prep_jid(nocache=False, passed_jid=None):
|
||||
def prep_jid(nocache=False, passed_jid=None, recurse_count=0):
|
||||
'''
|
||||
Return a job id and prepare the job id directory
|
||||
This is the function responsible for making sure jids don't collide (unless it's passed a jid)
|
||||
So do what you have to do to make sure that stays the case
|
||||
'''
|
||||
if recurse_count >= 5:
|
||||
err = 'prep_jid could not store a jid after {0} tries.'.format(recurse_count)
|
||||
log.error(err)
|
||||
raise salt.exceptions.SaltCacheError(err)
|
||||
if passed_jid is None: # this can be None or an empty string
|
||||
jid = salt.utils.jid.gen_jid()
|
||||
else:
|
||||
@ -117,15 +122,22 @@ def prep_jid(nocache=False, passed_jid=None):
|
||||
try:
|
||||
os.makedirs(jid_dir_)
|
||||
except OSError:
|
||||
# TODO: some sort of sleep or something? Spinning is generally bad practice
|
||||
time.sleep(0.1)
|
||||
if passed_jid is None:
|
||||
recurse_count += recurse_count
|
||||
return prep_jid(nocache=nocache)
|
||||
|
||||
with salt.utils.fopen(os.path.join(jid_dir_, 'jid'), 'wb+') as fn_:
|
||||
fn_.write(jid)
|
||||
if nocache:
|
||||
with salt.utils.fopen(os.path.join(jid_dir_, 'nocache'), 'wb+') as fn_:
|
||||
fn_.write('')
|
||||
try:
|
||||
with salt.utils.fopen(os.path.join(jid_dir_, 'jid'), 'wb+') as fn_:
|
||||
fn_.write(jid)
|
||||
if nocache:
|
||||
with salt.utils.fopen(os.path.join(jid_dir_, 'nocache'), 'wb+') as fn_:
|
||||
fn_.write('')
|
||||
except IOError:
|
||||
log.warn('Could not write out jid file for job {0}. Retrying.'.format(jid))
|
||||
time.sleep(0.1)
|
||||
recurse_count += recurse_count
|
||||
return prep_jid(passed_jid = jid, nocache=nocache)
|
||||
|
||||
return jid
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user