mirror of
https://github.com/valitydev/salt.git
synced 2024-11-08 09:23:56 +00:00
Merge pull request #47609 from rrroo/improve-aws-retries
Unify retries for AWS API
This commit is contained in:
commit
70b62074b5
@ -75,7 +75,6 @@ import time
|
|||||||
import uuid
|
import uuid
|
||||||
import pprint
|
import pprint
|
||||||
import logging
|
import logging
|
||||||
import random
|
|
||||||
|
|
||||||
# Import libs for talking to the EC2 API
|
# Import libs for talking to the EC2 API
|
||||||
import hmac
|
import hmac
|
||||||
@ -302,8 +301,8 @@ def query(params=None, setname=None, requesturl=None, location=None,
|
|||||||
# Retrieve access credentials from meta-data, or use provided
|
# Retrieve access credentials from meta-data, or use provided
|
||||||
access_key_id, secret_access_key, token = aws.creds(provider)
|
access_key_id, secret_access_key, token = aws.creds(provider)
|
||||||
|
|
||||||
attempts = 5
|
attempts = 0
|
||||||
while attempts > 0:
|
while attempts < aws.AWS_MAX_RETRIES:
|
||||||
params_with_headers = params.copy()
|
params_with_headers = params.copy()
|
||||||
timestamp = datetime.datetime.utcnow().strftime('%Y-%m-%dT%H:%M:%SZ')
|
timestamp = datetime.datetime.utcnow().strftime('%Y-%m-%dT%H:%M:%SZ')
|
||||||
|
|
||||||
@ -381,7 +380,6 @@ def query(params=None, setname=None, requesturl=None, location=None,
|
|||||||
|
|
||||||
signature = hmac.new(signing_key, (string_to_sign).encode('utf-8'),
|
signature = hmac.new(signing_key, (string_to_sign).encode('utf-8'),
|
||||||
hashlib.sha256).hexdigest()
|
hashlib.sha256).hexdigest()
|
||||||
#sig = binascii.b2a_base64(hashed)
|
|
||||||
|
|
||||||
authorization_header = algorithm + ' ' + 'Credential=' + \
|
authorization_header = algorithm + ' ' + 'Credential=' + \
|
||||||
provider['id'] + '/' + credential_scope + \
|
provider['id'] + '/' + credential_scope + \
|
||||||
@ -407,15 +405,14 @@ def query(params=None, setname=None, requesturl=None, location=None,
|
|||||||
|
|
||||||
# check to see if we should retry the query
|
# check to see if we should retry the query
|
||||||
err_code = data.get('Errors', {}).get('Error', {}).get('Code', '')
|
err_code = data.get('Errors', {}).get('Error', {}).get('Code', '')
|
||||||
if attempts > 0 and err_code and err_code in EC2_RETRY_CODES:
|
if err_code and err_code in EC2_RETRY_CODES:
|
||||||
attempts -= 1
|
attempts += 1
|
||||||
log.error(
|
log.error(
|
||||||
'EC2 Response Status Code and Error: [%s %s] %s; '
|
'EC2 Response Status Code and Error: [%s %s] %s; '
|
||||||
'Attempts remaining: %s',
|
'Attempts remaining: %s',
|
||||||
exc.response.status_code, exc, data, attempts
|
exc.response.status_code, exc, data, attempts
|
||||||
)
|
)
|
||||||
# Wait a bit before continuing to prevent throttling
|
aws.sleep_exponential_backoff(attempts)
|
||||||
time.sleep(2)
|
|
||||||
continue
|
continue
|
||||||
|
|
||||||
log.error(
|
log.error(
|
||||||
@ -1562,10 +1559,6 @@ def _modify_eni_properties(eni_id, properties=None, vm_=None):
|
|||||||
for k, v in six.iteritems(properties):
|
for k, v in six.iteritems(properties):
|
||||||
params[k] = v
|
params[k] = v
|
||||||
|
|
||||||
retries = 5
|
|
||||||
while retries > 0:
|
|
||||||
retries = retries - 1
|
|
||||||
|
|
||||||
result = aws.query(params,
|
result = aws.query(params,
|
||||||
return_root=True,
|
return_root=True,
|
||||||
location=get_location(vm_),
|
location=get_location(vm_),
|
||||||
@ -1574,17 +1567,13 @@ def _modify_eni_properties(eni_id, properties=None, vm_=None):
|
|||||||
sigver='4')
|
sigver='4')
|
||||||
|
|
||||||
if isinstance(result, dict) and result.get('error'):
|
if isinstance(result, dict) and result.get('error'):
|
||||||
time.sleep(1)
|
|
||||||
continue
|
|
||||||
|
|
||||||
return result
|
|
||||||
|
|
||||||
raise SaltCloudException(
|
raise SaltCloudException(
|
||||||
'Could not change interface <{0}> attributes '
|
'Could not change interface <{0}> attributes <\'{1}\'>'.format(
|
||||||
'<\'{1}\'> after 5 retries'.format(
|
|
||||||
eni_id, properties
|
eni_id, properties
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
else:
|
||||||
|
return result
|
||||||
|
|
||||||
|
|
||||||
def _associate_eip_with_interface(eni_id, eip_id, private_ip=None, vm_=None):
|
def _associate_eip_with_interface(eni_id, eip_id, private_ip=None, vm_=None):
|
||||||
@ -1597,8 +1586,6 @@ def _associate_eip_with_interface(eni_id, eip_id, private_ip=None, vm_=None):
|
|||||||
be NATted to - useful if you have multiple IP addresses assigned to an
|
be NATted to - useful if you have multiple IP addresses assigned to an
|
||||||
interface.
|
interface.
|
||||||
'''
|
'''
|
||||||
retries = 5
|
|
||||||
while retries > 0:
|
|
||||||
params = {'Action': 'AssociateAddress',
|
params = {'Action': 'AssociateAddress',
|
||||||
'NetworkInterfaceId': eni_id,
|
'NetworkInterfaceId': eni_id,
|
||||||
'AllocationId': eip_id}
|
'AllocationId': eip_id}
|
||||||
@ -1606,7 +1593,6 @@ def _associate_eip_with_interface(eni_id, eip_id, private_ip=None, vm_=None):
|
|||||||
if private_ip:
|
if private_ip:
|
||||||
params['PrivateIpAddress'] = private_ip
|
params['PrivateIpAddress'] = private_ip
|
||||||
|
|
||||||
retries = retries - 1
|
|
||||||
result = aws.query(params,
|
result = aws.query(params,
|
||||||
return_root=True,
|
return_root=True,
|
||||||
location=get_location(vm_),
|
location=get_location(vm_),
|
||||||
@ -1614,12 +1600,13 @@ def _associate_eip_with_interface(eni_id, eip_id, private_ip=None, vm_=None):
|
|||||||
opts=__opts__,
|
opts=__opts__,
|
||||||
sigver='4')
|
sigver='4')
|
||||||
|
|
||||||
if isinstance(result, dict) and result.get('error'):
|
|
||||||
time.sleep(1)
|
|
||||||
continue
|
|
||||||
|
|
||||||
if not result[2].get('associationId'):
|
if not result[2].get('associationId'):
|
||||||
break
|
raise SaltCloudException(
|
||||||
|
'Could not associate elastic ip address '
|
||||||
|
'<{0}> with network interface <{1}>'.format(
|
||||||
|
eip_id, eni_id
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
log.debug(
|
log.debug(
|
||||||
'Associated ElasticIP address %s with interface %s',
|
'Associated ElasticIP address %s with interface %s',
|
||||||
@ -1628,13 +1615,6 @@ def _associate_eip_with_interface(eni_id, eip_id, private_ip=None, vm_=None):
|
|||||||
|
|
||||||
return result[2].get('associationId')
|
return result[2].get('associationId')
|
||||||
|
|
||||||
raise SaltCloudException(
|
|
||||||
'Could not associate elastic ip address '
|
|
||||||
'<{0}> with network interface <{1}>'.format(
|
|
||||||
eip_id, eni_id
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def _update_enis(interfaces, instance, vm_=None):
|
def _update_enis(interfaces, instance, vm_=None):
|
||||||
config_enis = {}
|
config_enis = {}
|
||||||
@ -2011,7 +1991,8 @@ def request_instance(vm_=None, call=None):
|
|||||||
params[termination_key] = six.text_type(set_del_root_vol_on_destroy).lower()
|
params[termination_key] = six.text_type(set_del_root_vol_on_destroy).lower()
|
||||||
|
|
||||||
# Use default volume type if not specified
|
# Use default volume type if not specified
|
||||||
if ex_blockdevicemappings and dev_index < len(ex_blockdevicemappings) and 'Ebs.VolumeType' not in ex_blockdevicemappings[dev_index]:
|
if ex_blockdevicemappings and dev_index < len(ex_blockdevicemappings) and \
|
||||||
|
'Ebs.VolumeType' not in ex_blockdevicemappings[dev_index]:
|
||||||
type_key = '{0}BlockDeviceMapping.{1}.Ebs.VolumeType'.format(spot_prefix, dev_index)
|
type_key = '{0}BlockDeviceMapping.{1}.Ebs.VolumeType'.format(spot_prefix, dev_index)
|
||||||
params[type_key] = rd_type
|
params[type_key] = rd_type
|
||||||
|
|
||||||
@ -2182,8 +2163,7 @@ def query_instance(vm_=None, call=None):
|
|||||||
provider = get_provider(vm_)
|
provider = get_provider(vm_)
|
||||||
|
|
||||||
attempts = 0
|
attempts = 0
|
||||||
# perform exponential backoff and wait up to one minute (2**6 seconds)
|
while attempts < aws.AWS_MAX_RETRIES:
|
||||||
while attempts < 7:
|
|
||||||
data, requesturl = aws.query(params, # pylint: disable=unbalanced-tuple-unpacking
|
data, requesturl = aws.query(params, # pylint: disable=unbalanced-tuple-unpacking
|
||||||
location=location,
|
location=location,
|
||||||
provider=provider,
|
provider=provider,
|
||||||
@ -2205,7 +2185,7 @@ def query_instance(vm_=None, call=None):
|
|||||||
else:
|
else:
|
||||||
break
|
break
|
||||||
|
|
||||||
time.sleep(random.uniform(1, 2**attempts))
|
aws.sleep_exponential_backoff(attempts)
|
||||||
attempts += 1
|
attempts += 1
|
||||||
continue
|
continue
|
||||||
else:
|
else:
|
||||||
@ -2215,7 +2195,6 @@ def query_instance(vm_=None, call=None):
|
|||||||
|
|
||||||
def __query_ip_address(params, url): # pylint: disable=W0613
|
def __query_ip_address(params, url): # pylint: disable=W0613
|
||||||
data = aws.query(params,
|
data = aws.query(params,
|
||||||
#requesturl=url,
|
|
||||||
location=location,
|
location=location,
|
||||||
provider=provider,
|
provider=provider,
|
||||||
opts=__opts__,
|
opts=__opts__,
|
||||||
@ -3028,9 +3007,9 @@ def set_tags(name=None,
|
|||||||
params['Tag.{0}.Key'.format(idx)] = tag_k
|
params['Tag.{0}.Key'.format(idx)] = tag_k
|
||||||
params['Tag.{0}.Value'.format(idx)] = tag_v
|
params['Tag.{0}.Value'.format(idx)] = tag_v
|
||||||
|
|
||||||
attempts = 5
|
attempts = 0
|
||||||
while attempts >= 0:
|
while attempts < aws.AWS_MAX_RETRIES:
|
||||||
result = aws.query(params,
|
aws.query(params,
|
||||||
setname='tagSet',
|
setname='tagSet',
|
||||||
location=location,
|
location=location,
|
||||||
provider=get_provider(),
|
provider=get_provider(),
|
||||||
@ -3064,9 +3043,8 @@ def set_tags(name=None,
|
|||||||
|
|
||||||
if failed_to_set_tags:
|
if failed_to_set_tags:
|
||||||
log.warning('Failed to set tags. Remaining attempts %s', attempts)
|
log.warning('Failed to set tags. Remaining attempts %s', attempts)
|
||||||
attempts -= 1
|
attempts += 1
|
||||||
# Just a little delay between attempts...
|
aws.sleep_exponential_backoff(attempts)
|
||||||
time.sleep(1)
|
|
||||||
continue
|
continue
|
||||||
|
|
||||||
return settags
|
return settags
|
||||||
@ -3405,8 +3383,8 @@ def _get_node(name=None, instance_id=None, location=None):
|
|||||||
|
|
||||||
provider = get_provider()
|
provider = get_provider()
|
||||||
|
|
||||||
attempts = 10
|
attempts = 0
|
||||||
while attempts >= 0:
|
while attempts < aws.AWS_MAX_RETRIES:
|
||||||
try:
|
try:
|
||||||
instances = aws.query(params,
|
instances = aws.query(params,
|
||||||
location=location,
|
location=location,
|
||||||
@ -3416,13 +3394,12 @@ def _get_node(name=None, instance_id=None, location=None):
|
|||||||
instance_info = _extract_instance_info(instances).values()
|
instance_info = _extract_instance_info(instances).values()
|
||||||
return next(iter(instance_info))
|
return next(iter(instance_info))
|
||||||
except IndexError:
|
except IndexError:
|
||||||
attempts -= 1
|
attempts += 1
|
||||||
log.debug(
|
log.debug(
|
||||||
'Failed to get the data for node \'%s\'. Remaining '
|
'Failed to get the data for node \'%s\'. Remaining '
|
||||||
'attempts: %s', instance_id or name, attempts
|
'attempts: %s', instance_id or name, attempts
|
||||||
)
|
)
|
||||||
# Just a little delay between attempts...
|
aws.sleep_exponential_backoff(attempts)
|
||||||
time.sleep(0.5)
|
|
||||||
return {}
|
return {}
|
||||||
|
|
||||||
|
|
||||||
@ -3946,7 +3923,8 @@ def register_image(kwargs=None, call=None):
|
|||||||
|
|
||||||
.. code-block:: bash
|
.. code-block:: bash
|
||||||
|
|
||||||
salt-cloud -f register_image my-ec2-config ami_name=my_ami description="my description" root_device_name=/dev/xvda snapshot_id=snap-xxxxxxxx
|
salt-cloud -f register_image my-ec2-config ami_name=my_ami description="my description"
|
||||||
|
root_device_name=/dev/xvda snapshot_id=snap-xxxxxxxx
|
||||||
'''
|
'''
|
||||||
|
|
||||||
if call != 'function':
|
if call != 'function':
|
||||||
|
@ -52,6 +52,8 @@ AWS_RETRY_CODES = [
|
|||||||
]
|
]
|
||||||
AWS_METADATA_TIMEOUT = 3.05
|
AWS_METADATA_TIMEOUT = 3.05
|
||||||
|
|
||||||
|
AWS_MAX_RETRIES = 7
|
||||||
|
|
||||||
IROLE_CODE = 'use-instance-role-credentials'
|
IROLE_CODE = 'use-instance-role-credentials'
|
||||||
__AccessKeyId__ = ''
|
__AccessKeyId__ = ''
|
||||||
__SecretAccessKey__ = ''
|
__SecretAccessKey__ = ''
|
||||||
@ -61,6 +63,21 @@ __Location__ = ''
|
|||||||
__AssumeCache__ = {}
|
__AssumeCache__ = {}
|
||||||
|
|
||||||
|
|
||||||
|
def sleep_exponential_backoff(attempts):
|
||||||
|
"""
|
||||||
|
backoff an exponential amount of time to throttle requests
|
||||||
|
during "API Rate Exceeded" failures as suggested by the AWS documentation here:
|
||||||
|
https://docs.aws.amazon.com/AWSEC2/latest/APIReference/query-api-troubleshooting.html
|
||||||
|
and also here:
|
||||||
|
https://docs.aws.amazon.com/general/latest/gr/api-retries.html
|
||||||
|
Failure to implement this approach results in a failure rate of >30% when using salt-cloud with
|
||||||
|
"--parallel" when creating 50 or more instances with a fixed delay of 2 seconds.
|
||||||
|
A failure rate of >10% is observed when using the salt-api with an asyncronous client
|
||||||
|
specified (runner_async).
|
||||||
|
"""
|
||||||
|
time.sleep(random.uniform(1, 2**attempts))
|
||||||
|
|
||||||
|
|
||||||
def creds(provider):
|
def creds(provider):
|
||||||
'''
|
'''
|
||||||
Return the credentials for AWS signing. This could be just the id and key
|
Return the credentials for AWS signing. This could be just the id and key
|
||||||
@ -441,9 +458,8 @@ def query(params=None, setname=None, requesturl=None, location=None,
|
|||||||
)
|
)
|
||||||
headers = {}
|
headers = {}
|
||||||
|
|
||||||
MAX_RETRIES = 6
|
|
||||||
attempts = 0
|
attempts = 0
|
||||||
while attempts < MAX_RETRIES:
|
while attempts < AWS_MAX_RETRIES:
|
||||||
log.debug('AWS Request: %s', requesturl)
|
log.debug('AWS Request: %s', requesturl)
|
||||||
log.trace('AWS Request Parameters: %s', params_with_headers)
|
log.trace('AWS Request Parameters: %s', params_with_headers)
|
||||||
try:
|
try:
|
||||||
@ -461,23 +477,14 @@ def query(params=None, setname=None, requesturl=None, location=None,
|
|||||||
|
|
||||||
# check to see if we should retry the query
|
# check to see if we should retry the query
|
||||||
err_code = data.get('Errors', {}).get('Error', {}).get('Code', '')
|
err_code = data.get('Errors', {}).get('Error', {}).get('Code', '')
|
||||||
if attempts < MAX_RETRIES and err_code and err_code in AWS_RETRY_CODES:
|
if attempts < AWS_MAX_RETRIES and err_code and err_code in AWS_RETRY_CODES:
|
||||||
attempts += 1
|
attempts += 1
|
||||||
log.error(
|
log.error(
|
||||||
'AWS Response Status Code and Error: [%s %s] %s; '
|
'AWS Response Status Code and Error: [%s %s] %s; '
|
||||||
'Attempts remaining: %s',
|
'Attempts remaining: %s',
|
||||||
exc.response.status_code, exc, data, attempts
|
exc.response.status_code, exc, data, attempts
|
||||||
)
|
)
|
||||||
# backoff an exponential amount of time to throttle requests
|
sleep_exponential_backoff(attempts)
|
||||||
# during "API Rate Exceeded" failures as suggested by the AWS documentation here:
|
|
||||||
# https://docs.aws.amazon.com/AWSEC2/latest/APIReference/query-api-troubleshooting.html
|
|
||||||
# and also here:
|
|
||||||
# https://docs.aws.amazon.com/general/latest/gr/api-retries.html
|
|
||||||
# Failure to implement this approach results in a failure rate of >30% when using salt-cloud with
|
|
||||||
# "--parallel" when creating 50 or more instances with a fixed delay of 2 seconds.
|
|
||||||
# A failure rate of >10% is observed when using the salt-api with an asyncronous client
|
|
||||||
# specified (runner_async).
|
|
||||||
time.sleep(random.uniform(1, 2**attempts))
|
|
||||||
continue
|
continue
|
||||||
|
|
||||||
log.error(
|
log.error(
|
||||||
|
Loading…
Reference in New Issue
Block a user