Merge pull request #25815 from lyft/mongolab_telemetry_salt_module
Mongolab telemetry salt module
commit a0ab784216
salt/modules/telemetry.py (new file, 367 lines)
@@ -0,0 +1,367 @@
# -*- coding: utf-8 -*-
'''
Connection module for Telemetry

.. versionadded:: Boron

https://github.com/mongolab/mongolab-telemetry-api-docs/blob/master/alerts.md

:configuration: This module accepts explicit telemetry credentials or
    can read api key credentials from a pillar. More information is
    available at::

        https://github.com/mongolab/mongolab-telemetry-api-docs/blob/master/alerts.md

    In the minion's config file::

        telemetry.telemetry_api_keys:
            - abc123  # Key 1
            - efg321  # Backup Key 1
        telemetry_api_base_url: https://telemetry-api.mongolab.com/v0

:depends: requests
'''
from __future__ import absolute_import
from salt._compat import string_types
import json
import logging

log = logging.getLogger(__name__)

# Import third party libs
try:
    import requests
    HAS_REQUESTS = True
except ImportError:
    HAS_REQUESTS = False

__virtualname__ = 'telemetry'


def __virtual__():
    # Only load if imports exist.
    if not HAS_REQUESTS:
        return False
    return __virtualname__


def _get_telemetry_base(profile):
    config = __salt__['config.option'](profile)
    return config.get('telemetry_api_base_url')
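
# Illustrative result: when the profile resolves telemetry_api_base_url to
# "https://telemetry-api.mongolab.com/v0" (the value shown in the module docstring),
# this helper returns that base URL.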


def _auth(api_key=None, profile='telemetry'):
    # return telemetry api key in the header
    if api_key is None and profile is None:
        raise Exception("Missing api_key and profile")
    if profile:
        if isinstance(profile, string_types):
            _profile = __salt__['config.option'](profile)
        elif isinstance(profile, dict):
            _profile = profile

        if _profile:
            api_key = _profile.get('telemetry_api_keys')[0]
        else:
            raise Exception("Missing api_key")

    return {'Telemetry-API-Key': api_key, 'content-type': 'application/json'}
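
# Illustrative result: with 'abc123' (the placeholder key from the module docstring)
# as the first configured telemetry api key, _auth() returns
# {'Telemetry-API-Key': 'abc123', 'content-type': 'application/json'}.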


def _update_cache(deployment_id, metric_name, alert):
    key = "telemetry.{0}.alerts".format(deployment_id)

    if key in __context__:
        alerts = __context__[key]
        alerts[metric_name] = alert
        __context__[key] = alerts

    return __context__.get(key, [])
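
# Illustrative cache layout (deployment id and metric name are placeholders taken
# from the CLI examples below):
#     __context__["telemetry.rs-ds033197.alerts"] == {"currentConnections": <alert definition dict>}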


def _retrieve_channel_id(email, profile='telemetry'):
    '''
    Given an email address, check the local cache to see if a corresponding
    email address to channel_id mapping exists.

    email
        Email escalation policy
    profile
        A dict of telemetry config information.
    '''
    key = "telemetry.channels"
    auth = _auth(profile=profile)

    if key not in __context__:
        get_url = _get_telemetry_base(profile) + "/notification-channels?_type=EmailNotificationChannel"
        response = requests.get(get_url, headers=auth)

        if response.status_code == 200:
            cache_result = {}
            for alert in response.json():
                cache_result[alert.get('email')] = alert.get('_id', 'false')

            __context__[key] = cache_result

    return __context__.get(key, {}).get(email, False)
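
# Illustrative cached mapping (the address and id are placeholders):
#     __context__["telemetry.channels"] == {"userx@company.com": "<channel_id>"}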


def get_alert_config(deployment_id, metric_name=None, api_key=None, profile="telemetry"):
    '''
    Get all alert definitions associated with a given deployment or, if metric_name
    is specified, obtain the specific alert config.

    Returns the alert config dict when metric_name is given, otherwise a list of
    alert IDs defined for the deployment.

    CLI Example:

    .. code-block:: bash

        salt myminion telemetry.get_alert_config rs-ds033197 currentConnections profile=telemetry
        salt myminion telemetry.get_alert_config rs-ds033197 profile=telemetry
    '''

    auth = _auth(profile=profile)
    alert = False

    key = "telemetry.{0}.alerts".format(deployment_id)

    if key not in __context__:
        try:
            get_url = _get_telemetry_base(profile) + "/alerts?deployment={0}".format(deployment_id)
            response = requests.get(get_url, headers=auth)
        except requests.exceptions.RequestException as e:
            log.error(str(e))
            return False

        http_result = {}
        if response.status_code == 200:
            for alert in response.json():
                http_result[alert.get('condition', {}).get('metric')] = alert
            __context__[key] = http_result

    if not __context__.get(key):
        return []

    alerts = __context__[key].values()

    if metric_name:
        return __context__[key].get(metric_name)

    return [alert['_id'] for alert in alerts if '_id' in alert]
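
# Illustrative return values (arguments taken from the CLI examples above):
#     get_alert_config('rs-ds033197', 'currentConnections')  -> <alert definition dict>
#     get_alert_config('rs-ds033197')                        -> ['<alert_id_1>', '<alert_id_2>', ...]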


def get_notification_channel_id(notify_channel, profile="telemetry"):
    '''
    Given an email address, creates a notification channel
    if one is not found and also returns the corresponding
    notification channel id.

    notify_channel
        Email escalation policy
    profile
        A dict of telemetry config information.

    CLI Example:

    .. code-block:: bash

        salt myminion telemetry.get_notification_channel_id userx@company.com profile=telemetry
    '''

    # This helper is used to procure the channel ids
    # used to notify when the alarm threshold is violated
    auth = _auth(profile=profile)

    notification_channel_id = _retrieve_channel_id(notify_channel)

    if not notification_channel_id:
        log.info("{0} channel does not exist, creating.".format(notify_channel))

        # create the notification channel and cache the id
        post_url = _get_telemetry_base(profile) + "/notification-channels"
        data = {
            "_type": "EmailNotificationChannel",
            "name": notify_channel[:notify_channel.find('@')] + 'EscalationPolicy',
            "email": notify_channel
        }
        response = requests.post(post_url, data=json.dumps(data), headers=auth)
        if response.status_code == 200:
            log.info("Successfully created EscalationPolicy {0} with EmailNotificationChannel {1}"
                     .format(data.get('name'), notify_channel))
            notification_channel_id = response.json().get('_id')
            __context__.setdefault("telemetry.channels", {})[notify_channel] = notification_channel_id
        else:
            raise Exception("Failed to create notification channel {0}".format(notify_channel))

    return notification_channel_id
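
# Illustrative behaviour: for "userx@company.com" (the address from the CLI example
# above) a missing channel is created with the name "userxEscalationPolicy" and its
# "_id" is cached under __context__["telemetry.channels"].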


def get_alarms(deployment_id, profile="telemetry"):
    '''
    Get all the alarms set up against the current deployment.

    Returns dictionary of alarm information.

    CLI Example:

    .. code-block:: bash

        salt myminion telemetry.get_alarms rs-ds033197 profile=telemetry

    '''
    auth = _auth(profile=profile)

    try:
        response = requests.get(_get_telemetry_base(profile) + "/alerts?deployment={0}".format(deployment_id), headers=auth)
    except requests.exceptions.RequestException as e:
        log.error(str(e))
        return False

    if response.status_code == 200:
        alarms = response.json()

        if len(alarms) > 0:
            return alarms

        return 'No alarms defined for deployment: {0}'.format(deployment_id)
    else:
        # Non-200 response; send back the error response
        return {'err_code': response.status_code, 'err_msg': json.loads(response.text).get('err', '')}


def create_alarm(deployment_id, metric_name, data, api_key=None, profile="telemetry"):
    '''
    Create a Telemetry alarm.

    data is a dict of alert configuration data.

    Returns (bool success, str message) tuple.

    CLI Example:

    .. code-block:: bash

        salt myminion telemetry.create_alarm rs-ds033197 testMetric '{"max": 1}' profile=telemetry

    '''

    auth = _auth(api_key, profile)
    request_uri = _get_telemetry_base(profile) + "/alerts"

    key = "telemetry.{0}.alerts".format(deployment_id)

    # set the notification channels if not already set
    post_body = {
        "deployment": deployment_id,
        "filter": data.get('filter'),
        "notificationChannel": get_notification_channel_id(data.get('escalate_to')).split(),
        "condition": {
            "metric": metric_name,
            "max": data.get('max'),
            "min": data.get('min')
        }
    }

    try:
        response = requests.post(request_uri, data=json.dumps(post_body), headers=auth)
    except requests.exceptions.RequestException as e:
        # TODO: Maybe we should retry?
        log.error(str(e))
        return False, str(e)

    if response.status_code >= 200 and response.status_code < 300:
        # update cache
        log.info("Created alarm on metric: {0} in deployment: {1}".format(metric_name, deployment_id))
        log.debug("Updating cache for metric {0} in deployment {1}: {2}".format(metric_name, deployment_id, response.json()))
        _update_cache(deployment_id, metric_name, response.json())
    else:
        log.error("Failed to create alarm on metric: {0} in deployment {1}: payload: {2}".format(
            metric_name, deployment_id, json.dumps(post_body)))

    return response.status_code >= 200 and response.status_code < 300, response.json()
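
# Illustrative mapping from the ``data`` argument to the API payload (values are
# placeholders mirroring the example in salt/states/telemetry_alert.py):
#     data      = {"max": 1, "filter": "SERVER_ROLE_MONGOD_PRIMARY", "escalate_to": "example@pagerduty.com"}
#     post_body = {"deployment": "rs-ds033197",
#                  "filter": "SERVER_ROLE_MONGOD_PRIMARY",
#                  "notificationChannel": ["<channel_id>"],
#                  "condition": {"metric": "testMetric", "max": 1, "min": None}}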


def update_alarm(deployment_id, metric_name, data, api_key=None, profile="telemetry"):
    '''
    Update a Telemetry alarm. data is a dict of alert configuration data.

    Returns (bool success, str message) tuple.

    CLI Example:

    .. code-block:: bash

        salt myminion telemetry.update_alarm rs-ds033197 testMetric '{"max": 1}' profile=telemetry

    '''
    auth = _auth(api_key, profile)
    alert = get_alert_config(deployment_id, metric_name, api_key, profile)

    if not alert:
        return False, "No entity found matching deployment {0} and alarms {1}".format(deployment_id, metric_name)

    request_uri = _get_telemetry_base(profile) + '/alerts/' + alert['_id']

    # set the notification channels if not already set
    post_body = {
        "deployment": deployment_id,
        "filter": data.get('filter'),
        "notificationChannel": get_notification_channel_id(data.get('escalate_to')).split(),
        "condition": {
            "metric": metric_name,
            "max": data.get('max'),
            "min": data.get('min')
        }
    }

    try:
        response = requests.put(request_uri, data=json.dumps(post_body), headers=auth)
    except requests.exceptions.RequestException as e:
        log.error("Update failed: {0}".format(str(e)))
        return False, str(e)

    if response.status_code >= 200 and response.status_code < 300:
        # Also update cache
        log.debug("Updating cache for metric {0} in deployment {1}: {2}".format(metric_name, deployment_id, response.json()))
        _update_cache(deployment_id, metric_name, response.json())
        log.info("Updated alarm on metric: {0} in deployment: {1}".format(metric_name, deployment_id))
        return True, response.json()

    err_msg = "Failed to update alarm on metric: {0} in deployment {1}: payload: {2}".format(metric_name, deployment_id, json.dumps(post_body))
    log.error(err_msg)
    return False, err_msg


def delete_alarms(deployment_id, alert_id=None, metric_name=None, api_key=None, profile='telemetry'):
    '''
    Delete the alert specified by alert_id or, if no alert_id is given,
    delete all the alerts in the given deployment.

    Returns (bool success, str message) tuple.

    CLI Example:

    .. code-block:: bash

        salt myminion telemetry.delete_alarms rs-ds033197 profile=telemetry

    '''
    auth = _auth(profile=profile)

    if alert_id is None:
        # Delete all the alarms associated with this deployment
        alert_ids = get_alert_config(deployment_id, api_key=api_key, profile=profile)
    else:
        alert_ids = [alert_id]

    if len(alert_ids) == 0:
        return False, "failed to find alert associated with deployment: {0}".format(deployment_id)

    failed_to_delete = []
    for id in alert_ids:
        delete_url = _get_telemetry_base(profile) + "/alerts/{0}".format(id)

        try:
            response = requests.delete(delete_url, headers=auth)
            if metric_name:
                log.debug("updating cache and deleting {0} key from {1}".format(metric_name, deployment_id))
                _update_cache(deployment_id, metric_name, None)

        except requests.exceptions.RequestException as e:
            log.error('Delete failed: {0}'.format(str(e)))
            failed_to_delete.append(id)
            continue

        if response.status_code != 200:
            failed_to_delete.append(id)

    if len(failed_to_delete) > 0:
        return False, "Failed to delete {0} alarms in deployment: {1}".format(', '.join(failed_to_delete), deployment_id)

    return True, "Successfully deleted {0} alerts in deployment: {1}".format(', '.join(alert_ids), deployment_id)
salt/states/telemetry_alert.py (new file, 198 lines)
@@ -0,0 +1,198 @@
# -*- coding: utf-8 -*-
'''
.. versionadded:: Boron

Manage Telemetry alert configurations
=====================================

Create, update, and destroy Mongo Telemetry alert configurations.

This module uses requests, which can be installed via package or pip.

This module accepts explicit credentials (telemetry api key) or can also
read api key credentials from a pillar. Example:

.. code-block:: yaml

    ensure telemetry alert X is defined on deployment Y:
      telemetry_alert.present:
        - deployment_id: "rs-XXXXXX"
        - metric_name: "testMetric"
        - alert_config:
            max: 1
            filter: SERVER_ROLE_MONGOD_PRIMARY
            escalate_to: "example@pagerduty.com"
        - name: "**MANAGED BY ORCA DO NOT EDIT BY HAND** manages alarm on testMetric"
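
A hypothetical ``absent`` state, using the same deployment and metric arguments
(ids and names are placeholders), might look like:

.. code-block:: yaml

    ensure telemetry alert X is removed from deployment Y:
      telemetry_alert.absent:
        - deployment_id: "rs-XXXXXX"
        - metric_name: "testMetric"
        - name: "remove alarm on testMetric"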
'''
from __future__ import absolute_import
from salt._compat import string_types


def __virtual__():
    # Only load if telemetry is available.
    return 'telemetry_alert' if 'telemetry.get_alert_config' in __salt__ else False


def present(name, deployment_id, metric_name, alert_config, api_key=None, profile='telemetry'):
    '''
    Ensure the telemetry alert exists.

    name
        An optional description of the alarm (not currently supported by telemetry API)

    deployment_id
        Specifies the ID of the root deployment resource
        (replica set cluster or sharded cluster) to which this alert definition is attached

    metric_name
        Specifies the unique ID of the metric to whose values these thresholds will be applied

    alert_config
        A dict of alert settings containing the following fields:

        filter
            By default the alert will apply to the deployment and all its constituent resources.
            If the alert only applies to a subset of those resources, a filter may be specified to narrow this scope.

        min
            the smallest "ok" value the metric may take on; if missing or null, no minimum is enforced.

        max
            the largest "ok" value the metric may take on; if missing or null, no maximum is enforced.

        notify_all
            Used to indicate if you want to alert both onCallEngineer and apiNotifications

    api_key
        Telemetry api key for the user

    profile
        A dict of telemetry config information. If present, will be used instead of
        api_key.

    '''

    ret = {'name': metric_name, 'result': True, 'comment': '', 'changes': {}}

    saved_alert_config = __salt__['telemetry.get_alert_config'](
        deployment_id, metric_name, api_key, profile)

    post_body = {
        "deployment": deployment_id,
        "filter": alert_config.get('filter'),
        "notificationChannel": __salt__['telemetry.get_notification_channel_id'](alert_config.get('escalate_to')).split(),
        "condition": {
            "metric": metric_name,
            "max": alert_config.get('max'),
            "min": alert_config.get('min')
        }
    }
    # Diff the alert config with the passed-in attributes
    difference = []
    if saved_alert_config:
        #del saved_alert_config["_id"]
        for k, v in post_body.items():
            if k not in saved_alert_config:
                difference.append("{0}={1} (new)".format(k, v))
                continue
            v2 = saved_alert_config[k]

            if v == v2:
                continue
            if isinstance(v, string_types) and str(v) == str(v2):
                continue
            if isinstance(v, float) and v == float(v2):
                continue
            if isinstance(v, int) and v == int(v2):
                continue
            difference.append("{0}='{1}' was: '{2}'".format(k, v, v2))
    else:
        difference.append("new alert config")

    create_or_update_args = (
        deployment_id,
        metric_name,
        alert_config,
        api_key,
        profile,
    )
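    # Note: the argument order here matches telemetry.create_alarm and
    # telemetry.update_alarm: (deployment_id, metric_name, data, api_key, profile).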
    if saved_alert_config:  # alert config is present. update, or do nothing
        # check to see if attributes match is_present. If so, do nothing.
        if len(difference) == 0:
            ret['comment'] = "alert config {0} present and matching".format(metric_name)
            return ret
        if __opts__['test']:
            msg = 'alert config {0} is to be updated.'.format(metric_name)
            ret['comment'] = msg
            ret['changes']['diff'] = difference
            ret['result'] = None
            return ret

        result, msg = __salt__['telemetry.update_alarm'](*create_or_update_args)

        if result:
            ret['changes']['diff'] = difference
            ret['comment'] = "Alert updated."
        else:
            ret['result'] = False
            ret['comment'] = 'Failed to update {0} alert config: {1}'.format(metric_name, msg)
    else:  # alert config is absent. create it.
        if __opts__['test']:
            msg = 'alert config {0} is to be created.'.format(metric_name)
            ret['comment'] = msg
            ret['result'] = None
            return ret

        result, msg = __salt__['telemetry.create_alarm'](*create_or_update_args)

        if result:
            ret['changes']['new'] = msg
        else:
            ret['result'] = False
            ret['comment'] = 'Failed to create {0} alert config: {1}'.format(metric_name, msg)

    return ret


def absent(name, deployment_id, metric_name, api_key=None, profile="telemetry"):
    '''
    Ensure the telemetry alert config is deleted.

    name
        An optional description of the alarm (not currently supported by telemetry API)

    deployment_id
        Specifies the ID of the root deployment resource
        (replica set cluster or sharded cluster) to which this alert definition is attached

    metric_name
        Specifies the unique ID of the metric to whose values these thresholds will be applied

    api_key
        Telemetry api key for the user

    profile
        A dict with telemetry config data. If present, will be used instead of
        api_key.
    '''
    ret = {'name': metric_name, 'result': True, 'comment': '', 'changes': {}}

    is_present = __salt__['telemetry.get_alert_config'](
        deployment_id, metric_name, api_key, profile)

    if is_present:
        alert_id = is_present.get('_id')
        if __opts__['test']:
            ret['comment'] = 'alert {0} is set to be removed from deployment: {1}.'.format(metric_name, deployment_id)
            ret['result'] = None
            return ret
        deleted, msg = __salt__['telemetry.delete_alarms'](
            deployment_id, alert_id, is_present.get('condition', {}).get('metric'), api_key, profile)

        if deleted:
            ret['changes']['old'] = metric_name
            ret['changes']['new'] = None
        else:
            ret['result'] = False
            ret['comment'] = 'Failed to delete alert {0}: {1}'.format(metric_name, msg)
    else:
        ret['comment'] = 'alarm on {0} does not exist within {1}.'.format(metric_name, deployment_id)
    return ret