Merge pull request #25815 from lyft/mongolab_telemetry_salt_module

Mongolab telemetry salt module
Nicole Thomas 2015-07-30 14:23:22 -06:00
commit a0ab784216
2 changed files with 565 additions and 0 deletions

salt/modules/telemetry.py (new file, 367 lines)

@@ -0,0 +1,367 @@
# -*- coding: utf-8 -*-
'''
Connection module for Telemetry

.. versionadded:: Boron

https://github.com/mongolab/mongolab-telemetry-api-docs/blob/master/alerts.md

:configuration: This module accepts explicit Telemetry credentials or can
    read the API key credentials from a pillar. More information is available
    at:

    https://github.com/mongolab/mongolab-telemetry-api-docs/blob/master/alerts.md

    In the minion's config file::

        telemetry.telemetry_api_keys:
          - abc123  # Key 1
          - efg321  # Backup Key 1
        telemetry.telemetry_api_base_url: https://telemetry-api.mongolab.com/v0

:depends: requests
'''
# Import Python libs
from __future__ import absolute_import
import json
import logging

# Import Salt libs
from salt._compat import string_types

# Import third party libs
try:
    import requests
    HAS_REQUESTS = True
except ImportError:
    HAS_REQUESTS = False

log = logging.getLogger(__name__)
__virtualname__ = 'telemetry'
def __virtual__():
# Only load if imports exist.
if not HAS_REQUESTS:
return False
return __virtualname__
def _get_telemetry_base(profile):
config = __salt__['config.option'](profile)
return config.get('telemetry_api_base_url')
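A minimal sketch of the profile data this helper expects back from config.option, using only the key names documented in the module docstring above; the deployment ID is the one used in the CLI examples further down.

# Sketch only: what config.option('telemetry') is expected to return,
# based on the keys documented in the module docstring.
profile_config = {
    'telemetry_api_keys': ['abc123', 'efg321'],
    'telemetry_api_base_url': 'https://telemetry-api.mongolab.com/v0',
}

base = profile_config.get('telemetry_api_base_url')
# The request URLs below are built by simple concatenation, e.g.:
print(base + "/alerts?deployment={0}".format('rs-ds033197'))
# -> https://telemetry-api.mongolab.com/v0/alerts?deployment=rs-ds033197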
def _auth(api_key=None, profile='telemetry'):
# return telemetry api key in the header
if api_key is None and profile is None:
raise Exception("Missing api_key and profile")
if profile:
if isinstance(profile, string_types):
_profile = __salt__['config.option'](profile)
elif isinstance(profile, dict):
_profile = profile
if _profile:
api_key = _profile.get('telemetry_api_keys')[0]
else:
raise Exception("Missing api_key")
return {'Telemetry-API-Key': api_key, 'content-type': 'application/json'}
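For orientation, a sketch (not part of the module) of the header dict _auth() returns and how the calls below pass it to requests; 'abc123' is just the placeholder key from the docstring.

headers = {
    'Telemetry-API-Key': 'abc123',       # first entry of telemetry_api_keys
    'content-type': 'application/json',
}
# Every call below passes this dict as the headers kwarg, e.g.:
#   requests.get('https://telemetry-api.mongolab.com/v0/alerts?deployment=rs-ds033197',
#                headers=headers)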
def _update_cache(deployment_id, metric_name, alert):
key = "telemetry.{0}.alerts".format(deployment_id)
if key in __context__:
alerts = __context__[key]
alerts[metric_name] = alert
__context__[key] = alerts
return __context__.get(key, [])
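A small sketch of the per-deployment cache layout _update_cache() maintains, with a plain dict standing in for Salt's __context__; the alert payloads shown are illustrative only.

# Stand-in for __context__; real entries hold whatever the alerts API returned.
context = {
    'telemetry.rs-ds033197.alerts': {
        'currentConnections': {'_id': 'alert-1', 'condition': {'metric': 'currentConnections', 'max': 100}},
    }
}

key = 'telemetry.{0}.alerts'.format('rs-ds033197')
if key in context:
    alerts = context[key]
    alerts['replicationLag'] = {'_id': 'alert-2', 'condition': {'metric': 'replicationLag', 'max': 10}}
    context[key] = alerts
print(sorted(context[key]))  # ['currentConnections', 'replicationLag']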
def _retrieve_channel_id(email, profile='telemetry'):
    '''
    Given an email address, check the local cache for a corresponding
    email address -> channel ID mapping; the cache is populated from the
    notification-channels endpoint on first use.

    email
        Email address of the escalation policy

    profile
        A dict of telemetry config information.
    '''
key = "telemetry.channels"
auth = _auth(profile=profile)
if key not in __context__:
get_url = _get_telemetry_base(profile) + "/notification-channels?_type=EmailNotificationChannel"
response = requests.get(get_url, headers=auth)
if response.status_code == 200:
cache_result = {}
            for channel in response.json():
                cache_result[channel.get('email')] = channel.get('_id', 'false')
            __context__[key] = cache_result

    return __context__.get(key, {}).get(email, False)
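Judging from the code above, the notification-channels endpoint returns a list of channel objects carrying at least 'email' and '_id'; a standalone sketch of the email -> ID map that gets cached (the channel IDs here are made up):

channels = [
    {'_type': 'EmailNotificationChannel', 'email': 'userx@company.com', '_id': 'chan-1'},
    {'_type': 'EmailNotificationChannel', 'email': 'oncall@company.com', '_id': 'chan-2'},
]

cache_result = {}
for channel in channels:
    cache_result[channel.get('email')] = channel.get('_id', 'false')

print(cache_result.get('userx@company.com', False))   # chan-1
print(cache_result.get('nobody@company.com', False))  # False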
def get_alert_config(deployment_id, metric_name=None, api_key=None, profile="telemetry"):
'''
    Get all alert definitions associated with a given deployment or, if
    metric_name is specified, obtain that specific alert config.

    Returns the alert config dict when metric_name is given, otherwise a
    list of alert IDs defined for the deployment.

    CLI Example:

    .. code-block:: bash

        salt myminion telemetry.get_alert_config rs-ds033197 currentConnections profile=telemetry
        salt myminion telemetry.get_alert_config rs-ds033197 profile=telemetry
    '''
auth = _auth(profile=profile)
alert = False
key = "telemetry.{0}.alerts".format(deployment_id)
if key not in __context__:
try:
get_url = _get_telemetry_base(profile) + "/alerts?deployment={0}".format(deployment_id)
response = requests.get(get_url, headers=auth)
except requests.exceptions.RequestException as e:
log.error(str(e))
return False
http_result = {}
if response.status_code == 200:
for alert in response.json():
http_result[alert.get('condition', {}).get('metric')] = alert
__context__[key] = http_result
if not __context__.get(key):
return []
alerts = __context__[key].values()
if metric_name:
return __context__[key].get(metric_name)
return [alert['_id'] for alert in alerts if '_id' in alert]
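For reference, a sketch of the two return shapes; the field values are illustrative, only '_id', 'deployment' and 'condition' are taken from the code above.

# With metric_name: the cached alert definition for that metric.
single_alert = {
    '_id': 'alert-1',
    'deployment': 'rs-ds033197',
    'condition': {'metric': 'currentConnections', 'max': 100, 'min': None},
}

# Without metric_name: only the alert IDs defined for the deployment.
alert_ids = ['alert-1', 'alert-2']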
def get_notification_channel_id(notify_channel, profile="telemetry"):
'''
    Given an email address, create a notification channel if one does not
    already exist and return the corresponding notification channel ID.

    notify_channel
        Email address of the escalation policy

    profile
        A dict of telemetry config information.

    CLI Example:

    .. code-block:: bash

        salt myminion telemetry.get_notification_channel_id userx@company.com profile=telemetry
    '''
# This helper is used to procure the channel ids
# used to notify when the alarm threshold is violated
auth = _auth(profile=profile)
notification_channel_id = _retrieve_channel_id(notify_channel)
if not notification_channel_id:
log.info("{0} channel does not exist, creating.".format(notify_channel))
# create the notification channel and cache the id
post_url = _get_telemetry_base(profile) + "/notification-channels"
data = {
"_type": "EmailNotificationChannel",
"name": notify_channel[:notify_channel.find('@')] + 'EscalationPolicy',
"email": notify_channel
}
response = requests.post(post_url, data=json.dumps(data), headers=auth)
if response.status_code == 200:
log.info("Successfully created EscalationPolicy {0} with EmailNotificationChannel {1}"
.format(data.get('name'), notify_channel))
notification_channel_id = response.json().get('_id')
__context__["telemetry.channels"][notify_channel] = notification_channel_id
else:
raise Exception("Failed to created notification channel {0}".format(notify_channel))
return notification_channel_id
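A standalone sketch of the channel-creation payload built above; the address is a placeholder, and the 'name' is derived from the local part of the email exactly as in the code.

import json

notify_channel = 'userx@company.com'
data = {
    "_type": "EmailNotificationChannel",
    "name": notify_channel[:notify_channel.find('@')] + 'EscalationPolicy',  # 'userxEscalationPolicy'
    "email": notify_channel,
}
print(json.dumps(data))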
def get_alarms(deployment_id, profile="telemetry"):
'''
    Get all the alarms set up against the current deployment.

    Returns a list of alarm dicts, a message string when no alarms are
    defined, or an error dict on a non-200 response.

    CLI Example:

    .. code-block:: bash

        salt myminion telemetry.get_alarms rs-ds033197 profile=telemetry
    '''
auth = _auth(profile=profile)
try:
response = requests.get(_get_telemetry_base(profile) + "/alerts?deployment={0}".format(deployment_id), headers=auth)
except requests.exceptions.RequestException as e:
log.error(str(e))
return False
if response.status_code == 200:
alarms = response.json()
if len(alarms) > 0:
return alarms
return 'No alarms defined for deployment: {0}'.format(deployment_id)
else:
        # Non-200 response; send back the error code and message
return {'err_code': response.status_code, 'err_msg': json.loads(response.text).get('err', '')}
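Because of the three possible return shapes, a caller has to branch on type; a purely illustrative sketch (not part of the module):

def handle_alarms(result):
    # A list of alarm dicts means alarms exist for the deployment.
    if isinstance(result, list):
        return [a.get('condition', {}).get('metric') for a in result]
    # A dict is the error payload from a non-200 response.
    if isinstance(result, dict):
        raise RuntimeError('HTTP {0}: {1}'.format(result['err_code'], result['err_msg']))
    # Otherwise it is False (request error) or the "No alarms defined..." string.
    return result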
def create_alarm(deployment_id, metric_name, data, api_key=None, profile="telemetry"):
'''
    Create a Telemetry alarm.

    data is a dict of alert configuration data.

    Returns a (bool success, str message) tuple.

    CLI Example:

    .. code-block:: bash

        salt myminion telemetry.create_alarm rs-ds033197 currentConnections '{"max": 100, "escalate_to": "example@pagerduty.com"}' profile=telemetry
    '''
auth = _auth(api_key, profile)
request_uri = _get_telemetry_base(profile) + "/alerts"
key = "telemetry.{0}.alerts".format(deployment_id)
# set the notification channels if not already set
post_body = {
"deployment": deployment_id,
"filter": data.get('filter'),
"notificationChannel": get_notification_channel_id(data.get('escalate_to')).split(),
"condition": {
"metric": metric_name,
"max": data.get('max'),
"min": data.get('min')
}
}
try:
response = requests.post(request_uri, data=json.dumps(post_body), headers=auth)
    except requests.exceptions.RequestException as e:
        # TODO: Maybe we should retry?
        log.error(str(e))
        return False, str(e)

    if response.status_code >= 200 and response.status_code < 300:
# update cache
log.info("Created alarm on metric: {0} in deployment: {1}".format(metric_name, deployment_id))
log.debug("Updating cache for metric {0} in deployment {1}: {2}".format(metric_name, deployment_id, response.json()))
_update_cache(deployment_id, metric_name, response.json())
else:
log.error("Failed to create alarm on metric: {0} in deployment {1}: payload: {2}".
format(metric_name, deployment_id, json.dumps(post_body)))
return response.status_code >= 200 and response.status_code < 300, response.json()
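The keys create_alarm() actually reads from the data argument are the ones used in post_body above. A sketch of such a dict, with placeholder values, plus how the function would typically be invoked from another module; the __salt__ lookup only exists inside a running minion, so it is shown as a comment.

alert_data = {
    'max': 100,                               # upper threshold for the metric
    'min': None,                              # no lower threshold
    'filter': 'SERVER_ROLE_MONGOD_PRIMARY',   # optional scope filter
    'escalate_to': 'example@pagerduty.com',   # email used to resolve the notification channel
}

# ok, msg = __salt__['telemetry.create_alarm']('rs-ds033197', 'currentConnections',
#                                              alert_data, profile='telemetry')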
def update_alarm(deployment_id, metric_name, data, api_key=None, profile="telemetry"):
'''
    Update a Telemetry alarm. data is a dict of alert configuration data.

    Returns a (bool success, str message) tuple.

    CLI Example:

    .. code-block:: bash

        salt myminion telemetry.update_alarm rs-ds033197 currentConnections '{"max": 100, "escalate_to": "example@pagerduty.com"}' profile=telemetry
    '''
auth = _auth(api_key, profile)
alert = get_alert_config(deployment_id, metric_name, api_key, profile)
if not alert:
return False, "No entity found matching deployment {0} and alarms {1}".format(deployment_id, metric_name)
request_uri = _get_telemetry_base(profile) + '/alerts/' + alert['_id']
# set the notification channels if not already set
post_body = {
"deployment": deployment_id,
"filter": data.get('filter'),
"notificationChannel": get_notification_channel_id(data.get('escalate_to')).split(),
"condition": {
"metric": metric_name,
"max": data.get('max'),
"min": data.get('min')
}
}
try:
response = requests.put(request_uri, data=json.dumps(post_body), headers=auth)
except requests.exceptions.RequestException as e:
log.error("Update failed {0}" .format(str(e)))
return False, str(e)
if response.status_code >= 200 and response.status_code < 300:
# Also update cache
log.debug("Updating cache for metric {0} in deployment {1}: {2}".format(metric_name, deployment_id, response.json()))
_update_cache(deployment_id, metric_name, response.json())
log.info("Updated alarm on metric: {0} in deployment: {1}".format(metric_name, deployment_id))
return True, response.json()
err_msg = "Failed to create alarm on metric: {0} in deployment {1}: payload: {2}".format(metric_name, deployment_id, json.dumps(post_body))
log.error(err_msg)
return False, err_msg
def delete_alarms(deployment_id, alert_id=None, metric_name=None, api_key=None, profile='telemetry'):
    '''
    Delete an alert specified by alert_id or, if no alert_id is given,
    delete all the alerts in the current deployment.

    Returns a (bool success, str message) tuple.

    CLI Example:

    .. code-block:: bash

        salt myminion telemetry.delete_alarms rs-ds033197 profile=telemetry
    '''
auth = _auth(profile=profile)
if alert_id is None:
# Delete all the alarms associated with this deployment
alert_ids = get_alert_config(deployment_id, api_key=api_key, profile=profile)
else:
alert_ids = [alert_id]
if len(alert_ids) == 0:
return False, "failed to find alert associated with deployment: {0}".format(deployment_id)
    failed_to_delete = []
    for _id in alert_ids:
        delete_url = _get_telemetry_base(profile) + "/alerts/{0}".format(_id)
        try:
            response = requests.delete(delete_url, headers=auth)
            if metric_name:
                log.debug("Updating cache and deleting {0} key from {1}".format(metric_name, deployment_id))
                _update_cache(deployment_id, metric_name, None)
        except requests.exceptions.RequestException as e:
            log.error('Delete failed: {0}'.format(str(e)))
            failed_to_delete.append(_id)
            continue

        if response.status_code != 200:
            failed_to_delete.append(_id)
if len(failed_to_delete) > 0:
return False, "Failed to delete {0} alarms in deployment: {1}" .format(', '.join(failed_to_delete), deployment_id)
return True, "Successfully deleted {0} alerts in deployment: {1}".format(', '.join(alert_ids), deployment_id)


@@ -0,0 +1,198 @@
# -*- coding: utf-8 -*-
'''
Manage Telemetry alert configurations
=====================================

.. versionadded:: Boron

Create, update and destroy Mongo Telemetry alert configurations.

This module uses requests, which can be installed via package or pip.

This module accepts explicit credentials (a Telemetry API key) or can read
the API key credentials from a pillar.

Example:

.. code-block:: yaml

    ensure telemetry alert X is defined on deployment Y:
      telemetry_alert.present:
        - deployment_id: "rs-XXXXXX"
        - metric_name: "testMetric"
        - alert_config:
            max: 1
            filter: SERVER_ROLE_MONGOD_PRIMARY
            escalate_to: "example@pagerduty.com"
        - name: "**MANAGED BY ORCA DO NOT EDIT BY HAND** manages alarm on testMetric"
'''
from __future__ import absolute_import
from salt._compat import string_types
def __virtual__():
# Only load if telemetry is available.
return 'telemetry_alert' if 'telemetry.get_alert_config' in __salt__ else False
def present(name, deployment_id, metric_name, alert_config, api_key=None, profile='telemetry'):
'''
Ensure the telemetry alert exists.
name
An optional description of the alarm (not currently supported by telemetry API)
deployment_id
Specifies the ID of the root deployment resource
(replica set cluster or sharded cluster) to which this alert definition is attached
metric_name
Specifies the unique ID of the metric to whose values these thresholds will be applied
    alert_config
        A dict of alert configuration values with the following fields:

        filter
            By default the alert will apply to the deployment and all of its
            constituent resources. If the alert only applies to a subset of
            those resources, a filter may be specified to narrow this scope.

        min
            The smallest "ok" value the metric may take on; if missing or
            null, no minimum is enforced.

        max
            The largest "ok" value the metric may take on; if missing or
            null, no maximum is enforced.

        notify_all
            Used to indicate if you want to alert both onCallEngineer and
            apiNotifications
api_key
Telemetry api key for the user
profile
A dict of telemetry config information. If present, will be used instead of
api_key.
'''
ret = {'name': metric_name, 'result': True, 'comment': '', 'changes': {}}
saved_alert_config = __salt__['telemetry.get_alert_config'](
deployment_id, metric_name, api_key, profile)
post_body = {
"deployment": deployment_id,
"filter": alert_config.get('filter'),
"notificationChannel": __salt__['telemetry.get_notification_channel_id'](alert_config.get('escalate_to')).split(),
"condition": {
"metric": metric_name,
"max": alert_config.get('max'),
"min": alert_config.get('min')
}
}
# Diff the alert config with the passed-in attributes
difference = []
if saved_alert_config:
#del saved_alert_config["_id"]
for k, v in post_body.items():
if k not in saved_alert_config:
difference.append("{0}={1} (new)".format(k, v))
continue
v2 = saved_alert_config[k]
if v == v2:
continue
if isinstance(v, string_types) and str(v) == str(v2):
continue
if isinstance(v, float) and v == float(v2):
continue
if isinstance(v, int) and v == int(v2):
continue
difference.append("{0}='{1}' was: '{2}'".format(k, v, v2))
else:
difference.append("new alert config")
create_or_update_args = (
deployment_id,
metric_name,
alert_config,
api_key,
profile,
)
    if saved_alert_config:  # alert config is present. update, or do nothing
        # Check whether the existing config matches the requested attributes.
        # If so, do nothing.
        if len(difference) == 0:
            ret['comment'] = "alert config {0} present and matching".format(metric_name)
            return ret

        if __opts__['test']:
            msg = 'alert config {0} is to be updated.'.format(metric_name)
            ret['comment'] = msg
            ret['changes'] = {'diff': difference}
            ret['result'] = None
            return ret
result, msg = __salt__['telemetry.update_alarm'](*create_or_update_args)
if result:
ret['changes']['diff'] = difference
ret['comment'] = "Alert updated."
else:
ret['result'] = False
ret['comment'] = 'Failed to update {0} alert config: {1}'.format(metric_name, msg)
else: # alert config is absent. create it.
if __opts__['test']:
msg = 'alert config {0} is to be created.'.format(metric_name)
ret['comment'] = msg
ret['result'] = None
return ret
result, msg = __salt__['telemetry.create_alarm'](*create_or_update_args)
if result:
ret['changes']['new'] = msg
else:
ret['result'] = False
ret['comment'] = 'Failed to create {0} alert config: {1}'.format(metric_name, msg)
return ret
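For orientation, the ret dict present() returns in its three main outcomes; a sketch with illustrative values:

# Existing config already matches the requested attributes.
unchanged = {'name': 'currentConnections', 'result': True, 'changes': {},
             'comment': 'alert config currentConnections present and matching'}

# Test mode with a pending update.
pending = {'name': 'currentConnections', 'result': None,
           'changes': {'diff': ["max='100' was: '50'"]},
           'comment': 'alert config currentConnections is to be updated.'}

# A brand-new alert config was created; 'new' holds the module's response.
created = {'name': 'currentConnections', 'result': True, 'comment': '',
           'changes': {'new': {'_id': 'alert-1', 'deployment': 'rs-XXXXXX'}}}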
def absent(name, deployment_id, metric_name, api_key=None, profile="telemetry"):
'''
Ensure the telemetry alert config is deleted
name
        An optional description of the alarm (not currently supported by telemetry API)
deployment_id
Specifies the ID of the root deployment resource
(replica set cluster or sharded cluster) to which this alert definition is attached
metric_name
Specifies the unique ID of the metric to whose values these thresholds will be applied
api_key
Telemetry api key for the user
profile
A dict with telemetry config data. If present, will be used instead of
api_key.
'''
ret = {'name': metric_name, 'result': True, 'comment': '', 'changes': {}}
is_present = __salt__['telemetry.get_alert_config'](
deployment_id, metric_name, api_key, profile)
if is_present:
alert_id = is_present.get('_id')
if __opts__['test']:
ret['comment'] = 'alert {0} is set to be removed from deployment: {1}.'.format(metric_name, deployment_id)
ret['result'] = None
return ret
deleted, msg = __salt__['telemetry.delete_alarms'](
deployment_id, alert_id, is_present.get('condition', {}).get('metric'), api_key, profile)
if deleted:
ret['changes']['old'] = metric_name
ret['changes']['new'] = None
else:
ret['result'] = False
            ret['comment'] = 'Failed to delete alert {0} from deployment {1}: {2}'.format(metric_name, deployment_id, msg)
else:
ret['comment'] = 'alarm on {0} does not exist within {1}.'.format(metric_name, deployment_id)
return ret
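And, to close the loop, a sketch of what absent() reports when it removes an alarm versus when there is nothing to remove (values illustrative):

removed = {'name': 'testMetric', 'result': True, 'comment': '',
           'changes': {'old': 'testMetric', 'new': None}}

already_absent = {'name': 'testMetric', 'result': True, 'changes': {},
                  'comment': 'alarm on testMetric does not exist within rs-XXXXXX.'}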