fleet/infrastructure/sandbox/Monitoring/main.tf
Robert Fairburn 152fb9e9b2
Sandbox v4.31.0 (#11584)
# Checklist for submitter

If some of the following don't apply, delete the relevant line.

- [ ] Changes file added for user-visible changes in `changes/` or
`orbit/changes/`.
See [Changes
files](https://fleetdm.com/docs/contributing/committing-changes#changes-files)
for more information.
- [ ] Documented any API changes (docs/Using-Fleet/REST-API.md or
docs/Contributing/API-for-contributors.md)
- [ ] Documented any permissions changes
- [ ] Input data is properly validated, `SELECT *` is avoided, SQL
injection is prevented (using placeholders for values in statements)
- [ ] Added support on fleet's osquery simulator `cmd/osquery-perf` for
new osquery data ingestion features.
- [ ] Added/updated tests
- [ ] Manual QA for all new/changed functionality
  - For Orbit and Fleet Desktop changes:
    - [ ] Manual QA must be performed in the three main OSs, macOS, Windows
      and Linux.
    - [ ] Auto-update manual QA, from released version of component to new
      version (see [tools/tuf/test](../tools/tuf/test/README.md)).

---------

Co-authored-by: zwinnerman-fleetdm <zwinnerman@fleetdm.com>
2023-05-10 10:27:12 -05:00

281 lines
7.7 KiB
HCL

# Third-party provider requirements for this module.
terraform {
  required_providers {
    # Provides data.git_repository.main.
    git = {
      version = "~> 0.1.0"
      source  = "paultyng/git"
    }

    # Provides docker_registry_image.lifecycle-lambda.
    docker = {
      version = "~> 2.16.0"
      source  = "kreuzwerker/docker"
    }
  }
}
# AWS region data source, declared with no arguments.
# NOTE(review): nothing in the visible part of this file references it.
data "aws_region" "current" {
}
locals {
  # Name stem shared by the IAM role, ECR repository, event rule and Lambda
  # function declared in this file.
  full_name = "${var.prefix}-monitoring"
}
# SNS-to-Slack notifier. Its slack_topic_arn output is the alarm/ok action
# target for every CloudWatch alarm declared in this file.
module "notify_slack" {
  source  = "terraform-aws-modules/notify-slack/aws"
  version = "5.5.0"

  # SNS side
  sns_topic_name = var.prefix

  # Slack side
  slack_webhook_url = var.slack_webhook
  slack_username    = "monitoring"
  slack_channel     = "#help-p1"
}
# Trust policy: allows the Lambda service principal to assume the role.
# Consumed by aws_iam_role.lifecycle-lambda as its assume_role_policy.
data "aws_iam_policy_document" "lifecycle-lambda-assume-role" {
  statement {
    principals {
      identifiers = ["lambda.amazonaws.com"]
      type        = "Service"
    }

    actions = ["sts:AssumeRole"]
  }
}
# Attaches the AWS-managed AWSLambdaBasicExecutionRole policy to the
# lifecycle Lambda's role.
resource "aws_iam_role_policy_attachment" "lifecycle-lambda-lambda" {
  policy_arn = "arn:aws:iam::aws:policy/service-role/AWSLambdaBasicExecutionRole"
  role       = aws_iam_role.lifecycle-lambda.id
}
# Attaches the custom policy (aws_iam_policy.lifecycle-lambda) to the
# lifecycle Lambda's role.
resource "aws_iam_role_policy_attachment" "lifecycle-lambda" {
  policy_arn = aws_iam_policy.lifecycle-lambda.arn
  role       = aws_iam_role.lifecycle-lambda.id
}
# Managed policy whose document is rendered from
# data.aws_iam_policy_document.lifecycle-lambda.
resource "aws_iam_policy" "lifecycle-lambda" {
  policy = data.aws_iam_policy_document.lifecycle-lambda.json
  name   = "${local.full_name}-lifecycle-lambda"
}
# Permissions document for the lifecycle Lambda's custom policy.
# Four statements: account-wide DynamoDB discovery, read/write on the
# lifecycle table, KMS usage on two keys, and CloudWatch metric publishing.
data "aws_iam_policy_document" "lifecycle-lambda" {
  # DynamoDB list/describe calls, granted on all resources.
  statement {
    resources = ["*"]
    actions = [
      "dynamodb:List*",
      "dynamodb:DescribeReservedCapacity*",
      "dynamodb:DescribeLimits",
      "dynamodb:DescribeTimeToLive"
    ]
  }

  # Read and write access scoped to the table passed in as var.dynamodb_table.
  statement {
    resources = [var.dynamodb_table.arn]
    actions = [
      "dynamodb:BatchGet*",
      "dynamodb:DescribeStream",
      "dynamodb:DescribeTable",
      "dynamodb:Get*",
      "dynamodb:Query",
      "dynamodb:Scan",
      "dynamodb:BatchWrite*",
      "dynamodb:CreateTable",
      "dynamodb:Delete*",
      "dynamodb:Update*",
      "dynamodb:PutItem"
    ]
  }

  # KMS operations on the ECR key declared in this file and on var.kms_key.
  # The inline tfsec directive must stay on the `actions` line.
  statement {
    actions = [ #tfsec:ignore:aws-iam-no-policy-wildcards
      "kms:Encrypt*",
      "kms:Decrypt*",
      "kms:ReEncrypt*",
      "kms:GenerateDataKey*",
      "kms:Describe*"
    ]
    resources = [aws_kms_key.ecr.arn, var.kms_key.arn]
  }

  # Publishing custom CloudWatch metrics, granted on all resources.
  statement {
    resources = ["*"]
    actions   = ["cloudwatch:PutMetricData"]
  }
}
# Execution role for the lifecycle Lambda; trust is limited to the Lambda
# service by the lifecycle-lambda-assume-role policy document.
resource "aws_iam_role" "lifecycle-lambda" {
  assume_role_policy = data.aws_iam_policy_document.lifecycle-lambda-assume-role.json
  name               = local.full_name
}
# KMS key that encrypts the ECR repository declared in this file.
# Key rotation is enabled; deletion uses a 10-day waiting window.
resource "aws_kms_key" "ecr" {
  enable_key_rotation     = true
  deletion_window_in_days = 10
}
# ECR repository that receives the lifecycle Lambda's container image.
# Tags are immutable, images are scanned on push, and contents are
# encrypted with aws_kms_key.ecr.
resource "aws_ecr_repository" "main" {
  name                 = local.full_name
  image_tag_mutability = "IMMUTABLE"

  encryption_configuration {
    kms_key         = aws_kms_key.ecr.arn
    encryption_type = "KMS"
  }

  image_scanning_configuration {
    scan_on_push = true
  }
}
# UUID keyed on the hash of the zipped lambda/ directory. It is used as part
# of the image tag in docker_registry_image.lifecycle-lambda.
resource "random_uuid" "lifecycle-lambda" {
  keepers = {
    # Hash of the archive built from ${path.module}/lambda.
    lambda = data.archive_file.lifecycle-lambda.output_sha
  }
}
# Zips the lambda/ source directory. Within this file only the archive's
# output_sha is consumed (as the keeper of random_uuid.lifecycle-lambda).
data "archive_file" "lifecycle-lambda" {
  type        = "zip"
  source_dir  = "${path.module}/lambda"
  output_path = "${path.module}/.lambda.zip"
}
# Git repository located three directories above this module; its `branch`
# attribute is used when building the Lambda image tag.
data "git_repository" "main" {
  path = "${path.module}/../../../"
}
# Builds the image from ${path.module}/lambda/ and pushes it to the ECR
# repository declared in this file. The tag is "<branch>-<uuid>", where the
# UUID comes from random_uuid.lifecycle-lambda.
resource "docker_registry_image" "lifecycle-lambda" {
  # Docker tags may only contain [A-Za-z0-9_.-]. Git branch names commonly
  # contain "/" (e.g. "feature/foo"), which would produce an invalid image
  # reference, so every disallowed character is replaced with "-". Branch
  # names that are already valid tags are left unchanged.
  name = "${aws_ecr_repository.main.repository_url}:${replace(data.git_repository.main.branch, "/[^A-Za-z0-9_.-]/", "-")}-${random_uuid.lifecycle-lambda.result}"

  # Leave previously pushed images in the registry when this resource is
  # destroyed or replaced.
  keep_remotely = true

  build {
    context     = "${path.module}/lambda/"
    platform    = "linux/amd64"
    pull_parent = true
  }
}
# Scheduled rule that fires every 5 minutes; its target is the lifecycle
# Lambda (see aws_cloudwatch_event_target.lifecycle).
resource "aws_cloudwatch_event_rule" "lifecycle" {
  schedule_expression = "rate(5 minutes)"
  is_enabled          = true
  name_prefix         = local.full_name
}
# Routes the scheduled rule to the lifecycle Lambda function.
resource "aws_cloudwatch_event_target" "lifecycle" {
  arn  = aws_lambda_function.lifecycle.arn
  rule = aws_cloudwatch_event_rule.lifecycle.name
}
# Lifecycle Lambda, deployed as a container image (package_type = "Image")
# using the image pushed by docker_registry_image.lifecycle-lambda.
resource "aws_lambda_function" "lifecycle" {
  function_name = "${local.full_name}-lifecycle-lambda"
  role          = aws_iam_role.lifecycle-lambda.arn

  # Deployment package
  package_type = "Image"
  image_uri    = docker_registry_image.lifecycle-lambda.name

  # Runtime limits
  memory_size                    = 512
  timeout                        = 10
  reserved_concurrent_executions = -1

  kms_key_arn = var.kms_key.arn

  environment {
    variables = {
      # Identifier of the table passed in as var.dynamodb_table.
      DYNAMODB_LIFECYCLE_TABLE = var.dynamodb_table.id
    }
  }

  tracing_config {
    mode = "Active"
  }
}
# Lets EventBridge/CloudWatch Events invoke the lifecycle Lambda, restricted
# to invocations originating from the lifecycle schedule rule.
resource "aws_lambda_permission" "lifecycle" {
  function_name = aws_lambda_function.lifecycle.function_name
  action        = "lambda:InvokeFunction"
  source_arn    = aws_cloudwatch_event_rule.lifecycle.arn
  principal     = "events.amazonaws.com"
}
# Alarms when the 900-second average of the Fleet/sandbox "instances" metric
# with Type=totalCount is greater than 90. Missing data counts as breaching.
# NOTE(review): presumably this metric is published by the lifecycle Lambda
# (it is granted cloudwatch:PutMetricData) - confirm against the lambda code.
resource "aws_cloudwatch_metric_alarm" "totalInstances" {
  alarm_name = "${var.prefix}-lifecycle-totalCount"

  # Metric selection
  namespace   = "Fleet/sandbox"
  metric_name = "instances"
  dimensions = {
    Type = "totalCount"
  }

  # Evaluation
  statistic           = "Average"
  period              = "900"
  evaluation_periods  = "1"
  datapoints_to_alarm = 1
  comparison_operator = "GreaterThanThreshold"
  threshold           = "90"
  treat_missing_data  = "breaching"

  # Notifications go to the Slack-backed SNS topic on both state changes.
  ok_actions    = [module.notify_slack.slack_topic_arn]
  alarm_actions = [module.notify_slack.slack_topic_arn]
}
# Alarms when the 900-second average of the Fleet/sandbox "instances" metric
# with Type=unclaimedCount is less than 10. Missing data counts as breaching.
resource "aws_cloudwatch_metric_alarm" "unclaimed" {
  alarm_name = "${var.prefix}-lifecycle-unclaimed"

  # Metric selection
  namespace   = "Fleet/sandbox"
  metric_name = "instances"
  dimensions = {
    Type = "unclaimedCount"
  }

  # Evaluation
  statistic           = "Average"
  period              = "900"
  evaluation_periods  = "1"
  datapoints_to_alarm = 1
  comparison_operator = "LessThanThreshold"
  threshold           = "10"
  treat_missing_data  = "breaching"

  # Notifications go to the Slack-backed SNS topic on both state changes.
  ok_actions    = [module.notify_slack.slack_topic_arn]
  alarm_actions = [module.notify_slack.slack_topic_arn]
}
# One alarm per load-balancer 5XX metric (ELB-generated and target-generated).
# Each fires when the 120-second sum is greater than 0; missing data is
# treated as not breaching.
resource "aws_cloudwatch_metric_alarm" "lb" {
  for_each = toset(["HTTPCode_ELB_5XX_Count", "HTTPCode_Target_5XX_Count"])

  alarm_name = "${var.prefix}-lb-${each.key}"

  # Metric selection
  namespace   = "AWS/ApplicationELB"
  metric_name = each.key
  dimensions = {
    LoadBalancer = var.lb.arn_suffix
  }

  # Evaluation
  statistic           = "Sum"
  period              = "120"
  evaluation_periods  = "1"
  comparison_operator = "GreaterThanThreshold"
  threshold           = "0"
  treat_missing_data  = "notBreaching"

  # Notifications go to the Slack-backed SNS topic on both state changes.
  ok_actions    = [module.notify_slack.slack_topic_arn]
  alarm_actions = [module.notify_slack.slack_topic_arn]
}
# Alarm on the AWS/Lambda "Errors" metric for the function passed in as
# var.jitprovisioner. Fires when the 120-second sum is greater than 0;
# missing data is treated as not breaching.
resource "aws_cloudwatch_metric_alarm" "jitprovisioner" {
  # Single-element set; more AWS/Lambda metric names can be added here.
  for_each = toset(["Errors"])

  alarm_name = "${var.prefix}-jitprovisioner-${each.key}"

  # Metric selection
  namespace   = "AWS/Lambda"
  metric_name = each.key
  dimensions = {
    FunctionName = var.jitprovisioner.id
  }

  # Evaluation
  statistic           = "Sum"
  period              = "120"
  evaluation_periods  = "1"
  comparison_operator = "GreaterThanThreshold"
  threshold           = "0"
  treat_missing_data  = "notBreaching"

  # Notifications go to the Slack-backed SNS topic on both state changes.
  ok_actions    = [module.notify_slack.slack_topic_arn]
  alarm_actions = [module.notify_slack.slack_topic_arn]
}
# Alarm on the AWS/States "ExecutionsFailed" metric for the state machine
# passed in as var.deprovisioner. Fires when the 120-second sum is greater
# than 0; missing data is treated as not breaching.
resource "aws_cloudwatch_metric_alarm" "deprovisioner" {
  # Single-element set; more AWS/States metric names can be added here.
  for_each = toset(["ExecutionsFailed"])

  alarm_name = "${var.prefix}-deprovisioner-${each.key}"

  # Metric selection
  namespace   = "AWS/States"
  metric_name = each.key
  dimensions = {
    StateMachineArn = var.deprovisioner.arn
  }

  # Evaluation
  statistic           = "Sum"
  period              = "120"
  evaluation_periods  = "1"
  comparison_operator = "GreaterThanThreshold"
  threshold           = "0"
  treat_missing_data  = "notBreaching"

  # Notifications go to the Slack-backed SNS topic on both state changes.
  ok_actions    = [module.notify_slack.slack_topic_arn]
  alarm_actions = [module.notify_slack.slack_topic_arn]
}