refactor terraform to allow bootstrapping (#2662)

* refactor to allow bootstrapping * move monitoring into its own package, update readme * add variable for license_key * replication lag alarm less sensitive
2024-11-06 00:45:19 +00:00 · 2021-11-09 20:14:05 -05:00 · 2021-11-09 20:14:05 -05:00 · dac043f1f5
commit dac043f1f5
parent 2b8502ad61
8 changed files with 199 additions and 83 deletions
--- a/tools/terraform/ecs-iam.tf
+++ b/tools/terraform/ecs-iam.tf
@ -8,7 +8,7 @@ data "aws_iam_policy_document" "fleet" {
  statement {
    effect    = "Allow"
    actions   = ["secretsmanager:GetSecretValue"]
-    resources = [aws_secretsmanager_secret.database_password_secret.arn, data.aws_secretsmanager_secret.license.arn]
+    resources = [aws_secretsmanager_secret.database_password_secret.arn]
  }

  // useful when there is a static number of mysql cluster members
--- a/tools/terraform/ecs.tf
+++ b/tools/terraform/ecs.tf
@ -1,13 +1,15 @@
-//resource "aws_route53_record" "record" {
-//  name = "fleetdm"
-//  type = "A"
-//  zone_id = "Z046188311R47QSK245X"
-//  alias {
-//    evaluate_target_health = false
-//    name = aws_alb.main.dns_name
-//    zone_id = aws_alb.main.zone_id
-//  }
-//}
+data "aws_region" "current" {}
+
+resource "aws_route53_record" "record" {
+  name    = "fleet-alb-${terraform.workspace}"
+  type    = "A"
+  zone_id = aws_route53_zone.dogfood_fleetdm_com.zone_id
+  alias {
+    evaluate_target_health = false
+    name                   = aws_alb.main.dns_name
+    zone_id                = aws_alb.main.zone_id
+  }
+}

 resource "aws_alb" "main" {
  name            = "fleetdm"
@ -111,12 +113,6 @@ resource "aws_cloudwatch_log_group" "backend" {
  retention_in_days = 1
 }

-data "aws_region" "current" {}
-
-data "aws_secretsmanager_secret" "license" {
-  name = "/fleet/license"
-}
-
 resource "aws_ecs_task_definition" "backend" {
  family                   = "fleet"
  network_mode             = "awsvpc"
@ -159,10 +155,6 @@ resource "aws_ecs_task_definition" "backend" {
          {
            name      = "FLEET_MYSQL_READ_REPLICA_PASSWORD"
            valueFrom = aws_secretsmanager_secret.database_password_secret.arn
-          },
-          {
-            name      = "FLEET_LICENSE_KEY"
-            valueFrom = data.aws_secretsmanager_secret.license.arn
          }
        ]
        environment = [
@ -235,7 +227,7 @@ resource "aws_ecs_task_definition" "backend" {
            value = var.logging_debug
          },
          {
-            name = "FLEET_LOGGING_JSON"
+            name  = "FLEET_LOGGING_JSON"
            value = var.logging_json
          },
          {
@ -246,6 +238,10 @@ resource "aws_ecs_task_definition" "backend" {
            name  = "FLEET_S3_PREFIX"
            value = "carve_results/"
          },
+          {
+            name  = "FLEET_LICENSE_KEY"
+            value = var.fleet_license
+          }
        ]
      }
  ])
--- a/tools/terraform/main.tf
+++ b/tools/terraform/main.tf
@ -7,11 +7,11 @@ provider "aws" {
 }

 terraform {
-  // these values are hard-coded to prevent chicken before the egg situations
+  // these values should match what is bootstrapped in ./remote-state
  backend "s3" {
-    bucket = "fleet-terraform-remote-state"
-    region = "us-east-2"
-    key = "fleet/"
+    bucket         = "fleet-terraform-remote-state"
+    region         = "us-east-2"
+    key            = "fleet/"
    dynamodb_table = "fleet-terraform-state-lock"
  }
  required_providers {
@ -22,39 +22,4 @@ terraform {
  }
 }

-data "aws_caller_identity" "current" {}
-
-resource "aws_s3_bucket" "remote_state" {
-  bucket = "${var.prefix}-terraform-remote-state"
-  acl    = "private"
-  versioning {
-    enabled = true
-  }
-  lifecycle {
-    prevent_destroy = true
-  }
-  tags = {
-    Name = "S3 Remote Terraform State Store"
-  }
-}
-
-resource "aws_s3_bucket_public_access_block" "fleet_terraform_state" {
-  bucket              = aws_s3_bucket.remote_state.id
-  block_public_acls   = true
-  block_public_policy = true
-}
-
-resource "aws_dynamodb_table" "fleet_terraform_state_lock" {
-  name         = "fleet-terraform-state-lock"
-  hash_key     = "LockID"
-  billing_mode = "PAY_PER_REQUEST"
-
-  attribute {
-    name = "LockID"
-    type = "S"
-  }
-
-  tags = {
-    Name = "DynamoDB Terraform State Lock Table"
-  }
-}
+data "aws_caller_identity" "current" {}
--- a/tools/terraform/monitoring/monitoring.tf
+++ b/tools/terraform/monitoring/monitoring.tf
@ -1,3 +1,39 @@
+terraform {
+  // these values should match what is bootstrapped in ./remote-state
+  backend "s3" {
+    bucket         = "fleet-terraform-remote-state"
+    region         = "us-east-2"
+    key            = "fleet-monitoring/"
+    dynamodb_table = "fleet-terraform-state-lock"
+  }
+  required_providers {
+    aws = {
+      source  = "hashicorp/aws"
+      version = "3.57.0"
+    }
+  }
+}
+provider "aws" {
+  region = "us-east-2"
+}
+data "aws_caller_identity" "current" {}
+data "aws_region" "current" {}
+
+data "terraform_remote_state" "fleet" {
+  backend = "s3"
+  config = {
+    bucket = "fleet-terraform-remote-state"
+    region = "us-east-2"
+    key    = "env:/${terraform.workspace}/fleet"
+  }
+}
+
+locals {
+  fleet_ecs_service_name = data.terraform_remote_state.fleet.outputs.fleet_ecs_service_name
+  alb_target_group_name  = data.terraform_remote_state.fleet.outputs.aws_alb_target_group_name
+  alb_name               = data.terraform_remote_state.fleet.outputs.aws_alb_name
+}
+
 // sns topic to send cloudwatch alarms to
 resource "aws_sns_topic" "cloudwatch_alarm_topic" {
  name = "cloudwatch-alarm-${terraform.workspace}"
@ -67,7 +103,7 @@ data "aws_iam_policy_document" "sns_topic_policy" {

 // Database alarms
 resource "aws_cloudwatch_metric_alarm" "cpu_utilization_too_high" {
-  for_each            = toset(module.aurora_mysql.rds_cluster_instance_ids)
+  for_each            = data.terraform_remote_state.fleet.outputs.mysql_cluster_members
  alarm_name          = "rds_cpu_utilization_too_high-${each.key}-${terraform.workspace}"
  comparison_operator = "GreaterThanThreshold"
  evaluation_periods  = "1"
@ -89,7 +125,7 @@ resource "aws_db_event_subscription" "default" {
  sns_topic = aws_sns_topic.cloudwatch_alarm_topic.arn

  source_type = "db-instance"
-  source_ids  = module.aurora_mysql.rds_cluster_instance_ids
+  source_ids  = data.terraform_remote_state.fleet.outputs.mysql_cluster_members

  event_categories = [
    "failover",
@ -114,14 +150,14 @@ resource "aws_cloudwatch_metric_alarm" "alb_healthyhosts" {
  namespace           = "AWS/ApplicationELB"
  period              = "60"
  statistic           = "Minimum"
-  threshold           = var.fleet_min_capacity
-  alarm_description   = "This alarm indicates the number of Healthy Fleet hosts is lower than expected. Please investigate the load balancer \"${aws_alb.main.name}\" or the target group \"${aws_alb_target_group.main.name}\" and the fleet backend service \"${aws_ecs_service.fleet.name}\""
+  threshold           = data.terraform_remote_state.fleet.outputs.fleet_min_capacity
+  alarm_description   = "This alarm indicates the number of Healthy Fleet hosts is lower than expected. Please investigate the load balancer \"${local.alb_name}\" or the target group \"${local.alb_target_group_name}\" and the fleet backend service \"${local.fleet_ecs_service_name}\""
  actions_enabled     = "true"
  alarm_actions       = [aws_sns_topic.cloudwatch_alarm_topic.arn]
  ok_actions          = [aws_sns_topic.cloudwatch_alarm_topic.arn]
  dimensions = {
-    TargetGroup  = aws_alb_target_group.main.arn_suffix
-    LoadBalancer = aws_alb.main.arn_suffix
+    TargetGroup  = data.terraform_remote_state.fleet.outputs.target_group_arn_suffix
+    LoadBalancer = data.terraform_remote_state.fleet.outputs.load_balancer_arn_suffix
  }
 }

@ -131,7 +167,7 @@ resource "aws_cloudwatch_metric_alarm" "target_response_time" {
  comparison_operator       = "GreaterThanUpperThreshold"
  evaluation_periods        = "2"
  threshold_metric_id       = "e1"
-  alarm_description         = "This alarm indicates the Fleet server response time is greater than it usually is. Please investigate the ecs service \"${aws_ecs_service.fleet.name}\" because the backend might need to be scaled up."
+  alarm_description         = "This alarm indicates the Fleet server response time is greater than it usually is. Please investigate the ecs service \"${local.fleet_ecs_service_name}\" because the backend might need to be scaled up."
  alarm_actions             = [aws_sns_topic.cloudwatch_alarm_topic.arn]
  ok_actions                = [aws_sns_topic.cloudwatch_alarm_topic.arn]
  insufficient_data_actions = []
@ -154,8 +190,8 @@ resource "aws_cloudwatch_metric_alarm" "target_response_time" {
      unit        = "Count"

      dimensions = {
-        TargetGroup  = aws_alb_target_group.main.arn_suffix
-        LoadBalancer = aws_alb.main.arn_suffix
+        TargetGroup  = data.terraform_remote_state.fleet.outputs.target_group_arn_suffix
+        LoadBalancer = data.terraform_remote_state.fleet.outputs.load_balancer_arn_suffix
      }
    }
  }
@ -174,13 +210,13 @@ resource "aws_cloudwatch_metric_alarm" "httpcode_elb_5xx_count" {
  alarm_actions       = [aws_sns_topic.cloudwatch_alarm_topic.arn]
  ok_actions          = [aws_sns_topic.cloudwatch_alarm_topic.arn]
  dimensions = {
-    LoadBalancer = aws_alb.main.arn_suffix
+    LoadBalancer = data.terraform_remote_state.fleet.outputs.load_balancer_arn_suffix
  }
 }

 // Elasticache (redis) alerts https://docs.aws.amazon.com/AmazonElastiCache/latest/red-ug/CacheMetrics.WhichShouldIMonitor.html
 resource "aws_cloudwatch_metric_alarm" "redis_cpu" {
-  for_each            = toset(aws_elasticache_replication_group.default.member_clusters)
+  for_each            = data.terraform_remote_state.fleet.outputs.redis_cluster_members
  alarm_name          = "redis-cpu-utilization-${each.key}-${terraform.workspace}"
  alarm_description   = "Redis cluster CPU utilization node ${each.key}"
  comparison_operator = "GreaterThanThreshold"
@ -198,11 +234,10 @@ resource "aws_cloudwatch_metric_alarm" "redis_cpu" {
    CacheClusterId = each.key
  }

-  depends_on = [aws_elasticache_replication_group.default]
 }

 resource "aws_cloudwatch_metric_alarm" "redis_cpu_engine_utilization" {
-  for_each            = toset(aws_elasticache_replication_group.default.member_clusters)
+  for_each            = data.terraform_remote_state.fleet.outputs.redis_cluster_members
  alarm_name          = "redis-cpu-engine-utilization-${each.key}-${terraform.workspace}"
  alarm_description   = "Redis cluster CPU Engine utilization node ${each.key}"
  comparison_operator = "GreaterThanThreshold"
@ -220,7 +255,6 @@ resource "aws_cloudwatch_metric_alarm" "redis_cpu_engine_utilization" {
    CacheClusterId = each.key
  }

-  depends_on = [aws_elasticache_replication_group.default]
 }

 resource "aws_cloudwatch_metric_alarm" "redis-database-memory-percentage" {
@ -237,11 +271,10 @@ resource "aws_cloudwatch_metric_alarm" "redis-database-memory-percentage" {

  threshold = "80"

-  depends_on = [aws_elasticache_replication_group.default]
 }

 resource "aws_cloudwatch_metric_alarm" "redis-current-connections" {
-  for_each                  = toset(aws_elasticache_replication_group.default.member_clusters)
+  for_each                  = data.terraform_remote_state.fleet.outputs.redis_cluster_members
  alarm_name                = "redis-current-connections-${each.key}-${terraform.workspace}"
  alarm_description         = "Redis current connections for node ${each.key}"
  comparison_operator       = "LessThanLowerOrGreaterThanUpperThreshold"
@ -279,7 +312,7 @@ resource "aws_cloudwatch_metric_alarm" "redis-replication-lag" {
  alarm_name                = "redis-replication-lag-${terraform.workspace}"
  alarm_description         = "This metric is only applicable for a node running as a read replica. It represents how far behind, in seconds, the replica is in applying changes from the primary node. For Redis engine version 5.0.6 onwards, the lag can be measured in milliseconds."
  comparison_operator       = "GreaterThanUpperThreshold"
-  evaluation_periods        = "1"
+  evaluation_periods        = "3"
  threshold_metric_id       = "e1"
  alarm_actions             = [aws_sns_topic.cloudwatch_alarm_topic.arn]
  ok_actions                = [aws_sns_topic.cloudwatch_alarm_topic.arn]
@ -320,6 +353,6 @@ resource "aws_cloudwatch_metric_alarm" "acm_certificate_expired" {
  ok_actions          = [aws_sns_topic.cloudwatch_alarm_topic.arn]

  dimensions = {
-    CertificateArn = aws_acm_certificate.dogfood_fleetdm_com.arn
+    CertificateArn = data.terraform_remote_state.fleet.outputs.acm_certificate_arn
  }
 }
--- a/tools/terraform/outputs.tf
+++ b/tools/terraform/outputs.tf
@ -20,4 +20,40 @@ output "fleet-backend-task-revision" {

 output "fleet-migration-task-revision" {
  value = aws_ecs_task_definition.migration.revision
+}
+
+output "redis_cluster_members" {
+  value = toset(aws_elasticache_replication_group.default.member_clusters)
+}
+
+output "mysql_cluster_members" {
+  value = toset(module.aurora_mysql.rds_cluster_instance_ids)
+}
+
+output "acm_certificate_arn" {
+  value = aws_acm_certificate.dogfood_fleetdm_com.arn
+}
+
+output "load_balancer_arn_suffix" {
+  value = aws_alb.main.arn_suffix
+}
+
+output "target_group_arn_suffix" {
+  value = aws_alb_target_group.main.arn_suffix
+}
+
+output "fleet_min_capacity" {
+  value = var.fleet_min_capacity
+}
+
+output "fleet_ecs_service_name" {
+  value = aws_ecs_service.fleet.name
+}
+
+output "aws_alb_target_group_name" {
+  value = aws_alb_target_group.main.name
+}
+
+output "aws_alb_name" {
+  value = aws_alb.main.name
 }
--- a/tools/terraform/readme.md
+++ b/tools/terraform/readme.md
@ -1,10 +1,44 @@
 ## Terraform

-`terraform init && terraform workspace new dev`
+### Bootstrapping remote state

-`terraform plan`
+First we need to bootstrap our terraform remote state management. This lives outside the main project to avoid "chicken-and-egg"
+issues. We are going to create the remote state S3 bucket and DynamoDB state locking table and then use hardcoded values
+in the parent folder's `main.tf`.
+1. `cd remote-state`
+2. `terraform init`
+3. `terraform apply`

-`terraform apply`
+### Creating the Fleet infrastructure
+
+Create a new `tfvars` file, for example:
+
+```terraform
+fleet_backend_cpu  = 512
+fleet_backend_mem  = 4096 // 4GB needed for vuln processing
+redis_instance     = "cache.t3.micro"
+fleet_min_capacity = 2
+fleet_max_capacity = 5
+```
+
+If you have a Fleet license key you can include it in the `tfvars` file which will enable the paid features.
+
+```terraform
+fleet_license = "<your license key here>"
+```
+
+**To deploy the infrastructure**:
+1. `terraform init && terraform workspace new prod` (creating a workspace is optional; terraform defaults to the `default` workspace)
+2. `terraform plan -var-file=<your_tfvars_file>`
+3. `terraform apply -var-file=<your_tfvars_file>`
+
+**To deploy cloudwatch alarms** (requires the infrastructure to be deployed):
+1. `cd monitoring`
+2. `terraform init && terraform workspace new prod` (creating a workspace is optional; terraform defaults to the `default` workspace)
+3. `terraform plan -var-file=<your_tfvars_file>`
+4. `terraform apply -var-file=<your_tfvars_file>`
+
+Check out [AWS Chatbot](https://docs.aws.amazon.com/chatbot/latest/adminguide/setting-up.html) for a quick and easy way to hook up Cloudwatch Alarms into a Slack channel. 

 ### Configuration

--- a/tools/terraform/remote-state/main.tf
+++ b/tools/terraform/remote-state/main.tf
@ -0,0 +1,47 @@
+variable "prefix" {
+  default = "fleet"
+}
+
+variable "region" {
+  default = "us-east-2"
+}
+
+provider "aws" {
+  region = var.region
+}
+
+resource "aws_s3_bucket" "remote_state" {
+  bucket = "${var.prefix}-terraform-remote-state"
+  acl    = "private"
+  versioning {
+    enabled = true
+  }
+  lifecycle {
+    prevent_destroy = true
+  }
+  tags = {
+    Name = "S3 Remote Terraform State Store"
+  }
+}
+
+resource "aws_s3_bucket_public_access_block" "fleet_terraform_state" {
+  bucket              = aws_s3_bucket.remote_state.id
+  block_public_acls   = true
+  block_public_policy = true
+}
+
+
+resource "aws_dynamodb_table" "fleet_terraform_state_lock" {
+  name         = "${var.prefix}-terraform-state-lock"
+  hash_key     = "LockID"
+  billing_mode = "PAY_PER_REQUEST"
+
+  attribute {
+    name = "LockID"
+    type = "S"
+  }
+
+  tags = {
+    Name = "DynamoDB Terraform State Lock Table"
+  }
+}
--- a/tools/terraform/variables.tf
+++ b/tools/terraform/variables.tf
@ -103,4 +103,9 @@ variable "memory_tracking_target_value" {
 variable "cpu_tracking_target_value" {
  description = "target cpu utilization for target tracking policy (default 60%)"
  default     = 60
+}
+
+variable "fleet_license" {
+  description = "Fleet Premium license key"
+  default = ""
 }