From dac043f1f54ed1048e1c480f6c7f777a23ea2a85 Mon Sep 17 00:00:00 2001
From: Benjamin Edwards <edwards.benw@gmail.com>
Date: Tue, 9 Nov 2021 20:14:05 -0500
Subject: [PATCH] refactor terraform to allow bootstrapping (#2662)

* refactor to allow bootstrapping
* move monitoring into its own package, update readme
* add variable for license_key
* replication lag alarm less sensitive
---
 tools/terraform/ecs-iam.tf                    |  2 +-
 tools/terraform/ecs.tf                        | 38 +++++-----
 tools/terraform/main.tf                       | 45 ++----------
 .../terraform/{ => monitoring}/monitoring.tf  | 69 ++++++++++++++-----
 tools/terraform/outputs.tf                    | 36 ++++++++++
 tools/terraform/readme.md                     | 40 ++++++++++-
 tools/terraform/remote-state/main.tf          | 47 +++++++++++++
 tools/terraform/variables.tf                  |  5 ++
 8 files changed, 199 insertions(+), 83 deletions(-)
 rename tools/terraform/{ => monitoring}/monitoring.tf (81%)
 create mode 100644 tools/terraform/remote-state/main.tf

diff --git a/tools/terraform/ecs-iam.tf b/tools/terraform/ecs-iam.tf
index 33294dffa..c0acce1bd 100644
--- a/tools/terraform/ecs-iam.tf
+++ b/tools/terraform/ecs-iam.tf
@@ -8,7 +8,7 @@ data "aws_iam_policy_document" "fleet" {
   statement {
     effect    = "Allow"
     actions   = ["secretsmanager:GetSecretValue"]
-    resources = [aws_secretsmanager_secret.database_password_secret.arn, data.aws_secretsmanager_secret.license.arn]
+    resources = [aws_secretsmanager_secret.database_password_secret.arn]
   }
 
   // useful when there is a static number of mysql cluster members
diff --git a/tools/terraform/ecs.tf b/tools/terraform/ecs.tf
index 5a4fd13ce..187fadd5a 100644
--- a/tools/terraform/ecs.tf
+++ b/tools/terraform/ecs.tf
@@ -1,13 +1,15 @@
-//resource "aws_route53_record" "record" {
-//  name = "fleetdm"
-//  type = "A"
-//  zone_id = "Z046188311R47QSK245X"
-//  alias {
-//    evaluate_target_health = false
-//    name = aws_alb.main.dns_name
-//    zone_id = aws_alb.main.zone_id
-//  }
-//}
+data "aws_region" "current" {}
+
+resource "aws_route53_record" "record" { // alias A record that points a per-workspace hostname at the ALB
+  name    = "fleet-alb-${terraform.workspace}" // workspace name keeps records from different workspaces distinct
+  type    = "A"
+  zone_id = aws_route53_zone.dogfood_fleetdm_com.zone_id // hosted zone is declared outside this hunk
+  alias {
+    evaluate_target_health = false
+    name                   = aws_alb.main.dns_name
+    zone_id                = aws_alb.main.zone_id // the ALB's own zone id, not the hosted zone above
+  }
+}
 
 resource "aws_alb" "main" {
   name            = "fleetdm"
@@ -111,12 +113,6 @@ resource "aws_cloudwatch_log_group" "backend" {
   retention_in_days = 1
 }
 
-data "aws_region" "current" {}
-
-data "aws_secretsmanager_secret" "license" {
-  name = "/fleet/license"
-}
-
 resource "aws_ecs_task_definition" "backend" {
   family                   = "fleet"
   network_mode             = "awsvpc"
@@ -159,10 +155,6 @@ resource "aws_ecs_task_definition" "backend" {
           {
             name      = "FLEET_MYSQL_READ_REPLICA_PASSWORD"
             valueFrom = aws_secretsmanager_secret.database_password_secret.arn
-          },
-          {
-            name      = "FLEET_LICENSE_KEY"
-            valueFrom = data.aws_secretsmanager_secret.license.arn
           }
         ]
         environment = [
@@ -235,7 +227,7 @@ resource "aws_ecs_task_definition" "backend" {
             value = var.logging_debug
           },
           {
-            name = "FLEET_LOGGING_JSON"
+            name  = "FLEET_LOGGING_JSON"
             value = var.logging_json
           },
           {
@@ -246,6 +238,10 @@ resource "aws_ecs_task_definition" "backend" {
             name  = "FLEET_S3_PREFIX"
             value = "carve_results/"
           },
+          {
+            name  = "FLEET_LICENSE_KEY"
+            value = var.fleet_license
+          }
         ]
       }
   ])
diff --git a/tools/terraform/main.tf b/tools/terraform/main.tf
index ff46e7f4c..7a0d2a87e 100644
--- a/tools/terraform/main.tf
+++ b/tools/terraform/main.tf
@@ -7,11 +7,11 @@ provider "aws" {
 }
 
 terraform {
-  // these values are hard-coded to prevent chicken before the egg situations
+  // these values should match what is bootstrapped in ./remote-state
   backend "s3" {
-    bucket = "fleet-terraform-remote-state"
-    region = "us-east-2"
-    key = "fleet/"
+    bucket         = "fleet-terraform-remote-state"
+    region         = "us-east-2"
+    key            = "fleet/"
     dynamodb_table = "fleet-terraform-state-lock"
   }
   required_providers {
@@ -22,39 +22,4 @@ terraform {
   }
 }
 
-data "aws_caller_identity" "current" {}
-
-resource "aws_s3_bucket" "remote_state" {
-  bucket = "${var.prefix}-terraform-remote-state"
-  acl    = "private"
-  versioning {
-    enabled = true
-  }
-  lifecycle {
-    prevent_destroy = true
-  }
-  tags = {
-    Name = "S3 Remote Terraform State Store"
-  }
-}
-
-resource "aws_s3_bucket_public_access_block" "fleet_terraform_state" {
-  bucket              = aws_s3_bucket.remote_state.id
-  block_public_acls   = true
-  block_public_policy = true
-}
-
-resource "aws_dynamodb_table" "fleet_terraform_state_lock" {
-  name         = "fleet-terraform-state-lock"
-  hash_key     = "LockID"
-  billing_mode = "PAY_PER_REQUEST"
-
-  attribute {
-    name = "LockID"
-    type = "S"
-  }
-
-  tags = {
-    Name = "DynamoDB Terraform State Lock Table"
-  }
-}
\ No newline at end of file
+data "aws_caller_identity" "current" {}
\ No newline at end of file
diff --git a/tools/terraform/monitoring.tf b/tools/terraform/monitoring/monitoring.tf
similarity index 81%
rename from tools/terraform/monitoring.tf
rename to tools/terraform/monitoring/monitoring.tf
index 91ec76623..f7b484329 100644
--- a/tools/terraform/monitoring.tf
+++ b/tools/terraform/monitoring/monitoring.tf
@@ -1,3 +1,42 @@
+terraform {
+  // these values should match what is bootstrapped in ../remote-state
+  backend "s3" {
+    bucket         = "fleet-terraform-remote-state"
+    region         = "us-east-2"
+    key            = "fleet-monitoring/"
+    dynamodb_table = "fleet-terraform-state-lock"
+  }
+  required_providers {
+    aws = {
+      source  = "hashicorp/aws"
+      version = "3.57.0"
+    }
+  }
+}
+provider "aws" {
+  region = "us-east-2"
+}
+data "aws_caller_identity" "current" {}
+data "aws_region" "current" {}
+
+// Outputs of the main Fleet project. The key mirrors that project's backend key and the workspace is
+// passed explicitly, so the lookup also resolves when both projects run in the `default` workspace.
+data "terraform_remote_state" "fleet" {
+  backend   = "s3"
+  workspace = terraform.workspace
+  config = {
+    bucket = "fleet-terraform-remote-state"
+    region = "us-east-2"
+    key    = "fleet/"
+  }
+}
+
+// short aliases for the remote outputs that are interpolated into alarm descriptions
+locals {
+  fleet_ecs_service_name = data.terraform_remote_state.fleet.outputs.fleet_ecs_service_name
+  alb_target_group_name  = data.terraform_remote_state.fleet.outputs.aws_alb_target_group_name
+  alb_name               = data.terraform_remote_state.fleet.outputs.aws_alb_name
+}
+
 // sns topic to send cloudwatch alarms to
 resource "aws_sns_topic" "cloudwatch_alarm_topic" {
   name = "cloudwatch-alarm-${terraform.workspace}"
@@ -67,7 +103,7 @@ data "aws_iam_policy_document" "sns_topic_policy" {
 
 // Database alarms
 resource "aws_cloudwatch_metric_alarm" "cpu_utilization_too_high" {
-  for_each            = toset(module.aurora_mysql.rds_cluster_instance_ids)
+  for_each            = data.terraform_remote_state.fleet.outputs.mysql_cluster_members
   alarm_name          = "rds_cpu_utilization_too_high-${each.key}-${terraform.workspace}"
   comparison_operator = "GreaterThanThreshold"
   evaluation_periods  = "1"
@@ -89,7 +125,7 @@ resource "aws_db_event_subscription" "default" {
   sns_topic = aws_sns_topic.cloudwatch_alarm_topic.arn
 
   source_type = "db-instance"
-  source_ids  = module.aurora_mysql.rds_cluster_instance_ids
+  source_ids  = data.terraform_remote_state.fleet.outputs.mysql_cluster_members
 
   event_categories = [
     "failover",
@@ -114,14 +150,14 @@ resource "aws_cloudwatch_metric_alarm" "alb_healthyhosts" {
   namespace           = "AWS/ApplicationELB"
   period              = "60"
   statistic           = "Minimum"
-  threshold           = var.fleet_min_capacity
-  alarm_description   = "This alarm indicates the number of Healthy Fleet hosts is lower than expected. Please investigate the load balancer \"${aws_alb.main.name}\" or the target group \"${aws_alb_target_group.main.name}\" and the fleet backend service \"${aws_ecs_service.fleet.name}\""
+  threshold           = data.terraform_remote_state.fleet.outputs.fleet_min_capacity
+  alarm_description   = "This alarm indicates the number of Healthy Fleet hosts is lower than expected. Please investigate the load balancer \"${local.alb_name}\" or the target group \"${local.alb_target_group_name}\" and the fleet backend service \"${local.fleet_ecs_service_name}\""
   actions_enabled     = "true"
   alarm_actions       = [aws_sns_topic.cloudwatch_alarm_topic.arn]
   ok_actions          = [aws_sns_topic.cloudwatch_alarm_topic.arn]
   dimensions = {
-    TargetGroup  = aws_alb_target_group.main.arn_suffix
-    LoadBalancer = aws_alb.main.arn_suffix
+    TargetGroup  = data.terraform_remote_state.fleet.outputs.target_group_arn_suffix
+    LoadBalancer = data.terraform_remote_state.fleet.outputs.load_balancer_arn_suffix
   }
 }
 
@@ -131,7 +167,7 @@ resource "aws_cloudwatch_metric_alarm" "target_response_time" {
   comparison_operator       = "GreaterThanUpperThreshold"
   evaluation_periods        = "2"
   threshold_metric_id       = "e1"
-  alarm_description         = "This alarm indicates the Fleet server response time is greater than it usually is. Please investigate the ecs service \"${aws_ecs_service.fleet.name}\" because the backend might need to be scaled up."
+  alarm_description         = "This alarm indicates the Fleet server response time is greater than it usually is. Please investigate the ecs service \"${local.fleet_ecs_service_name}\" because the backend might need to be scaled up."
   alarm_actions             = [aws_sns_topic.cloudwatch_alarm_topic.arn]
   ok_actions                = [aws_sns_topic.cloudwatch_alarm_topic.arn]
   insufficient_data_actions = []
@@ -154,8 +190,8 @@ resource "aws_cloudwatch_metric_alarm" "target_response_time" {
       unit        = "Count"
 
       dimensions = {
-        TargetGroup  = aws_alb_target_group.main.arn_suffix
-        LoadBalancer = aws_alb.main.arn_suffix
+        TargetGroup  = data.terraform_remote_state.fleet.outputs.target_group_arn_suffix
+        LoadBalancer = data.terraform_remote_state.fleet.outputs.load_balancer_arn_suffix
       }
     }
   }
@@ -174,13 +210,13 @@ resource "aws_cloudwatch_metric_alarm" "httpcode_elb_5xx_count" {
   alarm_actions       = [aws_sns_topic.cloudwatch_alarm_topic.arn]
   ok_actions          = [aws_sns_topic.cloudwatch_alarm_topic.arn]
   dimensions = {
-    LoadBalancer = aws_alb.main.arn_suffix
+    LoadBalancer = data.terraform_remote_state.fleet.outputs.load_balancer_arn_suffix
   }
 }
 
 // Elasticache (redis) alerts https://docs.aws.amazon.com/AmazonElastiCache/latest/red-ug/CacheMetrics.WhichShouldIMonitor.html
 resource "aws_cloudwatch_metric_alarm" "redis_cpu" {
-  for_each            = toset(aws_elasticache_replication_group.default.member_clusters)
+  for_each            = data.terraform_remote_state.fleet.outputs.redis_cluster_members
   alarm_name          = "redis-cpu-utilization-${each.key}-${terraform.workspace}"
   alarm_description   = "Redis cluster CPU utilization node ${each.key}"
   comparison_operator = "GreaterThanThreshold"
@@ -198,11 +234,10 @@ resource "aws_cloudwatch_metric_alarm" "redis_cpu" {
     CacheClusterId = each.key
   }
 
-  depends_on = [aws_elasticache_replication_group.default]
 }
 
 resource "aws_cloudwatch_metric_alarm" "redis_cpu_engine_utilization" {
-  for_each            = toset(aws_elasticache_replication_group.default.member_clusters)
+  for_each            = data.terraform_remote_state.fleet.outputs.redis_cluster_members
   alarm_name          = "redis-cpu-engine-utilization-${each.key}-${terraform.workspace}"
   alarm_description   = "Redis cluster CPU Engine utilization node ${each.key}"
   comparison_operator = "GreaterThanThreshold"
@@ -220,7 +255,6 @@ resource "aws_cloudwatch_metric_alarm" "redis_cpu_engine_utilization" {
     CacheClusterId = each.key
   }
 
-  depends_on = [aws_elasticache_replication_group.default]
 }
 
 resource "aws_cloudwatch_metric_alarm" "redis-database-memory-percentage" {
@@ -237,11 +271,10 @@ resource "aws_cloudwatch_metric_alarm" "redis-database-memory-percentage" {
 
   threshold = "80"
 
-  depends_on = [aws_elasticache_replication_group.default]
 }
 
 resource "aws_cloudwatch_metric_alarm" "redis-current-connections" {
-  for_each                  = toset(aws_elasticache_replication_group.default.member_clusters)
+  for_each                  = data.terraform_remote_state.fleet.outputs.redis_cluster_members
   alarm_name                = "redis-current-connections-${each.key}-${terraform.workspace}"
   alarm_description         = "Redis current connections for node ${each.key}"
   comparison_operator       = "LessThanLowerOrGreaterThanUpperThreshold"
@@ -279,7 +312,7 @@ resource "aws_cloudwatch_metric_alarm" "redis-replication-lag" {
   alarm_name                = "redis-replication-lag-${terraform.workspace}"
   alarm_description         = "This metric is only applicable for a node running as a read replica. It represents how far behind, in seconds, the replica is in applying changes from the primary node. For Redis engine version 5.0.6 onwards, the lag can be measured in milliseconds."
   comparison_operator       = "GreaterThanUpperThreshold"
-  evaluation_periods        = "1"
+  evaluation_periods        = "3"
   threshold_metric_id       = "e1"
   alarm_actions             = [aws_sns_topic.cloudwatch_alarm_topic.arn]
   ok_actions                = [aws_sns_topic.cloudwatch_alarm_topic.arn]
@@ -320,6 +353,6 @@ resource "aws_cloudwatch_metric_alarm" "acm_certificate_expired" {
   ok_actions          = [aws_sns_topic.cloudwatch_alarm_topic.arn]
 
   dimensions = {
-    CertificateArn = aws_acm_certificate.dogfood_fleetdm_com.arn
+    CertificateArn = data.terraform_remote_state.fleet.outputs.acm_certificate_arn
   }
 }
\ No newline at end of file
diff --git a/tools/terraform/outputs.tf b/tools/terraform/outputs.tf
index 287664987..62f710959 100644
--- a/tools/terraform/outputs.tf
+++ b/tools/terraform/outputs.tf
@@ -20,4 +20,40 @@ output "fleet-backend-task-revision" {
 
 output "fleet-migration-task-revision" {
   value = aws_ecs_task_definition.migration.revision
+}
+
+output "redis_cluster_members" { // set of ElastiCache member cluster ids; the monitoring project iterates it with for_each
+  value = toset(aws_elasticache_replication_group.default.member_clusters)
+}
+
+output "mysql_cluster_members" { // set of RDS instance ids; used by monitoring for per-instance alarms and the DB event subscription
+  value = toset(module.aurora_mysql.rds_cluster_instance_ids)
+}
+
+output "acm_certificate_arn" { // CertificateArn dimension of the certificate expiry alarm
+  value = aws_acm_certificate.dogfood_fleetdm_com.arn
+}
+
+output "load_balancer_arn_suffix" { // LoadBalancer dimension of the ALB alarms
+  value = aws_alb.main.arn_suffix
+}
+
+output "target_group_arn_suffix" { // TargetGroup dimension of the ALB alarms
+  value = aws_alb_target_group.main.arn_suffix
+}
+
+output "fleet_min_capacity" { // threshold of the healthy-hosts alarm
+  value = var.fleet_min_capacity
+}
+
+output "fleet_ecs_service_name" { // interpolated into alarm descriptions
+  value = aws_ecs_service.fleet.name
+}
+
+output "aws_alb_target_group_name" { // interpolated into alarm descriptions
+  value = aws_alb_target_group.main.name
+}
+
+output "aws_alb_name" { // interpolated into alarm descriptions
+  value = aws_alb.main.name
 }
\ No newline at end of file
diff --git a/tools/terraform/readme.md b/tools/terraform/readme.md
index 5c6cb4baa..c82828a0b 100644
--- a/tools/terraform/readme.md
+++ b/tools/terraform/readme.md
@@ -1,10 +1,44 @@
 ## Terraform
 
-`terraform init && terraform workspace new dev`
+### Bootstrapping remote state
 
-`terraform plan`
+First we need to bootstrap our terraform remote state management. This lives outside the main project to avoid "chicken-and-egg"
+issues. We are going to create the remote state S3 bucket and DynamoDB state locking table, and then use their hardcoded names
+in the parent folder's `main.tf`.
+1. `cd remote-state`
+2. `terraform init`
+3. `terraform apply`
 
-`terraform apply`
+### Creating the Fleet infrastructure
+
+Create a new `tfvars` file, for example:
+
+```terraform
+fleet_backend_cpu  = 512
+fleet_backend_mem  = 4096 // 4GB needed for vuln processing
+redis_instance     = "cache.t3.micro"
+fleet_min_capacity = 2
+fleet_max_capacity = 5
+```
+
+If you have a Fleet license key you can include it in the `tfvars` file which will enable the paid features.
+
+```terraform
+fleet_license = "<your license key here>"
+```
+
+**To deploy the infrastructure**:
+1. `terraform init && terraform workspace new prod` (creating a workspace is optional; terraform defaults to the `default` workspace)
+2. `terraform plan -var-file=<your_tfvars_file>`
+3. `terraform apply -var-file=<your_tfvars_file>`
+
+**To deploy cloudwatch alarms** (requires infrastructure to be deployed)
+1. `cd monitoring`
+2. `terraform init && terraform workspace new prod` (creating a workspace is optional; terraform defaults to the `default` workspace)
+3. `terraform plan -var-file=<your_tfvars_file>`
+4. `terraform apply -var-file=<your_tfvars_file>`
+
+Check out [AWS Chatbot](https://docs.aws.amazon.com/chatbot/latest/adminguide/setting-up.html) for a quick and easy way to hook up Cloudwatch Alarms into a Slack channel. 
 
 ### Configuration
 
diff --git a/tools/terraform/remote-state/main.tf b/tools/terraform/remote-state/main.tf
new file mode 100644
index 000000000..60ace8535
--- /dev/null
+++ b/tools/terraform/remote-state/main.tf
@@ -0,0 +1,59 @@
+// Bootstraps the S3 bucket and DynamoDB lock table that the other projects use as their remote state backend.
+terraform {
+  required_providers {
+    aws = {
+      source  = "hashicorp/aws"
+      version = "3.57.0" // same pin as ../monitoring; the inline acl/versioning arguments below are 3.x syntax
+    }
+  }
+}
+
+variable "prefix" { // changing this requires updating the bucket/table names hard-coded in the other projects' backend blocks
+  default = "fleet"
+}
+
+variable "region" {
+  default = "us-east-2"
+}
+
+provider "aws" {
+  region = var.region
+}
+
+resource "aws_s3_bucket" "remote_state" {
+  bucket = "${var.prefix}-terraform-remote-state"
+  acl    = "private"
+  versioning {
+    enabled = true
+  }
+  lifecycle {
+    prevent_destroy = true
+  }
+  tags = {
+    Name = "S3 Remote Terraform State Store"
+  }
+}
+
+// state files can contain secrets, so enable all four public access guards
+resource "aws_s3_bucket_public_access_block" "fleet_terraform_state" {
+  bucket                  = aws_s3_bucket.remote_state.id
+  block_public_acls       = true
+  block_public_policy     = true
+  ignore_public_acls      = true
+  restrict_public_buckets = true
+}
+
+resource "aws_dynamodb_table" "fleet_terraform_state_lock" {
+  name         = "${var.prefix}-terraform-state-lock"
+  hash_key     = "LockID" // attribute name the S3 backend uses for its lock items
+  billing_mode = "PAY_PER_REQUEST"
+
+  attribute {
+    name = "LockID"
+    type = "S"
+  }
+
+  tags = {
+    Name = "DynamoDB Terraform State Lock Table"
+  }
+}
\ No newline at end of file
diff --git a/tools/terraform/variables.tf b/tools/terraform/variables.tf
index 2ab84d9d8..a222b69d7 100644
--- a/tools/terraform/variables.tf
+++ b/tools/terraform/variables.tf
@@ -103,4 +103,9 @@ variable "memory_tracking_target_value" {
 variable "cpu_tracking_target_value" {
   description = "target cpu utilization for target tracking policy (default 60%)"
   default     = 60
+}
+
+variable "fleet_license" { // passed to the backend container as FLEET_LICENSE_KEY
+  description = "Fleet Premium license key"
+  default     = "" // empty string when no license key is supplied
 }
\ No newline at end of file