refactor terraform to allow bootstrapping (#2662)

* refactor to allow bootstrapping
* move monitoring into its own package, update readme
* add variable for license_key
* replication lag alarm less sensitive
This commit is contained in:
Benjamin Edwards 2021-11-09 20:14:05 -05:00 committed by GitHub
parent 2b8502ad61
commit dac043f1f5
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 199 additions and 83 deletions

View File

@ -8,7 +8,7 @@ data "aws_iam_policy_document" "fleet" {
statement { statement {
effect = "Allow" effect = "Allow"
actions = ["secretsmanager:GetSecretValue"] actions = ["secretsmanager:GetSecretValue"]
resources = [aws_secretsmanager_secret.database_password_secret.arn, data.aws_secretsmanager_secret.license.arn] resources = [aws_secretsmanager_secret.database_password_secret.arn]
} }
// useful when there is a static number of mysql cluster members // useful when there is a static number of mysql cluster members

View File

@ -1,13 +1,15 @@
//resource "aws_route53_record" "record" { data "aws_region" "current" {}
// name = "fleetdm"
// type = "A" resource "aws_route53_record" "record" {
// zone_id = "Z046188311R47QSK245X" name = "fleet-alb-${terraform.workspace}"
// alias { type = "A"
// evaluate_target_health = false zone_id = aws_route53_zone.dogfood_fleetdm_com.zone_id
// name = aws_alb.main.dns_name alias {
// zone_id = aws_alb.main.zone_id evaluate_target_health = false
// } name = aws_alb.main.dns_name
//} zone_id = aws_alb.main.zone_id
}
}
resource "aws_alb" "main" { resource "aws_alb" "main" {
name = "fleetdm" name = "fleetdm"
@ -111,12 +113,6 @@ resource "aws_cloudwatch_log_group" "backend" {
retention_in_days = 1 retention_in_days = 1
} }
data "aws_region" "current" {}
data "aws_secretsmanager_secret" "license" {
name = "/fleet/license"
}
resource "aws_ecs_task_definition" "backend" { resource "aws_ecs_task_definition" "backend" {
family = "fleet" family = "fleet"
network_mode = "awsvpc" network_mode = "awsvpc"
@ -159,10 +155,6 @@ resource "aws_ecs_task_definition" "backend" {
{ {
name = "FLEET_MYSQL_READ_REPLICA_PASSWORD" name = "FLEET_MYSQL_READ_REPLICA_PASSWORD"
valueFrom = aws_secretsmanager_secret.database_password_secret.arn valueFrom = aws_secretsmanager_secret.database_password_secret.arn
},
{
name = "FLEET_LICENSE_KEY"
valueFrom = data.aws_secretsmanager_secret.license.arn
} }
] ]
environment = [ environment = [
@ -235,7 +227,7 @@ resource "aws_ecs_task_definition" "backend" {
value = var.logging_debug value = var.logging_debug
}, },
{ {
name = "FLEET_LOGGING_JSON" name = "FLEET_LOGGING_JSON"
value = var.logging_json value = var.logging_json
}, },
{ {
@ -246,6 +238,10 @@ resource "aws_ecs_task_definition" "backend" {
name = "FLEET_S3_PREFIX" name = "FLEET_S3_PREFIX"
value = "carve_results/" value = "carve_results/"
}, },
{
name = "FLEET_LICENSE_KEY"
value = var.fleet_license
}
] ]
} }
]) ])

View File

@ -7,11 +7,11 @@ provider "aws" {
} }
terraform { terraform {
// these values are hard-coded to prevent chicken before the egg situations // these values should match what is bootstrapped in ./remote-state
backend "s3" { backend "s3" {
bucket = "fleet-terraform-remote-state" bucket = "fleet-terraform-remote-state"
region = "us-east-2" region = "us-east-2"
key = "fleet/" key = "fleet/"
dynamodb_table = "fleet-terraform-state-lock" dynamodb_table = "fleet-terraform-state-lock"
} }
required_providers { required_providers {
@ -22,39 +22,4 @@ terraform {
} }
} }
data "aws_caller_identity" "current" {} data "aws_caller_identity" "current" {}
resource "aws_s3_bucket" "remote_state" {
bucket = "${var.prefix}-terraform-remote-state"
acl = "private"
versioning {
enabled = true
}
lifecycle {
prevent_destroy = true
}
tags = {
Name = "S3 Remote Terraform State Store"
}
}
resource "aws_s3_bucket_public_access_block" "fleet_terraform_state" {
bucket = aws_s3_bucket.remote_state.id
block_public_acls = true
block_public_policy = true
}
resource "aws_dynamodb_table" "fleet_terraform_state_lock" {
name = "fleet-terraform-state-lock"
hash_key = "LockID"
billing_mode = "PAY_PER_REQUEST"
attribute {
name = "LockID"
type = "S"
}
tags = {
Name = "DynamoDB Terraform State Lock Table"
}
}

View File

@ -1,3 +1,39 @@
// Terraform settings for the monitoring project: remote state backend and the
// pinned AWS provider version.
terraform {
// these values should match what is bootstrapped in ./remote-state
backend "s3" {
bucket = "fleet-terraform-remote-state"
region = "us-east-2"
// monitoring keeps its own state object, separate from the main project's "fleet/" key
key = "fleet-monitoring/"
// DynamoDB table used for state locking
dynamodb_table = "fleet-terraform-state-lock"
}
required_providers {
aws = {
source = "hashicorp/aws"
// exact provider version pin
version = "3.57.0"
}
}
}
provider "aws" {
region = "us-east-2"
}
data "aws_caller_identity" "current" {}
data "aws_region" "current" {}
// Reads the outputs published by the main Fleet infrastructure project so the
// alarms in this project can reference its ALB, ECS service, RDS and Redis
// resources without managing them here.
data "terraform_remote_state" "fleet" {
  backend = "s3"

  // Let the backend resolve the per-workspace state object instead of
  // hand-building an "env:/<workspace>/..." key. The S3 backend stores the
  // `default` workspace at the bare key and only non-default workspaces under
  // the "env:" prefix, so a hard-coded prefix cannot find the state when the
  // infrastructure was deployed in the `default` workspace.
  workspace = terraform.workspace

  // these values should match the backend block of the main Fleet project
  config = {
    bucket = "fleet-terraform-remote-state"
    region = "us-east-2"
    key    = "fleet/"
  }
}
// Short aliases for names exported by the main Fleet project's remote state;
// they are interpolated into the alarm descriptions below.
locals {
fleet_ecs_service_name = data.terraform_remote_state.fleet.outputs.fleet_ecs_service_name
alb_target_group_name = data.terraform_remote_state.fleet.outputs.aws_alb_target_group_name
alb_name = data.terraform_remote_state.fleet.outputs.aws_alb_name
}
// sns topic to send cloudwatch alarms to // sns topic to send cloudwatch alarms to
resource "aws_sns_topic" "cloudwatch_alarm_topic" { resource "aws_sns_topic" "cloudwatch_alarm_topic" {
name = "cloudwatch-alarm-${terraform.workspace}" name = "cloudwatch-alarm-${terraform.workspace}"
@ -67,7 +103,7 @@ data "aws_iam_policy_document" "sns_topic_policy" {
// Database alarms // Database alarms
resource "aws_cloudwatch_metric_alarm" "cpu_utilization_too_high" { resource "aws_cloudwatch_metric_alarm" "cpu_utilization_too_high" {
for_each = toset(module.aurora_mysql.rds_cluster_instance_ids) for_each = data.terraform_remote_state.fleet.outputs.mysql_cluster_members
alarm_name = "rds_cpu_utilization_too_high-${each.key}-${terraform.workspace}" alarm_name = "rds_cpu_utilization_too_high-${each.key}-${terraform.workspace}"
comparison_operator = "GreaterThanThreshold" comparison_operator = "GreaterThanThreshold"
evaluation_periods = "1" evaluation_periods = "1"
@ -89,7 +125,7 @@ resource "aws_db_event_subscription" "default" {
sns_topic = aws_sns_topic.cloudwatch_alarm_topic.arn sns_topic = aws_sns_topic.cloudwatch_alarm_topic.arn
source_type = "db-instance" source_type = "db-instance"
source_ids = module.aurora_mysql.rds_cluster_instance_ids source_ids = data.terraform_remote_state.fleet.outputs.mysql_cluster_members
event_categories = [ event_categories = [
"failover", "failover",
@ -114,14 +150,14 @@ resource "aws_cloudwatch_metric_alarm" "alb_healthyhosts" {
namespace = "AWS/ApplicationELB" namespace = "AWS/ApplicationELB"
period = "60" period = "60"
statistic = "Minimum" statistic = "Minimum"
threshold = var.fleet_min_capacity threshold = data.terraform_remote_state.fleet.outputs.fleet_min_capacity
alarm_description = "This alarm indicates the number of Healthy Fleet hosts is lower than expected. Please investigate the load balancer \"${aws_alb.main.name}\" or the target group \"${aws_alb_target_group.main.name}\" and the fleet backend service \"${aws_ecs_service.fleet.name}\"" alarm_description = "This alarm indicates the number of Healthy Fleet hosts is lower than expected. Please investigate the load balancer \"${local.alb_name}\" or the target group \"${local.alb_target_group_name}\" and the fleet backend service \"${local.fleet_ecs_service_name}\""
actions_enabled = "true" actions_enabled = "true"
alarm_actions = [aws_sns_topic.cloudwatch_alarm_topic.arn] alarm_actions = [aws_sns_topic.cloudwatch_alarm_topic.arn]
ok_actions = [aws_sns_topic.cloudwatch_alarm_topic.arn] ok_actions = [aws_sns_topic.cloudwatch_alarm_topic.arn]
dimensions = { dimensions = {
TargetGroup = aws_alb_target_group.main.arn_suffix TargetGroup = data.terraform_remote_state.fleet.outputs.target_group_arn_suffix
LoadBalancer = aws_alb.main.arn_suffix LoadBalancer = data.terraform_remote_state.fleet.outputs.load_balancer_arn_suffix
} }
} }
@ -131,7 +167,7 @@ resource "aws_cloudwatch_metric_alarm" "target_response_time" {
comparison_operator = "GreaterThanUpperThreshold" comparison_operator = "GreaterThanUpperThreshold"
evaluation_periods = "2" evaluation_periods = "2"
threshold_metric_id = "e1" threshold_metric_id = "e1"
alarm_description = "This alarm indicates the Fleet server response time is greater than it usually is. Please investigate the ecs service \"${aws_ecs_service.fleet.name}\" because the backend might need to be scaled up." alarm_description = "This alarm indicates the Fleet server response time is greater than it usually is. Please investigate the ecs service \"${local.fleet_ecs_service_name}\" because the backend might need to be scaled up."
alarm_actions = [aws_sns_topic.cloudwatch_alarm_topic.arn] alarm_actions = [aws_sns_topic.cloudwatch_alarm_topic.arn]
ok_actions = [aws_sns_topic.cloudwatch_alarm_topic.arn] ok_actions = [aws_sns_topic.cloudwatch_alarm_topic.arn]
insufficient_data_actions = [] insufficient_data_actions = []
@ -154,8 +190,8 @@ resource "aws_cloudwatch_metric_alarm" "target_response_time" {
unit = "Count" unit = "Count"
dimensions = { dimensions = {
TargetGroup = aws_alb_target_group.main.arn_suffix TargetGroup = data.terraform_remote_state.fleet.outputs.target_group_arn_suffix
LoadBalancer = aws_alb.main.arn_suffix LoadBalancer = data.terraform_remote_state.fleet.outputs.load_balancer_arn_suffix
} }
} }
} }
@ -174,13 +210,13 @@ resource "aws_cloudwatch_metric_alarm" "httpcode_elb_5xx_count" {
alarm_actions = [aws_sns_topic.cloudwatch_alarm_topic.arn] alarm_actions = [aws_sns_topic.cloudwatch_alarm_topic.arn]
ok_actions = [aws_sns_topic.cloudwatch_alarm_topic.arn] ok_actions = [aws_sns_topic.cloudwatch_alarm_topic.arn]
dimensions = { dimensions = {
LoadBalancer = aws_alb.main.arn_suffix LoadBalancer = data.terraform_remote_state.fleet.outputs.load_balancer_arn_suffix
} }
} }
// Elasticache (redis) alerts https://docs.aws.amazon.com/AmazonElastiCache/latest/red-ug/CacheMetrics.WhichShouldIMonitor.html // Elasticache (redis) alerts https://docs.aws.amazon.com/AmazonElastiCache/latest/red-ug/CacheMetrics.WhichShouldIMonitor.html
resource "aws_cloudwatch_metric_alarm" "redis_cpu" { resource "aws_cloudwatch_metric_alarm" "redis_cpu" {
for_each = toset(aws_elasticache_replication_group.default.member_clusters) for_each = data.terraform_remote_state.fleet.outputs.redis_cluster_members
alarm_name = "redis-cpu-utilization-${each.key}-${terraform.workspace}" alarm_name = "redis-cpu-utilization-${each.key}-${terraform.workspace}"
alarm_description = "Redis cluster CPU utilization node ${each.key}" alarm_description = "Redis cluster CPU utilization node ${each.key}"
comparison_operator = "GreaterThanThreshold" comparison_operator = "GreaterThanThreshold"
@ -198,11 +234,10 @@ resource "aws_cloudwatch_metric_alarm" "redis_cpu" {
CacheClusterId = each.key CacheClusterId = each.key
} }
depends_on = [aws_elasticache_replication_group.default]
} }
resource "aws_cloudwatch_metric_alarm" "redis_cpu_engine_utilization" { resource "aws_cloudwatch_metric_alarm" "redis_cpu_engine_utilization" {
for_each = toset(aws_elasticache_replication_group.default.member_clusters) for_each = data.terraform_remote_state.fleet.outputs.redis_cluster_members
alarm_name = "redis-cpu-engine-utilization-${each.key}-${terraform.workspace}" alarm_name = "redis-cpu-engine-utilization-${each.key}-${terraform.workspace}"
alarm_description = "Redis cluster CPU Engine utilization node ${each.key}" alarm_description = "Redis cluster CPU Engine utilization node ${each.key}"
comparison_operator = "GreaterThanThreshold" comparison_operator = "GreaterThanThreshold"
@ -220,7 +255,6 @@ resource "aws_cloudwatch_metric_alarm" "redis_cpu_engine_utilization" {
CacheClusterId = each.key CacheClusterId = each.key
} }
depends_on = [aws_elasticache_replication_group.default]
} }
resource "aws_cloudwatch_metric_alarm" "redis-database-memory-percentage" { resource "aws_cloudwatch_metric_alarm" "redis-database-memory-percentage" {
@ -237,11 +271,10 @@ resource "aws_cloudwatch_metric_alarm" "redis-database-memory-percentage" {
threshold = "80" threshold = "80"
depends_on = [aws_elasticache_replication_group.default]
} }
resource "aws_cloudwatch_metric_alarm" "redis-current-connections" { resource "aws_cloudwatch_metric_alarm" "redis-current-connections" {
for_each = toset(aws_elasticache_replication_group.default.member_clusters) for_each = data.terraform_remote_state.fleet.outputs.redis_cluster_members
alarm_name = "redis-current-connections-${each.key}-${terraform.workspace}" alarm_name = "redis-current-connections-${each.key}-${terraform.workspace}"
alarm_description = "Redis current connections for node ${each.key}" alarm_description = "Redis current connections for node ${each.key}"
comparison_operator = "LessThanLowerOrGreaterThanUpperThreshold" comparison_operator = "LessThanLowerOrGreaterThanUpperThreshold"
@ -279,7 +312,7 @@ resource "aws_cloudwatch_metric_alarm" "redis-replication-lag" {
alarm_name = "redis-replication-lag-${terraform.workspace}" alarm_name = "redis-replication-lag-${terraform.workspace}"
alarm_description = "This metric is only applicable for a node running as a read replica. It represents how far behind, in seconds, the replica is in applying changes from the primary node. For Redis engine version 5.0.6 onwards, the lag can be measured in milliseconds." alarm_description = "This metric is only applicable for a node running as a read replica. It represents how far behind, in seconds, the replica is in applying changes from the primary node. For Redis engine version 5.0.6 onwards, the lag can be measured in milliseconds."
comparison_operator = "GreaterThanUpperThreshold" comparison_operator = "GreaterThanUpperThreshold"
evaluation_periods = "1" evaluation_periods = "3"
threshold_metric_id = "e1" threshold_metric_id = "e1"
alarm_actions = [aws_sns_topic.cloudwatch_alarm_topic.arn] alarm_actions = [aws_sns_topic.cloudwatch_alarm_topic.arn]
ok_actions = [aws_sns_topic.cloudwatch_alarm_topic.arn] ok_actions = [aws_sns_topic.cloudwatch_alarm_topic.arn]
@ -320,6 +353,6 @@ resource "aws_cloudwatch_metric_alarm" "acm_certificate_expired" {
ok_actions = [aws_sns_topic.cloudwatch_alarm_topic.arn] ok_actions = [aws_sns_topic.cloudwatch_alarm_topic.arn]
dimensions = { dimensions = {
CertificateArn = aws_acm_certificate.dogfood_fleetdm_com.arn CertificateArn = data.terraform_remote_state.fleet.outputs.acm_certificate_arn
} }
} }

View File

@ -20,4 +20,40 @@ output "fleet-backend-task-revision" {
output "fleet-migration-task-revision" { output "fleet-migration-task-revision" {
value = aws_ecs_task_definition.migration.revision value = aws_ecs_task_definition.migration.revision
}
output "redis_cluster_members" {
value = toset(aws_elasticache_replication_group.default.member_clusters)
}
output "mysql_cluster_members" {
value = toset(module.aurora_mysql.rds_cluster_instance_ids)
}
output "acm_certificate_arn" {
value = aws_acm_certificate.dogfood_fleetdm_com.arn
}
output "load_balancer_arn_suffix" {
value = aws_alb.main.arn_suffix
}
output "target_group_arn_suffix" {
value = aws_alb_target_group.main.arn_suffix
}
output "fleet_min_capacity" {
value = var.fleet_min_capacity
}
output "fleet_ecs_service_name" {
value = aws_ecs_service.fleet.name
}
output "aws_alb_target_group_name" {
value = aws_alb_target_group.main.name
}
output "aws_alb_name" {
value = aws_alb.main.name
} }

View File

@ -1,10 +1,44 @@
## Terraform ## Terraform
`terraform init && terraform workspace new dev` ### Bootstrapping remote state
`terraform plan` First we need to bootstrap our terraform remote state management. This lives outside the main project to avoid "chicken before the egg"
issues. We are going to create the remote state S3 bucket and the DynamoDB state-locking table, and then use the same
hardcoded values in the `backend` block of the parent folder's `main.tf`.
1. `cd remote-state`
2. `terraform init`
3. `terraform apply`
`terraform apply` ### Creating the Fleet infrastructure
Create a new `tfvars` file for example:
```terraform
fleet_backend_cpu = 512
fleet_backend_mem = 4096 // 4GB needed for vuln processing
redis_instance = "cache.t3.micro"
fleet_min_capacity = 2
fleet_max_capacity = 5
```
If you have a Fleet license key, you can include it in the `tfvars` file, which will enable the paid features.
```terraform
fleet_license = "<your license key here>"
```
**To deploy the infrastructure**:
1. `terraform init && terraform workspace new prod` (creating a workspace is optional; Terraform uses the `default` workspace otherwise)
2. `terraform plan -var-file=<your_tfvars_file>`
3. `terraform apply -var-file=<your_tfvars_file>`
**To deploy cloudwatch alarms** (requires infrastructure to be deployed)
1. `cd monitoring`
2. `terraform init && terraform workspace new prod` (creating a workspace is optional; Terraform uses the `default` workspace otherwise)
3. `terraform plan -var-file=<your_tfvars_file>`
4. `terraform apply -var-file=<your_tfvars_file>`
Check out [AWS Chatbot](https://docs.aws.amazon.com/chatbot/latest/adminguide/setting-up.html) for a quick and easy way to hook up Cloudwatch Alarms into a Slack channel.
### Configuration ### Configuration

View File

@ -0,0 +1,47 @@
// Name prefix for the bootstrapped state resources. The default "fleet" yields
// the bucket and lock-table names that the other projects hard-code in their
// backend blocks.
variable "prefix" {
default = "fleet"
}
// AWS region in which the remote state resources are created.
variable "region" {
default = "us-east-2"
}
// AWS provider for the bootstrap project, configured with the region above.
provider "aws" {
region = var.region
}
// S3 bucket that holds the Terraform remote state.
// NOTE(review): no server-side encryption is configured here — confirm whether
// that is intended.
resource "aws_s3_bucket" "remote_state" {
bucket = "${var.prefix}-terraform-remote-state"
acl = "private"
// keep previous versions of state objects
versioning {
enabled = true
}
// make Terraform refuse any plan that would destroy this bucket
lifecycle {
prevent_destroy = true
}
tags = {
Name = "S3 Remote Terraform State Store"
}
}
// Blocks every form of public access to the remote state bucket. All four
// settings are enabled: the first two reject new public ACLs and policies, the
// last two make S3 ignore any public ACLs already present and restrict access
// if a public policy is ever attached.
resource "aws_s3_bucket_public_access_block" "fleet_terraform_state" {
  bucket = aws_s3_bucket.remote_state.id

  block_public_acls       = true
  block_public_policy     = true
  ignore_public_acls      = true
  restrict_public_buckets = true
}
// DynamoDB table used for Terraform state locking. With the default prefix the
// name is "fleet-terraform-state-lock", which is what the backend blocks of the
// other projects reference.
resource "aws_dynamodb_table" "fleet_terraform_state_lock" {
name = "${var.prefix}-terraform-state-lock"
hash_key = "LockID"
// on-demand billing, no provisioned capacity
billing_mode = "PAY_PER_REQUEST"
// string attribute backing the hash key
attribute {
name = "LockID"
type = "S"
}
tags = {
Name = "DynamoDB Terraform State Lock Table"
}
}

View File

@ -103,4 +103,9 @@ variable "memory_tracking_target_value" {
variable "cpu_tracking_target_value" { variable "cpu_tracking_target_value" {
description = "target cpu utilization for target tracking policy (default 60%)" description = "target cpu utilization for target tracking policy (default 60%)"
default = 60 default = 60
}
variable "fleet_license" {
description = "Fleet Premium license key"
default = ""
} }