refactor terraform to allow bootstrapping (#2662)

* refactor to allow bootstrapping
* move monitoring into its own package, update readme
* add variable for license_key
* replication lag alarm less sensitive
This commit is contained in:
Benjamin Edwards 2021-11-09 20:14:05 -05:00 committed by GitHub
parent 2b8502ad61
commit dac043f1f5
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 199 additions and 83 deletions

View File

@ -8,7 +8,7 @@ data "aws_iam_policy_document" "fleet" {
statement {
effect = "Allow"
actions = ["secretsmanager:GetSecretValue"]
resources = [aws_secretsmanager_secret.database_password_secret.arn, data.aws_secretsmanager_secret.license.arn]
resources = [aws_secretsmanager_secret.database_password_secret.arn]
}
// useful when there is a static number of mysql cluster members

View File

@ -1,13 +1,15 @@
//resource "aws_route53_record" "record" {
// name = "fleetdm"
// type = "A"
// zone_id = "Z046188311R47QSK245X"
// alias {
// evaluate_target_health = false
// name = aws_alb.main.dns_name
// zone_id = aws_alb.main.zone_id
// }
//}
data "aws_region" "current" {}
// Alias A record for the Fleet load balancer. The record name carries the
// current workspace so each workspace gets its own entry in the zone.
resource "aws_route53_record" "record" {
  zone_id = aws_route53_zone.dogfood_fleetdm_com.zone_id
  name    = "fleet-alb-${terraform.workspace}"
  type    = "A"

  // point the record at the ALB defined in this file
  alias {
    name                   = aws_alb.main.dns_name
    zone_id                = aws_alb.main.zone_id
    evaluate_target_health = false
  }
}
resource "aws_alb" "main" {
name = "fleetdm"
@ -111,12 +113,6 @@ resource "aws_cloudwatch_log_group" "backend" {
retention_in_days = 1
}
data "aws_region" "current" {}
data "aws_secretsmanager_secret" "license" {
name = "/fleet/license"
}
resource "aws_ecs_task_definition" "backend" {
family = "fleet"
network_mode = "awsvpc"
@ -159,10 +155,6 @@ resource "aws_ecs_task_definition" "backend" {
{
name = "FLEET_MYSQL_READ_REPLICA_PASSWORD"
valueFrom = aws_secretsmanager_secret.database_password_secret.arn
},
{
name = "FLEET_LICENSE_KEY"
valueFrom = data.aws_secretsmanager_secret.license.arn
}
]
environment = [
@ -235,7 +227,7 @@ resource "aws_ecs_task_definition" "backend" {
value = var.logging_debug
},
{
name = "FLEET_LOGGING_JSON"
name = "FLEET_LOGGING_JSON"
value = var.logging_json
},
{
@ -246,6 +238,10 @@ resource "aws_ecs_task_definition" "backend" {
name = "FLEET_S3_PREFIX"
value = "carve_results/"
},
{
name = "FLEET_LICENSE_KEY"
value = var.fleet_license
}
]
}
])

View File

@ -7,11 +7,11 @@ provider "aws" {
}
terraform {
// these values are hard-coded to prevent chicken before the egg situations
// these values should match what is bootstrapped in ./remote-state
backend "s3" {
bucket = "fleet-terraform-remote-state"
region = "us-east-2"
key = "fleet/"
bucket = "fleet-terraform-remote-state"
region = "us-east-2"
key = "fleet/"
dynamodb_table = "fleet-terraform-state-lock"
}
required_providers {
@ -22,39 +22,4 @@ terraform {
}
}
data "aws_caller_identity" "current" {}
resource "aws_s3_bucket" "remote_state" {
bucket = "${var.prefix}-terraform-remote-state"
acl = "private"
versioning {
enabled = true
}
lifecycle {
prevent_destroy = true
}
tags = {
Name = "S3 Remote Terraform State Store"
}
}
resource "aws_s3_bucket_public_access_block" "fleet_terraform_state" {
bucket = aws_s3_bucket.remote_state.id
block_public_acls = true
block_public_policy = true
}
resource "aws_dynamodb_table" "fleet_terraform_state_lock" {
name = "fleet-terraform-state-lock"
hash_key = "LockID"
billing_mode = "PAY_PER_REQUEST"
attribute {
name = "LockID"
type = "S"
}
tags = {
Name = "DynamoDB Terraform State Lock Table"
}
}
data "aws_caller_identity" "current" {}

View File

@ -1,3 +1,39 @@
// Remote state backend and provider requirements for the monitoring configuration.
// Its state is kept in the same bucket and lock table as the main Fleet
// configuration, under the separate key "fleet-monitoring/".
terraform {
// these values should match what is bootstrapped in ./remote-state
// NOTE(review): the README reaches that configuration via `cd remote-state` from the
// parent folder, so from a `monitoring` directory this is presumably ../remote-state — confirm
backend "s3" {
bucket = "fleet-terraform-remote-state"
region = "us-east-2"
key = "fleet-monitoring/"
dynamodb_table = "fleet-terraform-state-lock"
}
required_providers {
aws = {
source = "hashicorp/aws"
// exact provider version pin
version = "3.57.0"
}
}
}
// AWS provider for the monitoring configuration; the region is hard-coded to
// the same region used by the s3 backend above.
provider "aws" {
region = "us-east-2"
}
// Account and region of the credentials Terraform is running with.
data "aws_caller_identity" "current" {}
data "aws_region" "current" {}
// Reads the outputs of the main Fleet configuration from its S3 remote state.
//
// `workspace` selects the state belonging to the workspace this configuration
// is running in. Letting Terraform resolve the workspace path (instead of
// hard-coding an "env:/<workspace>/" prefix into the key) also makes the lookup
// work in the `default` workspace, whose state is stored directly at `key`.
data "terraform_remote_state" "fleet" {
  backend   = "s3"
  workspace = terraform.workspace

  config = {
    bucket = "fleet-terraform-remote-state"
    region = "us-east-2"
    // must be the same `key` the main Fleet configuration uses in its s3 backend
    key = "fleet/"
  }
}
// Short names for outputs of the main Fleet configuration's remote state;
// they are interpolated into the alarm descriptions in this file.
locals {
// name of the Fleet backend ECS service
fleet_ecs_service_name = data.terraform_remote_state.fleet.outputs.fleet_ecs_service_name
// name of the ALB target group
alb_target_group_name = data.terraform_remote_state.fleet.outputs.aws_alb_target_group_name
// name of the ALB
alb_name = data.terraform_remote_state.fleet.outputs.aws_alb_name
}
// sns topic to send cloudwatch alarms to
resource "aws_sns_topic" "cloudwatch_alarm_topic" {
name = "cloudwatch-alarm-${terraform.workspace}"
@ -67,7 +103,7 @@ data "aws_iam_policy_document" "sns_topic_policy" {
// Database alarms
resource "aws_cloudwatch_metric_alarm" "cpu_utilization_too_high" {
for_each = toset(module.aurora_mysql.rds_cluster_instance_ids)
for_each = data.terraform_remote_state.fleet.outputs.mysql_cluster_members
alarm_name = "rds_cpu_utilization_too_high-${each.key}-${terraform.workspace}"
comparison_operator = "GreaterThanThreshold"
evaluation_periods = "1"
@ -89,7 +125,7 @@ resource "aws_db_event_subscription" "default" {
sns_topic = aws_sns_topic.cloudwatch_alarm_topic.arn
source_type = "db-instance"
source_ids = module.aurora_mysql.rds_cluster_instance_ids
source_ids = data.terraform_remote_state.fleet.outputs.mysql_cluster_members
event_categories = [
"failover",
@ -114,14 +150,14 @@ resource "aws_cloudwatch_metric_alarm" "alb_healthyhosts" {
namespace = "AWS/ApplicationELB"
period = "60"
statistic = "Minimum"
threshold = var.fleet_min_capacity
alarm_description = "This alarm indicates the number of Healthy Fleet hosts is lower than expected. Please investigate the load balancer \"${aws_alb.main.name}\" or the target group \"${aws_alb_target_group.main.name}\" and the fleet backend service \"${aws_ecs_service.fleet.name}\""
threshold = data.terraform_remote_state.fleet.outputs.fleet_min_capacity
alarm_description = "This alarm indicates the number of Healthy Fleet hosts is lower than expected. Please investigate the load balancer \"${local.alb_name}\" or the target group \"${local.alb_target_group_name}\" and the fleet backend service \"${local.fleet_ecs_service_name}\""
actions_enabled = "true"
alarm_actions = [aws_sns_topic.cloudwatch_alarm_topic.arn]
ok_actions = [aws_sns_topic.cloudwatch_alarm_topic.arn]
dimensions = {
TargetGroup = aws_alb_target_group.main.arn_suffix
LoadBalancer = aws_alb.main.arn_suffix
TargetGroup = data.terraform_remote_state.fleet.outputs.target_group_arn_suffix
LoadBalancer = data.terraform_remote_state.fleet.outputs.load_balancer_arn_suffix
}
}
@ -131,7 +167,7 @@ resource "aws_cloudwatch_metric_alarm" "target_response_time" {
comparison_operator = "GreaterThanUpperThreshold"
evaluation_periods = "2"
threshold_metric_id = "e1"
alarm_description = "This alarm indicates the Fleet server response time is greater than it usually is. Please investigate the ecs service \"${aws_ecs_service.fleet.name}\" because the backend might need to be scaled up."
alarm_description = "This alarm indicates the Fleet server response time is greater than it usually is. Please investigate the ecs service \"${local.fleet_ecs_service_name}\" because the backend might need to be scaled up."
alarm_actions = [aws_sns_topic.cloudwatch_alarm_topic.arn]
ok_actions = [aws_sns_topic.cloudwatch_alarm_topic.arn]
insufficient_data_actions = []
@ -154,8 +190,8 @@ resource "aws_cloudwatch_metric_alarm" "target_response_time" {
unit = "Count"
dimensions = {
TargetGroup = aws_alb_target_group.main.arn_suffix
LoadBalancer = aws_alb.main.arn_suffix
TargetGroup = data.terraform_remote_state.fleet.outputs.target_group_arn_suffix
LoadBalancer = data.terraform_remote_state.fleet.outputs.load_balancer_arn_suffix
}
}
}
@ -174,13 +210,13 @@ resource "aws_cloudwatch_metric_alarm" "httpcode_elb_5xx_count" {
alarm_actions = [aws_sns_topic.cloudwatch_alarm_topic.arn]
ok_actions = [aws_sns_topic.cloudwatch_alarm_topic.arn]
dimensions = {
LoadBalancer = aws_alb.main.arn_suffix
LoadBalancer = data.terraform_remote_state.fleet.outputs.load_balancer_arn_suffix
}
}
// Elasticache (redis) alerts https://docs.aws.amazon.com/AmazonElastiCache/latest/red-ug/CacheMetrics.WhichShouldIMonitor.html
resource "aws_cloudwatch_metric_alarm" "redis_cpu" {
for_each = toset(aws_elasticache_replication_group.default.member_clusters)
for_each = data.terraform_remote_state.fleet.outputs.redis_cluster_members
alarm_name = "redis-cpu-utilization-${each.key}-${terraform.workspace}"
alarm_description = "Redis cluster CPU utilization node ${each.key}"
comparison_operator = "GreaterThanThreshold"
@ -198,11 +234,10 @@ resource "aws_cloudwatch_metric_alarm" "redis_cpu" {
CacheClusterId = each.key
}
depends_on = [aws_elasticache_replication_group.default]
}
resource "aws_cloudwatch_metric_alarm" "redis_cpu_engine_utilization" {
for_each = toset(aws_elasticache_replication_group.default.member_clusters)
for_each = data.terraform_remote_state.fleet.outputs.redis_cluster_members
alarm_name = "redis-cpu-engine-utilization-${each.key}-${terraform.workspace}"
alarm_description = "Redis cluster CPU Engine utilization node ${each.key}"
comparison_operator = "GreaterThanThreshold"
@ -220,7 +255,6 @@ resource "aws_cloudwatch_metric_alarm" "redis_cpu_engine_utilization" {
CacheClusterId = each.key
}
depends_on = [aws_elasticache_replication_group.default]
}
resource "aws_cloudwatch_metric_alarm" "redis-database-memory-percentage" {
@ -237,11 +271,10 @@ resource "aws_cloudwatch_metric_alarm" "redis-database-memory-percentage" {
threshold = "80"
depends_on = [aws_elasticache_replication_group.default]
}
resource "aws_cloudwatch_metric_alarm" "redis-current-connections" {
for_each = toset(aws_elasticache_replication_group.default.member_clusters)
for_each = data.terraform_remote_state.fleet.outputs.redis_cluster_members
alarm_name = "redis-current-connections-${each.key}-${terraform.workspace}"
alarm_description = "Redis current connections for node ${each.key}"
comparison_operator = "LessThanLowerOrGreaterThanUpperThreshold"
@ -279,7 +312,7 @@ resource "aws_cloudwatch_metric_alarm" "redis-replication-lag" {
alarm_name = "redis-replication-lag-${terraform.workspace}"
alarm_description = "This metric is only applicable for a node running as a read replica. It represents how far behind, in seconds, the replica is in applying changes from the primary node. For Redis engine version 5.0.6 onwards, the lag can be measured in milliseconds."
comparison_operator = "GreaterThanUpperThreshold"
evaluation_periods = "1"
evaluation_periods = "3"
threshold_metric_id = "e1"
alarm_actions = [aws_sns_topic.cloudwatch_alarm_topic.arn]
ok_actions = [aws_sns_topic.cloudwatch_alarm_topic.arn]
@ -320,6 +353,6 @@ resource "aws_cloudwatch_metric_alarm" "acm_certificate_expired" {
ok_actions = [aws_sns_topic.cloudwatch_alarm_topic.arn]
dimensions = {
CertificateArn = aws_acm_certificate.dogfood_fleetdm_com.arn
CertificateArn = data.terraform_remote_state.fleet.outputs.acm_certificate_arn
}
}

View File

@ -20,4 +20,40 @@ output "fleet-backend-task-revision" {
output "fleet-migration-task-revision" {
value = aws_ecs_task_definition.migration.revision
}
// ---------------------------------------------------------------------------
// Outputs read by the monitoring configuration through terraform_remote_state.
// ---------------------------------------------------------------------------

// Member cluster ids of the Redis replication group, as a set (used for for_each).
output "redis_cluster_members" {
  value = toset(aws_elasticache_replication_group.default.member_clusters)
}

// Instance ids of the Aurora MySQL cluster, as a set (used for for_each).
output "mysql_cluster_members" {
  value = toset(module.aurora_mysql.rds_cluster_instance_ids)
}

// ARN of the ACM certificate watched by the certificate alarm.
output "acm_certificate_arn" {
  value = aws_acm_certificate.dogfood_fleetdm_com.arn
}

// ARN suffixes used as CloudWatch dimensions for the load balancer alarms.
output "load_balancer_arn_suffix" {
  value = aws_alb.main.arn_suffix
}

output "target_group_arn_suffix" {
  value = aws_alb_target_group.main.arn_suffix
}

// Threshold for the healthy-hosts alarm.
output "fleet_min_capacity" {
  value = var.fleet_min_capacity
}

// Resource names interpolated into alarm descriptions.
output "fleet_ecs_service_name" {
  value = aws_ecs_service.fleet.name
}

output "aws_alb_target_group_name" {
  value = aws_alb_target_group.main.name
}

output "aws_alb_name" {
  value = aws_alb.main.name
}

View File

@ -1,10 +1,44 @@
## Terraform
`terraform init && terraform workspace new dev`
### Bootstrapping remote state
`terraform plan`
First we need to bootstrap our Terraform remote state management. This lives outside the main project to avoid "chicken-and-egg"
issues. We are going to create the remote state S3 bucket and DynamoDB state locking table and then use the hardcoded values
in the parent folder's `main.tf`.
1. `cd remote-state`
2. `terraform init`
3. `terraform apply`
`terraform apply`
### Creating the Fleet infrastructure
Create a new `tfvars` file for example:
```terraform
fleet_backend_cpu = 512
fleet_backend_mem = 4096 // 4GB needed for vuln processing
redis_instance = "cache.t3.micro"
fleet_min_capacity = 2
fleet_max_capacity = 5
```
If you have a Fleet license key you can include it in the `tfvars` file which will enable the paid features.
```terraform
fleet_license = "<your license key here>"
```
**To deploy the infrastructure**:
1. `terraform init && terraform workspace new prod` (creating a workspace is optional; Terraform uses the `default` workspace otherwise)
2. `terraform plan -var-file=<your_tfvars_file>`
3. `terraform apply -var-file=<your_tfvars_file>`
**To deploy cloudwatch alarms** (requires infrastructure to be deployed)
1. `cd monitoring`
2. `terraform init && terraform workspace new prod` (creating a workspace is optional; Terraform uses the `default` workspace otherwise)
3. `terraform plan -var-file=<your_tfvars_file>`
4. `terraform apply -var-file=<your_tfvars_file>`
Check out [AWS Chatbot](https://docs.aws.amazon.com/chatbot/latest/adminguide/setting-up.html) for a quick and easy way to hook up Cloudwatch Alarms into a Slack channel.
### Configuration

View File

@ -0,0 +1,47 @@
// Prefix for the names of the state bucket and the lock table created below.
variable "prefix" {
  default = "fleet"
}

// AWS region the bootstrap resources are created in.
variable "region" {
  default = "us-east-2"
}

provider "aws" {
  region = var.region
}
// S3 bucket that holds the Terraform remote state. With the default prefix the
// name resolves to "fleet-terraform-remote-state".
resource "aws_s3_bucket" "remote_state" {
  bucket = "${var.prefix}-terraform-remote-state"
  acl    = "private"

  // keep previous versions of every state file
  versioning {
    enabled = true
  }

  // Terraform state can contain sensitive values in plain text, so encrypt
  // every object at rest by default.
  server_side_encryption_configuration {
    rule {
      apply_server_side_encryption_by_default {
        sse_algorithm = "AES256"
      }
    }
  }

  // guard against an accidental `terraform destroy` wiping the state
  lifecycle {
    prevent_destroy = true
  }

  tags = {
    Name = "S3 Remote Terraform State Store"
  }
}
// Blocks every form of public access to the remote state bucket. All four
// guards are enabled: rejecting new public ACLs/policies alone would still
// honour public ACLs that already exist and would not restrict access if a
// public policy were present.
resource "aws_s3_bucket_public_access_block" "fleet_terraform_state" {
  bucket                  = aws_s3_bucket.remote_state.id
  block_public_acls       = true
  block_public_policy     = true
  ignore_public_acls      = true
  restrict_public_buckets = true
}
// DynamoDB table for Terraform state locking. With the default prefix the name
// resolves to "fleet-terraform-state-lock".
resource "aws_dynamodb_table" "fleet_terraform_state_lock" {
  name         = "${var.prefix}-terraform-state-lock"
  billing_mode = "PAY_PER_REQUEST"
  hash_key     = "LockID"

  // string attribute backing the hash key
  attribute {
    name = "LockID"
    type = "S"
  }

  tags = {
    Name = "DynamoDB Terraform State Lock Table"
  }
}

View File

@ -103,4 +103,9 @@ variable "memory_tracking_target_value" {
variable "cpu_tracking_target_value" {
description = "target cpu utilization for target tracking policy (default 60%)"
default = 60
}
// Optional license key; defaults to an empty string when none is supplied.
variable "fleet_license" {
  default     = ""
  description = "Fleet Premium license key"
}