Add infra for loadtest (#2218)

* Add infra for loadtest * Move loadtest stuff to a new file and parametrize fleet min/max capacity * wip * wip * wip * wip * wip * wip * wip * Update to be ready for review * Update link and other variables needed * Address review comments and update links
2024-11-06 00:45:19 +00:00 · 2021-10-14 12:04:27 -03:00 · 2021-10-14 12:04:27 -03:00 · d904d501ac
commit d904d501ac
parent 825939e3dc
5 changed files with 128 additions and 15 deletions
--- a/docs/02-Deploying/05-Fleet-public-load-testing.md
+++ b/docs/02-Deploying/05-Fleet-public-load-testing.md
@ -0,0 +1,73 @@
+# Load testing
+
+## Baseline Test
+
+Baseline setup: 6 custom labels, 6 policies, and 2 packs with ~6 queries each, with the ability to live query all the hosts.
+
+## How we are simulating osquery
+
+The simulation is run by using [osquery-perf](https://github.com/fleetdm/fleet/tree/main/cmd/osquery-perf) using the following command:
+
+```bash
+go run cmd/osquery-perf/agent.go -enroll_secret <secret here> -host_count 150000 -server_url <server URL here> -node_key_file nodekeys
+```
+
+After the hosts have been enrolled, you can simply add `-only_already_enrolled` to make sure the node keys from the file 
+are used and no enrollment happens, virtually "resuming" the execution of all the simulated hosts.
+
+## Infrastructure setup
+
+The deployment of Fleet was done through the example [terraform provided in the repo](https://github.com/fleetdm/fleet/tree/main/tools/terraform) with the following command:
+
+```bash
+terraform apply \
+  -var domain_fleetctl=<your domain here> \
+  -var domain_fleetdm=<alternative domain here> \
+  -var s3_bucket=<log bucket name> \
+  -var fleet_image="fleetdm/fleet:<tag targeted>" \
+  -var vulnerabilities_path="" \
+  -var fleet_max_capacity=100 \
+  -var fleet_min_capacity=5
+```
+
+## Bare minimum setup
+
+Fleet instances:
+- 1 Fargate Task
+- 256 CPU units
+- 512 MB of memory
+- Number of hosts: 1000
+
+Redis: 
+- Version: 5.0.6
+- Instance type: cache.m5.large
+
+MySQL:
+- Version: 5.7.mysql_aurora.2.10.0
+- Instance type: db.t4g.medium
+
+With the above infrastructure, 1000 hosts were able to run and be live queried without a problem.
+
+## 150k hosts
+
+Fleet instances:
+- 25 Fargate tasks
+- 1024 CPU units
+- 2048 MB of memory
+- Number of hosts: 150000
+
+Redis:
+- Version: 5.0.6
+- Instance: cache.m5.large
+
+MySQL:
+- Version: 5.7.mysql_aurora.2.10.0
+- Instance: db.r5.4xlarge
+
+The setup auto scaled based on CPU usage. After a while, the task count settled at 25 instances, and it stayed there
+even while live querying or adding a new label.
+
+## Limitations of the test
+
+While osquery-perf simulates enough of osquery to be a good first step, it's not the smartest simulation as of the time of
+this writing. Particularly, it doesn't simulate host users and software inventory yet.
--- a/tools/terraform/ecs.tf
+++ b/tools/terraform/ecs.tf
@ -14,6 +14,7 @@ resource "aws_alb" "main" {
  internal        = false
  security_groups = [aws_security_group.lb.id, aws_security_group.backend.id]
  subnets         = module.vpc.public_subnets
+  idle_timeout    = 120
 }

 resource "aws_alb_target_group" "main" {
@ -81,7 +82,7 @@ resource "aws_ecs_service" "fleet" {
  launch_type                        = "FARGATE"
  cluster                            = aws_ecs_cluster.fleet.id
  task_definition                    = aws_ecs_task_definition.backend.arn
-  desired_count                      = 1
+  desired_count                      = 5
  deployment_minimum_healthy_percent = 100
  deployment_maximum_percent         = 200
  health_check_grace_period_seconds  = 30
@ -117,15 +118,15 @@ resource "aws_ecs_task_definition" "backend" {
  requires_compatibilities = ["FARGATE"]
  execution_role_arn       = aws_iam_role.main.arn
  task_role_arn            = aws_iam_role.main.arn
-  cpu                      = 512
-  memory                   = 4096
+  cpu                      = var.fleet_backend_cpu
+  memory                   = var.fleet_backend_mem
  container_definitions = jsonencode(
    [
      {
        name        = "fleet"
-        image       = var.image
-        cpu         = 512
-        memory      = 4096
+        image       = var.fleet_image
+        cpu         = var.fleet_backend_cpu
+        memory      = var.fleet_backend_mem
        mountPoints = []
        volumesFrom = []
        essential   = true
@ -220,6 +221,14 @@ resource "aws_ecs_task_definition" "backend" {
            name  = "FLEET_VULNERABILITIES_DATABASES_PATH"
            value = var.vuln_db_path
          },
+          {
+            name  = "FLEET_OSQUERY_ENABLE_ASYNC_HOST_PROCESSING"
+            value = var.async_host_processing
+          },
+          {
+            name  = "FLEET_LOGGING_DEBUG"
+            value = var.logging_debug
+          },
          {
            name  = "FLEET_S3_BUCKET"
            value = aws_s3_bucket.osquery-carve.bucket
@ -228,7 +237,6 @@ resource "aws_ecs_task_definition" "backend" {
            name  = "FLEET_S3_PREFIX"
            value = "carve_results/"
          },
-
        ]
      }
  ])
@ -247,7 +255,7 @@ resource "aws_ecs_task_definition" "migration" {
    [
      {
        name        = "fleet-prepare-db"
-        image       = var.image
+        image       = var.fleet_image
        cpu         = var.cpu_migrate
        memory      = var.mem_migrate
        mountPoints = []
--- a/tools/terraform/firehose.tf
+++ b/tools/terraform/firehose.tf
@ -1,5 +1,5 @@
 resource "aws_s3_bucket" "osquery-results" {
-  bucket = "fleet-osquery-results-archive"
+  bucket = var.osquery_results_s3_bucket
  acl    = "private"

  lifecycle_rule {
@ -19,7 +19,7 @@ resource "aws_s3_bucket" "osquery-results" {
 }

 resource "aws_s3_bucket" "osquery-status" {
-  bucket = "fleet-osquery-status-archive"
+  bucket = var.osquery_status_s3_bucket
  acl    = "private"

  lifecycle_rule {
--- a/tools/terraform/redis.tf
+++ b/tools/terraform/redis.tf
@ -4,12 +4,12 @@ variable "maintenance_window" {
 variable "engine_version" {
  default = "6.x"
 }
-variable "node_type" {
-  default = "cache.t3.micro"
-}
 variable "number_cache_clusters" {
  default = 3
 }
+variable "redis_instance" {
+  default = "cache.m5.large"
+}
 resource "aws_elasticache_replication_group" "default" {
  availability_zones            = ["us-east-2a", "us-east-2b", "us-east-2c"]
  engine                        = "redis"
@ -18,7 +18,7 @@ resource "aws_elasticache_replication_group" "default" {
  security_group_ids            = [aws_security_group.redis.id]
  replication_group_id          = "fleetdm-redis"
  number_cache_clusters         = var.number_cache_clusters
-  node_type                     = var.node_type
+  node_type                     = var.redis_instance
  engine_version                = var.engine_version
  port                          = "6379"
  maintenance_window            = var.maintenance_window
--- a/tools/terraform/variables.tf
+++ b/tools/terraform/variables.tf
@ -14,6 +14,36 @@ variable "domain_fleetctl" {
  default = "dogfood.fleetctl.com"
 }

+variable "osquery_results_s3_bucket" {
+  default = "fleet-osquery-results-archive"
+}
+
+variable "osquery_status_s3_bucket" {
+  default = "fleet-osquery-status-archive"
+}
+
+variable "vulnerabilities_path" {
+  default = "/home/fleet"
+}
+
+variable "fleet_backend_cpu" {
+  default = 256
+  type = number
+}
+
+variable "fleet_backend_mem" {
+  default = 512
+  type = number
+}
+
+variable "async_host_processing" {
+  default = "false"
+}
+
+variable "logging_debug" {
+default = "false"
+}
+
 variable "database_user" {
  description = "database user fleet will authenticate and query with"
  default     = "fleet"
@ -24,7 +54,7 @@ variable "database_name" {
  default = "fleet"
 }

-variable "image" {
+variable "fleet_image" {
  description = "the name of the container image to run"
  default     = "fleetdm/fleet"
 }
@ -42,11 +72,13 @@ variable "vuln_db_path" {
 variable "cpu_migrate" {
  description = "cpu units for migration task"
  default     = 1024
+  type = number
 }

 variable "mem_migrate" {
  description = "memory limit for migration task in MB"
  default     = 2048
+  type = number
 }

 variable "fleet_max_capacity" {