Add infra for loadtest (#2218)
* Add infra for loadtest
* Move loadtest stuff to a new file and parametrize fleet min/max capacity
* Update to be ready for review
* Update link and other variables needed
* Address review comments and update links
This commit is contained in:
parent 825939e3dc
commit d904d501ac

docs/02-Deploying/05-Fleet-public-load-testing.md (new file, 73 lines)
@@ -0,0 +1,73 @@
# Load testing

## Baseline Test

Baseline setup: 6 custom labels, 6 policies, and 2 packs with ~6 queries each, with the ability to live query all of the hosts.
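
For reference, here is a minimal sketch of how a pack of that shape could be defined and applied with `fleetctl`. The query name, SQL, and interval below are invented for illustration; they are not the exact specs used in the test:

```bash
# Hypothetical example: one query and one pack, similar in shape to the
# baseline setup described above. Apply against your own Fleet server.
cat <<'EOF' > baseline-pack.yml
---
apiVersion: v1
kind: query
spec:
  name: processes_count        # invented name, for illustration
  query: SELECT COUNT(*) AS total FROM processes;
---
apiVersion: v1
kind: pack
spec:
  name: baseline-pack-1        # invented name, for illustration
  queries:
    - query: processes_count
      name: processes_count
      interval: 3600
EOF
fleetctl apply -f baseline-pack.yml
```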

## How we are simulating osquery

The simulation is run with [osquery-perf](https://github.com/fleetdm/fleet/tree/main/cmd/osquery-perf), using the following command:
```bash
go run cmd/osquery-perf/agent.go -enroll_secret <secret here> -host_count 150000 -server_url <server URL here> -node_key_file nodekeys
```

After the hosts have been enrolled, you can add `-only_already_enrolled` to make sure the node keys from the file are used and no new enrollment happens, effectively "resuming" the execution of all the simulated hosts.
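
For example, a resumed run keeps the same command and placeholders as above, with the flag appended:

```bash
go run cmd/osquery-perf/agent.go -enroll_secret <secret here> -host_count 150000 -server_url <server URL here> -node_key_file nodekeys -only_already_enrolled
```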

## Infrastructure setup

Fleet was deployed using the example [Terraform configuration provided in the repo](https://github.com/fleetdm/fleet/tree/main/tools/terraform) with the following command:
```bash
terraform apply \
  -var domain_fleetctl=<your domain here> \
  -var domain_fleetdm=<alternative domain here> \
  -var s3_bucket=<log bucket name> \
  -var fleet_image="fleetdm/fleet:<tag targeted>" \
  -var vulnerabilities_path="" \
  -var fleet_max_capacity=100 \
  -var fleet_min_capacity=5
```
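
If you are starting from a fresh checkout, the standard Terraform workflow applies first. This sketch assumes AWS credentials are already configured in your shell:

```bash
cd tools/terraform
terraform init   # download providers and modules
terraform plan   # optional: preview the resources before applying
```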

## Bare minimum setup

Fleet instances:
- 1 Fargate task
- 256 CPU units
- 512 MB of memory
- Number of hosts: 1,000

Redis:

- Version: 5.0.6
- Instance type: cache.m5.large

MySQL:

- Version: 5.7.mysql_aurora.2.10.0
- Instance type: db.t4g.medium

With the above infrastructure, 1,000 hosts were able to run and be live queried without a problem.
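
Expressed with the variables introduced in this change, that sizing corresponds to something like the following. These CPU/memory values match the new variable defaults, so the explicit flags are illustrative, and the min/max capacity values are assumptions for a single-task setup:

```bash
terraform apply \
  -var fleet_image="fleetdm/fleet:<tag targeted>" \
  -var fleet_backend_cpu=256 \
  -var fleet_backend_mem=512 \
  -var fleet_min_capacity=1 \
  -var fleet_max_capacity=1
```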

## 150k hosts

Fleet instances:
- 25 Fargate tasks
- 1024 CPU units
- 2048 MB of memory
- Number of hosts: 150,000

Redis:

- Version: 5.0.6
- Instance type: cache.m5.large

MySQL:

- Version: 5.7.mysql_aurora.2.10.0
- Instance type: db.r5.4xlarge

The setup autoscaled based on CPU usage. After a while, the task count settled at 25 tasks, even while live querying or adding a new label.
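
The 150k profile maps to the parametrized variables roughly as follows. This is a sketch; the MySQL instance class is not parametrized in this change, so it would be adjusted in the RDS configuration directly:

```bash
terraform apply \
  -var fleet_image="fleetdm/fleet:<tag targeted>" \
  -var fleet_backend_cpu=1024 \
  -var fleet_backend_mem=2048 \
  -var redis_instance="cache.m5.large" \
  -var fleet_min_capacity=5 \
  -var fleet_max_capacity=100
```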

## Limitations of the test

While osquery-perf simulates enough of osquery to be a good first step, it is not the smartest simulation as of the time of this writing. In particular, it does not simulate host users or software inventory yet.

@@ -14,6 +14,7 @@ resource "aws_alb" "main" {
   internal        = false
   security_groups = [aws_security_group.lb.id, aws_security_group.backend.id]
   subnets         = module.vpc.public_subnets
+  idle_timeout    = 120
 }

 resource "aws_alb_target_group" "main" {
@@ -81,7 +82,7 @@ resource "aws_ecs_service" "fleet" {
   launch_type                        = "FARGATE"
   cluster                            = aws_ecs_cluster.fleet.id
   task_definition                    = aws_ecs_task_definition.backend.arn
-  desired_count                      = 1
+  desired_count                      = 5
   deployment_minimum_healthy_percent = 100
   deployment_maximum_percent         = 200
   health_check_grace_period_seconds  = 30
@@ -117,15 +118,15 @@ resource "aws_ecs_task_definition" "backend" {
   requires_compatibilities = ["FARGATE"]
   execution_role_arn       = aws_iam_role.main.arn
   task_role_arn            = aws_iam_role.main.arn
-  cpu                      = 512
-  memory                   = 4096
+  cpu                      = var.fleet_backend_cpu
+  memory                   = var.fleet_backend_mem
   container_definitions = jsonencode(
     [
       {
         name        = "fleet"
-        image       = var.image
-        cpu         = 512
-        memory      = 4096
+        image       = var.fleet_image
+        cpu         = var.fleet_backend_cpu
+        memory      = var.fleet_backend_mem
         mountPoints = []
         volumesFrom = []
         essential   = true
@@ -220,6 +221,14 @@ resource "aws_ecs_task_definition" "backend" {
           name  = "FLEET_VULNERABILITIES_DATABASES_PATH"
           value = var.vuln_db_path
         },
+        {
+          name  = "FLEET_OSQUERY_ENABLE_ASYNC_HOST_PROCESSING"
+          value = var.async_host_processing
+        },
+        {
+          name  = "FLEET_LOGGING_DEBUG"
+          value = var.logging_debug
+        },
         {
           name  = "FLEET_S3_BUCKET"
           value = aws_s3_bucket.osquery-carve.bucket
@@ -228,7 +237,6 @@ resource "aws_ecs_task_definition" "backend" {
           name  = "FLEET_S3_PREFIX"
           value = "carve_results/"
         },
-
       ]
     }
   ])
@@ -247,7 +255,7 @@ resource "aws_ecs_task_definition" "migration" {
     [
       {
         name    = "fleet-prepare-db"
-        image   = var.image
+        image   = var.fleet_image
         cpu     = var.cpu_migrate
         memory  = var.mem_migrate
         mountPoints = []
@@ -1,5 +1,5 @@
 resource "aws_s3_bucket" "osquery-results" {
-  bucket = "fleet-osquery-results-archive"
+  bucket = var.osquery_results_s3_bucket
   acl    = "private"

   lifecycle_rule {
@@ -19,7 +19,7 @@ resource "aws_s3_bucket" "osquery-results" {
 }

 resource "aws_s3_bucket" "osquery-status" {
-  bucket = "fleet-osquery-status-archive"
+  bucket = var.osquery_status_s3_bucket
   acl    = "private"

   lifecycle_rule {
@@ -4,12 +4,12 @@ variable "maintenance_window" {
 variable "engine_version" {
   default = "6.x"
 }
-variable "node_type" {
-  default = "cache.t3.micro"
-}
 variable "number_cache_clusters" {
   default = 3
 }
+variable "redis_instance" {
+  default = "cache.m5.large"
+}
 resource "aws_elasticache_replication_group" "default" {
   availability_zones = ["us-east-2a", "us-east-2b", "us-east-2c"]
   engine             = "redis"
@@ -18,7 +18,7 @@ resource "aws_elasticache_replication_group" "default" {
   security_group_ids    = [aws_security_group.redis.id]
   replication_group_id  = "fleetdm-redis"
   number_cache_clusters = var.number_cache_clusters
-  node_type             = var.node_type
+  node_type             = var.redis_instance
   engine_version        = var.engine_version
   port                  = "6379"
   maintenance_window    = var.maintenance_window
@@ -14,6 +14,36 @@ variable "domain_fleetctl" {
   default = "dogfood.fleetctl.com"
 }

+variable "osquery_results_s3_bucket" {
+  default = "fleet-osquery-results-archive"
+}
+
+variable "osquery_status_s3_bucket" {
+  default = "fleet-osquery-status-archive"
+}
+
+variable "vulnerabilities_path" {
+  default = "/home/fleet"
+}
+
+variable "fleet_backend_cpu" {
+  default = 256
+  type    = number
+}
+
+variable "fleet_backend_mem" {
+  default = 512
+  type    = number
+}
+
+variable "async_host_processing" {
+  default = "false"
+}
+
+variable "logging_debug" {
+  default = "false"
+}
+
 variable "database_user" {
   description = "database user fleet will authenticate and query with"
   default     = "fleet"
@@ -24,7 +54,7 @@ variable "database_name" {
   default = "fleet"
 }

-variable "image" {
+variable "fleet_image" {
   description = "the name of the container image to run"
   default     = "fleetdm/fleet"
 }
@@ -42,11 +72,13 @@ variable "vuln_db_path" {
 variable "cpu_migrate" {
   description = "cpu units for migration task"
   default     = 1024
+  type        = number
 }

 variable "mem_migrate" {
   description = "memory limit for migration task in MB"
   default     = 2048
+  type        = number
 }

 variable "fleet_max_capacity" {