fleet/server/service/async/async_scheduled_query_stats.go

package async

import (
	"context"
	"encoding/json"
	"fmt"
	"strings"
	"time"

	"github.com/fleetdm/fleet/v4/server/config"
	"github.com/fleetdm/fleet/v4/server/contexts/ctxerr"
	"github.com/fleetdm/fleet/v4/server/datastore/redis"
	"github.com/fleetdm/fleet/v4/server/fleet"
	redigo "github.com/gomodule/redigo/redis"
)

const (
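	// note: the braces in {%d} make the host ID a Redis Cluster hash tag, so
	// the key's cluster slot is computed from the host ID alone.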
	scheduledQueryStatsHostQueriesKey = "scheduled_query_stats:{%d}" // the hash of scheduled query stats for a given host
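	// note: this key holds a sorted set of the active host IDs, scored by the
	// timestamp of their last recorded stats (see storePurgeActiveHostID).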
	scheduledQueryStatsHostIDsKey = "scheduled_query_stats:active_host_ids"
	scheduledQueryStatsKeyMinTTL  = 7 * 24 * time.Hour // 1 week
)

// RecordScheduledQueryStats records the scheduled query stats for a given host.
func (t *Task) RecordScheduledQueryStats(ctx context.Context, hostID uint, stats []fleet.PackStats, ts time.Time) error {
	cfg := t.taskConfigs[config.AsyncTaskScheduledQueryStats]
	if !cfg.Enabled {
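		// async processing is disabled: save the stats synchronously to the
		// datastore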
		return t.datastore.SaveHostPackStats(ctx, hostID, stats)
	}

	// Set an expiration on the key, ensuring that if async processing is
	// disabled, the key (eventually) stops using any redis space. Make the TTL
	// reasonably big to avoid deleting information that hasn't been collected
	// yet - 1 week or 10 * the collector interval, whichever is bigger.
	ttl := scheduledQueryStatsKeyMinTTL
	if maxTTL := 10 * cfg.CollectInterval; maxTTL > ttl {
		ttl = maxTTL
	}

	// The redis key is a hash where the field names are
	// packname\x00schedqueryname and the values are the json-marshaled
	// fleet.ScheduledQueryStats structs. Using a hash ensures that the fields
	// are always overwritten with the latest stats for the pack/query tuple,
	// which mirrors the behavior of the mysql table (we only keep the latest
	// stats).
	//
	// Keys and arguments passed to the script are:
	// KEYS[1]: hash key (scheduledQueryStatsHostQueriesKey)
	// ARGV[1]: ttl for the key
	// ARGV[2:]: field/value arguments for HSET
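	// note: Lua's unpack(ARGV, 2) expands ARGV starting at its second element,
	// so the ttl in ARGV[1] is excluded from the HSET field/value pairs.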
	script := redigo.NewScript(1, `
    redis.call('HSET', KEYS[1], unpack(ARGV, 2))
    return redis.call('EXPIRE', KEYS[1], ARGV[1])
  `)

	// build the arguments
	key := fmt.Sprintf(scheduledQueryStatsHostQueriesKey, hostID)
	args := redigo.Args{key, int(ttl.Seconds())}
	for _, ps := range stats {
		for _, qs := range ps.QueryStats {
			field := fmt.Sprintf("%s\x00%s", ps.PackName, qs.ScheduledQueryName)
			jsonStat, err := json.Marshal(qs)
			if err != nil {
				return ctxerr.Wrap(ctx, err, "marshal scheduled query stats")
			}
			args = args.Add(field, string(jsonStat))
		}
	}

	if len(args) > 2 {
		// only run the script if there are fields to HSET (args always
		// contains at least the key and the ttl)
		conn := t.pool.Get()
		defer conn.Close()
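
		// bind the connection to the redis node that owns the key, so that the
		// script below runs on the correct node in Redis Cluster mode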
		if err := redis.BindConn(t.pool, conn, key); err != nil {
			return ctxerr.Wrap(ctx, err, "bind redis connection")
		}
		if _, err := script.Do(conn, args...); err != nil {
			return ctxerr.Wrap(ctx, err, "run redis script")
		}
	}

	// Store the host ID in the set of active host IDs for scheduled query
	// stats outside of the redis script, because in Redis Cluster mode that
	// set's key may not live on the same node as the host's keys. At the same
	// time, purge any entry in the set that is older than now - TTL.
	if _, err := storePurgeActiveHostID(t.pool, scheduledQueryStatsHostIDsKey, hostID, ts, ts.Add(-ttl)); err != nil {
		return ctxerr.Wrap(ctx, err, "store active host id")
	}
	return nil
}
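
// collectScheduledQueryStats drains the per-host stats accumulated in redis by
// RecordScheduledQueryStats and persists them to the datastore in batches.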
func (t *Task) collectScheduledQueryStats(ctx context.Context, ds fleet.Datastore, pool fleet.RedisPool, stats *collectorExecStats) error {
	cfg := t.taskConfigs[config.AsyncTaskScheduledQueryStats]

	hosts, err := loadActiveHostIDs(pool, scheduledQueryStatsHostIDsKey, cfg.RedisScanKeysCount)
	if err != nil {
		return ctxerr.Wrap(ctx, err, "load active host ids")
	}
	stats.Keys = len(hosts)

	getHostStats := func(hostID uint) (sqStats []fleet.ScheduledQueryStats, schedQueryNames [][2]string, err error) {
		keyHash := fmt.Sprintf(scheduledQueryStatsHostQueriesKey, hostID)
		conn := redis.ConfigureDoer(pool, pool.Get())
		defer conn.Close()

		var cursor int
		for {
			stats.RedisCmds++
			res, err := redigo.Values(conn.Do("HSCAN", keyHash, cursor, "COUNT", cfg.RedisScanKeysCount))
			if err != nil {
				return nil, nil, ctxerr.Wrap(ctx, err, "redis HSCAN")
			}
			var hashFieldVals []interface{}
			if _, err := redigo.Scan(res, &cursor, &hashFieldVals); err != nil {
				return nil, nil, ctxerr.Wrap(ctx, err, "scan HSCAN result")
			}
			items := len(hashFieldVals)
			stats.Items += items / 2 // fields and values alternate in the HSCAN reply

			for len(hashFieldVals) > 0 {
				var packSchedName, schedQueryStatJSON string
				hashFieldVals, err = redigo.Scan(hashFieldVals, &packSchedName, &schedQueryStatJSON)
				if err != nil {
					return nil, nil, ctxerr.Wrap(ctx, err, "scan HSCAN field and value")
				}

				// decode the pack name and scheduled query name from the field
				parts := strings.SplitN(packSchedName, "\x00", 2)
				if len(parts) != 2 {
					return nil, nil, ctxerr.Errorf(ctx, "invalid format for hash field: %s", packSchedName)
				}
				schedQueryNames = append(schedQueryNames, [2]string{parts[0], parts[1]})

				// unmarshal the scheduled query stats
				var sqStat fleet.ScheduledQueryStats
				if err := json.Unmarshal([]byte(schedQueryStatJSON), &sqStat); err != nil {
					return nil, nil, ctxerr.Wrap(ctx, err, "unmarshal scheduled query stats hash value")
				}
				sqStat.PackName = parts[0]
				sqStats = append(sqStats, sqStat)
			}

			if cursor == 0 {
				// the HSCAN iteration completed, clear the hash but do not
				// fail on error
				_, _ = conn.Do("DEL", keyHash)
				return sqStats, schedQueryNames, nil
			}
		}
	}

	// NOTE(mna): packs typically target many hosts, so it's very likely that
	// the batch of recorded stats has a small-ish set of packName+schedQueryName
	// tuples that are repeated many times over. Instead of loading those
	// schedQuery IDs for each insertion of the stats, load them once before
	// processing the batch.

	// get all hosts' stats and index the scheduled query names
	hostsStats := make(map[uint][]fleet.ScheduledQueryStats, len(hosts)) // key is host ID
	uniqueSchedQueries := make(map[[2]string]uint)                       // key is pack+scheduled query names, value is the scheduled query ID
	for _, host := range hosts {
		sqStats, names, err := getHostStats(host.HostID)
		if err != nil {
			return err
		}
		hostsStats[host.HostID] = sqStats
		for _, nm := range names {
			uniqueSchedQueries[nm] = 0
		}
	}

	// batch-load scheduled query IDs
	schedNames := make([][2]string, 0, len(uniqueSchedQueries))
	for k := range uniqueSchedQueries {
		schedNames = append(schedNames, k)
	}
	schedIDs, err := ds.ScheduledQueryIDsByName(ctx, fleet.DefaultScheduledQueryIDsByNameBatchSize, schedNames...)
	if err != nil {
		return ctxerr.Wrap(ctx, err, "batch-load scheduled query ids from names")
	}
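
	// note: ds.ScheduledQueryIDsByName returns one ID per name, in the same
	// order as the names were given; a zero ID means no scheduled query exists
	// for that pack/query name tuple (those stats are skipped below).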

	// store the IDs along with the names
	for i, nm := range schedNames {
		uniqueSchedQueries[nm] = schedIDs[i]
	}

	// build the batch of stats to upsert, ignoring stats for non-existing
	// scheduled queries
	batchStatsByHost := make(map[uint][]fleet.ScheduledQueryStats, len(hostsStats))
	for hid, sqStats := range hostsStats {
		batchStats := batchStatsByHost[hid]
		for _, sqStat := range sqStats {
			sqStat.ScheduledQueryID = uniqueSchedQueries[[2]string{sqStat.PackName, sqStat.ScheduledQueryName}]
			// ignore the stat if the scheduled query does not exist
			if sqStat.ScheduledQueryID != 0 {
				batchStats = append(batchStats, sqStat)
			}
		}
		batchStatsByHost[hid] = batchStats
	}

	countExecs, err := ds.AsyncBatchSaveHostsScheduledQueryStats(ctx, batchStatsByHost, cfg.InsertBatch)
	if err != nil {
		return ctxerr.Wrap(ctx, err, "batch-save scheduled query stats for hosts")
	}
	stats.Inserts += countExecs // technically, these could be updates

	if len(hosts) > 0 {
		// Batch-remove from the active set any host ID that still has its
		// score set to the initial value, so that the active set does not keep
		// all (potentially 100K+) host IDs to process at all times - only
		// those with reported results left to process.
		if _, err := removeProcessedHostIDs(pool, scheduledQueryStatsHostIDsKey, hosts); err != nil {
			return ctxerr.Wrap(ctx, err, "remove processed host ids")
		}
	}
	return nil
}