fleet/server/datastore/redis/redis.go
2021-10-20 10:09:18 -04:00

339 lines
11 KiB
Go

package redis
import (
"net"
"strings"
"time"
"github.com/cenkalti/backoff/v4"
"github.com/fleetdm/fleet/v4/server/config"
"github.com/fleetdm/fleet/v4/server/fleet"
"github.com/gomodule/redigo/redis"
"github.com/mna/redisc"
"github.com/pkg/errors"
)
// this is an adapter type to implement the same Stats method as for
// redisc.Cluster, so both can satisfy the same interface.
type standalonePool struct {
	*redis.Pool
	// addr is the address of the single standalone server, used as the
	// key in the Stats map.
	addr string
}
// Stats returns the pool statistics keyed by the server address, mirroring
// the shape of redisc.Cluster.Stats for a single standalone node.
func (p *standalonePool) Stats() map[string]redis.PoolStats {
	stats := make(map[string]redis.PoolStats, 1)
	stats[p.addr] = p.Pool.Stats()
	return stats
}
// Mode returns the mode of this pool: standalone Redis.
func (p *standalonePool) Mode() fleet.RedisMode {
	return fleet.RedisStandalone
}
// clusterPool wraps a redisc.Cluster and records the cluster-specific
// configuration flags that the package-level helpers consult.
type clusterPool struct {
	*redisc.Cluster
	// followRedirs enables following MOVED/ASK redirections (see ConfigureDoer).
	followRedirs bool
	// readReplica enables reading from replica nodes (see ReadOnlyConn).
	readReplica bool
}
// Mode returns the mode of this pool: Redis Cluster.
func (p *clusterPool) Mode() fleet.RedisMode {
	return fleet.RedisCluster
}
// PoolConfig holds the redis pool configuration options.
type PoolConfig struct {
	// Server is the address (host:port) of the redis server, or of a
	// startup node when connecting to a Redis Cluster.
	Server string
	// Password and Database are passed as dial options.
	Password string
	Database int
	// UseTLS enables TLS on the connection; see the TLS* fields below.
	UseTLS bool
	// ConnTimeout and KeepAlive are passed as dial options. Read/write
	// timeouts are deliberately not configurable here (see newCluster).
	ConnTimeout time.Duration
	KeepAlive   time.Duration
	// ConnectRetryAttempts, when > 0, retries failed dials with
	// exponential backoff up to that many attempts.
	ConnectRetryAttempts int
	// ClusterFollowRedirections enables following MOVED/ASK redirections
	// in cluster mode (used by ConfigureDoer).
	ClusterFollowRedirections bool
	// ClusterReadFromReplica allows reads to be served by replica nodes
	// in cluster mode (used by ReadOnlyConn).
	ClusterReadFromReplica bool
	// TLS configuration, only used when UseTLS is true.
	TLSCert             string
	TLSKey              string
	TLSCA               string
	TLSServerName       string
	TLSHandshakeTimeout time.Duration
	// Standard connection-pool tuning knobs, mapped onto redis.Pool.
	MaxIdleConns    int
	MaxOpenConns    int
	ConnMaxLifetime time.Duration
	IdleTimeout     time.Duration
	// TLSSkipVerify disables certificate verification (InsecureSkipVerify).
	TLSSkipVerify bool

	// allows for testing dial retries and other dial-related scenarios
	testRedisDialFunc func(net, addr string, opts ...redis.DialOption) (redis.Conn, error)
}
// NewPool creates a Redis connection pool using the provided server
// address, password and database.
//
// It first attempts a Redis Cluster setup; if the server reports that
// cluster support is disabled, or that the CLUSTER command is unknown
// (as on GCP Memorystore), it falls back to a standalone pool.
func NewPool(config PoolConfig) (fleet.RedisPool, error) {
	cluster, err := newCluster(config)
	if err != nil {
		return nil, err
	}
	if err := cluster.Refresh(); err != nil {
		if isClusterDisabled(err) || isClusterCommandUnknown(err) {
			// not a Redis Cluster setup, use a standalone Redis pool
			pool, err := cluster.CreatePool(config.Server, cluster.DialOptions...)
			// the cluster value is no longer needed either way, release it
			// before checking the CreatePool error
			cluster.Close()
			if err != nil {
				return nil, errors.Wrap(err, "create standalone pool")
			}
			return &standalonePool{pool, config.Server}, nil
		}
		return nil, errors.Wrap(err, "refresh cluster")
	}
	return &clusterPool{
		cluster,
		config.ClusterFollowRedirections,
		config.ClusterReadFromReplica,
	}, nil
}
// ReadOnlyConn turns conn into a connection that will try to connect to a
// replica instead of a primary. This is best-effort: there may be no
// replica, or redirections may land on a primary. It only has an effect
// when pool is a Redis Cluster pool configured to read from replicas.
// The returned connection should only be used for read-only commands.
func ReadOnlyConn(pool fleet.RedisPool, conn redis.Conn) redis.Conn {
	p, ok := pool.(*clusterPool)
	if !ok || !p.readReplica {
		return conn
	}
	// redisc.ReadOnlyConn only fails if conn is not a redisc connection or
	// is already bound; in either case the original conn remains usable,
	// so the error is deliberately discarded.
	_ = redisc.ReadOnlyConn(conn)
	return conn
}
// ConfigureDoer configures conn to follow redirections if the redis
// configuration requested it and the pool is a Redis Cluster pool. If the
// conn is already in error, or if it is not a redisc cluster connection,
// it is returned unaltered.
func ConfigureDoer(pool fleet.RedisPool, conn redis.Conn) redis.Conn {
	p, ok := pool.(*clusterPool)
	if !ok || !p.followRedirs || conn.Err() != nil {
		return conn
	}
	// up to 3 retries with a 300ms backoff between attempts; on failure
	// to wrap, fall back to the plain connection.
	if rc, err := redisc.RetryConn(conn, 3, 300*time.Millisecond); err == nil {
		return rc
	}
	return conn
}
// SplitKeysBySlot takes a list of redis keys and groups them by hash slot
// so that keys in a given group are guaranteed to hash to the same slot,
// making them safe to run e.g. in a pipeline on the same connection or as
// part of a multi-key command in a Redis Cluster setup. When using
// standalone Redis, it simply returns all keys in the same group (i.e.
// the top-level slice has a length of 1).
func SplitKeysBySlot(pool fleet.RedisPool, keys ...string) [][]string {
	switch pool.(type) {
	case *clusterPool:
		return redisc.SplitBySlot(keys...)
	default:
		// standalone: every key lives on the same server
		return [][]string{keys}
	}
}
// EachNode calls fn once per node of the redis cluster, handing it a
// connection to that node; the connection is closed automatically after
// the call. If fn returns an error, iteration stops and that error is
// returned. For standalone redis, fn is called exactly once.
//
// If replicas is true, replica nodes are visited instead of primaries.
// Note that all known replicas are visited - useful e.g. for running
// diagnostics on every node, but potentially surprising when collecting
// keys, since multiple nodes may replicate the same primary and the same
// keys could therefore be seen more than once; callers must be prepared
// for that. The connection given to fn is not a ReadOnly connection
// (conn.ReadOnly hasn't been called on it); fn must issue the READONLY
// redis command itself if required.
func EachNode(pool fleet.RedisPool, replicas bool, fn func(conn redis.Conn) error) error {
	cluster, ok := pool.(*clusterPool)
	if !ok {
		// standalone: a single node, visit it once
		conn := pool.Get()
		defer conn.Close()
		return fn(conn)
	}
	return cluster.EachNode(replicas, func(_ string, conn redis.Conn) error {
		return fn(conn)
	})
}
// BindConn binds the connection to the redis node that serves those keys.
// In a Redis Cluster setup, all keys must hash to the same slot, otherwise
// an error is returned. In a Redis Standalone setup, it is a no-op and
// never fails. On successful return, the connection is ready to be used
// with those keys.
func BindConn(pool fleet.RedisPool, conn redis.Conn, keys ...string) error {
	if _, ok := pool.(*clusterPool); !ok {
		// standalone: nothing to bind
		return nil
	}
	return redisc.BindConn(conn, keys...)
}
// PublishHasListeners is like the PUBLISH redis command, but it also
// returns a boolean indicating if channel still has subscribed listeners.
// It is required because the redis command only returns the count of
// subscribers active on the same node as the one used to publish, which
// may not always be the case in Redis Cluster (especially with the read
// from replica option set).
//
// In Standalone mode it is the same as PUBLISH (with the count of
// subscribers turned into a boolean); in Cluster mode, if PUBLISH reports
// 0, the number of subscribers is collected from each node in the cluster
// to get the accurate count.
func PublishHasListeners(pool fleet.RedisPool, conn redis.Conn, channel, message string) (bool, error) {
	n, err := redis.Int(conn.Do("PUBLISH", channel, message))
	if err != nil || n > 0 {
		return n > 0, err
	}

	// n == 0 at this point; only a cluster may have subscribers hiding on
	// other nodes.
	if _, ok := pool.(*clusterPool); !ok {
		return false, nil
	}

	// sentinel used to abort node iteration as soon as a subscriber is found
	errDone := errors.New("done")
	total := 0
	// subscribers can be subscribed on replicas, so we need to iterate on
	// both primaries and replicas.
	for _, visitReplicas := range []bool{true, false} {
		err = EachNode(pool, visitReplicas, func(nodeConn redis.Conn) error {
			vals, err := redis.Values(nodeConn.Do("PUBSUB", "NUMSUB", channel))
			if err != nil {
				return err
			}
			var (
				chName string
				subs   int
			)
			if _, err := redis.Scan(vals, &chName, &subs); err != nil {
				return err
			}
			total += subs
			if total > 0 {
				// end early if we know it has subscribers
				return errDone
			}
			return nil
		})
		if err == errDone {
			break
		}
	}

	// iteration completed successfully (or ended early on a hit)
	if err == nil || err == errDone {
		return total > 0, nil
	}
	return false, errors.Wrap(err, "checking for active subscribers")
}
// newCluster builds a redisc.Cluster from the pool configuration. The
// cluster value is created even when the target is a standalone server;
// NewPool detects the actual mode when it calls Refresh.
func newCluster(conf PoolConfig) (*redisc.Cluster, error) {
	opts := []redis.DialOption{
		redis.DialDatabase(conf.Database),
		redis.DialUseTLS(conf.UseTLS),
		redis.DialConnectTimeout(conf.ConnTimeout),
		redis.DialKeepAlive(conf.KeepAlive),
		redis.DialPassword(conf.Password),
		// Read/Write timeouts not set here because we may see results
		// only rarely on the pub/sub channel.
	}

	if conf.UseTLS {
		tlsCfg := config.TLS{
			TLSCA:         conf.TLSCA,
			TLSCert:      conf.TLSCert,
			TLSKey:        conf.TLSKey,
			TLSServerName: conf.TLSServerName,
		}
		cfg, err := tlsCfg.ToTLSConfig()
		if err != nil {
			return nil, err
		}
		cfg.InsecureSkipVerify = conf.TLSSkipVerify
		// NOTE(review): DialUseTLS(true) is redundant with the
		// DialUseTLS(conf.UseTLS) added above (conf.UseTLS is true in this
		// branch), but it is harmless.
		opts = append(opts,
			redis.DialTLSConfig(cfg),
			redis.DialUseTLS(true),
			redis.DialTLSHandshakeTimeout(conf.TLSHandshakeTimeout))
	}

	// allow tests to substitute the dial function to simulate dial
	// failures and retries (see PoolConfig.testRedisDialFunc)
	dialFn := redis.Dial
	if conf.testRedisDialFunc != nil {
		dialFn = conf.testRedisDialFunc
	}

	return &redisc.Cluster{
		StartupNodes: []string{conf.Server},
		DialOptions:  opts,
		CreatePool: func(server string, opts ...redis.DialOption) (*redis.Pool, error) {
			return &redis.Pool{
				MaxIdle:         conf.MaxIdleConns,
				MaxActive:       conf.MaxOpenConns,
				IdleTimeout:     conf.IdleTimeout,
				MaxConnLifetime: conf.ConnMaxLifetime,

				Dial: func() (redis.Conn, error) {
					var conn redis.Conn
					// op performs one dial attempt; network errors that look
					// transient are returned as-is (retryable by backoff),
					// anything else is wrapped in backoff.Permanent to stop
					// the retry loop immediately.
					op := func() error {
						c, err := dialFn("tcp", server, opts...)
						var netErr net.Error
						if errors.As(err, &netErr) {
							// NOTE(review): net.Error.Temporary is deprecated
							// since Go 1.18; Timeout alone may be the intended
							// check — confirm before changing.
							if netErr.Temporary() || netErr.Timeout() {
								// retryable error
								return err
							}
						}
						if err != nil {
							// at this point, this is a non-retryable error
							return backoff.Permanent(err)
						}
						// success, store the connection to use
						conn = c
						return nil
					}
					if conf.ConnectRetryAttempts > 0 {
						// exponential backoff, capped at the configured
						// number of retry attempts
						boff := backoff.WithMaxRetries(backoff.NewExponentialBackOff(), uint64(conf.ConnectRetryAttempts))
						if err := backoff.Retry(op, boff); err != nil {
							return nil, err
						}
					} else if err := op(); err != nil {
						return nil, err
					}
					return conn, nil
				},

				// only PING a connection that has been idle for a minute or
				// more, to avoid a round-trip on every checkout
				TestOnBorrow: func(c redis.Conn, t time.Time) error {
					if time.Since(t) < time.Minute {
						return nil
					}
					_, err := c.Do("PING")
					return err
				},
			}, nil
		},
	}, nil
}
// isClusterDisabled reports whether err is the error a standalone Redis
// server returns when it receives a cluster-only command.
func isClusterDisabled(err error) bool {
	const disabledMsg = "ERR This instance has cluster support disabled"
	return strings.Contains(err.Error(), disabledMsg)
}
// On GCP Memorystore the CLUSTER command is entirely unavailable and fails
// with this error. See
// https://cloud.google.com/memorystore/docs/redis/product-constraints#blocked_redis_commands
func isClusterCommandUnknown(err error) bool {
	const unknownMsg = "ERR unknown command `CLUSTER`"
	return strings.Contains(err.Error(), unknownMsg)
}