Orbit to detect 5XX alongside network errors (#17084)

#16423, #16326 

In the [original PR](https://github.com/fleetdm/fleet/pull/16968) we
did not detect 5XX errors. Fleet usually runs behind a load balancer,
so when Fleet is brought down, Orbit still connects successfully but
receives 5XX responses; those need to be detected too.
This commit is contained in:
Lucas Manuel Rodriguez 2024-02-22 14:24:17 -03:00 committed by GitHub
parent 2c383a060f
commit 0642387b32
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 50 additions and 28 deletions

View File

@ -9,7 +9,6 @@ import (
"fmt" "fmt"
"io" "io"
"io/fs" "io/fs"
"net"
"net/url" "net/url"
"os" "os"
"os/exec" "os/exec"
@ -783,8 +782,13 @@ func main() {
enrollSecret, enrollSecret,
fleetClientCertificate, fleetClientCertificate,
orbitHostInfo, orbitHostInfo,
func(err net.Error) { &service.OnGetConfigErrFuncs{
log.Info().Err(err).Msg("network error") DebugErrFunc: func(err error) {
log.Debug().Err(err).Msg("get config")
},
OnNetErrFunc: func(err error) {
log.Info().Err(err).Msg("network error")
},
}, },
) )
if err != nil { if err != nil {
@ -1058,8 +1062,13 @@ func main() {
enrollSecret, enrollSecret,
fleetClientCertificate, fleetClientCertificate,
orbitHostInfo, orbitHostInfo,
func(err net.Error) { &service.OnGetConfigErrFuncs{
log.Info().Err(err).Msg("network error") DebugErrFunc: func(err error) {
log.Debug().Err(err).Msg("get config")
},
OnNetErrFunc: func(err error) {
log.Info().Err(err).Msg("network error")
},
}, },
) )
if err != nil { if err != nil {

View File

@ -37,7 +37,7 @@ type OrbitClient struct {
lastRecordedErr error lastRecordedErr error
configCache configCache configCache configCache
onNetErrOnGetConfigFn OnNetErrOnGetConfigFunc onGetConfigErrFns *OnGetConfigErrFuncs
lastNetErrOnGetConfigLogged time.Time lastNetErrOnGetConfigLogged time.Time
// TestNodeKey is used for testing only. // TestNodeKey is used for testing only.
@ -87,8 +87,14 @@ func (oc *OrbitClient) request(verb string, path string, params interface{}, res
return nil return nil
} }
// OnNetErrOnGetConfigFunc is a function executed when there are network errors in GetConfig. // OnGetConfigErrFuncs defines functions to be executed on GetConfig errors.
type OnNetErrOnGetConfigFunc func(err net.Error) type OnGetConfigErrFuncs struct {
// OnNetErrFunc receives network and 5XX errors on GetConfig requests.
// These errors are rate limited to once every 5 minutes.
OnNetErrFunc func(err error)
// DebugErrFunc receives all errors on GetConfig requests.
DebugErrFunc func(err error)
}
var ( var (
netErrInterval = 5 * time.Minute netErrInterval = 5 * time.Minute
@ -100,8 +106,7 @@ var (
// - rootDir is the Orbit's root directory, where the Orbit node key is loaded-from/stored. // - rootDir is the Orbit's root directory, where the Orbit node key is loaded-from/stored.
// - addr is the address of the Fleet server. // - addr is the address of the Fleet server.
// - orbitHostInfo is the host system information used for enrolling to Fleet. // - orbitHostInfo is the host system information used for enrolling to Fleet.
// - OnNetErrOnGetConfigFn is called when there's a network error in GetConfig (this method // - onGetConfigErrFns can be used to handle errors in the GetConfig request.
// is rate limited to be executed once every 5 minutes).
func NewOrbitClient( func NewOrbitClient(
rootDir string, rootDir string,
addr string, addr string,
@ -110,7 +115,7 @@ func NewOrbitClient(
enrollSecret string, enrollSecret string,
fleetClientCert *tls.Certificate, fleetClientCert *tls.Certificate,
orbitHostInfo fleet.OrbitHostInfo, orbitHostInfo fleet.OrbitHostInfo,
onNetErrOnGetConfigFn OnNetErrOnGetConfigFunc, onGetConfigErrFns *OnGetConfigErrFuncs,
) (*OrbitClient, error) { ) (*OrbitClient, error) {
orbitCapabilities := fleet.CapabilityMap{} orbitCapabilities := fleet.CapabilityMap{}
bc, err := newBaseClient(addr, insecureSkipVerify, rootCA, "", fleetClientCert, orbitCapabilities) bc, err := newBaseClient(addr, insecureSkipVerify, rootCA, "", fleetClientCert, orbitCapabilities)
@ -120,12 +125,12 @@ func NewOrbitClient(
nodeKeyFilePath := filepath.Join(rootDir, constant.OrbitNodeKeyFileName) nodeKeyFilePath := filepath.Join(rootDir, constant.OrbitNodeKeyFileName)
return &OrbitClient{ return &OrbitClient{
nodeKeyFilePath: nodeKeyFilePath, nodeKeyFilePath: nodeKeyFilePath,
baseClient: bc, baseClient: bc,
enrollSecret: enrollSecret, enrollSecret: enrollSecret,
hostInfo: orbitHostInfo, hostInfo: orbitHostInfo,
enrolled: false, enrolled: false,
onNetErrOnGetConfigFn: onNetErrOnGetConfigFn, onGetConfigErrFns: onGetConfigErrFns,
}, nil }, nil
} }
@ -140,22 +145,28 @@ func (oc *OrbitClient) GetConfig() (*fleet.OrbitConfig, error) {
// If time-to-live passed, we update the config cache // If time-to-live passed, we update the config cache
now := time.Now() now := time.Now()
if now.After(oc.configCache.lastUpdated.Add(configCacheTTL)) { if now.After(oc.configCache.lastUpdated.Add(configCacheTTL)) {
verb, path := "POST", "/api/fleet/orbit/config"
var ( var (
resp fleet.OrbitConfig resp fleet.OrbitConfig
err error err error
) )
verb, path := "POST", "/api/fleet/orbit/config" // Retry until we don't get a network error or a 5XX error.
// Retry until we don't get a network error.
_ = retry.Do(func() error { _ = retry.Do(func() error {
err = oc.authenticatedRequest(verb, path, &orbitGetConfigRequest{}, &resp) err = oc.authenticatedRequest(verb, path, &orbitGetConfigRequest{}, &resp)
var netErr net.Error var (
if errors.As(err, &netErr) { netErr net.Error
statusCodeErr *statusCodeErr
)
if err != nil && oc.onGetConfigErrFns != nil && oc.onGetConfigErrFns.DebugErrFunc != nil {
oc.onGetConfigErrFns.DebugErrFunc(err)
}
if errors.As(err, &netErr) || (errors.As(err, &statusCodeErr) && statusCodeErr.code >= 500) {
now := time.Now() now := time.Now()
if oc.onNetErrOnGetConfigFn != nil && now.After(oc.lastNetErrOnGetConfigLogged.Add(netErrInterval)) { if oc.onGetConfigErrFns != nil && oc.onGetConfigErrFns.OnNetErrFunc != nil && now.After(oc.lastNetErrOnGetConfigLogged.Add(netErrInterval)) {
oc.onNetErrOnGetConfigFn(netErr) oc.onGetConfigErrFns.OnNetErrFunc(err)
oc.lastNetErrOnGetConfigLogged = now oc.lastNetErrOnGetConfigLogged = now
} }
return err // retry on network errors return err // retry on network or server 5XX errors
} }
return nil return nil
}, retry.WithInterval(configRetryOnNetworkError)) }, retry.WithInterval(configRetryOnNetworkError))

View File

@ -35,6 +35,7 @@ GENERATE_MSI=1 \
ENROLL_SECRET=6/EzU/+jPkxfTamWnRv1+IJsO4T9Etju \ ENROLL_SECRET=6/EzU/+jPkxfTamWnRv1+IJsO4T9Etju \
FLEET_DESKTOP=1 \ FLEET_DESKTOP=1 \
USE_FLEET_SERVER_CERTIFICATE=1 \ USE_FLEET_SERVER_CERTIFICATE=1 \
DEBUG=1 \
./tools/tuf/test/main.sh ./tools/tuf/test/main.sh
``` ```

View File

@ -29,6 +29,7 @@ set -ex
# USE_FLEET_SERVER_CERTIFICATE: Whether to use a custom certificate bundle. # USE_FLEET_SERVER_CERTIFICATE: Whether to use a custom certificate bundle.
# USE_UPDATE_SERVER_CERTIFICATE: Whether to use a custom certificate bundle. # USE_UPDATE_SERVER_CERTIFICATE: Whether to use a custom certificate bundle.
# FLEET_DESKTOP_ALTERNATIVE_BROWSER_HOST: Alternative host:port to use for the Fleet Desktop browser URLs. # FLEET_DESKTOP_ALTERNATIVE_BROWSER_HOST: Alternative host:port to use for the Fleet Desktop browser URLs.
# DEBUG: Whether or not to build the package with --debug.
if [ -n "$GENERATE_PKG" ]; then if [ -n "$GENERATE_PKG" ]; then
echo "Generating pkg..." echo "Generating pkg..."
@ -40,7 +41,7 @@ if [ -n "$GENERATE_PKG" ]; then
${USE_FLEET_SERVER_CERTIFICATE:+--fleet-certificate=./tools/osquery/fleet.crt} \ ${USE_FLEET_SERVER_CERTIFICATE:+--fleet-certificate=./tools/osquery/fleet.crt} \
${USE_UPDATE_SERVER_CERTIFICATE:+--update-tls-certificate=./tools/osquery/fleet.crt} \ ${USE_UPDATE_SERVER_CERTIFICATE:+--update-tls-certificate=./tools/osquery/fleet.crt} \
${INSECURE:+--insecure} \ ${INSECURE:+--insecure} \
--debug \ ${DEBUG:+--debug} \
--update-roots="$ROOT_KEYS" \ --update-roots="$ROOT_KEYS" \
--update-interval=10s \ --update-interval=10s \
--disable-open-folder \ --disable-open-folder \
@ -64,7 +65,7 @@ if [ -n "$GENERATE_DEB" ]; then
${USE_FLEET_SERVER_CERTIFICATE:+--fleet-certificate=./tools/osquery/fleet.crt} \ ${USE_FLEET_SERVER_CERTIFICATE:+--fleet-certificate=./tools/osquery/fleet.crt} \
${USE_UPDATE_SERVER_CERTIFICATE:+--update-tls-certificate=./tools/osquery/fleet.crt} \ ${USE_UPDATE_SERVER_CERTIFICATE:+--update-tls-certificate=./tools/osquery/fleet.crt} \
${INSECURE:+--insecure} \ ${INSECURE:+--insecure} \
--debug \ ${DEBUG:+--debug} \
--update-roots="$ROOT_KEYS" \ --update-roots="$ROOT_KEYS" \
--update-interval=10s \ --update-interval=10s \
--disable-open-folder \ --disable-open-folder \
@ -87,7 +88,7 @@ if [ -n "$GENERATE_RPM" ]; then
${USE_FLEET_SERVER_CERTIFICATE:+--fleet-certificate=./tools/osquery/fleet.crt} \ ${USE_FLEET_SERVER_CERTIFICATE:+--fleet-certificate=./tools/osquery/fleet.crt} \
${USE_UPDATE_SERVER_CERTIFICATE:+--update-tls-certificate=./tools/osquery/fleet.crt} \ ${USE_UPDATE_SERVER_CERTIFICATE:+--update-tls-certificate=./tools/osquery/fleet.crt} \
${INSECURE:+--insecure} \ ${INSECURE:+--insecure} \
--debug \ ${DEBUG:+--debug} \
--update-roots="$ROOT_KEYS" \ --update-roots="$ROOT_KEYS" \
--update-interval=10s \ --update-interval=10s \
--disable-open-folder \ --disable-open-folder \
@ -110,7 +111,7 @@ if [ -n "$GENERATE_MSI" ]; then
${USE_FLEET_SERVER_CERTIFICATE:+--fleet-certificate=./tools/osquery/fleet.crt} \ ${USE_FLEET_SERVER_CERTIFICATE:+--fleet-certificate=./tools/osquery/fleet.crt} \
${USE_UPDATE_SERVER_CERTIFICATE:+--update-tls-certificate=./tools/osquery/fleet.crt} \ ${USE_UPDATE_SERVER_CERTIFICATE:+--update-tls-certificate=./tools/osquery/fleet.crt} \
${INSECURE:+--insecure} \ ${INSECURE:+--insecure} \
--debug \ ${DEBUG:+--debug} \
--update-roots="$ROOT_KEYS" \ --update-roots="$ROOT_KEYS" \
--update-interval=10s \ --update-interval=10s \
--disable-open-folder \ --disable-open-folder \