Orbit to detect 5XX alongside network errors (#17084)

#16423, #16326 

In the [original PR](https://github.com/fleetdm/fleet/pull/16968) we
missed detecting 5XX errors. Fleet usually runs behind a load balancer,
so when Fleet is brought down, orbit still connects successfully but
receives 5XX responses; we need to detect those too.
This commit is contained in:
Lucas Manuel Rodriguez 2024-02-22 14:24:17 -03:00 committed by GitHub
parent 2c383a060f
commit 0642387b32
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 50 additions and 28 deletions

View File

@ -9,7 +9,6 @@ import (
"fmt"
"io"
"io/fs"
"net"
"net/url"
"os"
"os/exec"
@ -783,8 +782,13 @@ func main() {
enrollSecret,
fleetClientCertificate,
orbitHostInfo,
func(err net.Error) {
log.Info().Err(err).Msg("network error")
&service.OnGetConfigErrFuncs{
DebugErrFunc: func(err error) {
log.Debug().Err(err).Msg("get config")
},
OnNetErrFunc: func(err error) {
log.Info().Err(err).Msg("network error")
},
},
)
if err != nil {
@ -1058,8 +1062,13 @@ func main() {
enrollSecret,
fleetClientCertificate,
orbitHostInfo,
func(err net.Error) {
log.Info().Err(err).Msg("network error")
&service.OnGetConfigErrFuncs{
DebugErrFunc: func(err error) {
log.Debug().Err(err).Msg("get config")
},
OnNetErrFunc: func(err error) {
log.Info().Err(err).Msg("network error")
},
},
)
if err != nil {

View File

@ -37,7 +37,7 @@ type OrbitClient struct {
lastRecordedErr error
configCache configCache
onNetErrOnGetConfigFn OnNetErrOnGetConfigFunc
onGetConfigErrFns *OnGetConfigErrFuncs
lastNetErrOnGetConfigLogged time.Time
// TestNodeKey is used for testing only.
@ -87,8 +87,14 @@ func (oc *OrbitClient) request(verb string, path string, params interface{}, res
return nil
}
// OnNetErrOnGetConfigFunc is a function executed when there are network errors in GetConfig.
type OnNetErrOnGetConfigFunc func(err net.Error)
// OnGetConfigErrFuncs defines functions to be executed on GetConfig errors.
// Each callback is optional: GetConfig checks for nil before invoking it.
type OnGetConfigErrFuncs struct {
// OnNetErrFunc receives network errors and status code errors with
// code >= 500 (5XX) on GetConfig requests. GetConfig keeps retrying on
// these errors, but invocations of this callback are rate limited to
// once every netErrInterval (5 minutes).
OnNetErrFunc func(err error)
// DebugErrFunc receives every non-nil error returned by a GetConfig
// request attempt, including each retry; it is not rate limited.
DebugErrFunc func(err error)
}
var (
netErrInterval = 5 * time.Minute
@ -100,8 +106,7 @@ var (
// - rootDir is the Orbit's root directory, where the Orbit node key is loaded-from/stored.
// - addr is the address of the Fleet server.
// - orbitHostInfo is the host system information used for enrolling to Fleet.
// - OnNetErrOnGetConfigFn is called when there's a network error in GetConfig (this method
// is rate limited to be executed once every 5 minutes).
// - onGetConfigErrFns can be used to handle errors in the GetConfig request.
func NewOrbitClient(
rootDir string,
addr string,
@ -110,7 +115,7 @@ func NewOrbitClient(
enrollSecret string,
fleetClientCert *tls.Certificate,
orbitHostInfo fleet.OrbitHostInfo,
onNetErrOnGetConfigFn OnNetErrOnGetConfigFunc,
onGetConfigErrFns *OnGetConfigErrFuncs,
) (*OrbitClient, error) {
orbitCapabilities := fleet.CapabilityMap{}
bc, err := newBaseClient(addr, insecureSkipVerify, rootCA, "", fleetClientCert, orbitCapabilities)
@ -120,12 +125,12 @@ func NewOrbitClient(
nodeKeyFilePath := filepath.Join(rootDir, constant.OrbitNodeKeyFileName)
return &OrbitClient{
nodeKeyFilePath: nodeKeyFilePath,
baseClient: bc,
enrollSecret: enrollSecret,
hostInfo: orbitHostInfo,
enrolled: false,
onNetErrOnGetConfigFn: onNetErrOnGetConfigFn,
nodeKeyFilePath: nodeKeyFilePath,
baseClient: bc,
enrollSecret: enrollSecret,
hostInfo: orbitHostInfo,
enrolled: false,
onGetConfigErrFns: onGetConfigErrFns,
}, nil
}
@ -140,22 +145,28 @@ func (oc *OrbitClient) GetConfig() (*fleet.OrbitConfig, error) {
// If time-to-live passed, we update the config cache
now := time.Now()
if now.After(oc.configCache.lastUpdated.Add(configCacheTTL)) {
verb, path := "POST", "/api/fleet/orbit/config"
var (
resp fleet.OrbitConfig
err error
)
verb, path := "POST", "/api/fleet/orbit/config"
// Retry until we don't get a network error.
// Retry until we don't get a network error or a 5XX error.
_ = retry.Do(func() error {
err = oc.authenticatedRequest(verb, path, &orbitGetConfigRequest{}, &resp)
var netErr net.Error
if errors.As(err, &netErr) {
var (
netErr net.Error
statusCodeErr *statusCodeErr
)
if err != nil && oc.onGetConfigErrFns != nil && oc.onGetConfigErrFns.DebugErrFunc != nil {
oc.onGetConfigErrFns.DebugErrFunc(err)
}
if errors.As(err, &netErr) || (errors.As(err, &statusCodeErr) && statusCodeErr.code >= 500) {
now := time.Now()
if oc.onNetErrOnGetConfigFn != nil && now.After(oc.lastNetErrOnGetConfigLogged.Add(netErrInterval)) {
oc.onNetErrOnGetConfigFn(netErr)
if oc.onGetConfigErrFns != nil && oc.onGetConfigErrFns.OnNetErrFunc != nil && now.After(oc.lastNetErrOnGetConfigLogged.Add(netErrInterval)) {
oc.onGetConfigErrFns.OnNetErrFunc(err)
oc.lastNetErrOnGetConfigLogged = now
}
return err // retry on network errors
return err // retry on network or server 5XX errors
}
return nil
}, retry.WithInterval(configRetryOnNetworkError))

View File

@ -35,6 +35,7 @@ GENERATE_MSI=1 \
ENROLL_SECRET=6/EzU/+jPkxfTamWnRv1+IJsO4T9Etju \
FLEET_DESKTOP=1 \
USE_FLEET_SERVER_CERTIFICATE=1 \
DEBUG=1 \
./tools/tuf/test/main.sh
```

View File

@ -29,6 +29,7 @@ set -ex
# USE_FLEET_SERVER_CERTIFICATE: Whether to use a custom certificate bundle.
# USE_UPDATE_SERVER_CERTIFICATE: Whether to use a custom certificate bundle.
# FLEET_DESKTOP_ALTERNATIVE_BROWSER_HOST: Alternative host:port to use for the Fleet Desktop browser URLs.
# DEBUG: Whether or not to build the package with --debug.
if [ -n "$GENERATE_PKG" ]; then
echo "Generating pkg..."
@ -40,7 +41,7 @@ if [ -n "$GENERATE_PKG" ]; then
${USE_FLEET_SERVER_CERTIFICATE:+--fleet-certificate=./tools/osquery/fleet.crt} \
${USE_UPDATE_SERVER_CERTIFICATE:+--update-tls-certificate=./tools/osquery/fleet.crt} \
${INSECURE:+--insecure} \
--debug \
${DEBUG:+--debug} \
--update-roots="$ROOT_KEYS" \
--update-interval=10s \
--disable-open-folder \
@ -64,7 +65,7 @@ if [ -n "$GENERATE_DEB" ]; then
${USE_FLEET_SERVER_CERTIFICATE:+--fleet-certificate=./tools/osquery/fleet.crt} \
${USE_UPDATE_SERVER_CERTIFICATE:+--update-tls-certificate=./tools/osquery/fleet.crt} \
${INSECURE:+--insecure} \
--debug \
${DEBUG:+--debug} \
--update-roots="$ROOT_KEYS" \
--update-interval=10s \
--disable-open-folder \
@ -87,7 +88,7 @@ if [ -n "$GENERATE_RPM" ]; then
${USE_FLEET_SERVER_CERTIFICATE:+--fleet-certificate=./tools/osquery/fleet.crt} \
${USE_UPDATE_SERVER_CERTIFICATE:+--update-tls-certificate=./tools/osquery/fleet.crt} \
${INSECURE:+--insecure} \
--debug \
${DEBUG:+--debug} \
--update-roots="$ROOT_KEYS" \
--update-interval=10s \
--disable-open-folder \
@ -110,7 +111,7 @@ if [ -n "$GENERATE_MSI" ]; then
${USE_FLEET_SERVER_CERTIFICATE:+--fleet-certificate=./tools/osquery/fleet.crt} \
${USE_UPDATE_SERVER_CERTIFICATE:+--update-tls-certificate=./tools/osquery/fleet.crt} \
${INSECURE:+--insecure} \
--debug \
${DEBUG:+--debug} \
--update-roots="$ROOT_KEYS" \
--update-interval=10s \
--disable-open-folder \