From e9a464e0cf2ea487f87347c8f78ab751324f0283 Mon Sep 17 00:00:00 2001 From: Lucas Manuel Rodriguez Date: Wed, 13 Mar 2024 07:57:00 -0300 Subject: [PATCH] Add exponential backoff to orbit enroll retries (#17368) #16594 - [X] Changes file added for user-visible changes in `changes/` or `orbit/changes/`. See [Changes files](https://fleetdm.com/docs/contributing/committing-changes#changes-files) for more information. - [X] Added/updated tests - [X] Manual QA for all new/changed functionality - For Orbit and Fleet Desktop changes: - [X] Manual QA must be performed in the three main OSs, macOS, Windows and Linux. - [X] Auto-update manual QA, from released version of component to new version (see [tools/tuf/test](../tools/tuf/test/README.md)). --- Dockerfile-desktop-linux | 6 ----- orbit/changes/16594-orbit-enroll-backoff | 1 + orbit/pkg/constant/constant.go | 11 +++++---- pkg/retry/retry.go | 27 ++++++++++++++++----- pkg/retry/retry_test.go | 30 ++++++++++++++++++++++++ server/service/orbit_client.go | 8 +++++-- tools/tuf/test/create_repository.sh | 4 +++- 7 files changed, 68 insertions(+), 19 deletions(-) create mode 100644 orbit/changes/16594-orbit-enroll-backoff diff --git a/Dockerfile-desktop-linux b/Dockerfile-desktop-linux index eb3828939..592fe3b04 100644 --- a/Dockerfile-desktop-linux +++ b/Dockerfile-desktop-linux @@ -1,12 +1,6 @@ FROM --platform=linux/amd64 golang:1.21.7-bullseye@sha256:447afe790df28e0bc19d782a9f776a105ce3b8417cdd21f33affc4ed6d38f9d5 LABEL maintainer="Fleet Developers" -RUN apt-get update && apt-get install -y \ - gcc \ - libgtk-3-dev \ - libayatana-appindicator3-dev \ - && rm -rf /var/lib/apt/lists/* - RUN mkdir -p /usr/src/fleet RUN mkdir -p /output diff --git a/orbit/changes/16594-orbit-enroll-backoff b/orbit/changes/16594-orbit-enroll-backoff new file mode 100644 index 000000000..01ef59980 --- /dev/null +++ b/orbit/changes/16594-orbit-enroll-backoff @@ -0,0 +1 @@ +* Add exponential backoff to orbit enroll retries. diff --git a/orbit/pkg/constant/constant.go b/orbit/pkg/constant/constant.go index 931b48e23..8a11160e5 100644 --- a/orbit/pkg/constant/constant.go +++ b/orbit/pkg/constant/constant.go @@ -19,10 +19,13 @@ const ( DesktopAppExecName = "fleet-desktop" // OrbitNodeKeyFileName is the filename on disk where we write the orbit node key to OrbitNodeKeyFileName = "secret-orbit-node-key.txt" - // OrbitEnrollMaxRetries is the max retries when doing an enroll request - OrbitEnrollMaxRetries = 3 - // OrbitEnrollRetrySleep is the time duration to sleep between retries - OrbitEnrollRetrySleep = 5 * time.Second + // OrbitEnrollMaxRetries is the max number of retries when doing an enroll request. + // We set it to 6 to allow the retry backoff to take effect. + OrbitEnrollMaxRetries = 6 + // OrbitEnrollBackoffMultiplier is the multiplier to use for backing off between enroll retries. + OrbitEnrollBackoffMultiplier = 2 + // OrbitEnrollRetrySleep is the duration to sleep between enroll retries. + OrbitEnrollRetrySleep = 10 * time.Second // OsquerydName is the name of osqueryd binary // We use osqueryd as name to properly identify the process when listing // running processes/tasks. diff --git a/pkg/retry/retry.go b/pkg/retry/retry.go index 19d0de8b2..b25267b78 100644 --- a/pkg/retry/retry.go +++ b/pkg/retry/retry.go @@ -6,18 +6,26 @@ import ( ) type config struct { - interval time.Duration - maxAttempts int + initialInterval time.Duration + backoffMultiplier int + maxAttempts int } // Option allows to configure the behavior of retry.Do type Option func(*config) -// WithRetryInterval allows to specify a custom duration to wait +// WithInterval allows to specify a custom duration to wait // between retries. func WithInterval(i time.Duration) Option { return func(c *config) { - c.interval = i + c.initialInterval = i + } +} + +// WithBackoffMultiplier allows to specify the backoff multiplier between retries. +func WithBackoffMultiplier(m int) Option { + return func(c *config) { + c.backoffMultiplier = m } } @@ -37,16 +45,17 @@ func WithMaxAttempts(a int) Option { // seconds func Do(fn func() error, opts ...Option) error { cfg := &config{ - interval: 30 * time.Second, + initialInterval: 30 * time.Second, } for _, opt := range opts { opt(cfg) } attempts := 0 - ticker := time.NewTicker(cfg.interval) + ticker := time.NewTicker(cfg.initialInterval) defer ticker.Stop() + backoff := 1 for { attempts++ err := fn() @@ -58,6 +67,12 @@ func Do(fn func() error, opts ...Option) error { return err } + if cfg.backoffMultiplier != 0 { + interval := time.Duration(backoff) * cfg.initialInterval + backoff *= cfg.backoffMultiplier + ticker.Reset(interval) + } + <-ticker.C } } diff --git a/pkg/retry/retry_test.go b/pkg/retry/retry_test.go index 7907778ce..9424e93d1 100644 --- a/pkg/retry/retry_test.go +++ b/pkg/retry/retry_test.go @@ -38,4 +38,34 @@ func TestRetryDo(t *testing.T) { require.NoError(t, err) require.Equal(t, max, count) }) + + t.Run("with backoff", func(t *testing.T) { + count := 0 + max := 4 + start := time.Now() + err := Do(func() error { + switch count { + case 0: + require.WithinDuration(t, start, time.Now(), 1*time.Millisecond) + case 1: + require.WithinDuration(t, start.Add(50*time.Millisecond), time.Now(), 10*time.Millisecond) + case 2: + require.WithinDuration(t, start.Add((50+100)*time.Millisecond), time.Now(), 10*time.Millisecond) + case 3: + require.WithinDuration(t, start.Add((50+100+200)*time.Millisecond), time.Now(), 10*time.Millisecond) + } + count++ + if count != max { + return errTest + } + return nil + }, + WithInterval(50*time.Millisecond), + WithBackoffMultiplier(2), + WithMaxAttempts(4), + ) + + require.NoError(t, err) + require.Equal(t, max, count) + }) } diff --git a/server/service/orbit_client.go b/server/service/orbit_client.go index f171de55c..0203319b1 100644 --- a/server/service/orbit_client.go +++ b/server/service/orbit_client.go @@ -302,8 +302,12 @@ func (oc *OrbitClient) getNodeKeyOrEnroll() (string, error) { return err } }, - retry.WithInterval(OrbitRetryInterval()), + // The below configuration means the following retry intervals (exponential backoff): + // 10s, 20s, 40s, 80s, 160s and then return the failure (max attempts = 6) + // thus executing no more than ~6 enroll request failures every ~5 minutes. + retry.WithInterval(orbitEnrollRetryInterval()), retry.WithMaxAttempts(constant.OrbitEnrollMaxRetries), + retry.WithBackoffMultiplier(constant.OrbitEnrollBackoffMultiplier), ); err != nil { return "", fmt.Errorf("orbit node key enroll failed, attempts=%d", constant.OrbitEnrollMaxRetries) } @@ -402,7 +406,7 @@ func (oc *OrbitClient) setLastRecordedError(err error) { oc.lastRecordedErr = fmt.Errorf("%s: %w", time.Now().UTC().Format("2006-01-02T15:04:05Z"), err) } -func OrbitRetryInterval() time.Duration { +func orbitEnrollRetryInterval() time.Duration { interval := os.Getenv("FLEETD_ENROLL_RETRY_INTERVAL") if interval != "" { d, err := time.ParseDuration(interval) diff --git a/tools/tuf/test/create_repository.sh b/tools/tuf/test/create_repository.sh index 2633f096b..8772db5b3 100755 --- a/tools/tuf/test/create_repository.sh +++ b/tools/tuf/test/create_repository.sh @@ -73,7 +73,9 @@ for system in $SYSTEMS; do # compiling a macOS-arm64 binary requires CGO and a macOS computer (for # Apple keychain, some tables, etc), if this is the case, compile an # universal binary. - if [ $system == "macos" ] && [ "$(uname -s)" = "Darwin" ]; then + # + # NOTE(lucas): Cross-compiling orbit for arm64 from Intel macOS currently fails (CGO error). + if [ $system == "macos" ] && [ "$(uname -s)" = "Darwin" ] && [ "$(uname -m)" = "arm64" ]; then CGO_ENABLED=1 \ CODESIGN_IDENTITY=$CODESIGN_IDENTITY \ ORBIT_VERSION=42 \