Add exponential backoff to orbit enroll retries (#17368)

#16594

- [X] Changes file added for user-visible changes in `changes/` or
`orbit/changes/`.
See [Changes
files](https://fleetdm.com/docs/contributing/committing-changes#changes-files)
for more information.
- [X] Added/updated tests
- [X] Manual QA for all new/changed functionality
  - For Orbit and Fleet Desktop changes:
- [X] Manual QA must be performed in the three main OSs, macOS, Windows
and Linux.
- [X] Auto-update manual QA, from released version of component to new
version (see [tools/tuf/test](../tools/tuf/test/README.md)).
This commit is contained in:
Lucas Manuel Rodriguez 2024-03-13 07:57:00 -03:00 committed by GitHub
parent 0856c44544
commit e9a464e0cf
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 68 additions and 19 deletions

View File

@ -1,12 +1,6 @@
FROM --platform=linux/amd64 golang:1.21.7-bullseye@sha256:447afe790df28e0bc19d782a9f776a105ce3b8417cdd21f33affc4ed6d38f9d5
LABEL maintainer="Fleet Developers"
RUN apt-get update && apt-get install -y \
gcc \
libgtk-3-dev \
libayatana-appindicator3-dev \
&& rm -rf /var/lib/apt/lists/*
RUN mkdir -p /usr/src/fleet
RUN mkdir -p /output

View File

@ -0,0 +1 @@
* Add exponential backoff to orbit enroll retries.

View File

@ -19,10 +19,13 @@ const (
DesktopAppExecName = "fleet-desktop"
// OrbitNodeKeyFileName is the filename on disk where we write the orbit node key to
OrbitNodeKeyFileName = "secret-orbit-node-key.txt"
// OrbitEnrollMaxRetries is the max retries when doing an enroll request
OrbitEnrollMaxRetries = 3
// OrbitEnrollRetrySleep is the time duration to sleep between retries
OrbitEnrollRetrySleep = 5 * time.Second
// OrbitEnrollMaxRetries is the max number of retries when doing an enroll request.
// We set it to 6 to allow the retry backoff to take effect.
OrbitEnrollMaxRetries = 6
// OrbitEnrollBackoffMultiplier is the multiplier to use for backing off between enroll retries.
OrbitEnrollBackoffMultiplier = 2
// OrbitEnrollRetrySleep is the duration to sleep between enroll retries.
OrbitEnrollRetrySleep = 10 * time.Second
// OsquerydName is the name of osqueryd binary
// We use osqueryd as name to properly identify the process when listing
// running processes/tasks.

View File

@ -6,18 +6,26 @@ import (
)
type config struct {
interval time.Duration
maxAttempts int
initialInterval time.Duration
backoffMultiplier int
maxAttempts int
}
// Option allows to configure the behavior of retry.Do
type Option func(*config)
// WithRetryInterval allows to specify a custom duration to wait
// WithInterval allows to specify a custom duration to wait
// between retries.
func WithInterval(i time.Duration) Option {
return func(c *config) {
c.interval = i
c.initialInterval = i
}
}
// WithBackoffMultiplier allows to specify the backoff multiplier between retries.
func WithBackoffMultiplier(m int) Option {
return func(c *config) {
c.backoffMultiplier = m
}
}
@ -37,16 +45,17 @@ func WithMaxAttempts(a int) Option {
// seconds
func Do(fn func() error, opts ...Option) error {
cfg := &config{
interval: 30 * time.Second,
initialInterval: 30 * time.Second,
}
for _, opt := range opts {
opt(cfg)
}
attempts := 0
ticker := time.NewTicker(cfg.interval)
ticker := time.NewTicker(cfg.initialInterval)
defer ticker.Stop()
backoff := 1
for {
attempts++
err := fn()
@ -58,6 +67,12 @@ func Do(fn func() error, opts ...Option) error {
return err
}
if cfg.backoffMultiplier != 0 {
interval := time.Duration(backoff) * cfg.initialInterval
backoff *= cfg.backoffMultiplier
ticker.Reset(interval)
}
<-ticker.C
}
}

View File

@ -38,4 +38,34 @@ func TestRetryDo(t *testing.T) {
require.NoError(t, err)
require.Equal(t, max, count)
})
t.Run("with backoff", func(t *testing.T) {
count := 0
max := 4
start := time.Now()
err := Do(func() error {
switch count {
case 0:
require.WithinDuration(t, start, time.Now(), 1*time.Millisecond)
case 1:
require.WithinDuration(t, start.Add(50*time.Millisecond), time.Now(), 10*time.Millisecond)
case 2:
require.WithinDuration(t, start.Add((50+100)*time.Millisecond), time.Now(), 10*time.Millisecond)
case 3:
require.WithinDuration(t, start.Add((50+100+200)*time.Millisecond), time.Now(), 10*time.Millisecond)
}
count++
if count != max {
return errTest
}
return nil
},
WithInterval(50*time.Millisecond),
WithBackoffMultiplier(2),
WithMaxAttempts(4),
)
require.NoError(t, err)
require.Equal(t, max, count)
})
}

View File

@ -302,8 +302,12 @@ func (oc *OrbitClient) getNodeKeyOrEnroll() (string, error) {
return err
}
},
retry.WithInterval(OrbitRetryInterval()),
// The below configuration means the following retry intervals (exponential backoff):
// 10s, 20s, 40s, 80s, 160s and then return the failure (max attempts = 6)
// thus executing no more than ~6 enroll request failures every ~5 minutes.
retry.WithInterval(orbitEnrollRetryInterval()),
retry.WithMaxAttempts(constant.OrbitEnrollMaxRetries),
retry.WithBackoffMultiplier(constant.OrbitEnrollBackoffMultiplier),
); err != nil {
return "", fmt.Errorf("orbit node key enroll failed, attempts=%d", constant.OrbitEnrollMaxRetries)
}
@ -402,7 +406,7 @@ func (oc *OrbitClient) setLastRecordedError(err error) {
oc.lastRecordedErr = fmt.Errorf("%s: %w", time.Now().UTC().Format("2006-01-02T15:04:05Z"), err)
}
func OrbitRetryInterval() time.Duration {
func orbitEnrollRetryInterval() time.Duration {
interval := os.Getenv("FLEETD_ENROLL_RETRY_INTERVAL")
if interval != "" {
d, err := time.ParseDuration(interval)

View File

@ -73,7 +73,9 @@ for system in $SYSTEMS; do
# compiling a macOS-arm64 binary requires CGO and a macOS computer (for
# Apple keychain, some tables, etc), if this is the case, compile an
# universal binary.
if [ $system == "macos" ] && [ "$(uname -s)" = "Darwin" ]; then
#
# NOTE(lucas): Cross-compiling orbit for arm64 from Intel macOS currently fails (CGO error).
if [ $system == "macos" ] && [ "$(uname -s)" = "Darwin" ] && [ "$(uname -m)" = "arm64" ]; then
CGO_ENABLED=1 \
CODESIGN_IDENTITY=$CODESIGN_IDENTITY \
ORBIT_VERSION=42 \