mirror of
https://github.com/empayre/fleet.git
synced 2024-11-06 00:45:19 +00:00
Add exponential backoff to orbit enroll retries (#17368)
#16594 - [X] Changes file added for user-visible changes in `changes/` or `orbit/changes/`. See [Changes files](https://fleetdm.com/docs/contributing/committing-changes#changes-files) for more information. - [X] Added/updated tests - [X] Manual QA for all new/changed functionality - For Orbit and Fleet Desktop changes: - [X] Manual QA must be performed in the three main OSs, macOS, Windows and Linux. - [X] Auto-update manual QA, from released version of component to new version (see [tools/tuf/test](../tools/tuf/test/README.md)).
This commit is contained in:
parent
0856c44544
commit
e9a464e0cf
@ -1,12 +1,6 @@
|
||||
FROM --platform=linux/amd64 golang:1.21.7-bullseye@sha256:447afe790df28e0bc19d782a9f776a105ce3b8417cdd21f33affc4ed6d38f9d5
|
||||
LABEL maintainer="Fleet Developers"
|
||||
|
||||
RUN apt-get update && apt-get install -y \
|
||||
gcc \
|
||||
libgtk-3-dev \
|
||||
libayatana-appindicator3-dev \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
RUN mkdir -p /usr/src/fleet
|
||||
RUN mkdir -p /output
|
||||
|
||||
|
1
orbit/changes/16594-orbit-enroll-backoff
Normal file
1
orbit/changes/16594-orbit-enroll-backoff
Normal file
@ -0,0 +1 @@
|
||||
* Add exponential backoff to orbit enroll retries.
|
@ -19,10 +19,13 @@ const (
|
||||
DesktopAppExecName = "fleet-desktop"
|
||||
// OrbitNodeKeyFileName is the filename on disk where we write the orbit node key to
|
||||
OrbitNodeKeyFileName = "secret-orbit-node-key.txt"
|
||||
// OrbitEnrollMaxRetries is the max retries when doing an enroll request
|
||||
OrbitEnrollMaxRetries = 3
|
||||
// OrbitEnrollRetrySleep is the time duration to sleep between retries
|
||||
OrbitEnrollRetrySleep = 5 * time.Second
|
||||
// OrbitEnrollMaxRetries is the max number of retries when doing an enroll request.
|
||||
// We set it to 6 to allow the retry backoff to take effect.
|
||||
OrbitEnrollMaxRetries = 6
|
||||
// OrbitEnrollBackoffMultiplier is the multiplier to use for backing off between enroll retries.
|
||||
OrbitEnrollBackoffMultiplier = 2
|
||||
// OrbitEnrollRetrySleep is the duration to sleep between enroll retries.
|
||||
OrbitEnrollRetrySleep = 10 * time.Second
|
||||
// OsquerydName is the name of osqueryd binary
|
||||
// We use osqueryd as name to properly identify the process when listing
|
||||
// running processes/tasks.
|
||||
|
@ -6,18 +6,26 @@ import (
|
||||
)
|
||||
|
||||
type config struct {
|
||||
interval time.Duration
|
||||
initialInterval time.Duration
|
||||
backoffMultiplier int
|
||||
maxAttempts int
|
||||
}
|
||||
|
||||
// Option allows to configure the behavior of retry.Do
|
||||
type Option func(*config)
|
||||
|
||||
// WithRetryInterval allows to specify a custom duration to wait
|
||||
// WithInterval allows to specify a custom duration to wait
|
||||
// between retries.
|
||||
func WithInterval(i time.Duration) Option {
|
||||
return func(c *config) {
|
||||
c.interval = i
|
||||
c.initialInterval = i
|
||||
}
|
||||
}
|
||||
|
||||
// WithBackoffMultiplier allows to specify the backoff multiplier between retries.
|
||||
func WithBackoffMultiplier(m int) Option {
|
||||
return func(c *config) {
|
||||
c.backoffMultiplier = m
|
||||
}
|
||||
}
|
||||
|
||||
@ -37,16 +45,17 @@ func WithMaxAttempts(a int) Option {
|
||||
// seconds
|
||||
func Do(fn func() error, opts ...Option) error {
|
||||
cfg := &config{
|
||||
interval: 30 * time.Second,
|
||||
initialInterval: 30 * time.Second,
|
||||
}
|
||||
for _, opt := range opts {
|
||||
opt(cfg)
|
||||
}
|
||||
|
||||
attempts := 0
|
||||
ticker := time.NewTicker(cfg.interval)
|
||||
ticker := time.NewTicker(cfg.initialInterval)
|
||||
defer ticker.Stop()
|
||||
|
||||
backoff := 1
|
||||
for {
|
||||
attempts++
|
||||
err := fn()
|
||||
@ -58,6 +67,12 @@ func Do(fn func() error, opts ...Option) error {
|
||||
return err
|
||||
}
|
||||
|
||||
if cfg.backoffMultiplier != 0 {
|
||||
interval := time.Duration(backoff) * cfg.initialInterval
|
||||
backoff *= cfg.backoffMultiplier
|
||||
ticker.Reset(interval)
|
||||
}
|
||||
|
||||
<-ticker.C
|
||||
}
|
||||
}
|
||||
|
@ -38,4 +38,34 @@ func TestRetryDo(t *testing.T) {
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, max, count)
|
||||
})
|
||||
|
||||
t.Run("with backoff", func(t *testing.T) {
|
||||
count := 0
|
||||
max := 4
|
||||
start := time.Now()
|
||||
err := Do(func() error {
|
||||
switch count {
|
||||
case 0:
|
||||
require.WithinDuration(t, start, time.Now(), 1*time.Millisecond)
|
||||
case 1:
|
||||
require.WithinDuration(t, start.Add(50*time.Millisecond), time.Now(), 10*time.Millisecond)
|
||||
case 2:
|
||||
require.WithinDuration(t, start.Add((50+100)*time.Millisecond), time.Now(), 10*time.Millisecond)
|
||||
case 3:
|
||||
require.WithinDuration(t, start.Add((50+100+200)*time.Millisecond), time.Now(), 10*time.Millisecond)
|
||||
}
|
||||
count++
|
||||
if count != max {
|
||||
return errTest
|
||||
}
|
||||
return nil
|
||||
},
|
||||
WithInterval(50*time.Millisecond),
|
||||
WithBackoffMultiplier(2),
|
||||
WithMaxAttempts(4),
|
||||
)
|
||||
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, max, count)
|
||||
})
|
||||
}
|
||||
|
@ -302,8 +302,12 @@ func (oc *OrbitClient) getNodeKeyOrEnroll() (string, error) {
|
||||
return err
|
||||
}
|
||||
},
|
||||
retry.WithInterval(OrbitRetryInterval()),
|
||||
// The below configuration means the following retry intervals (exponential backoff):
|
||||
// 10s, 20s, 40s, 80s, 160s and then return the failure (max attempts = 6)
|
||||
// thus executing no more than ~6 enroll request failures every ~5 minutes.
|
||||
retry.WithInterval(orbitEnrollRetryInterval()),
|
||||
retry.WithMaxAttempts(constant.OrbitEnrollMaxRetries),
|
||||
retry.WithBackoffMultiplier(constant.OrbitEnrollBackoffMultiplier),
|
||||
); err != nil {
|
||||
return "", fmt.Errorf("orbit node key enroll failed, attempts=%d", constant.OrbitEnrollMaxRetries)
|
||||
}
|
||||
@ -402,7 +406,7 @@ func (oc *OrbitClient) setLastRecordedError(err error) {
|
||||
oc.lastRecordedErr = fmt.Errorf("%s: %w", time.Now().UTC().Format("2006-01-02T15:04:05Z"), err)
|
||||
}
|
||||
|
||||
func OrbitRetryInterval() time.Duration {
|
||||
func orbitEnrollRetryInterval() time.Duration {
|
||||
interval := os.Getenv("FLEETD_ENROLL_RETRY_INTERVAL")
|
||||
if interval != "" {
|
||||
d, err := time.ParseDuration(interval)
|
||||
|
@ -73,7 +73,9 @@ for system in $SYSTEMS; do
|
||||
# compiling a macOS-arm64 binary requires CGO and a macOS computer (for
|
||||
# Apple keychain, some tables, etc), if this is the case, compile an
|
||||
# universal binary.
|
||||
if [ $system == "macos" ] && [ "$(uname -s)" = "Darwin" ]; then
|
||||
#
|
||||
# NOTE(lucas): Cross-compiling orbit for arm64 from Intel macOS currently fails (CGO error).
|
||||
if [ $system == "macos" ] && [ "$(uname -s)" = "Darwin" ] && [ "$(uname -m)" = "arm64" ]; then
|
||||
CGO_ENABLED=1 \
|
||||
CODESIGN_IDENTITY=$CODESIGN_IDENTITY \
|
||||
ORBIT_VERSION=42 \
|
||||
|
Loading…
Reference in New Issue
Block a user