mirror of
https://github.com/empayre/fleet.git
synced 2024-11-06 00:45:19 +00:00
Add exponential backoff to orbit enroll retries (#17368)
#16594

- [X] Changes file added for user-visible changes in `changes/` or `orbit/changes/`. See [Changes files](https://fleetdm.com/docs/contributing/committing-changes#changes-files) for more information.
- [X] Added/updated tests
- [X] Manual QA for all new/changed functionality
  - For Orbit and Fleet Desktop changes:
    - [X] Manual QA must be performed in the three main OSs, macOS, Windows and Linux.
    - [X] Auto-update manual QA, from released version of component to new version (see [tools/tuf/test](../tools/tuf/test/README.md)).
This commit is contained in:
parent
0856c44544
commit
e9a464e0cf
@ -1,12 +1,6 @@
|
|||||||
FROM --platform=linux/amd64 golang:1.21.7-bullseye@sha256:447afe790df28e0bc19d782a9f776a105ce3b8417cdd21f33affc4ed6d38f9d5
|
FROM --platform=linux/amd64 golang:1.21.7-bullseye@sha256:447afe790df28e0bc19d782a9f776a105ce3b8417cdd21f33affc4ed6d38f9d5
|
||||||
LABEL maintainer="Fleet Developers"
|
LABEL maintainer="Fleet Developers"
|
||||||
|
|
||||||
RUN apt-get update && apt-get install -y \
|
|
||||||
gcc \
|
|
||||||
libgtk-3-dev \
|
|
||||||
libayatana-appindicator3-dev \
|
|
||||||
&& rm -rf /var/lib/apt/lists/*
|
|
||||||
|
|
||||||
RUN mkdir -p /usr/src/fleet
|
RUN mkdir -p /usr/src/fleet
|
||||||
RUN mkdir -p /output
|
RUN mkdir -p /output
|
||||||
|
|
||||||
|
1
orbit/changes/16594-orbit-enroll-backoff
Normal file
1
orbit/changes/16594-orbit-enroll-backoff
Normal file
@ -0,0 +1 @@
|
|||||||
|
* Add exponential backoff to orbit enroll retries.
|
@ -19,10 +19,13 @@ const (
|
|||||||
DesktopAppExecName = "fleet-desktop"
|
DesktopAppExecName = "fleet-desktop"
|
||||||
// OrbitNodeKeyFileName is the filename on disk where we write the orbit node key to
|
// OrbitNodeKeyFileName is the filename on disk where we write the orbit node key to
|
||||||
OrbitNodeKeyFileName = "secret-orbit-node-key.txt"
|
OrbitNodeKeyFileName = "secret-orbit-node-key.txt"
|
||||||
// OrbitEnrollMaxRetries is the max retries when doing an enroll request
|
// OrbitEnrollMaxRetries is the max number of retries when doing an enroll request.
|
||||||
OrbitEnrollMaxRetries = 3
|
// We set it to 6 to allow the retry backoff to take effect.
|
||||||
// OrbitEnrollRetrySleep is the time duration to sleep between retries
|
OrbitEnrollMaxRetries = 6
|
||||||
OrbitEnrollRetrySleep = 5 * time.Second
|
// OrbitEnrollBackoffMultiplier is the multiplier to use for backing off between enroll retries.
|
||||||
|
OrbitEnrollBackoffMultiplier = 2
|
||||||
|
// OrbitEnrollRetrySleep is the duration to sleep between enroll retries.
|
||||||
|
OrbitEnrollRetrySleep = 10 * time.Second
|
||||||
// OsquerydName is the name of osqueryd binary
|
// OsquerydName is the name of osqueryd binary
|
||||||
// We use osqueryd as name to properly identify the process when listing
|
// We use osqueryd as name to properly identify the process when listing
|
||||||
// running processes/tasks.
|
// running processes/tasks.
|
||||||
|
@ -6,18 +6,26 @@ import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
type config struct {
|
type config struct {
|
||||||
interval time.Duration
|
initialInterval time.Duration
|
||||||
maxAttempts int
|
backoffMultiplier int
|
||||||
|
maxAttempts int
|
||||||
}
|
}
|
||||||
|
|
||||||
// Option allows to configure the behavior of retry.Do
|
// Option allows to configure the behavior of retry.Do
|
||||||
type Option func(*config)
|
type Option func(*config)
|
||||||
|
|
||||||
// WithRetryInterval allows to specify a custom duration to wait
|
// WithInterval allows to specify a custom duration to wait
|
||||||
// between retries.
|
// between retries.
|
||||||
func WithInterval(i time.Duration) Option {
|
func WithInterval(i time.Duration) Option {
|
||||||
return func(c *config) {
|
return func(c *config) {
|
||||||
c.interval = i
|
c.initialInterval = i
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// WithBackoffMultiplier allows to specify the backoff multiplier between retries.
|
||||||
|
func WithBackoffMultiplier(m int) Option {
|
||||||
|
return func(c *config) {
|
||||||
|
c.backoffMultiplier = m
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -37,16 +45,17 @@ func WithMaxAttempts(a int) Option {
|
|||||||
// seconds
|
// seconds
|
||||||
func Do(fn func() error, opts ...Option) error {
|
func Do(fn func() error, opts ...Option) error {
|
||||||
cfg := &config{
|
cfg := &config{
|
||||||
interval: 30 * time.Second,
|
initialInterval: 30 * time.Second,
|
||||||
}
|
}
|
||||||
for _, opt := range opts {
|
for _, opt := range opts {
|
||||||
opt(cfg)
|
opt(cfg)
|
||||||
}
|
}
|
||||||
|
|
||||||
attempts := 0
|
attempts := 0
|
||||||
ticker := time.NewTicker(cfg.interval)
|
ticker := time.NewTicker(cfg.initialInterval)
|
||||||
defer ticker.Stop()
|
defer ticker.Stop()
|
||||||
|
|
||||||
|
backoff := 1
|
||||||
for {
|
for {
|
||||||
attempts++
|
attempts++
|
||||||
err := fn()
|
err := fn()
|
||||||
@ -58,6 +67,12 @@ func Do(fn func() error, opts ...Option) error {
|
|||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if cfg.backoffMultiplier != 0 {
|
||||||
|
interval := time.Duration(backoff) * cfg.initialInterval
|
||||||
|
backoff *= cfg.backoffMultiplier
|
||||||
|
ticker.Reset(interval)
|
||||||
|
}
|
||||||
|
|
||||||
<-ticker.C
|
<-ticker.C
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -38,4 +38,34 @@ func TestRetryDo(t *testing.T) {
|
|||||||
require.NoError(t, err)
|
require.NoError(t, err)
|
||||||
require.Equal(t, max, count)
|
require.Equal(t, max, count)
|
||||||
})
|
})
|
||||||
|
|
||||||
|
t.Run("with backoff", func(t *testing.T) {
|
||||||
|
count := 0
|
||||||
|
max := 4
|
||||||
|
start := time.Now()
|
||||||
|
err := Do(func() error {
|
||||||
|
switch count {
|
||||||
|
case 0:
|
||||||
|
require.WithinDuration(t, start, time.Now(), 1*time.Millisecond)
|
||||||
|
case 1:
|
||||||
|
require.WithinDuration(t, start.Add(50*time.Millisecond), time.Now(), 10*time.Millisecond)
|
||||||
|
case 2:
|
||||||
|
require.WithinDuration(t, start.Add((50+100)*time.Millisecond), time.Now(), 10*time.Millisecond)
|
||||||
|
case 3:
|
||||||
|
require.WithinDuration(t, start.Add((50+100+200)*time.Millisecond), time.Now(), 10*time.Millisecond)
|
||||||
|
}
|
||||||
|
count++
|
||||||
|
if count != max {
|
||||||
|
return errTest
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
},
|
||||||
|
WithInterval(50*time.Millisecond),
|
||||||
|
WithBackoffMultiplier(2),
|
||||||
|
WithMaxAttempts(4),
|
||||||
|
)
|
||||||
|
|
||||||
|
require.NoError(t, err)
|
||||||
|
require.Equal(t, max, count)
|
||||||
|
})
|
||||||
}
|
}
|
||||||
|
@ -302,8 +302,12 @@ func (oc *OrbitClient) getNodeKeyOrEnroll() (string, error) {
|
|||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
retry.WithInterval(OrbitRetryInterval()),
|
// The below configuration means the following retry intervals (exponential backoff):
|
||||||
|
// 10s, 20s, 40s, 80s, 160s and then return the failure (max attempts = 6)
|
||||||
|
// thus executing no more than ~6 enroll request failures every ~5 minutes.
|
||||||
|
retry.WithInterval(orbitEnrollRetryInterval()),
|
||||||
retry.WithMaxAttempts(constant.OrbitEnrollMaxRetries),
|
retry.WithMaxAttempts(constant.OrbitEnrollMaxRetries),
|
||||||
|
retry.WithBackoffMultiplier(constant.OrbitEnrollBackoffMultiplier),
|
||||||
); err != nil {
|
); err != nil {
|
||||||
return "", fmt.Errorf("orbit node key enroll failed, attempts=%d", constant.OrbitEnrollMaxRetries)
|
return "", fmt.Errorf("orbit node key enroll failed, attempts=%d", constant.OrbitEnrollMaxRetries)
|
||||||
}
|
}
|
||||||
@ -402,7 +406,7 @@ func (oc *OrbitClient) setLastRecordedError(err error) {
|
|||||||
oc.lastRecordedErr = fmt.Errorf("%s: %w", time.Now().UTC().Format("2006-01-02T15:04:05Z"), err)
|
oc.lastRecordedErr = fmt.Errorf("%s: %w", time.Now().UTC().Format("2006-01-02T15:04:05Z"), err)
|
||||||
}
|
}
|
||||||
|
|
||||||
func OrbitRetryInterval() time.Duration {
|
func orbitEnrollRetryInterval() time.Duration {
|
||||||
interval := os.Getenv("FLEETD_ENROLL_RETRY_INTERVAL")
|
interval := os.Getenv("FLEETD_ENROLL_RETRY_INTERVAL")
|
||||||
if interval != "" {
|
if interval != "" {
|
||||||
d, err := time.ParseDuration(interval)
|
d, err := time.ParseDuration(interval)
|
||||||
|
@ -73,7 +73,9 @@ for system in $SYSTEMS; do
|
|||||||
# compiling a macOS-arm64 binary requires CGO and a macOS computer (for
|
# compiling a macOS-arm64 binary requires CGO and a macOS computer (for
|
||||||
# Apple keychain, some tables, etc), if this is the case, compile an
|
# Apple keychain, some tables, etc), if this is the case, compile an
|
||||||
# universal binary.
|
# universal binary.
|
||||||
if [ $system == "macos" ] && [ "$(uname -s)" = "Darwin" ]; then
|
#
|
||||||
|
# NOTE(lucas): Cross-compiling orbit for arm64 from Intel macOS currently fails (CGO error).
|
||||||
|
if [ $system == "macos" ] && [ "$(uname -s)" = "Darwin" ] && [ "$(uname -m)" = "arm64" ]; then
|
||||||
CGO_ENABLED=1 \
|
CGO_ENABLED=1 \
|
||||||
CODESIGN_IDENTITY=$CODESIGN_IDENTITY \
|
CODESIGN_IDENTITY=$CODESIGN_IDENTITY \
|
||||||
ORBIT_VERSION=42 \
|
ORBIT_VERSION=42 \
|
||||||
|
Loading…
Reference in New Issue
Block a user