From c5b988d6009dc00cc548bd6c1cbd9575ea319b0e Mon Sep 17 00:00:00 2001 From: Martin Angers Date: Thu, 22 Feb 2024 13:10:28 -0500 Subject: [PATCH] Fix stack trace of captured errors in Sentry, capture errors in more code paths (#16966) #16480 # Checklist for submitter - [x] Changes file added for user-visible changes in `changes/` or `orbit/changes/`. See [Changes files](https://fleetdm.com/docs/contributing/committing-changes#changes-files) for more information. - [x] Added/updated tests - [x] Manual QA for all new/changed functionality --- changes/16480-fix-capturing-errors-in-sentry | 4 + cmd/fleet/cron.go | 2 - server/contexts/ctxerr/ctxerr.go | 40 ++++ server/contexts/ctxerr/stack_test.go | 234 +++++++++++++++++-- server/mdm/apple/apple_mdm.go | 11 +- server/service/async/async.go | 2 - server/service/handler.go | 2 +- server/service/schedule/schedule.go | 13 +- server/service/transport_error.go | 32 --- tools/sentry-self-hosted/README.md | 17 ++ 10 files changed, 293 insertions(+), 64 deletions(-) create mode 100644 changes/16480-fix-capturing-errors-in-sentry create mode 100644 tools/sentry-self-hosted/README.md diff --git a/changes/16480-fix-capturing-errors-in-sentry b/changes/16480-fix-capturing-errors-in-sentry new file mode 100644 index 000000000..0638ba660 --- /dev/null +++ b/changes/16480-fix-capturing-errors-in-sentry @@ -0,0 +1,4 @@ +* Fixed issues with how errors were captured in Sentry: + - The stack trace is now more precise. + - More error paths will now get captured in Sentry. + - **NOTE: Many more entries could be generated in Sentry compared to earlier Fleet versions.** Sentry capacity should be planned accordingly. diff --git a/cmd/fleet/cron.go b/cmd/fleet/cron.go index 2a17c87e9..3d1849726 100644 --- a/cmd/fleet/cron.go +++ b/cmd/fleet/cron.go @@ -32,7 +32,6 @@ import ( "github.com/fleetdm/fleet/v4/server/vulnerabilities/utils" "github.com/fleetdm/fleet/v4/server/webhooks" "github.com/fleetdm/fleet/v4/server/worker" - "github.com/getsentry/sentry-go" kitlog "github.com/go-kit/log" "github.com/go-kit/log/level" "github.com/hashicorp/go-multierror" @@ -41,7 +40,6 @@ import ( func errHandler(ctx context.Context, logger kitlog.Logger, msg string, err error) { level.Error(logger).Log("msg", msg, "err", err) - sentry.CaptureException(err) ctxerr.Handle(ctx, err) } diff --git a/server/contexts/ctxerr/ctxerr.go b/server/contexts/ctxerr/ctxerr.go index fcdec4feb..f644ce733 100644 --- a/server/contexts/ctxerr/ctxerr.go +++ b/server/contexts/ctxerr/ctxerr.go @@ -23,6 +23,7 @@ import ( "github.com/fleetdm/fleet/v4/server/contexts/host" "github.com/fleetdm/fleet/v4/server/contexts/viewer" "github.com/fleetdm/fleet/v4/server/fleet" + "github.com/getsentry/sentry-go" "go.elastic.co/apm/v2" ) @@ -73,6 +74,16 @@ func (e *FleetError) StackTrace() *runtime.Frames { return runtime.CallersFrames(st) } +// StackFrames implements the reflection-based method that Sentry's Go SDK +// uses to look for a stack trace. It abuses the internals a bit, as it uses +// the name that sentry looks for, but returns the []uintptr slice (which works +// because of how they handle the returned value via reflection). A cleaner +// approach would be if they used an interface detection like APM does. 
+// https://github.com/getsentry/sentry-go/blob/master/stacktrace.go#L44-L49 +func (e *FleetError) StackFrames() []uintptr { + return e.stack.(stack) // outside of tests, e.stack is always a stack type +} + // LogFields implements fleet.ErrWithLogFields, so attached error data can be // logged along with the error func (e *FleetError) LogFields() []any { @@ -295,8 +306,37 @@ func Handle(ctx context.Context, err error) { // (the one from the initial New/Wrap call). cause = ferr } + + // send to elastic APM apm.CaptureError(ctx, cause).Send() + // if Sentry is configured, capture the error there + if sentryClient := sentry.CurrentHub().Client(); sentryClient != nil { + // sentry is configured, add contextual information if available + v, _ := viewer.FromContext(ctx) + h, _ := host.FromContext(ctx) + + if v.User != nil || h != nil { + // we have a viewer (user) or a host in the context, use this to + // enrich the error with more context + ctxHub := sentry.CurrentHub().Clone() + if v.User != nil { + ctxHub.ConfigureScope(func(scope *sentry.Scope) { + scope.SetTag("email", v.User.Email) + scope.SetTag("user_id", fmt.Sprint(v.User.ID)) + }) + } else if h != nil { + ctxHub.ConfigureScope(func(scope *sentry.Scope) { + scope.SetTag("hostname", h.Hostname) + scope.SetTag("host_id", fmt.Sprint(h.ID)) + }) + } + ctxHub.CaptureException(cause) + } else { + sentry.CaptureException(cause) + } + } + if eh := fromContext(ctx); eh != nil { eh.Store(err) } diff --git a/server/contexts/ctxerr/stack_test.go b/server/contexts/ctxerr/stack_test.go index 4b155f66e..ed65f774f 100644 --- a/server/contexts/ctxerr/stack_test.go +++ b/server/contexts/ctxerr/stack_test.go @@ -2,13 +2,21 @@ package ctxerr import ( "context" + "encoding/json" "errors" "fmt" "io" + "net/http" + "net/http/httptest" + "net/url" + "path/filepath" "regexp" + "slices" "strings" "testing" + "time" + "github.com/getsentry/sentry-go" "github.com/stretchr/testify/require" "go.elastic.co/apm/v2" "go.elastic.co/apm/v2/apmtest" @@ -157,17 +165,6 @@ func TestElasticStack(t *testing.T) { } for _, c := range cases { t.Run(c.desc, func(t *testing.T) { - checkStack := func(stack, contains []string) { - stackStr := strings.Join(stack, "\n") - lastIx := -1 - for _, want := range contains { - ix := strings.Index(stackStr, want) - require.True(t, ix > -1, "expected stack %v to contain %q", stackStr, want) - require.True(t, ix > lastIx, "expected %q to be after last check in %v", want, stackStr) - lastIx = ix - } - } - err := c.chain() require.Error(t, err) var ferr *FleetError @@ -192,8 +189,8 @@ func TestElasticStack(t *testing.T) { } } - checkStack(causeStack, c.causeStackContains) - checkStack(leafStack, c.leafStackContains) + checkStack(t, causeStack, c.causeStackContains) + checkStack(t, leafStack, c.leafStackContains) // run in a test APM transaction, recording the sent events _, _, apmErrs := apmtest.NewRecordingTracer().WithTransaction(func(ctx context.Context) { @@ -219,7 +216,216 @@ func TestElasticStack(t *testing.T) { for _, st := range apmErr.Exception.Stacktrace { apmStack = append(apmStack, st.Module+"."+st.Function+" ("+st.File+":"+fmt.Sprint(st.Line)+")") } - checkStack(apmStack, c.causeStackContains) + checkStack(t, apmStack, c.causeStackContains) }) } } + +func TestSentryStack(t *testing.T) { + ctx := context.Background() + + var wrap = errors.New("wrap") + errFn := func(fn func() error) error { // func1 + if err := fn(); err != nil { + if err == wrap { + return Wrap(ctx, err, "wrapped") + } + return err + } + return nil + } + + type 
sentryPayload struct { + Exceptions []*sentry.Exception `json:"exception"` // json field name is singular + } + + cases := []struct { + desc string + chain func() error + causeStackContains []string + leafStackContains []string + }{ + { + desc: "depth 2, wrap in errFn", + chain: func() error { + // gets wrapped in errFn, so top of the stack is func1 + return errFn(func() error { return wrap }) + }, + causeStackContains: []string{ + "/ctxerr.TestSentryStack.func1 ", + }, + }, + { + desc: "depth 2, wrap immediately", + chain: func() error { + // gets wrapped immediately when returned, so top of the stack is funcX.1 + return errFn(func() error { return Wrap(ctx, wrap) }) + }, + causeStackContains: []string{ + "/ctxerr.TestSentryStack.func3.1 ", + "/ctxerr.TestSentryStack.func1 ", // errFn + }, + }, + { + desc: "depth 3, ctxerr.New", + chain: func() error { + // gets wrapped directly in the call to New, so top of the stack is X.1.1 + return errFn(func() error { return func() error { return New(ctx, "new") }() }) + }, + causeStackContains: []string{ + "/ctxerr.TestSentryStack.func4.1.1 ", + "/ctxerr.TestSentryStack.func4.1 ", + "/ctxerr.TestSentryStack.func1 ", // errFn + }, + }, + { + desc: "depth 4, ctxerr.New", + chain: func() error { + // stacked capture in New, so top of the stack is X.1.1.1 + return errFn(func() error { + return func() error { + return func() error { + return New(ctx, "new") + }() + }() + }) + }, + causeStackContains: []string{ + "/ctxerr.TestSentryStack.func5.1.1.1 ", + "/ctxerr.TestSentryStack.func5.1.1 ", + "/ctxerr.TestSentryStack.func5.1 ", + "/ctxerr.TestSentryStack.func1 ", // errFn + }, + }, + { + desc: "depth 4, ctxerr.New always wrapped", + chain: func() error { + // stacked capture in New, so top of the stack is X.1.1.1 + return errFn(func() error { + return Wrap(ctx, func() error { + return Wrap(ctx, func() error { + return New(ctx, "new") + }()) + }()) + }) + }, + causeStackContains: []string{ + "/ctxerr.TestSentryStack.func6.1.1.1 ", + "/ctxerr.TestSentryStack.func6.1.1 ", + "/ctxerr.TestSentryStack.func6.1 ", + "/ctxerr.TestSentryStack.func1 ", // errFn + }, + leafStackContains: []string{ + // only a single stack trace is collected when wrapping another + // FleetError. + "/ctxerr.TestSentryStack.func6.1 ", + }, + }, + { + desc: "depth 4, wrapped only at the end", + chain: func() error { + return errFn(func() error { + return Wrap(ctx, func() error { + return func() error { + return io.EOF + }() + }()) + }) + }, + causeStackContains: []string{ + // since it wraps a non-FleetError, the full stack is collected + "/ctxerr.TestSentryStack.func7.1 ", + "/ctxerr.TestSentryStack.func1 ", // errFn + }, + }, + } + for _, c := range cases { + t.Run(c.desc, func(t *testing.T) { + err := c.chain() + require.Error(t, err) + var ferr *FleetError + require.ErrorAs(t, err, &ferr) + + leafStack := ferr.Stack() + cause := FleetCause(err) + causeStack := cause.Stack() + + // if the fleet root error != fleet leaf error, then separate leaf + + // cause stacks must be provided. 
+ if cause != ferr { + require.True(t, len(c.causeStackContains) > 0) + require.True(t, len(c.leafStackContains) > 0) + } else { + // otherwise use the same stack expectations for both + if len(c.causeStackContains) == 0 { + c.causeStackContains = c.leafStackContains + } + if len(c.leafStackContains) == 0 { + c.leafStackContains = c.causeStackContains + } + } + + checkStack(t, causeStack, c.causeStackContains) + checkStack(t, leafStack, c.leafStackContains) + + // start an HTTP server that Sentry will send the event to + var payload sentryPayload + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + b, err := io.ReadAll(r.Body) + require.NoError(t, err) + err = json.Unmarshal(b, &payload) + require.NoError(t, err) + w.WriteHeader(200) + })) + defer srv.Close() + + // a "project ID" is required, which is the path portion + parsedURL, err := url.Parse(srv.URL + "/testproject") + require.NoError(t, err) + parsedURL.User = url.User("test") + err = sentry.Init(sentry.ClientOptions{Dsn: parsedURL.String()}) + require.NoError(t, err) + + // best-effort un-configure of Sentry on exit + t.Cleanup(func() { + sentry.CurrentHub().BindClient(nil) + }) + + eventID := sentry.CaptureException(cause) + require.NotNil(t, eventID) + require.True(t, sentry.Flush(2*time.Second), "failed to flush Sentry events in time") + require.True(t, len(payload.Exceptions) >= 1) // the wrapped errors are exploded into separate exceptions in the slice + + // since we capture the FleetCause error, the last entry in the exceptions + // must be a FleetError and contain the stacktrace we're looking for. + rootCapturedErr := payload.Exceptions[len(payload.Exceptions)-1] + require.Equal(t, "*ctxerr.FleetError", rootCapturedErr.Type) + + // format the stack trace the same way we do in ctxerr + var stack []string + for _, st := range rootCapturedErr.Stacktrace.Frames { + filename := st.Filename + if filename == "" { + // get it from abspath + filename = filepath.Base(st.AbsPath) + } + stack = append(stack, st.Module+"."+st.Function+" ("+filename+":"+fmt.Sprint(st.Lineno)+")") + } + + // for some reason, Sentry reverses the stack trace + slices.Reverse(stack) + checkStack(t, stack, c.causeStackContains) + }) + } +} + +func checkStack(t *testing.T, stack, contains []string) { + stackStr := strings.Join(stack, "\n") + lastIx := -1 + for _, want := range contains { + ix := strings.Index(stackStr, want) + require.True(t, ix > -1, "expected stack %v to contain %q", stackStr, want) + require.True(t, ix > lastIx, "expected %q to be after last check in %v", want, stackStr) + lastIx = ix + } +} diff --git a/server/mdm/apple/apple_mdm.go b/server/mdm/apple/apple_mdm.go index 5747e2095..ba3f87259 100644 --- a/server/mdm/apple/apple_mdm.go +++ b/server/mdm/apple/apple_mdm.go @@ -16,7 +16,6 @@ import ( "github.com/fleetdm/fleet/v4/server/logging" "github.com/fleetdm/fleet/v4/server/mdm/internal/commonmdm" "github.com/fleetdm/fleet/v4/server/ptr" - "github.com/getsentry/sentry-go" "github.com/go-kit/log/level" "github.com/google/uuid" "github.com/micromdm/nanodep/godep" @@ -368,7 +367,13 @@ func NewDEPService( depStorage, depsync.WithLogger(logging.NewNanoDEPLogger(kitlog.With(logger, "component", "nanodep-syncer"))), depsync.WithCallback(func(ctx context.Context, isFetch bool, resp *godep.DeviceResponse) error { - return depSvc.processDeviceResponse(ctx, depClient, resp) + // the nanodep syncer just logs the error of the callback, so in order to + // capture it we need to do this here. 
+ err := depSvc.processDeviceResponse(ctx, depClient, resp) + if err != nil { + ctxerr.Handle(ctx, err) + } + return err }), ) @@ -443,7 +448,7 @@ func (d *DEPService) processDeviceResponse(ctx context.Context, depClient *godep switch { case err != nil: level.Error(kitlog.With(d.logger)).Log("err", err) - sentry.CaptureException(err) + ctxerr.Handle(ctx, err) case n > 0: level.Info(kitlog.With(d.logger)).Log("msg", fmt.Sprintf("added %d new mdm device(s) to pending hosts", n)) case n == 0: diff --git a/server/service/async/async.go b/server/service/async/async.go index 8fe02692e..5f341b09d 100644 --- a/server/service/async/async.go +++ b/server/service/async/async.go @@ -10,7 +10,6 @@ import ( "github.com/fleetdm/fleet/v4/server/contexts/ctxerr" "github.com/fleetdm/fleet/v4/server/datastore/redis" "github.com/fleetdm/fleet/v4/server/fleet" - "github.com/getsentry/sentry-go" kitlog "github.com/go-kit/kit/log" "github.com/go-kit/kit/log/level" redigo "github.com/gomodule/redigo/redis" @@ -47,7 +46,6 @@ func NewTask(ds fleet.Datastore, pool fleet.RedisPool, clck clock.Clock, conf co func (t *Task) StartCollectors(ctx context.Context, logger kitlog.Logger) { collectorErrHandler := func(name string, err error) { level.Error(logger).Log("err", fmt.Sprintf("%s collector", name), "details", err) - sentry.CaptureException(err) ctxerr.Handle(ctx, err) } diff --git a/server/service/handler.go b/server/service/handler.go index d6ea2100e..7a5d4c4e5 100644 --- a/server/service/handler.go +++ b/server/service/handler.go @@ -131,7 +131,7 @@ func MakeHandler( setRequestsContexts(svc), ), kithttp.ServerErrorHandler(&errorHandler{logger}), - kithttp.ServerErrorEncoder(encodeErrorAndTrySentry(config.Sentry.Dsn != "")), + kithttp.ServerErrorEncoder(encodeError), kithttp.ServerAfter( kithttp.SetContentType("application/json; charset=utf-8"), logRequestEnd(logger), diff --git a/server/service/schedule/schedule.go b/server/service/schedule/schedule.go index 5bc139ee0..eabe66c39 100644 --- a/server/service/schedule/schedule.go +++ b/server/service/schedule/schedule.go @@ -12,7 +12,6 @@ import ( "github.com/fleetdm/fleet/v4/server/contexts/ctxerr" "github.com/fleetdm/fleet/v4/server/fleet" - "github.com/getsentry/sentry-go" "github.com/go-kit/kit/log" "github.com/go-kit/kit/log/level" ) @@ -166,7 +165,6 @@ func (s *Schedule) Start() { prevScheduledRun, _, err := s.getLatestStats() if err != nil { level.Error(s.logger).Log("err", "start schedule", "details", err) - sentry.CaptureException(err) ctxerr.Handle(s.ctx, err) } s.setIntervalStartedAt(prevScheduledRun.CreatedAt) @@ -208,7 +206,6 @@ func (s *Schedule) Start() { prevScheduledRun, _, err := s.getLatestStats() if err != nil { level.Error(s.logger).Log("err", "trigger get cron stats", "details", err) - sentry.CaptureException(err) ctxerr.Handle(s.ctx, err) } @@ -241,7 +238,6 @@ func (s *Schedule) Start() { prevScheduledRun, prevTriggeredRun, err := s.getLatestStats() if err != nil { level.Error(s.logger).Log("err", "get cron stats", "details", err) - sentry.CaptureException(err) ctxerr.Handle(s.ctx, err) // skip ahead to the next interval schedTicker.Reset(schedInterval) @@ -331,7 +327,7 @@ func (s *Schedule) Start() { newInterval, err := s.configReloadIntervalFn(s.ctx) if err != nil { level.Error(s.logger).Log("err", "schedule interval config reload failed", "details", err) - sentry.CaptureException(err) + ctxerr.Handle(s.ctx, err) continue } @@ -411,7 +407,6 @@ func (s *Schedule) runWithStats(statsType fleet.CronStatsType) { statsID, err := 
s.insertStats(statsType, fleet.CronStatsStatusPending) if err != nil { level.Error(s.logger).Log("err", fmt.Sprintf("insert cron stats %s", s.name), "details", err) - sentry.CaptureException(err) ctxerr.Handle(s.ctx, err) } level.Info(s.logger).Log("status", "pending") @@ -420,7 +415,6 @@ func (s *Schedule) runWithStats(statsType fleet.CronStatsType) { if err := s.updateStats(statsID, fleet.CronStatsStatusCompleted); err != nil { level.Error(s.logger).Log("err", fmt.Sprintf("update cron stats %s", s.name), "details", err) - sentry.CaptureException(err) ctxerr.Handle(s.ctx, err) } level.Info(s.logger).Log("status", "completed") @@ -432,7 +426,6 @@ func (s *Schedule) runAllJobs() { level.Debug(s.logger).Log("msg", "starting", "jobID", job.ID) if err := runJob(s.ctx, job.Fn); err != nil { level.Error(s.logger).Log("err", "running job", "details", err, "jobID", job.ID) - sentry.CaptureException(err) ctxerr.Handle(s.ctx, err) } } @@ -507,7 +500,7 @@ func (s *Schedule) acquireLock() bool { ok, err := s.locker.Lock(s.ctx, s.getLockName(), s.instanceID, s.getSchedInterval()) if err != nil { level.Error(s.logger).Log("msg", "lock failed", "err", err) - sentry.CaptureException(err) + ctxerr.Handle(s.ctx, err) return false } if !ok { @@ -521,7 +514,7 @@ func (s *Schedule) releaseLock() { err := s.locker.Unlock(s.ctx, s.getLockName(), s.instanceID) if err != nil { level.Error(s.logger).Log("msg", "unlock failed", "err", err) - sentry.CaptureException(err) + ctxerr.Handle(s.ctx, err) } } diff --git a/server/service/transport_error.go b/server/service/transport_error.go index 4de130f6c..d7ebf7a86 100644 --- a/server/service/transport_error.go +++ b/server/service/transport_error.go @@ -4,16 +4,12 @@ import ( "context" "encoding/json" "errors" - "fmt" "net" "net/http" "strconv" "github.com/fleetdm/fleet/v4/server/contexts/ctxerr" - "github.com/fleetdm/fleet/v4/server/contexts/host" - "github.com/fleetdm/fleet/v4/server/contexts/viewer" "github.com/fleetdm/fleet/v4/server/fleet" - "github.com/getsentry/sentry-go" kithttp "github.com/go-kit/kit/transport/http" "github.com/go-sql-driver/mysql" ) @@ -72,16 +68,6 @@ type conflictErrorInterface interface { IsConflict() bool } -func encodeErrorAndTrySentry(sentryEnabled bool) func(ctx context.Context, err error, w http.ResponseWriter) { - if !sentryEnabled { - return encodeError - } - return func(ctx context.Context, err error, w http.ResponseWriter) { - encodeError(ctx, err, w) - sendToSentry(ctx, err) - } -} - // encode error and status header to the client func encodeError(ctx context.Context, err error, w http.ResponseWriter) { ctxerr.Handle(ctx, err) @@ -223,21 +209,3 @@ func encodeError(ctx context.Context, err error, w http.ResponseWriter) { enc.Encode(jsonErr) //nolint:errcheck } - -func sendToSentry(ctx context.Context, err error) { - v, haveUser := viewer.FromContext(ctx) - h, haveHost := host.FromContext(ctx) - localHub := sentry.CurrentHub().Clone() - if haveUser { - localHub.ConfigureScope(func(scope *sentry.Scope) { - scope.SetTag("email", v.User.Email) - scope.SetTag("user_id", fmt.Sprint(v.User.ID)) - }) - } else if haveHost { - localHub.ConfigureScope(func(scope *sentry.Scope) { - scope.SetTag("hostname", h.Hostname) - scope.SetTag("host_id", fmt.Sprint(h.ID)) - }) - } - localHub.CaptureException(err) -} diff --git a/tools/sentry-self-hosted/README.md b/tools/sentry-self-hosted/README.md new file mode 100644 index 000000000..d2b248ef8 --- /dev/null +++ b/tools/sentry-self-hosted/README.md @@ -0,0 +1,17 @@ +# Running self-hosted Sentry + +It may 
be useful to run a local, self-hosted version of Sentry for tests or to aid in monitoring a local development environment.
+
+It is possible to do so by following the [steps documented on Sentry's website](https://develop.sentry.dev/self-hosted/).
+
+While Sentry's documentation is canonical, the high-level steps are documented here and annotated with Fleet-specific information:
+
+1. `git clone` the [Sentry self-hosted repository](https://github.com/getsentry/self-hosted)
+2. `git checkout` a specific version (e.g. `git checkout 24.2.0`)
+3. Run the `sudo ./install.sh` script (you may want to review the install scripts first; this takes a while to complete, maybe 30 minutes or so, and you'll be prompted to create a Sentry user and password towards the end)
+4. Once done, run `docker-compose up -d` to bring up the self-hosted Sentry stack (this starts a lot of containers)
+5. Once the stack is up, you should be able to log in at `http://localhost:9000` (on Google Chrome, logging in resulted in a CSRF protection failure page, but it worked on Firefox)
+6. On the "Issues" page, you should see a button labelled "Installation Instructions"; clicking it brings up a page with the DSN that you can copy to use with Fleet (e.g. `http://@localhost:9000/1`)
+7. Start `fleet serve`, passing the `--sentry_dsn http://` flag (with the DSN copied in the previous step) to enable Sentry
+
+You may now log in to Fleet, and any errors should show up in this local, self-hosted Sentry instance.
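+
+To quickly verify that events reach the local instance, a minimal standalone Go sketch like the one below can be used (it is not part of Fleet; the DSN value is a placeholder, so use the one shown on your project's "Installation Instructions" page). It initializes the Sentry SDK and captures a test error, similar to what `ctxerr.Handle` now does when Sentry is configured:
+
+```go
+package main
+
+import (
+	"errors"
+	"fmt"
+	"time"
+
+	"github.com/getsentry/sentry-go"
+)
+
+func main() {
+	// Placeholder DSN: replace with the value copied from the local
+	// self-hosted Sentry "Installation Instructions" page.
+	err := sentry.Init(sentry.ClientOptions{
+		Dsn: "http://publickey@localhost:9000/1",
+	})
+	if err != nil {
+		fmt.Println("sentry.Init failed:", err)
+		return
+	}
+	// Flush buffered events before the program exits.
+	defer sentry.Flush(2 * time.Second)
+
+	// Capture a test error; it should appear under "Issues" in the local
+	// Sentry UI shortly after.
+	sentry.CaptureException(errors.New("test error from the local Fleet dev setup"))
+}
+```
+
+Run it with `go run`, then refresh the "Issues" page in the local Sentry UI to confirm the event arrived.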