Fix stack trace of captured errors in Sentry, capture errors in more code paths (#16966)

#16480 

# Checklist for submitter

- [x] Changes file added for user-visible changes in `changes/` or
`orbit/changes/`.
See [Changes
files](https://fleetdm.com/docs/contributing/committing-changes#changes-files)
for more information.
- [x] Added/updated tests
- [x] Manual QA for all new/changed functionality
Martin Angers 2024-02-22 13:10:28 -05:00 committed by GitHub
parent 0642387b32
commit c5b988d600
10 changed files with 293 additions and 64 deletions

View File

@@ -0,0 +1,4 @@
* Fixed issues with how errors were captured in Sentry:
- The stack trace is now more precise.
- More error paths will now get captured in Sentry.
- **NOTE: Many more entries could be generated in Sentry compared to earlier Fleet versions.** Sentry capacity should be planned accordingly.

View File

@@ -32,7 +32,6 @@ import (
"github.com/fleetdm/fleet/v4/server/vulnerabilities/utils"
"github.com/fleetdm/fleet/v4/server/webhooks"
"github.com/fleetdm/fleet/v4/server/worker"
"github.com/getsentry/sentry-go"
kitlog "github.com/go-kit/log"
"github.com/go-kit/log/level"
"github.com/hashicorp/go-multierror"
@@ -41,7 +40,6 @@ import (
func errHandler(ctx context.Context, logger kitlog.Logger, msg string, err error) {
level.Error(logger).Log("msg", msg, "err", err)
sentry.CaptureException(err)
ctxerr.Handle(ctx, err)
}

View File

@@ -23,6 +23,7 @@ import (
"github.com/fleetdm/fleet/v4/server/contexts/host"
"github.com/fleetdm/fleet/v4/server/contexts/viewer"
"github.com/fleetdm/fleet/v4/server/fleet"
"github.com/getsentry/sentry-go"
"go.elastic.co/apm/v2"
)
@@ -73,6 +74,16 @@ func (e *FleetError) StackTrace() *runtime.Frames {
return runtime.CallersFrames(st)
}
// StackFrames implements the reflection-based method that Sentry's Go SDK
// uses to look for a stack trace. It abuses the internals a bit, as it uses
// the name that sentry looks for, but returns the []uintptr slice (which works
// because of how they handle the returned value via reflection). A cleaner
// approach would be if they used an interface detection like APM does.
// https://github.com/getsentry/sentry-go/blob/master/stacktrace.go#L44-L49
func (e *FleetError) StackFrames() []uintptr {
return e.stack.(stack) // outside of tests, e.stack is always a stack type
}
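
The reflection lookup referenced in the comment above works roughly as follows. This is a self-contained paraphrase of the SDK's extraction, for illustration only; it is not the actual sentry-go source, and `demoErr`, `newDemoErr`, and `reflectedPCs` are hypothetical names:

```go
package main

import (
	"fmt"
	"reflect"
	"runtime"
)

// demoErr mimics FleetError's trick: capture program counters at construction
// time and expose them through a method named exactly "StackFrames".
type demoErr struct{ pcs []uintptr }

func (e *demoErr) Error() string          { return "demo" }
func (e *demoErr) StackFrames() []uintptr { return e.pcs }

func newDemoErr() error {
	pcs := make([]uintptr, 32)
	n := runtime.Callers(2, pcs) // skip runtime.Callers and newDemoErr itself
	return &demoErr{pcs: pcs[:n]}
}

// reflectedPCs paraphrases the reflection-based lookup the Sentry Go SDK
// performs: find a "StackFrames" method by name, call it, and harvest uintptr
// program counters from whatever slice it returns.
func reflectedPCs(err error) []uintptr {
	method := reflect.ValueOf(err).MethodByName("StackFrames")
	if !method.IsValid() {
		return nil
	}
	ret := method.Call(nil)[0]
	if ret.Kind() != reflect.Slice {
		return nil
	}
	var pcs []uintptr
	for i := 0; i < ret.Len(); i++ {
		if elem := ret.Index(i); elem.Kind() == reflect.Uintptr {
			pcs = append(pcs, uintptr(elem.Uint()))
		}
	}
	return pcs
}

func main() {
	frames := runtime.CallersFrames(reflectedPCs(newDemoErr()))
	for {
		f, more := frames.Next()
		fmt.Printf("%s (%s:%d)\n", f.Function, f.File, f.Line)
		if !more {
			break
		}
	}
}
```

Because `FleetError.StackFrames` returns the raw `[]uintptr` captured when the error was created, the element-by-element conversion above yields exactly the frames recorded by `ctxerr`, which is why the trace reported to Sentry is now precise.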
// LogFields implements fleet.ErrWithLogFields, so attached error data can be
// logged along with the error
func (e *FleetError) LogFields() []any {
@@ -295,8 +306,37 @@ func Handle(ctx context.Context, err error) {
// (the one from the initial New/Wrap call).
cause = ferr
}
// send to elastic APM
apm.CaptureError(ctx, cause).Send()
// if Sentry is configured, capture the error there
if sentryClient := sentry.CurrentHub().Client(); sentryClient != nil {
// sentry is configured, add contextual information if available
v, _ := viewer.FromContext(ctx)
h, _ := host.FromContext(ctx)
if v.User != nil || h != nil {
// we have a viewer (user) or a host in the context, use this to
// enrich the error with more context
ctxHub := sentry.CurrentHub().Clone()
if v.User != nil {
ctxHub.ConfigureScope(func(scope *sentry.Scope) {
scope.SetTag("email", v.User.Email)
scope.SetTag("user_id", fmt.Sprint(v.User.ID))
})
} else if h != nil {
ctxHub.ConfigureScope(func(scope *sentry.Scope) {
scope.SetTag("hostname", h.Hostname)
scope.SetTag("host_id", fmt.Sprint(h.ID))
})
}
ctxHub.CaptureException(cause)
} else {
sentry.CaptureException(cause)
}
}
if eh := fromContext(ctx); eh != nil {
eh.Store(err)
}
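
At a call site, the effect is that any code path reporting through `ctxerr.Handle` with a viewer or host already in the context produces a tagged Sentry event with no extra wiring. A minimal sketch, assuming `viewer.NewContext` keeps its usual signature; `serveRequest` and `doSomething` are hypothetical:

```go
package example

import (
	"context"

	"github.com/fleetdm/fleet/v4/server/contexts/ctxerr"
	"github.com/fleetdm/fleet/v4/server/contexts/viewer"
	"github.com/fleetdm/fleet/v4/server/fleet"
)

// doSomething stands in for any service call that can fail.
func doSomething(ctx context.Context) error { return nil }

// serveRequest sketches a request path: the authentication layer puts the
// user in the context, and every later ctxerr.Handle call inherits it.
func serveRequest(ctx context.Context, user *fleet.User) {
	ctx = viewer.NewContext(ctx, viewer.Viewer{User: user})

	if err := doSomething(ctx); err != nil {
		// Sent to Elastic APM unconditionally; if a Sentry client is
		// bound, also captured on a cloned hub tagged with the user's
		// email and user_id (hostname and host_id for host contexts).
		ctxerr.Handle(ctx, ctxerr.Wrap(ctx, err, "do something"))
	}
}
```

Cloning the hub keeps the tags scoped to the single captured event rather than mutating the process-wide Sentry scope, which matters when concurrent requests report errors for different users and hosts.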

View File

@@ -2,13 +2,21 @@ package ctxerr
import (
"context"
"encoding/json"
"errors"
"fmt"
"io"
"net/http"
"net/http/httptest"
"net/url"
"path/filepath"
"regexp"
"slices"
"strings"
"testing"
"time"
"github.com/getsentry/sentry-go"
"github.com/stretchr/testify/require"
"go.elastic.co/apm/v2"
"go.elastic.co/apm/v2/apmtest"
@@ -157,17 +165,6 @@ func TestElasticStack(t *testing.T) {
}
for _, c := range cases {
t.Run(c.desc, func(t *testing.T) {
checkStack := func(stack, contains []string) {
stackStr := strings.Join(stack, "\n")
lastIx := -1
for _, want := range contains {
ix := strings.Index(stackStr, want)
require.True(t, ix > -1, "expected stack %v to contain %q", stackStr, want)
require.True(t, ix > lastIx, "expected %q to be after last check in %v", want, stackStr)
lastIx = ix
}
}
err := c.chain()
require.Error(t, err)
var ferr *FleetError
@@ -192,8 +189,8 @@
}
}
checkStack(causeStack, c.causeStackContains)
checkStack(leafStack, c.leafStackContains)
checkStack(t, causeStack, c.causeStackContains)
checkStack(t, leafStack, c.leafStackContains)
// run in a test APM transaction, recording the sent events
_, _, apmErrs := apmtest.NewRecordingTracer().WithTransaction(func(ctx context.Context) {
@@ -219,7 +216,216 @@
for _, st := range apmErr.Exception.Stacktrace {
apmStack = append(apmStack, st.Module+"."+st.Function+" ("+st.File+":"+fmt.Sprint(st.Line)+")")
}
checkStack(apmStack, c.causeStackContains)
checkStack(t, apmStack, c.causeStackContains)
})
}
}
func TestSentryStack(t *testing.T) {
ctx := context.Background()
var wrap = errors.New("wrap")
errFn := func(fn func() error) error { // func1
if err := fn(); err != nil {
if err == wrap {
return Wrap(ctx, err, "wrapped")
}
return err
}
return nil
}
type sentryPayload struct {
Exceptions []*sentry.Exception `json:"exception"` // json field name is singular
}
cases := []struct {
desc string
chain func() error
causeStackContains []string
leafStackContains []string
}{
{
desc: "depth 2, wrap in errFn",
chain: func() error {
// gets wrapped in errFn, so top of the stack is func1
return errFn(func() error { return wrap })
},
causeStackContains: []string{
"/ctxerr.TestSentryStack.func1 ",
},
},
{
desc: "depth 2, wrap immediately",
chain: func() error {
// gets wrapped immediately when returned, so top of the stack is funcX.1
return errFn(func() error { return Wrap(ctx, wrap) })
},
causeStackContains: []string{
"/ctxerr.TestSentryStack.func3.1 ",
"/ctxerr.TestSentryStack.func1 ", // errFn
},
},
{
desc: "depth 3, ctxerr.New",
chain: func() error {
// gets wrapped directly in the call to New, so top of the stack is X.1.1
return errFn(func() error { return func() error { return New(ctx, "new") }() })
},
causeStackContains: []string{
"/ctxerr.TestSentryStack.func4.1.1 ",
"/ctxerr.TestSentryStack.func4.1 ",
"/ctxerr.TestSentryStack.func1 ", // errFn
},
},
{
desc: "depth 4, ctxerr.New",
chain: func() error {
// stacked capture in New, so top of the stack is X.1.1.1
return errFn(func() error {
return func() error {
return func() error {
return New(ctx, "new")
}()
}()
})
},
causeStackContains: []string{
"/ctxerr.TestSentryStack.func5.1.1.1 ",
"/ctxerr.TestSentryStack.func5.1.1 ",
"/ctxerr.TestSentryStack.func5.1 ",
"/ctxerr.TestSentryStack.func1 ", // errFn
},
},
{
desc: "depth 4, ctxerr.New always wrapped",
chain: func() error {
// stacked capture in New, so top of the stack is X.1.1.1
return errFn(func() error {
return Wrap(ctx, func() error {
return Wrap(ctx, func() error {
return New(ctx, "new")
}())
}())
})
},
causeStackContains: []string{
"/ctxerr.TestSentryStack.func6.1.1.1 ",
"/ctxerr.TestSentryStack.func6.1.1 ",
"/ctxerr.TestSentryStack.func6.1 ",
"/ctxerr.TestSentryStack.func1 ", // errFn
},
leafStackContains: []string{
// only a single stack trace is collected when wrapping another
// FleetError.
"/ctxerr.TestSentryStack.func6.1 ",
},
},
{
desc: "depth 4, wrapped only at the end",
chain: func() error {
return errFn(func() error {
return Wrap(ctx, func() error {
return func() error {
return io.EOF
}()
}())
})
},
causeStackContains: []string{
// since it wraps a non-FleetError, the full stack is collected
"/ctxerr.TestSentryStack.func7.1 ",
"/ctxerr.TestSentryStack.func1 ", // errFn
},
},
}
for _, c := range cases {
t.Run(c.desc, func(t *testing.T) {
err := c.chain()
require.Error(t, err)
var ferr *FleetError
require.ErrorAs(t, err, &ferr)
leafStack := ferr.Stack()
cause := FleetCause(err)
causeStack := cause.Stack()
// if the fleet root error != fleet leaf error, then separate leaf +
// cause stacks must be provided.
if cause != ferr {
require.True(t, len(c.causeStackContains) > 0)
require.True(t, len(c.leafStackContains) > 0)
} else {
// otherwise use the same stack expectations for both
if len(c.causeStackContains) == 0 {
c.causeStackContains = c.leafStackContains
}
if len(c.leafStackContains) == 0 {
c.leafStackContains = c.causeStackContains
}
}
checkStack(t, causeStack, c.causeStackContains)
checkStack(t, leafStack, c.leafStackContains)
// start an HTTP server that Sentry will send the event to
var payload sentryPayload
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
b, err := io.ReadAll(r.Body)
require.NoError(t, err)
err = json.Unmarshal(b, &payload)
require.NoError(t, err)
w.WriteHeader(200)
}))
defer srv.Close()
// a "project ID" is required, which is the path portion
parsedURL, err := url.Parse(srv.URL + "/testproject")
require.NoError(t, err)
parsedURL.User = url.User("test")
err = sentry.Init(sentry.ClientOptions{Dsn: parsedURL.String()})
require.NoError(t, err)
// best-effort un-configure of Sentry on exit
t.Cleanup(func() {
sentry.CurrentHub().BindClient(nil)
})
eventID := sentry.CaptureException(cause)
require.NotNil(t, eventID)
require.True(t, sentry.Flush(2*time.Second), "failed to flush Sentry events in time")
require.True(t, len(payload.Exceptions) >= 1) // the wrapped errors are exploded into separate exceptions in the slice
// since we capture the FleetCause error, the last entry in the exceptions
// must be a FleetError and contain the stacktrace we're looking for.
rootCapturedErr := payload.Exceptions[len(payload.Exceptions)-1]
require.Equal(t, "*ctxerr.FleetError", rootCapturedErr.Type)
// format the stack trace the same way we do in ctxerr
var stack []string
for _, st := range rootCapturedErr.Stacktrace.Frames {
filename := st.Filename
if filename == "" {
// get it from abspath
filename = filepath.Base(st.AbsPath)
}
stack = append(stack, st.Module+"."+st.Function+" ("+filename+":"+fmt.Sprint(st.Lineno)+")")
}
// for some reason, Sentry reverses the stack trace
slices.Reverse(stack)
checkStack(t, stack, c.causeStackContains)
})
}
}
func checkStack(t *testing.T, stack, contains []string) {
stackStr := strings.Join(stack, "\n")
lastIx := -1
for _, want := range contains {
ix := strings.Index(stackStr, want)
require.True(t, ix > -1, "expected stack %v to contain %q", stackStr, want)
require.True(t, ix > lastIx, "expected %q to be after last check in %v", want, stackStr)
lastIx = ix
}
}

View File

@@ -16,7 +16,6 @@ import (
"github.com/fleetdm/fleet/v4/server/logging"
"github.com/fleetdm/fleet/v4/server/mdm/internal/commonmdm"
"github.com/fleetdm/fleet/v4/server/ptr"
"github.com/getsentry/sentry-go"
"github.com/go-kit/log/level"
"github.com/google/uuid"
"github.com/micromdm/nanodep/godep"
@@ -368,7 +367,13 @@ func NewDEPService(
depStorage,
depsync.WithLogger(logging.NewNanoDEPLogger(kitlog.With(logger, "component", "nanodep-syncer"))),
depsync.WithCallback(func(ctx context.Context, isFetch bool, resp *godep.DeviceResponse) error {
return depSvc.processDeviceResponse(ctx, depClient, resp)
// the nanodep syncer just logs the error of the callback, so in order to
// capture it we need to do this here.
err := depSvc.processDeviceResponse(ctx, depClient, resp)
if err != nil {
ctxerr.Handle(ctx, err)
}
return err
}),
)
@@ -443,7 +448,7 @@ func (d *DEPService) processDeviceResponse(ctx context.Context, depClient *godep
switch {
case err != nil:
level.Error(kitlog.With(d.logger)).Log("err", err)
sentry.CaptureException(err)
ctxerr.Handle(ctx, err)
case n > 0:
level.Info(kitlog.With(d.logger)).Log("msg", fmt.Sprintf("added %d new mdm device(s) to pending hosts", n))
case n == 0:

View File

@@ -10,7 +10,6 @@ import (
"github.com/fleetdm/fleet/v4/server/contexts/ctxerr"
"github.com/fleetdm/fleet/v4/server/datastore/redis"
"github.com/fleetdm/fleet/v4/server/fleet"
"github.com/getsentry/sentry-go"
kitlog "github.com/go-kit/kit/log"
"github.com/go-kit/kit/log/level"
redigo "github.com/gomodule/redigo/redis"
@@ -47,7 +46,6 @@ func NewTask(ds fleet.Datastore, pool fleet.RedisPool, clck clock.Clock, conf co
func (t *Task) StartCollectors(ctx context.Context, logger kitlog.Logger) {
collectorErrHandler := func(name string, err error) {
level.Error(logger).Log("err", fmt.Sprintf("%s collector", name), "details", err)
sentry.CaptureException(err)
ctxerr.Handle(ctx, err)
}

View File

@@ -131,7 +131,7 @@ func MakeHandler(
setRequestsContexts(svc),
),
kithttp.ServerErrorHandler(&errorHandler{logger}),
kithttp.ServerErrorEncoder(encodeErrorAndTrySentry(config.Sentry.Dsn != "")),
kithttp.ServerErrorEncoder(encodeError),
kithttp.ServerAfter(
kithttp.SetContentType("application/json; charset=utf-8"),
logRequestEnd(logger),

View File

@@ -12,7 +12,6 @@ import (
"github.com/fleetdm/fleet/v4/server/contexts/ctxerr"
"github.com/fleetdm/fleet/v4/server/fleet"
"github.com/getsentry/sentry-go"
"github.com/go-kit/kit/log"
"github.com/go-kit/kit/log/level"
)
@@ -166,7 +165,6 @@ func (s *Schedule) Start() {
prevScheduledRun, _, err := s.getLatestStats()
if err != nil {
level.Error(s.logger).Log("err", "start schedule", "details", err)
sentry.CaptureException(err)
ctxerr.Handle(s.ctx, err)
}
s.setIntervalStartedAt(prevScheduledRun.CreatedAt)
@@ -208,7 +206,6 @@ func (s *Schedule) Start() {
prevScheduledRun, _, err := s.getLatestStats()
if err != nil {
level.Error(s.logger).Log("err", "trigger get cron stats", "details", err)
sentry.CaptureException(err)
ctxerr.Handle(s.ctx, err)
}
@@ -241,7 +238,6 @@ func (s *Schedule) Start() {
prevScheduledRun, prevTriggeredRun, err := s.getLatestStats()
if err != nil {
level.Error(s.logger).Log("err", "get cron stats", "details", err)
sentry.CaptureException(err)
ctxerr.Handle(s.ctx, err)
// skip ahead to the next interval
schedTicker.Reset(schedInterval)
@@ -331,7 +327,7 @@ func (s *Schedule) Start() {
newInterval, err := s.configReloadIntervalFn(s.ctx)
if err != nil {
level.Error(s.logger).Log("err", "schedule interval config reload failed", "details", err)
sentry.CaptureException(err)
ctxerr.Handle(s.ctx, err)
continue
}
@@ -411,7 +407,6 @@ func (s *Schedule) runWithStats(statsType fleet.CronStatsType) {
statsID, err := s.insertStats(statsType, fleet.CronStatsStatusPending)
if err != nil {
level.Error(s.logger).Log("err", fmt.Sprintf("insert cron stats %s", s.name), "details", err)
sentry.CaptureException(err)
ctxerr.Handle(s.ctx, err)
}
level.Info(s.logger).Log("status", "pending")
@@ -420,7 +415,6 @@ func (s *Schedule) runWithStats(statsType fleet.CronStatsType) {
if err := s.updateStats(statsID, fleet.CronStatsStatusCompleted); err != nil {
level.Error(s.logger).Log("err", fmt.Sprintf("update cron stats %s", s.name), "details", err)
sentry.CaptureException(err)
ctxerr.Handle(s.ctx, err)
}
level.Info(s.logger).Log("status", "completed")
@@ -432,7 +426,6 @@ func (s *Schedule) runAllJobs() {
level.Debug(s.logger).Log("msg", "starting", "jobID", job.ID)
if err := runJob(s.ctx, job.Fn); err != nil {
level.Error(s.logger).Log("err", "running job", "details", err, "jobID", job.ID)
sentry.CaptureException(err)
ctxerr.Handle(s.ctx, err)
}
}
@@ -507,7 +500,7 @@ func (s *Schedule) acquireLock() bool {
ok, err := s.locker.Lock(s.ctx, s.getLockName(), s.instanceID, s.getSchedInterval())
if err != nil {
level.Error(s.logger).Log("msg", "lock failed", "err", err)
sentry.CaptureException(err)
ctxerr.Handle(s.ctx, err)
return false
}
if !ok {
@@ -521,7 +514,7 @@ func (s *Schedule) releaseLock() {
err := s.locker.Unlock(s.ctx, s.getLockName(), s.instanceID)
if err != nil {
level.Error(s.logger).Log("msg", "unlock failed", "err", err)
sentry.CaptureException(err)
ctxerr.Handle(s.ctx, err)
}
}

View File

@@ -4,16 +4,12 @@ import (
"context"
"encoding/json"
"errors"
"fmt"
"net"
"net/http"
"strconv"
"github.com/fleetdm/fleet/v4/server/contexts/ctxerr"
"github.com/fleetdm/fleet/v4/server/contexts/host"
"github.com/fleetdm/fleet/v4/server/contexts/viewer"
"github.com/fleetdm/fleet/v4/server/fleet"
"github.com/getsentry/sentry-go"
kithttp "github.com/go-kit/kit/transport/http"
"github.com/go-sql-driver/mysql"
)
@@ -72,16 +68,6 @@ type conflictErrorInterface interface {
IsConflict() bool
}
func encodeErrorAndTrySentry(sentryEnabled bool) func(ctx context.Context, err error, w http.ResponseWriter) {
if !sentryEnabled {
return encodeError
}
return func(ctx context.Context, err error, w http.ResponseWriter) {
encodeError(ctx, err, w)
sendToSentry(ctx, err)
}
}
// encode error and status header to the client
func encodeError(ctx context.Context, err error, w http.ResponseWriter) {
ctxerr.Handle(ctx, err)
@@ -223,21 +209,3 @@ func encodeError(ctx context.Context, err error, w http.ResponseWriter) {
enc.Encode(jsonErr) //nolint:errcheck
}
func sendToSentry(ctx context.Context, err error) {
v, haveUser := viewer.FromContext(ctx)
h, haveHost := host.FromContext(ctx)
localHub := sentry.CurrentHub().Clone()
if haveUser {
localHub.ConfigureScope(func(scope *sentry.Scope) {
scope.SetTag("email", v.User.Email)
scope.SetTag("user_id", fmt.Sprint(v.User.ID))
})
} else if haveHost {
localHub.ConfigureScope(func(scope *sentry.Scope) {
scope.SetTag("hostname", h.Hostname)
scope.SetTag("host_id", fmt.Sprint(h.ID))
})
}
localHub.CaptureException(err)
}

View File

@@ -0,0 +1,17 @@
# Running self-hosted Sentry
It may be useful to run a local, self-hosted version of Sentry for tests or to aid in monitoring a local development environment.
It is possible to do so by following the [steps documented on Sentry's website](https://develop.sentry.dev/self-hosted/).
While Sentry's documentation is canonical, the high-level steps are documented here and annotated with Fleet-specific information:
1. `git clone` the [Sentry self-hosted repository](https://github.com/getsentry/self-hosted)
2. `git checkout` a specific version (e.g. `git checkout 24.2.0`)
3. Run the `sudo ./install.sh` script (you may want to review the install scripts first; it takes a while to complete, maybe 30 minutes or so, and you'll be prompted to create a Sentry user and password towards the end)
4. Once done, you should be able to run `docker-compose up -d` to bring up the self-hosted Sentry stack (it starts a lot of containers)
5. Once the stack is up, you should be able to log in at `http://localhost:9000` (on Google Chrome, after login I was met with a CSRF protection failure page, but it worked on Firefox)
6. On the "Issues" page, you should see a button labelled "Installation Instructions"; clicking it will bring up a page with the DSN that you can copy to use with Fleet (e.g. `http://<base64-data>@localhost:9000/1`)
7. Start `fleet serve`, passing the `--sentry_dsn http://<sentry-dsn>` flag to enable Sentry
You may now log in to Fleet, and any errors should show up in this local, self-hosted version of Sentry.
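
To confirm the DSN works end to end before wiring it into Fleet, a short standalone program can send a test event. This is a minimal sketch; substitute the DSN you copied in step 6:

```go
package main

import (
	"errors"
	"log"
	"time"

	"github.com/getsentry/sentry-go"
)

func main() {
	// DSN copied from the "Installation Instructions" page (step 6 above).
	if err := sentry.Init(sentry.ClientOptions{
		Dsn: "http://<base64-data>@localhost:9000/1",
	}); err != nil {
		log.Fatal(err)
	}
	sentry.CaptureException(errors.New("self-hosted Sentry smoke test"))
	// Flush blocks until the event is delivered or the timeout elapses.
	if !sentry.Flush(5 * time.Second) {
		log.Fatal("event was not delivered in time")
	}
}
```

The test event should appear on the Issues page within a few seconds; if it doesn't, double-check the DSN and that the stack's containers are all healthy.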