diff --git a/changes/14713-fix-apm-stacktrace-and-duplicates b/changes/14713-fix-apm-stacktrace-and-duplicates new file mode 100644 index 000000000..464589267 --- /dev/null +++ b/changes/14713-fix-apm-stacktrace-and-duplicates @@ -0,0 +1 @@ +* Fixed how errors are sent to APM (Elastic) to avoid duplicates, cover more errors in background tasks (cron and worker jobs) and fix the reported stack trace. diff --git a/server/contexts/ctxerr/ctxerr.go b/server/contexts/ctxerr/ctxerr.go index aaae72e15..fcdec4feb 100644 --- a/server/contexts/ctxerr/ctxerr.go +++ b/server/contexts/ctxerr/ctxerr.go @@ -16,6 +16,7 @@ import ( "encoding/json" "errors" "fmt" + "runtime" "strings" "time" @@ -64,6 +65,14 @@ func (e *FleetError) Stack() []string { return e.stack.List() } +// StackTrace implements the runtimeStackTracer interface understood by the +// elastic APM package to reuse already-captured stack traces. +// https://github.com/elastic/apm-agent-go/blob/main/stacktrace/errors.go#L45-L47 +func (e *FleetError) StackTrace() *runtime.Frames { + st := e.stack.(stack) // outside of tests, e.stack is always a stack type + return runtime.CallersFrames(st) +} + // LogFields implements fleet.ErrWithLogFields, so attached error data can be // logged along with the error func (e *FleetError) LogFields() []any { @@ -153,10 +162,6 @@ func wrapError(ctx context.Context, msg string, cause error, data map[string]int } edata := encodeData(ctx, data, !isFleetError) - - // As a final step, report error to APM. - // This is safe to to even if APM is not enabled. - apm.CaptureError(ctx, cause).Send() return &FleetError{msg, stack, cause, edata} } @@ -274,7 +279,8 @@ func fromContext(ctx context.Context) handler { } // Handle handles err by passing it to the registered error handler, -// deduplicating it and storing it for a configured duration. +// deduplicating it and storing it for a configured duration. It also takes +// care of sending it to the configured APM, if any. func Handle(ctx context.Context, err error) { // as a last resource, wrap the error if there isn't // a FleetError in the chain @@ -283,6 +289,14 @@ func Handle(ctx context.Context, err error) { err = Wrap(ctx, err, "missing FleetError in chain") } + cause := err + if ferr := FleetCause(err); ferr != nil { + // use the FleetCause error so we send the most relevant stacktrace to APM + // (the one from the initial New/Wrap call). + cause = ferr + } + apm.CaptureError(ctx, cause).Send() + if eh := fromContext(ctx); eh != nil { eh.Store(err) } diff --git a/server/contexts/ctxerr/stack_test.go b/server/contexts/ctxerr/stack_test.go index 9447f7abb..4b155f66e 100644 --- a/server/contexts/ctxerr/stack_test.go +++ b/server/contexts/ctxerr/stack_test.go @@ -1,10 +1,17 @@ package ctxerr import ( + "context" + "errors" + "fmt" + "io" "regexp" + "strings" "testing" "github.com/stretchr/testify/require" + "go.elastic.co/apm/v2" + "go.elastic.co/apm/v2/apmtest" ) type mockStack struct { @@ -33,3 +40,186 @@ func TestStack(t *testing.T) { require.Regexpf(t, re, line, "expected line %d to match %q, got %q", i, re, line) } } + +func TestElasticStack(t *testing.T) { + ctx := context.Background() + + var wrap = errors.New("wrap") + errFn := func(fn func() error) error { // func1 + if err := fn(); err != nil { + if err == wrap { + return Wrap(ctx, err, "wrapped") + } + return err + } + return nil + } + + cases := []struct { + desc string + chain func() error + causeStackContains []string + leafStackContains []string + }{ + { + desc: "depth 2, wrap in errFn", + chain: func() error { + // gets wrapped in errFn, so top of the stack is func1 + return errFn(func() error { return wrap }) + }, + causeStackContains: []string{ + "/ctxerr.TestElasticStack.func1 ", + }, + }, + { + desc: "depth 2, wrap immediately", + chain: func() error { + // gets wrapped immediately when returned, so top of the stack is funcX.1 + return errFn(func() error { return Wrap(ctx, wrap) }) + }, + causeStackContains: []string{ + "/ctxerr.TestElasticStack.func3.1 ", + "/ctxerr.TestElasticStack.func1 ", // errFn + }, + }, + { + desc: "depth 3, ctxerr.New", + chain: func() error { + // gets wrapped directly in the call to New, so top of the stack is X.1.1 + return errFn(func() error { return func() error { return New(ctx, "new") }() }) + }, + causeStackContains: []string{ + "/ctxerr.TestElasticStack.func4.1.1 ", + "/ctxerr.TestElasticStack.func4.1 ", + "/ctxerr.TestElasticStack.func1 ", // errFn + }, + }, + { + desc: "depth 4, ctxerr.New", + chain: func() error { + // stacked capture in New, so top of the stack is X.1.1.1 + return errFn(func() error { + return func() error { + return func() error { + return New(ctx, "new") + }() + }() + }) + }, + causeStackContains: []string{ + "/ctxerr.TestElasticStack.func5.1.1.1 ", + "/ctxerr.TestElasticStack.func5.1.1 ", + "/ctxerr.TestElasticStack.func5.1 ", + "/ctxerr.TestElasticStack.func1 ", // errFn + }, + }, + { + desc: "depth 4, ctxerr.New always wrapped", + chain: func() error { + // stacked capture in New, so top of the stack is X.1.1.1 + return errFn(func() error { + return Wrap(ctx, func() error { + return Wrap(ctx, func() error { + return New(ctx, "new") + }()) + }()) + }) + }, + causeStackContains: []string{ + "/ctxerr.TestElasticStack.func6.1.1.1 ", + "/ctxerr.TestElasticStack.func6.1.1 ", + "/ctxerr.TestElasticStack.func6.1 ", + "/ctxerr.TestElasticStack.func1 ", // errFn + }, + leafStackContains: []string{ + // only a single stack trace is collected when wrapping another + // FleetError. + "/ctxerr.TestElasticStack.func6.1 ", + }, + }, + { + desc: "depth 4, wrapped only at the end", + chain: func() error { + return errFn(func() error { + return Wrap(ctx, func() error { + return func() error { + return io.EOF + }() + }()) + }) + }, + causeStackContains: []string{ + // since it wraps a non-FleetError, the full stack is collected + "/ctxerr.TestElasticStack.func7.1 ", + "/ctxerr.TestElasticStack.func1 ", // errFn + }, + }, + } + for _, c := range cases { + t.Run(c.desc, func(t *testing.T) { + checkStack := func(stack, contains []string) { + stackStr := strings.Join(stack, "\n") + lastIx := -1 + for _, want := range contains { + ix := strings.Index(stackStr, want) + require.True(t, ix > -1, "expected stack %v to contain %q", stackStr, want) + require.True(t, ix > lastIx, "expected %q to be after last check in %v", want, stackStr) + lastIx = ix + } + } + + err := c.chain() + require.Error(t, err) + var ferr *FleetError + require.ErrorAs(t, err, &ferr) + + leafStack := ferr.Stack() + cause := FleetCause(err) + causeStack := cause.Stack() + + // if the fleet root error != fleet leaf error, then separate leaf + + // cause stacks must be provided. + if cause != ferr { + require.True(t, len(c.causeStackContains) > 0) + require.True(t, len(c.leafStackContains) > 0) + } else { + // otherwise use the same stack expectations for both + if len(c.causeStackContains) == 0 { + c.causeStackContains = c.leafStackContains + } + if len(c.leafStackContains) == 0 { + c.leafStackContains = c.causeStackContains + } + } + + checkStack(causeStack, c.causeStackContains) + checkStack(leafStack, c.leafStackContains) + + // run in a test APM transaction, recording the sent events + _, _, apmErrs := apmtest.NewRecordingTracer().WithTransaction(func(ctx context.Context) { + // APM should be able to capture that error (we use the FleetCause error, + // so that it gets the most relevant stack trace and not that of the + // wrapped ones) and its cause is the cause. + apm.CaptureError(ctx, cause).Send() + }) + require.Len(t, apmErrs, 1) + apmErr := apmErrs[0] + require.NotNil(t, apmErr) + + // the culprit should be the function name of the top of the stack of the + // cause error. + fnName := strings.TrimSpace(c.causeStackContains[0][strings.Index(c.causeStackContains[0], "TestElasticStack"):]) + require.Equal(t, fnName, apmErr.Culprit) + + // the APM stack should match the cause stack (i.e. APM should have + // grabbed the stacktrace that we provided). If it didn't, it would have + // a stacktrace with a function name indicating the WithTransaction + // function literal where CaptureError is called. + var apmStack []string + for _, st := range apmErr.Exception.Stacktrace { + apmStack = append(apmStack, st.Module+"."+st.Function+" ("+st.File+":"+fmt.Sprint(st.Line)+")") + } + checkStack(apmStack, c.causeStackContains) + }) + } +} diff --git a/tools/apm-elastic/.apm-server.yml b/tools/apm-elastic/.apm-server.yml new file mode 100644 index 000000000..287e5b4b7 --- /dev/null +++ b/tools/apm-elastic/.apm-server.yml @@ -0,0 +1,11 @@ +apm-server: + host: apm-server:8200 + frontend.enabled: true + frontend.allow_origins: "*" + +output.elasticsearch: + enabled: true + hosts: ["elasticsearch"] + +setup.kibana: + host: "kibana" diff --git a/tools/apm-elastic/README.md b/tools/apm-elastic/README.md new file mode 100644 index 000000000..7d29e7aa8 --- /dev/null +++ b/tools/apm-elastic/README.md @@ -0,0 +1,30 @@ +# Running a local Elastic/Kibana APM + +## Setup + +To setup a full Elastic APM stack, from this directory, run: + +``` +$ docker-compose up -d +$ docker-compose exec apm-server ./apm-server setup +``` + +Give it a few seconds to complete setup, and then you should be able to view the APM website at `http://localhost:5601`. + +## Configuring local Fleet + +Make sure Fleet is ready to run locally (docker services are started, binary is built, etc., see [Testing and local development](../../docs/Contributing/Testing-and-local-development.md)). + +Start the locally-built Fleet (`fleet serve`) and provide the `--logging_tracing_enabled --logging_tracing_type=elasticapm` flags (note that sending those options using environment variables does not seem to work). + +Navigate the Fleet website and you should start seeing API requests and (eventually) errors show up in the APM dashboard (http://localhost:5601/app/apm). + +You may set the following environment variables to further configure the APM data collection: + +``` +ELASTIC_APM_SERVICE_NAME=fleet +ELASTIC_APM_ENVIRONMENT=development +ELASTIC_APM_TRANSACTION_SAMPLE_RATE=1 +``` + +The default values should be fine for a local environment (in particular, the sample rate defaults to 1.0 so all events are recorded). diff --git a/tools/apm-elastic/docker-compose.yml b/tools/apm-elastic/docker-compose.yml new file mode 100644 index 000000000..9caf3a042 --- /dev/null +++ b/tools/apm-elastic/docker-compose.yml @@ -0,0 +1,45 @@ +# From https://gist.github.com/sarriaroman/98289f3a1ae1cf46a801c89f2990c8fa + +# This docker-compose file will set up Elastic APM, Kibana and Elasticsearch containers +# and the apm-server.yml configuration file ensures that APM can talk to Kibana. Once +# the servers are up and running you should be able to browse http://localhost:5601 +# and see Kibana running. +# In order to set up the APM dashboards, you will need to run the `./apm-server -setup` +# command on the running apm-server container: +# $ docker-compose up -d +# Starting elasticsearch_1 ... done +# Starting kibana_1 ... done +# Starting apm-server_1 ... done +# $ docker-compose exec apm-server ./apm-server setup +# Loaded index template +# Loaded dashboards +# +# You should now be able to browse http://localhost:5601 and see the APM dashboards in Kibana. +version: "3" +services: + elasticsearch: + image: docker.elastic.co/elasticsearch/elasticsearch:7.8.0 + ports: + - "9200:9200" + - "9300:9300" + environment: + - discovery.type=single-node + kibana: + image: docker.elastic.co/kibana/kibana:7.8.0 + ports: + - "5601:5601" + links: + - elasticsearch + depends_on: + - elasticsearch + apm-server: + image: docker.elastic.co/apm/apm-server:7.8.0 + ports: + - "8200:8200" + volumes: + - ./.apm-server.yml:/usr/share/apm-server/apm-server.yml + depends_on: + - elasticsearch + links: + - elasticsearch + - kibana