Fix stack trace, duplicates and better coverage of captured errors in APM (#16516)

Martin Angers 2024-02-05 11:53:39 -05:00 committed by GitHub
parent a1b23acfea
commit 792d76e2cd
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 296 additions and 5 deletions


@ -0,0 +1 @@
* Fixed how errors are sent to APM (Elastic) to avoid duplicates, cover more errors in background tasks (cron and worker jobs), and fix the reported stack trace.


@ -16,6 +16,7 @@ import (
"encoding/json" "encoding/json"
"errors" "errors"
"fmt" "fmt"
"runtime"
"strings" "strings"
"time" "time"
@ -64,6 +65,14 @@ func (e *FleetError) Stack() []string {
return e.stack.List()
}
// StackTrace implements the runtimeStackTracer interface understood by the
// elastic APM package to reuse already-captured stack traces.
// https://github.com/elastic/apm-agent-go/blob/main/stacktrace/errors.go#L45-L47
func (e *FleetError) StackTrace() *runtime.Frames {
st := e.stack.(stack) // outside of tests, e.stack is always a stack type
return runtime.CallersFrames(st)
}
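For context, the unexported interface the agent checks for is assumed to look roughly like this (paraphrased from the apm-agent-go source linked above):
```
// Paraphrased from go.elastic.co/apm/v2/stacktrace; errors implementing
// this interface let the agent reuse an already-captured stack trace
// instead of taking a fresh one at the CaptureError call site.
type runtimeStackTracer interface {
	StackTrace() *runtime.Frames
}
```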
// LogFields implements fleet.ErrWithLogFields, so attached error data can be
// logged along with the error
func (e *FleetError) LogFields() []any {
@ -153,10 +162,6 @@ func wrapError(ctx context.Context, msg string, cause error, data map[string]int
}
edata := encodeData(ctx, data, !isFleetError)
// As a final step, report error to APM.
// This is safe to do even if APM is not enabled.
apm.CaptureError(ctx, cause).Send()
return &FleetError{msg, stack, cause, edata} return &FleetError{msg, stack, cause, edata}
} }
@ -274,7 +279,8 @@ func fromContext(ctx context.Context) handler {
}
// Handle handles err by passing it to the registered error handler,
// deduplicating it and storing it for a configured duration. It also takes
// care of sending it to the configured APM, if any.
func Handle(ctx context.Context, err error) {
// as a last resort, wrap the error if there isn't
// a FleetError in the chain
@ -283,6 +289,14 @@ func Handle(ctx context.Context, err error) {
err = Wrap(ctx, err, "missing FleetError in chain")
}
cause := err
if ferr := FleetCause(err); ferr != nil {
// use the FleetCause error so we send the most relevant stacktrace to APM
// (the one from the initial New/Wrap call).
cause = ferr
}
apm.CaptureError(ctx, cause).Send()
if eh := fromContext(ctx); eh != nil {
eh.Store(err)
}


@ -1,10 +1,17 @@
package ctxerr
import (
"context"
"errors"
"fmt"
"io"
"regexp" "regexp"
"strings"
"testing" "testing"
"github.com/stretchr/testify/require" "github.com/stretchr/testify/require"
"go.elastic.co/apm/v2"
"go.elastic.co/apm/v2/apmtest"
)
type mockStack struct {
@ -33,3 +40,186 @@ func TestStack(t *testing.T) {
require.Regexpf(t, re, line, "expected line %d to match %q, got %q", i, re, line)
}
}
func TestElasticStack(t *testing.T) {
ctx := context.Background()
var wrap = errors.New("wrap")
errFn := func(fn func() error) error { // func1
if err := fn(); err != nil {
if err == wrap {
return Wrap(ctx, err, "wrapped")
}
return err
}
return nil
}
cases := []struct {
desc string
chain func() error
causeStackContains []string
leafStackContains []string
}{
{
desc: "depth 2, wrap in errFn",
chain: func() error {
// gets wrapped in errFn, so top of the stack is func1
return errFn(func() error { return wrap })
},
causeStackContains: []string{
"/ctxerr.TestElasticStack.func1 ",
},
},
{
desc: "depth 2, wrap immediately",
chain: func() error {
// gets wrapped immediately when returned, so top of the stack is funcX.1
return errFn(func() error { return Wrap(ctx, wrap) })
},
causeStackContains: []string{
"/ctxerr.TestElasticStack.func3.1 ",
"/ctxerr.TestElasticStack.func1 ", // errFn
},
},
{
desc: "depth 3, ctxerr.New",
chain: func() error {
// gets wrapped directly in the call to New, so top of the stack is X.1.1
return errFn(func() error { return func() error { return New(ctx, "new") }() })
},
causeStackContains: []string{
"/ctxerr.TestElasticStack.func4.1.1 ",
"/ctxerr.TestElasticStack.func4.1 ",
"/ctxerr.TestElasticStack.func1 ", // errFn
},
},
{
desc: "depth 4, ctxerr.New",
chain: func() error {
// stacked capture in New, so top of the stack is X.1.1.1
return errFn(func() error {
return func() error {
return func() error {
return New(ctx, "new")
}()
}()
})
},
causeStackContains: []string{
"/ctxerr.TestElasticStack.func5.1.1.1 ",
"/ctxerr.TestElasticStack.func5.1.1 ",
"/ctxerr.TestElasticStack.func5.1 ",
"/ctxerr.TestElasticStack.func1 ", // errFn
},
},
{
desc: "depth 4, ctxerr.New always wrapped",
chain: func() error {
// stacked capture in New, so top of the stack is X.1.1.1
return errFn(func() error {
return Wrap(ctx, func() error {
return Wrap(ctx, func() error {
return New(ctx, "new")
}())
}())
})
},
causeStackContains: []string{
"/ctxerr.TestElasticStack.func6.1.1.1 ",
"/ctxerr.TestElasticStack.func6.1.1 ",
"/ctxerr.TestElasticStack.func6.1 ",
"/ctxerr.TestElasticStack.func1 ", // errFn
},
leafStackContains: []string{
// only a single stack trace is collected when wrapping another
// FleetError.
"/ctxerr.TestElasticStack.func6.1 ",
},
},
{
desc: "depth 4, wrapped only at the end",
chain: func() error {
return errFn(func() error {
return Wrap(ctx, func() error {
return func() error {
return io.EOF
}()
}())
})
},
causeStackContains: []string{
// since it wraps a non-FleetError, the full stack is collected
"/ctxerr.TestElasticStack.func7.1 ",
"/ctxerr.TestElasticStack.func1 ", // errFn
},
},
}
for _, c := range cases {
t.Run(c.desc, func(t *testing.T) {
checkStack := func(stack, contains []string) {
stackStr := strings.Join(stack, "\n")
lastIx := -1
for _, want := range contains {
ix := strings.Index(stackStr, want)
require.True(t, ix > -1, "expected stack %v to contain %q", stackStr, want)
require.True(t, ix > lastIx, "expected %q to be after last check in %v", want, stackStr)
lastIx = ix
}
}
err := c.chain()
require.Error(t, err)
var ferr *FleetError
require.ErrorAs(t, err, &ferr)
leafStack := ferr.Stack()
cause := FleetCause(err)
causeStack := cause.Stack()
// if the fleet root error != fleet leaf error, then separate leaf +
// cause stacks must be provided.
if cause != ferr {
require.True(t, len(c.causeStackContains) > 0)
require.True(t, len(c.leafStackContains) > 0)
} else {
// otherwise use the same stack expectations for both
if len(c.causeStackContains) == 0 {
c.causeStackContains = c.leafStackContains
}
if len(c.leafStackContains) == 0 {
c.leafStackContains = c.causeStackContains
}
}
checkStack(causeStack, c.causeStackContains)
checkStack(leafStack, c.leafStackContains)
// run in a test APM transaction, recording the sent events
_, _, apmErrs := apmtest.NewRecordingTracer().WithTransaction(func(ctx context.Context) {
// APM should be able to capture that error (we use the FleetCause error,
// so that it gets the most relevant stack trace and not that of the
// wrapped ones) and its cause is the cause.
apm.CaptureError(ctx, cause).Send()
})
require.Len(t, apmErrs, 1)
apmErr := apmErrs[0]
require.NotNil(t, apmErr)
// the culprit should be the function name of the top of the stack of the
// cause error.
fnName := strings.TrimSpace(c.causeStackContains[0][strings.Index(c.causeStackContains[0], "TestElasticStack"):])
require.Equal(t, fnName, apmErr.Culprit)
// the APM stack should match the cause stack (i.e. APM should have
// grabbed the stacktrace that we provided). If it didn't, it would have
// a stacktrace with a function name indicating the WithTransaction
// function literal where CaptureError is called.
var apmStack []string
for _, st := range apmErr.Exception.Stacktrace {
apmStack = append(apmStack, st.Module+"."+st.Function+" ("+st.File+":"+fmt.Sprint(st.Line)+")")
}
checkStack(apmStack, c.causeStackContains)
})
}
}


@ -0,0 +1,11 @@
apm-server:
host: apm-server:8200
frontend.enabled: true
frontend.allow_origins: "*"
output.elasticsearch:
enabled: true
hosts: ["elasticsearch"]
setup.kibana:
host: "kibana"


@ -0,0 +1,30 @@
# Running a local Elastic/Kibana APM
## Setup
To set up a full Elastic APM stack, from this directory, run:
```
$ docker-compose up -d
$ docker-compose exec apm-server ./apm-server setup
```
Give it a few seconds to complete setup, and then you should be able to view the APM website at `http://localhost:5601`.
## Configuring local Fleet
Make sure Fleet is ready to run locally (docker services are started, binary is built, etc., see [Testing and local development](../../docs/Contributing/Testing-and-local-development.md)).
Start the locally-built Fleet (`fleet serve`) and provide the `--logging_tracing_enabled --logging_tracing_type=elasticapm` flags (note that sending those options using environment variables does not seem to work).
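For example, assuming you built the binary at `./build/fleet` (adjust the path to match your setup):
```
./build/fleet serve --logging_tracing_enabled --logging_tracing_type=elasticapm
```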
Navigate around the Fleet website and you should start seeing API requests and (eventually) errors show up in the APM dashboard (http://localhost:5601/app/apm).
You may set the following environment variables to further configure the APM data collection:
```
ELASTIC_APM_SERVICE_NAME=fleet
ELASTIC_APM_ENVIRONMENT=development
ELASTIC_APM_TRANSACTION_SAMPLE_RATE=1
```
The default values should be fine for a local environment (in particular, the sample rate defaults to 1.0 so all events are recorded).


@ -0,0 +1,45 @@
# From https://gist.github.com/sarriaroman/98289f3a1ae1cf46a801c89f2990c8fa
# This docker-compose file will set up Elastic APM, Kibana and Elasticsearch containers
# and the apm-server.yml configuration file ensures that APM can talk to Kibana. Once
# the servers are up and running you should be able to browse http://localhost:5601
# and see Kibana running.
# In order to set up the APM dashboards, you will need to run the `./apm-server setup`
# command on the running apm-server container:
# $ docker-compose up -d
# Starting elasticsearch_1 ... done
# Starting kibana_1 ... done
# Starting apm-server_1 ... done
# $ docker-compose exec apm-server ./apm-server setup
# Loaded index template
# Loaded dashboards
#
# You should now be able to browse http://localhost:5601 and see the APM dashboards in Kibana.
version: "3"
services:
elasticsearch:
image: docker.elastic.co/elasticsearch/elasticsearch:7.8.0
ports:
- "9200:9200"
- "9300:9300"
environment:
- discovery.type=single-node
kibana:
image: docker.elastic.co/kibana/kibana:7.8.0
ports:
- "5601:5601"
links:
- elasticsearch
depends_on:
- elasticsearch
apm-server:
image: docker.elastic.co/apm/apm-server:7.8.0
ports:
- "8200:8200"
volumes:
- ./.apm-server.yml:/usr/share/apm-server/apm-server.yml
depends_on:
- elasticsearch
links:
- elasticsearch
- kibana