mirror of https://github.com/empayre/fleet.git, synced 2024-11-06 00:45:19 +00:00
Fix stack trace, duplicates and better coverage of captured errors in APM (#16516)
This commit is contained in:
parent a1b23acfea
commit 792d76e2cd
changes/14713-fix-apm-stacktrace-and-duplicates (Normal file, 1 line)
@@ -0,0 +1 @@
+* Fixed how errors are sent to APM (Elastic) to avoid duplicates, cover more errors in background tasks (cron and worker jobs) and fix the reported stack trace.
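To make the "background tasks" part concrete: any cron or worker job that reports failures through `ctxerr.Handle` now gets those errors captured by APM as well (see the `Handle` change below). A hypothetical job illustrating the pattern; `runWorkerJob`, `doWork`, and the import path are assumptions for the example, not code from this diff:

```go
package worker

import (
    "context"

    // import path assumed from the fleetdm/fleet repository layout
    "github.com/fleetdm/fleet/v4/server/contexts/ctxerr"
)

// runWorkerJob is a hypothetical background job: any error funneled
// through ctxerr.Handle is now also sent to APM, not just stored.
func runWorkerJob(ctx context.Context, doWork func(context.Context) error) {
    if err := doWork(ctx); err != nil {
        ctxerr.Handle(ctx, ctxerr.Wrap(ctx, err, "worker: do work"))
    }
}
```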
@@ -16,6 +16,7 @@ import (
     "encoding/json"
     "errors"
     "fmt"
+    "runtime"
     "strings"
     "time"

@@ -64,6 +65,14 @@ func (e *FleetError) Stack() []string {
     return e.stack.List()
 }

+// StackTrace implements the runtimeStackTracer interface understood by the
+// elastic APM package to reuse already-captured stack traces.
+// https://github.com/elastic/apm-agent-go/blob/main/stacktrace/errors.go#L45-L47
+func (e *FleetError) StackTrace() *runtime.Frames {
+    st := e.stack.(stack) // outside of tests, e.stack is always a stack type
+    return runtime.CallersFrames(st)
+}
+
 // LogFields implements fleet.ErrWithLogFields, so attached error data can be
 // logged along with the error
 func (e *FleetError) LogFields() []any {
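For reference, the interface the APM agent checks for is tiny. Per the stacktrace/errors.go file linked above, it amounts to the following (reproduced here for context; see the link for the authoritative definition):

```go
// An error implementing this interface lets the elastic APM agent reuse
// the already-captured frames instead of capturing a new stack at
// CaptureError time (which would point at the capture site, not the
// failure site).
type runtimeStackTracer interface {
    StackTrace() *runtime.Frames
}
```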
@@ -153,10 +162,6 @@ func wrapError(ctx context.Context, msg string, cause error, data map[string]interface{}
     }

     edata := encodeData(ctx, data, !isFleetError)

-    // As a final step, report error to APM.
-    // This is safe to do even if APM is not enabled.
-    apm.CaptureError(ctx, cause).Send()
-
     return &FleetError{msg, stack, cause, edata}
 }
@@ -274,7 +279,8 @@ func fromContext(ctx context.Context) handler {
 }

 // Handle handles err by passing it to the registered error handler,
-// deduplicating it and storing it for a configured duration.
+// deduplicating it and storing it for a configured duration. It also takes
+// care of sending it to the configured APM, if any.
 func Handle(ctx context.Context, err error) {
     // as a last resort, wrap the error if there isn't
     // a FleetError in the chain
@@ -283,6 +289,14 @@ func Handle(ctx context.Context, err error) {
         err = Wrap(ctx, err, "missing FleetError in chain")
     }

+    cause := err
+    if ferr := FleetCause(err); ferr != nil {
+        // use the FleetCause error so we send the most relevant stacktrace to APM
+        // (the one from the initial New/Wrap call).
+        cause = ferr
+    }
+    apm.CaptureError(ctx, cause).Send()
+
     if eh := fromContext(ctx); eh != nil {
         eh.Store(err)
     }
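With the capture moved into `Handle`, each handled error is reported to APM once, with the stack of the deepest `FleetError` in the chain. `FleetCause` itself is not part of this diff; a minimal sketch of the chain walk it implies, assuming `FleetError` implements `Unwrap` (illustrative only, not the actual ctxerr implementation):

```go
// fleetCauseSketch walks the wrapped chain and keeps the last (deepest)
// *FleetError seen, so the stack captured closest to the original
// failure is the one handed to apm.CaptureError.
func fleetCauseSketch(err error) *FleetError {
    var deepest *FleetError
    for err != nil {
        if ferr, ok := err.(*FleetError); ok {
            deepest = ferr
        }
        err = errors.Unwrap(err)
    }
    return deepest
}
```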
@@ -1,10 +1,17 @@
 package ctxerr

 import (
     "context"
+    "errors"
+    "fmt"
+    "io"
     "regexp"
+    "strings"
     "testing"

     "github.com/stretchr/testify/require"
+    "go.elastic.co/apm/v2"
+    "go.elastic.co/apm/v2/apmtest"
 )

 type mockStack struct {
@@ -33,3 +40,186 @@ func TestStack(t *testing.T) {
         require.Regexpf(t, re, line, "expected line %d to match %q, got %q", i, re, line)
     }
 }
+
+func TestElasticStack(t *testing.T) {
+    ctx := context.Background()
+
+    var wrap = errors.New("wrap")
+    errFn := func(fn func() error) error { // func1
+        if err := fn(); err != nil {
+            if err == wrap {
+                return Wrap(ctx, err, "wrapped")
+            }
+            return err
+        }
+        return nil
+    }
+
+    cases := []struct {
+        desc               string
+        chain              func() error
+        causeStackContains []string
+        leafStackContains  []string
+    }{
+        {
+            desc: "depth 2, wrap in errFn",
+            chain: func() error {
+                // gets wrapped in errFn, so top of the stack is func1
+                return errFn(func() error { return wrap })
+            },
+            causeStackContains: []string{
+                "/ctxerr.TestElasticStack.func1 ",
+            },
+        },
+        {
+            desc: "depth 2, wrap immediately",
+            chain: func() error {
+                // gets wrapped immediately when returned, so top of the stack is funcX.1
+                return errFn(func() error { return Wrap(ctx, wrap) })
+            },
+            causeStackContains: []string{
+                "/ctxerr.TestElasticStack.func3.1 ",
+                "/ctxerr.TestElasticStack.func1 ", // errFn
+            },
+        },
+        {
+            desc: "depth 3, ctxerr.New",
+            chain: func() error {
+                // gets wrapped directly in the call to New, so top of the stack is X.1.1
+                return errFn(func() error { return func() error { return New(ctx, "new") }() })
+            },
+            causeStackContains: []string{
+                "/ctxerr.TestElasticStack.func4.1.1 ",
+                "/ctxerr.TestElasticStack.func4.1 ",
+                "/ctxerr.TestElasticStack.func1 ", // errFn
+            },
+        },
+        {
+            desc: "depth 4, ctxerr.New",
+            chain: func() error {
+                // stacked capture in New, so top of the stack is X.1.1.1
+                return errFn(func() error {
+                    return func() error {
+                        return func() error {
+                            return New(ctx, "new")
+                        }()
+                    }()
+                })
+            },
+            causeStackContains: []string{
+                "/ctxerr.TestElasticStack.func5.1.1.1 ",
+                "/ctxerr.TestElasticStack.func5.1.1 ",
+                "/ctxerr.TestElasticStack.func5.1 ",
+                "/ctxerr.TestElasticStack.func1 ", // errFn
+            },
+        },
+        {
+            desc: "depth 4, ctxerr.New always wrapped",
+            chain: func() error {
+                // stacked capture in New, so top of the stack is X.1.1.1
+                return errFn(func() error {
+                    return Wrap(ctx, func() error {
+                        return Wrap(ctx, func() error {
+                            return New(ctx, "new")
+                        }())
+                    }())
+                })
+            },
+            causeStackContains: []string{
+                "/ctxerr.TestElasticStack.func6.1.1.1 ",
+                "/ctxerr.TestElasticStack.func6.1.1 ",
+                "/ctxerr.TestElasticStack.func6.1 ",
+                "/ctxerr.TestElasticStack.func1 ", // errFn
+            },
+            leafStackContains: []string{
+                // only a single stack trace is collected when wrapping another
+                // FleetError.
+                "/ctxerr.TestElasticStack.func6.1 ",
+            },
+        },
+        {
+            desc: "depth 4, wrapped only at the end",
+            chain: func() error {
+                return errFn(func() error {
+                    return Wrap(ctx, func() error {
+                        return func() error {
+                            return io.EOF
+                        }()
+                    }())
+                })
+            },
+            causeStackContains: []string{
+                // since it wraps a non-FleetError, the full stack is collected
+                "/ctxerr.TestElasticStack.func7.1 ",
+                "/ctxerr.TestElasticStack.func1 ", // errFn
+            },
+        },
+    }
+    for _, c := range cases {
+        t.Run(c.desc, func(t *testing.T) {
+            checkStack := func(stack, contains []string) {
+                stackStr := strings.Join(stack, "\n")
+                lastIx := -1
+                for _, want := range contains {
+                    ix := strings.Index(stackStr, want)
+                    require.True(t, ix > -1, "expected stack %v to contain %q", stackStr, want)
+                    require.True(t, ix > lastIx, "expected %q to be after last check in %v", want, stackStr)
+                    lastIx = ix
+                }
+            }
+
+            err := c.chain()
+            require.Error(t, err)
+            var ferr *FleetError
+            require.ErrorAs(t, err, &ferr)
+
+            leafStack := ferr.Stack()
+            cause := FleetCause(err)
+            causeStack := cause.Stack()
+
+            // if the fleet root error != fleet leaf error, then separate leaf +
+            // cause stacks must be provided.
+            if cause != ferr {
+                require.True(t, len(c.causeStackContains) > 0)
+                require.True(t, len(c.leafStackContains) > 0)
+            } else {
+                // otherwise use the same stack expectations for both
+                if len(c.causeStackContains) == 0 {
+                    c.causeStackContains = c.leafStackContains
+                }
+                if len(c.leafStackContains) == 0 {
+                    c.leafStackContains = c.causeStackContains
+                }
+            }
+
+            checkStack(causeStack, c.causeStackContains)
+            checkStack(leafStack, c.leafStackContains)
+
+            // run in a test APM transaction, recording the sent events
+            _, _, apmErrs := apmtest.NewRecordingTracer().WithTransaction(func(ctx context.Context) {
+                // APM should be able to capture that error (we use the FleetCause error,
+                // so that it gets the most relevant stack trace and not that of the
+                // wrapped ones) and its cause is the cause.
+                apm.CaptureError(ctx, cause).Send()
+            })
+            require.Len(t, apmErrs, 1)
+            apmErr := apmErrs[0]
+            require.NotNil(t, apmErr)
+
+            // the culprit should be the function name of the top of the stack of the
+            // cause error.
+            fnName := strings.TrimSpace(c.causeStackContains[0][strings.Index(c.causeStackContains[0], "TestElasticStack"):])
+            require.Equal(t, fnName, apmErr.Culprit)
+
+            // the APM stack should match the cause stack (i.e. APM should have
+            // grabbed the stacktrace that we provided). If it didn't, it would have
+            // a stacktrace with a function name indicating the WithTransaction
+            // function literal where CaptureError is called.
+            var apmStack []string
+            for _, st := range apmErr.Exception.Stacktrace {
+                apmStack = append(apmStack, st.Module+"."+st.Function+" ("+st.File+":"+fmt.Sprint(st.Line)+")")
+            }
+            checkStack(apmStack, c.causeStackContains)
+        })
+    }
+}
tools/apm-elastic/.apm-server.yml (Normal file, 11 lines)
@@ -0,0 +1,11 @@
+apm-server:
+  host: apm-server:8200
+  frontend.enabled: true
+  frontend.allow_origins: "*"
+
+output.elasticsearch:
+  enabled: true
+  hosts: ["elasticsearch"]
+
+setup.kibana:
+  host: "kibana"
tools/apm-elastic/README.md (Normal file, 30 lines)
@@ -0,0 +1,30 @@
+# Running a local Elastic/Kibana APM
+
+## Setup
+
+To set up a full Elastic APM stack, run the following from this directory:
+
+```
+$ docker-compose up -d
+$ docker-compose exec apm-server ./apm-server setup
+```
+
+Give it a few seconds to complete setup, and then you should be able to view the APM website at `http://localhost:5601`.
+
+## Configuring local Fleet
+
+Make sure Fleet is ready to run locally (docker services are started, the binary is built, etc.; see [Testing and local development](../../docs/Contributing/Testing-and-local-development.md)).
+
+Start the locally-built Fleet (`fleet serve`) and provide the `--logging_tracing_enabled --logging_tracing_type=elasticapm` flags (note that sending those options via environment variables does not seem to work).
+
+Navigate around the Fleet website and you should start seeing API requests and (eventually) errors show up in the APM dashboard (http://localhost:5601/app/apm).
+
+You may set the following environment variables to further configure APM data collection:
+
+```
+ELASTIC_APM_SERVICE_NAME=fleet
+ELASTIC_APM_ENVIRONMENT=development
+ELASTIC_APM_TRANSACTION_SAMPLE_RATE=1
+```
+
+The default values should be fine for a local environment (in particular, the sample rate defaults to 1.0, so all events are recorded).
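For example, a local run with tracing enabled might look like the following (the `./build/fleet` path and environment values are assumptions about a typical local setup, not part of this diff):

```
$ ELASTIC_APM_ENVIRONMENT=development \
  ./build/fleet serve \
  --logging_tracing_enabled \
  --logging_tracing_type=elasticapm
```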
tools/apm-elastic/docker-compose.yml (Normal file, 45 lines)
@@ -0,0 +1,45 @@
+# From https://gist.github.com/sarriaroman/98289f3a1ae1cf46a801c89f2990c8fa
+
+# This docker-compose file will set up Elastic APM, Kibana and Elasticsearch containers,
+# and the apm-server.yml configuration file ensures that APM can talk to Kibana. Once
+# the servers are up and running you should be able to browse http://localhost:5601
+# and see Kibana running.
+# In order to set up the APM dashboards, you will need to run the `./apm-server setup`
+# command on the running apm-server container:
+# $ docker-compose up -d
+# Starting elasticsearch_1 ... done
+# Starting kibana_1 ... done
+# Starting apm-server_1 ... done
+# $ docker-compose exec apm-server ./apm-server setup
+# Loaded index template
+# Loaded dashboards
+#
+# You should now be able to browse http://localhost:5601 and see the APM dashboards in Kibana.
+version: "3"
+services:
+  elasticsearch:
+    image: docker.elastic.co/elasticsearch/elasticsearch:7.8.0
+    ports:
+      - "9200:9200"
+      - "9300:9300"
+    environment:
+      - discovery.type=single-node
+  kibana:
+    image: docker.elastic.co/kibana/kibana:7.8.0
+    ports:
+      - "5601:5601"
+    links:
+      - elasticsearch
+    depends_on:
+      - elasticsearch
+  apm-server:
+    image: docker.elastic.co/apm/apm-server:7.8.0
+    ports:
+      - "8200:8200"
+    volumes:
+      - ./.apm-server.yml:/usr/share/apm-server/apm-server.yml
+    depends_on:
+      - elasticsearch
+    links:
+      - elasticsearch
+      - kibana
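Once the three containers are up, a quick sanity check of each service could look like this (assuming the default ports above; exact response bodies vary by version):

```
$ curl -s http://localhost:9200/_cluster/health   # Elasticsearch
$ curl -s http://localhost:8200/                  # APM server
$ curl -s http://localhost:5601/api/status        # Kibana
```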