Update backend MDM migration flow and add logging to aid in debugging migration errors (#16627)

This commit is contained in:
Sarah Gillespie 2024-02-06 16:53:43 -06:00 committed by GitHub
parent 38d6e94f9f
commit 78aa1b0080
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
13 changed files with 94 additions and 17 deletions

View File

@ -0,0 +1 @@
- Updated backend MDM migration flow and added logging to aid in debugging migration errors.

View File

@ -8,6 +8,7 @@ import (
"github.com/fleetdm/fleet/v4/server/contexts/ctxerr"
hostctx "github.com/fleetdm/fleet/v4/server/contexts/host"
"github.com/fleetdm/fleet/v4/server/fleet"
"github.com/go-kit/kit/log/level"
)
func (svc *Service) ListDevicePolicies(ctx context.Context, host *fleet.Host) ([]*fleet.HostPolicy, error) {
@ -25,6 +26,9 @@ const refetchMDMUnenrollCriticalQueryDuration = 3 * time.Minute
// the server/webhooks one because it is a Fleet Premium only feature and for
// licensing reasons this needs to live under this package.
func (svc *Service) TriggerMigrateMDMDevice(ctx context.Context, host *fleet.Host) error {
level.Debug(svc.logger).Log("msg", "trigger migration webhook", "host_id", host.ID,
"refetch_critical_queries_until", host.RefetchCriticalQueriesUntil)
ac, err := svc.ds.AppConfig(ctx)
if err != nil {
return err
@ -33,6 +37,15 @@ func (svc *Service) TriggerMigrateMDMDevice(ctx context.Context, host *fleet.Hos
return fleet.ErrMDMNotConfigured
}
if host.RefetchCriticalQueriesUntil != nil && host.RefetchCriticalQueriesUntil.After(svc.clock.Now()) {
// the webhook has already been triggered successfully recently (within the
// refetch critical queries delay), so return as if it did send it successfully
// but do not re-send.
level.Debug(svc.logger).Log("msg", "waiting for critical queries refetch, skip sending webhook",
"host_id", host.ID)
return nil
}
var bre fleet.BadRequestError
switch {
case !ac.MDM.MacOSMigration.Enable:
@ -41,11 +54,6 @@ func (svc *Service) TriggerMigrateMDMDevice(ctx context.Context, host *fleet.Hos
bre.InternalErr = ctxerr.New(ctx, "macOS migration webhook URL not configured")
case !host.IsEligibleForDEPMigration():
bre.InternalErr = ctxerr.New(ctx, "host not eligible for macOS migration")
case host.RefetchCriticalQueriesUntil != nil && host.RefetchCriticalQueriesUntil.After(svc.clock.Now()):
// the webhook has already been triggered successfully recently (within the
// refetch critical queries delay), so do as if it did send it successfully
// but do not re-send.
return nil
}
if bre.InternalErr != nil {
return &bre
@ -66,7 +74,7 @@ func (svc *Service) TriggerMigrateMDMDevice(ctx context.Context, host *fleet.Hos
// existing third-party MDM.
refetchUntil := svc.clock.Now().Add(refetchMDMUnenrollCriticalQueryDuration)
host.RefetchCriticalQueriesUntil = &refetchUntil
if err := svc.ds.UpdateHost(ctx, host); err != nil {
if err := svc.ds.UpdateHostRefetchCriticalQueriesUntil(ctx, host.ID, &refetchUntil); err != nil {
return ctxerr.Wrap(ctx, err, "save host with refetch critical queries timestamp")
}

View File

@ -408,12 +408,15 @@ type mdmMigrationHandler struct {
}
func (m *mdmMigrationHandler) NotifyRemote() error {
log.Debug().Msg("sending request to trigger mdm migration webhook")
log.Info().Msg("sending request to trigger mdm migration webhook")
// TODO: Revisit if/when we should hide the migration menu item depending on the
// result of the client request.
if err := m.client.MigrateMDM(m.tokenReader.GetCached()); err != nil {
log.Error().Err(err).Msg("triggering migration webhook")
return fmt.Errorf("on migration start: %w", err)
}
log.Debug().Msg("successfully sent request to trigger mdm migration webhook")
log.Info().Msg("successfully sent request to trigger mdm migration webhook")
return nil
}

View File

@ -4271,9 +4271,30 @@ func (ds *Datastore) UpdateHostRefetchRequested(ctx context.Context, id uint, va
return nil
}
// UpdateHostRefetchCriticalQueriesUntil updates a host's refetch critical queries until field.
func (ds *Datastore) UpdateHostRefetchCriticalQueriesUntil(ctx context.Context, id uint, until *time.Time) error {
debugLogs := []interface{}{"msg", "update refetch_critical_queries_until", "host_id", id}
if until != nil {
debugLogs = append(debugLogs, "until", until.Format(time.RFC3339))
}
level.Debug(ds.logger).Log(debugLogs...)
sqlStatement := `UPDATE hosts SET refetch_critical_queries_until = ? WHERE id = ?`
_, err := ds.writer(ctx).ExecContext(ctx, sqlStatement, until, id)
if err != nil {
return ctxerr.Wrapf(ctx, err, "update host %d refetch_critical_queries_until", id)
}
return nil
}
// UpdateHost updates all columns of the `hosts` table.
// It only updates `hosts` table, other additional host information is ignored.
func (ds *Datastore) UpdateHost(ctx context.Context, host *fleet.Host) error {
if host.OrbitNodeKey == nil || *host.OrbitNodeKey == "" {
level.Debug(ds.logger).Log("msg", "missing orbit_node_key to update host",
"host_id", host.ID, "node_key", host.NodeKey, "orbit_node_key", host.OrbitNodeKey)
}
sqlStatement := `
UPDATE hosts SET
detail_updated_at = ?,

View File

@ -718,6 +718,9 @@ type Datastore interface {
// UpdateHostRefetchRequested updates a host's refetch requested field.
UpdateHostRefetchRequested(ctx context.Context, hostID uint, value bool) error
// UpdateHostRefetchCriticalQueriesUntil updates a host's refetch critical queries until field.
UpdateHostRefetchCriticalQueriesUntil(ctx context.Context, hostID uint, until *time.Time) error
// FlippingPoliciesForHost fetches the policies with incoming results and returns:
// - a list of "new" failing policies; "new" here means those that fail on their first
// run, and those that were passing on the previous run and are failing on the incoming execution.

View File

@ -525,6 +525,12 @@ func (d *DEPService) processDeviceResponse(ctx context.Context, depClient *godep
}
logs = append(logs, logCountsForResults(apiResp.Devices)...)
level.Info(logger).Log(logs...)
debugLogs := []interface{}{"msg", "assign profile responses by device"}
for k, v := range apiResp.Devices {
debugLogs = append(debugLogs, k, v)
}
level.Debug(logger).Log(debugLogs...)
}
return nil

View File

@ -508,6 +508,8 @@ type ListScheduledQueriesInPackFunc func(ctx context.Context, packID uint) (flee
type UpdateHostRefetchRequestedFunc func(ctx context.Context, hostID uint, value bool) error
type UpdateHostRefetchCriticalQueriesUntilFunc func(ctx context.Context, hostID uint, until *time.Time) error
type FlippingPoliciesForHostFunc func(ctx context.Context, hostID uint, incomingResults map[uint]*bool) (newFailing []uint, newPassing []uint, err error)
type RecordPolicyQueryExecutionsFunc func(ctx context.Context, host *fleet.Host, results map[uint]*bool, updated time.Time, deferredSaveHost bool) error
@ -1540,6 +1542,9 @@ type DataStore struct {
UpdateHostRefetchRequestedFunc UpdateHostRefetchRequestedFunc
UpdateHostRefetchRequestedFuncInvoked bool
UpdateHostRefetchCriticalQueriesUntilFunc UpdateHostRefetchCriticalQueriesUntilFunc
UpdateHostRefetchCriticalQueriesUntilFuncInvoked bool
FlippingPoliciesForHostFunc FlippingPoliciesForHostFunc
FlippingPoliciesForHostFuncInvoked bool
@ -3702,6 +3707,13 @@ func (s *DataStore) UpdateHostRefetchRequested(ctx context.Context, hostID uint,
return s.UpdateHostRefetchRequestedFunc(ctx, hostID, value)
}
func (s *DataStore) UpdateHostRefetchCriticalQueriesUntil(ctx context.Context, hostID uint, until *time.Time) error {
s.mu.Lock()
s.UpdateHostRefetchCriticalQueriesUntilFuncInvoked = true
s.mu.Unlock()
return s.UpdateHostRefetchCriticalQueriesUntilFunc(ctx, hostID, until)
}
func (s *DataStore) FlippingPoliciesForHost(ctx context.Context, hostID uint, incomingResults map[uint]*bool) (newFailing []uint, newPassing []uint, err error) {
s.mu.Lock()
s.FlippingPoliciesForHostFuncInvoked = true

View File

@ -5761,6 +5761,8 @@ func (s *integrationMDMTestSuite) TestMigrateMDMDeviceWebhook() {
require.NotNil(t, h.RefetchCriticalQueriesUntil)
require.True(t, h.RefetchCriticalQueriesUntil.After(time.Now()))
require.NoError(t, s.ds.UpdateHostRefetchCriticalQueriesUntil(context.Background(), h.ID, nil))
// bad token
s.Do("POST", fmt.Sprintf("/api/v1/fleet/device/%s/migrate_mdm", "bad-token"), nil, http.StatusUnauthorized)
require.False(t, webhookCalled)

View File

@ -1069,6 +1069,9 @@ func (svc *Service) SubmitDistributedQueryResults(
host.RefetchRequested = false
}
refetchCriticalCleared := refetchCriticalSet && host.RefetchCriticalQueriesUntil == nil
if refetchCriticalSet {
level.Debug(svc.logger).Log("msg", "refetch critical status on submit distributed query results", "host_id", host.ID, "refetch_requested", refetchRequested, "refetch_critical_queries_until", host.RefetchCriticalQueriesUntil, "refetch_critical_cleared", refetchCriticalCleared)
}
if refetchRequested || detailUpdated || refetchCriticalCleared {
appConfig, err := svc.ds.AppConfig(ctx)

View File

@ -1453,6 +1453,12 @@ func directIngestMDMMac(ctx context.Context, logger log.Logger, host *fleet.Host
logger.Log("component", "service", "method", "ingestMDM", "warn",
fmt.Sprintf("mdm expected single result got %d", len(rows)))
}
if host.RefetchCriticalQueriesUntil != nil {
level.Debug(logger).Log("msg", "ingesting macos mdm data during refetch critical queries window", "host_id", host.ID,
"data", fmt.Sprintf("%+v", rows))
}
enrolledVal := rows[0]["enrolled"]
if enrolledVal == "" {
return ctxerr.Wrap(ctx, fmt.Errorf("missing mdm.enrolled value: %d", host.ID))

View File

@ -6,9 +6,13 @@ import (
"log"
"net/http"
"os"
"os/signal"
"syscall"
"time"
)
const DELAY = 10 * time.Second // adjust this to simulate slow webhook response
func main() {
http.HandleFunc("/", func(writer http.ResponseWriter, request *http.Request) {
var detail string
@ -19,8 +23,10 @@ func main() {
detail = fmt.Sprintf("| BODY: %s", string(body))
}
log.Printf("%s %s %s\n", request.Method, request.URL.Path, detail)
if err := request.Write(writer); err != nil {
log.Fatalf(err.Error())
time.Sleep(DELAY)
if _, err := writer.Write(nil); err != nil {
log.Printf("error writing response %s", err.Error())
}
})
@ -32,9 +38,14 @@ func main() {
fmt.Printf("Server running at http://localhost%s\n", port)
server := &http.Server{
Addr: port,
ReadHeaderTimeout: 3 * time.Second,
}
if err := server.ListenAndServe(); err != nil {
log.Fatalf(err.Error())
ReadHeaderTimeout: 30 * time.Second,
}
go func() {
if err := server.ListenAndServe(); err != nil {
log.Fatalf("server error %s", err.Error())
}
}()
sig := make(chan os.Signal, 1)
signal.Notify(sig, syscall.SIGINT, syscall.SIGTERM)
<-sig // block on signal
}

View File

@ -13,6 +13,8 @@ Scripts in this directory aim to ease the testing of Orbit and the [TUF](https:/
> PS: We use `host.docker.internal` because the testing certificate `./tools/osquery/fleet.crt`
> has such hostname (and `localhost`) defined as SANs.
> PPS: Make sure you set the macOSX deployment target to the lowest macOS version you intend to support. See [Troubleshooting](#troubleshooting) for more details.
# Run
The `main.sh` creates and runs the TUF repository and optionally generate the installers (GENERATE_PKGS):
@ -94,7 +96,7 @@ When running Fleet Desktop on an older macOS version than it was compiled on, Or
_LSOpenURLsWithCompletionHandler() failed with error -10825
```
Solution: `Set the MACOSX_DEPLOYMENT_TARGET` environment variable to the lowest macOS version you intend to support:
Solution: Set the `MACOSX_DEPLOYMENT_TARGET` environment variable to the lowest macOS version you intend to support:
```
export MACOSX_DEPLOYMENT_TARGET=13 # replace '13' with your target macOS version

View File

@ -115,8 +115,7 @@ for system in $SYSTEMS; do
# Add swiftDialog on macos (if enabled).
if [[ $system == "macos" && -n "$SWIFT_DIALOG" ]]; then
# For now we always make swiftDialog (until it's uploaded to our TUF repo)
make swift-dialog-app-tar-gz version=$SWIFT_DIALOG_MACOS_APP_VERSION build=$SWIFT_DIALOG_MACOS_APP_BUILD_VERSION out-path=.
curl https://tuf.fleetctl.com/targets/swiftDialog/macos/stable/swiftDialog.app.tar.gz --output swiftDialog.app.tar.gz
./build/fleetctl updates add \
--path $TUF_PATH \