Reducing the number of deadlock errors. (#17290)

#16562

Based on local testing, these changes reduce the number of 'Deadlock
found' errors when multiple hosts share the same UUID.

The performance issues with vulnerability processing will be fixed as
part of issue https://github.com/fleetdm/fleet/issues/16858

# Checklist for submitter

<!-- Note that API documentation changes are now addressed by the
product design team. -->

- [x] Changes file added for user-visible changes in `changes/` or
`orbit/changes/`.
See [Changes
files](https://fleetdm.com/docs/contributing/committing-changes#changes-files)
for more information.
- [ ] Added/updated tests
- [x] Manual QA for all new/changed functionality
This commit is contained in:
Victor Lyuboslavsky 2024-03-13 06:58:29 -05:00 committed by GitHub
parent e9a464e0cf
commit 1052b6b350
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 68 additions and 51 deletions

View File

@ -0,0 +1 @@
Reduced the number of 'Deadlock found' errors seen by the server when multiple hosts share the same UUID.

View File

@ -4405,7 +4405,10 @@ func (ds *Datastore) UpdateHost(ctx context.Context, host *fleet.Host) error {
refetch_critical_queries_until = ?
WHERE id = ?
`
_, err := ds.writer(ctx).ExecContext(ctx, sqlStatement,
return ds.withRetryTxx(
ctx, func(tx sqlx.ExtContext) error {
_, err := tx.ExecContext(
ctx, sqlStatement,
host.DetailUpdatedAt,
host.LabelUpdatedAt,
host.PolicyUpdatedAt,
@ -4445,7 +4448,8 @@ func (ds *Datastore) UpdateHost(ctx context.Context, host *fleet.Host) error {
if err != nil {
return ctxerr.Wrapf(ctx, err, "save host with id %d", host.ID)
}
_, err = ds.writer(ctx).ExecContext(ctx, `
_, err = tx.ExecContext(
ctx, `
UPDATE host_display_names
SET display_name=?
WHERE host_id=?`,
@ -4456,6 +4460,8 @@ func (ds *Datastore) UpdateHost(ctx context.Context, host *fleet.Host) error {
return ctxerr.Wrapf(ctx, err, "update host_display_names for host id %d", host.ID)
}
return nil
},
)
}
func (ds *Datastore) OSVersion(ctx context.Context, osVersionID uint, teamID *uint) (*fleet.OSVersion, *time.Time, error) {

View File

@ -225,8 +225,13 @@ func withRetryTxx(ctx context.Context, db *sqlx.DB, fn txFn, logger log.Logger)
return nil
}
bo := backoff.NewExponentialBackOff()
bo.MaxElapsedTime = 5 * time.Second
expBo := backoff.NewExponentialBackOff()
// The MySQL innodb_lock_wait_timeout default is 50 seconds, so a transaction can wait on a lock for several seconds.
// Setting a higher MaxElapsedTime increases the probability that the transaction will be retried.
// This reduces the number of retryable 'Deadlock found' errors. However, with a loaded DB, we will still see
// 'Context cancelled' errors when the server drops long-lasting connections.
expBo.MaxElapsedTime = 1 * time.Minute
bo := backoff.WithMaxRetries(expBo, 5)
return backoff.Retry(operation, bo)
}
@ -333,8 +338,13 @@ func (ds *Datastore) writeChanLoop() {
case *fleet.Host:
item.errCh <- ds.UpdateHost(item.ctx, actualItem)
case hostXUpdatedAt:
err := ds.withRetryTxx(
item.ctx, func(tx sqlx.ExtContext) error {
query := fmt.Sprintf(`UPDATE hosts SET %s = ? WHERE id=?`, actualItem.what)
_, err := ds.writer(item.ctx).ExecContext(item.ctx, query, actualItem.updatedAt, actualItem.hostID)
_, err := tx.ExecContext(item.ctx, query, actualItem.updatedAt, actualItem.hostID)
return err
},
)
item.errCh <- ctxerr.Wrap(item.ctx, err, "updating hosts label updated at")
}
}