mirror of
https://github.com/empayre/fleet.git
synced 2024-11-06 08:55:24 +00:00
Scripts char limit: improve database migration performance and memory usage (#17338)
This commit is contained in:
parent
33a0324ebb
commit
0858f5a6f4
@ -1 +1,2 @@
|
||||
* Updated the database schema to support the increase in size of scripts.
|
||||
* **NOTE**: database migration might take a few minutes to complete, depending on scripts usage. It is mostly affected by the number of script executions (anonymous scripts via `fleetctl run-script` or saved scripts), the number of hosts targeted to execute the scripts and the number of saved scripts.
|
||||
|
@ -16,7 +16,7 @@ func init() {
|
||||
}
|
||||
|
||||
func Up_20240228111134(tx *sql.Tx) error {
|
||||
txx := sqlx.Tx{Tx: tx, Mapper: reflectx.NewMapperFunc("db", sqlx.NameMapper)}
|
||||
txx := &sqlx.Tx{Tx: tx, Mapper: reflectx.NewMapperFunc("db", sqlx.NameMapper)}
|
||||
|
||||
// at the time of this migration, we have two tables that deal with scripts:
|
||||
// - host_script_results: stores both the contents of the script and
|
||||
@ -57,21 +57,14 @@ CREATE TABLE script_contents (
|
||||
return fmt.Errorf("create table script_contents: %w", err)
|
||||
}
|
||||
|
||||
insertScriptContentsStmt := `
|
||||
INSERT INTO
|
||||
script_contents (md5_checksum, contents)
|
||||
VALUES
|
||||
(UNHEX(?), ?)
|
||||
ON DUPLICATE KEY UPDATE
|
||||
created_at = created_at
|
||||
`
|
||||
// map tracking the script_contents id for a given md5 checksum
|
||||
scriptContentsIDLookup := make(map[string]uint)
|
||||
// map tracking the host_script_results and scripts ids for a given md5
|
||||
// checksum
|
||||
hostScriptsLookup := make(map[string][]uint)
|
||||
savedScriptsLookup := make(map[string][]uint)
|
||||
|
||||
type scriptContent struct {
|
||||
ID uint `db:"id"`
|
||||
ScriptContents string `db:"script_contents"`
|
||||
}
|
||||
|
||||
// load all contents found in host_script_results to next insert in
|
||||
// load all contents found in host_script_results to insert in
|
||||
// script_contents
|
||||
readHostScriptsStmt := `
|
||||
SELECT
|
||||
@ -80,12 +73,11 @@ CREATE TABLE script_contents (
|
||||
FROM
|
||||
host_script_results
|
||||
`
|
||||
var hostScripts []scriptContent
|
||||
if err := txx.Select(&hostScripts, readHostScriptsStmt); err != nil {
|
||||
return fmt.Errorf("load host_script_results contents: %w", err)
|
||||
if err := createScriptContentsEntries(txx, "host_script_results", readHostScriptsStmt, scriptContentsIDLookup, hostScriptsLookup); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// load all contents found in scripts to next insert in script_contents
|
||||
// load all contents found in scripts to insert in script_contents
|
||||
readScriptsStmt := `
|
||||
SELECT
|
||||
id,
|
||||
@ -93,29 +85,8 @@ CREATE TABLE script_contents (
|
||||
FROM
|
||||
scripts
|
||||
`
|
||||
var savedScripts []scriptContent
|
||||
if err := txx.Select(&savedScripts, readScriptsStmt); err != nil {
|
||||
return fmt.Errorf("load saved scripts contents: %w", err)
|
||||
}
|
||||
|
||||
// for every script content, md5-hash it and keep track of the hash
|
||||
// associated with the ids to update the host_script_results and scripts
|
||||
// tables with the new script_content_id later on.
|
||||
hostScriptsLookup := make(map[string][]uint, len(hostScripts))
|
||||
savedScriptsLookup := make(map[string][]uint, len(savedScripts))
|
||||
for _, s := range hostScripts {
|
||||
hexChecksum := md5ChecksumScriptContent(s.ScriptContents)
|
||||
hostScriptsLookup[hexChecksum] = append(hostScriptsLookup[hexChecksum], s.ID)
|
||||
if _, err := txx.Exec(insertScriptContentsStmt, hexChecksum, s.ScriptContents); err != nil {
|
||||
return fmt.Errorf("create script_contents from host_script_results: %w", err)
|
||||
}
|
||||
}
|
||||
for _, s := range savedScripts {
|
||||
hexChecksum := md5ChecksumScriptContent(s.ScriptContents)
|
||||
savedScriptsLookup[hexChecksum] = append(savedScriptsLookup[hexChecksum], s.ID)
|
||||
if _, err := txx.Exec(insertScriptContentsStmt, hexChecksum, s.ScriptContents); err != nil {
|
||||
return fmt.Errorf("create script_contents from saved scripts: %w", err)
|
||||
}
|
||||
if err := createScriptContentsEntries(txx, "saved scripts", readScriptsStmt, scriptContentsIDLookup, savedScriptsLookup); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
alterAddHostScriptsStmt := `
|
||||
@ -138,10 +109,10 @@ ALTER TABLE scripts
|
||||
UPDATE
|
||||
host_script_results
|
||||
SET
|
||||
script_content_id = (SELECT id FROM script_contents WHERE md5_checksum = UNHEX(?)),
|
||||
script_content_id = ?,
|
||||
updated_at = updated_at
|
||||
WHERE
|
||||
id = ?`
|
||||
id IN (?)`
|
||||
|
||||
// for saved scripts, the `updated_at` timestamp is used as the "uploaded_at"
|
||||
// information in the UI, so we ensure that it doesn't change with this
|
||||
@ -150,30 +121,28 @@ WHERE
|
||||
UPDATE
|
||||
scripts
|
||||
SET
|
||||
script_content_id = (SELECT id FROM script_contents WHERE md5_checksum = UNHEX(?)),
|
||||
script_content_id = ?,
|
||||
updated_at = updated_at
|
||||
WHERE
|
||||
id = ?`
|
||||
id IN (?)`
|
||||
|
||||
// insert the associated script_content_id into host_script_results and
|
||||
// scripts
|
||||
for hexChecksum, ids := range hostScriptsLookup {
|
||||
for _, id := range ids {
|
||||
if _, err := txx.Exec(updateHostScriptsStmt, hexChecksum, id); err != nil {
|
||||
return fmt.Errorf("update host_script_results with script_content_id: %w", err)
|
||||
}
|
||||
scID := scriptContentsIDLookup[hexChecksum]
|
||||
if err := updateScriptContentIDInBatches(txx, "host_script_results", updateHostScriptsStmt, scID, ids); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
for hexChecksum, ids := range savedScriptsLookup {
|
||||
for _, id := range ids {
|
||||
if _, err := txx.Exec(updateSavedScriptsStmt, hexChecksum, id); err != nil {
|
||||
return fmt.Errorf("update saved scripts with script_content_id: %w", err)
|
||||
}
|
||||
scID := scriptContentsIDLookup[hexChecksum]
|
||||
if err := updateScriptContentIDInBatches(txx, "saved scripts", updateSavedScriptsStmt, scID, ids); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
// TODO(mna): we cannot drop the "script_contents" column immediately from
|
||||
// the host_script_results and scripts tables because that would break the
|
||||
// NOTE: we cannot drop the "script_contents" column immediately from the
|
||||
// host_script_results and scripts tables because that would break the
|
||||
// current code, we need to wait for the feature to be fully implemented.
|
||||
// There's no harm in leaving it in there unused for now, as stored scripts
|
||||
// were previously smallish.
|
||||
@ -181,6 +150,89 @@ WHERE
|
||||
return nil
|
||||
}
|
||||
|
||||
var testBatchSize int
|
||||
|
||||
func updateScriptContentIDInBatches(txx *sqlx.Tx, stmtTable, stmt string, scriptContentID uint, allIDs []uint) error {
|
||||
const maxBatchSize = 10000
|
||||
|
||||
batchSize := maxBatchSize
|
||||
if testBatchSize > 0 {
|
||||
// to allow override for tests
|
||||
batchSize = testBatchSize
|
||||
}
|
||||
|
||||
var startIx int
|
||||
for startIx < len(allIDs) {
|
||||
batchIDs := allIDs[startIx:]
|
||||
if len(batchIDs) > batchSize {
|
||||
batchIDs = batchIDs[startIx : startIx+batchSize]
|
||||
}
|
||||
startIx += len(batchIDs)
|
||||
|
||||
stmt, args, err := sqlx.In(stmt, scriptContentID, batchIDs)
|
||||
if err != nil {
|
||||
return fmt.Errorf("prepare statement to update %s with script_content_id: %w", stmtTable, err)
|
||||
}
|
||||
if _, err := txx.Exec(stmt, args...); err != nil {
|
||||
return fmt.Errorf("update %s with script_content_id: %w", stmtTable, err)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func createScriptContentsEntries(txx *sqlx.Tx, stmtTable, stmt string, scriptContentsIDLookup map[string]uint, contentHashToStmtTableIDs map[string][]uint) error {
|
||||
type scriptContent struct {
|
||||
ID uint `db:"id"`
|
||||
ScriptContents string `db:"script_contents"`
|
||||
}
|
||||
|
||||
// using id = LAST_INSERT_ID(id) to get the id of the row that was updated
|
||||
// in case of a duplicate key.
|
||||
insertScriptContentsStmt := `
|
||||
INSERT INTO
|
||||
script_contents (md5_checksum, contents)
|
||||
VALUES
|
||||
(UNHEX(?), ?)
|
||||
ON DUPLICATE KEY UPDATE
|
||||
id = LAST_INSERT_ID(id)
|
||||
`
|
||||
|
||||
// we cannot use a txx.Query and iterate over the rows, because we would need
|
||||
// another connection to be able to insert into script_contents while
|
||||
// iterating. So instead, we load the scripts in reasonably-sized batches.
|
||||
const batchSize = 1000 // at most ~10MB of script contents
|
||||
var lastID uint
|
||||
stmt += ` WHERE id > ? ORDER BY id LIMIT ?`
|
||||
|
||||
for {
|
||||
var scriptContents []scriptContent
|
||||
if err := txx.Select(&scriptContents, stmt, lastID, batchSize); err != nil {
|
||||
return fmt.Errorf("load %s contents: %w", stmtTable, err)
|
||||
}
|
||||
|
||||
if len(scriptContents) == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
for _, s := range scriptContents {
|
||||
lastID = s.ID
|
||||
|
||||
hexChecksum := md5ChecksumScriptContent(s.ScriptContents)
|
||||
contentHashToStmtTableIDs[hexChecksum] = append(contentHashToStmtTableIDs[hexChecksum], s.ID)
|
||||
if id := scriptContentsIDLookup[hexChecksum]; id == 0 {
|
||||
// insert the script content into the script_contents table, we don't
|
||||
// have its id yet
|
||||
res, err := txx.Exec(insertScriptContentsStmt, hexChecksum, s.ScriptContents)
|
||||
if err != nil {
|
||||
return fmt.Errorf("create script_contents from %s: %w", stmtTable, err)
|
||||
}
|
||||
id, _ := res.LastInsertId()
|
||||
scriptContentsIDLookup[hexChecksum] = uint(id)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func md5ChecksumScriptContent(s string) string {
|
||||
rawChecksum := md5.Sum([]byte(s)) //nolint:gosec
|
||||
return strings.ToUpper(hex.EncodeToString(rawChecksum[:]))
|
||||
|
@ -7,6 +7,9 @@ import (
|
||||
)
|
||||
|
||||
func TestUp_20240228111134(t *testing.T) {
|
||||
testBatchSize = 2
|
||||
defer func() { testBatchSize = 0 }()
|
||||
|
||||
db := applyUpToPrev(t)
|
||||
|
||||
scriptA, scriptB, scriptC, scriptD := "scriptA", "scriptB", "scriptC", "scriptD"
|
||||
|
Loading…
Reference in New Issue
Block a user