mirror of
https://github.com/valitydev/osquery-1.git
synced 2024-11-07 01:55:20 +00:00
Add memory and utilization limit override flags (#2858)
This commit is contained in:
parent
eaf362fcb1
commit
0017de5bf1
@ -76,6 +76,16 @@ Disable userland watchdog process. **osqueryd** uses a watchdog process to monit
|
||||
Performance limit level (0=loose, 1=normal, 2=restrictive, 3=debug). The default watchdog process uses a "level" to configure performance limits.
|
||||
The higher the level the more strict the limits become. The "debug" level disables the performance limits completely.
|
||||
|
||||
The watchdog "profiles" can be overridden for Memory and CPU Utilization.
|
||||
|
||||
`--watchdog_memory_limit=0`
|
||||
|
||||
If this value is non-zero, it overrides the maximum memory limit set by the watchdog level (`--watchdog_level`). The value is expressed in MB. Use this if you would like to allow the `osqueryd` process to allocate more than 100M, but less than 1G.
|
||||
|
||||
`--watchdog_utilization_limit=0`
|
||||
|
||||
If this value is non-zero, it overrides the maximum sustained CPU utilization limit set by the watchdog level (`--watchdog_level`). Use this if you would like to allow the `osqueryd` process to use more than 90% of a thread for more than 6 seconds of wall time.
|
||||
|
||||
`--utc=true`
|
||||
|
||||
Attempt to convert all UNIX calendar times to UTC.
|
||||
|
@ -196,10 +196,14 @@ class FakeWatcherRunner : public WatcherRunner {
|
||||
*
|
||||
* Internal calls to getProcessRow will return this structure.
|
||||
*/
|
||||
void setProcessRow(QueryData qd) { qd_ = std::move(qd); }
|
||||
void setProcessRow(QueryData qd) {
|
||||
qd_ = std::move(qd);
|
||||
}
|
||||
|
||||
/// The tests do not have access to the processes table.
|
||||
QueryData getProcessRow(pid_t pid) const override { return qd_; }
|
||||
QueryData getProcessRow(pid_t pid) const override {
|
||||
return qd_;
|
||||
}
|
||||
|
||||
private:
|
||||
QueryData qd_;
|
||||
@ -232,7 +236,7 @@ TEST_F(WatcherTests, test_watcherrunner_watcherhealth) {
|
||||
EXPECT_EQ(100U, state.initial_footprint);
|
||||
|
||||
// The measurement of latency applies an interval value normalization.
|
||||
auto iv = std::max(getWorkerLimit(INTERVAL), (size_t)1);
|
||||
auto iv = std::max(getWorkerLimit(WatchdogLimitType::INTERVAL), (size_t)1);
|
||||
EXPECT_EQ(100U / iv, state.user_time);
|
||||
EXPECT_EQ(0U, state.sustained_latency);
|
||||
|
||||
|
@ -50,17 +50,17 @@ using WatchdogLimitMap = std::map<WatchdogLimitType, LimitDefinition>;
|
||||
|
||||
const WatchdogLimitMap kWatchdogLimits = {
|
||||
// Maximum MB worker can privately allocate.
|
||||
{MEMORY_LIMIT, {100, 50, 1000}},
|
||||
{WatchdogLimitType::MEMORY_LIMIT, {100, 50, 1000}},
|
||||
// User or system CPU worker can utilize for LATENCY_LIMIT seconds.
|
||||
{UTILIZATION_LIMIT, {90, 80, 1000}},
|
||||
{WatchdogLimitType::UTILIZATION_LIMIT, {90, 80, 1000}},
|
||||
// Number of seconds the worker should run, else consider the exit fatal.
|
||||
{RESPAWN_LIMIT, {20, 20, 1000}},
|
||||
{WatchdogLimitType::RESPAWN_LIMIT, {20, 20, 1000}},
|
||||
// If the worker respawns too quickly, backoff on creating additional.
|
||||
{RESPAWN_DELAY, {5, 5, 1}},
|
||||
{WatchdogLimitType::RESPAWN_DELAY, {5, 5, 1}},
|
||||
// Seconds of tolerable UTILIZATION_LIMIT sustained latency.
|
||||
{LATENCY_LIMIT, {12, 6, 1000}},
|
||||
{WatchdogLimitType::LATENCY_LIMIT, {12, 6, 1000}},
|
||||
// How often to poll for performance limit violations.
|
||||
{INTERVAL, {3, 3, 3}},
|
||||
{WatchdogLimitType::INTERVAL, {3, 3, 3}},
|
||||
};
|
||||
|
||||
CLI_FLAG(int32,
|
||||
@ -68,6 +68,16 @@ CLI_FLAG(int32,
|
||||
0,
|
||||
"Performance limit level (0=normal, 1=restrictive, -1=off)");
|
||||
|
||||
CLI_FLAG(uint64,
|
||||
watchdog_memory_limit,
|
||||
0,
|
||||
"Override watchdog profile memory limit");
|
||||
|
||||
CLI_FLAG(uint64,
|
||||
watchdog_utilization_limit,
|
||||
0,
|
||||
"Override watchdog profile CPU utilization limit");
|
||||
|
||||
CLI_FLAG(bool, disable_watchdog, false, "Disable userland watchdog process");
|
||||
|
||||
void Watcher::resetWorkerCounters(size_t respawn_time) {
|
||||
@ -219,7 +229,7 @@ void WatcherRunner::start() {
|
||||
// A test harness can end the thread immediately.
|
||||
break;
|
||||
}
|
||||
pauseMilli(getWorkerLimit(INTERVAL) * 1000);
|
||||
pauseMilli(getWorkerLimit(WatchdogLimitType::INTERVAL) * 1000);
|
||||
} while (!interrupted() && ok());
|
||||
}
|
||||
|
||||
@ -265,7 +275,7 @@ PerformanceChange getChange(const Row& r, PerformanceState& state) {
|
||||
PerformanceChange change;
|
||||
|
||||
// IV is the check interval in seconds, and utilization is set per-second.
|
||||
change.iv = std::max(getWorkerLimit(INTERVAL), (size_t)1);
|
||||
change.iv = std::max(getWorkerLimit(WatchdogLimitType::INTERVAL), (size_t)1);
|
||||
UNSIGNED_BIGINT_LITERAL user_time = 0, system_time = 0;
|
||||
try {
|
||||
change.parent =
|
||||
@ -278,8 +288,10 @@ PerformanceChange getChange(const Row& r, PerformanceState& state) {
|
||||
}
|
||||
|
||||
// Check the difference of CPU time used since last check.
|
||||
if (user_time - state.user_time > getWorkerLimit(UTILIZATION_LIMIT) ||
|
||||
system_time - state.system_time > getWorkerLimit(UTILIZATION_LIMIT)) {
|
||||
if (user_time - state.user_time >
|
||||
getWorkerLimit(WatchdogLimitType::UTILIZATION_LIMIT) ||
|
||||
system_time - state.system_time >
|
||||
getWorkerLimit(WatchdogLimitType::UTILIZATION_LIMIT)) {
|
||||
state.sustained_latency++;
|
||||
} else {
|
||||
state.sustained_latency = 0;
|
||||
@ -313,7 +325,8 @@ static bool exceededMemoryLimit(const PerformanceChange& change) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return (change.footprint > getWorkerLimit(MEMORY_LIMIT) * 1024 * 1024);
|
||||
return (change.footprint >
|
||||
getWorkerLimit(WatchdogLimitType::MEMORY_LIMIT) * 1024 * 1024);
|
||||
}
|
||||
|
||||
static bool exceededCyclesLimit(const PerformanceChange& change) {
|
||||
@ -322,7 +335,7 @@ static bool exceededCyclesLimit(const PerformanceChange& change) {
|
||||
}
|
||||
|
||||
auto latency = change.sustained_latency * change.iv;
|
||||
return (latency >= getWorkerLimit(LATENCY_LIMIT));
|
||||
return (latency >= getWorkerLimit(WatchdogLimitType::LATENCY_LIMIT));
|
||||
}
|
||||
|
||||
Status WatcherRunner::isWatcherHealthy(const PlatformProcess& watcher,
|
||||
@ -390,12 +403,12 @@ void WatcherRunner::createWorker() {
|
||||
{
|
||||
WatcherLocker locker;
|
||||
if (Watcher::getState(Watcher::getWorker()).last_respawn_time >
|
||||
getUnixTime() - getWorkerLimit(RESPAWN_LIMIT)) {
|
||||
getUnixTime() - getWorkerLimit(WatchdogLimitType::RESPAWN_LIMIT)) {
|
||||
LOG(WARNING) << "osqueryd worker respawning too quickly: "
|
||||
<< Watcher::workerRestartCount() << " times";
|
||||
Watcher::workerRestarted();
|
||||
// The configured automatic delay.
|
||||
size_t delay = getWorkerLimit(RESPAWN_DELAY) * 1000;
|
||||
size_t delay = getWorkerLimit(WatchdogLimitType::RESPAWN_DELAY) * 1000;
|
||||
// Exponential back off for quickly-respawning clients.
|
||||
delay +=
|
||||
static_cast<size_t>(pow(2, Watcher::workerRestartCount())) * 1000;
|
||||
@ -451,7 +464,7 @@ bool WatcherRunner::createExtension(const std::string& extension) {
|
||||
{
|
||||
WatcherLocker locker;
|
||||
if (Watcher::getState(extension).last_respawn_time >
|
||||
getUnixTime() - getWorkerLimit(RESPAWN_LIMIT)) {
|
||||
getUnixTime() - getWorkerLimit(WatchdogLimitType::RESPAWN_LIMIT)) {
|
||||
LOG(WARNING) << "Extension respawning too quickly: " << extension;
|
||||
// Unlike a worker, if an extension respawns too quickly we give up.
|
||||
return false;
|
||||
@ -501,7 +514,7 @@ void WatcherWatcherRunner::start() {
|
||||
Initializer::requestShutdown();
|
||||
break;
|
||||
}
|
||||
pauseMilli(getWorkerLimit(INTERVAL) * 1000);
|
||||
pauseMilli(getWorkerLimit(WatchdogLimitType::INTERVAL) * 1000);
|
||||
}
|
||||
}
|
||||
|
||||
@ -510,6 +523,16 @@ size_t getWorkerLimit(WatchdogLimitType name) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (name == WatchdogLimitType::MEMORY_LIMIT &&
|
||||
FLAGS_watchdog_memory_limit > 0) {
|
||||
return FLAGS_watchdog_memory_limit;
|
||||
}
|
||||
|
||||
if (name == WatchdogLimitType::UTILIZATION_LIMIT &&
|
||||
FLAGS_watchdog_utilization_limit > 0) {
|
||||
return FLAGS_watchdog_utilization_limit;
|
||||
}
|
||||
|
||||
auto level = FLAGS_watchdog_level;
|
||||
// If no level was provided then use the default (config/switch).
|
||||
if (level == -1) {
|
||||
|
@ -42,7 +42,7 @@ class WatcherRunner;
|
||||
* here, and organized into levels. Such that a caller may enforce rigor or
|
||||
* relax the performance expectations of an osquery daemon.
|
||||
*/
|
||||
enum WatchdogLimitType {
|
||||
enum class WatchdogLimitType {
|
||||
MEMORY_LIMIT,
|
||||
UTILIZATION_LIMIT,
|
||||
RESPAWN_LIMIT,
|
||||
@ -109,13 +109,19 @@ class Watcher : private boost::noncopyable {
|
||||
size_t respawn_time);
|
||||
|
||||
/// Lock access to extensions.
|
||||
static void lock() { instance().lock_.lock(); }
|
||||
static void lock() {
|
||||
instance().lock_.lock();
|
||||
}
|
||||
|
||||
/// Unlock access to extensions.
|
||||
static void unlock() { instance().lock_.unlock(); }
|
||||
static void unlock() {
|
||||
instance().lock_.unlock();
|
||||
}
|
||||
|
||||
/// Accessor for autoloadable extension paths.
|
||||
static const ExtensionMap& extensions() { return instance().extensions_; }
|
||||
static const ExtensionMap& extensions() {
|
||||
return instance().extensions_;
|
||||
}
|
||||
|
||||
/// Lookup extension path from pid.
|
||||
static std::string getExtensionPath(const PlatformProcess& child);
|
||||
@ -131,7 +137,9 @@ class Watcher : private boost::noncopyable {
|
||||
static PerformanceState& getState(const std::string& extension);
|
||||
|
||||
/// Accessor for the worker process.
|
||||
static PlatformProcess& getWorker() { return *instance().worker_; }
|
||||
static PlatformProcess& getWorker() {
|
||||
return *instance().worker_;
|
||||
}
|
||||
|
||||
/// Setter for worker process.
|
||||
static void setWorker(const std::shared_ptr<PlatformProcess>& child) {
|
||||
@ -146,13 +154,19 @@ class Watcher : private boost::noncopyable {
|
||||
static void reset(const PlatformProcess& child);
|
||||
|
||||
/// Count the number of worker restarts.
|
||||
static size_t workerRestartCount() { return instance().worker_restarts_; }
|
||||
static size_t workerRestartCount() {
|
||||
return instance().worker_restarts_;
|
||||
}
|
||||
|
||||
/// Become responsible for the worker's fate, but do not guarantee its safety.
|
||||
static void bindFates() { instance().restart_worker_ = false; }
|
||||
static void bindFates() {
|
||||
instance().restart_worker_ = false;
|
||||
}
|
||||
|
||||
/// Check if the worker and watcher's fates are bound.
|
||||
static bool fatesBound() { return !instance().restart_worker_; }
|
||||
static bool fatesBound() {
|
||||
return !instance().restart_worker_;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Return the state of autoloadable extensions.
|
||||
@ -164,7 +178,9 @@ class Watcher : private boost::noncopyable {
|
||||
static bool hasManagedExtensions();
|
||||
|
||||
/// Check the status of the last worker.
|
||||
static int getWorkerStatus() { return instance().worker_status_; }
|
||||
static int getWorkerStatus() {
|
||||
return instance().worker_status_;
|
||||
}
|
||||
|
||||
private:
|
||||
/// Do not request the lock until extensions are used.
|
||||
@ -179,7 +195,9 @@ class Watcher : private boost::noncopyable {
|
||||
|
||||
private:
|
||||
/// Inform the watcher that the worker restarted without cause.
|
||||
static void workerRestarted() { instance().worker_restarts_++; }
|
||||
static void workerRestarted() {
|
||||
instance().worker_restarts_++;
|
||||
}
|
||||
|
||||
private:
|
||||
/// Performance state for the worker process.
|
||||
@ -228,10 +246,14 @@ class Watcher : private boost::noncopyable {
|
||||
class WatcherLocker {
|
||||
public:
|
||||
/// Construct and gain watcher lock.
|
||||
WatcherLocker() { Watcher::lock(); }
|
||||
WatcherLocker() {
|
||||
Watcher::lock();
|
||||
}
|
||||
|
||||
/// Destruct and release watcher lock.
|
||||
~WatcherLocker() { Watcher::unlock(); }
|
||||
~WatcherLocker() {
|
||||
Watcher::unlock();
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
@ -287,7 +309,9 @@ class WatcherRunner : public InternalRunnable {
|
||||
|
||||
private:
|
||||
/// For testing only, ask the WatcherRunner to run a start loop once.
|
||||
void runOnce() { run_once_ = true; }
|
||||
void runOnce() {
|
||||
run_once_ = true;
|
||||
}
|
||||
|
||||
private:
|
||||
/// Keep the invocation daemon's argc to iterate through argv.
|
||||
|
Loading…
Reference in New Issue
Block a user