Add memory and utilization limit override flags (#2858)

This commit is contained in:
Teddy Reed 2016-12-11 21:59:32 -08:00 committed by GitHub
parent eaf362fcb1
commit 0017de5bf1
4 changed files with 93 additions and 32 deletions

View File

@ -76,6 +76,16 @@ Disable userland watchdog process. **osqueryd** uses a watchdog process to monit
Performance limit level (0=loose, 1=normal, 2=restrictive, 3=debug). The default watchdog process uses a "level" to configure performance limits.
The higher the level the more strict the limits become. The "debug" level disables the performance limits completely.
The watchdog "profiles" can be overridden for Memory and CPU Utilization.
`--watchdog_memory_limit=0`
If this value is non-zero, it overrides the maximum memory limit selected by the watchdog level (`--watchdog_level`). The value is expressed in MB. Use this if you would like to allow the `osqueryd` process to allocate more than 100 MB, but less than 1 GB.
`--watchdog_utilization_limit=0`
If this value is non-zero, it overrides the maximum sustained CPU utilization limit selected by the watchdog level (`--watchdog_level`). Use this if you would like to allow the `osqueryd` process to use more than 90% of a thread for more than 6 seconds of wall time.
`--utc=true`
Attempt to convert all UNIX calendar times to UTC.

View File

@ -196,10 +196,14 @@ class FakeWatcherRunner : public WatcherRunner {
*
* Internal calls to getProcessRow will return this structure.
*/
void setProcessRow(QueryData qd) { qd_ = std::move(qd); }
void setProcessRow(QueryData qd) {
qd_ = std::move(qd);
}
/// The tests do not have access to the processes table.
QueryData getProcessRow(pid_t pid) const override { return qd_; }
QueryData getProcessRow(pid_t pid) const override {
return qd_;
}
private:
QueryData qd_;
@ -232,7 +236,7 @@ TEST_F(WatcherTests, test_watcherrunner_watcherhealth) {
EXPECT_EQ(100U, state.initial_footprint);
// The measurement of latency applies an interval value normalization.
auto iv = std::max(getWorkerLimit(INTERVAL), (size_t)1);
auto iv = std::max(getWorkerLimit(WatchdogLimitType::INTERVAL), (size_t)1);
EXPECT_EQ(100U / iv, state.user_time);
EXPECT_EQ(0U, state.sustained_latency);

View File

@ -50,17 +50,17 @@ using WatchdogLimitMap = std::map<WatchdogLimitType, LimitDefinition>;
const WatchdogLimitMap kWatchdogLimits = {
// Maximum MB worker can privately allocate.
{MEMORY_LIMIT, {100, 50, 1000}},
{WatchdogLimitType::MEMORY_LIMIT, {100, 50, 1000}},
// User or system CPU worker can utilize for LATENCY_LIMIT seconds.
{UTILIZATION_LIMIT, {90, 80, 1000}},
{WatchdogLimitType::UTILIZATION_LIMIT, {90, 80, 1000}},
// Number of seconds the worker should run, else consider the exit fatal.
{RESPAWN_LIMIT, {20, 20, 1000}},
{WatchdogLimitType::RESPAWN_LIMIT, {20, 20, 1000}},
// If the worker respawns too quickly, backoff on creating additional.
{RESPAWN_DELAY, {5, 5, 1}},
{WatchdogLimitType::RESPAWN_DELAY, {5, 5, 1}},
// Seconds of tolerable UTILIZATION_LIMIT sustained latency.
{LATENCY_LIMIT, {12, 6, 1000}},
{WatchdogLimitType::LATENCY_LIMIT, {12, 6, 1000}},
// How often to poll for performance limit violations.
{INTERVAL, {3, 3, 3}},
{WatchdogLimitType::INTERVAL, {3, 3, 3}},
};
CLI_FLAG(int32,
@ -68,6 +68,16 @@ CLI_FLAG(int32,
0,
"Performance limit level (0=normal, 1=restrictive, -1=off)");
/// Optional override for the worker memory limit, in MB (0 = no override).
/// When greater than 0, getWorkerLimit returns this value for MEMORY_LIMIT
/// instead of the value selected by the watchdog level.
CLI_FLAG(uint64,
watchdog_memory_limit,
0,
"Override watchdog profile memory limit");
/// Optional override for the worker CPU utilization limit (0 = no override).
/// When greater than 0, getWorkerLimit returns this value for
/// UTILIZATION_LIMIT instead of the value selected by the watchdog level;
/// getChange compares it against the user/system CPU time deltas per check.
CLI_FLAG(uint64,
watchdog_utilization_limit,
0,
"Override watchdog profile CPU utilization limit");
CLI_FLAG(bool, disable_watchdog, false, "Disable userland watchdog process");
void Watcher::resetWorkerCounters(size_t respawn_time) {
@ -219,7 +229,7 @@ void WatcherRunner::start() {
// A test harness can end the thread immediately.
break;
}
pauseMilli(getWorkerLimit(INTERVAL) * 1000);
pauseMilli(getWorkerLimit(WatchdogLimitType::INTERVAL) * 1000);
} while (!interrupted() && ok());
}
@ -265,7 +275,7 @@ PerformanceChange getChange(const Row& r, PerformanceState& state) {
PerformanceChange change;
// IV is the check interval in seconds, and utilization is set per-second.
change.iv = std::max(getWorkerLimit(INTERVAL), (size_t)1);
change.iv = std::max(getWorkerLimit(WatchdogLimitType::INTERVAL), (size_t)1);
UNSIGNED_BIGINT_LITERAL user_time = 0, system_time = 0;
try {
change.parent =
@ -278,8 +288,10 @@ PerformanceChange getChange(const Row& r, PerformanceState& state) {
}
// Check the difference of CPU time used since last check.
if (user_time - state.user_time > getWorkerLimit(UTILIZATION_LIMIT) ||
system_time - state.system_time > getWorkerLimit(UTILIZATION_LIMIT)) {
if (user_time - state.user_time >
getWorkerLimit(WatchdogLimitType::UTILIZATION_LIMIT) ||
system_time - state.system_time >
getWorkerLimit(WatchdogLimitType::UTILIZATION_LIMIT)) {
state.sustained_latency++;
} else {
state.sustained_latency = 0;
@ -313,7 +325,8 @@ static bool exceededMemoryLimit(const PerformanceChange& change) {
return false;
}
return (change.footprint > getWorkerLimit(MEMORY_LIMIT) * 1024 * 1024);
return (change.footprint >
getWorkerLimit(WatchdogLimitType::MEMORY_LIMIT) * 1024 * 1024);
}
static bool exceededCyclesLimit(const PerformanceChange& change) {
@ -322,7 +335,7 @@ static bool exceededCyclesLimit(const PerformanceChange& change) {
}
auto latency = change.sustained_latency * change.iv;
return (latency >= getWorkerLimit(LATENCY_LIMIT));
return (latency >= getWorkerLimit(WatchdogLimitType::LATENCY_LIMIT));
}
Status WatcherRunner::isWatcherHealthy(const PlatformProcess& watcher,
@ -390,12 +403,12 @@ void WatcherRunner::createWorker() {
{
WatcherLocker locker;
if (Watcher::getState(Watcher::getWorker()).last_respawn_time >
getUnixTime() - getWorkerLimit(RESPAWN_LIMIT)) {
getUnixTime() - getWorkerLimit(WatchdogLimitType::RESPAWN_LIMIT)) {
LOG(WARNING) << "osqueryd worker respawning too quickly: "
<< Watcher::workerRestartCount() << " times";
Watcher::workerRestarted();
// The configured automatic delay.
size_t delay = getWorkerLimit(RESPAWN_DELAY) * 1000;
size_t delay = getWorkerLimit(WatchdogLimitType::RESPAWN_DELAY) * 1000;
// Exponential back off for quickly-respawning clients.
delay +=
static_cast<size_t>(pow(2, Watcher::workerRestartCount())) * 1000;
@ -451,7 +464,7 @@ bool WatcherRunner::createExtension(const std::string& extension) {
{
WatcherLocker locker;
if (Watcher::getState(extension).last_respawn_time >
getUnixTime() - getWorkerLimit(RESPAWN_LIMIT)) {
getUnixTime() - getWorkerLimit(WatchdogLimitType::RESPAWN_LIMIT)) {
LOG(WARNING) << "Extension respawning too quickly: " << extension;
// Unlike a worker, if an extension respawns too quickly we give up.
return false;
@ -501,7 +514,7 @@ void WatcherWatcherRunner::start() {
Initializer::requestShutdown();
break;
}
pauseMilli(getWorkerLimit(INTERVAL) * 1000);
pauseMilli(getWorkerLimit(WatchdogLimitType::INTERVAL) * 1000);
}
}
@ -510,6 +523,16 @@ size_t getWorkerLimit(WatchdogLimitType name) {
return 0;
}
if (name == WatchdogLimitType::MEMORY_LIMIT &&
FLAGS_watchdog_memory_limit > 0) {
return FLAGS_watchdog_memory_limit;
}
if (name == WatchdogLimitType::UTILIZATION_LIMIT &&
FLAGS_watchdog_utilization_limit > 0) {
return FLAGS_watchdog_utilization_limit;
}
auto level = FLAGS_watchdog_level;
// If no level was provided then use the default (config/switch).
if (level == -1) {

View File

@ -42,7 +42,7 @@ class WatcherRunner;
* here, and organized into levels. Such that a caller may enforce rigor or
* relax the performance expectations of an osquery daemon.
*/
enum WatchdogLimitType {
enum class WatchdogLimitType {
MEMORY_LIMIT,
UTILIZATION_LIMIT,
RESPAWN_LIMIT,
@ -109,13 +109,19 @@ class Watcher : private boost::noncopyable {
size_t respawn_time);
/// Lock access to extensions.
static void lock() { instance().lock_.lock(); }
static void lock() {
instance().lock_.lock();
}
/// Unlock access to extensions.
static void unlock() { instance().lock_.unlock(); }
static void unlock() {
instance().lock_.unlock();
}
/// Accessor for autoloadable extension paths.
static const ExtensionMap& extensions() { return instance().extensions_; }
static const ExtensionMap& extensions() {
return instance().extensions_;
}
/// Lookup extension path from pid.
static std::string getExtensionPath(const PlatformProcess& child);
@ -131,7 +137,9 @@ class Watcher : private boost::noncopyable {
static PerformanceState& getState(const std::string& extension);
/// Accessor for the worker process.
static PlatformProcess& getWorker() { return *instance().worker_; }
static PlatformProcess& getWorker() {
return *instance().worker_;
}
/// Setter for worker process.
static void setWorker(const std::shared_ptr<PlatformProcess>& child) {
@ -146,13 +154,19 @@ class Watcher : private boost::noncopyable {
static void reset(const PlatformProcess& child);
/// Count the number of worker restarts.
static size_t workerRestartCount() { return instance().worker_restarts_; }
static size_t workerRestartCount() {
return instance().worker_restarts_;
}
/// Become responsible for the worker's fate, but do not guarantee its safety.
static void bindFates() { instance().restart_worker_ = false; }
static void bindFates() {
instance().restart_worker_ = false;
}
/// Check if the worker and watcher's fates are bound.
static bool fatesBound() { return !instance().restart_worker_; }
static bool fatesBound() {
return !instance().restart_worker_;
}
/**
* @brief Return the state of autoloadable extensions.
@ -164,7 +178,9 @@ class Watcher : private boost::noncopyable {
static bool hasManagedExtensions();
/// Check the status of the last worker.
static int getWorkerStatus() { return instance().worker_status_; }
static int getWorkerStatus() {
return instance().worker_status_;
}
private:
/// Do not request the lock until extensions are used.
@ -179,7 +195,9 @@ class Watcher : private boost::noncopyable {
private:
/// Inform the watcher that the worker restarted without cause.
static void workerRestarted() { instance().worker_restarts_++; }
static void workerRestarted() {
instance().worker_restarts_++;
}
private:
/// Performance state for the worker process.
@ -228,10 +246,14 @@ class Watcher : private boost::noncopyable {
class WatcherLocker {
public:
/// Construct and gain watcher lock.
WatcherLocker() { Watcher::lock(); }
WatcherLocker() {
Watcher::lock();
}
/// Destruct and release watcher lock.
~WatcherLocker() { Watcher::unlock(); }
~WatcherLocker() {
Watcher::unlock();
}
};
/**
@ -287,7 +309,9 @@ class WatcherRunner : public InternalRunnable {
private:
/// For testing only, ask the WatcherRunner to run a start loop once.
void runOnce() { run_once_ = true; }
void runOnce() {
run_once_ = true;
}
private:
/// Keep the invocation daemon's argc to iterate through argv.