osquery-1/osquery/core/watcher.cpp

211 lines
6.0 KiB
C++
Raw Normal View History

/*
* Copyright (c) 2014, Facebook, Inc.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree. An additional grant
* of patent rights can be found in the PATENTS file in the same directory.
*
*/
#include <cstring>
#include <sstream>
#include <sys/wait.h>
#include <signal.h>
2015-02-09 00:00:43 +00:00
#include <boost/filesystem.hpp>
2015-02-19 23:19:00 +00:00
#include <osquery/events.h>
#include <osquery/filesystem.h>
#include <osquery/logger.h>
#include <osquery/sql.h>
#include "osquery/core/watcher.h"
extern char** environ;
namespace fs = boost::filesystem;
namespace osquery {
2015-02-09 00:00:43 +00:00
const std::map<WatchdogLimitType, std::vector<size_t> > kWatchdogLimits = {
2015-02-19 23:19:00 +00:00
// Maximum MB worker can privately allocate.
{MEMORY_LIMIT, {50, 20, 10, 10}},
// Percent of user or system CPU worker can utilize for LATENCY_LIMIT
// seconds.
{UTILIZATION_LIMIT, {90, 70, 60, 50}},
2015-02-09 00:00:43 +00:00
// Number of seconds the worker should run, else consider the exit fatal.
2015-02-19 23:19:00 +00:00
{RESPAWN_LIMIT, {20, 20, 20, 5}},
2015-02-09 00:00:43 +00:00
// If the worker respawns too quickly, backoff on creating additional.
2015-02-19 23:19:00 +00:00
{RESPAWN_DELAY, {5, 5, 5, 1}},
// Seconds of tolerable UTILIZATION_LIMIT sustained latency.
{LATENCY_LIMIT, {5, 5, 3, 1}},
2015-02-09 00:00:43 +00:00
// How often to poll for performance limit violations.
2015-02-19 23:19:00 +00:00
{INTERVAL, {3, 3, 3, 1}}, };
2015-02-09 00:00:43 +00:00
2015-02-17 08:36:20 +00:00
FLAG(int32,
watchdog_level,
1,
2015-02-19 23:19:00 +00:00
"Performance limit level (0=loose, 1=normal, 2=restrictive, 3=debug)");
2015-02-17 08:36:20 +00:00
2015-03-03 23:03:14 +00:00
CLI_FLAG(bool, disable_watchdog, false, "Disable userland watchdog process");
bool Watcher::ok() {
2015-02-09 00:00:43 +00:00
::sleep(getWorkerLimit(INTERVAL));
return (worker_ >= 0);
}
bool Watcher::watch() {
int status;
pid_t result = waitpid(worker_, &status, WNOHANG);
if (worker_ == 0 || result == worker_) {
// Worker does not exist or never existed.
return false;
} else if (result == 0) {
// If the inspect finds problems it will stop/restart the worker.
if (!isWorkerSane()) {
stopWorker();
return false;
}
}
return true;
}
void Watcher::stopWorker() {
kill(worker_, SIGKILL);
worker_ = 0;
// Clean up the defunct (zombie) process.
waitpid(-1, 0, 0);
}
bool Watcher::isWorkerSane() {
auto rows =
SQL::selectAllFrom("processes", "pid", tables::EQUALS, INTEGER(worker_));
if (rows.size() == 0) {
// Could not find worker process?
return false;
}
// Compare CPU utilization since last check.
BIGINT_LITERAL footprint, user_time, system_time;
2015-02-09 00:00:43 +00:00
// IV is the check interval in seconds, and utilization is set per-second.
auto iv = getWorkerLimit(INTERVAL);
try {
2015-02-09 00:00:43 +00:00
user_time = AS_LITERAL(BIGINT_LITERAL, rows[0].at("user_time")) / iv;
system_time = AS_LITERAL(BIGINT_LITERAL, rows[0].at("system_time")) / iv;
footprint = AS_LITERAL(BIGINT_LITERAL, rows[0].at("phys_footprint"));
} catch (const std::exception& e) {
sustained_latency_ = 0;
}
2015-02-09 00:00:43 +00:00
if (current_user_time_ + getWorkerLimit(UTILIZATION_LIMIT) < user_time ||
current_system_time_ + getWorkerLimit(UTILIZATION_LIMIT) < system_time) {
sustained_latency_++;
} else {
sustained_latency_ = 0;
}
current_user_time_ = user_time;
current_system_time_ = system_time;
2015-02-09 00:00:43 +00:00
if (sustained_latency_ * iv >= getWorkerLimit(LATENCY_LIMIT)) {
LOG(WARNING) << "osqueryd worker system performance limits exceeded";
return false;
}
2015-02-09 00:00:43 +00:00
if (footprint > getWorkerLimit(MEMORY_LIMIT) * 1024 * 1024) {
LOG(WARNING) << "osqueryd worker memory limits exceeded";
return false;
}
// The worker is sane, no action needed.
return true;
}
void Watcher::createWorker() {
2015-02-09 00:00:43 +00:00
if (last_respawn_time_ > getUnixTime() - getWorkerLimit(RESPAWN_LIMIT)) {
LOG(WARNING) << "osqueryd worker respawning too quickly";
2015-02-09 00:00:43 +00:00
::sleep(getWorkerLimit(RESPAWN_DELAY));
}
2015-02-20 00:00:12 +00:00
// Get the path of the current process.
auto qd = SQL::selectAllFrom("processes", "pid", tables::EQUALS,
INTEGER(getpid()));
if (qd.size() != 1 || qd[0].count("path") == 0 || qd[0]["path"].size() == 0) {
LOG(ERROR) << "osquery watcher cannot determine process path";
::exit(EXIT_FAILURE);
}
worker_ = fork();
if (worker_ < 0) {
// Unrecoverable error, cannot create a worker process.
LOG(ERROR) << "osqueryd could not create a worker process";
::exit(EXIT_FAILURE);
} else if (worker_ == 0) {
// This is the new worker process, no watching needed.
setenv("OSQUERYD_WORKER", std::to_string(getpid()).c_str(), 1);
2015-02-20 00:00:12 +00:00
// Get the complete path of the osquery process binary.
auto exec_path = fs::system_complete(fs::path(qd[0]["path"]));
execve(exec_path.string().c_str(), argv_, environ);
// Code will never reach this point.
::exit(EXIT_FAILURE);
}
VLOG(1) << "osqueryd watcher (" << getpid() << ") executing worker ("
<< worker_ << ")";
}
void Watcher::resetCounters() {
// Reset the monitoring counters for the watcher.
sustained_latency_ = 0;
current_user_time_ = 0;
current_system_time_ = 0;
last_respawn_time_ = getUnixTime();
}
void Watcher::initWorker() {
// Set the worker's process name.
size_t name_size = strlen(argv_[0]);
for (int i = 0; i < argc_; i++) {
if (argv_[i] != nullptr) {
memset(argv_[i], 0, strlen(argv_[i]));
}
}
strncpy(argv_[0], name_.c_str(), name_size);
// Start a watcher watcher thread to exit the process if the watcher exits.
Dispatcher::getInstance().addService(
std::make_shared<WatcherWatcherRunner>(getppid()));
}
void WatcherWatcherRunner::enter() {
while (true) {
if (getppid() != watcher_) {
// Watcher died, the worker must follow.
VLOG(1) << "osqueryd worker (" << getpid()
<< ") detected killed watcher (" << watcher_ << ")";
::exit(EXIT_SUCCESS);
}
2015-02-09 00:00:43 +00:00
interruptableSleep(getWorkerLimit(INTERVAL) * 1000);
}
}
size_t getWorkerLimit(WatchdogLimitType name, int level) {
if (kWatchdogLimits.count(name) == 0) {
return 0;
}
// If no level was provided then use the default (config/switch).
if (level == -1) {
level = FLAGS_watchdog_level;
}
if (level > 3) {
return kWatchdogLimits.at(name).back();
}
2015-02-09 00:00:43 +00:00
return kWatchdogLimits.at(name).at(level);
}
}