2015-01-26 08:02:02 +00:00
|
|
|
/*
|
|
|
|
* Copyright (c) 2014, Facebook, Inc.
|
|
|
|
* All rights reserved.
|
|
|
|
*
|
|
|
|
* This source code is licensed under the BSD-style license found in the
|
|
|
|
* LICENSE file in the root directory of this source tree. An additional grant
|
|
|
|
* of patent rights can be found in the PATENTS file in the same directory.
|
|
|
|
*
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include <cstring>
|
|
|
|
#include <sstream>
|
|
|
|
|
|
|
|
#include <sys/wait.h>
|
|
|
|
#include <signal.h>
|
|
|
|
|
2015-02-09 00:00:43 +00:00
|
|
|
#include <boost/filesystem.hpp>
|
|
|
|
|
2015-01-26 08:02:02 +00:00
|
|
|
#include <osquery/core.h>
|
2015-02-06 17:42:03 +00:00
|
|
|
#include <osquery/filesystem.h>
|
2015-01-26 08:02:02 +00:00
|
|
|
#include <osquery/logger.h>
|
|
|
|
#include <osquery/sql.h>
|
|
|
|
|
|
|
|
#include "osquery/core/watcher.h"
|
|
|
|
|
2015-02-06 17:42:03 +00:00
|
|
|
extern char** environ;
|
|
|
|
|
|
|
|
namespace fs = boost::filesystem;
|
|
|
|
|
2015-01-26 08:02:02 +00:00
|
|
|
namespace osquery {
|
|
|
|
|
2015-02-09 00:00:43 +00:00
|
|
|
const std::map<WatchdogLimitType, std::vector<size_t> > kWatchdogLimits = {
|
|
|
|
{MEMORY_LIMIT, {50, 20, 10}},
|
|
|
|
{UTILIZATION_LIMIT, {90, 70, 60}},
|
|
|
|
// Number of seconds the worker should run, else consider the exit fatal.
|
|
|
|
{RESPAWN_LIMIT, {20, 20, 20}},
|
|
|
|
// If the worker respawns too quickly, backoff on creating additional.
|
|
|
|
{RESPAWN_DELAY, {5, 5, 5}},
|
|
|
|
// Seconds of tolerable sustained latency.
|
|
|
|
{LATENCY_LIMIT, {5, 5, 3}},
|
|
|
|
// How often to poll for performance limit violations.
|
|
|
|
{INTERVAL, {3, 3, 3}}, };
|
|
|
|
|
|
|
|
DEFINE_osquery_flag(
|
|
|
|
int32,
|
|
|
|
watchdog_level,
|
|
|
|
1,
|
|
|
|
"Performance limit level (0=loose, 1=normal, 2=restrictive)");
|
2015-01-26 08:02:02 +00:00
|
|
|
|
2015-02-06 17:42:03 +00:00
|
|
|
DEFINE_osquery_flag(bool,
|
|
|
|
disable_watchdog,
|
|
|
|
false,
|
|
|
|
"Disable userland watchdog process");
|
|
|
|
|
2015-01-26 08:02:02 +00:00
|
|
|
bool Watcher::ok() {
|
2015-02-09 00:00:43 +00:00
|
|
|
::sleep(getWorkerLimit(INTERVAL));
|
2015-01-26 08:02:02 +00:00
|
|
|
return (worker_ >= 0);
|
|
|
|
}
|
|
|
|
|
|
|
|
bool Watcher::watch() {
|
|
|
|
int status;
|
|
|
|
pid_t result = waitpid(worker_, &status, WNOHANG);
|
|
|
|
if (worker_ == 0 || result == worker_) {
|
|
|
|
// Worker does not exist or never existed.
|
|
|
|
return false;
|
|
|
|
} else if (result == 0) {
|
|
|
|
// If the inspect finds problems it will stop/restart the worker.
|
|
|
|
if (!isWorkerSane()) {
|
|
|
|
stopWorker();
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
void Watcher::stopWorker() {
|
|
|
|
kill(worker_, SIGKILL);
|
|
|
|
worker_ = 0;
|
|
|
|
// Clean up the defunct (zombie) process.
|
|
|
|
waitpid(-1, 0, 0);
|
|
|
|
}
|
|
|
|
|
|
|
|
bool Watcher::isWorkerSane() {
|
|
|
|
auto rows =
|
|
|
|
SQL::selectAllFrom("processes", "pid", tables::EQUALS, INTEGER(worker_));
|
|
|
|
if (rows.size() == 0) {
|
|
|
|
// Could not find worker process?
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Compare CPU utilization since last check.
|
2015-02-06 17:42:03 +00:00
|
|
|
BIGINT_LITERAL footprint, user_time, system_time;
|
2015-02-09 00:00:43 +00:00
|
|
|
// IV is the check interval in seconds, and utilization is set per-second.
|
|
|
|
auto iv = getWorkerLimit(INTERVAL);
|
2015-02-06 17:42:03 +00:00
|
|
|
|
|
|
|
try {
|
2015-02-09 00:00:43 +00:00
|
|
|
user_time = AS_LITERAL(BIGINT_LITERAL, rows[0].at("user_time")) / iv;
|
|
|
|
system_time = AS_LITERAL(BIGINT_LITERAL, rows[0].at("system_time")) / iv;
|
2015-02-06 17:42:03 +00:00
|
|
|
footprint = AS_LITERAL(BIGINT_LITERAL, rows[0].at("phys_footprint"));
|
|
|
|
} catch (const std::exception& e) {
|
|
|
|
sustained_latency_ = 0;
|
|
|
|
}
|
|
|
|
|
2015-02-09 00:00:43 +00:00
|
|
|
if (current_user_time_ + getWorkerLimit(UTILIZATION_LIMIT) < user_time ||
|
|
|
|
current_system_time_ + getWorkerLimit(UTILIZATION_LIMIT) < system_time) {
|
2015-01-26 08:02:02 +00:00
|
|
|
sustained_latency_++;
|
|
|
|
} else {
|
|
|
|
sustained_latency_ = 0;
|
|
|
|
}
|
2015-02-06 17:42:03 +00:00
|
|
|
|
2015-01-26 08:02:02 +00:00
|
|
|
current_user_time_ = user_time;
|
|
|
|
current_system_time_ = system_time;
|
|
|
|
|
2015-02-09 00:00:43 +00:00
|
|
|
if (sustained_latency_ * iv >= getWorkerLimit(LATENCY_LIMIT)) {
|
2015-01-26 08:02:02 +00:00
|
|
|
LOG(WARNING) << "osqueryd worker system performance limits exceeded";
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2015-02-09 00:00:43 +00:00
|
|
|
if (footprint > getWorkerLimit(MEMORY_LIMIT) * 1024 * 1024) {
|
2015-01-26 08:02:02 +00:00
|
|
|
LOG(WARNING) << "osqueryd worker memory limits exceeded";
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
// The worker is sane, no action needed.
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2015-02-06 17:42:03 +00:00
|
|
|
void Watcher::createWorker() {
|
2015-02-09 00:00:43 +00:00
|
|
|
if (last_respawn_time_ > getUnixTime() - getWorkerLimit(RESPAWN_LIMIT)) {
|
2015-01-26 08:02:02 +00:00
|
|
|
LOG(WARNING) << "osqueryd worker respawning too quickly";
|
2015-02-09 00:00:43 +00:00
|
|
|
::sleep(getWorkerLimit(RESPAWN_DELAY));
|
2015-01-26 08:02:02 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
worker_ = fork();
|
|
|
|
if (worker_ < 0) {
|
|
|
|
// Unrecoverable error, cannot create a worker process.
|
|
|
|
LOG(ERROR) << "osqueryd could not create a worker process";
|
|
|
|
::exit(EXIT_FAILURE);
|
|
|
|
} else if (worker_ == 0) {
|
|
|
|
// This is the new worker process, no watching needed.
|
2015-02-06 17:42:03 +00:00
|
|
|
setenv("OSQUERYD_WORKER", std::to_string(getpid()).c_str(), 1);
|
|
|
|
fs::path exec_path(fs::initial_path<fs::path>());
|
|
|
|
exec_path = fs::system_complete(fs::path(argv_[0]));
|
|
|
|
execve(exec_path.string().c_str(), argv_, environ);
|
|
|
|
// Code will never reach this point.
|
|
|
|
::exit(EXIT_FAILURE);
|
2015-01-26 08:02:02 +00:00
|
|
|
}
|
|
|
|
|
2015-02-06 17:42:03 +00:00
|
|
|
VLOG(1) << "osqueryd watcher (" << getpid() << ") executing worker ("
|
|
|
|
<< worker_ << ")";
|
2015-01-26 08:02:02 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
void Watcher::resetCounters() {
|
2015-02-06 17:42:03 +00:00
|
|
|
// Reset the monitoring counters for the watcher.
|
2015-01-26 08:02:02 +00:00
|
|
|
sustained_latency_ = 0;
|
|
|
|
current_user_time_ = 0;
|
|
|
|
current_system_time_ = 0;
|
|
|
|
last_respawn_time_ = getUnixTime();
|
|
|
|
}
|
|
|
|
|
|
|
|
void Watcher::initWorker() {
|
|
|
|
// Set the worker's process name.
|
|
|
|
size_t name_size = strlen(argv_[0]);
|
|
|
|
for (int i = 0; i < argc_; i++) {
|
|
|
|
if (argv_[i] != nullptr) {
|
|
|
|
memset(argv_[i], 0, strlen(argv_[i]));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
strncpy(argv_[0], name_.c_str(), name_size);
|
2015-02-06 17:42:03 +00:00
|
|
|
|
|
|
|
// Start a watcher watcher thread to exit the process if the watcher exits.
|
|
|
|
Dispatcher::getInstance().addService(
|
|
|
|
std::make_shared<WatcherWatcherRunner>(getppid()));
|
|
|
|
}
|
|
|
|
|
|
|
|
bool isOsqueryWorker() {
|
|
|
|
return (getenv("OSQUERYD_WORKER") != nullptr);
|
|
|
|
}
|
|
|
|
|
|
|
|
void WatcherWatcherRunner::enter() {
|
|
|
|
while (true) {
|
|
|
|
if (getppid() != watcher_) {
|
|
|
|
// Watcher died, the worker must follow.
|
|
|
|
VLOG(1) << "osqueryd worker (" << getpid()
|
|
|
|
<< ") detected killed watcher (" << watcher_ << ")";
|
|
|
|
::exit(EXIT_SUCCESS);
|
|
|
|
}
|
2015-02-09 00:00:43 +00:00
|
|
|
interruptableSleep(getWorkerLimit(INTERVAL) * 1000);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
size_t getWorkerLimit(WatchdogLimitType name, int level) {
|
|
|
|
if (kWatchdogLimits.count(name) == 0) {
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
// If no level was provided then use the default (config/switch).
|
|
|
|
if (level == -1) {
|
|
|
|
level = FLAGS_watchdog_level;
|
|
|
|
}
|
|
|
|
if (level > 3) {
|
|
|
|
return kWatchdogLimits.at(name).back();
|
2015-02-06 17:42:03 +00:00
|
|
|
}
|
2015-02-09 00:00:43 +00:00
|
|
|
return kWatchdogLimits.at(name).at(level);
|
2015-01-26 08:02:02 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
void initWorkerWatcher(const std::string& name, int argc, char* argv[]) {
|
|
|
|
// The watcher will forever monitor and spawn additional workers.
|
|
|
|
Watcher watcher(argc, argv);
|
|
|
|
watcher.setWorkerName(name);
|
|
|
|
|
2015-02-06 17:42:03 +00:00
|
|
|
if (isOsqueryWorker()) {
|
|
|
|
// Do not start watching/spawning if this process is a worker.
|
|
|
|
watcher.initWorker();
|
|
|
|
} else {
|
|
|
|
do {
|
|
|
|
if (!watcher.watch()) {
|
|
|
|
// The watcher failed, create a worker.
|
|
|
|
watcher.createWorker();
|
|
|
|
watcher.resetCounters();
|
2015-01-26 08:02:02 +00:00
|
|
|
}
|
2015-02-06 17:42:03 +00:00
|
|
|
} while (watcher.ok());
|
|
|
|
|
|
|
|
// Executation should never reach this point.
|
|
|
|
::exit(EXIT_FAILURE);
|
|
|
|
}
|
2015-01-26 08:02:02 +00:00
|
|
|
}
|
|
|
|
}
|