2015-01-26 08:02:02 +00:00
|
|
|
/*
|
|
|
|
* Copyright (c) 2014, Facebook, Inc.
|
|
|
|
* All rights reserved.
|
|
|
|
*
|
|
|
|
* This source code is licensed under the BSD-style license found in the
|
|
|
|
* LICENSE file in the root directory of this source tree. An additional grant
|
|
|
|
* of patent rights can be found in the PATENTS file in the same directory.
|
|
|
|
*
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include <cstring>
|
|
|
|
#include <sstream>
|
|
|
|
|
|
|
|
#include <sys/wait.h>
|
|
|
|
#include <signal.h>
|
|
|
|
|
2015-02-09 00:00:43 +00:00
|
|
|
#include <boost/filesystem.hpp>
|
|
|
|
|
2015-02-19 23:19:00 +00:00
|
|
|
#include <osquery/events.h>
|
2015-02-06 17:42:03 +00:00
|
|
|
#include <osquery/filesystem.h>
|
2015-01-26 08:02:02 +00:00
|
|
|
#include <osquery/logger.h>
|
|
|
|
#include <osquery/sql.h>
|
|
|
|
|
|
|
|
#include "osquery/core/watcher.h"
|
|
|
|
|
2015-02-06 17:42:03 +00:00
|
|
|
extern char** environ;
|
|
|
|
|
|
|
|
namespace fs = boost::filesystem;
|
|
|
|
|
2015-01-26 08:02:02 +00:00
|
|
|
namespace osquery {
|
|
|
|
|
2015-02-09 00:00:43 +00:00
|
|
|
const std::map<WatchdogLimitType, std::vector<size_t> > kWatchdogLimits = {
|
2015-02-19 23:19:00 +00:00
|
|
|
// Maximum MB worker can privately allocate.
|
|
|
|
{MEMORY_LIMIT, {50, 20, 10, 10}},
|
|
|
|
// Percent of user or system CPU worker can utilize for LATENCY_LIMIT
|
|
|
|
// seconds.
|
|
|
|
{UTILIZATION_LIMIT, {90, 70, 60, 50}},
|
2015-02-09 00:00:43 +00:00
|
|
|
// Number of seconds the worker should run, else consider the exit fatal.
|
2015-02-19 23:19:00 +00:00
|
|
|
{RESPAWN_LIMIT, {20, 20, 20, 5}},
|
2015-02-09 00:00:43 +00:00
|
|
|
// If the worker respawns too quickly, backoff on creating additional.
|
2015-02-19 23:19:00 +00:00
|
|
|
{RESPAWN_DELAY, {5, 5, 5, 1}},
|
|
|
|
// Seconds of tolerable UTILIZATION_LIMIT sustained latency.
|
|
|
|
{LATENCY_LIMIT, {5, 5, 3, 1}},
|
2015-02-09 00:00:43 +00:00
|
|
|
// How often to poll for performance limit violations.
|
2015-02-19 23:19:00 +00:00
|
|
|
{INTERVAL, {3, 3, 3, 1}}, };
|
2015-02-09 00:00:43 +00:00
|
|
|
|
2015-02-17 08:36:20 +00:00
|
|
|
FLAG(int32,
|
|
|
|
watchdog_level,
|
|
|
|
1,
|
2015-02-19 23:19:00 +00:00
|
|
|
"Performance limit level (0=loose, 1=normal, 2=restrictive, 3=debug)");
|
2015-02-17 08:36:20 +00:00
|
|
|
|
2015-03-03 23:03:14 +00:00
|
|
|
CLI_FLAG(bool, disable_watchdog, false, "Disable userland watchdog process");
|
2015-02-06 17:42:03 +00:00
|
|
|
|
2015-01-26 08:02:02 +00:00
|
|
|
bool Watcher::ok() {
|
2015-02-09 00:00:43 +00:00
|
|
|
::sleep(getWorkerLimit(INTERVAL));
|
2015-01-26 08:02:02 +00:00
|
|
|
return (worker_ >= 0);
|
|
|
|
}
|
|
|
|
|
|
|
|
bool Watcher::watch() {
|
|
|
|
int status;
|
|
|
|
pid_t result = waitpid(worker_, &status, WNOHANG);
|
|
|
|
if (worker_ == 0 || result == worker_) {
|
|
|
|
// Worker does not exist or never existed.
|
|
|
|
return false;
|
|
|
|
} else if (result == 0) {
|
|
|
|
// If the inspect finds problems it will stop/restart the worker.
|
|
|
|
if (!isWorkerSane()) {
|
|
|
|
stopWorker();
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Forcefully terminates the tracked worker and reaps it.
//
// The tracked pid is always cleared to 0. Nothing is signalled when no real
// worker pid is held.
void Watcher::stopWorker() {
  const pid_t target = worker_;
  worker_ = 0;

  // kill() gives special meaning to non-positive pids: 0 addresses every
  // process in the caller's process group (including this watcher) and
  // negative values address process groups. Only signal a real worker pid.
  if (target <= 0) {
    return;
  }

  kill(target, SIGKILL);
  // Clean up the defunct (zombie) process. Wait on the worker specifically so
  // an unrelated child is never reaped in its place.
  waitpid(target, nullptr, 0);
}
|
|
|
|
|
|
|
|
bool Watcher::isWorkerSane() {
|
|
|
|
auto rows =
|
|
|
|
SQL::selectAllFrom("processes", "pid", tables::EQUALS, INTEGER(worker_));
|
|
|
|
if (rows.size() == 0) {
|
|
|
|
// Could not find worker process?
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Compare CPU utilization since last check.
|
2015-02-06 17:42:03 +00:00
|
|
|
BIGINT_LITERAL footprint, user_time, system_time;
|
2015-02-09 00:00:43 +00:00
|
|
|
// IV is the check interval in seconds, and utilization is set per-second.
|
|
|
|
auto iv = getWorkerLimit(INTERVAL);
|
2015-02-06 17:42:03 +00:00
|
|
|
|
|
|
|
try {
|
2015-02-09 00:00:43 +00:00
|
|
|
user_time = AS_LITERAL(BIGINT_LITERAL, rows[0].at("user_time")) / iv;
|
|
|
|
system_time = AS_LITERAL(BIGINT_LITERAL, rows[0].at("system_time")) / iv;
|
2015-02-06 17:42:03 +00:00
|
|
|
footprint = AS_LITERAL(BIGINT_LITERAL, rows[0].at("phys_footprint"));
|
|
|
|
} catch (const std::exception& e) {
|
|
|
|
sustained_latency_ = 0;
|
|
|
|
}
|
|
|
|
|
2015-02-09 00:00:43 +00:00
|
|
|
if (current_user_time_ + getWorkerLimit(UTILIZATION_LIMIT) < user_time ||
|
|
|
|
current_system_time_ + getWorkerLimit(UTILIZATION_LIMIT) < system_time) {
|
2015-01-26 08:02:02 +00:00
|
|
|
sustained_latency_++;
|
|
|
|
} else {
|
|
|
|
sustained_latency_ = 0;
|
|
|
|
}
|
2015-02-06 17:42:03 +00:00
|
|
|
|
2015-01-26 08:02:02 +00:00
|
|
|
current_user_time_ = user_time;
|
|
|
|
current_system_time_ = system_time;
|
|
|
|
|
2015-02-09 00:00:43 +00:00
|
|
|
if (sustained_latency_ * iv >= getWorkerLimit(LATENCY_LIMIT)) {
|
2015-01-26 08:02:02 +00:00
|
|
|
LOG(WARNING) << "osqueryd worker system performance limits exceeded";
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2015-02-09 00:00:43 +00:00
|
|
|
if (footprint > getWorkerLimit(MEMORY_LIMIT) * 1024 * 1024) {
|
2015-01-26 08:02:02 +00:00
|
|
|
LOG(WARNING) << "osqueryd worker memory limits exceeded";
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
// The worker is sane, no action needed.
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2015-02-06 17:42:03 +00:00
|
|
|
void Watcher::createWorker() {
|
2015-02-09 00:00:43 +00:00
|
|
|
if (last_respawn_time_ > getUnixTime() - getWorkerLimit(RESPAWN_LIMIT)) {
|
2015-01-26 08:02:02 +00:00
|
|
|
LOG(WARNING) << "osqueryd worker respawning too quickly";
|
2015-02-09 00:00:43 +00:00
|
|
|
::sleep(getWorkerLimit(RESPAWN_DELAY));
|
2015-01-26 08:02:02 +00:00
|
|
|
}
|
|
|
|
|
2015-02-20 00:00:12 +00:00
|
|
|
// Get the path of the current process.
|
|
|
|
auto qd = SQL::selectAllFrom("processes", "pid", tables::EQUALS,
|
|
|
|
INTEGER(getpid()));
|
|
|
|
if (qd.size() != 1 || qd[0].count("path") == 0 || qd[0]["path"].size() == 0) {
|
|
|
|
LOG(ERROR) << "osquery watcher cannot determine process path";
|
|
|
|
::exit(EXIT_FAILURE);
|
|
|
|
}
|
|
|
|
|
2015-01-26 08:02:02 +00:00
|
|
|
worker_ = fork();
|
|
|
|
if (worker_ < 0) {
|
|
|
|
// Unrecoverable error, cannot create a worker process.
|
|
|
|
LOG(ERROR) << "osqueryd could not create a worker process";
|
|
|
|
::exit(EXIT_FAILURE);
|
|
|
|
} else if (worker_ == 0) {
|
|
|
|
// This is the new worker process, no watching needed.
|
2015-02-06 17:42:03 +00:00
|
|
|
setenv("OSQUERYD_WORKER", std::to_string(getpid()).c_str(), 1);
|
2015-02-20 00:00:12 +00:00
|
|
|
// Get the complete path of the osquery process binary.
|
|
|
|
auto exec_path = fs::system_complete(fs::path(qd[0]["path"]));
|
2015-02-06 17:42:03 +00:00
|
|
|
execve(exec_path.string().c_str(), argv_, environ);
|
|
|
|
// Code will never reach this point.
|
|
|
|
::exit(EXIT_FAILURE);
|
2015-01-26 08:02:02 +00:00
|
|
|
}
|
|
|
|
|
2015-02-06 17:42:03 +00:00
|
|
|
VLOG(1) << "osqueryd watcher (" << getpid() << ") executing worker ("
|
|
|
|
<< worker_ << ")";
|
2015-01-26 08:02:02 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
void Watcher::resetCounters() {
|
2015-02-06 17:42:03 +00:00
|
|
|
// Reset the monitoring counters for the watcher.
|
2015-01-26 08:02:02 +00:00
|
|
|
sustained_latency_ = 0;
|
|
|
|
current_user_time_ = 0;
|
|
|
|
current_system_time_ = 0;
|
|
|
|
last_respawn_time_ = getUnixTime();
|
|
|
|
}
|
|
|
|
|
|
|
|
void Watcher::initWorker() {
|
|
|
|
// Set the worker's process name.
|
|
|
|
size_t name_size = strlen(argv_[0]);
|
|
|
|
for (int i = 0; i < argc_; i++) {
|
|
|
|
if (argv_[i] != nullptr) {
|
|
|
|
memset(argv_[i], 0, strlen(argv_[i]));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
strncpy(argv_[0], name_.c_str(), name_size);
|
2015-02-06 17:42:03 +00:00
|
|
|
|
|
|
|
// Start a watcher watcher thread to exit the process if the watcher exits.
|
|
|
|
Dispatcher::getInstance().addService(
|
|
|
|
std::make_shared<WatcherWatcherRunner>(getppid()));
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
void WatcherWatcherRunner::enter() {
|
|
|
|
while (true) {
|
|
|
|
if (getppid() != watcher_) {
|
|
|
|
// Watcher died, the worker must follow.
|
|
|
|
VLOG(1) << "osqueryd worker (" << getpid()
|
|
|
|
<< ") detected killed watcher (" << watcher_ << ")";
|
|
|
|
::exit(EXIT_SUCCESS);
|
|
|
|
}
|
2015-02-09 00:00:43 +00:00
|
|
|
interruptableSleep(getWorkerLimit(INTERVAL) * 1000);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Looks up a watchdog limit for the given level.
//
// name:  which limit to fetch from kWatchdogLimits.
// level: index into that limit's value vector; -1 means "use the
//        watchdog_level flag".
//
// Returns 0 when the limit type is unknown or has no values. Levels outside
// the table are clamped: values below zero use the first entry and values past
// the end use the last entry, so an out-of-range flag can never throw.
size_t getWorkerLimit(WatchdogLimitType name, int level) {
  const auto entry = kWatchdogLimits.find(name);
  if (entry == kWatchdogLimits.end() || entry->second.empty()) {
    return 0;
  }
  const std::vector<size_t>& limits = entry->second;

  // If no level was provided then use the default (config/switch).
  if (level == -1) {
    level = FLAGS_watchdog_level;
  }

  // The flag itself may hold a negative value; converting that to an unsigned
  // index would be far out of range, so clamp it explicitly.
  if (level < 0) {
    return limits.front();
  }

  const size_t index = static_cast<size_t>(level);
  if (index >= limits.size()) {
    return limits.back();
  }
  return limits[index];
}
|
|
|
|
}
|