worker: Do not ignore SIGCHLD to exit faster (#3487)

Author: Teddy Reed (committed by GitHub), 2017-07-24 08:48:06 -07:00
Parent: 30a9f23cb0
Commit: 295acfcf3d
3 changed files with 39 additions and 13 deletions
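Why this helps: POSIX specifies that when SIGCHLD is set to SIG_IGN, terminated children are reaped automatically by the kernel, so waitpid() can never deliver their exit status and ultimately fails with ECHILD. Ignoring SIGCHLD at startup therefore hid worker and extension exit codes from the watchdog; this change removes the startup SIG_IGN and only installs it from the shutdown signal handler, where auto-reaping is exactly what lets the process exit faster. A minimal standalone sketch of the effect on Linux and most POSIX systems (illustration only, not osquery code):

// Demonstrates how a process-wide SIG_IGN on SIGCHLD hides child exit codes.
#include <cerrno>
#include <csignal>
#include <cstdio>
#include <sys/wait.h>
#include <unistd.h>

int main() {
  // What the old code did at startup: ask the OS to auto-reap children.
  std::signal(SIGCHLD, SIG_IGN);

  pid_t child = fork();
  if (child < 0) {
    return 1;
  }
  if (child == 0) {
    _exit(7); // The child exits with a status the parent would like to read.
  }

  sleep(1); // Give the child time to exit and be auto-reaped.

  int status = 0;
  pid_t result = ::waitpid(child, &status, WNOHANG);
  if (result < 0 && errno == ECHILD) {
    // The exit code (7) is gone; the kernel already reaped the child.
    printf("waitpid failed with ECHILD; the exit status was lost\n");
  } else if (result == child && WIFEXITED(status)) {
    printf("child exited with %d\n", WEXITSTATUS(status));
  }
  return 0;
}

This is also why the checkStatus() hunk below starts treating ECHILD as PROCESS_EXITED rather than PROCESS_ERROR.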

@@ -147,6 +147,9 @@ void signalHandler(int num) {
     // Time to stop, set an upper bound time constraint on how long threads
     // have to terminate (join). Publishers may be in 20ms or similar sleeps.
     alarm(osquery::FLAGS_alarm_timeout);
+
+    // Allow the OS to auto-reap our child processes.
+    std::signal(SIGCHLD, SIG_IGN);
 #endif

     // Restore the default signal handler.
@@ -369,7 +372,6 @@ Initializer::Initializer(int& argc, char**& argv, ToolType tool)
   std::signal(SIGINT, signalHandler);
   std::signal(SIGHUP, signalHandler);
   std::signal(SIGALRM, signalHandler);
-  std::signal(SIGCHLD, SIG_IGN);
 #endif

   std::signal(SIGABRT, signalHandler);
@@ -679,8 +681,11 @@ void Initializer::waitForShutdown() {
 }

 void Initializer::requestShutdown(int retcode) {
+  if (kExitCode == 0) {
+    kExitCode = retcode;
+  }
+
   // Stop thrift services/clients/and their thread pools.
-  kExitCode = retcode;
   if (std::this_thread::get_id() != kMainThreadId) {
     raise(SIGUSR1);
   } else {
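The requestShutdown() hunk above latches kExitCode only on the first call. A minimal sketch of that first-writer-wins pattern (the kExitCode and kMainThreadId names come from the hunk; the rest, including the concrete codes, is assumed), presumably so a later, routine shutdown request cannot overwrite an earlier catastrophic code:

#include <csignal>
#include <cstdio>
#include <thread>

namespace {
volatile std::sig_atomic_t kExitCode{0};
const std::thread::id kMainThreadId = std::this_thread::get_id();
}

void requestShutdown(int retcode) {
  if (kExitCode == 0) {
    kExitCode = retcode; // Only the first non-default request is recorded.
  }
  if (std::this_thread::get_id() != kMainThreadId) {
    // Interrupt the main thread's wait; a SIGUSR1 handler is assumed installed.
    raise(SIGUSR1);
  }
}

int main() {
  requestShutdown(70); // e.g. a catastrophic code reported by the watcher
  requestShutdown(0);  // a later, routine shutdown request
  printf("exit code latched as %d\n", static_cast<int>(kExitCode)); // prints 70
  return kExitCode;
}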


@@ -65,13 +65,16 @@ bool PlatformProcess::killGracefully() const {
 ProcessState PlatformProcess::checkStatus(int& status) const {
   int process_status = 0;
   if (!isValid()) {
     return PROCESS_ERROR;
   }

   pid_t result = ::waitpid(nativeHandle(), &process_status, WNOHANG);
   if (result < 0) {
-    process_status = -1;
+    if (errno == ECHILD) {
+      return PROCESS_EXITED;
+    }
+    process_status = -1;
     return PROCESS_ERROR;
   }
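For context on the hunk above (the rest of checkStatus() is truncated here): a WNOHANG poll typically maps waitpid() results onto three coarse outcomes. The ECHILD branch is what this commit adds, so an already-reaped child, for example one auto-reaped while SIGCHLD is ignored during shutdown, reads as exited rather than as an error. Sketch only; PROCESS_STILL_ALIVE and the decoding below are assumptions, only PROCESS_EXITED and PROCESS_ERROR appear in the hunk:

#include <cerrno>
#include <sys/wait.h>

enum ProcessState { PROCESS_STILL_ALIVE, PROCESS_EXITED, PROCESS_ERROR };

ProcessState pollChild(pid_t pid, int& exit_code) {
  int process_status = 0;
  pid_t result = ::waitpid(pid, &process_status, WNOHANG);
  if (result < 0) {
    if (errno == ECHILD) {
      return PROCESS_EXITED; // Already reaped: no status left to collect.
    }
    return PROCESS_ERROR;
  }
  if (result == 0) {
    return PROCESS_STILL_ALIVE; // No state change yet.
  }
  if (WIFEXITED(process_status)) {
    exit_code = WEXITSTATUS(process_status); // Normal exit: recover the code.
  } else if (WIFSIGNALED(process_status)) {
    exit_code = WTERMSIG(process_status); // Terminated by a signal.
  }
  return PROCESS_EXITED;
}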


@@ -53,7 +53,7 @@ const WatchdogLimitMap kWatchdogLimits = {
     // User or system CPU worker can utilize for LATENCY_LIMIT seconds.
     {WatchdogLimitType::UTILIZATION_LIMIT, {90, 80, 1000}},
     // Number of seconds the worker should run, else consider the exit fatal.
-    {WatchdogLimitType::RESPAWN_LIMIT, {10, 4, 1000}},
+    {WatchdogLimitType::RESPAWN_LIMIT, {4, 4, 1000}},
     // If the worker respawns too quickly, backoff on creating additional.
     {WatchdogLimitType::RESPAWN_DELAY, {5, 5, 1}},
     // Seconds of tolerable UTILIZATION_LIMIT sustained latency.
@@ -180,25 +180,41 @@ bool WatcherRunner::ok() const {
 }

 void WatcherRunner::start() {
-  // Set worker performance counters to an initial state.
-  Watcher::get().resetWorkerCounters(0);
   // Hold the current process (watcher) for inspection too.
-  auto watcher = PlatformProcess::getCurrentProcess();
+  auto& watcher = Watcher::get();
+  auto self = PlatformProcess::getCurrentProcess();
+
+  // Set worker performance counters to an initial state.
+  watcher.resetWorkerCounters(0);
   PerformanceState watcher_state;

   // Enter the watch loop.
   do {
-    if (use_worker_ && !watch(Watcher::get().getWorker())) {
-      if (Watcher::get().fatesBound()) {
+    if (use_worker_ && !watch(watcher.getWorker())) {
+      if (watcher.fatesBound()) {
         // A signal has interrupted the watcher.
         break;
       }
+
+      auto status = watcher.getWorkerStatus();
+      if (status == EXIT_CATASTROPHIC) {
+        Initializer::requestShutdown(EXIT_CATASTROPHIC);
+        break;
+      }
+
+      if (watcher.workerRestartCount() ==
+          getWorkerLimit(WatchdogLimitType::RESPAWN_LIMIT)) {
+        // Too many worker restarts.
+        Initializer::requestShutdown(EXIT_FAILURE, "Too many worker restarts");
+        break;
+      }
+
       // The watcher failed, create a worker.
       createWorker();
     }

     // Loop over every managed extension and check sanity.
-    for (const auto& extension : Watcher::get().extensions()) {
+    for (const auto& extension : watcher.extensions()) {
       auto s = isChildSane(*extension.second);
       if (!s.ok()) {
         // The extension manager also watches for extension-related failures.
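Read together, the new loop body distinguishes three outcomes once watch() reports the worker is gone: the watcher itself was signaled (stop quietly), the worker exited with EXIT_CATASTROPHIC (shut everything down with that code), or the respawn count has hit RESPAWN_LIMIT (shut down with EXIT_FAILURE); only then is a replacement worker created. A name-for-name hypothetical sketch of that ordering, not osquery code:

#include <cstddef>

enum class Decision { StopQuietly, StopCatastrophic, StopTooManyRestarts, Respawn };

// All parameters are stand-ins for state the real loop reads from Watcher.
Decision onWorkerGone(bool watcher_signaled,
                      int worker_status,
                      int catastrophic_code,
                      std::size_t restart_count,
                      std::size_t respawn_limit) {
  if (watcher_signaled) {
    return Decision::StopQuietly; // fatesBound(): a signal interrupted the watcher.
  }
  if (worker_status == catastrophic_code) {
    return Decision::StopCatastrophic; // Treat this worker exit as unrecoverable.
  }
  if (restart_count >= respawn_limit) {
    return Decision::StopTooManyRestarts; // Flapping; give up instead of looping.
  }
  return Decision::Respawn; // Otherwise createWorker() and keep watching.
}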
@@ -212,13 +228,13 @@ void WatcherRunner::start() {
     // If any extension creations failed, stop managing them.
     for (auto& extension : extension_restarts_) {
       if (extension.second > 3) {
-        Watcher::get().removeExtensionPath(extension.first);
+        watcher.removeExtensionPath(extension.first);
         extension.second = 0;
       }
     }

     if (use_worker_) {
-      auto status = isWatcherHealthy(*watcher, watcher_state);
+      auto status = isWatcherHealthy(*self, watcher_state);
       if (!status.ok()) {
         Initializer::requestShutdown(
             EXIT_CATASTROPHIC,
@@ -430,9 +446,10 @@ void WatcherRunner::createWorker() {
   WatcherExtensionsLocker locker;
   if (watcher.getState(watcher.getWorker()).last_respawn_time >
       getUnixTime() - getWorkerLimit(WatchdogLimitType::RESPAWN_LIMIT)) {
-    watcher.workerRestarted();
     LOG(WARNING) << "osqueryd worker respawning too quickly: "
                  << watcher.workerRestartCount() << " times";
+    watcher.workerRestarted();
+
     // The configured automatic delay.
     size_t delay = getWorkerLimit(WatchdogLimitType::RESPAWN_DELAY) * 1000;
     // Exponential back off for quickly-respawning clients.
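The hunk cuts off right before the delay math, so purely as a reference for "exponential back off for quickly-respawning clients": the usual shape scales a base delay by a power of two per recent restart and caps it so a flapping worker cannot stall the watcher indefinitely. The doubling and the cap below are assumptions, not what createWorker() actually computes:

#include <algorithm>
#include <cstddef>

std::size_t backoffDelayMs(std::size_t base_delay_ms, std::size_t restart_count) {
  const std::size_t kMaxDelayMs = 60 * 1000; // Assumed upper bound: one minute.
  // Shift instead of pow() to stay in integer math; clamp the exponent so the
  // shift itself cannot overflow.
  std::size_t exponent = std::min<std::size_t>(restart_count, 16);
  return std::min(kMaxDelayMs, base_delay_ms << exponent);
}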
@@ -483,6 +500,7 @@ void WatcherRunner::createWorker() {
   watcher.resetWorkerCounters(getUnixTime());
   VLOG(1) << "osqueryd watcher (" << PlatformProcess::getCurrentProcess()->pid()
           << ") executing worker (" << worker->pid() << ")";
+  watcher.worker_status_ = -1;
 }

 void WatcherRunner::createExtension(const std::string& extension) {
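Finally, the worker_status_ = -1 reset pairs with the new getWorkerStatus() check in the watch loop: the watcher caches the most recent worker exit status and clears it back to a -1 sentinel each time it executes a fresh worker, presumably so a stale EXIT_CATASTROPHIC cannot shut the service down after a successful respawn. A sketch of that latch; only worker_status_, getWorkerStatus(), and the -1 value are taken from the diff, the rest is assumed:

class Watcher {
 public:
  int getWorkerStatus() const {
    return worker_status_; // -1 means "no exit observed for the current worker".
  }
  void recordWorkerExit(int status) {
    worker_status_ = status; // Set when the watcher observes the worker's death.
  }
  void onWorkerExecuted() {
    worker_status_ = -1; // New worker: forget the previous worker's status.
  }

 private:
  int worker_status_{-1};
};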