worker: Do not ignore SIGCHLD to exit faster (#3487)

Author: Teddy Reed (committed by GitHub), 2017-07-24 08:48:06 -07:00
Parent: 30a9f23cb0
Commit: 295acfcf3d
3 changed files with 39 additions and 13 deletions
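Why this helps: POSIX specifies that when SIGCHLD is set to SIG_IGN, terminated children are reaped automatically by the kernel, so waitpid() can never deliver their exit status and ultimately fails with ECHILD. Ignoring SIGCHLD at startup therefore hid worker and extension exit codes from the watchdog; this change removes the startup SIG_IGN and only installs it from the shutdown signal handler, where auto-reaping is exactly what lets the process exit faster. A minimal standalone sketch of the effect on Linux and most POSIX systems (illustration only, not osquery code):

// Demonstrates how a process-wide SIG_IGN on SIGCHLD hides child exit codes.
#include <cerrno>
#include <csignal>
#include <cstdio>
#include <sys/wait.h>
#include <unistd.h>

int main() {
  // What the old code did at startup: ask the OS to auto-reap children.
  std::signal(SIGCHLD, SIG_IGN);

  pid_t child = fork();
  if (child < 0) {
    return 1;
  }
  if (child == 0) {
    _exit(7); // The child exits with a status the parent would like to read.
  }

  sleep(1); // Give the child time to exit and be auto-reaped.

  int status = 0;
  pid_t result = ::waitpid(child, &status, WNOHANG);
  if (result < 0 && errno == ECHILD) {
    // The exit code (7) is gone; the kernel already reaped the child.
    printf("waitpid failed with ECHILD; the exit status was lost\n");
  } else if (result == child && WIFEXITED(status)) {
    printf("child exited with %d\n", WEXITSTATUS(status));
  }
  return 0;
}

This is also why the checkStatus() hunk below starts treating ECHILD as PROCESS_EXITED rather than PROCESS_ERROR.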

@@ -147,6 +147,9 @@ void signalHandler(int num) {
     // Time to stop, set an upper bound time constraint on how long threads
     // have to terminate (join). Publishers may be in 20ms or similar sleeps.
     alarm(osquery::FLAGS_alarm_timeout);
+
+    // Allow the OS to auto-reap our child processes.
+    std::signal(SIGCHLD, SIG_IGN);
 #endif

     // Restore the default signal handler.
@@ -369,7 +372,6 @@ Initializer::Initializer(int& argc, char**& argv, ToolType tool)
   std::signal(SIGINT, signalHandler);
   std::signal(SIGHUP, signalHandler);
   std::signal(SIGALRM, signalHandler);
-  std::signal(SIGCHLD, SIG_IGN);
 #endif

   std::signal(SIGABRT, signalHandler);
@@ -679,8 +681,11 @@ void Initializer::waitForShutdown() {
 }

 void Initializer::requestShutdown(int retcode) {
+  if (kExitCode == 0) {
+    kExitCode = retcode;
+  }
+
   // Stop thrift services/clients/and their thread pools.
-  kExitCode = retcode;
   if (std::this_thread::get_id() != kMainThreadId) {
     raise(SIGUSR1);
   } else {
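The requestShutdown() hunk above latches kExitCode only on the first call. A minimal sketch of that first-writer-wins pattern (the kExitCode and kMainThreadId names come from the hunk; the rest, including the concrete codes, is assumed), presumably so a later, routine shutdown request cannot overwrite an earlier catastrophic code:

#include <csignal>
#include <cstdio>
#include <thread>

namespace {
volatile std::sig_atomic_t kExitCode{0};
const std::thread::id kMainThreadId = std::this_thread::get_id();
}

void requestShutdown(int retcode) {
  if (kExitCode == 0) {
    kExitCode = retcode; // Only the first non-default request is recorded.
  }
  if (std::this_thread::get_id() != kMainThreadId) {
    // Interrupt the main thread's wait; a SIGUSR1 handler is assumed installed.
    raise(SIGUSR1);
  }
}

int main() {
  requestShutdown(70); // e.g. a catastrophic code reported by the watcher
  requestShutdown(0);  // a later, routine shutdown request
  printf("exit code latched as %d\n", static_cast<int>(kExitCode)); // prints 70
  return kExitCode;
}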


@@ -65,13 +65,16 @@ bool PlatformProcess::killGracefully() const {
 ProcessState PlatformProcess::checkStatus(int& status) const {
   int process_status = 0;
   if (!isValid()) {
     return PROCESS_ERROR;
   }

   pid_t result = ::waitpid(nativeHandle(), &process_status, WNOHANG);
   if (result < 0) {
-    process_status = -1;
+    if (errno == ECHILD) {
+      return PROCESS_EXITED;
+    }
+    process_status = -1;
     return PROCESS_ERROR;
   }
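For context on the hunk above (the rest of checkStatus() is truncated here): a WNOHANG poll typically maps waitpid() results onto three coarse outcomes. The ECHILD branch is what this commit adds, so an already-reaped child, for example one auto-reaped while SIGCHLD is ignored during shutdown, reads as exited rather than as an error. Sketch only; PROCESS_STILL_ALIVE and the decoding below are assumptions, only PROCESS_EXITED and PROCESS_ERROR appear in the hunk:

#include <cerrno>
#include <sys/wait.h>

enum ProcessState { PROCESS_STILL_ALIVE, PROCESS_EXITED, PROCESS_ERROR };

ProcessState pollChild(pid_t pid, int& exit_code) {
  int process_status = 0;
  pid_t result = ::waitpid(pid, &process_status, WNOHANG);
  if (result < 0) {
    if (errno == ECHILD) {
      return PROCESS_EXITED; // Already reaped: no status left to collect.
    }
    return PROCESS_ERROR;
  }
  if (result == 0) {
    return PROCESS_STILL_ALIVE; // No state change yet.
  }
  if (WIFEXITED(process_status)) {
    exit_code = WEXITSTATUS(process_status); // Normal exit: recover the code.
  } else if (WIFSIGNALED(process_status)) {
    exit_code = WTERMSIG(process_status); // Terminated by a signal.
  }
  return PROCESS_EXITED;
}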


@@ -53,7 +53,7 @@ const WatchdogLimitMap kWatchdogLimits = {
     // User or system CPU worker can utilize for LATENCY_LIMIT seconds.
     {WatchdogLimitType::UTILIZATION_LIMIT, {90, 80, 1000}},
     // Number of seconds the worker should run, else consider the exit fatal.
-    {WatchdogLimitType::RESPAWN_LIMIT, {10, 4, 1000}},
+    {WatchdogLimitType::RESPAWN_LIMIT, {4, 4, 1000}},
     // If the worker respawns too quickly, backoff on creating additional.
     {WatchdogLimitType::RESPAWN_DELAY, {5, 5, 1}},
     // Seconds of tolerable UTILIZATION_LIMIT sustained latency.
@@ -180,25 +180,41 @@ bool WatcherRunner::ok() const {
 }

 void WatcherRunner::start() {
-  // Set worker performance counters to an initial state.
-  Watcher::get().resetWorkerCounters(0);
   // Hold the current process (watcher) for inspection too.
-  auto watcher = PlatformProcess::getCurrentProcess();
+  auto& watcher = Watcher::get();
+  auto self = PlatformProcess::getCurrentProcess();
+
+  // Set worker performance counters to an initial state.
+  watcher.resetWorkerCounters(0);
   PerformanceState watcher_state;

   // Enter the watch loop.
   do {
-    if (use_worker_ && !watch(Watcher::get().getWorker())) {
-      if (Watcher::get().fatesBound()) {
+    if (use_worker_ && !watch(watcher.getWorker())) {
+      if (watcher.fatesBound()) {
         // A signal has interrupted the watcher.
         break;
       }
+
+      auto status = watcher.getWorkerStatus();
+      if (status == EXIT_CATASTROPHIC) {
+        Initializer::requestShutdown(EXIT_CATASTROPHIC);
+        break;
+      }
+
+      if (watcher.workerRestartCount() ==
+          getWorkerLimit(WatchdogLimitType::RESPAWN_LIMIT)) {
+        // Too many worker restarts.
+        Initializer::requestShutdown(EXIT_FAILURE, "Too many worker restarts");
+        break;
+      }
+
       // The watcher failed, create a worker.
       createWorker();
     }

     // Loop over every managed extension and check sanity.
-    for (const auto& extension : Watcher::get().extensions()) {
+    for (const auto& extension : watcher.extensions()) {
       auto s = isChildSane(*extension.second);
       if (!s.ok()) {
         // The extension manager also watches for extension-related failures.
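Read together, the new loop body distinguishes three outcomes once watch() reports the worker is gone: the watcher itself was signaled (stop quietly), the worker exited with EXIT_CATASTROPHIC (shut everything down with that code), or the respawn count has hit RESPAWN_LIMIT (shut down with EXIT_FAILURE); only then is a replacement worker created. A name-for-name hypothetical sketch of that ordering, not osquery code:

#include <cstddef>

enum class Decision { StopQuietly, StopCatastrophic, StopTooManyRestarts, Respawn };

// All parameters are stand-ins for state the real loop reads from Watcher.
Decision onWorkerGone(bool watcher_signaled,
                      int worker_status,
                      int catastrophic_code,
                      std::size_t restart_count,
                      std::size_t respawn_limit) {
  if (watcher_signaled) {
    return Decision::StopQuietly; // fatesBound(): a signal interrupted the watcher.
  }
  if (worker_status == catastrophic_code) {
    return Decision::StopCatastrophic; // Treat this worker exit as unrecoverable.
  }
  if (restart_count >= respawn_limit) {
    return Decision::StopTooManyRestarts; // Flapping; give up instead of looping.
  }
  return Decision::Respawn; // Otherwise createWorker() and keep watching.
}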
@@ -212,13 +228,13 @@ void WatcherRunner::start() {
     // If any extension creations failed, stop managing them.
     for (auto& extension : extension_restarts_) {
       if (extension.second > 3) {
-        Watcher::get().removeExtensionPath(extension.first);
+        watcher.removeExtensionPath(extension.first);
         extension.second = 0;
       }
     }

     if (use_worker_) {
-      auto status = isWatcherHealthy(*watcher, watcher_state);
+      auto status = isWatcherHealthy(*self, watcher_state);
       if (!status.ok()) {
         Initializer::requestShutdown(
             EXIT_CATASTROPHIC,
@@ -430,9 +446,10 @@ void WatcherRunner::createWorker() {
   WatcherExtensionsLocker locker;
   if (watcher.getState(watcher.getWorker()).last_respawn_time >
       getUnixTime() - getWorkerLimit(WatchdogLimitType::RESPAWN_LIMIT)) {
-    watcher.workerRestarted();
     LOG(WARNING) << "osqueryd worker respawning too quickly: "
                  << watcher.workerRestartCount() << " times";
+    watcher.workerRestarted();
+
     // The configured automatic delay.
     size_t delay = getWorkerLimit(WatchdogLimitType::RESPAWN_DELAY) * 1000;
     // Exponential back off for quickly-respawning clients.
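The hunk cuts off right before the delay math, so purely as a reference for "exponential back off for quickly-respawning clients": the usual shape scales a base delay by a power of two per recent restart and caps it so a flapping worker cannot stall the watcher indefinitely. The doubling and the cap below are assumptions, not what createWorker() actually computes:

#include <algorithm>
#include <cstddef>

std::size_t backoffDelayMs(std::size_t base_delay_ms, std::size_t restart_count) {
  const std::size_t kMaxDelayMs = 60 * 1000; // Assumed upper bound: one minute.
  // Shift instead of pow() to stay in integer math; clamp the exponent so the
  // shift itself cannot overflow.
  std::size_t exponent = std::min<std::size_t>(restart_count, 16);
  return std::min(kMaxDelayMs, base_delay_ms << exponent);
}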
@@ -483,6 +500,7 @@ void WatcherRunner::createWorker() {
   watcher.resetWorkerCounters(getUnixTime());
   VLOG(1) << "osqueryd watcher (" << PlatformProcess::getCurrentProcess()->pid()
           << ") executing worker (" << worker->pid() << ")";
+  watcher.worker_status_ = -1;
 }

 void WatcherRunner::createExtension(const std::string& extension) {
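Finally, the worker_status_ = -1 reset pairs with the new getWorkerStatus() check in the watch loop: the watcher caches the most recent worker exit status and clears it back to a -1 sentinel each time it executes a fresh worker, presumably so a stale EXIT_CATASTROPHIC cannot shut the service down after a successful respawn. A sketch of that latch; only worker_status_, getWorkerStatus(), and the -1 value are taken from the diff, the rest is assumed:

class Watcher {
 public:
  int getWorkerStatus() const {
    return worker_status_; // -1 means "no exit observed for the current worker".
  }
  void recordWorkerExit(int status) {
    worker_status_ = status; // Set when the watcher observes the worker's death.
  }
  void onWorkerExecuted() {
    worker_status_ = -1; // New worker: forget the previous worker's status.
  }

 private:
  int worker_status_{-1};
};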