Multiprocess RQ workers (using supervisor) (#4371)

* launch and monitor multiple workers using supervisor * run supervisord in non-daemon mode * redirect all output to stdout/stderr * no need to log supervisord's output because it is redirected to stdout anyway * updated and less brittle healthcheck * add supervisor healthchecks * remove redundant supervisor installation as it is installed by pip * add a 5 minute check gate
2024-11-06 09:05:17 +00:00 · 2020-01-01 15:32:29 +02:00 · 2020-01-01 15:32:29 +02:00 · 260bfca767
commit 260bfca767
parent f85490cf50
5 changed files with 89 additions and 12 deletions
--- a/bin/docker-entrypoint
+++ b/bin/docker-entrypoint
@ -25,7 +25,10 @@ dev_scheduler() {
 worker() {
  echo "Starting RQ worker..."

-  exec /app/manage.py rq worker $QUEUES
+  export WORKERS_COUNT=${WORKERS_COUNT:-2}
+  export QUEUES=${QUEUES:-}
+  
+  supervisord -c worker.conf
 }

 dev_worker() {
--- a/docker-compose.yml
+++ b/docker-compose.yml
@ -41,7 +41,6 @@ services:
        target: /app
    depends_on:
      - server
-    tty: true
    environment:
      PYTHONUNBUFFERED: 0
      REDASH_LOG_LEVEL: "INFO"
--- a/redash/cli/rq.py
+++ b/redash/cli/rq.py
@ -6,7 +6,10 @@ import datetime
 from click import argument
 from flask.cli import AppGroup
 from rq import Connection
+from rq.worker import WorkerStatus
 from sqlalchemy.orm import configure_mappers
+from supervisor_checks import check_runner
+from supervisor_checks.check_modules import base

 from redash import rq_redis_connection
 from redash.tasks import Worker
@ -42,15 +45,53 @@ def worker(queues):
        w.work()


+class WorkerHealthcheck(base.BaseCheck):
+    NAME = 'RQ Worker Healthcheck'
+    INTERVAL = datetime.timedelta(minutes=5)
+    _last_check_time = {}
+
+    def time_to_check(self, pid):
+        now = datetime.datetime.utcnow()
+
+        if pid not in self._last_check_time:
+            self._last_check_time[pid] = now
+
+        if now - self._last_check_time[pid] >= self.INTERVAL:
+            self._last_check_time[pid] = now
+            return True
+
+        return False
+
+    def __call__(self, process_spec):
+        pid = process_spec['pid']
+        if not self.time_to_check(pid):
+            return True
+
+        all_workers = Worker.all(connection=rq_redis_connection)
+        worker = [w for w in all_workers if w.hostname == socket.gethostname().encode() and
+                  w.pid == pid].pop()
+
+        is_busy = worker.get_state() == WorkerStatus.BUSY
+
+        time_since_seen = datetime.datetime.utcnow() - worker.last_heartbeat
+        seen_lately = time_since_seen.seconds < 60
+
+        total_jobs_in_watched_queues = sum([len(q.jobs) for q in worker.queues])
+        has_nothing_to_do = total_jobs_in_watched_queues == 0
+
+        is_healthy = is_busy or seen_lately or has_nothing_to_do
+
+        self._log("Worker %s healthcheck: Is busy? %s. "
+                  "Seen lately? %s (%d seconds ago). "
+                  "Has nothing to do? %s (%d jobs in watched queues). "
+                  "==> Is healthy? %s",
+                  worker.key, is_busy, seen_lately, time_since_seen.seconds,
+                  has_nothing_to_do, total_jobs_in_watched_queues, is_healthy)
+
+        return is_healthy
+
+
@manager.command()
 def healthcheck():
-    hostname = socket.gethostname()
-    with Connection(rq_redis_connection):
-        all_workers = Worker.all()
-
-        local_workers = [w for w in all_workers if w.hostname == hostname]
-        heartbeats = [w.last_heartbeat for w in local_workers]
-        time_since_seen = [datetime.datetime.utcnow() - hb for hb in heartbeats]
-        active = [t.seconds < 60 for t in time_since_seen]
-
-        sys.exit(int(not all(active)))
+    return check_runner.CheckRunner(
+        'worker_healthcheck', 'worker', None, [(WorkerHealthcheck, {})]).run()
--- a/requirements.txt
+++ b/requirements.txt
@ -58,6 +58,8 @@ maxminddb-geolite2==2018.703
 pypd==1.1.0
 disposable-email-domains>=0.0.52
 gevent==1.4.0
+supervisor==4.1.0
+supervisor_checks==0.8.1
 # Install the dependencies of the bin/bundle-extensions script here.
 # It has its own requirements file to simplify the frontend client build process
 -r requirements_bundles.txt
--- a/worker.conf
+++ b/worker.conf
@ -0,0 +1,32 @@
+[supervisord]
+logfile=/dev/null
+pidfile=/tmp/supervisord.pid
+nodaemon=true
+
+[unix_http_server]
+file = /tmp/supervisor.sock
+
+[rpcinterface:supervisor]
+supervisor.rpcinterface_factory = supervisor.rpcinterface:make_main_rpcinterface
+
+[program:worker]
+command=./manage.py rq worker %(ENV_QUEUES)s
+process_name=%(program_name)s-%(process_num)s
+numprocs=%(ENV_WORKERS_COUNT)s
+directory=/app
+stopsignal=TERM
+autostart=true
+autorestart=true
+stdout_logfile=/dev/stdout
+stdout_logfile_maxbytes=0
+stderr_logfile=/dev/stderr
+stderr_logfile_maxbytes=0
+
+[eventlistener:worker_healthcheck]
+serverurl=AUTO
+command=./manage.py rq healthcheck
+stdout_logfile=/dev/stdout
+stdout_logfile_maxbytes=0
+stderr_logfile=/dev/stderr
+stderr_logfile_maxbytes=0
+events=TICK_60