Job to cleanup unused query results

This commit is contained in:
Arik Fraimovich 2014-12-25 15:39:49 +02:00
parent 2278a181ca
commit e116e88e98
6 changed files with 68 additions and 12 deletions

View File

@ -100,6 +100,7 @@ def status_api():
status['version'] = __version__
status['queries_count'] = models.Query.select().count()
status['query_results_count'] = models.QueryResult.select().count()
status['unused_query_results_count'] = models.QueryResult.unused().count()
status['dashboards_count'] = models.Dashboard.select().count()
status['widgets_count'] = models.Widget.select().count()

View File

@ -225,6 +225,15 @@ class QueryResult(BaseModel):
'retrieved_at': self.retrieved_at
}
@classmethod
def unused(cls):
week_ago = datetime.datetime.now() - datetime.timedelta(days=7)
unused_results = cls.select().where(Query.id == None, cls.retrieved_at < week_ago)\
.join(Query, join_type=peewee.JOIN_LEFT_OUTER)
return unused_results
@classmethod
def get_latest(cls, data_source, query, ttl=0):
query_hash = utils.gen_query_hash(query)

View File

@ -56,6 +56,10 @@ CELERY_BROKER = os.environ.get("REDASH_CELERY_BROKER", REDIS_URL)
CELERY_BACKEND = os.environ.get("REDASH_CELERY_BACKEND", REDIS_URL)
CELERY_FLOWER_URL = os.environ.get("REDASH_CELERY_FLOWER_URL", "/flower")
# The following enables periodic job (every 5 minutes) of removing unused query results. Behind this "feature flag" until
# proved to be "safe".
QUERY_RESULTS_CLEANUP_ENABLED = parse_boolean(os.environ.get("REDASH_QUERY_RESULTS_CLEANUP_ENABLED", "false"))
# Google Apps domain to allow access from; any user with email in this Google Apps will be allowed
# access
GOOGLE_APPS_DOMAIN = os.environ.get("REDASH_GOOGLE_APPS_DOMAIN", "")

View File

@ -202,6 +202,22 @@ def cleanup_tasks():
redis_connection.delete(lock_keys[i])
@celery.task(base=BaseTask)
def cleanup_query_results():
"""
Job to cleanup unused query results -- such that no query links to them anymore, and older than a week (so it's less
likely to be open in someone's browser and be used).
Each time the job deletes only 100 query results so it won't choke the database in case of many such results.
"""
unused_query_results = models.QueryResult.unused().limit(100)
total_unused_query_results = models.QueryResult.unused().count()
deleted_count = models.QueryResult.delete().where(models.QueryResult.id << unused_query_results).execute()
logger.info("Deleted %d unused query results out of total of %d." % (deleted_count, total_unused_query_results))
@celery.task(bind=True, base=BaseTask, track_started=True)
def execute_query(self, query, data_source_id):
# TODO: maybe this should be a class?

View File

@ -7,19 +7,26 @@ celery = Celery('redash',
broker=settings.CELERY_BROKER,
include='redash.tasks')
celery.conf.update(CELERY_RESULT_BACKEND=settings.CELERY_BACKEND,
CELERYBEAT_SCHEDULE={
'refresh_queries': {
'task': 'redash.tasks.refresh_queries',
'schedule': timedelta(seconds=30)
},
'cleanup_tasks': {
'task': 'redash.tasks.cleanup_tasks',
'schedule': timedelta(minutes=5)
},
},
CELERY_TIMEZONE='UTC')
celery_schedule = {
'refresh_queries': {
'task': 'redash.tasks.refresh_queries',
'schedule': timedelta(seconds=30)
},
'cleanup_tasks': {
'task': 'redash.tasks.cleanup_tasks',
'schedule': timedelta(minutes=5)
}
}
if settings.QUERY_RESULTS_CLEANUP_ENABLED:
celery_schedule['cleanup_query_results'] = {
'task': 'redash.tasks.cleanup_query_results',
'schedule': timedelta(minutes=5)
}
celery.conf.update(CELERY_RESULT_BACKEND=settings.CELERY_BACKEND,
CELERYBEAT_SCHEDULE=celery_schedule,
CELERY_TIMEZONE='UTC')
if __name__ == '__main__':
celery.start()

View File

@ -129,6 +129,25 @@ class QueryResultTest(BaseTestCase):
self.assertEqual(found_query_result.id, qr.id)
class TestUnusedQueryResults(BaseTestCase):
def test_returns_only_unused_query_results(self):
two_weeks_ago = datetime.datetime.now() - datetime.timedelta(days=14)
qr = query_result_factory.create()
query = query_factory.create(latest_query_data=qr)
unused_qr = query_result_factory.create(retrieved_at=two_weeks_ago)
self.assertIn(unused_qr, models.QueryResult.unused())
self.assertNotIn(qr, models.QueryResult.unused())
def test_returns_only_over_a_week_old_results(self):
two_weeks_ago = datetime.datetime.now() - datetime.timedelta(days=14)
unused_qr = query_result_factory.create(retrieved_at=two_weeks_ago)
new_unused_qr = query_result_factory.create()
self.assertIn(unused_qr, models.QueryResult.unused())
self.assertNotIn(new_unused_qr, models.QueryResult.unused())
class TestQueryResultStoreResult(BaseTestCase):
def setUp(self):
super(TestQueryResultStoreResult, self).setUp()