From 9292ae8d3fe3d22b10554a10ce4cefe76214c5d4 Mon Sep 17 00:00:00 2001 From: Arik Fraimovich Date: Wed, 29 May 2019 10:45:29 +0300 Subject: [PATCH] CSV: correctly serialize booleans and dates. (#3841) * CSV: correctly serialize booleans and dates. Closes #3736, closes #2751. * pep8 fixes * Move column iteration to a helper function. * Use elif, as types are mutually exclusive. * Refactor parsing implementation. * Move the csv generation function --- redash/handlers/query_results.py | 5 +- redash/models/__init__.py | 40 +------ .../__init__.py} | 2 + redash/serializers/query_result.py | 104 ++++++++++++++++++ tests/models/test_query_results.py | 4 +- tests/serializers/__init__.py | 0 tests/serializers/test_query_results.py | 49 +++++++++ 7 files changed, 161 insertions(+), 43 deletions(-) rename redash/{serializers.py => serializers/__init__.py} (98%) create mode 100644 redash/serializers/query_result.py create mode 100644 tests/serializers/__init__.py create mode 100644 tests/serializers/test_query_results.py diff --git a/redash/handlers/query_results.py b/redash/handlers/query_results.py index 3bed3ecd..64ecc57b 100644 --- a/redash/handlers/query_results.py +++ b/redash/handlers/query_results.py @@ -12,6 +12,7 @@ from redash.tasks import QueryTask from redash.tasks.queries import enqueue_query from redash.utils import (collect_parameters_from_request, gen_query_hash, json_dumps, utcnow, to_filename) from redash.models.parameterized_query import ParameterizedQuery, InvalidParameterError, dropdown_values +from redash.serializers import serialize_query_result_to_csv, serialize_query_result_to_xlsx def error_response(message): @@ -279,12 +280,12 @@ class QueryResultResource(BaseResource): @staticmethod def make_csv_response(query_result): headers = {'Content-Type': "text/csv; charset=UTF-8"} - return make_response(query_result.make_csv_content(), 200, headers) + return make_response(serialize_query_result_to_csv(query_result), 200, headers) @staticmethod def 
make_excel_response(query_result): headers = {'Content-Type': "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"} - return make_response(query_result.make_excel_content(), 200, headers) + return make_response(serialize_query_result_to_xlsx(query_result), 200, headers) class JobResource(BaseResource): diff --git a/redash/models/__init__.py b/redash/models/__init__.py index a4160ea0..2c81d252 100644 --- a/redash/models/__init__.py +++ b/redash/models/__init__.py @@ -1,12 +1,9 @@ -import cStringIO -import csv import datetime import calendar import logging import time import pytz -import xlsxwriter from six import python_2_unicode_compatible, text_type from sqlalchemy import distinct, or_, and_, UniqueConstraint from sqlalchemy.dialects import postgresql @@ -25,7 +22,7 @@ from redash.destinations import (get_configuration_schema_for_destination_type, get_destination) from redash.metrics import database # noqa: F401 from redash.query_runner import (get_configuration_schema_for_query_runner_type, - get_query_runner) + get_query_runner, TYPE_BOOLEAN, TYPE_DATE, TYPE_DATETIME) from redash.utils import generate_token, json_dumps, json_loads from redash.utils.configuration import ConfigurationContainer from redash.models.parameterized_query import ParameterizedQuery @@ -322,41 +319,6 @@ class QueryResult(db.Model, BelongsToOrgMixin): def groups(self): return self.data_source.groups - def make_csv_content(self): - s = cStringIO.StringIO() - - query_data = json_loads(self.data) - writer = csv.DictWriter(s, extrasaction="ignore", fieldnames=[col['name'] for col in query_data['columns']]) - writer.writer = utils.UnicodeWriter(s) - writer.writeheader() - for row in query_data['rows']: - writer.writerow(row) - - return s.getvalue() - - def make_excel_content(self): - s = cStringIO.StringIO() - - query_data = json_loads(self.data) - book = xlsxwriter.Workbook(s, {'constant_memory': True}) - sheet = book.add_worksheet("result") - - column_names = [] - for (c, col) 
in enumerate(query_data['columns']): - sheet.write(0, c, col['name']) - column_names.append(col['name']) - - for (r, row) in enumerate(query_data['rows']): - for (c, name) in enumerate(column_names): - v = row.get(name) - if isinstance(v, list) or isinstance(v, dict): - v = str(v).encode('utf-8') - sheet.write(r + 1, c, v) - - book.close() - - return s.getvalue() - def should_schedule_next(previous_iteration, now, interval, time=None, day_of_week=None, failures=0): # if time exists then interval > 23 hours (82800s) diff --git a/redash/serializers.py b/redash/serializers/__init__.py similarity index 98% rename from redash/serializers.py rename to redash/serializers/__init__.py index 81e1df38..605a3baa 100644 --- a/redash/serializers.py +++ b/redash/serializers/__init__.py @@ -12,6 +12,8 @@ from redash.permissions import has_access, view_only from redash.utils import json_loads from redash.models.parameterized_query import ParameterizedQuery +from .query_result import serialize_query_result_to_csv, serialize_query_result_to_xlsx + def public_widget(widget): res = { diff --git a/redash/serializers/query_result.py b/redash/serializers/query_result.py new file mode 100644 index 00000000..8a4d94d6 --- /dev/null +++ b/redash/serializers/query_result.py @@ -0,0 +1,104 @@ +import cStringIO +import csv +import xlsxwriter +from dateutil.parser import parse as parse_date +from redash.utils import json_loads, UnicodeWriter +from redash.query_runner import (TYPE_BOOLEAN, TYPE_DATE, TYPE_DATETIME) +from redash.authentication.org_resolving import current_org + + +def _convert_format(fmt): + return fmt.replace('DD', '%d').replace('MM', '%m').replace('YYYY', '%Y').replace('YY', '%y').replace('HH', '%H').replace('mm', '%M').replace('ss', '%S') + + +def _convert_bool(value): + if value is True: + return "true" + elif value is False: + return "false" + + return value + +def _convert_date(value): + if not value: + return value + + parsed = parse_date(value) + + return 
parsed.strftime(_convert_format(current_org.get_setting('date_format'))) + + +def _convert_datetime(value): + if not value: + return value + + parsed = parse_date(value) + + fmt = _convert_format('{} {}'.format(current_org.get_setting('date_format'), current_org.get_setting('time_format'))) + return parsed.strftime(fmt) + + +SPECIAL_TYPES = { + TYPE_BOOLEAN: _convert_bool, + TYPE_DATE: _convert_date, + TYPE_DATETIME: _convert_datetime +} + + +def _get_column_lists(columns): + fieldnames = [] + special_columns = dict() + + for col in columns: + fieldnames.append(col['name']) + + for col_type in SPECIAL_TYPES.keys(): + if col['type'] == col_type: + special_columns[col['name']] = SPECIAL_TYPES[col_type] + + return fieldnames, special_columns + + +def serialize_query_result_to_csv(query_result): + s = cStringIO.StringIO() + + query_data = json_loads(query_result.data) + + fieldnames, special_columns = _get_column_lists(query_data['columns']) + + writer = csv.DictWriter(s, extrasaction="ignore", fieldnames=fieldnames) + writer.writer = UnicodeWriter(s) + writer.writeheader() + + for row in query_data['rows']: + for col_name, converter in special_columns.iteritems(): + if col_name in row: + row[col_name] = converter(row[col_name]) + + writer.writerow(row) + + return s.getvalue() + + +def serialize_query_result_to_xlsx(query_result): + s = cStringIO.StringIO() + + query_data = json_loads(query_result.data) + book = xlsxwriter.Workbook(s, {'constant_memory': True}) + sheet = book.add_worksheet("result") + + column_names = [] + for (c, col) in enumerate(query_data['columns']): + sheet.write(0, c, col['name']) + column_names.append(col['name']) + + for (r, row) in enumerate(query_data['rows']): + for (c, name) in enumerate(column_names): + v = row.get(name) + if isinstance(v, list) or isinstance(v, dict): + v = str(v).encode('utf-8') + sheet.write(r + 1, c, v) + + book.close() + + return s.getvalue() diff --git a/tests/models/test_query_results.py 
b/tests/models/test_query_results.py index a3c805c8..751cbf95 100644 --- a/tests/models/test_query_results.py +++ b/tests/models/test_query_results.py @@ -4,7 +4,7 @@ import datetime from tests import BaseTestCase from redash import models -from redash.utils import utcnow +from redash.utils import utcnow, json_dumps class QueryResultTest(BaseTestCase): @@ -66,4 +66,4 @@ class QueryResultTest(BaseTestCase): models.QueryResult.store_result(query.org_id, query.data_source, query.query_hash, query.query_text, "", 0, utcnow()) - self.assertEqual(original_updated_at, query.updated_at) + self.assertEqual(original_updated_at, query.updated_at) \ No newline at end of file diff --git a/tests/serializers/__init__.py b/tests/serializers/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/serializers/test_query_results.py b/tests/serializers/test_query_results.py new file mode 100644 index 00000000..19063012 --- /dev/null +++ b/tests/serializers/test_query_results.py @@ -0,0 +1,49 @@ +import datetime +import csv +import cStringIO + +from tests import BaseTestCase + +from redash import models +from redash.utils import utcnow, json_dumps +from redash.serializers import serialize_query_result_to_csv + + +data = { + "rows": [ + {"datetime": "2019-05-26T12:39:23.026Z", "bool": True, "date": "2019-05-26"}, + {"datetime": "", "bool": False, "date": ""}, + {"datetime": None, "bool": None, "date": None}, + ], + "columns": [ + {"friendly_name": "bool", "type": "boolean", "name": "bool"}, + {"friendly_name": "date", "type": "datetime", "name": "datetime"}, + {"friendly_name": "date", "type": "date", "name": "date"} + ] +} + +class CsvSerializationTest(BaseTestCase): + def get_csv_content(self): + query_result = self.factory.create_query_result(data=json_dumps(data)) + return serialize_query_result_to_csv(query_result) + + def test_serializes_booleans_correctly(self): + with self.app.test_request_context('/'): + parsed = 
csv.DictReader(cStringIO.StringIO(self.get_csv_content())) + rows = list(parsed) + + self.assertEqual(rows[0]['bool'], 'true') + self.assertEqual(rows[1]['bool'], 'false') + self.assertEqual(rows[2]['bool'], '') + + def test_serializes_datatime_with_correct_format(self): + with self.app.test_request_context('/'): + parsed = csv.DictReader(cStringIO.StringIO(self.get_csv_content())) + rows = list(parsed) + + self.assertEqual(rows[0]['datetime'], '26/05/19 12:39') + self.assertEqual(rows[1]['datetime'], '') + self.assertEqual(rows[2]['datetime'], '') + self.assertEqual(rows[0]['date'], '26/05/19') + self.assertEqual(rows[1]['date'], '') + self.assertEqual(rows[2]['date'], '') \ No newline at end of file