Python query runner: add function that transforms pandas dataframe to result format (#5629)

2024-11-06 00:55:16 +00:00 · 2022-01-19 22:53:27 +03:00 · 2022-01-19 22:53:27 +03:00 · e28e4227bf
commit e28e4227bf
parent 4fddff104a
2 changed files with 52 additions and 4 deletions
--- a/redash/query_runner/python.py
+++ b/redash/query_runner/python.py
@ -9,6 +9,14 @@ from redash import models
 from RestrictedPython import compile_restricted
 from RestrictedPython.Guards import safe_builtins, guarded_iter_unpack_sequence, guarded_unpack_sequence

+try:
+    import pandas as pd
+    import numpy as np
+    pandas_installed = True
+except ImportError:
+    pandas_installed = False
+
+from RestrictedPython.transformer import IOPERATOR_TO_STR

 logger = logging.getLogger(__name__)

@ -145,6 +153,14 @@ class Python(BaseQueryRunner):
    def custom_get_iter(obj):
        return iter(obj)

+    @staticmethod
+    def custom_inplacevar(op, x, y):
+        """Apply an augmented assignment on behalf of a restricted script.
+
+        Installed as the ``_inplacevar_`` builtin, so it receives the operator
+        as a string together with both operands.
+
+        Parameters:
+        :op string: Augmented-assignment operator, e.g. "+="
+        :x: Current value of the assignment target
+        :y: Right-hand operand
+
+        Returns the value that should be bound back to the target.
+        Raises Exception when op is not a supported operator.
+        """
+        import operator
+
+        # operator.iXXX(a, b) is defined as equivalent to ``a XXX= b``, so the
+        # operation can be dispatched directly instead of building a source
+        # string and running it through exec().
+        inplace_operators = {
+            "+=": operator.iadd,
+            "-=": operator.isub,
+            "*=": operator.imul,
+            "/=": operator.itruediv,
+            "%=": operator.imod,
+            "**=": operator.ipow,
+            "<<=": operator.ilshift,
+            ">>=": operator.irshift,
+            "|=": operator.ior,
+            "^=": operator.ixor,
+            "&=": operator.iand,
+            "//=": operator.ifloordiv,
+            "@=": operator.imatmul,
+        }
+
+        # Still validate against the operators RestrictedPython knows about;
+        # the membership test also keeps unhashable values away from .get().
+        apply_inplace = None
+        if op in IOPERATOR_TO_STR.values():
+            apply_inplace = inplace_operators.get(op)
+        if apply_inplace is None:
+            raise Exception("'{}' is not supported inplace variable".format(op))
+
+        return apply_inplace(x, y)
+
    @staticmethod
    def add_result_column(result, column_name, friendly_name, column_type):
        """Helper function to add columns inside a Python script running in Redash in an easier way
@ -179,7 +195,7 @@ class Python(BaseQueryRunner):
        result["rows"].append(values)

    @staticmethod
-    def execute_query(data_source_name_or_id, query):
+    def execute_query(data_source_name_or_id, query, result_type=None):
        """Run query from specific data source.

        Parameters:
@ -200,7 +216,13 @@ class Python(BaseQueryRunner):
            raise Exception(error)

        # TODO: allow avoiding the JSON dumps/loads in same process
-        return json_loads(data)
+        query_result = json_loads(data)
+
+        if result_type == "dataframe" and pandas_installed:
+            return pd.DataFrame(query_result["rows"])
+
+        return query_result
+

    @staticmethod
    def get_source_schema(data_source_name_or_id):
@ -239,6 +261,29 @@ class Python(BaseQueryRunner):

        return query.latest_query_data.data

+    def dataframe_to_result(self, result, df):
+        """Helper function to fill a result dict from a pandas dataframe
+
+        The rows are taken from the dataframe records and one result column is
+        added per dataframe column, with a Redash type derived from its dtype.
+
+        Parameters:
+        :result dict: The result dict; its "rows" entry is replaced
+        :df pandas.DataFrame: The dataframe to convert
+        """
+        result["rows"] = df.to_dict("records")
+
+        # pandas' dtype predicates cover every width of a kind (float32,
+        # int16, ...) as well as extension dtypes, which a plain ``==``
+        # against NumPy's abstract scalar types does not. They also avoid the
+        # ``np.bool`` alias, which newer NumPy releases no longer provide.
+        dtype_checks = pd.api.types
+
+        for column_name, column_type in df.dtypes.items():
+            if dtype_checks.is_bool_dtype(column_type):
+                redash_type = TYPE_BOOLEAN
+            elif dtype_checks.is_float_dtype(column_type):
+                redash_type = TYPE_FLOAT
+            elif dtype_checks.is_integer_dtype(column_type):
+                redash_type = TYPE_INTEGER
+            elif dtype_checks.is_datetime64_any_dtype(column_type):
+                # Report a plain date only when every non-null value sits at
+                # midnight. Looking at all values (not just the row labelled
+                # 0) works for any index and is not fooled by a leading NaT.
+                timestamps = df[column_name].dropna()
+                if not timestamps.empty and (timestamps == timestamps.dt.normalize()).all():
+                    redash_type = TYPE_DATE
+                else:
+                    redash_type = TYPE_DATETIME
+            else:
+                redash_type = TYPE_STRING
+
+            self.add_result_column(result, column_name, column_name, redash_type)
+
    def get_current_user(self):
        return self._current_user.to_dict()

@ -265,6 +310,7 @@ class Python(BaseQueryRunner):
            builtins["_print_"] = self._custom_print
            builtins["_unpack_sequence_"] = guarded_unpack_sequence
            builtins["_iter_unpack_sequence_"] = guarded_iter_unpack_sequence
+            builtins["_inplacevar_"] = self.custom_inplacevar

            # Layer in our own additional set of builtins that we have
            # considered safe.
@ -277,6 +323,8 @@ class Python(BaseQueryRunner):
            restricted_globals["get_current_user"] = self.get_current_user
            restricted_globals["execute_query"] = self.execute_query
            restricted_globals["add_result_column"] = self.add_result_column
+            if pandas_installed:
+                restricted_globals["dataframe_to_result"] = self.dataframe_to_result
            restricted_globals["add_result_row"] = self.add_result_row
            restricted_globals["disable_print_log"] = self._custom_print.disable
            restricted_globals["enable_print_log"] = self._custom_print.enable
@ -304,5 +352,4 @@ class Python(BaseQueryRunner):

        return json_data, error

-
 register(Python)
--- a/requirements_all_ds.txt
+++ b/requirements_all_ds.txt
@ -40,4 +40,5 @@ trino~=0.305
 cmem-cmempy==21.2.3
 xlrd==2.0.1
 openpyxl==3.0.7
-firebolt-sqlalchemy
+firebolt-sqlalchemy
+pandas==1.3.4