Python query runner: add function that transforms pandas dataframe to result format (#5629)

This commit is contained in:
Vladislav Denisov 2022-01-19 22:53:27 +03:00 committed by GitHub
parent 4fddff104a
commit e28e4227bf
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 52 additions and 4 deletions

View File

@ -9,6 +9,14 @@ from redash import models
from RestrictedPython import compile_restricted
from RestrictedPython.Guards import safe_builtins, guarded_iter_unpack_sequence, guarded_unpack_sequence
try:
import pandas as pd
import numpy as np
pandas_installed = True
except ImportError:
pandas_installed = False
from RestrictedPython.transformer import IOPERATOR_TO_STR
logger = logging.getLogger(__name__)
@ -145,6 +153,14 @@ class Python(BaseQueryRunner):
# Return an iterator over `obj` by delegating to the built-in iter().
def custom_get_iter(obj):
return iter(obj)
@staticmethod
def custom_inplacevar(op, x, y):
    """Apply an augmented-assignment operator to ``x`` and ``y``.

    This callable is installed as the ``_inplacevar_`` builtin, so it is
    what evaluates statements such as ``x += y`` for restricted scripts.

    Parameters:
    :op string: augmented-assignment operator text, e.g. ``"+="``
    :x: current value of the assignment target
    :y: right-hand operand

    Returns the value to bind back to the target. Operands implementing the
    in-place protocol (e.g. lists with ``+=``) are mutated in place, exactly
    as the plain statement would do.

    Raises Exception when ``op`` is not a supported operator.
    """
    # Function-scope import keeps this method self-contained.
    import operator

    # Dispatching through `operator` avoids building and exec()-ing source
    # text on every augmented assignment.
    # NOTE(review): the keys are meant to cover the same operator strings as
    # RestrictedPython's IOPERATOR_TO_STR, which the previous implementation
    # validated against -- confirm when upgrading RestrictedPython.
    inplace_functions = {
        "+=": operator.iadd,
        "-=": operator.isub,
        "*=": operator.imul,
        "/=": operator.itruediv,
        "%=": operator.imod,
        "**=": operator.ipow,
        "<<=": operator.ilshift,
        ">>=": operator.irshift,
        "|=": operator.ior,
        "^=": operator.ixor,
        "&=": operator.iand,
        "//=": operator.ifloordiv,
        "@=": operator.imatmul,
    }

    try:
        apply_inplace = inplace_functions[op]
    except (KeyError, TypeError):
        # TypeError covers an unhashable `op`; both cases mean "unsupported".
        raise Exception(
            "'{} is not supported inplace variable'".format(op)
        ) from None

    return apply_inplace(x, y)
@staticmethod
def add_result_column(result, column_name, friendly_name, column_type):
"""Helper function to add columns inside a Python script running in Redash in an easier way
@ -179,7 +195,7 @@ class Python(BaseQueryRunner):
result["rows"].append(values)
@staticmethod
def execute_query(data_source_name_or_id, query):
def execute_query(data_source_name_or_id, query, result_type=None):
"""Run query from specific data source.
Parameters:
@ -200,7 +216,13 @@ class Python(BaseQueryRunner):
raise Exception(error)
# TODO: allow avoiding the JSON dumps/loads in same process
return json_loads(data)
query_result = json_loads(data)
if result_type == "dataframe" and pandas_installed:
return pd.DataFrame(query_result["rows"])
return query_result
@staticmethod
def get_source_schema(data_source_name_or_id):
@ -239,6 +261,29 @@ class Python(BaseQueryRunner):
return query.latest_query_data.data
def dataframe_to_result(self, result, df):
    """Fill a result dict from a pandas DataFrame.

    Parameters:
    :result dict: result object to populate; its ``"rows"`` entry is
        replaced and one column definition is registered per DataFrame
        column through ``add_result_column``
    :df pandas.DataFrame: data to export

    Each column's result type is derived from its dtype: boolean, float,
    integer and datetime-like dtypes map to the matching ``TYPE_*``
    constant, and everything else is reported as ``TYPE_STRING``.
    """
    result["rows"] = df.to_dict("records")

    # pandas' dtype predicates match every width of a kind (int32, float32,
    # unsigned, ...). Comparing a dtype with the abstract NumPy classes
    # only matched the default width, and the `np.bool` alias is missing
    # from some NumPy releases.
    dtype_checks = pd.api.types

    for column_name, column_type in df.dtypes.items():
        if dtype_checks.is_bool_dtype(column_type):
            redash_type = TYPE_BOOLEAN
        elif dtype_checks.is_float_dtype(column_type):
            redash_type = TYPE_FLOAT
        elif dtype_checks.is_integer_dtype(column_type):
            redash_type = TYPE_INTEGER
        elif dtype_checks.is_datetime64_any_dtype(column_type):
            if df.empty:
                # No value to inspect; report the more general type.
                redash_type = TYPE_DATETIME
            else:
                # Heuristic on the first row only: a rendering longer than
                # "YYYY-MM-DD" (10 characters) carries a time component.
                # Positional access (`iloc`) is required because the index
                # need not contain the label 0, e.g. after filtering.
                first_value = df[column_name].head(1).astype(str).iloc[0]
                if len(first_value) > 10:
                    redash_type = TYPE_DATETIME
                else:
                    redash_type = TYPE_DATE
        else:
            redash_type = TYPE_STRING

        self.add_result_column(result, column_name, column_name, redash_type)
def get_current_user(self):
    """Return ``self._current_user`` serialized through its ``to_dict()`` method."""
    current_user = self._current_user
    return current_user.to_dict()
@ -265,6 +310,7 @@ class Python(BaseQueryRunner):
builtins["_print_"] = self._custom_print
builtins["_unpack_sequence_"] = guarded_unpack_sequence
builtins["_iter_unpack_sequence_"] = guarded_iter_unpack_sequence
builtins["_inplacevar_"] = self.custom_inplacevar
# Layer in our own additional set of builtins that we have
# considered safe.
@ -277,6 +323,8 @@ class Python(BaseQueryRunner):
restricted_globals["get_current_user"] = self.get_current_user
restricted_globals["execute_query"] = self.execute_query
restricted_globals["add_result_column"] = self.add_result_column
if pandas_installed:
restricted_globals["dataframe_to_result"] = self.dataframe_to_result
restricted_globals["add_result_row"] = self.add_result_row
restricted_globals["disable_print_log"] = self._custom_print.disable
restricted_globals["enable_print_log"] = self._custom_print.enable
@ -304,5 +352,4 @@ class Python(BaseQueryRunner):
return json_data, error
register(Python)

View File

@ -40,4 +40,5 @@ trino~=0.305
cmem-cmempy==21.2.3
xlrd==2.0.1
openpyxl==3.0.7
firebolt-sqlalchemy
firebolt-sqlalchemy
pandas==1.3.4