Query Runner: eccenca Corporate Memory (SPARQL) - query RDF / Linked Data Knowledge Graphs with redash (#5415)

* add Corporate Memory Runner based on cmempy 21.2.3

* fix code style

* apply some code cleanups

* use extendedEnum, boolean and extra_options for schema description

* use lower case sorting for data source types list

This correctly orders data source names that start with lowercase
characters (such as eccenca Corporate Memory); see the sketch after
this list.

* add missing dblogo
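
A minimal sketch of the ordering problem behind the sorting change (the
data source names are illustrative): Python's default string sort is
case-sensitive, so every uppercase name sorts before any lowercase one.

    names = ["Vertica", "eccenca Corporate Memory", "Amazon Athena"]

    sorted(names)
    # ['Amazon Athena', 'Vertica', 'eccenca Corporate Memory']

    sorted(names, key=lambda name: name.lower())
    # ['Amazon Athena', 'eccenca Corporate Memory', 'Vertica']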
Sebastian Tramp 2021-03-24 08:15:24 +01:00 committed by GitHub
parent fb90b501cb
commit 70681294a3
5 changed files with 272 additions and 1 deletion

Binary file not shown (new db logo image, 1.5 KiB).

redash/handlers/data_sources.py

@@ -29,7 +29,8 @@ class DataSourceTypeListResource(BaseResource):
     @require_admin
     def get(self):
         return [
-            q.to_dict() for q in sorted(query_runners.values(), key=lambda q: q.name())
+            q.to_dict()
+            for q in sorted(query_runners.values(), key=lambda q: q.name().lower())
         ]

redash/query_runner/corporate_memory.py (new file)

@@ -0,0 +1,268 @@
"""Provide the query runner for eccenca Corporate Memory.
seeAlso: https://documentation.eccenca.com/
seeAlso: https://eccenca.com/
"""
import logging
import json
from os import environ
from redash.query_runner import BaseQueryRunner
from redash.utils import json_dumps, json_loads
from . import register
try:
from cmem.cmempy.queries import SparqlQuery, QueryCatalog, QUERY_STRING
from cmem.cmempy.dp.proxy.graph import get_graphs_list
enabled = True
except ImportError:
enabled = False
logger = logging.getLogger(__name__)
class CorporateMemoryQueryRunner(BaseQueryRunner):
"""Use eccenca Corporate Memory as redash data source"""
# These environment keys are used by cmempy
KNOWN_CONFIG_KEYS = (
"CMEM_BASE_PROTOCOL",
"CMEM_BASE_DOMAIN",
"CMEM_BASE_URI",
"SSL_VERIFY",
"REQUESTS_CA_BUNDLE",
"DP_API_ENDPOINT",
"DI_API_ENDPOINT",
"OAUTH_TOKEN_URI",
"OAUTH_GRANT_TYPE",
"OAUTH_USER",
"OAUTH_PASSWORD",
"OAUTH_CLIENT_ID",
"OAUTH_CLIENT_SECRET",
)
# These variables hold secret data and should NOT be logged
KNOWN_SECRET_KEYS = ("OAUTH_PASSWORD", "OAUTH_CLIENT_SECRET")
# This allows for an easy connection test
noop_query = "SELECT ?noop WHERE {BIND('noop' as ?noop)}"
# We do not want to have comment in our sparql queries
# FEATURE?: Implement annotate_query in case the metadata is useful somewhere
should_annotate_query = False
def __init__(self, configuration):
"""init the class and configuration"""
super(CorporateMemoryQueryRunner, self).__init__(configuration)
"""
FEATURE?: activate SPARQL support in the redash query editor
Currently SPARQL syntax seems not to be available for react-ace
component. However, the ace editor itself supports sparql mode:
https://github.com/ajaxorg/ace/blob/master/lib/ace/mode/sparql.js
then we can hopefully do: self.syntax = "sparql"
FEATURE?: implement the retrieve Query catalog URIs in order to use them in queries
FEATURE?: implement a way to use queries from the query catalog
FEATURE?: allow a checkbox to NOT use owl:imports imported graphs
FEATURE?: allow to use a context graph per data source
"""
self.configuration = configuration
def _setup_environment(self):
"""provide environment for cmempy
cmempy environment variables need to match key in the properties
object of the configuration_schema
"""
for key in self.KNOWN_CONFIG_KEYS:
if key in environ:
environ.pop(key)
value = self.configuration.get(key, None)
if value is not None:
environ[key] = str(value)
if key in self.KNOWN_SECRET_KEYS:
logger.info("{} set by config".format(key))
else:
logger.info("{} set by config to {}".format(key, environ[key]))
@staticmethod
def _transform_sparql_results(results):
"""transforms a SPARQL query result to a redash query result
source structure: SPARQL 1.1 Query Results JSON Format
- seeAlso: https://www.w3.org/TR/sparql11-results-json/
target structure: redash result set
there is no good documentation available
so here an example result set as needed for redash:
data = {
"columns": [ {"name": "name", "type": "string", "friendly_name": "friendly name"}],
"rows": [
{"name": "value 1"},
{"name": "value 2"}
]}
FEATURE?: During the sparql_row loop, we could check the data types of the
values and, in case they are all the same, choose something better than
just string.
"""
logger.info("results are: {}".format(results))
# Not sure why we do not use the json package here but all other
# query runner do it the same way :-)
sparql_results = json_loads(results)
# transform all bindings to redash rows
rows = []
for sparql_row in sparql_results["results"]["bindings"]:
row = {}
for var in sparql_results["head"]["vars"]:
try:
row[var] = sparql_row[var]["value"]
except KeyError:
# not bound SPARQL variables are set as empty strings
row[var] = ""
rows.append(row)
# transform all vars to redash columns
columns = []
for var in sparql_results["head"]["vars"]:
columns.append({"name": var, "friendly_name": var, "type": "string"})
# Not sure why we do not use the json package here but all other
# query runner do it the same way :-)
return json_dumps({"columns": columns, "rows": rows})
@classmethod
def name(cls):
return "eccenca Corporate Memory (with SPARQL)"
@classmethod
def enabled(cls):
return enabled
@classmethod
def type(cls):
return "corporate_memory"
def run_query(self, query, user):
"""send a sparql query to corporate memory"""
query_text = query
logger.info("about to execute query (user='{}'): {}".format(user, query_text))
query = SparqlQuery(query_text)
query_type = query.get_query_type()
# type of None means, there is an error in the query
# so execution is at least tried on endpoint
if query_type not in ["SELECT", None]:
raise ValueError(
"Queries of type {} can not be processed by redash.".format(query_type)
)
self._setup_environment()
try:
data = self._transform_sparql_results(query.get_results())
except Exception as error:
logger.info("Error: {}".format(error))
try:
# try to load Problem Details for HTTP API JSON
details = json.loads(error.response.text)
error = ""
if "title" in details:
error += details["title"] + ": "
if "detail" in details:
error += details["detail"]
return None, error
except Exception:
pass
return None, error
error = None
return data, error
@classmethod
def configuration_schema(cls):
"""provide the configuration of the data source as json schema"""
return {
"type": "object",
"properties": {
"CMEM_BASE_URI": {"type": "string", "title": "Base URL"},
"OAUTH_GRANT_TYPE": {
"type": "string",
"title": "Grant Type",
"default": "client_credentials",
"extendedEnum": [
{"value": "client_credentials", "name": "client_credentials"},
{"value": "password", "name": "password"},
],
},
"OAUTH_CLIENT_ID": {
"type": "string",
"title": "Client ID (e.g. cmem-service-account)",
"default": "cmem-service-account",
},
"OAUTH_CLIENT_SECRET": {
"type": "string",
"title": "Client Secret - only needed for grant type 'client_credentials'",
},
"OAUTH_USER": {
"type": "string",
"title": "User account - only needed for grant type 'password'",
},
"OAUTH_PASSWORD": {
"type": "string",
"title": "User Password - only needed for grant type 'password'",
},
"SSL_VERIFY": {
"type": "boolean",
"title": "Verify SSL certificates for API requests",
"default": True,
},
"REQUESTS_CA_BUNDLE": {
"type": "string",
"title": "Path to the CA Bundle file (.pem)",
},
},
"required": ["CMEM_BASE_URI", "OAUTH_GRANT_TYPE", "OAUTH_CLIENT_ID"],
"secret": ["OAUTH_CLIENT_SECRET", "OAUTH_PASSWORD"],
"extra_options": [
"OAUTH_GRANT_TYPE",
"OAUTH_USER",
"OAUTH_PASSWORD",
"SSL_VERIFY",
"REQUESTS_CA_BUNDLE",
],
}
def get_schema(self, get_stats=False):
"""Get the schema structure (prefixes, graphs)."""
schema = dict()
schema["1"] = {
"name": "-> Common Prefixes <-",
"columns": self._get_common_prefixes_schema(),
}
schema["2"] = {"name": "-> Graphs <-", "columns": self._get_graphs_schema()}
# schema.update(self._get_query_schema())
logger.info(schema.values())
return schema.values()
def _get_graphs_schema(self):
"""Get a list of readable graph FROM clause strings."""
self._setup_environment()
graphs = []
for graph in get_graphs_list():
graphs.append("FROM <{}>".format(graph["iri"]))
return graphs
@staticmethod
def _get_common_prefixes_schema():
"""Get a list of SPARQL prefix declarations."""
common_prefixes = [
"PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>",
"PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>",
"PREFIX owl: <http://www.w3.org/2002/07/owl#>",
"PREFIX schema: <http://schema.org/>",
"PREFIX dct: <http://purl.org/dc/terms/>",
"PREFIX skos: <http://www.w3.org/2004/02/skos/core#>",
]
return common_prefixes
register(CorporateMemoryQueryRunner)
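
To illustrate the result transformation above, a minimal sketch that feeds
a hand-written SPARQL 1.1 JSON result into _transform_sparql_results (the
bindings are illustrative data, not from a live endpoint):

    sample = """{
        "head": {"vars": ["s", "label"]},
        "results": {"bindings": [
            {"s": {"type": "uri", "value": "http://example.org/1"},
             "label": {"type": "literal", "value": "Example One"}},
            {"s": {"type": "uri", "value": "http://example.org/2"}}
        ]}
    }"""

    data = CorporateMemoryQueryRunner._transform_sparql_results(sample)
    # data is a JSON string with two string-typed columns ("s", "label")
    # and two rows; the unbound ?label in the second binding becomes "".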
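
And a minimal sketch of how a configured data source reaches cmempy: all
values below are hypothetical placeholders, not part of the commit.
_setup_environment() exports each known configuration key as an
environment variable before a query is executed.

    configuration = {
        "CMEM_BASE_URI": "https://cmem.example.org",  # placeholder URL
        "OAUTH_GRANT_TYPE": "client_credentials",
        "OAUTH_CLIENT_ID": "cmem-service-account",
        "OAUTH_CLIENT_SECRET": "<secret>",  # placeholder secret
    }
    runner = CorporateMemoryQueryRunner(configuration)
    runner._setup_environment()
    # cmempy now reads CMEM_BASE_URI etc. from os.environ; secret keys are
    # exported too, but logged only as "<key> set by config" (no value).
    # A real connection test would then run the class's noop query:
    # data, error = runner.run_query(runner.noop_query, user=None)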

redash/settings/__init__.py

@@ -372,6 +372,7 @@ default_query_runners = [
     "redash.query_runner.exasol",
     "redash.query_runner.cloudwatch",
     "redash.query_runner.cloudwatch_insights",
+    "redash.query_runner.corporate_memory",
 ]
 
 enabled_query_runners = array_from_string(

requirements_all_ds.txt

@@ -36,3 +36,4 @@ pyexasol==0.12.0
 python-rapidjson==0.8.0
 pyodbc==4.0.28
 trino~=0.305
+cmem-cmempy==21.2.3