From ff587ae8f610b374a668aa3a423a729601820af8 Mon Sep 17 00:00:00 2001
From: Richard Bruskiewich <richard.bruskiewich@delphinai.com>
Date: Tue, 28 May 2024 22:07:29 -0700
Subject: [PATCH] Interim commit, iteration on node bindings validation
 (incomplete code)

---
 reasoner_validator/codes.yaml   |   7 ++
 reasoner_validator/validator.py | 196 +++++++++++++++++++++++---------
 2 files changed, 148 insertions(+), 55 deletions(-)

diff --git a/reasoner_validator/codes.yaml b/reasoner_validator/codes.yaml
index 2f59e03..0d2706b 100644
--- a/reasoner_validator/codes.yaml
+++ b/reasoner_validator/codes.yaml
@@ -81,6 +81,13 @@ error:
             $context:
               - identifier
             $description: "The given TRAPI Response is expected to return a specified edge - in its list of Results - relating to the original input data used to prepare the TRAPI Request!"
+          node_binding:
+            key:
+              missing:
+                $message: "Node binding entry key of TRAPI Result is missing in Query Graph"
+                $context:
+                  - identifier
+                $description: "The TRAPI Response Message Results entry has the specified node binding entry key missing in the corresponding TRAPI Message Query Graph."
         results:
           missing:
             $message: "TRAPI Message is missing its Results component!"
diff --git a/reasoner_validator/validator.py b/reasoner_validator/validator.py
index 505d698..050fd06 100644
--- a/reasoner_validator/validator.py
+++ b/reasoner_validator/validator.py
@@ -559,8 +559,81 @@ def testcase_node_found(
         # hence, we deem the node effectively missing.
         return None
 
+    def validate_binding(
+            self,
+            q_node_entry: Dict,
+            target_id: str,
+            target_query_id: Optional[str],
+            node_binding_details: Dict
+    ) -> bool:
+        """
+        Validate that a specified target_id has a valid node_binding in specified node binding details.
+
+        :param q_node_entry: Dict, query node data currently being searched
+        :param target_id: str, target knowledge graph node identifier to be matched to node binding
+        :param target_query_id: Optional[str], query identifier related to the target identifier,
+                                               if not identical to the target_id
+        :param node_binding_details: Dict, data relating to a given node_binding of query to knowledge graph identifier
+        :return: bool, True if a valid node binding was found
+        """
+        #
+        # Example of node_bindings *without* and *with* a 'query_id':
+        #
+        # "node_bindings": {
+        #     "n0": [
+        #         {
+        #             "attributes": [],
+        #             "id": "CHEBI:16796"
+        #         }
+        #     ],
+        #     "n1": [
+        #         {
+        #             "attributes": [],
+        #             "id": "MONDO:0005258",
+        #             "query_id": "MONDO:0005260"
+        #         }
+        #     ]
+        # }
+        #
+        # In situations with a 'query_id', then the 'id' of the binding is the one
+        # associated with the knowledge graph node, while the 'query_id' is assumed to be a
+        # related identifier which was given in the query graph, but is not identical to the
+        # identifier of the knowledge graph edge node being matched. This may arise if the
+        # knowledge graph edge node identifier is an ontological subclass of the query identifier
+        # (e.g. a subtype of the MONDO disease requested in the QGraph)
+
+        # See also the TRAPI 'query_id' definition below, for other validation constraints.
+        #
+        # query_id:
+        #   oneOf:
+        #     - $ref: '#/components/schemas/CURIE'
+        #   description: >-
+        #     An optional property to provide the CURIE in the QueryGraph to
+        #     which this binding applies. If the bound QNode does not have
+        #     an 'id' property or if it is empty, then this query_id MUST be
+        #     null or absent. If the bound QNode has one or more CURIEs
+        #     as an 'id' and this NodeBinding's 'id' refers to a QNode 'id'
+        #     in a manner where the CURIEs are different (typically due to
+        #     the NodeBinding.id being a descendant of a QNode.id), then
+        #     this query_id MUST be provided. In other cases, there is no
+        #     ambiguity, and this query_id SHOULD NOT be provided.
+        #     TODO: unsure where a bound QNode is detected as 'empty'
+        if "id" in node_binding_details and node_binding_details["id"]:
+            if target_id == node_binding_details["id"]:
+                return True
+            else:
+                pass
+        else:
+            if self.minimum_required_biolink_version("1.3.0"):
+                # 'query_id'
+                if "query_id" in node_binding_details and node_binding_details["query_id"]:
+                    pass
+
+        return False
+
     def testcase_node_bindings(
             self,
+            query_nodes: Dict,
             subject_id: str,
             subject_query_id: Optional[str],
             object_id: str,
@@ -571,60 +644,63 @@ def testcase_node_bindings(
         Check if the specified subject and object identifier
         are found in the result node bindings.
         Expected query_id's are also validated.
+
+        :param query_nodes: query nodes dictionary
         :param subject_id: expected node identifier of the knowledge graph subject
         :param subject_query_id: expected bound 'query_id' if not the 'subject_id' (see TRAPI spec)
         :param object_id: expected node identifier of the knowledge graph object
         :param object_query_id: expected bound 'query_id' if not the 'object_id' (see TRAPI spec)
         :param data: the result object
-        :return:
+        :return: bool, True if node_bindings found for specified subject and object
         """
-        # Sanity check (maybe unnecessary?)
-        if "node_bindings" not in data:
-            return False
-
+        # node_bindings:
+        #   type: object
+        #   description: >-
+        #     The dictionary of Input Query Graph to Result Knowledge Graph node
+        #     bindings where the dictionary keys are the key identifiers of the
+        #     Query Graph nodes and the associated values of those keys are
+        #     instances of NodeBinding schema type. This value is an
+        #     array of NodeBindings since a given query node may have multiple
+        #     knowledge graph Node bindings in the result.
+        #
         node_bindings: Dict = data["node_bindings"]
-        subject_id_found: bool = False
-        object_id_found: bool = False
-        for node in node_bindings.values():
-            for details in node:
-                if "id" in details:
-                    if subject_id == details["id"]:
-                        subject_id_found = True
-                    if object_id == details["id"]:
-                        object_id_found = True
-                if self.minimum_required_biolink_version("1.3.0"):
-                    #
-                    # TODO: for TRAPI 1.3.0 and later, validate 'query_id' settings here,
-                    #       using the node resolution information harvested elsewhere
-                    #
-                    # query_id:
-                    #   oneOf:
-                    #     - $ref: '#/components/schemas/CURIE'
-                    #   description: >-
-                    #     An optional property to provide the CURIE in the QueryGraph to
-                    #     which this binding applies. If the bound QNode does not have
-                    #     an 'id' property or if it is empty, then this query_id MUST be
-                    #     null or absent. If the bound QNode has one or more CURIEs
-                    #     as an 'id' and this NodeBinding's 'id' refers to a QNode 'id'
-                    #     in a manner where the CURIEs are different (typically due to
-                    #     the NodeBinding.id being a descendant of a QNode.id), then
-                    #     this query_id MUST be provided. In other cases, there is no
-                    #     ambiguity, and this query_id SHOULD NOT be provided.
-                    pass
+        for q_node_id, node_bindings in node_bindings.items():
+
+            # A basic expectation is for the node_binding keys
+            # to match one of the input query node keys
+            if q_node_id not in query_nodes:
+                self.report(
+                    code="error.trapi.response.message.result.node_binding.key.missing",
+                    identifier=q_node_id
+                )
+                continue
+
+            q_node_entry: Dict = query_nodes[q_node_id]
+
+            entry: Dict
+            subject_id_found: bool = False
+            object_id_found: bool = False
+            for entry in node_bindings:
+                # The subject and object id's will match separately...
+                if self.validate_binding(q_node_entry, subject_id, subject_query_id, entry):
+                    subject_id_found = True
+                if self.validate_binding(q_node_entry, object_id, object_query_id, entry):
+                    object_id_found = True
+
+                # ... but we need both to match, to succeed
                 if subject_id_found and object_id_found:
-                    # short-cut return if and when both
-                    # subject and object are matched
+                    # Short-cut tagging of search as successful if
+                    # and when both subject and object are matched
                     return True
 
         # Either the subject_id or object_id
-        # failed to match any results: failure!
+        # failed to match any node_bindings: failure!
         return False
 
     @staticmethod
-    def testcase_edge_bindings(q_edge_ids: List[str], target_edge_id: str, data: Dict) -> bool:
+    def testcase_edge_bindings(query_edges: Dict, target_edge_id: str, data: Dict) -> bool:
         """
         Check if target query edge id and knowledge graph edge id are in specified edge_bindings.
-        :rtype: object
         :param q_edge_ids: List[str], expected query edge identifiers in a matching result
         :param target_edge_id:  str, expected knowledge edge identifier in a matching result
         :param data: TRAPI version-specific Response context from which the 'edge_bindings' may be retrieved
@@ -634,9 +710,11 @@ def testcase_edge_bindings(q_edge_ids: List[str], target_edge_id: str, data: Dic
         if "edge_bindings" not in data:
             return False
 
+        query_edge_ids = list(query_edges.keys())
+
         edge_bindings: Dict = data["edge_bindings"]
         for bound_query_id, edge in edge_bindings.items():
-            if bound_query_id in q_edge_ids:
+            if bound_query_id in query_edge_ids:
                 for binding_details in edge:
                     # TRAPI schema validation likely actually
                     # catches missing id's, but sanity check...
@@ -668,18 +746,19 @@ def testcase_result_found(
         """
         # TODO: need to implement some kind of validation of 'subject_query_id' and 'object_query_id'
         assert query_graph, "testcase_result_found() encountered an empty query graph"
-        q_edges: Dict = query_graph["edges"]
-        q_edge_ids = list(q_edges.keys())
 
         result_found: bool = False
         result: Dict
 
+        # At this point, a TRAPI Response.Message.Results
+        # is generally a non-empty list of Result objects
         for result in results:
 
             # Node binding validation still currently
-            # the same for most recent TRAPI versions
+            # the same for most recent TRAPI versions >= 1.3.0
             node_bindings_found: bool = \
                 self.testcase_node_bindings(
+                    query_graph["nodes"],
                     subject_id,
                     subject_query_id,
                     object_id,
@@ -687,7 +766,7 @@ def testcase_result_found(
                     result
                 )
 
-            # However, TRAPI 1.4.0 Message 'Results' 'edge_bindings' are reported differently
+            # However, TRAPI 1.4.0++ Message 'Results' 'edge_bindings' are reported differently
             # from 1.3.0, rather, embedded in 'Analysis' objects (and 'Auxiliary Graphs')
             edge_binding_found: bool = False
             if self.is_trapi_1_4_or_later():
@@ -749,7 +828,7 @@ def testcase_result_found(
                 # the "analysis" key is at least present and that the objects themselves are 'well-formed'
                 analyses: List = result["analyses"]
                 for analysis in analyses:
-                    edge_binding_found = self.testcase_edge_bindings(q_edge_ids, edge_id, analysis)
+                    edge_binding_found = self.testcase_edge_bindings(query_graph["edges"], edge_id, analysis)
                     if edge_binding_found:
                         break
             else:
@@ -774,12 +853,15 @@ def testcase_result_found(
                 #     }
                 # ]
                 #
-                edge_binding_found = self.testcase_edge_bindings(q_edge_ids, edge_id, result)
+                edge_binding_found = self.testcase_edge_bindings(query_graph["edges"], edge_id, result)
 
+            # We declare 'success' after the first
+            # successful nodes/edges binding matches
             if node_bindings_found and edge_binding_found:
                 result_found = True
                 break
 
+        # If nothing matches, this result could still be False
         return result_found
 
     def resolve_testcase_node(
@@ -1020,16 +1102,20 @@ def testcase_input_found_in_response(
             )
             return False
 
-        results: List = message["results"]
-        if not self.testcase_result_found(
-            query_graph,
-            edge_subject_match,
-            edge_subject_query_id_match,
-            edge_object_match,
-            edge_object_query_id_match,
-            edge_id_match,
-            results
-        ):
+        results_found: bool = False
+        # TRAPI Response.Message.Results is nullable but...then results are not found?
+        if "results" in message and message["results"]:
+            results: List = message["results"]
+            results_found = self.testcase_result_found(
+                query_graph,
+                edge_subject_match,
+                edge_subject_query_id_match,
+                edge_object_match,
+                edge_object_query_id_match,
+                edge_id_match,
+                results
+            )
+        if not results_found:
             self.report(
                 code="error.trapi.response.message.result.missing",
                 identifier=testcase_edge_id