Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Diverse Validation enhancements #31

Merged
merged 15 commits into from
Aug 12, 2022
Merged
Show file tree
Hide file tree
Changes from 14 commits
Commits
Show all changes
15 commits
Select commit Hold shift + click to select a range
9f2c456
validating the attribute_type_id as an association_slot (--> breaking…
RichardBruskiewich Aug 5, 2022
fc1c3ce
validating the attribute_type_id as an association_slot (--> breaking…
RichardBruskiewich Aug 5, 2022
7c7fc7a
Merge remote-tracking branch 'origin/issue-19-edge-attribute-validati…
RichardBruskiewich Aug 5, 2022
d5f2e09
Upgrade BMT; adding attribute test back in (but need to iterate)
RichardBruskiewich Aug 8, 2022
bfcfe35
Updated default testing Biolink Model to 2.4.8 and fix constructor bi…
RichardBruskiewich Aug 8, 2022
e7af6ed
Merge branch 'master' into issue-19-edge-attribute-validation
RichardBruskiewich Aug 10, 2022
e52119f
pip dependency to patch version of BMT 0.8.4 plus refactoring of biol…
RichardBruskiewich Aug 11, 2022
7cf38e1
Add input test edge check for mixin
RichardBruskiewich Aug 11, 2022
33312bc
test for abstract categories in the input edge test data
RichardBruskiewich Aug 11, 2022
78a8dfe
DRY refactoring of category validation to reduce code duplication in …
RichardBruskiewich Aug 11, 2022
86e64b0
Add unit test for attribute_type_id is_a 'biolink:association_slot'
RichardBruskiewich Aug 11, 2022
6a6cce1
Validating the attribute_type_id to 1) be a CURIE and 2) be a known p…
RichardBruskiewich Aug 11, 2022
d755218
Merge branch 'master' into validation-enhancements
RichardBruskiewich Aug 12, 2022
0c74ffc
updated tests README
RichardBruskiewich Aug 12, 2022
d80c1c4
tests README updated; simpler main.py tests for empty TRAPI message c…
RichardBruskiewich Aug 12, 2022
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
210 changes: 126 additions & 84 deletions reasoner_validator/biolink/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import logging

from bmt import Toolkit
from linkml_runtime.linkml_model import ClassDefinition

from reasoner_validator.util import SemVer, SemVerError

Expand Down Expand Up @@ -125,6 +126,21 @@ def minimum_required_biolink_version(self, version: str) -> bool:
logger.error(f"minimum_required_biolink_version() error: {str(sve)}")
return False

@staticmethod
def is_curie(s: str) -> bool:
"""
Check if a given string is a CURIE.

:param s: str, string to be validated as a CURIE
:return: bool, whether or not the given string is a CURIE
"""
# Method copied from kgx.prefix_manager.PrefixManager...
if isinstance(s, str):
m = re.match(r"^[^ <()>:]*:[^/ :]+$", s)
return bool(m)
else:
return False

def get_result(self) -> Tuple[str, Optional[List[str]]]:
"""
Get result of validation.
Expand All @@ -133,42 +149,7 @@ def get_result(self) -> Tuple[str, Optional[List[str]]]:
"""
return self.bmtk.get_model_version(), list(self.errors)

def validate_category(self, node_id: str, category: str) -> Optional[str]:
"""
Validate the category of node.

:param node_id: identifier of a concept node
:type node_id: str
:param category: of the node
:type category: str
:return: category name associated wth the category of the node
:rtype: Optional[str]
"""
if self.bmtk.is_category(category):
return self.bmtk.get_element(category).name
elif self.bmtk.is_mixin(category):
# finding mixins in the categories is OK, but we otherwise ignore them in validation
logger.info(f"\nReported Biolink Model category '{category}' resolves to a Biolink Model 'mixin'?")
else:
element = self.bmtk.get_element(category)
if element:
# got something here... hopefully just an abstract class
# but not a regular category, so we also ignore it!
# TODO: how do we better detect abstract classes from the model?
# How strict should our validation be here?
logger.info(
f"\nReported Biolink Model category '{category}' " +
"resolves to the (possibly abstract) " +
f"Biolink Model element '{element.name}'?")
else:
# Error: a truly unrecognized category?
self.report_error(
f"'{category}' for node '{node_id}' " +
"is not a recognized Biolink Model category?"
)
return None

def validate_node(self, node_id, slots: Dict[str, Any]):
def validate_graph_node(self, node_id, slots: Dict[str, Any]):
"""
Validate slot properties (mainly 'categories') of a node.

Expand All @@ -190,10 +171,11 @@ def validate_node(self, node_id, slots: Dict[str, Any]):
categories = slots["categories"]
node_prefix_mapped: bool = False
for category in categories:
category_name: str = self.validate_category(node_id, category)
if category_name:
category: Optional[ClassDefinition] = \
self.validate_category(context="Node", category=category, strict_validation=False)
if category:
possible_subject_categories = self.bmtk.get_element_by_prefix(node_id)
if category_name in possible_subject_categories:
if category.name in possible_subject_categories:
node_prefix_mapped = True
if not node_prefix_mapped:
self.report_error(
Expand Down Expand Up @@ -225,11 +207,12 @@ def validate_node(self, node_id, slots: Dict[str, Any]):
id_prefix_mapped: Dict = {identifier: False for identifier in ids}
for category in categories:
# category validation may report an error internally
category_name = self.validate_category(node_id, category)
if category_name:
category: Optional[ClassDefinition] = \
self.validate_category(context="Node", category=category, strict_validation=False)
if category:
for identifier in ids: # may be empty list if not provided...
possible_subject_categories = self.bmtk.get_element_by_prefix(identifier)
if category_name in possible_subject_categories:
if category.name in possible_subject_categories:
id_prefix_mapped[identifier] = True
unmapped_ids = [
identifier for identifier in id_prefix_mapped.keys() if not id_prefix_mapped[identifier]
Expand All @@ -253,7 +236,7 @@ def validate_node(self, node_id, slots: Dict[str, Any]):
def set_nodes(self, nodes: Set):
self.nodes.update(nodes)

def validate_edge(self, edge: Dict):
def validate_graph_edge(self, edge: Dict):
"""
Validate slot properties of a relationship ('biolink:Association') edge.

Expand Down Expand Up @@ -317,14 +300,99 @@ def validate_edge(self, edge: Dict):

if self.graph_type is TrapiGraphType.Knowledge_Graph:
if not attributes:
# TODO: not quite sure whether and how to fully validate the 'attributes' of an edge
# For now, we simply assume that *all* edges must have *some* attributes
# (at least, provenance related, but we don't explicitly test for them)
self.report_error(f"Edge '{edge_id}' has missing or empty attributes?")
else:
# TODO: attempt some deeper attribute validation here
for attribute in attributes:
attribute_type_id: str = attribute['attribute_type_id']

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is attribute.get('attribute_type_id', None) necessary here or can we assume this key exists due to prior validation?

Copy link
Collaborator Author

@RichardBruskiewich RichardBruskiewich Aug 12, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yes, good idea. I should also check for a non-None 'value' field as well.

#
# TODO: not sure if this should only be a Pytest 'warning' rather than an Pytest 'error'
#
if not self.is_curie(attribute_type_id):
self.report_error(
f"Edge '{edge_id}' attribute_type_id '{str(attribute_type_id)}' is not a CURIE?"

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would say warning is probably more beneficial as attributes_types are not always well standardized.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

At the moment, all discrepancies are reported as 'error' but Eric has made the same point, and also, some normalization of errors into an index set of error codes (with parameters) is on the near term horizon, maybe over the next few days. Afterwards, it may be useful to decide which items are absolute errors, and which could be warnings or even info items.

)
elif not self.bmtk.is_association_slot(attribute_type_id):
self.report_error(
f"Edge '{edge_id}' attribute_type_id '{str(attribute_type_id)}' " +
"not a biolink:association_slot?"
)
# if not a Biolink association_slot, at least, check if it is known to Biolink
prefix = attribute_type_id.split(":", 1)[0]
if not self.bmtk.get_element_by_prefix(prefix):
self.report_error(
f"Edge '{edge_id}' attribute_type_id '{str(attribute_type_id)}' " +
f"has a CURIE prefix namespace unknown to Biolink?"
)
else:
# TODO: do we need to validate Query Graph 'constraints' slot contents here?
pass

def validate_category(
self,
context: str,
category: Optional[str],
strict_validation: bool = True
) -> ClassDefinition:
"""

:param context: str, label for context of concept whose category is being validated, i.e. 'Subject' or 'Object'
:param category: str, CURIE of putative concept 'category'
:param strict_validation: bool, True report mixin or abstract categories as errors; Ignore otherwise if False
:return:
"""
biolink_class: Optional[ClassDefinition] = None
if category:
biolink_class = self.bmtk.get_element(category)
if biolink_class:
if biolink_class.deprecated:
self.report_error(
f"{context} Biolink class '{category}' is deprecated: {biolink_class.deprecated}?"
)
biolink_class = None
elif biolink_class.abstract:
if strict_validation:
self.report_error(
f"{context} Biolink class '{category}' is abstract, not a concrete category?"
)
else:
logger.info(f"{context} Biolink class '{category}' is abstract. Ignored in this context.")
biolink_class = None
elif self.bmtk.is_mixin(category):
# A mixin cannot be instantiated so it should not be given as an input concept category
if strict_validation:
self.report_error(
f"{context} identifier '{category}' designates a mixin, not a concrete category?"
)
else:
logger.info(f"{context} Biolink class '{category}' is a 'mixin'. Ignored in this context.")
biolink_class = None
elif not self.bmtk.is_category(category):
self.report_error(f"{context} identifier '{category}' is not a valid Biolink category?")
biolink_class = None
else:
self.report_error(f"{context} Biolink class '{category}' is unknown?")
else:
self.report_error(f"{context} category identifier is missing?")

return biolink_class

def validate_input_node(self, context: str, category: Optional[str], identifier: Optional[str]):

biolink_class: Optional[ClassDefinition] = self.validate_category(f"Input {context}", category)

if identifier:
if biolink_class:
possible_subject_categories = self.bmtk.get_element_by_prefix(identifier)
if biolink_class.name not in possible_subject_categories:
err_msg = f"Namespace prefix of input {context} identifier '{identifier}' is unmapped to '{category}'?"
self.report_error(err_msg)
# else, we will have already reported an error in validate_category()
else:
self.report_error(f"Input {context} identifier is missing?")

def check_biolink_model_compliance_of_input_edge(self, edge: Dict[str, str]) -> Tuple[str, Optional[List[str]]]:
"""
Validate a templated test input edge contents against the current BMT Biolink Model release.
Expand All @@ -351,51 +419,25 @@ def check_biolink_model_compliance_of_input_edge(self, edge: Dict[str, str]) ->
subject_curie = edge['subject'] if 'subject' in edge else None
object_curie = edge['object'] if 'object' in edge else None

if subject_category_curie and self.bmtk.is_category(subject_category_curie):
subject_category_name = self.bmtk.get_element(subject_category_curie).name
else:
err_msg = f"'subject' category "
err_msg += f"'{subject_category_curie}' is unknown?" if subject_category_curie else "is missing?"
self.report_error(err_msg)
subject_category_name = None

if object_category_curie and self.bmtk.is_category(object_category_curie):
object_category_name = self.bmtk.get_element(object_category_curie).name
else:
err_msg = f"'object' category "
err_msg += f"'{object_category_curie}' is unknown?" if object_category_curie else "is missing?"
self.report_error(err_msg)
object_category_name = None
self.validate_input_node(
context='subject',
category=subject_category_curie,
identifier=subject_curie
)

if not (predicate_curie and self.bmtk.is_predicate(predicate_curie)):
err_msg = f"predicate "
err_msg = f"Input predicate "
err_msg += f"'{predicate_curie}' is unknown?" if predicate_curie else "is missing?"
self.report_error(err_msg)
elif self.minimum_required_biolink_version("2.2.0") and \
not self.bmtk.is_translator_canonical_predicate(predicate_curie):
self.report_error(f"predicate '{predicate_curie}' is non-canonical?")

if subject_curie:
if subject_category_name:
possible_subject_categories = self.bmtk.get_element_by_prefix(subject_curie)
if subject_category_name not in possible_subject_categories:
err_msg = f"namespace prefix of 'subject' identifier '{subject_curie}' " +\
f"is unmapped to '{subject_category_curie}'?"
self.report_error(err_msg)
else:
err_msg = "'subject' is missing?"
self.report_error(err_msg)
self.report_error(f"Input predicate '{predicate_curie}' is non-canonical?")

if object_curie:
if object_category_name:
possible_object_categories = self.bmtk.get_element_by_prefix(object_curie)
if object_category_name not in possible_object_categories:
err_msg = f"namespace prefix of 'object' identifier '{object_curie}' " +\
f"is unmapped to '{object_category_curie}'?"
self.report_error(err_msg)
else:
err_msg = "'object' is missing?"
self.report_error(err_msg)
self.validate_input_node(
context='object',
category=object_category_curie,
identifier=object_curie
)

return self.get_result()

Expand Down Expand Up @@ -435,7 +477,7 @@ def check_biolink_model_compliance(self, graph: Dict) -> Tuple[str, Optional[Lis
if nodes:
for node_id, details in nodes.items():

self.validate_node(node_id, details)
self.validate_graph_node(node_id, details)

nodes_seen += 1
if nodes_seen >= _MAX_TEST_NODES:
Expand All @@ -449,7 +491,7 @@ def check_biolink_model_compliance(self, graph: Dict) -> Tuple[str, Optional[Lis
for edge in edges.values():

# print(f"{str(edge)}", flush=True)
self.validate_edge(edge)
self.validate_graph_edge(edge)

edges_seen += 1
if edges_seen >= _MAX_TEST_EDGES:
Expand Down
3 changes: 2 additions & 1 deletion requirements-service.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,5 @@ linkml-runtime>=1.3.1
linkml>=1.3.2
prefixcommons==0.1.11
tomli<2.0.0,>=0.2.6
bmt==0.8.4
# bmt>=0.8.10
git+https://github.com/biolink/biolink-model-toolkit.git@v0.8.4-patch-release#egg=bmt
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,5 @@ linkml-runtime>=1.3.1
linkml>=1.3.2
prefixcommons==0.1.11
tomli<2.0.0,>=0.2.6
bmt==0.8.4
# bmt>=0.8.10
git+https://github.com/biolink/biolink-model-toolkit.git@v0.8.4-patch-release#egg=bmt
1 change: 1 addition & 0 deletions tests/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
- [test_biolink_compliance_validation.py](translator/biolink/test_biolink_compliance_validation.py) Version-specific Biolink Model semantic compliance test harness unit tests:
- **test_set_default_biolink_versioned_global_environment:** testing default Biolink Model release
- **test_set_specific_biolink_versioned_global_environment:** testing specific Biolink Model release
- **test_minimum_required_biolink_version:** testing minimal threshold Biolink Model release
- **test_check_biolink_model_compliance_of_input_edge:** test of KP data template test input edges validation
- **test_check_biolink_model_compliance_of_query_graph:** test of TRAPI output query graph validation
- **test_check_biolink_model_compliance_of_knowledge_graph:** test of TRAPI output knowledge graphs validation
Loading