From f92a5c2dd5f89f7aebd1e9be124def2072fdc0b3 Mon Sep 17 00:00:00 2001
From: joeflack4 <joeflack4@gmail.com>
Date: Sun, 10 Sep 2023 19:57:10 -0400
Subject: [PATCH] Synchronization: SubClassOf Pipeline to keep subclass
 relationships in sync between Mondo and sources. - Add: makefile goals: sync,
 sync-subclassof, reports/sync-subclassof.robot.template.tsv - Update:
 makefile goal: build-mondo-ingest - Add: src/scripts/sync_subclassof.py

General
- Bugfix: utils.py remove_angle_brackets(): Now correctly returns a str when it receives a str.
- Update: config/prefixes.csv: Add: New entries
- Add: utils.py: get_monarch_curies_converter()
---
 src/ontology/config/prefixes.csv   |  13 +++-
 src/ontology/mondo-ingest.Makefile |  15 ++++-
 src/scripts/sync_subclassof.py     | 104 +++++++++++++++++++++++++++++
 src/scripts/utils.py               |  22 +++++-
 4 files changed, 149 insertions(+), 5 deletions(-)
 create mode 100644 src/scripts/sync_subclassof.py

diff --git a/src/ontology/config/prefixes.csv b/src/ontology/config/prefixes.csv
index 7e78d9845..0b19e751d 100644
--- a/src/ontology/config/prefixes.csv
+++ b/src/ontology/config/prefixes.csv
@@ -224,8 +224,19 @@ PLANP,http://purl.obolibrary.org/obo/PLANP_
 CARO,http://purl.obolibrary.org/obo/CARO_
 NPO,http://purl.bioontology.org/ontology/npo#NPO_
 ICD10CM,http://purl.bioontology.org/ontology/ICD10CM/
+ICD10CM2,https://icd.codes/icd10cm/
 ICD10WHO,https://icd.who.int/browse10/2019/en#/
+ICD10WHO2010,http://apps.who.int/classifications/icd10/browse/2010/en#/
 OMIMPS,https://www.omim.org/phenotypicSeries/PS
+OMIMPS2,https://omim.org/phenotypicSeries/
 OMIM,https://omim.org/entry/
 Orphanet,http://www.orpha.net/ORDO/Orphanet_
-GARD,http://purl.obolibrary.org/obo/GARD_
\ No newline at end of file
+GARD,http://purl.obolibrary.org/obo/GARD_
+MEDRA,http://identifiers.org/meddra/
+MEDGEN,http://identifiers.org/medgen/
+MESH,http://identifiers.org/mesh/
+SCTID,http://identifiers.org/snomedct/
+MEDGEN,http://purl.obolibrary.org/obo/MEDGEN_
+MEDGENCUI,http://purl.obolibrary.org/obo/MEDGENCUI_
+UMLS,http://purl.obolibrary.org/obo/UMLS_
+UMLS2,http://linkedlifedata.com/resource/umls/id/
\ No newline at end of file
diff --git a/src/ontology/mondo-ingest.Makefile b/src/ontology/mondo-ingest.Makefile
index 39f4fe020..83e6f8717 100644
--- a/src/ontology/mondo-ingest.Makefile
+++ b/src/ontology/mondo-ingest.Makefile
@@ -321,7 +321,7 @@ documentation: j2 $(ALL_DOCS) unmapped-terms-docs mapped-deprecated-terms-docs s
 build-mondo-ingest:
 	$(MAKE) refresh-imports exclusions-all mondo-ingest.db slurp-all mappings matches \
 		mapped-deprecated-terms mapping-progress-report \
-		recreate-unmapped-components documentation
+		recreate-unmapped-components sync documentation
 	$(MAKE) prepare_release
 
 .PHONY: build-mondo-ingest-no-imports
@@ -502,6 +502,19 @@ slurp-all-no-updates: $(foreach n,$(ALL_COMPONENT_IDS), slurp-no-updates-$(n))
 .PHONY: slurp-all
 slurp-all: $(foreach n,$(ALL_COMPONENT_IDS), slurp-$(n))
 
+
+#############################
+###### Synchronization ######
+#############################
+.PHONY: sync
+sync: sync-subclassof
+
+.PHONY: sync-subclassof
+sync-subclassof: reports/sync-subclassof.robot.template.tsv
+# The script reads both the merged DB and the prefix map, so a change to either one must trigger a rebuild.
+reports/sync-subclassof.robot.template.tsv: tmp/merged.db config/prefixes.csv
+	python3 $(SCRIPTSDIR)/sync_subclassof.py --ontology-db-path tmp/merged.db --prefixes-csv-path config/prefixes.csv --outpath $@
+
 #############################
 ######### Analysis ##########
 #############################
diff --git a/src/scripts/sync_subclassof.py b/src/scripts/sync_subclassof.py
new file mode 100644
index 000000000..95d943e38
--- /dev/null
+++ b/src/scripts/sync_subclassof.py
@@ -0,0 +1,104 @@
+"""Create a robot template for purpose of syncing subclassof relations.
+
+Resources
+- GitHub issue: https://github.com/monarch-initiative/mondo-ingest/issues/92
+- GitHub PR: https://github.com/monarch-initiative/mondo-ingest/pull/363
+"""
+import os
+from argparse import ArgumentParser
+from pathlib import Path
+from typing import Any, Dict, List, Union
+
+import pandas as pd
+from oaklib import get_adapter
+from oaklib.interfaces.basic_ontology_interface import RELATIONSHIP
+from oaklib.types import CURIE, URI
+
+from src.scripts.utils import DEFAULT_PREFIXES_CSV, get_monarch_curies_converter, remove_angle_brackets
+
+HERE = Path(os.path.abspath(os.path.dirname(__file__)))  # directory containing this script
+SRC_DIR = HERE.parent
+ONTOLOGY_DIR = SRC_DIR / 'ontology'
+DEFAULT_OUTPATH = ONTOLOGY_DIR / 'reports' / 'sync-subclassof.robot.template.tsv'
+DEFAULT_ONTOLOGY_DB_PATH = ONTOLOGY_DIR / 'tmp' / 'merged.db'
+ROBOT_ROW = [{  # written as the first row beneath the column header of the output template
+    'term_id': 'ID',
+    'parent_id': 'SC % SPLIT=|',
+    # todo: PR comment: remind about finding a better property for 'xref_ids':
+    'xref_ids': 'A oboInOwl:hasDbXref SPLIT=|',  # todo: Nico may provide different value
+}]
+
+# todo: oak issue: iterating one subject at a time is too slow; the snippet below took 36 seconds:
+# for mondo_id in mondo_ids[0:10]:
+#     rels: List[CURIE] = [x for x in oi.relationships(subjects=[mondo_id])]
+
+# todo: add caching?
+def sync_subclassof(
+    outpath: Union[str, Path] = DEFAULT_OUTPATH, prefixes_csv_path: Union[str, Path] = DEFAULT_PREFIXES_CSV,
+    ontology_db_path: Union[str, Path] = DEFAULT_ONTOLOGY_DB_PATH
+):
+    """Write a ROBOT template TSV listing the parents and cross-references of every Mondo term.
+
+    :param outpath: Path to write the template to (tab-separated; `ROBOT_ROW` is the first row after the header).
+    :param prefixes_csv_path: Path to a CSV with prefix in first column and URI stem in second.
+    :param ontology_db_path: Path to SemanticSQL sqlite DB to query."""
+    # Vars
+    conv = get_monarch_curies_converter(prefixes_csv_path)
+    oi = get_adapter(ontology_db_path)
+
+    def compress(uri: Union[CURIE, URI]) -> Union[CURIE, URI]:
+        """Compress a URI to a CURIE, falling back to the input when the converter returns nothing."""
+        return conv.compress(uri) or uri
+
+    # Get all Mondo terms, obsoletes included
+    mondo_ids: List[CURIE] = [x for x in oi.entities(filter_obsoletes=False) if x.startswith('MONDO:')]
+
+    # Relationships
+    # todo: pr comment?: skip if parent_id not in mondo? is this only the case for: [('MONDO:0000001', 'rdfs:subClassOf', 'BFO:0000016')]
+    #  TODO: analyze first
+    # todo: PR comment: how did i implement this and why. memory vs speed. timeout on all rels. 36 seconds on iter 1x/time for just 10 records
+    # todo: oak issue: slow even batch rels (all preds). 5 seconds for 10. 25 for 100. when passed 5 preds, 1 second for 100. 36 sec for 25k~
+    # todo: pr comment: correct rels? close/related too? I think for GARD we might have 1-2 related cuz unsure
+    #  - and another question: are we indeed putting all of these in same column? (currently "'xref_ids': 'A oboInOwl:hasDbXref SPLIT=|'")
+    rels_raw: List[RELATIONSHIP] = list(oi.relationships(
+        subjects=mondo_ids,
+        predicates=['rdfs:subClassOf', 'oboInOwl:hasDbXref', 'skos:exactMatch', 'skos:broadMatch', 'skos:narrowMatch']
+    ))
+
+    # Create rows
+    # - Collect parents and xrefs per term. Dict keys are used as ordered sets: the same target may arrive via
+    #   several predicates, or as both a URI and a CURIE, and should be listed only once.
+    # TODO: pr comment: want to filter out only certain ontologies? (e.g. snomed shows up but we have no ingest)
+    rows_by_id: Dict[CURIE, Dict[str, Any]] = {}
+    for sub, pred, obj_raw in rels_raw:
+        obj = compress(remove_angle_brackets(obj_raw))
+        row = rows_by_id.setdefault(sub, {'term_id': sub, 'parent_id': {}, 'xref_ids': {}})
+        row['parent_id' if pred == 'rdfs:subClassOf' else 'xref_ids'][obj] = None
+    # - Convert collections to pipe-delimited strings
+    rows: List[Dict[str, str]] = [
+        {'term_id': row['term_id'], 'parent_id': '|'.join(row['parent_id']), 'xref_ids': '|'.join(row['xref_ids'])}
+        for row in rows_by_id.values()
+    ]
+
+    # Write
+    # - Explicit columns, so that an empty result yields a template with only `ROBOT_ROW` instead of a KeyError
+    df = pd.DataFrame(rows, columns=list(ROBOT_ROW[0])).sort_values(by=['term_id'])
+    out_df = pd.concat([pd.DataFrame(ROBOT_ROW), df])
+    out_df.to_csv(outpath, sep='\t', index=False)
+
+
+def cli():
+    """Parse command line arguments and pass them to `sync_subclassof()` as keyword arguments."""
+    options = (
+        ('-o', '--outpath', DEFAULT_OUTPATH, 'Path to output robot template.'),
+        ('-p', '--prefixes-csv-path', DEFAULT_PREFIXES_CSV, 'Path to a CSV with prefix in first column and URI stem in second.'),
+        ('-d', '--ontology-db-path', DEFAULT_ONTOLOGY_DB_PATH, 'Path to SemanticSQL sqlite DB to query.'),
+    )
+    parser = ArgumentParser(prog='sync-subclassof', description='Create a robot template for purpose of syncing subclassof relations.')
+    for short_flag, long_flag, default, help_text in options:
+        parser.add_argument(short_flag, long_flag, default=default, help=help_text)
+    sync_subclassof(**vars(parser.parse_args()))
+
+
+if __name__ == '__main__':
+    cli()
diff --git a/src/scripts/utils.py b/src/scripts/utils.py
index 42fb80d48..10369981a 100644
--- a/src/scripts/utils.py
+++ b/src/scripts/utils.py
@@ -4,10 +4,12 @@
 import subprocess
 import sys
 from datetime import datetime
+from pathlib import Path
 from typing import Dict, List, Set, Union
 
 import curies
 import pandas as pd
+import yaml
 from jinja2 import Template
 from oaklib import OntologyResource
 from oaklib.implementations import ProntoImplementation, SqlImplementation
@@ -27,6 +29,7 @@
 TEMP_DIR = os.path.join(ONTOLOGY_DIR, 'tmp')
 DOCS_DIR = os.path.join(PROJECT_DIR, 'docs')
 CACHE_DIR = TEMP_DIR
+DEFAULT_PREFIXES_CSV = os.path.join(ONTOLOGY_DIR, 'config', 'prefixes.csv')
 
 
 # todo: there are remaining todo's in this class
@@ -167,6 +170,18 @@ def _load_ontology(ontology_path: str, use_cache=False) -> ProntoImplementation:
     return ontology
 
 
+def get_monarch_curies_converter(from_prefixes_csv: Union[str, Path] = DEFAULT_PREFIXES_CSV) -> curies.Converter:
+    """Get a converter between CURIEs and URIs.
+    :param from_prefixes_csv: CSV with `prefix` and `base` (URI stem) columns. If falsy, use the predefined context."""
+    if not from_prefixes_csv:
+        # https://curies.readthedocs.io/en/latest/tutorial.html#loading-a-pre-defined-context
+        return curies.get_monarch_converter()
+    # Read every cell as a literal string: no NaN inference (e.g. a prefix "NA") and no YAML type coercion (e.g. "NO")
+    df = pd.read_csv(from_prefixes_csv, dtype=str, keep_default_na=False)
+    prefix_map = {p.strip(): b.strip() for p, b in zip(df['prefix'], df['base']) if p.strip() and b.strip()}
+    return curies.Converter.from_prefix_map(prefix_map)
+
+
 def _get_next_available_mondo_id(min_id: int, max_id: int, mondo_ids: Set[int]) -> (int, Set[int]):
     """Starting from `min_id`, count up and check until finding the next ID.
 
@@ -183,16 +198,17 @@ def _get_next_available_mondo_id(min_id: int, max_id: int, mondo_ids: Set[int])
     return next_id, mondo_ids
 
 
-def remove_angle_brackets(uris: Union[URI, List[URI]]):
+def remove_angle_brackets(uris: Union[URI, List[URI]]) -> Union[URI, List[URI]]:
     """Remove angle brackets from URIs, e.g.:
     <https://omim.org/entry/100050> --> https://omim.org/entry/100050"""
-    uris = [uris] if isinstance(uris, str) else uris
+    str_input = isinstance(uris, str)  # remember the input kind, so that a str in gives a str out
+    uris = [uris] if str_input else uris  # wrap a single str so the loop below handles both cases
     uris2 = []
     for x in uris:
         x = x[1:] if x.startswith('<') else x
         x = x[:-1] if x.endswith('>') else x
         uris2.append(x)
-    return uris2
+    return uris2[0] if str_input else uris2  # unwrap the single result when a str was passed in
 
 
 def get_mondo_term_ids(mondo_terms_path: str, slurp_id_map: Dict[str, str]) -> Set[int]: