From f92a5c2dd5f89f7aebd1e9be124def2072fdc0b3 Mon Sep 17 00:00:00 2001 From: joeflack4 Date: Sun, 10 Sep 2023 19:57:10 -0400 Subject: [PATCH] Synchronization: SubClassOf Pipeline to keep subclass relationships in sync between Mondo and sources. - Add: makefile goals: sync, sync-subclassof, reports/sync-subclassof.robot.template.tsv - Update: makefile goal: build-mondo-ingest - Add: src/scripts/sync_subclassof.py General - Bugfix: utils.py remove_angle_brackets(): Now correctly returns a str if receives a str. - Update: config/prefixes.csv: Add: New entries - Add: utils.py: get_monarch_curies_converter() --- src/ontology/config/prefixes.csv | 13 +++- src/ontology/mondo-ingest.Makefile | 15 ++++- src/scripts/sync_subclassof.py | 104 +++++++++++++++++++++++++++++ src/scripts/utils.py | 22 +++++- 4 files changed, 149 insertions(+), 5 deletions(-) create mode 100644 src/scripts/sync_subclassof.py diff --git a/src/ontology/config/prefixes.csv b/src/ontology/config/prefixes.csv index 7e78d9845..0b19e751d 100644 --- a/src/ontology/config/prefixes.csv +++ b/src/ontology/config/prefixes.csv @@ -224,8 +224,19 @@ PLANP,http://purl.obolibrary.org/obo/PLANP_ CARO,http://purl.obolibrary.org/obo/CARO_ NPO,http://purl.bioontology.org/ontology/npo#NPO_ ICD10CM,http://purl.bioontology.org/ontology/ICD10CM/ +ICD10CM2,https://icd.codes/icd10cm/ ICD10WHO,https://icd.who.int/browse10/2019/en#/ +ICD10WHO2010,http://apps.who.int/classifications/icd10/browse/2010/en#/ OMIMPS,https://www.omim.org/phenotypicSeries/PS +OMIMPS2,https://omim.org/phenotypicSeries/ OMIM,https://omim.org/entry/ Orphanet,http://www.orpha.net/ORDO/Orphanet_ -GARD,http://purl.obolibrary.org/obo/GARD_ \ No newline at end of file +GARD,http://purl.obolibrary.org/obo/GARD_ +MEDRA,http://identifiers.org/meddra/ +MEDGEN,http://identifiers.org/medgen/ +MESH,http://identifiers.org/mesh/ +SCTID,http://identifiers.org/snomedct/ +MEDGEN,http://purl.obolibrary.org/obo/MEDGEN_ +MEDGENCUI,http://purl.obolibrary.org/obo/MEDGENCUI_ 
+UMLS,http://purl.obolibrary.org/obo/UMLS_ +UMLS2,http://linkedlifedata.com/resource/umls/id/ \ No newline at end of file diff --git a/src/ontology/mondo-ingest.Makefile b/src/ontology/mondo-ingest.Makefile index 39f4fe020..83e6f8717 100644 --- a/src/ontology/mondo-ingest.Makefile +++ b/src/ontology/mondo-ingest.Makefile @@ -321,7 +321,7 @@ documentation: j2 $(ALL_DOCS) unmapped-terms-docs mapped-deprecated-terms-docs s build-mondo-ingest: $(MAKE) refresh-imports exclusions-all mondo-ingest.db slurp-all mappings matches \ mapped-deprecated-terms mapping-progress-report \ - recreate-unmapped-components documentation + recreate-unmapped-components sync documentation $(MAKE) prepare_release .PHONY: build-mondo-ingest-no-imports @@ -502,6 +502,19 @@ slurp-all-no-updates: $(foreach n,$(ALL_COMPONENT_IDS), slurp-no-updates-$(n)) .PHONY: slurp-all slurp-all: $(foreach n,$(ALL_COMPONENT_IDS), slurp-$(n)) + +############################# +###### Synchronization ###### +############################# +.PHONY: sync +sync: sync-subclassof + +.PHONY: sync-subclassof +sync-subclassof: reports/sync-subclassof.robot.template.tsv + +reports/sync-subclassof.robot.template.tsv: tmp/merged.db + python3 $(SCRIPTSDIR)/sync_subclassof.py --ontology-db-path tmp/merged.db --prefixes-csv-path config/prefixes.csv --outpath $@ + ############################# ######### Analysis ########## ############################# diff --git a/src/scripts/sync_subclassof.py b/src/scripts/sync_subclassof.py new file mode 100644 index 000000000..95d943e38 --- /dev/null +++ b/src/scripts/sync_subclassof.py @@ -0,0 +1,104 @@ +"""Create a robot template for purpose of syncing subclassof relations. 
# Resources
# - GitHub issue: https://github.com/monarch-initiative/mondo-ingest/issues/92
# - GitHub PR: https://github.com/monarch-initiative/mondo-ingest/pull/363
import os
from argparse import ArgumentParser
from pathlib import Path
from typing import Any, Dict, List, Union

import pandas as pd
from oaklib import get_adapter
from oaklib.interfaces.basic_ontology_interface import RELATIONSHIP
from oaklib.types import CURIE, URI

from src.scripts.utils import DEFAULT_PREFIXES_CSV, get_monarch_curies_converter, remove_angle_brackets

HERE = Path(os.path.abspath(os.path.dirname(__file__)))
SRC_DIR = HERE.parent
ONTOLOGY_DIR = SRC_DIR / 'ontology'
DEFAULT_OUTPATH = ONTOLOGY_DIR / 'reports' / 'sync-subclassof.robot.template.tsv'
DEFAULT_ONTOLOGY_DB_PATH = ONTOLOGY_DIR / 'tmp' / 'merged.db'
# Written as the first data row of the output, directly beneath the column names.
ROBOT_ROW = [{
    'term_id': 'ID',
    'parent_id': 'SC % SPLIT=|',
    # todo: PR comment to remind about better property for:
    'xref_ids': 'A oboInOwl:hasDbXref SPLIT=|',  # todo: Nico may provide different value
}]

# todo: oak issue: too slow iterating. 36 seconds for below snippet
#  for mondo_id in mondo_ids[0:10]:
#      rels: List[CURIE] = [x for x in oi.relationships(subjects=[mondo_id])]


# todo: add caching?
def sync_subclassof(
    outpath: Union[str, Path] = DEFAULT_OUTPATH, prefixes_csv_path: Union[str, Path] = DEFAULT_PREFIXES_CSV,
    ontology_db_path: Union[str, Path] = DEFAULT_ONTOLOGY_DB_PATH
):
    """Write a ROBOT template TSV listing, per MONDO term, its subClassOf parents and its xref / match targets.

    :param outpath: Where to write the tab-separated template.
    :param prefixes_csv_path: CSV of prefixes handed to `get_monarch_curies_converter()`; used to turn URIs into
      CURIEs where a prefix matches.
    :param ontology_db_path: Ontology database handed to `oaklib.get_adapter()`.
    """
    # Vars
    conv = get_monarch_curies_converter(prefixes_csv_path)
    oi = get_adapter(ontology_db_path)

    def _compress(ref):
        """Return the CURIE for `ref` if the converter yields one; otherwise return `ref` unchanged."""
        return conv.compress(ref) or ref  # one lookup per value instead of two

    # Get all terms
    ids_all: List[Union[CURIE, URI]] = list(oi.entities(filter_obsoletes=False))
    mondo_ids: List[CURIE] = [x for x in ids_all if x.startswith('MONDO:')]

    # todo: pr comment?: skip if parent_id not in mondo? is this only the case for:
    #  [('MONDO:0000001', 'rdfs:subClassOf', 'BFO:0000016')]
    # TODO: analyze first
    #  example: mondo:0000001 rdfs:subClassOf BFO:0000016
    # Relationships
    # todo: PR comment: how did i implement this and why. memory vs speed. timeout on all rels. 36 seconds on iter
    #  1x/time for just 10 records
    # todo: oak issue: slow even batch rels (all preds). 5 seconds for 10. 25 for 100. when passed 5 preds, 1 second
    #  for 100. 36 sec for 25k~
    rels_raw: List[RELATIONSHIP] = list(oi.relationships(
        subjects=mondo_ids,
        # todo: pr comment: correct rels? close/related too? I think for GARD we might have 1-2 related cuz unsure
        #  - and another question: are we indeed putting all of these in same column?
        #    (currently "'xref_ids': 'A oboInOwl:hasDbXref SPLIT=|'")
        predicates=['rdfs:subClassOf', 'oboInOwl:hasDbXref', 'skos:exactMatch', 'skos:broadMatch', 'skos:narrowMatch']
    ))
    rels: List[RELATIONSHIP] = [
        (sub, pred, _compress(remove_angle_brackets(obj))) for sub, pred, obj in rels_raw]

    # Create rows
    # - Collect parents / xrefs per subject
    # TODO: pr comment: want to filter out only certain ontologies? (e.g. snomed shows up but we have no ingest)
    rows_by_id: Dict[CURIE, Dict[str, Any]] = {}
    for sub, pred, obj in rels:
        row = rows_by_id.setdefault(sub, {'term_id': sub, 'parent_id': [], 'xref_ids': []})
        # Everything that is not a subClassOf assertion goes into the xref column
        row['parent_id' if pred == 'rdfs:subClassOf' else 'xref_ids'].append(obj)
    # - Convert lists to pipe-delimited strings
    rows: List[Dict[str, Any]] = [
        {col: '|'.join(val) if isinstance(val, list) else val for col, val in row.items()}
        for row in rows_by_id.values()]

    # Passing `columns` keeps `term_id` present even when no relationships came back, so the sort cannot fail
    # with a KeyError on an empty result.
    df = pd.DataFrame(rows, columns=list(ROBOT_ROW[0])).sort_values(by=['term_id'])
    template_df = pd.DataFrame(ROBOT_ROW)
    out_df = pd.concat([template_df, df]) if len(df) else template_df
    out_df.to_csv(outpath, sep='\t', index=False)


def cli():
    """Command line interface: parse the three path options and run `sync_subclassof()` with them."""
    parser = ArgumentParser(
        prog='sync-subclassof',
        description='Create a robot template for purpose of syncing subclassof relations.')
    parser.add_argument('-o', '--outpath', default=DEFAULT_OUTPATH, help='Path to output robot template.')
    parser.add_argument('-p', '--prefixes-csv-path', default=DEFAULT_PREFIXES_CSV,
                        help='Path to a CSV with prefix in first column and URI stem in second.')
    parser.add_argument('-d', '--ontology-db-path', default=DEFAULT_ONTOLOGY_DB_PATH,
                        help='Path to SemanticSQL sqlite DB to query.')
    sync_subclassof(**vars(parser.parse_args()))


if __name__ == '__main__':
    cli()
CACHE_DIR = TEMP_DIR
DEFAULT_PREFIXES_CSV = os.path.join(ONTOLOGY_DIR, 'config', 'prefixes.csv')


def get_monarch_curies_converter(from_prefixes_csv: Union[str, Path] = DEFAULT_PREFIXES_CSV) -> curies.Converter:
    """Build a `curies.Converter` from a prefixes CSV, or return the pre-defined Monarch converter.

    :param from_prefixes_csv: Path to a CSV whose header names the columns `prefix` and `base` (prefix and URI
      stem). If falsy (e.g. `None` or an empty string), `curies.get_monarch_converter()` is returned instead.
    :return: Converter created from the CSV's prefix -> URI stem mapping, or the Monarch default.
    """
    if not from_prefixes_csv:
        # https://curies.readthedocs.io/en/latest/tutorial.html#loading-a-pre-defined-context
        return curies.get_monarch_converter()
    df = pd.read_csv(from_prefixes_csv)
    # Build the mapping straight from the two columns. Formatting rows as YAML text and re-parsing them lets the
    # YAML resolver re-type tokens (a prefix spelled `NO` or `ON` turns into a boolean key, a numeric-looking one
    # into a number) and mis-parses values containing ': ' or ' #'. As with that round trip, when a prefix appears
    # on more than one row the last row wins.
    prefix_map = dict(zip(df['prefix'], df['base']))
    return curies.Converter.from_prefix_map(prefix_map)


def remove_angle_brackets(uris: Union[URI, List[URI]]) -> Union[URI, List[URI]]:
    """Remove one leading '<' and one trailing '>' from URIs, e.g.: '<https://omim.org/entry/100050>' -->
    'https://omim.org/entry/100050'

    :param uris: A single URI string, or a list of them.
    :return: A string if a string was passed in; otherwise a new list with each item stripped.
    """
    str_input = isinstance(uris, str)
    uris = [uris] if str_input else uris
    uris2 = []
    for x in uris:
        x = x[1:] if x.startswith('<') else x
        x = x[:-1] if x.endswith('>') else x
        uris2.append(x)
    # Mirror the input shape: callers passing one URI get one URI back, not a 1-item list
    return uris2[0] if str_input else uris2