Skip to content

Commit

Permalink
Synchronization: SubClassOf
Browse files Browse the repository at this point in the history
Pipeline to keep subclass relationships in sync between Mondo and sources.
- Add: makefile goals: sync, sync-subclassof, reports/sync-subclassof.robot.template.tsv
- Update: makefile goal: build-mondo-ingest
- Add: src/scripts/sync_subclassof.py

General
- Bugfix: utils.py remove_angle_brackets(): Now correctly returns a str when it receives a str.
  • Loading branch information
joeflack4 committed Sep 16, 2023
1 parent f4590dc commit 763e959
Show file tree
Hide file tree
Showing 3 changed files with 128 additions and 4 deletions.
15 changes: 14 additions & 1 deletion src/ontology/mondo-ingest.Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -321,7 +321,7 @@ documentation: j2 $(ALL_DOCS) unmapped-terms-docs mapped-deprecated-terms-docs s
# Full build: runs the goals listed below through a sub-make, then runs prepare_release in a second sub-make.
# NOTE(review): this is a diff hunk; the two "recreate-unmapped-components ..." lines are presumably the old and new
# versions of the same line (the new one adds the `sync` goal) - confirm against the actual Makefile.
build-mondo-ingest:
$(MAKE) refresh-imports exclusions-all mondo-ingest.db slurp-all mappings matches \
mapped-deprecated-terms mapping-progress-report \
recreate-unmapped-components documentation
recreate-unmapped-components sync documentation
$(MAKE) prepare_release

.PHONY: build-mondo-ingest-no-imports
Expand Down Expand Up @@ -502,6 +502,19 @@ slurp-all-no-updates: $(foreach n,$(ALL_COMPONENT_IDS), slurp-no-updates-$(n))
# Aggregate goal: depends on slurp-<id> for every id in ALL_COMPONENT_IDS.
.PHONY: slurp-all
slurp-all: $(foreach n,$(ALL_COMPONENT_IDS), slurp-$(n))


#############################
###### Synchronization ######
#############################
# Umbrella goal for all synchronization steps; currently its only prerequisite is sync-subclassof.
.PHONY: sync
sync: sync-subclassof

# Convenience alias for building the subClassOf synchronization ROBOT template.
.PHONY: sync-subclassof
sync-subclassof: reports/sync-subclassof.robot.template.tsv

# Builds the template by running sync_subclassof.py against tmp/merged.db; $@ expands to this target's path.
reports/sync-subclassof.robot.template.tsv: tmp/merged.db
python3 $(SCRIPTSDIR)/sync_subclassof.py --ontology-db-path tmp/merged.db --outpath $@

#############################
######### Analysis ##########
#############################
Expand Down
110 changes: 110 additions & 0 deletions src/scripts/sync_subclassof.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
"""Create a robot template for purpose of syncing subclassof relations.
Resources
- GitHub issue: https://github.com/monarch-initiative/mondo-ingest/issues/92
- GitHub PR: https://github.com/monarch-initiative/mondo-ingest/pull/363
"""
import os
from argparse import ArgumentParser
from pathlib import Path
from typing import Any, Dict, List, Union

# todo: clear unused imports
import curies
import pandas as pd
from oaklib import get_adapter
from oaklib.interfaces.basic_ontology_interface import RELATIONSHIP
from oaklib.types import CURIE, URI

from src.scripts.utils import remove_angle_brackets

# from sssom.util import is_curie

# Paths, resolved relative to this script's own location (<SRC_DIR>/scripts/)
HERE = Path(os.path.abspath(os.path.dirname(__file__)))
SRC_DIR = HERE.parent
ONTOLOGY_DIR = SRC_DIR / 'ontology'
DEFAULT_OUTPATH = ONTOLOGY_DIR / 'reports' / 'sync-subclassof.robot.template.tsv'
DEFAULT_ONTOLOGY_DB_PATH = ONTOLOGY_DIR / 'tmp' / 'merged.db'
# Header row of the ROBOT template: maps each output column name to its ROBOT template string.
ROBOT_ROW = {
    'term_id': 'ID',
    'parent_id': 'SC % SPLIT=|',
    # todo: PR comment to remind about better property for:
    'xref_ids': 'A oboInOwl:hasDbXref SPLIT=|',  # todo: Nico may provide different value
}
# https://curies.readthedocs.io/en/latest/tutorial.html#loading-a-pre-defined-context
CONV = curies.get_monarch_converter()


def compress(x: str) -> str:
    """Return the result of CONV.compress(x), or `x` unchanged when that result is falsy.

    :param x: The identifier (URI or CURIE) to compress.
    :return: The compressed form if the converter produced one; otherwise the input as-is.
    """
    # Call the converter once per value; this runs for every relationship object.
    compressed = CONV.compress(x)
    return compressed if compressed else x
# todo: oak issue: too slow iterating. 36 seconds for below snippet
# for mondo_id in mondo_ids[0:10]:
# rels: List[CURIE] = [x for x in oi.relationships(subjects=[mondo_id])]

# todo: add caching?
def sync_subclassof(
    outpath: Union[str, Path] = DEFAULT_OUTPATH, ontology_db_path: Union[str, Path] = DEFAULT_ONTOLOGY_DB_PATH
) -> None:
    """Create a ROBOT template for syncing subClassOf relations and write it to `outpath`.

    For every entity whose ID starts with 'MONDO:', collects the objects of its rdfs:subClassOf relationships (column
    'parent_id') and the objects of its xref / SKOS match relationships (column 'xref_ids'). Multiple values in one
    cell are de-duplicated and joined by '|', matching the 'SPLIT=|' declared in ROBOT_ROW. The first row of the
    output is ROBOT_ROW itself, i.e. the ROBOT template header. Only terms having at least one such relationship get
    a row.

    :param outpath: Path to write the tab-separated ROBOT template to.
    :param ontology_db_path: Path to the SemanticSQL sqlite DB to query; passed as-is to oaklib's get_adapter().
    """
    oi = get_adapter(ontology_db_path)

    # Get all Mondo terms; filter_obsoletes=False so that obsolete entities are not filtered out.
    mondo_ids: List[CURIE] = [x for x in oi.entities(filter_obsoletes=False) if x.startswith('MONDO:')]

    # Relationships
    # Queried in a single batch restricted to the needed predicates (memory vs speed trade-off). Per the author's
    # timings: iterating 1 subject at a time took ~36 seconds for just 10 terms; batch with all predicates took
    # ~25 seconds for 100 terms; batch with these 5 predicates took ~36 seconds for all ~25k terms.
    # todo: oak issue: report the slowness described above
    # todo: pr comment: correct rels? close/related too? I think for GARD we might have 1-2 related cuz unsure
    #  - and another question: are we indeed putting all of these in same column? (see ROBOT_ROW['xref_ids'])
    # todo: pr comment?: skip if parent_id not in mondo? e.g. MONDO:0000001 rdfs:subClassOf BFO:0000016
    # TODO: pr comment: want to filter out only certain ontologies? (e.g. snomed shows up but we have no ingest)
    # TODO: report namespaces that could not be compressed to CURIEs (objects still starting with 'http')
    #  - can use a custom prefix_map if he wants instead of curies.get_monarch_converter()
    #  - are URIs OK?
    # Guard: with no Mondo terms, skip the query; an empty subject list may be treated as "no filter" by the adapter.
    rels_raw: List[RELATIONSHIP] = list(oi.relationships(
        subjects=mondo_ids,
        predicates=['rdfs:subClassOf', 'oboInOwl:hasDbXref', 'skos:exactMatch', 'skos:broadMatch', 'skos:narrowMatch']
    )) if mondo_ids else []

    # Group relationship objects by subject term
    rows_by_id: Dict[CURIE, Dict[str, Any]] = {}
    for sub, pred, obj_raw in rels_raw:
        obj = compress(remove_angle_brackets(obj_raw))
        row = rows_by_id.setdefault(sub, {'term_id': sub, 'parent_id': [], 'xref_ids': []})
        row['parent_id' if pred == 'rdfs:subClassOf' else 'xref_ids'].append(obj)

    # Convert lists to '|'-delimited strings. dict.fromkeys() drops duplicates while keeping first-seen order; the
    # same object can show up under more than one of the xref / match predicates.
    rows: List[Dict[str, str]] = [
        {
            'term_id': row['term_id'],
            'parent_id': '|'.join(dict.fromkeys(row['parent_id'])),
            'xref_ids': '|'.join(dict.fromkeys(row['xref_ids'])),
        }
        for row in rows_by_id.values()
    ]

    # Passing `columns` keeps 'term_id' available for sorting even when there are no rows.
    df = pd.DataFrame(rows, columns=list(ROBOT_ROW)).sort_values(by=['term_id'])
    # ROBOT_ROW is wrapped in a list: a dict of scalars alone is rejected by the DataFrame constructor.
    out_df = pd.concat([pd.DataFrame([ROBOT_ROW]), df])
    out_df.to_csv(outpath, sep='\t', index=False)


def cli():
    """Parse the command line arguments and run sync_subclassof() with them."""
    arg_parser = ArgumentParser(
        prog='sync-subclassof',
        description='Create a robot template for purpose of syncing subclassof relations.')
    arg_parser.add_argument(
        '-o', '--outpath',
        default=DEFAULT_OUTPATH,
        help='Path to output robot template.')
    arg_parser.add_argument(
        '-d', '--ontology-db-path',
        default=DEFAULT_ONTOLOGY_DB_PATH,
        help='Path to SemanticSQL sqlite DB to query.')
    parsed = arg_parser.parse_args()
    # argparse derives the attribute names from the long flags: outpath, ontology_db_path
    sync_subclassof(outpath=parsed.outpath, ontology_db_path=parsed.ontology_db_path)


# Run the CLI only when this file is executed as a script.
if __name__ == '__main__':
    cli()
7 changes: 4 additions & 3 deletions src/scripts/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,16 +183,17 @@ def _get_next_available_mondo_id(min_id: int, max_id: int, mondo_ids: Set[int])
return next_id, mondo_ids


def remove_angle_brackets(uris: Union[URI, List[URI]]) -> Union[URI, List[URI]]:
    """Remove angle brackets from URIs, e.g.:
    <https://omim.org/entry/100050> --> https://omim.org/entry/100050

    A leading '<' and a trailing '>' are each stripped independently, so values without brackets pass through
    unchanged.

    :param uris: A single URI string, or a list of URI strings.
    :return: A str if a str was passed in; otherwise a list with one cleaned entry per input entry.
    """
    # Remember the input shape so that the same shape can be returned
    str_input = isinstance(uris, str)
    uris2 = []
    for x in ([uris] if str_input else uris):
        x = x[1:] if x.startswith('<') else x
        x = x[:-1] if x.endswith('>') else x
        uris2.append(x)
    return uris2[0] if str_input else uris2


def get_mondo_term_ids(mondo_terms_path: str, slurp_id_map: Dict[str, str]) -> Set[int]:
Expand Down

0 comments on commit 763e959

Please sign in to comment.