Skip to content

Commit

Permalink
Synchronization: SubClassOf
Browse files Browse the repository at this point in the history
Pipeline to keep subclass relationships in sync between Mondo and sources.
- Add: makefile goals: sync, sync-subclassof, reports/sync-subclassof.robot.template.tsv
- Update: makefile goal: build-mondo-ingest
- Add: src/scripts/sync_subclassof.py

General
- Bugfix: utils.py remove_angle_brackets(): Now correctly returns a str when it receives a str.
- Update: config/prefixes.csv: Add: New entries
- Add: utils.py: get_monarch_curies_converter()
  • Loading branch information
joeflack4 committed Sep 16, 2023
1 parent f4590dc commit f92a5c2
Show file tree
Hide file tree
Showing 4 changed files with 149 additions and 5 deletions.
13 changes: 12 additions & 1 deletion src/ontology/config/prefixes.csv
Original file line number Diff line number Diff line change
Expand Up @@ -224,8 +224,19 @@ PLANP,http://purl.obolibrary.org/obo/PLANP_
CARO,http://purl.obolibrary.org/obo/CARO_
NPO,http://purl.bioontology.org/ontology/npo#NPO_
ICD10CM,http://purl.bioontology.org/ontology/ICD10CM/
ICD10CM2,https://icd.codes/icd10cm/
ICD10WHO,https://icd.who.int/browse10/2019/en#/
ICD10WHO2010,http://apps.who.int/classifications/icd10/browse/2010/en#/
OMIMPS,https://www.omim.org/phenotypicSeries/PS
OMIMPS2,https://omim.org/phenotypicSeries/
OMIM,https://omim.org/entry/
Orphanet,http://www.orpha.net/ORDO/Orphanet_
GARD,http://purl.obolibrary.org/obo/GARD_
GARD,http://purl.obolibrary.org/obo/GARD_
MEDRA,http://identifiers.org/meddra/
MEDGEN,http://identifiers.org/medgen/
MESH,http://identifiers.org/mesh/
SCTID,http://identifiers.org/snomedct/
MEDGEN,http://purl.obolibrary.org/obo/MEDGEN_
MEDGENCUI,http://purl.obolibrary.org/obo/MEDGENCUI_
UMLS,http://purl.obolibrary.org/obo/UMLS_
UMLS2,http://linkedlifedata.com/resource/umls/id/
15 changes: 14 additions & 1 deletion src/ontology/mondo-ingest.Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -321,7 +321,7 @@ documentation: j2 $(ALL_DOCS) unmapped-terms-docs mapped-deprecated-terms-docs s
# Full build: run every ingest/report goal (including the synchronization step, `sync`) in one
# sub-make invocation, then prepare the release in a second one.
build-mondo-ingest:
	$(MAKE) refresh-imports exclusions-all mondo-ingest.db slurp-all mappings matches \
		mapped-deprecated-terms mapping-progress-report \
		recreate-unmapped-components sync documentation
	$(MAKE) prepare_release

.PHONY: build-mondo-ingest-no-imports
Expand Down Expand Up @@ -502,6 +502,19 @@ slurp-all-no-updates: $(foreach n,$(ALL_COMPONENT_IDS), slurp-no-updates-$(n))
.PHONY: slurp-all
slurp-all: $(foreach n,$(ALL_COMPONENT_IDS), slurp-$(n))


#############################
###### Synchronization ######
#############################
# Umbrella goal for all synchronization steps; currently only the subClassOf sync.
.PHONY: sync
sync: sync-subclassof

# Convenience alias for building the subClassOf sync ROBOT template.
.PHONY: sync-subclassof
sync-subclassof: reports/sync-subclassof.robot.template.tsv

# Generate the ROBOT template from the merged SemanticSQL database.
# The recipe line below must start with a TAB character.
reports/sync-subclassof.robot.template.tsv: tmp/merged.db
	python3 $(SCRIPTSDIR)/sync_subclassof.py --ontology-db-path tmp/merged.db --prefixes-csv-path config/prefixes.csv --outpath $@

#############################
######### Analysis ##########
#############################
Expand Down
104 changes: 104 additions & 0 deletions src/scripts/sync_subclassof.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
"""Create a robot template for purpose of syncing subclassof relations.
Resources
- GitHub issue: https://github.com/monarch-initiative/mondo-ingest/issues/92
- GitHub PR: https://github.com/monarch-initiative/mondo-ingest/pull/363
"""
import os
from argparse import ArgumentParser
from pathlib import Path
from typing import Any, Dict, List, Union

import pandas as pd
from oaklib import get_adapter
from oaklib.interfaces.basic_ontology_interface import RELATIONSHIP
from oaklib.types import CURIE, URI

from src.scripts.utils import DEFAULT_PREFIXES_CSV, get_monarch_curies_converter, remove_angle_brackets

# Project layout, resolved from the directory that contains this script.
HERE = Path(os.path.abspath(os.path.dirname(__file__)))
SRC_DIR = HERE.parent
ONTOLOGY_DIR = SRC_DIR.joinpath('ontology')
# Default locations of the generated template and of the database that is queried.
DEFAULT_OUTPATH = ONTOLOGY_DIR.joinpath('reports', 'sync-subclassof.robot.template.tsv')
DEFAULT_ONTOLOGY_DB_PATH = ONTOLOGY_DIR.joinpath('tmp', 'merged.db')
# ROBOT template strings, keyed by output column name. Wrapped in a one-element list so it can be
# turned into a single-row DataFrame.
ROBOT_ROW = [
    {
        'term_id': 'ID',
        'parent_id': 'SC % SPLIT=|',
        # todo: PR comment to remind about a better property for xref_ids.
        # todo: Nico may provide a different value.
        'xref_ids': 'A oboInOwl:hasDbXref SPLIT=|',
    },
]

# todo: oak issue: too slow iterating. 36 seconds for below snippet
# for mondo_id in mondo_ids[0:10]:
# rels: List[CURIE] = [x for x in oi.relationships(subjects=[mondo_id])]

# todo: add caching?
def sync_subclassof(
    outpath: Union[str, Path] = DEFAULT_OUTPATH, prefixes_csv_path: Union[str, Path] = DEFAULT_PREFIXES_CSV,
    ontology_db_path: Union[str, Path] = DEFAULT_ONTOLOGY_DB_PATH
):
    """Write a ROBOT template (TSV) of subClassOf and xref/mapping relations for Mondo terms.

    For every relationship whose subject is a `MONDO:` entity and whose predicate is one of
    `rdfs:subClassOf`, `oboInOwl:hasDbXref`, `skos:exactMatch`, `skos:broadMatch` or `skos:narrowMatch`,
    the object is collected into the subject's `parent_id` column (subClassOf) or `xref_ids` column
    (everything else). Each column is pipe-delimited. The first row of the output is `ROBOT_ROW`.

    :param outpath: Where to write the template. Missing parent directories are created.
    :param prefixes_csv_path: Passed to `get_monarch_curies_converter()`; the resulting converter is used to
     compress object URIs into CURIEs.
    :param ontology_db_path: Passed to oaklib's `get_adapter()` to open the ontology to query.
    """
    # Vars
    conv = get_monarch_curies_converter(prefixes_csv_path)
    oi = get_adapter(ontology_db_path)

    def compress(uri: Union[CURIE, URI]) -> Union[CURIE, URI]:
        """Return the CURIE for `uri`, or `uri` unchanged when the converter yields nothing for it."""
        # Single converter call per object (previously called twice).
        return conv.compress(uri) or uri

    # Get all terms
    mondo_ids: List[CURIE] = [x for x in oi.entities(filter_obsoletes=False) if x.startswith('MONDO:')]

    # Relationships
    # todo: pr comment?: skip if parent_id not in mondo? is this only the case for:
    #  ('MONDO:0000001', 'rdfs:subClassOf', 'BFO:0000016')? Analyze first.
    # todo: PR comment: how this is implemented and why (memory vs speed). Querying one subject at a time took
    #  ~36 seconds for just 10 records, hence the single batched query restricted to a few predicates.
    # todo: oak issue: slow even for batch rels with all predicates (5 sec for 10, 25 sec for 100). With 5 predicates:
    #  1 sec for 100, 36 sec for ~25k.
    # todo: pr comment: correct rels? close/related too? For GARD we might have 1-2 related.
    #  - Also: are we indeed putting all of these in the same column ('xref_ids')?
    rels_raw = oi.relationships(
        subjects=mondo_ids,
        predicates=['rdfs:subClassOf', 'oboInOwl:hasDbXref', 'skos:exactMatch', 'skos:broadMatch', 'skos:narrowMatch']
    )

    # Create rows
    # - Collect objects per subject, in the order the relationships are returned
    # TODO: pr comment: want to filter out only certain ontologies? (e.g. snomed shows up but we have no ingest)
    rows_by_id: Dict[CURIE, Dict[str, Any]] = {}
    for sub, pred, obj in rels_raw:
        obj = compress(remove_angle_brackets(obj))
        row = rows_by_id.setdefault(sub, {'term_id': sub, 'parent_id': [], 'xref_ids': []})
        column = 'parent_id' if pred == 'rdfs:subClassOf' else 'xref_ids'
        row[column].append(obj)
    # - Convert lists to pipe-delimited strings
    records: List[Dict[str, Any]] = [
        {k: '|'.join(v) if isinstance(v, list) else v for k, v in row.items()}
        for row in rows_by_id.values()
    ]

    # Passing the columns explicitly keeps sort_values() from raising KeyError when there are no rows.
    columns: List[str] = list(ROBOT_ROW[0])
    df = pd.DataFrame(records, columns=columns).sort_values(by=['term_id'])
    out_df = pd.concat([pd.DataFrame(ROBOT_ROW), df])
    Path(outpath).parent.mkdir(parents=True, exist_ok=True)
    out_df.to_csv(outpath, sep='\t', index=False)


def cli():
    """Parse the command line options and hand them to `sync_subclassof()`."""
    # (flags, default, help) for each supported option
    options = (
        (('-o', '--outpath'), DEFAULT_OUTPATH, 'Path to output robot template.'),
        (('-p', '--prefixes-csv-path'), DEFAULT_PREFIXES_CSV,
         'Path to a CSV with prefix in first column and URI stem in second.'),
        (('-d', '--ontology-db-path'), DEFAULT_ONTOLOGY_DB_PATH, 'Path to SemanticSQL sqlite DB to query.'),
    )
    parser = ArgumentParser(
        prog='sync-subclassof',
        description='Create a robot template for purpose of syncing subclassof relations.')
    for flags, default, help_text in options:
        parser.add_argument(*flags, default=default, help=help_text)
    args = parser.parse_args()
    sync_subclassof(
        outpath=args.outpath,
        prefixes_csv_path=args.prefixes_csv_path,
        ontology_db_path=args.ontology_db_path)


# Run the command line interface only when executed as a script.
if __name__ == '__main__':
    cli()
22 changes: 19 additions & 3 deletions src/scripts/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,12 @@
import subprocess
import sys
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Set, Union

import curies
import pandas as pd
import yaml
from jinja2 import Template
from oaklib import OntologyResource
from oaklib.implementations import ProntoImplementation, SqlImplementation
Expand All @@ -27,6 +29,7 @@
TEMP_DIR = os.path.join(ONTOLOGY_DIR, 'tmp')
DOCS_DIR = os.path.join(PROJECT_DIR, 'docs')
CACHE_DIR = TEMP_DIR
DEFAULT_PREFIXES_CSV = os.path.join(ONTOLOGY_DIR, 'config', 'prefixes.csv')


# todo: there are remaining todo's in this class
Expand Down Expand Up @@ -167,6 +170,18 @@ def _load_ontology(ontology_path: str, use_cache=False) -> ProntoImplementation:
return ontology


def get_monarch_curies_converter(from_prefixes_csv: Union[str, Path] = DEFAULT_PREFIXES_CSV) -> curies.Converter:
    """Build a `curies.Converter` from a prefixes CSV, or fall back to the pre-defined Monarch converter.

    :param from_prefixes_csv: Path to a CSV with a header row containing a `prefix` column and a `base` (URI stem)
     column. If falsy, `curies.get_monarch_converter()` is returned instead.
    :return: A converter built from the CSV's prefix -> URI stem pairs. When a prefix appears on more than one row,
     the last row wins. Rows with an empty prefix or an empty URI stem are skipped.
    """
    if not from_prefixes_csv:
        # https://curies.readthedocs.io/en/latest/tutorial.html#loading-a-pre-defined-context
        return curies.get_monarch_converter()
    # Read everything as plain strings: no NA inference (so a prefix such as "NA" survives) and no type coercion.
    # Building the mapping directly also avoids round-tripping through YAML text, where prefixes like "NO" or
    # all-digit prefixes would be parsed as booleans / integers.
    df = pd.read_csv(from_prefixes_csv, dtype=str, keep_default_na=False)
    prefix_map: Dict[str, str] = {}
    for prefix, base in zip(df['prefix'].str.strip(), df['base'].str.strip()):
        if prefix and base:
            prefix_map[prefix] = base
    return curies.Converter.from_prefix_map(prefix_map)


def _get_next_available_mondo_id(min_id: int, max_id: int, mondo_ids: Set[int]) -> (int, Set[int]):
"""Starting from `min_id`, count up and check until finding the next ID.
Expand All @@ -183,16 +198,17 @@ def _get_next_available_mondo_id(min_id: int, max_id: int, mondo_ids: Set[int])
return next_id, mondo_ids


def remove_angle_brackets(uris: 'Union[URI, List[URI]]') -> 'Union[URI, List[URI]]':
    """Remove angle brackets from URIs, e.g.:
    <https://omim.org/entry/100050> --> https://omim.org/entry/100050

    At most one leading '<' and one trailing '>' are removed from each URI.

    :param uris: A single URI string, or an iterable of URI strings.
    :return: A str if a str was passed in; otherwise a list with one entry per input URI, in input order.
    """
    def _strip(uri: str) -> str:
        """Drop one leading '<' and one trailing '>' from `uri`, if present."""
        uri = uri[1:] if uri.startswith('<') else uri
        return uri[:-1] if uri.endswith('>') else uri

    # A str must be handled on its own; iterating it would treat every character as a URI.
    if isinstance(uris, str):
        return _strip(uris)
    return [_strip(x) for x in uris]


def get_mondo_term_ids(mondo_terms_path: str, slurp_id_map: Dict[str, str]) -> Set[int]:
Expand Down

0 comments on commit f92a5c2

Please sign in to comment.