-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Pipeline to keep subclass relationships in sync between Mondo and sources. - Add: makefile goals: sync, sync-subclassof, reports/sync-subclassof.robot.template.tsv - Update: makefile goal: build-mondo-ingest - Add: src/scripts/sync_subclassof.py General - Bugfix: utils.py remove_angle_brackets(): Now correctly returns a str if it receives a str.
- Loading branch information
Showing
3 changed files
with
128 additions
and
4 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,110 @@ | ||
"""Create a robot template for purpose of syncing subclassof relations. | ||
Resources | ||
- GitHub issue: https://github.com/monarch-initiative/mondo-ingest/issues/92 | ||
- GitHub PR: https://github.com/monarch-initiative/mondo-ingest/pull/363 | ||
""" | ||
import os | ||
from argparse import ArgumentParser | ||
from pathlib import Path | ||
from typing import Any, Dict, List, Union | ||
|
||
# todo: clear unused imports | ||
import curies | ||
import pandas as pd | ||
from oaklib import get_adapter | ||
from oaklib.interfaces.basic_ontology_interface import RELATIONSHIP | ||
from oaklib.types import CURIE, URI | ||
|
||
from src.scripts.utils import remove_angle_brackets | ||
|
||
# from sssom.util import is_curie | ||
|
||
# Filesystem layout, anchored on the directory that contains this script.
HERE = Path(os.path.dirname(os.path.abspath(__file__)))
SRC_DIR = HERE.parent
ONTOLOGY_DIR = SRC_DIR.joinpath('ontology')
# Default locations of the generated template and of the SQLite DB that is queried.
DEFAULT_OUTPATH = ONTOLOGY_DIR.joinpath('reports', 'sync-subclassof.robot.template.tsv')
DEFAULT_ONTOLOGY_DB_PATH = ONTOLOGY_DIR.joinpath('tmp', 'merged.db')
# ROBOT template header row: maps each output column to its ROBOT template string.
ROBOT_ROW = {
    'term_id': 'ID',
    'parent_id': 'SC % SPLIT=|',
    # todo: PR comment to remind about better property for:
    'xref_ids': 'A oboInOwl:hasDbXref SPLIT=|',  # todo: Nico may provide different value
}
# https://curies.readthedocs.io/en/latest/tutorial.html#loading-a-pre-defined-context
CONV = curies.get_monarch_converter()


def compress(x: str) -> str:
    """Return the compressed (CURIE) form of `x`, or `x` itself as a fallback.

    :param x: Identifier to compress, e.g. a URI.
    :return: Whatever `CONV.compress(x)` returns when that value is truthy;
      otherwise `x` unchanged (e.g. when no prefix in the converter matches).
    """
    # A single lookup instead of two; `or` keeps the original "falsy result -> return x" fallback.
    return CONV.compress(x) or x
# todo: oak issue: too slow iterating. 36 seconds for below snippet | ||
# for mondo_id in mondo_ids[0:10]: | ||
# rels: List[CURIE] = [x for x in oi.relationships(subjects=[mondo_id])] | ||
|
||
# todo: add caching? | ||
def sync_subclassof(
    outpath: Union[str, Path] = DEFAULT_OUTPATH, ontology_db_path: Union[str, Path] = DEFAULT_ONTOLOGY_DB_PATH
) -> None:
    """Create a ROBOT template of subclass / cross-reference relations for all Mondo terms.

    Queries the ontology DB for every `MONDO:` entity (obsoletes included), collects its
    `rdfs:subClassOf` parents and its xref / SKOS match objects, and writes one row per term
    to a tab-separated ROBOT template. Multi-valued cells are `|`-delimited, as declared by
    the `SPLIT=|` directives in `ROBOT_ROW`.

    :param outpath: Path the TSV template is written to.
    :param ontology_db_path: Path to the SemanticSQL sqlite DB to query.
    """
    oi = get_adapter(ontology_db_path)

    # Get all terms
    ids_all: List[Union[CURIE, URI]] = list(oi.entities(filter_obsoletes=False))
    mondo_ids: List[CURIE] = [x for x in ids_all if x.startswith('MONDO:')]

    # Relationships
    # All subjects are fetched in one batched call restricted to the predicates needed. Timing notes from
    # development: iterating one subject at a time took ~36 seconds for just 10 records; batched with all
    # predicates took 5 seconds for 10 and 25 for 100; batched with these 5 predicates took 1 second for 100
    # and ~36 seconds for ~25k.
    # todo: pr comment?: skip if parent_id not in mondo? e.g. ('MONDO:0000001', 'rdfs:subClassOf', 'BFO:0000016')
    # todo: pr comment: correct rels? close/related too? I think for GARD we might have 1-2 related cuz unsure
    #  - and another question: are we indeed putting all of these in the same 'xref_ids' column?
    rels_raw: List[RELATIONSHIP] = list(oi.relationships(
        subjects=mondo_ids,
        predicates=['rdfs:subClassOf', 'oboInOwl:hasDbXref', 'skos:exactMatch', 'skos:broadMatch', 'skos:narrowMatch'],
    ))
    # Normalize objects: strip angle brackets, then compress to a CURIE where the converter knows the prefix.
    # TODO: report uncompressed namespaces (objects that are still 'http...' URIs after this step)
    #  - can use a custom prefix_map if wanted instead of curies.get_monarch_converter()
    #  - are URIs OK?
    rels: List[RELATIONSHIP] = [(sub, pred, compress(remove_angle_brackets(obj))) for sub, pred, obj in rels_raw]

    # Create rows
    # TODO: pr comment: want to filter out only certain ontologies? (e.g. snomed shows up but we have no ingest)
    rows = sorted(_relationships_to_rows(rels), key=lambda row: row['term_id'])

    # ROBOT_ROW must be the first data row (directly below the column header). Wrapping the rows in a list
    # makes pandas treat each dict as one record; a bare dict of scalars raises ValueError. Passing explicit
    # columns keeps the output well-formed even when no relationships were found.
    out_df = pd.DataFrame([ROBOT_ROW, *rows], columns=list(ROBOT_ROW))
    out_df.to_csv(outpath, sep='\t', index=False)


def _relationships_to_rows(rels: List[RELATIONSHIP]) -> List[Dict[str, Any]]:
    """Group (subject, predicate, object) triples into one template row per subject, joining values with '|'."""
    grouped: Dict[CURIE, Dict[str, List[str]]] = {}
    for sub, pred, obj in rels:
        cells = grouped.setdefault(sub, {'parent_id': [], 'xref_ids': []})
        # rdfs:subClassOf objects are parents; every other queried predicate is treated as an xref.
        cells['parent_id' if pred == 'rdfs:subClassOf' else 'xref_ids'].append(obj)
    return [
        {'term_id': sub, 'parent_id': '|'.join(cells['parent_id']), 'xref_ids': '|'.join(cells['xref_ids'])}
        for sub, cells in grouped.items()
    ]
|
||
|
||
def cli():
    """Parse command-line options and run `sync_subclassof` with them."""
    arg_parser = ArgumentParser(
        description='Create a robot template for purpose of syncing subclassof relations.',
        prog='sync-subclassof')
    arg_parser.add_argument(
        '-o', '--outpath', help='Path to output robot template.', default=DEFAULT_OUTPATH)
    arg_parser.add_argument(
        '-d', '--ontology-db-path', help='Path to SemanticSQL sqlite DB to query.',
        default=DEFAULT_ONTOLOGY_DB_PATH)
    options = arg_parser.parse_args()
    sync_subclassof(outpath=options.outpath, ontology_db_path=options.ontology_db_path)


# Run the CLI only when this file is executed as a script.
if __name__ == '__main__':
    cli()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters