chore: move over all_ontology.json generator script + gha to repo from single-cell-curation
nayib-jose-gloria committed Feb 9, 2024
1 parent 399db0b commit f90fb05
Showing 6 changed files with 312 additions and 0 deletions.
52 changes: 52 additions & 0 deletions .github/workflows/generate_all_ontology.yml
@@ -0,0 +1,52 @@
name: Updates to Ontology Files

on:
  push:
    paths:
      - '**/tools/ontology-builder/ontology-references/owl_info.yml'
    branches-ignore:
      - main

jobs:
  ontology-processing:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v3
        with:
          ref: ${{ github.event.pull_request.head.ref }}
      - name: ontology changes
        uses: dorny/paths-filter@v2
        id: filter
        with:
          filters: |
            owl_info:
              - 'tools/ontology-builder/ontology-references/owl_info.yml'
      - name: Set up Python 3.8
        uses: actions/setup-python@v1
        with:
          python-version: 3.8
      - name: Python cache
        uses: actions/cache@v1
        with:
          path: ~/.cache/pip
          key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements*.txt') }}
          restore-keys: |
            ${{ runner.os }}-pip-
      - name: install requirements
        run: |
          pip install -r tools/ontology-builder/requirements.txt
      - name: setup git
        run: |
          git config user.name github-actions
          git config user.email github-actions@github.com
      - name: owl-processing
        if: ${{ steps.filter.outputs.owl_info == 'true' }}
        run: |
          make download-ontologies -C cellxgene_schema_cli
          git add ./cellxgene_schema_cli/cellxgene_schema/ontology_files/all_ontology.json.gz
      - name: Commit
        if: ${{ steps.filter.outputs.owl_info == 'true' }}
        run: |
          git commit -m "AUTO: update ontologies"
          git push
214 changes: 214 additions & 0 deletions tools/ontology-builder/all_ontology_generator.py
@@ -0,0 +1,214 @@
import env
import gzip
import json
import os
import re
import urllib.request
from threading import Thread
from typing import List
from urllib.error import HTTPError, URLError

import owlready2
import yaml


def _download_owls(owl_info_yml: str = env.OWL_INFO_YAML, output_dir: str = env.ONTOLOGY_DIR):
    """
    Downloads the ontology owl files specified in 'owl_info_yml' into 'output_dir'
    :param str owl_info_yml: path to yaml file with OWL information
    :param str output_dir: path to writable directory where owl files will be downloaded to
    :rtype None
    """

    with open(owl_info_yml, "r") as owl_info_handle:
        owl_info = yaml.safe_load(owl_info_handle)

    def download(_ontology, _url):
        print(f"Start Downloading {_ontology}")
        # Format of owl (handles cases where they are compressed)
        download_format = _url.split(".")[-1]

        output_file = os.path.join(output_dir, _ontology + ".owl")
        if download_format == "gz":
            urllib.request.urlretrieve(_url, output_file + ".gz")
            _decompress(output_file + ".gz", output_file)
            os.remove(output_file + ".gz")
        else:
            urllib.request.urlretrieve(_url, output_file)
        print(f"Finish Downloading {_ontology}")

    threads = []
    for ontology, _ in owl_info.items():
        latest_version = owl_info[ontology]["latest"]
        url = owl_info[ontology]["urls"][latest_version]
        try:
            urllib.request.urlopen(url)
        except HTTPError as e:
            raise Exception(f"{ontology} with pinned URL {url} returns status code {e.code}") from e
        except URLError as e:
            raise Exception(f"{ontology} with pinned URL {url} fails due to {e.reason}") from e

        t = Thread(target=download, args=(ontology, url))
        t.start()
        threads.append(t)

    for t in threads:
        t.join()


def _decompress(infile: str, tofile: str):
    """
    Decompresses a gzipped file
    :param str infile: path to gzipped file
    :param str tofile: path to output decompressed file
    :rtype None
    """
    with open(infile, "rb") as inf, open(tofile, "w", encoding="utf8") as tof:
        decom_str = gzip.decompress(inf.read()).decode("utf-8")
        tof.write(decom_str)


def _parse_owls(
    working_dir: str = env.ONTOLOGY_DIR,
    owl_info_yml: str = env.OWL_INFO_YAML,
    output_json_file: str = env.PARSED_ONTOLOGIES_FILE,
):
    """
    Parses all owl files in working_dir. Extracts information from all classes in the owl file.
    The extracted information is written into a gzipped json file with the following structure:
    {
        "ontology_name": {
            "term_id": {
                "label": "...",
                "deprecated": True,
                "ancestors": [
                    "ancestor_term_id_1",
                    "ancestor_term_id_2"
                ]
            },
            "term_id_2": {
                ...
            },
            ...
        }
    }
    :param str working_dir: path to folder with owl files
    :param str owl_info_yml: path to yaml file with owl information
    :param str output_json_file: path to output json file
    :rtype None
    """

    with open(owl_info_yml, "r") as owl_info_handle:
        owl_info = yaml.safe_load(owl_info_handle)

    owl_files = []
    for owl_file in os.listdir(working_dir):
        if owl_file.endswith(".owl"):
            owl_files.append(os.path.join(working_dir, owl_file))

    # Parse owl files
    onto_dict = {}
    for owl_file in owl_files:
        world = owlready2.World()
        onto = world.get_ontology(owl_file)
        onto.load()
        onto_dict[onto.name] = {}

        print(f"Processing {onto.name}")

        for onto_class in onto.classes():
            term_id = onto_class.name.replace("_", ":")

            # Skip terms that are not direct children from this ontology
            if onto.name != term_id.split(":")[0]:
                continue

            # If there are specified target terms then only work with them
            if onto.name in owl_info and "only" in owl_info[onto.name] and term_id not in owl_info[onto.name]["only"]:
                continue

            # Gets label
            onto_dict[onto.name][term_id] = dict()
            try:
                onto_dict[onto.name][term_id]["label"] = onto_class.label[0]
            except IndexError:
                onto_dict[onto.name][term_id]["label"] = ""

            # Add the "deprecated" status
            onto_dict[onto.name][term_id]["deprecated"] = False
            if onto_class.deprecated and onto_class.deprecated.first():
                # if deprecated, include information to determine replacement term(s)
                onto_dict[onto.name][term_id]["deprecated"] = True
                if onto_class.comment:
                    onto_dict[onto.name][term_id]["comments"] = [str(c) for c in onto_class.comment]
                # stores term tracking URL, such as a github issue discussing deprecated term
                if hasattr(onto_class, "IAO_0000233") and onto_class.IAO_0000233:
                    onto_dict[onto.name][term_id]["term_tracker"] = str(onto_class.IAO_0000233[0])

                # only need to record replaced_by OR consider
                if onto_class.IAO_0100001 and onto_class.IAO_0100001.first():
                    # url --> term
                    ontology_term = re.findall(r"[^\W_]+", str(onto_class.IAO_0100001[0]))
                    onto_dict[onto.name][term_id]["replaced_by"] = f"{ontology_term[-2]}:{ontology_term[-1]}"
                else:
                    if hasattr(onto_class, "consider") and onto_class.consider:
                        onto_dict[onto.name][term_id]["consider"] = [str(c) for c in onto_class.consider]

            # Gets ancestors
            ancestors = _get_ancestors(onto_class, onto.name)

            # If "children_of" is specified in owl info then skip the current term if it is
            # not a child of those indicated.
            if (onto.name in owl_info and "children_of" in owl_info[onto.name]) and (
                not list(set(ancestors) & set(owl_info[onto.name]["children_of"]))
            ):
                onto_dict[onto.name].pop(term_id)
                continue

            # only add the ancestors if it's not NCBITaxon, as this saves a lot of disk space
            if onto.name == "NCBITaxon":
                onto_dict[onto.name][term_id]["ancestors"] = []
            else:
                onto_dict[onto.name][term_id]["ancestors"] = ancestors

    with gzip.open(output_json_file, "wt") as output_json:
        json.dump(onto_dict, output_json, indent=2)


def _get_ancestors(onto_class: owlready2.entity.ThingClass, ontology_name: str) -> List[str]:
    """
    Returns a list of ancestor ids of the given onto class, keeping only those belonging to ontology_name,
    and formats the ids from the form CL_xxxx to CL:xxxx
    :param owlready2.entity.ThingClass onto_class: the class for which ancestors will be retrieved
    :param str ontology_name: only ancestors from this ontology will be kept
    :rtype List[str]
    :return list of ancestors (term ids), it could be empty
    """

    ancestors = []

    for ancestor in onto_class.ancestors():
        if onto_class.name == ancestor.name:
            continue
        if ancestor.name.split("_")[0] == ontology_name:
            ancestors.append(ancestor.name.replace("_", ":"))

    return ancestors


# Download and parse owls upon execution
if __name__ == "__main__":
    _download_owls()
    _parse_owls()
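For context, here is a minimal sketch (not part of this commit) of how the generated artifact could be consumed downstream, assuming it sits at the path defined by env.PARSED_ONTOLOGIES_FILE and follows the structure documented in _parse_owls; the term id used is illustrative only.

import gzip
import json

# Hypothetical consumer: load the gzipped JSON and look up one term's
# label, deprecation status, and ancestors.
with gzip.open("tools/ontology-builder/ontology-references/all_ontology.json.gz", "rt") as f:
    all_ontology = json.load(f)

term = all_ontology["CL"]["CL:0000540"]  # illustrative term id
print(term["label"], term["deprecated"], term["ancestors"][:3])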
6 changes: 6 additions & 0 deletions tools/ontology-builder/env.py
@@ -0,0 +1,6 @@
import os

PACKAGE_ROOT = os.path.dirname(os.path.realpath(__file__))
ONTOLOGY_DIR = os.path.join(PACKAGE_ROOT, "ontology-references")
OWL_INFO_YAML = os.path.join(ONTOLOGY_DIR, "owl_info.yml")
PARSED_ONTOLOGIES_FILE = os.path.join(ONTOLOGY_DIR, "all_ontology.json.gz")
Binary file not shown.
38 changes: 38 additions & 0 deletions tools/ontology-builder/ontology-references/owl_info.yml
@@ -0,0 +1,38 @@
CL:
  latest: 2024-01-04
  urls:
    2024-01-04: https://github.com/obophenotype/cell-ontology/releases/download/v2024-01-04/cl.owl
EFO:
  latest: 2024-01-15 EFO 3.62.0
  urls:
    2024-01-15 EFO 3.62.0: https://github.com/EBISPOT/efo/releases/download/v3.62.0/efo.owl
HANCESTRO:
  latest: 3.0
  urls:
    3.0: https://github.com/EBISPOT/hancestro/raw/3.0/hancestro-base.owl
HsapDv:
  latest: 2020-03-10
  urls:
    2020-03-10: http://aber-owl.net/media/ontologies/HSAPDV/11/hsapdv.owl
MONDO:
  latest: 2024-01-03
  urls:
    2024-01-03: https://github.com/monarch-initiative/mondo/releases/download/v2024-01-03/mondo.owl
MmusDv:
  latest: 2020-03-10
  urls:
    2020-03-10: http://aber-owl.net/media/ontologies/MMUSDV/9/mmusdv.owl
NCBITaxon:
  latest: 2023-06-20
  urls:
    2023-06-20: https://github.com/obophenotype/ncbitaxon/releases/download/v2023-06-20/ncbitaxon.owl.gz
  children_of:
    - NCBITaxon:33208
UBERON:
  latest: 2024-01-18
  urls:
    2024-01-18: https://github.com/obophenotype/uberon/releases/download/v2024-01-18/uberon.owl
PATO:
  latest: 2023-05-18
  urls:
    2023-05-18: https://github.com/pato-ontology/pato/raw/v2023-05-18/pato.owl
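Each top-level key names an ontology; `latest` must match one of the keys under `urls`, and the optional `children_of` (and `only`) lists restrict which terms the generator keeps. A small pre-flight check (hypothetical, not part of this commit) that verifies this invariant before running the generator might look like:

import yaml

# Hypothetical sanity check: every ontology's pinned "latest" version
# must have a matching download URL in its "urls" map.
with open("tools/ontology-builder/ontology-references/owl_info.yml") as f:
    owl_info = yaml.safe_load(f)

for name, info in owl_info.items():
    assert info["latest"] in info["urls"], f"{name}: no URL pinned for {info['latest']}"
    print(name, "->", info["urls"][info["latest"]])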
2 changes: 2 additions & 0 deletions tools/ontology-builder/requirements.txt
@@ -0,0 +1,2 @@
owlready2==0.38
PyYaml==6.0
