Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Create phenopackets from clinical case reports (WIP) #13

Closed
wants to merge 7 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
.idea/*
*.pem
db/
proddb/
Expand Down
8 changes: 0 additions & 8 deletions .idea/.gitignore

This file was deleted.

13 changes: 0 additions & 13 deletions .idea/curate-gpt.iml

This file was deleted.

35 changes: 0 additions & 35 deletions .idea/inspectionProfiles/Project_Default.xml

This file was deleted.

6 changes: 0 additions & 6 deletions .idea/inspectionProfiles/profiles_settings.xml

This file was deleted.

9 changes: 0 additions & 9 deletions .idea/misc.xml

This file was deleted.

8 changes: 0 additions & 8 deletions .idea/modules.xml

This file was deleted.

7 changes: 0 additions & 7 deletions .idea/vcs.xml

This file was deleted.

80 changes: 79 additions & 1 deletion src/curate_gpt/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,12 @@
import csv
import gzip
import json
import os
import logging
import sys
import warnings
from pathlib import Path
from typing import Any, Dict, List, Union
from typing import Any, Dict, List, Union, Optional

import click
import pandas as pd
Expand All @@ -17,6 +19,7 @@
from llm.cli import load_conversation
from oaklib import get_adapter
from pydantic import BaseModel
from tqdm import tqdm

from curate_gpt import ChromaDBAdapter, __version__
from curate_gpt.agents.chat_agent import ChatAgent, ChatResponse
Expand All @@ -32,8 +35,10 @@
from curate_gpt.extract import AnnotatedObject
from curate_gpt.extract.basic_extractor import BasicExtractor
from curate_gpt.store.schema_proxy import SchemaProxy
from curate_gpt.utils.search import generate_phenopacket
from curate_gpt.utils.vectordb_operations import match_collections
from curate_gpt.wrappers import BaseWrapper, get_wrapper
from curate_gpt.wrappers.clinical.hpoa_wrapper import HPOAWrapper
from curate_gpt.wrappers.literature.pubmed_wrapper import PubmedWrapper
from curate_gpt.wrappers.ontology import OntologyWrapper

Expand Down Expand Up @@ -1745,5 +1750,78 @@ def pubmed_ask(query, path, model, show_references, **kwargs):
print(ref_text)


@main.command(name="pubmed2phenopacket")
@click.argument("pmid")
@click.option("--output-dir", type=click.Path(), help="Directory to save the phenopacket")
def pubmed2phenopacket(pmid, output_dir):
    """
    Fetch full text for a given PMID and generate a phenopacket.

    :param pmid: PubMed ID of the article.
    :param output_dir: Directory to save the generated phenopacket.
        Defaults to the current working directory.
    :raises click.ClickException: if no full text can be retrieved for the PMID.
    """
    # Default to cwd (matches hpoa2phenopackets); Path(None) would raise TypeError.
    output_dir = Path(output_dir) if output_dir else Path(os.getcwd())
    output_dir.mkdir(parents=True, exist_ok=True)

    pubmed_wrapper = PubmedWrapper()
    full_text = pubmed_wrapper.fetch_full_text(pmid)
    if full_text is None:
        # Fail with a clean CLI error instead of a TypeError inside the prompt builder.
        raise click.ClickException(f"Couldn't find full text for {pmid}")
    phenopacket = generate_phenopacket(full_text)

    # generate_phenopacket returns the model's response as a *string*; parse it
    # so the saved file holds a JSON object rather than a double-encoded string.
    output_path = output_dir / f"{pmid}_phenopacket.json"
    try:
        parsed = json.loads(phenopacket)
    except json.JSONDecodeError:
        warnings.warn(f"Model output for {pmid} is not valid JSON; saving raw text")
        output_path.write_text(phenopacket)
    else:
        with open(output_path, "w") as f:
            json.dump(parsed, f)
    click.echo(f"Phenopacket generated and saved to {output_path}")


@main.command(name="hpoa2phenopackets")
@click.option("--output-dir", type=click.Path(), help="Directory to save the phenopackets")
@click.option("--limit", type=click.INT, help="Stop at --limit entries")
@click.option("--hpoa_file", type=click.Path(), help="Use this hpoa file instead of retrieving")
@click.option(
    "--only_pubs_about_one_disease",
    type=click.BOOL,
    help="Only make phenopackets for papers about a single disease",
)
def hpoa2phenopackets(output_dir, limit, hpoa_file, only_pubs_about_one_disease):
    """
    Fetch full text for PMIDs mentioned in HPOA and generate phenopackets.

    :param output_dir: Directory to save the generated phenopackets.
        Defaults to the current working directory.
    :param limit: Stop after this many entries (for testing).
    :param hpoa_file: Read HPOA associations from this file instead of retrieving.
    :param only_pubs_about_one_disease: Only make phenopackets for papers about
        a single disease.
    """
    output_dir = Path(output_dir) if output_dir else Path(os.getcwd())
    output_dir.mkdir(parents=True, exist_ok=True)

    hw = HPOAWrapper(group_by_publication=True)
    if hpoa_file:
        with open(hpoa_file) as f:
            items = hw.objects_from_file(f, retrieve_pubmed_data=False)
    else:
        items = list(hw.objects(retrieve_pubmed_data=False))

    if only_pubs_about_one_disease:
        # Keep only items where exactly one OMIM disease is mentioned in the paper;
        # otherwise the 'correct' disease for each paper is ambiguous.
        items = [
            item
            for item in items
            if len({a["disease"] for a in item["associations"]}) == 1
        ]

    # Get papers and make phenopackets.
    pubmed_wrapper = PubmedWrapper()
    if limit is not None:
        items = items[:limit]
    for item in tqdm(items, desc="making phenopackets"):
        pmids = {a["reference"] for a in item["associations"]}
        if len(pmids) > 1:
            warnings.warn(f"Got >1 PMID in item {item}")
            continue
        pmid = next(iter(pmids))
        full_text = pubmed_wrapper.fetch_full_text(pmid)
        if full_text is None:
            warnings.warn(f"Couldn't find full text for {pmid}")
            continue
        phenopacket = generate_phenopacket(full_text)

        # generate_phenopacket returns the model's response as a *string*; parse it
        # so the saved file holds a JSON object rather than a double-encoded string.
        output_path = output_dir / f"{pmid}_phenopacket.json"
        try:
            parsed = json.loads(phenopacket)
        except json.JSONDecodeError:
            warnings.warn(f"Model output for {pmid} is not valid JSON; saving raw text")
            output_path.write_text(phenopacket)
        else:
            with open(output_path, "w") as f:
                json.dump(parsed, f)
        click.echo(f"Phenopacket generated and saved to {output_path}")


# Allow running this module directly (equivalent to the installed CLI entry point).
if __name__ == "__main__":
    main()
53 changes: 27 additions & 26 deletions src/curate_gpt/extract/openai_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ def functions(self):
return [
{
"name": FUNC_NAME,
"description": "A n ontology term",
"description": "An ontology term",
"parameters": self.schema_proxy.json_schema(),
},
]
Expand All @@ -50,35 +50,36 @@ def extract(
"content": f"You are system that returns {target_class} object in JSON.",
},
]
for example in examples:
ex_text = example.text
ex_object = example.object
print(f"EXAMPLE = {ex_text}")
messages.append(
{
"role": "user",
"content": f"make terms for {ex_text}",
}
)
if not examples_as_functions:
if examples:
for example in examples:
ex_text = example.text
ex_object = example.object
print(f"EXAMPLE = {ex_text}")
messages.append(
{
"role": "assistant",
"content": None,
"function_call": {
"role": "user",
"content": f"make terms for {ex_text}",
}
)
if not examples_as_functions:
messages.append(
{
"role": "assistant",
"content": None,
"function_call": {
"name": FUNC_NAME,
"arguments": json.dumps(ex_object),
},
},
)
else:
messages.append(
{
"role": "function",
"name": FUNC_NAME,
"arguments": json.dumps(ex_object),
"content": json.dumps(ex_object),
},
},
)
else:
messages.append(
{
"role": "function",
"name": FUNC_NAME,
"content": json.dumps(ex_object),
},
)
)
if conversation:
messages.extend(conversation)
# content = f"make terms for {text}"
Expand Down
47 changes: 47 additions & 0 deletions src/curate_gpt/utils/search.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,50 @@
import logging
import openai

logger = logging.getLogger(__name__)


def generate_phenopacket(full_text, model="gpt-4-1106-preview", max_tokens=3000):
    """
    Ask an OpenAI chat model to convert a scientific article into a phenopacket.

    The system message instructs the model to emit Phenopacket JSON with
    phenotypicFeatures as HPO terms and the diagnosis as an OMIM disease
    in ``interpretations``.

    :param full_text: Full text of the article to convert.
    :param model: Name of the OpenAI chat model to use.
    :param max_tokens: Maximum number of tokens in the completion.
    :return: The model's raw response content — expected, but not guaranteed,
        to be a phenopacket serialized as a JSON string; callers should parse
        and validate it.
    """
    # Stray quote characters from the original prompt (a fourth '"' before
    # "You are system" and a trailing '"' after "for example:") are removed
    # here so the model does not see spurious punctuation.
    messages = [
        {
            "role": "system",
            "content": """You are system that returns Phenopackets in JSON format.
            These phenopackets should have phenotypicFeatures captured as HPO terms, for example:
            "phenotypicFeatures": [
            {
              "type": {
                "id": "HP:0005294",
                "label": "Arterial dissection"
              }
            },
            {
              "type": {
                "id": "HP:0010648",
                "label": "Dermal translucency"
              }
            }]
            and the correct disease diagnosis in as interpretation, preferably as OMIM diseases, for
            example:
            "interpretations": [
            {
              "id": "someuniqueID1234",
              "diagnosis": {
                "disease": {
                  "id": "OMIM:130050",
                  "label": "EHLERS-DANLOS SYNDROME, VASCULAR TYPE"
                  }
                }
              }
            ],

            Create a phenopacket from the following scientific article:
            """ +
            full_text}]

    # NOTE(review): this uses the legacy openai<1.0 ChatCompletion API; it will
    # break under openai>=1.0 — confirm the pinned openai version before upgrading.
    response = openai.ChatCompletion.create(
        model=model,
        messages=messages,
        max_tokens=max_tokens,
    )
    return response.choices[0].message['content']
Loading
Loading