Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Create phenopackets from clinical case reports (WIP) #13

Closed
wants to merge 7 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
.idea/*
*.pem
db/
proddb/
Expand Down
8 changes: 0 additions & 8 deletions .idea/.gitignore

This file was deleted.

13 changes: 0 additions & 13 deletions .idea/curate-gpt.iml

This file was deleted.

35 changes: 0 additions & 35 deletions .idea/inspectionProfiles/Project_Default.xml

This file was deleted.

6 changes: 0 additions & 6 deletions .idea/inspectionProfiles/profiles_settings.xml

This file was deleted.

9 changes: 0 additions & 9 deletions .idea/misc.xml

This file was deleted.

8 changes: 0 additions & 8 deletions .idea/modules.xml

This file was deleted.

7 changes: 0 additions & 7 deletions .idea/vcs.xml

This file was deleted.

80 changes: 79 additions & 1 deletion src/curate_gpt/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,12 @@
import csv
import gzip
import json
import os
import logging
import sys
import warnings
from pathlib import Path
from typing import Any, Dict, List, Union
from typing import Any, Dict, List, Union, Optional

import click
import pandas as pd
Expand All @@ -17,6 +19,7 @@
from llm.cli import load_conversation
from oaklib import get_adapter
from pydantic import BaseModel
from tqdm import tqdm

from curate_gpt import ChromaDBAdapter, __version__
from curate_gpt.agents.chat_agent import ChatAgent, ChatResponse
Expand All @@ -32,8 +35,10 @@
from curate_gpt.extract import AnnotatedObject
from curate_gpt.extract.basic_extractor import BasicExtractor
from curate_gpt.store.schema_proxy import SchemaProxy
from curate_gpt.utils.search import generate_phenopacket
from curate_gpt.utils.vectordb_operations import match_collections
from curate_gpt.wrappers import BaseWrapper, get_wrapper
from curate_gpt.wrappers.clinical.hpoa_wrapper import HPOAWrapper
from curate_gpt.wrappers.literature.pubmed_wrapper import PubmedWrapper
from curate_gpt.wrappers.ontology import OntologyWrapper

Expand Down Expand Up @@ -1745,5 +1750,78 @@ def pubmed_ask(query, path, model, show_references, **kwargs):
print(ref_text)


@main.command(name="pubmed2phenopacket")
@click.argument("pmid")
@click.option("--output-dir", type=click.Path(), help="Directory to save the phenopacket")
def pubmed2phenopacket(pmid, output_dir):
    """
    Fetch full text for a given PMID and generate a phenopacket.

    :param pmid: PubMed ID of the article.
    :param output_dir: Directory to save the generated phenopacket.
        Defaults to the current working directory.
    :raises click.ClickException: if no full text can be retrieved for the PMID.
    """
    # Default to cwd (matches hpoa2phenopackets); Path(None) would raise TypeError.
    output_dir = Path(output_dir) if output_dir else Path(os.getcwd())
    output_dir.mkdir(parents=True, exist_ok=True)

    pubmed_wrapper = PubmedWrapper()
    full_text = pubmed_wrapper.fetch_full_text(pmid)
    if full_text is None:
        # Fail with a clean CLI error instead of a TypeError inside the prompt builder.
        raise click.ClickException(f"Couldn't find full text for {pmid}")
    phenopacket = generate_phenopacket(full_text)

    # generate_phenopacket returns the model's response as a *string*; parse it
    # so the saved file holds a JSON object rather than a double-encoded string.
    output_path = output_dir / f"{pmid}_phenopacket.json"
    try:
        parsed = json.loads(phenopacket)
    except json.JSONDecodeError:
        warnings.warn(f"Model output for {pmid} is not valid JSON; saving raw text")
        output_path.write_text(phenopacket)
    else:
        with open(output_path, "w") as f:
            json.dump(parsed, f)
    click.echo(f"Phenopacket generated and saved to {output_path}")


@main.command(name="hpoa2phenopackets")
@click.option("--output-dir", type=click.Path(), help="Directory to save the phenopackets")
@click.option("--limit", type=click.INT, help="Stop at --limit entries")
@click.option("--hpoa_file", type=click.Path(), help="Use this hpoa file instead of retrieving")
@click.option(
    "--only_pubs_about_one_disease",
    type=click.BOOL,
    help="Only make phenopackets for papers about a single disease",
)
def hpoa2phenopackets(output_dir, limit, hpoa_file, only_pubs_about_one_disease):
    """
    Fetch full text for PMIDs mentioned in HPOA and generate phenopackets.

    :param output_dir: Directory to save the generated phenopackets.
        Defaults to the current working directory.
    :param limit: Stop after this many entries (for testing).
    :param hpoa_file: Read HPOA associations from this file instead of retrieving.
    :param only_pubs_about_one_disease: Only make phenopackets for papers about
        a single disease.
    """
    output_dir = Path(output_dir) if output_dir else Path(os.getcwd())
    output_dir.mkdir(parents=True, exist_ok=True)

    hw = HPOAWrapper(group_by_publication=True)
    if hpoa_file:
        with open(hpoa_file) as f:
            items = hw.objects_from_file(f, retrieve_pubmed_data=False)
    else:
        items = list(hw.objects(retrieve_pubmed_data=False))

    if only_pubs_about_one_disease:
        # Keep only items where exactly one OMIM disease is mentioned in the paper;
        # otherwise the 'correct' disease for each paper is ambiguous.
        items = [
            item
            for item in items
            if len({a["disease"] for a in item["associations"]}) == 1
        ]

    # Get papers and make phenopackets.
    pubmed_wrapper = PubmedWrapper()
    if limit is not None:
        items = items[:limit]
    for item in tqdm(items, desc="making phenopackets"):
        pmids = {a["reference"] for a in item["associations"]}
        if len(pmids) > 1:
            warnings.warn(f"Got >1 PMID in item {item}")
            continue
        pmid = next(iter(pmids))
        full_text = pubmed_wrapper.fetch_full_text(pmid)
        if full_text is None:
            warnings.warn(f"Couldn't find full text for {pmid}")
            continue
        phenopacket = generate_phenopacket(full_text)

        # generate_phenopacket returns the model's response as a *string*; parse it
        # so the saved file holds a JSON object rather than a double-encoded string.
        output_path = output_dir / f"{pmid}_phenopacket.json"
        try:
            parsed = json.loads(phenopacket)
        except json.JSONDecodeError:
            warnings.warn(f"Model output for {pmid} is not valid JSON; saving raw text")
            output_path.write_text(phenopacket)
        else:
            with open(output_path, "w") as f:
                json.dump(parsed, f)
        click.echo(f"Phenopacket generated and saved to {output_path}")


# Allow running this module directly (equivalent to the installed CLI entry point).
if __name__ == "__main__":
    main()
53 changes: 27 additions & 26 deletions src/curate_gpt/extract/openai_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ def functions(self):
return [
{
"name": FUNC_NAME,
"description": "A n ontology term",
"description": "An ontology term",
"parameters": self.schema_proxy.json_schema(),
},
]
Expand All @@ -50,35 +50,36 @@ def extract(
"content": f"You are system that returns {target_class} object in JSON.",
},
]
for example in examples:
ex_text = example.text
ex_object = example.object
print(f"EXAMPLE = {ex_text}")
messages.append(
{
"role": "user",
"content": f"make terms for {ex_text}",
}
)
if not examples_as_functions:
if examples:
for example in examples:
ex_text = example.text
ex_object = example.object
print(f"EXAMPLE = {ex_text}")
messages.append(
{
"role": "assistant",
"content": None,
"function_call": {
"role": "user",
"content": f"make terms for {ex_text}",
}
)
if not examples_as_functions:
messages.append(
{
"role": "assistant",
"content": None,
"function_call": {
"name": FUNC_NAME,
"arguments": json.dumps(ex_object),
},
},
)
else:
messages.append(
{
"role": "function",
"name": FUNC_NAME,
"arguments": json.dumps(ex_object),
"content": json.dumps(ex_object),
},
},
)
else:
messages.append(
{
"role": "function",
"name": FUNC_NAME,
"content": json.dumps(ex_object),
},
)
)
if conversation:
messages.extend(conversation)
# content = f"make terms for {text}"
Expand Down
47 changes: 47 additions & 0 deletions src/curate_gpt/utils/search.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,50 @@
import logging
import openai

logger = logging.getLogger(__name__)


def generate_phenopacket(full_text, model="gpt-4-1106-preview", max_tokens=3000):
    """
    Ask an OpenAI chat model to convert a scientific article into a phenopacket.

    The system message instructs the model to emit Phenopacket JSON with
    phenotypicFeatures as HPO terms and the diagnosis as an OMIM disease
    in ``interpretations``.

    :param full_text: Full text of the article to convert.
    :param model: Name of the OpenAI chat model to use.
    :param max_tokens: Maximum number of tokens in the completion.
    :return: The model's raw response content — expected, but not guaranteed,
        to be a phenopacket serialized as a JSON string; callers should parse
        and validate it.
    """
    # Stray quote characters from the original prompt (a fourth '"' before
    # "You are system" and a trailing '"' after "for example:") are removed
    # here so the model does not see spurious punctuation.
    messages = [
        {
            "role": "system",
            "content": """You are system that returns Phenopackets in JSON format.
            These phenopackets should have phenotypicFeatures captured as HPO terms, for example:
            "phenotypicFeatures": [
            {
              "type": {
                "id": "HP:0005294",
                "label": "Arterial dissection"
              }
            },
            {
              "type": {
                "id": "HP:0010648",
                "label": "Dermal translucency"
              }
            }]
            and the correct disease diagnosis in as interpretation, preferably as OMIM diseases, for
            example:
            "interpretations": [
            {
              "id": "someuniqueID1234",
              "diagnosis": {
                "disease": {
                  "id": "OMIM:130050",
                  "label": "EHLERS-DANLOS SYNDROME, VASCULAR TYPE"
                  }
                }
              }
            ],

            Create a phenopacket from the following scientific article:
            """ +
            full_text}]

    # NOTE(review): this uses the legacy openai<1.0 ChatCompletion API; it will
    # break under openai>=1.0 — confirm the pinned openai version before upgrading.
    response = openai.ChatCompletion.create(
        model=model,
        messages=messages,
        max_tokens=max_tokens,
    )
    return response.choices[0].message['content']
Loading
Loading