Merge pull request #19 from DKI/prime_prompt

DKI/prime_prompt_generator
Strexas · May 27, 2024 · 1490bc9 · 1490bc9
2 parents 7d71f6c + 7f882ce
commit 1490bc9
Show file tree

Hide file tree

Showing 2 changed files with 119 additions and 0 deletions.
diff --git a/prime_prompt/generate_prime_promt.py b/prime_prompt/generate_prime_promt.py
@@ -0,0 +1,42 @@
+""" Script for generating prime prompt """
+
+import os
+from os.path import normpath, join
+
+# Parent of the directory that contains this script: joining two '..'
+# components onto the script path and normalising the result strips the
+# file name and then its folder.
+project_folder = normpath(join(__file__, '..', '..'))
+
+
+def scan_docs(path: str):
+    """
+    Scans a Python file and yields every public top-level function together
+    with the text of its docstring.
+
+    Only functions whose ``def`` starts in column 0 and whose name does not
+    begin with an underscore are reported. The docstring has to open with
+    triple double quotes on the line directly below the ``def`` line;
+    functions without such a docstring are skipped. Documentation lines keep
+    their original indentation and are returned without the quote delimiters.
+
+    :param path: path of file
+    :returns: generator of ``(function_name, function_documentation)`` tuples
+    """
+    quotes = '"""'
+
+    # Read everything up front so the file is closed before the first yield.
+    with open(path, "r", encoding="utf-8") as python_file:
+        lines = python_file.readlines()
+
+    i = 0
+    while i < len(lines):
+        line = lines[i]
+        i += 1
+        if not line.startswith("def ") or line[4:5] == "_" or "(" not in line:
+            continue
+
+        function_name = line[4:line.index("(")]
+        if i >= len(lines):
+            break
+
+        opener = lines[i]
+        stripped = opener.strip()
+        if not stripped.startswith(quotes):
+            # No docstring right below the signature. The line is not
+            # consumed, so the outer loop examines it again.
+            continue
+        i += 1
+
+        indent = opener[:len(opener) - len(opener.lstrip())]
+        first_text = stripped[len(quotes):]
+
+        if quotes in first_text:
+            # One-line docstring: opening and closing quotes share a line.
+            summary = first_text[:first_text.index(quotes)]
+            yield function_name, indent + summary + "\n"
+            continue
+
+        # Text that follows the opening quotes belongs to the documentation.
+        doc_lines = [indent + first_text + "\n"] if first_text else []
+        while i < len(lines):  # bounded: an unterminated docstring ends at EOF
+            current = lines[i]
+            i += 1
+            if quotes in current:
+                closing_text = current[:current.index(quotes)]
+                if closing_text.strip():
+                    doc_lines.append(closing_text.rstrip() + "\n")
+                break
+            doc_lines.append(current)
+        yield function_name, "".join(doc_lines)
+
+
+# Project sub-folders whose functions are described in the prime prompt.
+llm_related_modules = ["data_collection"]
+# The output file is created relative to the current working directory.
+with open("prime_prompt.txt", "w", encoding="utf-8") as f:
+    for module in llm_related_modules:
+        for directory, _, files in os.walk(join(project_folder, module)):
+            for file in files:
+                # os.walk yields every file in the tree; only Python sources
+                # can be scanned for function docstrings.
+                if not file.endswith(".py"):
+                    continue
+                for function, documentation in scan_docs(join(directory, file)):
+                    f.write(f"Documentation for {function}\n")
+                    f.write(documentation + "\n\n")
diff --git a/prime_prompt/prime_prompt.txt b/prime_prompt/prime_prompt.txt
@@ -0,0 +1,77 @@
+Documentation for get_file_from_url
+    Gets file from url and saves it into provided path. Overrides, if override is True.
+
+    :param str url: link with file
+    :param str save_to: path to save
+    :param bool override: needs override
+
+
+Documentation for download_lovd_database_for_eys_gene
+    Gets file from url and saves it into provided path. Overrides, if override is True.
+
+    :param str database_name: database to download
+    :param bool override: needs override
+
+
+Documentation for download_genes_lovd
+    Downloads data into txt files from gene_list.
+
+    :param list gene_list: list of gene's symbols
+    :param str folder_path: folder to save the data
+    :param bool raise_exception: raise exception if True, otherwise log
+
+
+Documentation for download_database_for_eys_gene
+    downloads chosen database
+    and handles where it should be saved,
+    renames the downloaded (latest) file to appropriate name
+    :param database_name: the name of the database
+    :param override: should an existing file be overridden with a new one
+
+
+Documentation for store_database_for_eys_gene
+    calls a function to download a database
+    :param database_name: the name of the database that should be downloaded
+    :param override: should an already existing file be overwritten
+
+
+Documentation for calculate_max_frequency
+    Calculating maximum allele frequency in GNOMAD row.
+
+    :param row: row in dataframe
+    :returns: pandas Series with 'PopMax', 'PopMax population' fields
+    :rtype: pd.Series
+
+
+Documentation for main
+    Main function implementing pipeline for data collection and merging of data from
+    LOVD, GNOMAD and CLINVAR.
+
+
+Documentation for set_lovd_dtypes
+    Convert data from LOVD format table to desired data format based on specified data types.
+
+    :param dict[str, tuple[DataFrame, list[str]]] df_dict: Dictionary of tables saved as DataFrame
+
+
+Documentation for parse_lovd
+    Converts data from text file with LOVD format to dictionary of tables.
+
+    Key is name of table, value is data saved as pandas DataFrame.
+    Notes for each table are displayed with log.
+
+    **IMPORTANT:** It doesn't provide types for data inside. Use convert_lovd_to_datatype for this.
+
+    :param str path: path to text file
+    :returns: dictionary of tables
+    :rtype: dict[str, tuple[DataFrame, list[str]]]
+
+
+Documentation for from_clinvar_name_to_cdna_position
+    Custom cleaner to extract cDNA position from Clinvar `name` variable.
+
+    :param str name:
+    :returns: extracted cDNA
+    :rtype: str
+
+