def routing_merge(lovd_path: str = LOVD_PATH,
                  gnomad_path: str = GNOMAD_PATH,
                  save_path: str = DEFAULT_SAVE_PATH,
                  overwrite: bool = False):
    """
    Merge the stored LOVD and gnomAD data and save the result as a CSV file.

    Reads ``lovd_data.txt`` from ``lovd_path`` and ``gnomad_data.csv`` from
    ``gnomad_path``, applies the dtype setters, joins the LOVD
    "Variants_On_Transcripts" table with selected columns of
    "Variants_On_Genome" on ``id``, merges the result with the gnomAD data via
    ``merge_gnomad_lovd`` and writes the merged frame to ``save_path``.

    :param lovd_path: directory that contains ``lovd_data.txt``
    :param gnomad_path: directory that contains ``gnomad_data.csv``
    :param save_path: where to save the merged data; either a file path
        (anything with a file extension) or a directory, in which case the
        file name of ``SAVE_LOVD_GNOMAD`` is used inside that directory
    :param overwrite: if ``False`` and the output file already exists, the
        merge is skipped; if ``True`` the output file is regenerated
    :raises FileNotFoundError: if either input data file is missing
    :return: None
    """

    # Resolve the real output file first so that the existence check, the
    # directory creation, the write and the log message all agree.
    # NOTE(review): a directory whose name contains a dot (e.g. "out.v2")
    # would be treated as a file path here - pass a trailing separator or a
    # full file name in that case.
    if os.path.splitext(save_path)[1]:
        output_file = save_path
    else:
        output_file = os.path.join(save_path, os.path.basename(SAVE_LOVD_GNOMAD))

    # Do not clobber an existing result unless explicitly asked to.
    if os.path.exists(output_file) and not overwrite:
        print(f"The file at {output_file} already exists.")
        return

    lovd_file = os.path.join(lovd_path, "lovd_data.txt")
    gnomad_file = os.path.join(gnomad_path, "gnomad_data.csv")

    if not os.path.exists(lovd_file):
        raise FileNotFoundError(f"LOVD data file not found at: {lovd_file}")

    if not os.path.exists(gnomad_file):
        raise FileNotFoundError(f"gnomAD data file not found at: {gnomad_file}")

    # Parse exactly the files that were checked above.
    lovd_data = parse_lovd(lovd_file)
    gnomad_data = parse_gnomad(gnomad_file)

    set_lovd_dtypes(lovd_data)
    set_gnomad_dtypes(gnomad_data)

    # Extract "Variants_On_Genome" and merge it with "Variants_On_Transcripts"
    variants_on_genome = lovd_data["Variants_On_Genome"].copy()

    lovd_variants = pd.merge(
        lovd_data["Variants_On_Transcripts"],
        variants_on_genome[['id', 'VariantOnGenome/DNA', 'VariantOnGenome/DNA/hg38']],
        on='id',
        how='left'
    )

    # Hand a copy of the gnomAD data to the merge, as the original code did.
    final_data = merge_gnomad_lovd(lovd_variants, gnomad_data.copy())

    # Create the directory of the file that is actually written.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Saving stays best-effort: a failed write is reported, not raised.
    try:
        final_data.to_csv(output_file)
    except OSError as e:
        print(f"Error saving file: {e}")
    else:
        print(f"Merged data saved to {output_file}")
+ "source": [ + "from api.data.refactoring import merge_gnomad_lovd, parse_gnomad, set_gnomad_dtypes\n", + "import pandas as pd\n", + "from api import (store_database_for_eys_gene,\n", + " parse_lovd,\n", + " set_lovd_dtypes,\n", + " LOVD_PATH,\n", + " GNOMAD_PATH)\n", + "\n", + "store_database_for_eys_gene('lovd', False)\n", + "store_database_for_eys_gene('gnomad', False)\n", + "\n", + "lovd_data = parse_lovd(LOVD_PATH + \"/lovd_data.txt\")\n", + "gnomad_data = parse_gnomad(GNOMAD_PATH+'/gnomad_data.csv')\n", + "\n", + "set_lovd_dtypes(lovd_data)\n", + "set_gnomad_dtypes(gnomad_data)\n", + "\n", + "variants_on_genome = lovd_data[\"Variants_On_Genome\"].copy()\n", + "\n", + "lovd_data = pd.merge(lovd_data[\"Variants_On_Transcripts\"],\n", + " variants_on_genome[['id','VariantOnGenome/DNA','VariantOnGenome/DNA/hg38']],\n", + " on='id',\n", + " how='left')\n", + "\n", + "gnomad_data = gnomad_data.copy()\n", + "final_data = merge_gnomad_lovd(lovd_data, gnomad_data)\n", + "final_data" + ], + "id": "d86fa6b925aea085", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The file at ../data/lovd/lovd_data.txt already exists.\n" + ] + }, + { + "data": { + "text/plain": [ + " id transcriptid effectid position_c_start \\\n", + "0 822823 7329 70 632 \n", + "1 822787 7329 70 8391 \n", + "2 822843 7329 70 5608 \n", + "3 822771 7329 70 8206 \n", + "4 \n", + "... ... ... ... ... \n", + "13220 959060 7329 70 9383 \n", + "13221 959064 7329 50 0 \n", + "13222 985494 7329 70 2137 \n", + "13223 986425 7329 90 4361 \n", + "13224 987322 7329 90 9299 \n", + "\n", + " position_c_start_intron position_c_end position_c_end_intron \\\n", + "0 0 632 0 \n", + "1 0 8391 0 \n", + "2 0 5608 0 \n", + "3 0 8206 0 \n", + "4 \n", + "... ... ... ... \n", + "13220 0 9387 0 \n", + "13221 0 0 0 \n", + "13222 20590 3444 -29847 \n", + "13223 0 4362 0 \n", + "13224 0 9302 0 \n", + "\n", + " VariantOnTranscript/DNA VariantOnTranscript/RNA \\\n", + "0 c.632G>A r.(?) 
\n", + "1 c.8391del r.(?) \n", + "2 c.5608C>T r.(?) \n", + "3 c.8206G>C r.(?) \n", + "4 \n", + "... ... ... \n", + "13220 c.9383_9387del r.(?) \n", + "13221 c.-538_862+10652{1}inv r.? \n", + "13222 c.2137+20590_3444-29847del r.? \n", + "13223 c.4361_4362delinsAG r.(?) \n", + "13224 c.9299_9302del r.(?) \n", + "\n", + " VariantOnTranscript/Protein ... Homozygote Count Amish_gnomad \\\n", + "0 p.(Cys211Tyr) ... \n", + "1 p.(Gly2799Valfs*31) ... \n", + "2 p.(Arg1870Trp) ... \n", + "3 p.(Ala2736Pro) ... \n", + "4 ... 0 \n", + "... ... ... ... \n", + "13220 p.(Lys3128ArgfsTer7) ... \n", + "13221 p.? ... \n", + "13222 p.(Val713AspfsTer14) ... \n", + "13223 p.(Ser1454Ter) ... \n", + "13224 p.(Thr3100LysfsTer26) ... \n", + "\n", + " Hemizygote Count Amish_gnomad Allele Count South Asian_gnomad \\\n", + "0 \n", + "1 \n", + "2 \n", + "3 \n", + "4 0 0 \n", + "... ... ... \n", + "13220 \n", + "13221 \n", + "13222 \n", + "13223 \n", + "13224 \n", + "\n", + " Allele Number South Asian_gnomad Homozygote Count South Asian_gnomad \\\n", + "0 \n", + "1 \n", + "2 \n", + "3 \n", + "4 55362 0 \n", + "... ... ... \n", + "13220 \n", + "13221 \n", + "13222 \n", + "13223 \n", + "13224 \n", + "\n", + " Hemizygote Count South Asian_gnomad Allele Count Remaining_gnomad \\\n", + "0 \n", + "1 \n", + "2 \n", + "3 \n", + "4 0 0 \n", + "... ... ... \n", + "13220 \n", + "13221 \n", + "13222 \n", + "13223 \n", + "13224 \n", + "\n", + " Allele Number Remaining_gnomad Homozygote Count Remaining_gnomad \\\n", + "0 \n", + "1 \n", + "2 \n", + "3 \n", + "4 44082 0 \n", + "... ... ... \n", + "13220 \n", + "13221 \n", + "13222 \n", + "13223 \n", + "13224 \n", + "\n", + " Hemizygote Count Remaining_gnomad \n", + "0 \n", + "1 \n", + "2 \n", + "3 \n", + "4 0 \n", + "... ... \n", + "13220 \n", + "13221 \n", + "13222 \n", + "13223 \n", + "13224 \n", + "\n", + "[13225 rows x 86 columns]" + ], + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idtranscriptideffectidposition_c_startposition_c_start_intronposition_c_endposition_c_end_intronVariantOnTranscript/DNAVariantOnTranscript/RNAVariantOnTranscript/Protein...Homozygote Count Amish_gnomadHemizygote Count Amish_gnomadAllele Count South Asian_gnomadAllele Number South Asian_gnomadHomozygote Count South Asian_gnomadHemizygote Count South Asian_gnomadAllele Count Remaining_gnomadAllele Number Remaining_gnomadHomozygote Count Remaining_gnomadHemizygote Count Remaining_gnomad
082282373297063206320c.632G>Ar.(?)p.(Cys211Tyr)...<NA><NA><NA><NA><NA><NA><NA><NA><NA><NA>
18227877329708391083910c.8391delr.(?)p.(Gly2799Valfs*31)...<NA><NA><NA><NA><NA><NA><NA><NA><NA><NA>
28228437329705608056080c.5608C>Tr.(?)p.(Arg1870Trp)...<NA><NA><NA><NA><NA><NA><NA><NA><NA><NA>
38227717329708206082060c.8206G>Cr.(?)p.(Ala2736Pro)...<NA><NA><NA><NA><NA><NA><NA><NA><NA><NA>
4<NA><NA><NA><NA><NA><NA><NA><NA><NA><NA>...000553620004408200
..................................................................
132209590607329709383093870c.9383_9387delr.(?)p.(Lys3128ArgfsTer7)...<NA><NA><NA><NA><NA><NA><NA><NA><NA><NA>
132219590647329500000c.-538_862+10652{1}invr.?p.?...<NA><NA><NA><NA><NA><NA><NA><NA><NA><NA>
132229854947329702137205903444-29847c.2137+20590_3444-29847delr.?p.(Val713AspfsTer14)...<NA><NA><NA><NA><NA><NA><NA><NA><NA><NA>
132239864257329904361043620c.4361_4362delinsAGr.(?)p.(Ser1454Ter)...<NA><NA><NA><NA><NA><NA><NA><NA><NA><NA>
132249873227329909299093020c.9299_9302delr.(?)p.(Thr3100LysfsTer26)...<NA><NA><NA><NA><NA><NA><NA><NA><NA><NA>
\n", + "

13225 rows × 86 columns

\n", + "
" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "execution_count": 1 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2024-09-19T16:22:52.431348Z", + "start_time": "2024-09-19T16:22:43.027091Z" + } + }, + "cell_type": "code", + "source": [ + "from api.data.refactoring import routing_merge\n", + "\n", + "routing_merge()" + ], + "id": "29ecf5e58e3d53e4", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Merged data saved to C:\\Users\\Vlad\\PycharmProjects\\kath\\data/merged_data/\n" + ] + } + ], + "execution_count": 1 + }, + { + "metadata": {}, + "cell_type": "code", + "outputs": [], + "execution_count": null, + "source": "", + "id": "b5eedffd56faee1d" } ], "metadata": {