From 437954ab2c5781cffeae7161b3e5b96016045ccc Mon Sep 17 00:00:00 2001 From: Kajus CC <42713684+KajusC@users.noreply.github.com> Date: Mon, 2 Sep 2024 21:50:20 +0300 Subject: [PATCH] Refactored and resolved PR comments --- api/data/refactoring.py | 56 ++++----- tests/pipeline.ipynb | 253 +++++++++++++++++++++++++++++++++++----- 2 files changed, 256 insertions(+), 53 deletions(-) diff --git a/api/data/refactoring.py b/api/data/refactoring.py index f2fd6cd..0c32241 100644 --- a/api/data/refactoring.py +++ b/api/data/refactoring.py @@ -316,37 +316,37 @@ def request_gnomad_api_data(gene_name): }} """ - response = requests.post(url, json={'query': query}, timeout=300)# timeout set to 5 minutes + response = requests.post(url, json={'query': query}, timeout=300) # timeout set to 5 minutes if response.status_code != 200: print('Error:', response.status_code) - return None data = response.json()['data']['gene']['variants'] df = pd.json_normalize(data) - df['total_ac'] = df['exome.ac'].fillna(0) + df['genome.ac'].fillna(0) - df['total_an'] = df['exome.an'].fillna(0) + df['genome.an'].fillna(0) + df.loc[:, 'total_ac'] = df.loc[:, 'exome.ac'].fillna(0) + df.loc[:, 'genome.ac'].fillna(0) + df.loc[:, 'total_an'] = df.loc[:, 'exome.an'].fillna(0) + df.loc[:, 'genome.an'].fillna(0) - df['HGVS Consequence'] = df['hgvsc'].fillna(0) # cDNA change - df['Protein Consequence'] = df['hgvsp'].fillna(0) # Protein change + df.loc[:, 'HGVS Consequence'] = df.loc[:, 'hgvsc'].fillna(0) # cDNA change + df.loc[:, 'Protein Consequence'] = df.loc[:, 'hgvsp'].fillna(0) # Protein change - df['Allele Frequency'] = df['total_ac'] / df['total_an'] - df['Homozygote Count'] = df['exome.ac_hom'].fillna(0) + df['genome.ac_hom'].fillna(0) - exome_populations = df['exome.populations'] - genome_populations = df['genome.populations'] - ids = ['afr', 'eas', 'asj', 'sas', 'nfe', 'fin', 'mid', 'amr', 'ami', 'remaining'] + df.loc[:, 'Allele Frequency'] = df.loc[:, 'total_ac'] / df.loc[:, 'total_an'] + df.loc[:, 'Homozygote Count'] = df.loc[:, 'exome.ac_hom'].fillna(0) + df.loc[:, 'genome.ac_hom'].fillna(0) + exome_populations = df.loc[:, 'exome.populations'] + genome_populations = df.loc[:, 'genome.populations'] + population_ids = ['afr', 'eas', 'asj', 'sas', 'nfe', 'fin', 'mid', 'amr', 'ami', 'remaining'] for i in range(len(exome_populations)): exome_pop = exome_populations[i] - process_population_data(df, exome_pop, 'exome', ids, i) + process_population_data(df, exome_pop, 'exome', population_ids, i) genome_pop = genome_populations[i] - process_population_data(df, genome_pop, 'genome', ids, i) + process_population_data(df, genome_pop, 'genome', population_ids, i) - for variant_id in ids: - df[f'Allele_Frequency_{variant_id}'] = (df[f'exome_ac_{variant_id}'].fillna(0) + df[f'genome_ac_{variant_id}'].fillna(0)) / ( - df[f'exome_an_{variant_id}'].fillna(0) + df[f'genome_an_{variant_id}'].fillna(0)) + for population_id in population_ids: + df.loc[:, f'Allele_Frequency_{population_id}'] = ( + (df.loc[:, f'exome_ac_{population_id}'].fillna(0) + df.loc[:, f'genome_ac_{population_id}'].fillna(0)) / ( + df.loc[:, f'exome_an_{population_id}'].fillna(0) + df.loc[:, f'genome_an_{population_id}'].fillna(0))) population_mapping = { 'afr': 'African/African American', 'eas': 'East Asian', @@ -360,19 +360,21 @@ def request_gnomad_api_data(gene_name): 'remaining': 'Remaining', '': '' } - for i in range(len(df)): + + for i in range(df.shape[0]): max_pop = 0 - maxid = '' - for variant_id in ids: - if df.loc[i, f'Allele_Frequency_{variant_id}'] > max_pop: - max_pop = df.loc[i, f'Allele_Frequency_{variant_id}'] - maxid = variant_id + max_id = '' + for population_id in population_ids: + if df.loc[i, f'Allele_Frequency_{population_id}'] > max_pop: + max_pop = df.loc[i, f'Allele_Frequency_{population_id}'] + max_id = population_id df.loc[i, 'Popmax'] = max_pop - df.loc[i, 'Popmax population'] = population_mapping[maxid] - not_to_drop = ['Popmax', 'Popmax population', 'Homozygote Count', 'Allele Frequency', 'variant_id', - 'cDNA change', 'Protein change'] - df = df.drop([col for col in df.columns if col not in not_to_drop], axis=1) + df.loc[i, 'Popmax population'] = population_mapping[max_id] + not_to_drop = ['Popmax', 'Popmax population', 'Homozygote Count', 'Allele Frequency', + 'variant_id', 'cDNA change', 'Protein change'] + + df = df.filter(not_to_drop, axis="columns") - df.rename(columns={'variant_id': 'gnomAD ID'}, inplace=True) + df.rename(columns={'variant_id': 'gnomAD ID'}) return df diff --git a/tests/pipeline.ipynb b/tests/pipeline.ipynb index 6734e80..45c74af 100644 --- a/tests/pipeline.ipynb +++ b/tests/pipeline.ipynb @@ -7,6 +7,10 @@ "collapsed": true, "jupyter": { "outputs_hidden": true + }, + "ExecuteTime": { + "end_time": "2024-09-02T18:45:02.492330Z", + "start_time": "2024-09-02T18:45:02.488185Z" } }, "source": [ @@ -29,7 +33,7 @@ "pd.options.display.max_columns = 0" ], "outputs": [], - "execution_count": null + "execution_count": 11 }, { "cell_type": "code", @@ -59,7 +63,11 @@ "execution_count": null }, { - "metadata": {}, + "metadata": { + "ExecuteTime": { + "start_time": "2024-09-02T18:42:20.091398Z" + } + }, "cell_type": "code", "source": [ "gnomad_data = request_gnomad_api_data(\"EYS\")\n", @@ -73,8 +81,7 @@ { "metadata": { "ExecuteTime": { - "end_time": "2024-08-28T18:11:25.802540Z", - "start_time": "2024-08-28T18:11:25.715039Z" + "start_time": "2024-09-02T18:44:44.422287Z" } }, "cell_type": "code", @@ -85,10 +92,14 @@ ], "id": "60f3f3074a9b19f4", "outputs": [], - "execution_count": 24 + "execution_count": null }, { - "metadata": {}, + "metadata": { + "ExecuteTime": { + "start_time": "2024-09-02T18:44:44.497881Z" + } + }, "cell_type": "code", "source": "display(gnomad_data_2)", "id": "9d3e4d6b5f7be127", @@ -98,8 +109,7 @@ { "metadata": { "ExecuteTime": { - "end_time": "2024-08-28T18:11:35.536411Z", - "start_time": "2024-08-28T18:11:35.258009Z" + "start_time": "2024-09-02T18:44:44.546361Z" } }, "cell_type": "code", @@ -109,10 +119,14 @@ ], "id": "2e869f5c77dbe3d3", "outputs": [], - "execution_count": 26 + "execution_count": null }, { - "metadata": {}, + "metadata": { + "ExecuteTime": { + "start_time": "2024-09-02T18:44:44.806484Z" + } + }, "cell_type": "code", "source": [ "len(gnomad_data_2), len(gnomad_data)\n", @@ -124,21 +138,180 @@ "execution_count": null }, { - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2024-09-02T18:45:06.035450Z", + "start_time": "2024-09-02T18:45:06.022832Z" + } + }, + "cell_type": "code", + "source": "gnomad_data", + "id": "96283480cccf641", + "outputs": [ + { + "data": { + "text/plain": [ + " Popmax Popmax population ... Allele Frequency variant_id\n", + "0 0.000016 African/African American ... 1.807419e-06 6-63720525-A-G\n", + "1 0.000192 East Asian ... 6.573844e-06 6-63720525-A-T\n", + "2 0.000000 ... 0.000000e+00 6-63720525-A-C\n", + "3 0.000020 South Asian ... 1.045299e-06 6-63720526-T-A\n", + "4 0.000000 ... 0.000000e+00 6-63720527-G-T\n", + "... ... ... ... ... ...\n", + "14295 0.000000 ... 0.000000e+00 6-65495479-G-T\n", + "14296 0.000031 African/African American ... 1.446349e-06 6-65495479-G-A\n", + "14297 0.000070 Admixed American ... 2.629510e-06 6-65495482-A-G\n", + "14298 0.000060 South Asian ... 3.645085e-06 6-65495484-T-G\n", + "14299 0.000012 South Asian ... 7.310070e-07 6-65495485-T-C\n", + "\n", + "[14300 rows x 5 columns]" + ], + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
PopmaxPopmax populationHomozygote CountAllele Frequencyvariant_id
00.000016African/African American0.01.807419e-066-63720525-A-G
10.000192East Asian0.06.573844e-066-63720525-A-T
20.0000000.00.000000e+006-63720525-A-C
30.000020South Asian0.01.045299e-066-63720526-T-A
40.0000000.00.000000e+006-63720527-G-T
..................
142950.0000000.00.000000e+006-65495479-G-T
142960.000031African/African American0.01.446349e-066-65495479-G-A
142970.000070Admixed American0.02.629510e-066-65495482-A-G
142980.000060South Asian0.03.645085e-066-65495484-T-G
142990.000012South Asian0.07.310070e-076-65495485-T-C
\n", + "

14300 rows × 5 columns

\n", + "
" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "execution_count": 12 + }, + { + "metadata": { + "ExecuteTime": { + "start_time": "2024-09-02T18:44:44.827926Z" + } + }, "cell_type": "code", "source": [ - "gnomad_data_2.rename(columns={'gnomAD ID': 'variant_id'}, inplace=True)\n", - "\n", "missing_from_api = []\n", "\n", - "for i in gnomad_data['variant_id']:\n", - " if(i in gnomad_data_2['variant_id'].values):\n", + "for i in gnomad_data['gnomAD ID']:\n", + " if(i in gnomad_data_2['gnomAD ID'].values):\n", " continue\n", " missing_from_api.append(i)\n", "\n", "len(missing_from_api)\n", "\n", - "missing_data = gnomad_data.loc[gnomad_data['variant_id'].isin(missing_from_api)]\n", + "missing_data = gnomad_data.loc[gnomad_data['gnomAD ID'].isin(missing_from_api)]\n", "\n", "missing_data" ], @@ -149,18 +322,21 @@ { "metadata": { "ExecuteTime": { - "end_time": "2024-08-28T18:06:31.488622Z", - "start_time": "2024-08-28T18:06:31.471299Z" + "start_time": "2024-09-02T18:44:45.626358Z" } }, "cell_type": "code", "source": "missing_data.to_csv('C:\\\\Users\\\\Kajus\\\\Desktop\\\\gnomad_data_missing.csv', index=False)", "id": "388120b03b094511", "outputs": [], - "execution_count": 23 + "execution_count": null }, { - "metadata": {}, + "metadata": { + "ExecuteTime": { + "start_time": "2024-09-02T18:44:45.626358Z" + } + }, "cell_type": "code", "source": [ "set_lovd_dtypes(data)\n", @@ -182,7 +358,11 @@ "execution_count": null }, { - "metadata": {}, + "metadata": { + "ExecuteTime": { + "start_time": "2024-09-02T18:44:45.627863Z" + } + }, "cell_type": "code", "source": [ "for i in data:\n", @@ -194,7 +374,11 @@ "execution_count": null }, { - "metadata": {}, + "metadata": { + "ExecuteTime": { + "start_time": "2024-09-02T18:44:45.628871Z" + } + }, "cell_type": "code", "source": [ "set_lovd_dtypes(data)\n", @@ -207,7 +391,12 @@ "execution_count": null }, { - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2024-09-02T18:44:45.646110Z", + "start_time": "2024-09-02T18:44:45.629871Z" + } + }, "cell_type": "code", "source": "save_lovd_as_vcf(data[\"Variants_On_Genome\"], \"./lovd.vcf\")", "id": "c968af1617be40db", @@ -215,7 +404,11 @@ "execution_count": null }, { - "metadata": {}, + "metadata": { + "ExecuteTime": { + "start_time": "2024-09-02T18:44:45.630870Z" + } + }, "cell_type": "code", "source": [ "from subprocess import Popen\n", @@ -228,7 +421,11 @@ "execution_count": null }, { - "metadata": {}, + "metadata": { + "ExecuteTime": { + "start_time": "2024-09-02T18:44:45.631870Z" + } + }, "cell_type": "code", "source": [ "from api.tools import get_revel_scores\n", @@ -245,7 +442,11 @@ "execution_count": null }, { - "metadata": {}, + "metadata": { + "ExecuteTime": { + "start_time": "2024-09-02T18:44:45.631870Z" + } + }, "cell_type": "code", "source": "", "id": "6f0abfb50bd211a0",