From 437954ab2c5781cffeae7161b3e5b96016045ccc Mon Sep 17 00:00:00 2001
From: Kajus CC <42713684+KajusC@users.noreply.github.com>
Date: Mon, 2 Sep 2024 21:50:20 +0300
Subject: [PATCH] Refactor request_gnomad_api_data and resolve PR comments

---
 api/data/refactoring.py |  56 ++++-----
 tests/pipeline.ipynb    | 253 +++++++++++++++++++++++++++++++++++-----
 2 files changed, 256 insertions(+), 53 deletions(-)

diff --git a/api/data/refactoring.py b/api/data/refactoring.py
index f2fd6cd..0c32241 100644
--- a/api/data/refactoring.py
+++ b/api/data/refactoring.py
@@ -316,37 +316,38 @@ def request_gnomad_api_data(gene_name):
     }}
     """
 
-    response = requests.post(url, json={'query': query}, timeout=300)# timeout set to 5 minutes
+    response = requests.post(url, json={'query': query}, timeout=300)  # timeout set to 5 minutes
 
     if response.status_code != 200:
         print('Error:', response.status_code)
         return None
 
     data = response.json()['data']['gene']['variants']
 
     df = pd.json_normalize(data)
 
-    df['total_ac'] = df['exome.ac'].fillna(0) + df['genome.ac'].fillna(0)
-    df['total_an'] = df['exome.an'].fillna(0) + df['genome.an'].fillna(0)
+    df.loc[:, 'total_ac'] = df.loc[:, 'exome.ac'].fillna(0) + df.loc[:, 'genome.ac'].fillna(0)
+    df.loc[:, 'total_an'] = df.loc[:, 'exome.an'].fillna(0) + df.loc[:, 'genome.an'].fillna(0)
 
-    df['HGVS Consequence'] = df['hgvsc'].fillna(0) # cDNA change
-    df['Protein Consequence'] = df['hgvsp'].fillna(0) # Protein change
+    df.loc[:, 'HGVS Consequence'] = df.loc[:, 'hgvsc'].fillna(0)  # cDNA change
+    df.loc[:, 'Protein Consequence'] = df.loc[:, 'hgvsp'].fillna(0)  # Protein change
 
-    df['Allele Frequency'] = df['total_ac'] / df['total_an']
-    df['Homozygote Count'] = df['exome.ac_hom'].fillna(0) + df['genome.ac_hom'].fillna(0)
-    exome_populations = df['exome.populations']
-    genome_populations = df['genome.populations']
-    ids = ['afr', 'eas', 'asj', 'sas', 'nfe', 'fin', 'mid', 'amr', 'ami', 'remaining']
+    df.loc[:, 'Allele Frequency'] = df.loc[:, 'total_ac'] / df.loc[:, 'total_an']
+    df.loc[:, 'Homozygote Count'] = df.loc[:, 'exome.ac_hom'].fillna(0) + df.loc[:, 'genome.ac_hom'].fillna(0)
+    exome_populations = df.loc[:, 'exome.populations']
+    genome_populations = df.loc[:, 'genome.populations']
+    population_ids = ['afr', 'eas', 'asj', 'sas', 'nfe', 'fin', 'mid', 'amr', 'ami', 'remaining']
 
     for i in range(len(exome_populations)):
         exome_pop = exome_populations[i]
-        process_population_data(df, exome_pop, 'exome', ids, i)
+        process_population_data(df, exome_pop, 'exome', population_ids, i)
         genome_pop = genome_populations[i]
-        process_population_data(df, genome_pop, 'genome', ids, i)
+        process_population_data(df, genome_pop, 'genome', population_ids, i)
 
-    for variant_id in ids:
-        df[f'Allele_Frequency_{variant_id}'] = (df[f'exome_ac_{variant_id}'].fillna(0) + df[f'genome_ac_{variant_id}'].fillna(0)) / (
-                        df[f'exome_an_{variant_id}'].fillna(0) + df[f'genome_an_{variant_id}'].fillna(0))
+    for population_id in population_ids:
+        df.loc[:, f'Allele_Frequency_{population_id}'] = (
+               (df.loc[:, f'exome_ac_{population_id}'].fillna(0) + df.loc[:, f'genome_ac_{population_id}'].fillna(0)) / (
+                df.loc[:, f'exome_an_{population_id}'].fillna(0) + df.loc[:, f'genome_an_{population_id}'].fillna(0)))
     population_mapping = {
             'afr': 'African/African American',
             'eas': 'East Asian',
@@ -360,19 +360,21 @@ def request_gnomad_api_data(gene_name):
             'remaining': 'Remaining',
             '': ''
         }
-    for i in range(len(df)):
+
+    for i in range(df.shape[0]):
         max_pop = 0
-        maxid = ''
-        for variant_id in ids:
-            if df.loc[i, f'Allele_Frequency_{variant_id}'] > max_pop:
-                max_pop = df.loc[i, f'Allele_Frequency_{variant_id}']
-                maxid = variant_id
+        max_id = ''
+        for population_id in population_ids:
+            if df.loc[i, f'Allele_Frequency_{population_id}'] > max_pop:
+                max_pop = df.loc[i, f'Allele_Frequency_{population_id}']
+                max_id = population_id
         df.loc[i, 'Popmax'] = max_pop
-        df.loc[i, 'Popmax population'] = population_mapping[maxid]
-    not_to_drop = ['Popmax', 'Popmax population', 'Homozygote Count', 'Allele Frequency', 'variant_id',
-                       'cDNA change', 'Protein change']
-    df = df.drop([col for col in df.columns if col not in not_to_drop], axis=1)
+        df.loc[i, 'Popmax population'] = population_mapping[max_id]
+    not_to_drop = ['Popmax', 'Popmax population', 'Homozygote Count', 'Allele Frequency',
+                   'variant_id', 'cDNA change', 'Protein change']
+    # NOTE(review): 'cDNA change'/'Protein change' seem absent from df; filter() skips missing labels
+    df = df.filter(not_to_drop, axis="columns")
 
-    df.rename(columns={'variant_id': 'gnomAD ID'}, inplace=True)
+    df = df.rename(columns={'variant_id': 'gnomAD ID'})
 
     return df
diff --git a/tests/pipeline.ipynb b/tests/pipeline.ipynb
index 6734e80..45c74af 100644
--- a/tests/pipeline.ipynb
+++ b/tests/pipeline.ipynb
@@ -7,6 +7,10 @@
     "collapsed": true,
     "jupyter": {
      "outputs_hidden": true
+    },
+    "ExecuteTime": {
+     "end_time": "2024-09-02T18:45:02.492330Z",
+     "start_time": "2024-09-02T18:45:02.488185Z"
     }
    },
    "source": [
@@ -29,7 +33,7 @@
     "pd.options.display.max_columns = 0"
    ],
    "outputs": [],
-   "execution_count": null
+   "execution_count": 11
   },
   {
    "cell_type": "code",
@@ -59,7 +63,11 @@
    "execution_count": null
   },
   {
-   "metadata": {},
+   "metadata": {
+    "ExecuteTime": {
+     "start_time": "2024-09-02T18:42:20.091398Z"
+    }
+   },
    "cell_type": "code",
    "source": [
     "gnomad_data = request_gnomad_api_data(\"EYS\")\n",
@@ -73,8 +81,7 @@
   {
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2024-08-28T18:11:25.802540Z",
-     "start_time": "2024-08-28T18:11:25.715039Z"
+     "start_time": "2024-09-02T18:44:44.422287Z"
     }
    },
    "cell_type": "code",
@@ -85,10 +92,14 @@
    ],
    "id": "60f3f3074a9b19f4",
    "outputs": [],
-   "execution_count": 24
+   "execution_count": null
   },
   {
-   "metadata": {},
+   "metadata": {
+    "ExecuteTime": {
+     "start_time": "2024-09-02T18:44:44.497881Z"
+    }
+   },
    "cell_type": "code",
    "source": "display(gnomad_data_2)",
    "id": "9d3e4d6b5f7be127",
@@ -98,8 +109,7 @@
   {
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2024-08-28T18:11:35.536411Z",
-     "start_time": "2024-08-28T18:11:35.258009Z"
+     "start_time": "2024-09-02T18:44:44.546361Z"
     }
    },
    "cell_type": "code",
@@ -109,10 +119,14 @@
    ],
    "id": "2e869f5c77dbe3d3",
    "outputs": [],
-   "execution_count": 26
+   "execution_count": null
   },
   {
-   "metadata": {},
+   "metadata": {
+    "ExecuteTime": {
+     "start_time": "2024-09-02T18:44:44.806484Z"
+    }
+   },
    "cell_type": "code",
    "source": [
     "len(gnomad_data_2), len(gnomad_data)\n",
@@ -124,21 +138,180 @@
    "execution_count": null
   },
   {
-   "metadata": {},
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-09-02T18:45:06.035450Z",
+     "start_time": "2024-09-02T18:45:06.022832Z"
+    }
+   },
+   "cell_type": "code",
+   "source": "gnomad_data",
+   "id": "96283480cccf641",
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "         Popmax         Popmax population  ...  Allele Frequency      variant_id\n",
+       "0      0.000016  African/African American  ...      1.807419e-06  6-63720525-A-G\n",
+       "1      0.000192                East Asian  ...      6.573844e-06  6-63720525-A-T\n",
+       "2      0.000000                            ...      0.000000e+00  6-63720525-A-C\n",
+       "3      0.000020               South Asian  ...      1.045299e-06  6-63720526-T-A\n",
+       "4      0.000000                            ...      0.000000e+00  6-63720527-G-T\n",
+       "...         ...                       ...  ...               ...             ...\n",
+       "14295  0.000000                            ...      0.000000e+00  6-65495479-G-T\n",
+       "14296  0.000031  African/African American  ...      1.446349e-06  6-65495479-G-A\n",
+       "14297  0.000070          Admixed American  ...      2.629510e-06  6-65495482-A-G\n",
+       "14298  0.000060               South Asian  ...      3.645085e-06  6-65495484-T-G\n",
+       "14299  0.000012               South Asian  ...      7.310070e-07  6-65495485-T-C\n",
+       "\n",
+       "[14300 rows x 5 columns]"
+      ],
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Popmax</th>\n",
+       "      <th>Popmax population</th>\n",
+       "      <th>Homozygote Count</th>\n",
+       "      <th>Allele Frequency</th>\n",
+       "      <th>variant_id</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>0.000016</td>\n",
+       "      <td>African/African American</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>1.807419e-06</td>\n",
+       "      <td>6-63720525-A-G</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>0.000192</td>\n",
+       "      <td>East Asian</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>6.573844e-06</td>\n",
+       "      <td>6-63720525-A-T</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>0.000000</td>\n",
+       "      <td></td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.000000e+00</td>\n",
+       "      <td>6-63720525-A-C</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>0.000020</td>\n",
+       "      <td>South Asian</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>1.045299e-06</td>\n",
+       "      <td>6-63720526-T-A</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>0.000000</td>\n",
+       "      <td></td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.000000e+00</td>\n",
+       "      <td>6-63720527-G-T</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>14295</th>\n",
+       "      <td>0.000000</td>\n",
+       "      <td></td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.000000e+00</td>\n",
+       "      <td>6-65495479-G-T</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>14296</th>\n",
+       "      <td>0.000031</td>\n",
+       "      <td>African/African American</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>1.446349e-06</td>\n",
+       "      <td>6-65495479-G-A</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>14297</th>\n",
+       "      <td>0.000070</td>\n",
+       "      <td>Admixed American</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>2.629510e-06</td>\n",
+       "      <td>6-65495482-A-G</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>14298</th>\n",
+       "      <td>0.000060</td>\n",
+       "      <td>South Asian</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>3.645085e-06</td>\n",
+       "      <td>6-65495484-T-G</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>14299</th>\n",
+       "      <td>0.000012</td>\n",
+       "      <td>South Asian</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>7.310070e-07</td>\n",
+       "      <td>6-65495485-T-C</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>14300 rows × 5 columns</p>\n",
+       "</div>"
+      ]
+     },
+     "execution_count": 12,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "execution_count": 12
+  },
+  {
+   "metadata": {
+    "ExecuteTime": {
+     "start_time": "2024-09-02T18:44:44.827926Z"
+    }
+   },
    "cell_type": "code",
    "source": [
-    "gnomad_data_2.rename(columns={'gnomAD ID': 'variant_id'}, inplace=True)\n",
-    "\n",
     "missing_from_api = []\n",
     "\n",
-    "for i in gnomad_data['variant_id']:\n",
-    "    if(i in gnomad_data_2['variant_id'].values):\n",
+    "for i in gnomad_data['gnomAD ID']:\n",
+    "    if(i in gnomad_data_2['gnomAD ID'].values):\n",
     "        continue\n",
     "    missing_from_api.append(i)\n",
     "\n",
     "len(missing_from_api)\n",
     "\n",
-    "missing_data = gnomad_data.loc[gnomad_data['variant_id'].isin(missing_from_api)]\n",
+    "missing_data = gnomad_data.loc[gnomad_data['gnomAD ID'].isin(missing_from_api)]\n",
     "\n",
     "missing_data"
    ],
@@ -149,18 +322,21 @@
   {
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2024-08-28T18:06:31.488622Z",
-     "start_time": "2024-08-28T18:06:31.471299Z"
+     "start_time": "2024-09-02T18:44:45.626358Z"
     }
    },
    "cell_type": "code",
    "source": "missing_data.to_csv('C:\\\\Users\\\\Kajus\\\\Desktop\\\\gnomad_data_missing.csv', index=False)",
    "id": "388120b03b094511",
    "outputs": [],
-   "execution_count": 23
+   "execution_count": null
   },
   {
-   "metadata": {},
+   "metadata": {
+    "ExecuteTime": {
+     "start_time": "2024-09-02T18:44:45.626358Z"
+    }
+   },
    "cell_type": "code",
    "source": [
     "set_lovd_dtypes(data)\n",
@@ -182,7 +358,11 @@
    "execution_count": null
   },
   {
-   "metadata": {},
+   "metadata": {
+    "ExecuteTime": {
+     "start_time": "2024-09-02T18:44:45.627863Z"
+    }
+   },
    "cell_type": "code",
    "source": [
     "for i in data:\n",
@@ -194,7 +374,11 @@
    "execution_count": null
   },
   {
-   "metadata": {},
+   "metadata": {
+    "ExecuteTime": {
+     "start_time": "2024-09-02T18:44:45.628871Z"
+    }
+   },
    "cell_type": "code",
    "source": [
     "set_lovd_dtypes(data)\n",
@@ -207,7 +391,12 @@
    "execution_count": null
   },
   {
-   "metadata": {},
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-09-02T18:44:45.646110Z",
+     "start_time": "2024-09-02T18:44:45.629871Z"
+    }
+   },
    "cell_type": "code",
    "source": "save_lovd_as_vcf(data[\"Variants_On_Genome\"], \"./lovd.vcf\")",
    "id": "c968af1617be40db",
@@ -215,7 +404,11 @@
    "execution_count": null
   },
   {
-   "metadata": {},
+   "metadata": {
+    "ExecuteTime": {
+     "start_time": "2024-09-02T18:44:45.630870Z"
+    }
+   },
    "cell_type": "code",
    "source": [
     "from subprocess import Popen\n",
@@ -228,7 +421,11 @@
    "execution_count": null
   },
   {
-   "metadata": {},
+   "metadata": {
+    "ExecuteTime": {
+     "start_time": "2024-09-02T18:44:45.631870Z"
+    }
+   },
    "cell_type": "code",
    "source": [
     "from api.tools import get_revel_scores\n",
@@ -245,7 +442,11 @@
    "execution_count": null
   },
   {
-   "metadata": {},
+   "metadata": {
+    "ExecuteTime": {
+     "start_time": "2024-09-02T18:44:45.631870Z"
+    }
+   },
    "cell_type": "code",
    "source": "",
    "id": "6f0abfb50bd211a0",