Merge pull request #50 from Strexas/VLE/KBE-25/mut_pos_standartisation

VLE/KBE-25/mut_pos_standartisation
Strexas · Sep 11, 2024 · 6d856b2 · 6d856b2
2 parents 43db108 + 555e2e7
commit 6d856b2
Show file tree

Hide file tree

Showing 6 changed files with 210 additions and 23 deletions.
diff --git a/.gitignore b/.gitignore
@@ -5,7 +5,6 @@ __pycache__
 
 # specific folders
 data/*
-tests/*
 
 # large gene reference files
 *.fa
diff --git a/api/data/refactoring.py b/api/data/refactoring.py
@@ -2,12 +2,15 @@
 
 import os
 import logging
+import re
 
 import requests
 
 import pandas as pd
 from pandas import DataFrame
 
+from pyliftover import LiftOver
+
 from .constants import LOVD_TABLES_DATA_TYPES, LOVD_PATH, GNOMAD_TABLES_DATA_TYPES, GNOMAD_PATH
 
 
@@ -177,45 +180,111 @@ def from_clinvar_name_to_cdna_position(name):
     return name[start:end]
 
 
-def add_g_position_to_gnomad(gnomad):
+def lovd_fill_hg38(lovd: pd.DataFrame):
+    """
+    Fills missing hg38 values in the LOVD dataframe
+    by converting hg19 values to hg38.
+    New column 'hg38_gnomad_format' is added to store
+    the converted positions in the format '6-position-ref-alt'.
+    :param lovd: pandas DataFrame containing following columns:
+               - 'VariantOnGenome/DNA': hg19 values.
+               - 'VariantOnGenome/DNA/hg38': hg38 values.
+    :return: None: Modifies the input DataFrame in-place by adding
+                'hg38_gnomad_format' column.
     """
-    Create new column 'hg38_gnomAD' from 'gnomAD ID' in the gnomAD dataframe.
 
-    Parameters:
-    gnomad : pd.DataFrame
-        gnomAD dataframe. This function modifies it in-place.
+    if lovd.empty:
+        return
+    lovd.loc[:,'hg38_gnomad_format'] = lovd.loc[:,'VariantOnGenome/DNA/hg38'].replace('', pd.NA)
+    missing_hg38_mask = lovd.loc[:,'hg38_gnomad_format'].isna()
+    lovd.loc[missing_hg38_mask, 'hg38_gnomad_format'] = lovd.loc[missing_hg38_mask, 'VariantOnGenome/DNA'].apply(
+        convert_hg19_if_missing)
+    lovd.loc[:,'hg38_gnomad_format'] = lovd.loc[:,'hg38_gnomad_format'].apply(convert_to_gnomad_gen)
+
+
+def convert_hg19_if_missing(hg19: str, lo = LiftOver('hg19', 'hg38')):
+    """
+    Converts hg19 variant to hg38 if hg38 is missing.
+    :param hg19: a row from the DataFrame.
+    :param lo: converter for genomic data between reference assemblies
+    :return: hg38 value or a conversion of the hg19 value in the format 'g.positionref>alt'.
+    """
+
+    if pd.isna(hg19) or '_' in hg19:
+        return "?"
+
+    match = re.search(r'g\.(\d+)', hg19)
+    if not match:
+        return '?'
+
+    position_str = match.group(1)
+    new_pos = lo.convert_coordinate('chr6', int(position_str))[0][1]
+    return f"g.{new_pos}{hg19[-3:]}"
+
+
+
+def convert_to_gnomad_gen(variant: str):
     """
-    gnomad[['chromosome', 'position', 'ref', 'alt']] = gnomad['gnomAD ID'].str.split('-', expand=True)
-    gnomad['hg38'] = 'g.' + gnomad['position'] + gnomad['ref'] + '>' + gnomad['alt']
-    gnomad.drop(columns=['chromosome', 'position', 'ref', 'alt'], inplace=True)
+    converts a variant string from hg38 format
+    to the format used by gnomAD ('6-position-ref-alt').
+    :param variant: str: the variant in the format 'g.startRef>Alt'.
+    :return: str: variant formatted as '6-position-ref-alt'
+    or '?' if the input contains interval ranges or is invalid.
+    """
+
+    patterns = {
+        'dup': re.compile(r'^g\.(\d+)dup$'),
+        'del': re.compile(r'^g\.(\d+)del$'),
+        'ref_alt': re.compile(r'^g\.(\d+)([A-Z])>([A-Z])$')
+    }
+
+    match = patterns['dup'].match(variant)
+    if match:
+        position = match.group(1)
+        return f"6-{position}-dup"
+
+    match = patterns['del'].match(variant)
+    if match:
+        position = match.group(1)
+        return f"6-{position}-del"
+
+    match = patterns['ref_alt'].match(variant)
+    if match:
+        position = match.group(1)
+        ref = match.group(2)
+        alt = match.group(3)
+        return f"6-{position}-{ref}-{alt}"
+
+    return "?"
 
 
 def merge_gnomad_lovd(lovd, gnomad):
     """
-    merge LOVD and gnomAD dataframes on genomic positions.
+    Merge LOVD and gnomAD dataframes on genomic positions.
 
-    parameters:
+    Parameters:
     lovd : pd.DataFrame
         LOVD dataframe.
     gnomAD : pd.DataFrame
         gnomAD dataframe.
 
-    returns:
+    Returns:
     pd.DataFrame
-        merged dataframe with combined information from LOVD and gnomAD.
+        Merged dataframe with combined information from LOVD and gnomAD.
     """
 
-    add_g_position_to_gnomad(gnomad)
+    lovd_fill_hg38(lovd)
     gnomad.columns = [col + '_gnomad' for col in gnomad.columns]
 
-    main_frame = pd.merge(
+    merged_frame = pd.merge(
         lovd,
         gnomad,
         how="outer",
-        left_on="VariantOnGenome/DNA/hg38",
-        right_on="hg38_gnomad")
+        left_on="hg38_gnomad_format",
+        right_on="gnomAD ID_gnomad"
+    )
 
-    return main_frame
+    return merged_frame
 
 
 def save_lovd_as_vcf(data, save_to="./lovd.vcf"):

diff --git a/api/tools/__init__.py b/api/tools/__init__.py
@@ -4,4 +4,4 @@
 
 from .revel.revel import (
     get_revel_scores
-)
+)
diff --git a/requirements.txt b/requirements.txt
@@ -3,4 +3,5 @@ pandas
 selenium
 spliceai
 tensorflow
-flask
+flask
+pyliftover
diff --git a/tests/pipeline.ipynb b/tests/pipeline.ipynb
@@ -238,10 +238,22 @@
   {
    "metadata": {},
    "cell_type": "code",
-   "source": "",
-   "id": "6f0abfb50bd211a0",
    "outputs": [],
-   "execution_count": null
+   "execution_count": null,
+   "source": [
+    "from api.tools import get_revel_scores\n",
+    "\n",
+    "chromosome = 6\n",
+    "position = 65655758\n",
+    "\n",
+    "lovd_data = pd.merge(lovd_data[\"Variants_On_Transcripts\"],\n",
+    "                       variants_on_genome[['id','VariantOnGenome/DNA','VariantOnGenome/DNA/hg38']],\n",
+    "                       on='id',\n",
+    "                       how='left')\n",
+    "\n",
+    "display(results)"
+   ],
+   "id": "ba435cd29d565f7d"
   }
  ],
  "metadata": {

diff --git a/tests/test_lovd_fill_hg38.py b/tests/test_lovd_fill_hg38.py
@@ -0,0 +1,106 @@
+"""
+Module for testing the `lovd_fill_hg38` function from the `api.data.refactoring` module.
+"""
+import unittest
+import pandas as pd
+from api.data.refactoring import lovd_fill_hg38
+
+
+class TestLOVDFillHg38(unittest.TestCase):
+    """
+        Unit tests for the `lovd_fill_hg38` function.
+    """
+
+    def setUp(self):
+        """Set up any initial data used in multiple tests."""
+        self.df = pd.DataFrame()
+
+    def test_fill_hg38_with_no_missing_values(self):
+        """Test filling hg38 values when no values are missing."""
+        self.df = pd.DataFrame({
+            'VariantOnGenome/DNA': ['g.64430517C>T', 'g.64430535C>G'],
+            'VariantOnGenome/DNA/hg38': ['g.63720621C>T', 'g.63720639C>G']
+        })
+        lovd_fill_hg38(self.df)
+        expected_values = ['6-63720621-C-T', '6-63720639-C-G']
+        self.assertIn('hg38_gnomad_format', self.df.columns,
+                      "Column 'hg38_gnomad_format' should be added.")
+        self.assertListEqual(self.df['hg38_gnomad_format'].tolist(), expected_values)
+
+    def test_fill_hg38_with_missing_values_nan(self):
+        """Test filling hg38 values when they are missing (NaN case)."""
+        self.df = pd.DataFrame({
+            'VariantOnGenome/DNA': ['g.64430518C>T'],
+            'VariantOnGenome/DNA/hg38': [pd.NA]
+        })
+        lovd_fill_hg38(self.df)
+        expected_values = ['6-63720622-C-T']
+        self.assertIn('hg38_gnomad_format', self.df.columns)
+        self.assertListEqual(self.df['hg38_gnomad_format'].tolist(), expected_values)
+
+    def test_fill_hg38_with_missing_values_empty_string(self):
+        """Test filling hg38 values when they are missing (empty string case)."""
+        self.df = pd.DataFrame({
+            'VariantOnGenome/DNA': ['g.64430518C>T'],
+            'VariantOnGenome/DNA/hg38': [""]
+        })
+        lovd_fill_hg38(self.df)
+        expected_values = ['6-63720622-C-T']
+        self.assertIn('hg38_gnomad_format', self.df.columns)
+        self.assertListEqual(self.df['hg38_gnomad_format'].tolist(), expected_values)
+
+    def test_fill_hg38_with_dup_cases(self):
+        """Test filling hg38 values when they have 'dup' postfix."""
+        self.df = pd.DataFrame({
+            'VariantOnGenome/DNA': ['g.64430518dup'],
+            'VariantOnGenome/DNA/hg38': [""]
+        })
+        lovd_fill_hg38(self.df)
+        expected_values = ['6-63720622-dup']
+        self.assertIn('hg38_gnomad_format', self.df.columns)
+        self.assertListEqual(self.df['hg38_gnomad_format'].tolist(), expected_values)
+
+    def test_fill_hg38_with_del_cases(self):
+        """Test filling hg38 values when they have 'del' postfix."""
+        self.df = pd.DataFrame({
+            'VariantOnGenome/DNA': ['g.64430518del'],
+            'VariantOnGenome/DNA/hg38': [""]
+        })
+        lovd_fill_hg38(self.df)
+        expected_values = ['6-63720622-del']
+        self.assertIn('hg38_gnomad_format', self.df.columns)
+        self.assertListEqual(self.df['hg38_gnomad_format'].tolist(), expected_values)
+
+    def test_fill_hg38_with_interval_cases(self):
+        """Test filling hg38 values when they have intervals (e.g., 'del' range)."""
+        self.df = pd.DataFrame({
+            'VariantOnGenome/DNA': ['g.64430540_64430544del'],
+            'VariantOnGenome/DNA/hg38': [""]
+        })
+        lovd_fill_hg38(self.df)
+        expected_values = ["?"]
+        self.assertIn('hg38_gnomad_format', self.df.columns)
+        self.assertListEqual(self.df['hg38_gnomad_format'].tolist(), expected_values)
+
+    def test_fill_hg38_no_variants(self):
+        """Test filling hg38 values when there are no variants in the dataframe."""
+        self.df = pd.DataFrame(columns=['VariantOnGenome/DNA', 'VariantOnGenome/DNA/hg38'])
+        lovd_fill_hg38(self.df)
+        self.assertEqual(self.df.shape[0], 0, "Empty dataframe should not add rows.")
+
+    def test_fill_hg38_NA_variants(self):
+        """Test filling hg38 values when there are pd. NA variants in the dataframe."""
+        self.df = pd.DataFrame({
+            'VariantOnGenome/DNA': [pd.NA],
+            'VariantOnGenome/DNA/hg38': [pd.NA]
+        })
+        lovd_fill_hg38(self.df)
+        expected_values = ['?']
+        self.assertIn('hg38_gnomad_format', self.df.columns,
+                      "Column 'hg38_gnomad_format' should be added.")
+        self.assertListEqual(self.df['hg38_gnomad_format'].tolist(), expected_values)
+
+
+
+if __name__ == '__main__':
+    unittest.main()
-Original file line number
+Diff line change
@@ Expand Up / @@ -5,7 +5,6 @@ __pycache__ @@
     # specific folders
     data/*
-    tests/*
     # large gene reference files
     *.fa