diff --git a/.gitignore b/.gitignore index fc2dc10..17a5b3c 100644 --- a/.gitignore +++ b/.gitignore @@ -5,7 +5,6 @@ __pycache__ # specific folders data/* -tests/* # large gene reference files *.fa \ No newline at end of file diff --git a/api/data/refactoring.py b/api/data/refactoring.py index 51f9a4c..369c643 100644 --- a/api/data/refactoring.py +++ b/api/data/refactoring.py @@ -2,12 +2,15 @@ import os import logging +import re import requests import pandas as pd from pandas import DataFrame +from pyliftover import LiftOver + from .constants import LOVD_TABLES_DATA_TYPES, LOVD_PATH, GNOMAD_TABLES_DATA_TYPES, GNOMAD_PATH @@ -177,45 +180,111 @@ def from_clinvar_name_to_cdna_position(name): return name[start:end] -def add_g_position_to_gnomad(gnomad): +def lovd_fill_hg38(lovd: pd.DataFrame): + """ + Fills missing hg38 values in the LOVD dataframe + by converting hg19 values to hg38. + New column 'hg38_gnomad_format' is added to store + the converted positions in the format '6-position-ref-alt'. + :param lovd: pandas DataFrame containing following columns: + - 'VariantOnGenome/DNA': hg19 values. + - 'VariantOnGenome/DNA/hg38': hg38 values. + :return: None: Modifies the input DataFrame in-place by adding + 'hg38_gnomad_format' column. """ - Create new column 'hg38_gnomAD' from 'gnomAD ID' in the gnomAD dataframe. - Parameters: - gnomad : pd.DataFrame - gnomAD dataframe. This function modifies it in-place. + if lovd.empty: + return + lovd.loc[:,'hg38_gnomad_format'] = lovd.loc[:,'VariantOnGenome/DNA/hg38'].replace('', pd.NA) + missing_hg38_mask = lovd.loc[:,'hg38_gnomad_format'].isna() + lovd.loc[missing_hg38_mask, 'hg38_gnomad_format'] = lovd.loc[missing_hg38_mask, 'VariantOnGenome/DNA'].apply( + convert_hg19_if_missing) + lovd.loc[:,'hg38_gnomad_format'] = lovd.loc[:,'hg38_gnomad_format'].apply(convert_to_gnomad_gen) + + +def convert_hg19_if_missing(hg19: str, lo = LiftOver('hg19', 'hg38')): + """ + Converts hg19 variant to hg38 if hg38 is missing. + :param hg19: a row from the DataFrame. + :param lo: converter for genomic data between reference assemblies + :return: hg38 value or a conversion of the hg19 value in the format 'g.positionref>alt'. + """ + + if pd.isna(hg19) or '_' in hg19: + return "?" + + match = re.search(r'g\.(\d+)', hg19) + if not match: + return '?' + + position_str = match.group(1) + new_pos = lo.convert_coordinate('chr6', int(position_str))[0][1] + return f"g.{new_pos}{hg19[-3:]}" + + + +def convert_to_gnomad_gen(variant: str): """ - gnomad[['chromosome', 'position', 'ref', 'alt']] = gnomad['gnomAD ID'].str.split('-', expand=True) - gnomad['hg38'] = 'g.' + gnomad['position'] + gnomad['ref'] + '>' + gnomad['alt'] - gnomad.drop(columns=['chromosome', 'position', 'ref', 'alt'], inplace=True) + converts a variant string from hg38 format + to the format used by gnomAD ('6-position-ref-alt'). + :param variant: str: the variant in the format 'g.startRef>Alt'. + :return: str: variant formatted as '6-position-ref-alt' + or '?' if the input contains interval ranges or is invalid. + """ + + patterns = { + 'dup': re.compile(r'^g\.(\d+)dup$'), + 'del': re.compile(r'^g\.(\d+)del$'), + 'ref_alt': re.compile(r'^g\.(\d+)([A-Z])>([A-Z])$') + } + + match = patterns['dup'].match(variant) + if match: + position = match.group(1) + return f"6-{position}-dup" + + match = patterns['del'].match(variant) + if match: + position = match.group(1) + return f"6-{position}-del" + + match = patterns['ref_alt'].match(variant) + if match: + position = match.group(1) + ref = match.group(2) + alt = match.group(3) + return f"6-{position}-{ref}-{alt}" + + return "?" def merge_gnomad_lovd(lovd, gnomad): """ - merge LOVD and gnomAD dataframes on genomic positions. + Merge LOVD and gnomAD dataframes on genomic positions. - parameters: + Parameters: lovd : pd.DataFrame LOVD dataframe. gnomAD : pd.DataFrame gnomAD dataframe. - returns: + Returns: pd.DataFrame - merged dataframe with combined information from LOVD and gnomAD. + Merged dataframe with combined information from LOVD and gnomAD. """ - add_g_position_to_gnomad(gnomad) + lovd_fill_hg38(lovd) gnomad.columns = [col + '_gnomad' for col in gnomad.columns] - main_frame = pd.merge( + merged_frame = pd.merge( lovd, gnomad, how="outer", - left_on="VariantOnGenome/DNA/hg38", - right_on="hg38_gnomad") + left_on="hg38_gnomad_format", + right_on="gnomAD ID_gnomad" + ) - return main_frame + return merged_frame def save_lovd_as_vcf(data, save_to="./lovd.vcf"): diff --git a/api/tools/__init__.py b/api/tools/__init__.py index e1023e4..f8d75a8 100644 --- a/api/tools/__init__.py +++ b/api/tools/__init__.py @@ -4,4 +4,4 @@ from .revel.revel import ( get_revel_scores -) \ No newline at end of file +) diff --git a/requirements.txt b/requirements.txt index 0cae80e..99a5bc2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,4 +3,5 @@ pandas selenium spliceai tensorflow -flask \ No newline at end of file +flask +pyliftover \ No newline at end of file diff --git a/tests/pipeline.ipynb b/tests/pipeline.ipynb index 71cf21d..3275f3f 100644 --- a/tests/pipeline.ipynb +++ b/tests/pipeline.ipynb @@ -238,10 +238,22 @@ { "metadata": {}, "cell_type": "code", - "source": "", - "id": "6f0abfb50bd211a0", "outputs": [], - "execution_count": null + "execution_count": null, + "source": [ + "from api.tools import get_revel_scores\n", + "\n", + "chromosome = 6\n", + "position = 65655758\n", + "\n", + "lovd_data = pd.merge(lovd_data[\"Variants_On_Transcripts\"],\n", + " variants_on_genome[['id','VariantOnGenome/DNA','VariantOnGenome/DNA/hg38']],\n", + " on='id',\n", + " how='left')\n", + "\n", + "display(results)" + ], + "id": "ba435cd29d565f7d" } ], "metadata": { diff --git a/tests/test_lovd_fill_hg38.py b/tests/test_lovd_fill_hg38.py new file mode 100644 index 0000000..d8cf37e --- /dev/null +++ b/tests/test_lovd_fill_hg38.py @@ -0,0 +1,106 @@ +""" +Module for testing the `lovd_fill_hg38` function from the `api.data.refactoring` module. +""" +import unittest +import pandas as pd +from api.data.refactoring import lovd_fill_hg38 + + +class TestLOVDFillHg38(unittest.TestCase): + """ + Unit tests for the `lovd_fill_hg38` function. + """ + + def setUp(self): + """Set up any initial data used in multiple tests.""" + self.df = pd.DataFrame() + + def test_fill_hg38_with_no_missing_values(self): + """Test filling hg38 values when no values are missing.""" + self.df = pd.DataFrame({ + 'VariantOnGenome/DNA': ['g.64430517C>T', 'g.64430535C>G'], + 'VariantOnGenome/DNA/hg38': ['g.63720621C>T', 'g.63720639C>G'] + }) + lovd_fill_hg38(self.df) + expected_values = ['6-63720621-C-T', '6-63720639-C-G'] + self.assertIn('hg38_gnomad_format', self.df.columns, + "Column 'hg38_gnomad_format' should be added.") + self.assertListEqual(self.df['hg38_gnomad_format'].tolist(), expected_values) + + def test_fill_hg38_with_missing_values_nan(self): + """Test filling hg38 values when they are missing (NaN case).""" + self.df = pd.DataFrame({ + 'VariantOnGenome/DNA': ['g.64430518C>T'], + 'VariantOnGenome/DNA/hg38': [pd.NA] + }) + lovd_fill_hg38(self.df) + expected_values = ['6-63720622-C-T'] + self.assertIn('hg38_gnomad_format', self.df.columns) + self.assertListEqual(self.df['hg38_gnomad_format'].tolist(), expected_values) + + def test_fill_hg38_with_missing_values_empty_string(self): + """Test filling hg38 values when they are missing (empty string case).""" + self.df = pd.DataFrame({ + 'VariantOnGenome/DNA': ['g.64430518C>T'], + 'VariantOnGenome/DNA/hg38': [""] + }) + lovd_fill_hg38(self.df) + expected_values = ['6-63720622-C-T'] + self.assertIn('hg38_gnomad_format', self.df.columns) + self.assertListEqual(self.df['hg38_gnomad_format'].tolist(), expected_values) + + def test_fill_hg38_with_dup_cases(self): + """Test filling hg38 values when they have 'dup' postfix.""" + self.df = pd.DataFrame({ + 'VariantOnGenome/DNA': ['g.64430518dup'], + 'VariantOnGenome/DNA/hg38': [""] + }) + lovd_fill_hg38(self.df) + expected_values = ['6-63720622-dup'] + self.assertIn('hg38_gnomad_format', self.df.columns) + self.assertListEqual(self.df['hg38_gnomad_format'].tolist(), expected_values) + + def test_fill_hg38_with_del_cases(self): + """Test filling hg38 values when they have 'del' postfix.""" + self.df = pd.DataFrame({ + 'VariantOnGenome/DNA': ['g.64430518del'], + 'VariantOnGenome/DNA/hg38': [""] + }) + lovd_fill_hg38(self.df) + expected_values = ['6-63720622-del'] + self.assertIn('hg38_gnomad_format', self.df.columns) + self.assertListEqual(self.df['hg38_gnomad_format'].tolist(), expected_values) + + def test_fill_hg38_with_interval_cases(self): + """Test filling hg38 values when they have intervals (e.g., 'del' range).""" + self.df = pd.DataFrame({ + 'VariantOnGenome/DNA': ['g.64430540_64430544del'], + 'VariantOnGenome/DNA/hg38': [""] + }) + lovd_fill_hg38(self.df) + expected_values = ["?"] + self.assertIn('hg38_gnomad_format', self.df.columns) + self.assertListEqual(self.df['hg38_gnomad_format'].tolist(), expected_values) + + def test_fill_hg38_no_variants(self): + """Test filling hg38 values when there are no variants in the dataframe.""" + self.df = pd.DataFrame(columns=['VariantOnGenome/DNA', 'VariantOnGenome/DNA/hg38']) + lovd_fill_hg38(self.df) + self.assertEqual(self.df.shape[0], 0, "Empty dataframe should not add rows.") + + def test_fill_hg38_NA_variants(self): + """Test filling hg38 values when there are pd. NA variants in the dataframe.""" + self.df = pd.DataFrame({ + 'VariantOnGenome/DNA': [pd.NA], + 'VariantOnGenome/DNA/hg38': [pd.NA] + }) + lovd_fill_hg38(self.df) + expected_values = ['?'] + self.assertIn('hg38_gnomad_format', self.df.columns, + "Column 'hg38_gnomad_format' should be added.") + self.assertListEqual(self.df['hg38_gnomad_format'].tolist(), expected_values) + + + +if __name__ == '__main__': + unittest.main()