Skip to content

Commit

Permalink
Merge pull request #50 from Strexas/VLE/KBE-25/mut_pos_standartisation
Browse files Browse the repository at this point in the history
VLE/KBE-25/mut_pos_standartisation
  • Loading branch information
Akaud authored Sep 11, 2024
2 parents 43db108 + 555e2e7 commit 6d856b2
Show file tree
Hide file tree
Showing 6 changed files with 210 additions and 23 deletions.
1 change: 0 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@ __pycache__

# specific folders
data/*
tests/*

# large gene reference files
*.fa
103 changes: 86 additions & 17 deletions api/data/refactoring.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,15 @@

import os
import logging
import re

import requests

import pandas as pd
from pandas import DataFrame

from pyliftover import LiftOver

from .constants import LOVD_TABLES_DATA_TYPES, LOVD_PATH, GNOMAD_TABLES_DATA_TYPES, GNOMAD_PATH


Expand Down Expand Up @@ -177,45 +180,111 @@ def from_clinvar_name_to_cdna_position(name):
return name[start:end]


def add_g_position_to_gnomad(gnomad):
def lovd_fill_hg38(lovd: pd.DataFrame):
"""
Fills missing hg38 values in the LOVD dataframe
by converting hg19 values to hg38.
New column 'hg38_gnomad_format' is added to store
the converted positions in the format '6-position-ref-alt'.
:param lovd: pandas DataFrame containing following columns:
- 'VariantOnGenome/DNA': hg19 values.
- 'VariantOnGenome/DNA/hg38': hg38 values.
:return: None: Modifies the input DataFrame in-place by adding
'hg38_gnomad_format' column.
"""
Create new column 'hg38_gnomAD' from 'gnomAD ID' in the gnomAD dataframe.

Parameters:
gnomad : pd.DataFrame
gnomAD dataframe. This function modifies it in-place.
if lovd.empty:
return
lovd.loc[:,'hg38_gnomad_format'] = lovd.loc[:,'VariantOnGenome/DNA/hg38'].replace('', pd.NA)
missing_hg38_mask = lovd.loc[:,'hg38_gnomad_format'].isna()
lovd.loc[missing_hg38_mask, 'hg38_gnomad_format'] = lovd.loc[missing_hg38_mask, 'VariantOnGenome/DNA'].apply(
convert_hg19_if_missing)
lovd.loc[:,'hg38_gnomad_format'] = lovd.loc[:,'hg38_gnomad_format'].apply(convert_to_gnomad_gen)


def convert_hg19_if_missing(hg19: str, lo = LiftOver('hg19', 'hg38')):
"""
Converts hg19 variant to hg38 if hg38 is missing.
:param hg19: a row from the DataFrame.
:param lo: converter for genomic data between reference assemblies
:return: hg38 value or a conversion of the hg19 value in the format 'g.positionref>alt'.
"""

if pd.isna(hg19) or '_' in hg19:
return "?"

match = re.search(r'g\.(\d+)', hg19)
if not match:
return '?'

position_str = match.group(1)
new_pos = lo.convert_coordinate('chr6', int(position_str))[0][1]
return f"g.{new_pos}{hg19[-3:]}"



def convert_to_gnomad_gen(variant: str):
"""
gnomad[['chromosome', 'position', 'ref', 'alt']] = gnomad['gnomAD ID'].str.split('-', expand=True)
gnomad['hg38'] = 'g.' + gnomad['position'] + gnomad['ref'] + '>' + gnomad['alt']
gnomad.drop(columns=['chromosome', 'position', 'ref', 'alt'], inplace=True)
converts a variant string from hg38 format
to the format used by gnomAD ('6-position-ref-alt').
:param variant: str: the variant in the format 'g.startRef>Alt'.
:return: str: variant formatted as '6-position-ref-alt'
or '?' if the input contains interval ranges or is invalid.
"""

patterns = {
'dup': re.compile(r'^g\.(\d+)dup$'),
'del': re.compile(r'^g\.(\d+)del$'),
'ref_alt': re.compile(r'^g\.(\d+)([A-Z])>([A-Z])$')
}

match = patterns['dup'].match(variant)
if match:
position = match.group(1)
return f"6-{position}-dup"

match = patterns['del'].match(variant)
if match:
position = match.group(1)
return f"6-{position}-del"

match = patterns['ref_alt'].match(variant)
if match:
position = match.group(1)
ref = match.group(2)
alt = match.group(3)
return f"6-{position}-{ref}-{alt}"

return "?"


def merge_gnomad_lovd(lovd, gnomad):
"""
merge LOVD and gnomAD dataframes on genomic positions.
Merge LOVD and gnomAD dataframes on genomic positions.
parameters:
Parameters:
lovd : pd.DataFrame
LOVD dataframe.
gnomAD : pd.DataFrame
gnomAD dataframe.
returns:
Returns:
pd.DataFrame
merged dataframe with combined information from LOVD and gnomAD.
Merged dataframe with combined information from LOVD and gnomAD.
"""

add_g_position_to_gnomad(gnomad)
lovd_fill_hg38(lovd)
gnomad.columns = [col + '_gnomad' for col in gnomad.columns]

main_frame = pd.merge(
merged_frame = pd.merge(
lovd,
gnomad,
how="outer",
left_on="VariantOnGenome/DNA/hg38",
right_on="hg38_gnomad")
left_on="hg38_gnomad_format",
right_on="gnomAD ID_gnomad"
)

return main_frame
return merged_frame


def save_lovd_as_vcf(data, save_to="./lovd.vcf"):
Expand Down
2 changes: 1 addition & 1 deletion api/tools/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,4 @@

from .revel.revel import (
get_revel_scores
)
)
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,5 @@ pandas
selenium
spliceai
tensorflow
flask
flask
pyliftover
18 changes: 15 additions & 3 deletions tests/pipeline.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -238,10 +238,22 @@
{
"metadata": {},
"cell_type": "code",
"source": "",
"id": "6f0abfb50bd211a0",
"outputs": [],
"execution_count": null
"execution_count": null,
"source": [
"from api.tools import get_revel_scores\n",
"\n",
"chromosome = 6\n",
"position = 65655758\n",
"\n",
"lovd_data = pd.merge(lovd_data[\"Variants_On_Transcripts\"],\n",
" variants_on_genome[['id','VariantOnGenome/DNA','VariantOnGenome/DNA/hg38']],\n",
" on='id',\n",
" how='left')\n",
"\n",
"display(results)"
],
"id": "ba435cd29d565f7d"
}
],
"metadata": {
Expand Down
106 changes: 106 additions & 0 deletions tests/test_lovd_fill_hg38.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
"""
Module for testing the `lovd_fill_hg38` function from the `api.data.refactoring` module.
"""
import unittest
import pandas as pd
from api.data.refactoring import lovd_fill_hg38


class TestLOVDFillHg38(unittest.TestCase):
"""
Unit tests for the `lovd_fill_hg38` function.
"""

def setUp(self):
"""Set up any initial data used in multiple tests."""
self.df = pd.DataFrame()

def test_fill_hg38_with_no_missing_values(self):
"""Test filling hg38 values when no values are missing."""
self.df = pd.DataFrame({
'VariantOnGenome/DNA': ['g.64430517C>T', 'g.64430535C>G'],
'VariantOnGenome/DNA/hg38': ['g.63720621C>T', 'g.63720639C>G']
})
lovd_fill_hg38(self.df)
expected_values = ['6-63720621-C-T', '6-63720639-C-G']
self.assertIn('hg38_gnomad_format', self.df.columns,
"Column 'hg38_gnomad_format' should be added.")
self.assertListEqual(self.df['hg38_gnomad_format'].tolist(), expected_values)

def test_fill_hg38_with_missing_values_nan(self):
"""Test filling hg38 values when they are missing (NaN case)."""
self.df = pd.DataFrame({
'VariantOnGenome/DNA': ['g.64430518C>T'],
'VariantOnGenome/DNA/hg38': [pd.NA]
})
lovd_fill_hg38(self.df)
expected_values = ['6-63720622-C-T']
self.assertIn('hg38_gnomad_format', self.df.columns)
self.assertListEqual(self.df['hg38_gnomad_format'].tolist(), expected_values)

def test_fill_hg38_with_missing_values_empty_string(self):
"""Test filling hg38 values when they are missing (empty string case)."""
self.df = pd.DataFrame({
'VariantOnGenome/DNA': ['g.64430518C>T'],
'VariantOnGenome/DNA/hg38': [""]
})
lovd_fill_hg38(self.df)
expected_values = ['6-63720622-C-T']
self.assertIn('hg38_gnomad_format', self.df.columns)
self.assertListEqual(self.df['hg38_gnomad_format'].tolist(), expected_values)

def test_fill_hg38_with_dup_cases(self):
"""Test filling hg38 values when they have 'dup' postfix."""
self.df = pd.DataFrame({
'VariantOnGenome/DNA': ['g.64430518dup'],
'VariantOnGenome/DNA/hg38': [""]
})
lovd_fill_hg38(self.df)
expected_values = ['6-63720622-dup']
self.assertIn('hg38_gnomad_format', self.df.columns)
self.assertListEqual(self.df['hg38_gnomad_format'].tolist(), expected_values)

def test_fill_hg38_with_del_cases(self):
"""Test filling hg38 values when they have 'del' postfix."""
self.df = pd.DataFrame({
'VariantOnGenome/DNA': ['g.64430518del'],
'VariantOnGenome/DNA/hg38': [""]
})
lovd_fill_hg38(self.df)
expected_values = ['6-63720622-del']
self.assertIn('hg38_gnomad_format', self.df.columns)
self.assertListEqual(self.df['hg38_gnomad_format'].tolist(), expected_values)

def test_fill_hg38_with_interval_cases(self):
"""Test filling hg38 values when they have intervals (e.g., 'del' range)."""
self.df = pd.DataFrame({
'VariantOnGenome/DNA': ['g.64430540_64430544del'],
'VariantOnGenome/DNA/hg38': [""]
})
lovd_fill_hg38(self.df)
expected_values = ["?"]
self.assertIn('hg38_gnomad_format', self.df.columns)
self.assertListEqual(self.df['hg38_gnomad_format'].tolist(), expected_values)

def test_fill_hg38_no_variants(self):
"""Test filling hg38 values when there are no variants in the dataframe."""
self.df = pd.DataFrame(columns=['VariantOnGenome/DNA', 'VariantOnGenome/DNA/hg38'])
lovd_fill_hg38(self.df)
self.assertEqual(self.df.shape[0], 0, "Empty dataframe should not add rows.")

def test_fill_hg38_NA_variants(self):
"""Test filling hg38 values when there are pd. NA variants in the dataframe."""
self.df = pd.DataFrame({
'VariantOnGenome/DNA': [pd.NA],
'VariantOnGenome/DNA/hg38': [pd.NA]
})
lovd_fill_hg38(self.df)
expected_values = ['?']
self.assertIn('hg38_gnomad_format', self.df.columns,
"Column 'hg38_gnomad_format' should be added.")
self.assertListEqual(self.df['hg38_gnomad_format'].tolist(), expected_values)



if __name__ == '__main__':
unittest.main()

0 comments on commit 6d856b2

Please sign in to comment.