Skip to content

Commit

Permalink
Added pd.NA case in test_lovd_fill_hg38.py, added mask for values in…
Browse files Browse the repository at this point in the history
… lovd_fill_hg38
  • Loading branch information
Akaud committed Sep 3, 2024
1 parent e16bde3 commit 94156a9
Show file tree
Hide file tree
Showing 3 changed files with 25 additions and 9 deletions.
18 changes: 11 additions & 7 deletions api/data/refactoring.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,30 +192,34 @@ def lovd_fill_hg38(lovd: pd.DataFrame):

if lovd.empty:
return
lovd['hg38_gnomad_format'] = lovd['VariantOnGenome/DNA/hg38'].replace('', pd.NA)
missing_hg38_mask = lovd['hg38_gnomad_format'].isna()
lovd.loc[:,'hg38_gnomad_format'] = lovd.loc[:,'VariantOnGenome/DNA/hg38'].replace('', pd.NA)
missing_hg38_mask = lovd.loc[:,'hg38_gnomad_format'].isna()
lovd.loc[missing_hg38_mask, 'hg38_gnomad_format'] = lovd.loc[missing_hg38_mask, 'VariantOnGenome/DNA'].apply(
convert_hg19_if_missing)
lovd['hg38_gnomad_format'] = lovd['hg38_gnomad_format'].apply(convert_to_gnomad_gen)
lovd.loc[:,'hg38_gnomad_format'] = lovd.loc[:,'hg38_gnomad_format'].apply(convert_to_gnomad_gen)


def _default_hg19_to_hg38_liftover():
    """Return a shared hg19 -> hg38 LiftOver, building it on first use."""
    converter = getattr(_default_hg19_to_hg38_liftover, "_converter", None)
    if converter is None:
        # Built lazily so that importing this module does not construct the
        # converter as a side effect of evaluating a default argument.
        converter = LiftOver('hg19', 'hg38')
        _default_hg19_to_hg38_liftover._converter = converter
    return converter


def convert_hg19_if_missing(hg19: str, lo=None):
    """
    Converts an hg19 variant to hg38; used for rows whose hg38 value is missing.

    :param hg19: hg19 variant string such as 'g.12345A>G'; may also be
        pd.NA / None / NaN when the hg19 value is missing as well.
    :param lo: converter for genomic data between reference assemblies;
        when omitted, a shared LiftOver('hg19', 'hg38') is created on first use.
    :return: the converted variant in the format 'g.positionref>alt', or '?'
        when the input is missing, contains '_', has no 'g.<digits>' position,
        or the position cannot be lifted over.
    """
    # pd.NA, None and NaN are all non-strings, so one check covers every
    # "missing" marker as well as any other unexpected cell type.
    if not isinstance(hg19, str) or '_' in hg19:
        return '?'

    match = re.search(r'g\.(\d+)', hg19)
    if match is None:
        return '?'

    if lo is None:
        lo = _default_hg19_to_hg38_liftover()

    # pyliftover works with 0-based coordinates, HGVS 'g.' positions are
    # 1-based: shift down before converting and back up afterwards.
    hits = lo.convert_coordinate('chr6', int(match.group(1)) - 1)
    # None -> chromosome unknown to the chain file; [] -> locus has no
    # counterpart in the target assembly. Neither can be indexed.
    if not hits:
        return '?'

    new_pos = hits[0][1] + 1
    # The last three characters carry the 'ref>alt' part of the variant.
    return f"g.{new_pos}{hg19[-3:]}"



def convert_to_gnomad_gen(variant: str):
"""
converts a variant string from hg38 format
Expand Down
4 changes: 2 additions & 2 deletions tests/pipeline.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -4018,8 +4018,8 @@
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-09-03T08:10:37.655840Z",
"start_time": "2024-09-03T08:10:28.838482Z"
"end_time": "2024-09-03T14:19:14.730427Z",
"start_time": "2024-09-03T14:19:05.969159Z"
}
},
"id": "dd9b17623f26a07c",
Expand Down
12 changes: 12 additions & 0 deletions tests/test_lovd_fill_hg38.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,18 @@ def test_fill_hg38_no_variants(self):
lovd_fill_hg38(self.df)
self.assertEqual(self.df.shape[0], 0, "Empty dataframe should not add rows.")

def test_fill_hg38_NA_variants(self):
    """Rows whose hg19 and hg38 variants are both pd.NA must be filled with '?'."""
    na_frame = pd.DataFrame({
        'VariantOnGenome/DNA': [pd.NA],
        'VariantOnGenome/DNA/hg38': [pd.NA]
    })
    self.df = na_frame

    lovd_fill_hg38(self.df)

    # The helper works in place, so the new column is checked on self.df.
    self.assertIn('hg38_gnomad_format', self.df.columns,
                  "Column 'hg38_gnomad_format' should be added.")
    filled = self.df['hg38_gnomad_format'].tolist()
    self.assertListEqual(filled, ['?'])



if __name__ == '__main__':
Expand Down

0 comments on commit 94156a9

Please sign in to comment.