Skip to content

Commit

Permalink
Merge pull request #41 from VLE/KBE-15/merge_gnomad_lovd
Browse files Browse the repository at this point in the history
VLE/KBE-15/merge_gnomad_lovd
  • Loading branch information
Strexas authored Aug 21, 2024
2 parents 05ef057 + b7daca3 commit a3ffd99
Show file tree
Hide file tree
Showing 2 changed files with 101 additions and 1 deletion.
41 changes: 41 additions & 0 deletions api/data/refactoring.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,47 @@ def from_clinvar_name_to_cdna_position(name):
return name[start:end]


def add_g_position_to_gnomad(gnomad):
    """
    Create new column 'hg38' from 'gnomAD ID' in the gnomAD dataframe.

    Each 'gnomAD ID' is expected to consist of four '-'-separated fields:
    chromosome, position, ref and alt (e.g. '6-63720525-A-G'). The new
    value is 'g.' + position + ref + '>' + alt (e.g. 'g.63720525A>G').
    IDs that do not consist of exactly four fields produce a missing value
    in 'hg38'.

    Parameters:
    gnomad : pd.DataFrame
        gnomAD dataframe. This function modifies it in-place: only the
        'hg38' column is added (or overwritten); all other columns,
        including any already named 'chromosome', 'position', 'ref' or
        'alt', are left untouched.
    """
    # Parse into a separate frame instead of writing temporary columns into
    # `gnomad`, so same-named caller columns are never overwritten/dropped.
    # `str.extract` always yields one column per capture group, so an empty
    # dataframe is handled without a column-count mismatch.
    fields = gnomad['gnomAD ID'].str.extract(r'^[^-]*-([^-]*)-([^-]*)-([^-]*)$')
    position, ref, alt = fields[0], fields[1], fields[2]
    gnomad['hg38'] = 'g.' + position + ref + '>' + alt


def merge_gnomad_lovd(lovd, gnomad):
    """
    Merge LOVD and gnomAD dataframes on genomic positions.

    An 'hg38' column is derived from 'gnomAD ID' on a copy of the gnomAD
    data, every gnomAD column gets the suffix '_gnomad', and the result is
    outer-merged with LOVD by matching 'VariantOnGenome/DNA/hg38' against
    'hg38_gnomad'.

    Parameters:
    lovd : pd.DataFrame
        LOVD dataframe. Must contain 'VariantOnGenome/DNA/hg38'.
    gnomad : pd.DataFrame
        gnomAD dataframe. Must contain 'gnomAD ID'. It is not modified.

    Returns:
    pd.DataFrame
        Merged dataframe with combined information from LOVD and gnomAD.
    """
    # Work on a copy: the original renamed the caller's columns in place, so
    # a second call with the same frame failed on the missing 'gnomAD ID'.
    gnomad = gnomad.copy()

    add_g_position_to_gnomad(gnomad)
    gnomad.columns = [col + '_gnomad' for col in gnomad.columns]

    # NOTE(review): pandas matches null keys against each other in a merge;
    # confirm missing hg38 values on both sides should not be paired up.
    return pd.merge(
        lovd,
        gnomad,
        how="outer",
        left_on="VariantOnGenome/DNA/hg38",
        right_on="hg38_gnomad",
    )


def save_lovd_as_vcf(data, save_to="./lovd.vcf"):
"""
Gets hg38 variants from LOVD and saves as VCF file.
Expand Down
61 changes: 60 additions & 1 deletion tests/pipeline.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -3984,14 +3984,73 @@
"id": "4ba7fd02a60f5693",
"execution_count": 1
},
{
"cell_type": "code",
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The file at ../data/lovd/lovd_data.txt already exists.\n"
]
},
{
"data": {
"text/plain": " id transcriptid effectid position_c_start \\\n0 170936 7329 90 -538 \n1 235579 7329 99 -332 \n2 235593 7329 99 1300 \n3 235595 7329 99 1300 \n4 235603 7329 99 6572 \n... ... ... ... ... \n13272 822052 7329 70 1767 \n13273 822775 7329 70 0 \n13274 822785 7329 70 0 \n13275 822816 7329 70 0 \n13276 867648 7329 70 0 \n\n position_c_start_intron position_c_end position_c_end_intron \\\n0 0 1599 1 \n1 -1 748 1 \n2 -1 1459 1 \n3 -1 1459 1 \n4 -1 6725 1 \n... ... ... ... \n13272 -1 2023 1 \n13273 0 0 0 \n13274 0 0 0 \n13275 0 0 0 \n13276 0 0 0 \n\n VariantOnTranscript/DNA VariantOnTranscript/RNA \\\n0 c.(?_-538)_(1599+1_1600-1)del r.0? \n1 c.(-333+1_-332-1)_(748+1_749-1)del r.? \n2 c.(1299+1_1300-1)_(1459+1_1460-1)del r.? \n3 c.(1299+1_1300-1)_(1459+1_1460-1)del r.(?) \n4 c.(6571+1_6572-1)_(6725+1_6726-1)del r.? \n... ... ... \n13272 c.(1766+1_1767-1)_(2023+1_2024-1)del r.spl \n13273 c.? r.(?) \n13274 c.? r.(?) \n13275 c.? r.(?) \n13276 c.? r.(?) \n\n VariantOnTranscript/Protein VariantOnTranscript/Exon \\\n0 p.0? _1_10i \n1 p.? 2i_4i \n2 p.? 8i_9i \n3 p.? 8i_9i \n4 p.(Ser2191Thrfs*14) 32i_33i \n... ... ... \n13272 p.(?) \n13273 p.(Tyr2555fs) \n13274 p.(Asp498fs) \n13275 p.(Gln3101fs) \n13276 p.? \n\n VariantOnGenome/DNA/hg38 gnomAD ID_gnomad hg38_gnomad \n0 <NA> <NA> \n1 <NA> <NA> \n2 <NA> <NA> \n3 <NA> <NA> \n4 <NA> <NA> \n... ... ... ... \n13272 g.? <NA> <NA> \n13273 g.? <NA> <NA> \n13274 g.? <NA> <NA> \n13275 g.? <NA> <NA> \n13276 g.? <NA> <NA> \n\n[13277 rows x 14 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>id</th>\n <th>transcriptid</th>\n <th>effectid</th>\n <th>position_c_start</th>\n <th>position_c_start_intron</th>\n <th>position_c_end</th>\n <th>position_c_end_intron</th>\n <th>VariantOnTranscript/DNA</th>\n <th>VariantOnTranscript/RNA</th>\n <th>VariantOnTranscript/Protein</th>\n <th>VariantOnTranscript/Exon</th>\n <th>VariantOnGenome/DNA/hg38</th>\n <th>gnomAD ID_gnomad</th>\n <th>hg38_gnomad</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>170936</td>\n <td>7329</td>\n <td>90</td>\n <td>-538</td>\n <td>0</td>\n <td>1599</td>\n <td>1</td>\n <td>c.(?_-538)_(1599+1_1600-1)del</td>\n <td>r.0?</td>\n <td>p.0?</td>\n <td>_1_10i</td>\n <td></td>\n <td>&lt;NA&gt;</td>\n <td>&lt;NA&gt;</td>\n </tr>\n <tr>\n <th>1</th>\n <td>235579</td>\n <td>7329</td>\n <td>99</td>\n <td>-332</td>\n <td>-1</td>\n <td>748</td>\n <td>1</td>\n <td>c.(-333+1_-332-1)_(748+1_749-1)del</td>\n <td>r.?</td>\n <td>p.?</td>\n <td>2i_4i</td>\n <td></td>\n <td>&lt;NA&gt;</td>\n <td>&lt;NA&gt;</td>\n </tr>\n <tr>\n <th>2</th>\n <td>235593</td>\n <td>7329</td>\n <td>99</td>\n <td>1300</td>\n <td>-1</td>\n <td>1459</td>\n <td>1</td>\n <td>c.(1299+1_1300-1)_(1459+1_1460-1)del</td>\n <td>r.?</td>\n <td>p.?</td>\n <td>8i_9i</td>\n <td></td>\n <td>&lt;NA&gt;</td>\n <td>&lt;NA&gt;</td>\n </tr>\n <tr>\n <th>3</th>\n <td>235595</td>\n <td>7329</td>\n <td>99</td>\n <td>1300</td>\n <td>-1</td>\n <td>1459</td>\n <td>1</td>\n <td>c.(1299+1_1300-1)_(1459+1_1460-1)del</td>\n <td>r.(?)</td>\n <td>p.?</td>\n <td>8i_9i</td>\n <td></td>\n <td>&lt;NA&gt;</td>\n <td>&lt;NA&gt;</td>\n </tr>\n <tr>\n <th>4</th>\n <td>235603</td>\n <td>7329</td>\n <td>99</td>\n <td>6572</td>\n 
<td>-1</td>\n <td>6725</td>\n <td>1</td>\n <td>c.(6571+1_6572-1)_(6725+1_6726-1)del</td>\n <td>r.?</td>\n <td>p.(Ser2191Thrfs*14)</td>\n <td>32i_33i</td>\n <td></td>\n <td>&lt;NA&gt;</td>\n <td>&lt;NA&gt;</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>13272</th>\n <td>822052</td>\n <td>7329</td>\n <td>70</td>\n <td>1767</td>\n <td>-1</td>\n <td>2023</td>\n <td>1</td>\n <td>c.(1766+1_1767-1)_(2023+1_2024-1)del</td>\n <td>r.spl</td>\n <td>p.(?)</td>\n <td></td>\n <td>g.?</td>\n <td>&lt;NA&gt;</td>\n <td>&lt;NA&gt;</td>\n </tr>\n <tr>\n <th>13273</th>\n <td>822775</td>\n <td>7329</td>\n <td>70</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>c.?</td>\n <td>r.(?)</td>\n <td>p.(Tyr2555fs)</td>\n <td></td>\n <td>g.?</td>\n <td>&lt;NA&gt;</td>\n <td>&lt;NA&gt;</td>\n </tr>\n <tr>\n <th>13274</th>\n <td>822785</td>\n <td>7329</td>\n <td>70</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>c.?</td>\n <td>r.(?)</td>\n <td>p.(Asp498fs)</td>\n <td></td>\n <td>g.?</td>\n <td>&lt;NA&gt;</td>\n <td>&lt;NA&gt;</td>\n </tr>\n <tr>\n <th>13275</th>\n <td>822816</td>\n <td>7329</td>\n <td>70</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>c.?</td>\n <td>r.(?)</td>\n <td>p.(Gln3101fs)</td>\n <td></td>\n <td>g.?</td>\n <td>&lt;NA&gt;</td>\n <td>&lt;NA&gt;</td>\n </tr>\n <tr>\n <th>13276</th>\n <td>867648</td>\n <td>7329</td>\n <td>70</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>c.?</td>\n <td>r.(?)</td>\n <td>p.?</td>\n <td></td>\n <td>g.?</td>\n <td>&lt;NA&gt;</td>\n <td>&lt;NA&gt;</td>\n </tr>\n </tbody>\n</table>\n<p>13277 rows × 14 columns</p>\n</div>"
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from api.data.refactoring import merge_gnomad_lovd, parse_gnomad, set_gnomad_dtypes\n",
"import pandas as pd\n",
"from api import (store_database_for_eys_gene,\n",
" parse_lovd,\n",
" set_lovd_dtypes,\n",
" LOVD_PATH,\n",
" GNOMAD_PATH)\n",
"\n",
"store_database_for_eys_gene('lovd', False)\n",
"store_database_for_eys_gene('gnomad', False)\n",
"\n",
"lovd_data = parse_lovd(LOVD_PATH + \"/lovd_data.txt\")\n",
"gnomad_data = parse_gnomad(GNOMAD_PATH+'/gnomad_data.csv')\n",
"\n",
"set_lovd_dtypes(lovd_data)\n",
"set_gnomad_dtypes(gnomad_data)\n",
"\n",
"variants_on_genome = lovd_data[\"Variants_On_Genome\"].copy()\n",
"\n",
"lovd_data = pd.merge(lovd_data[\"Variants_On_Transcripts\"],\n",
" variants_on_genome[['id','VariantOnGenome/DNA/hg38']],\n",
" on='id',\n",
" how='left')\n",
"\n",
"gnomad_data = gnomad_data.copy()\n",
"final_data = merge_gnomad_lovd(lovd_data, gnomad_data)\n",
"final_data"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-08-21T18:35:42.249375Z",
"start_time": "2024-08-21T18:35:33.312752Z"
}
},
"id": "dd9b17623f26a07c",
"execution_count": 1
},
{
"cell_type": "code",
"outputs": [],
"source": [],
"metadata": {
"collapsed": false
},
"id": "dd9b17623f26a07c"
"id": "1a3b6e41853817ca"
}
],
"metadata": {
Expand Down

0 comments on commit a3ffd99

Please sign in to comment.