From 6159adef505a88b59ebd3e68945e3c160d64145a Mon Sep 17 00:00:00 2001 From: Vladyslav Levchenko Date: Wed, 21 Aug 2024 20:31:00 +0300 Subject: [PATCH 1/3] merging gnomAD and lovd based on gnomAD ID and VariantOnGenome/hg38 --- api/data/refactoring.py | 47 +++++++++++++++++++++++++++++++ tests/pipeline.ipynb | 61 ++++++++++++++++++++++++++++++++++++++++- 2 files changed, 107 insertions(+), 1 deletion(-) diff --git a/api/data/refactoring.py b/api/data/refactoring.py index 6d1ce8c..bd98625 100644 --- a/api/data/refactoring.py +++ b/api/data/refactoring.py @@ -174,6 +174,53 @@ def from_clinvar_name_to_cdna_position(name): return name[start:end] +def create_gnomad_columns(gnomad): + """ + Create new columns 'chromosome', 'position', 'ref', 'alt', and 'HGVS g.' from 'gnomAD ID'. + + Parameters: + gnomad : pd.DataFrame + gnomAD dataframe. + + Returns: + pd.DataFrame + gnomAD dataframe with new columns. + """ + + gnomad[['chromosome', 'position', 'ref', 'alt']] = gnomad['gnomAD ID'].str.split('-', expand=True) + gnomad['hg38_gnomAD'] = 'g.' + gnomad['position'] + gnomad['ref'] + '>' + gnomad['alt'] + gnomad.drop(columns=['chromosome', 'position', 'ref', 'alt'], inplace=True) + + return gnomad + + +def merge_gnomad_lovd(lovd, gnomad): + """ + merge LOVD and gnomAD dataframes on genomic positions. + + parameters: + lovd : pd.DataFrame + LOVD dataframe. + gnomAD : pd.DataFrame + gnomAD dataframe. + + returns: + pd.DataFrame + merged dataframe with combined information from LOVD and gnomAD. + """ + + gnomad = create_gnomad_columns(gnomad) + + main_frame = pd.merge( + lovd, + gnomad, + how="outer", + left_on="VariantOnGenome/DNA/hg38", + right_on="hg38_gnomAD") + + return main_frame + + def save_lovd_as_vcf(data, save_to="./lovd.vcf"): """ Gets hg38 variants from LOVD and saves as VCF file. diff --git a/tests/pipeline.ipynb b/tests/pipeline.ipynb index 1a7898e..21b6860 100644 --- a/tests/pipeline.ipynb +++ b/tests/pipeline.ipynb @@ -3984,6 +3984,65 @@ "id": "4ba7fd02a60f5693", "execution_count": 1 }, + { + "cell_type": "code", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The file at ../data/lovd/lovd_data.txt already exists.\n" + ] + }, + { + "data": { + "text/plain": " id transcriptid effectid position_c_start \\\n0 170936 7329 90 -538 \n1 235579 7329 99 -332 \n2 235593 7329 99 1300 \n3 235595 7329 99 1300 \n4 235603 7329 99 6572 \n... ... ... ... ... \n12626 822052 7329 70 1767 \n12627 822775 7329 70 0 \n12628 822785 7329 70 0 \n12629 822816 7329 70 0 \n12630 867648 7329 70 0 \n\n position_c_start_intron position_c_end position_c_end_intron \\\n0 0 1599 1 \n1 -1 748 1 \n2 -1 1459 1 \n3 -1 1459 1 \n4 -1 6725 1 \n... ... ... ... \n12626 -1 2023 1 \n12627 0 0 0 \n12628 0 0 0 \n12629 0 0 0 \n12630 0 0 0 \n\n VariantOnTranscript/DNA VariantOnTranscript/RNA \\\n0 c.(?_-538)_(1599+1_1600-1)del r.0? \n1 c.(-333+1_-332-1)_(748+1_749-1)del r.? \n2 c.(1299+1_1300-1)_(1459+1_1460-1)del r.? \n3 c.(1299+1_1300-1)_(1459+1_1460-1)del r.(?) \n4 c.(6571+1_6572-1)_(6725+1_6726-1)del r.? \n... ... ... \n12626 c.(1766+1_1767-1)_(2023+1_2024-1)del r.spl \n12627 c.? r.(?) \n12628 c.? r.(?) \n12629 c.? r.(?) \n12630 c.? r.(?) \n\n VariantOnTranscript/Protein VariantOnTranscript/Exon \\\n0 p.0? _1_10i \n1 p.? 2i_4i \n2 p.? 8i_9i \n3 p.? 8i_9i \n4 p.(Ser2191Thrfs*14) 32i_33i \n... ... ... \n12626 p.(?) \n12627 p.(Tyr2555fs) \n12628 p.(Asp498fs) \n12629 p.(Gln3101fs) \n12630 p.? \n\n VariantOnGenome/DNA/hg38 gnomAD ID hg38_gnomAD \n0 \n1 \n2 \n3 \n4 \n... ... ... ... \n12626 g.? \n12627 g.? \n12628 g.? \n12629 g.? \n12630 g.? \n\n[12631 rows x 14 columns]", + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
idtranscriptideffectidposition_c_startposition_c_start_intronposition_c_endposition_c_end_intronVariantOnTranscript/DNAVariantOnTranscript/RNAVariantOnTranscript/ProteinVariantOnTranscript/ExonVariantOnGenome/DNA/hg38gnomAD IDhg38_gnomAD
0170936732990-538015991c.(?_-538)_(1599+1_1600-1)delr.0?p.0?_1_10i<NA><NA>
1235579732999-332-17481c.(-333+1_-332-1)_(748+1_749-1)delr.?p.?2i_4i<NA><NA>
22355937329991300-114591c.(1299+1_1300-1)_(1459+1_1460-1)delr.?p.?8i_9i<NA><NA>
32355957329991300-114591c.(1299+1_1300-1)_(1459+1_1460-1)delr.(?)p.?8i_9i<NA><NA>
42356037329996572-167251c.(6571+1_6572-1)_(6725+1_6726-1)delr.?p.(Ser2191Thrfs*14)32i_33i<NA><NA>
.............................................
126268220527329701767-120231c.(1766+1_1767-1)_(2023+1_2024-1)delr.splp.(?)g.?<NA><NA>
126278227757329700000c.?r.(?)p.(Tyr2555fs)g.?<NA><NA>
126288227857329700000c.?r.(?)p.(Asp498fs)g.?<NA><NA>
126298228167329700000c.?r.(?)p.(Gln3101fs)g.?<NA><NA>
126308676487329700000c.?r.(?)p.?g.?<NA><NA>
\n

12631 rows × 14 columns

\n
" + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from api.data.refactoring import merge_gnomad_lovd, parse_gnomad, set_gnomad_dtypes, filter_lovd_for_eys\n", + "import pandas as pd\n", + "from api import (store_database_for_eys_gene,\n", + " parse_lovd,\n", + " set_lovd_dtypes,\n", + " LOVD_PATH,\n", + " GNOMAD_PATH)\n", + "\n", + "store_database_for_eys_gene('lovd', False)\n", + "store_database_for_eys_gene('gnomad', False)\n", + "\n", + "lovd_data = parse_lovd(LOVD_PATH + \"/lovd_data.txt\")\n", + "gnomad_data = parse_gnomad(GNOMAD_PATH+'/gnomad_data.csv')\n", + "\n", + "set_lovd_dtypes(lovd_data)\n", + "set_gnomad_dtypes(gnomad_data)\n", + "\n", + "variants_on_genome = lovd_data[\"Variants_On_Genome\"].copy()\n", + "\n", + "lovd_data = pd.merge(lovd_data[\"Variants_On_Transcripts\"],\n", + " variants_on_genome[['id','VariantOnGenome/DNA/hg38']],\n", + " on='id',\n", + " how='left')\n", + "\n", + "gnomad_data = gnomad_data.copy()[[\"gnomAD ID\"]]\n", + "final_data = merge_gnomad_lovd(lovd_data, gnomad_data)\n", + "final_data" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-08-21T17:21:32.692786Z", + "start_time": "2024-08-21T17:21:24.356784Z" + } + }, + "id": "dd9b17623f26a07c", + "execution_count": 1 + }, { "cell_type": "code", "outputs": [], @@ -3991,7 +4050,7 @@ "metadata": { "collapsed": false }, - "id": "dd9b17623f26a07c" + "id": "1a3b6e41853817ca" } ], "metadata": { From 332e61c11bbd0dd6fec2e5d9513025b3a9ea5487 Mon Sep 17 00:00:00 2001 From: Vladyslav Levchenko Date: Wed, 21 Aug 2024 21:23:04 +0300 Subject: [PATCH 2/3] added postfix for gnomad columns --- api/data/refactoring.py | 16 +++++----------- tests/pipeline.ipynb | 14 +++++++------- 2 files changed, 12 insertions(+), 18 deletions(-) diff --git a/api/data/refactoring.py b/api/data/refactoring.py index bd98625..ede8bf3 100644 --- a/api/data/refactoring.py +++ b/api/data/refactoring.py @@ -174,25 +174,18 @@ def from_clinvar_name_to_cdna_position(name): return name[start:end] -def create_gnomad_columns(gnomad): +def add_g_position_to_gnomad(gnomad): """ - Create new columns 'chromosome', 'position', 'ref', 'alt', and 'HGVS g.' from 'gnomAD ID'. + Create new column 'hg38_gnomAD' from 'gnomAD ID' in the gnomAD dataframe. Parameters: gnomad : pd.DataFrame - gnomAD dataframe. - - Returns: - pd.DataFrame - gnomAD dataframe with new columns. + gnomAD dataframe. This function modifies it in-place. """ - gnomad[['chromosome', 'position', 'ref', 'alt']] = gnomad['gnomAD ID'].str.split('-', expand=True) gnomad['hg38_gnomAD'] = 'g.' + gnomad['position'] + gnomad['ref'] + '>' + gnomad['alt'] gnomad.drop(columns=['chromosome', 'position', 'ref', 'alt'], inplace=True) - return gnomad - def merge_gnomad_lovd(lovd, gnomad): """ @@ -209,7 +202,8 @@ def merge_gnomad_lovd(lovd, gnomad): merged dataframe with combined information from LOVD and gnomAD. """ - gnomad = create_gnomad_columns(gnomad) + add_g_position_to_gnomad(gnomad) + gnomad.columns = [col + '_gnomad' if col != 'hg38_gnomAD' else col for col in gnomad.columns] main_frame = pd.merge( lovd, diff --git a/tests/pipeline.ipynb b/tests/pipeline.ipynb index 21b6860..bfe78f4 100644 --- a/tests/pipeline.ipynb +++ b/tests/pipeline.ipynb @@ -3996,16 +3996,16 @@ }, { "data": { - "text/plain": " id transcriptid effectid position_c_start \\\n0 170936 7329 90 -538 \n1 235579 7329 99 -332 \n2 235593 7329 99 1300 \n3 235595 7329 99 1300 \n4 235603 7329 99 6572 \n... ... ... ... ... \n12626 822052 7329 70 1767 \n12627 822775 7329 70 0 \n12628 822785 7329 70 0 \n12629 822816 7329 70 0 \n12630 867648 7329 70 0 \n\n position_c_start_intron position_c_end position_c_end_intron \\\n0 0 1599 1 \n1 -1 748 1 \n2 -1 1459 1 \n3 -1 1459 1 \n4 -1 6725 1 \n... ... ... ... \n12626 -1 2023 1 \n12627 0 0 0 \n12628 0 0 0 \n12629 0 0 0 \n12630 0 0 0 \n\n VariantOnTranscript/DNA VariantOnTranscript/RNA \\\n0 c.(?_-538)_(1599+1_1600-1)del r.0? \n1 c.(-333+1_-332-1)_(748+1_749-1)del r.? \n2 c.(1299+1_1300-1)_(1459+1_1460-1)del r.? \n3 c.(1299+1_1300-1)_(1459+1_1460-1)del r.(?) \n4 c.(6571+1_6572-1)_(6725+1_6726-1)del r.? \n... ... ... \n12626 c.(1766+1_1767-1)_(2023+1_2024-1)del r.spl \n12627 c.? r.(?) \n12628 c.? r.(?) \n12629 c.? r.(?) \n12630 c.? r.(?) \n\n VariantOnTranscript/Protein VariantOnTranscript/Exon \\\n0 p.0? _1_10i \n1 p.? 2i_4i \n2 p.? 8i_9i \n3 p.? 8i_9i \n4 p.(Ser2191Thrfs*14) 32i_33i \n... ... ... \n12626 p.(?) \n12627 p.(Tyr2555fs) \n12628 p.(Asp498fs) \n12629 p.(Gln3101fs) \n12630 p.? \n\n VariantOnGenome/DNA/hg38 gnomAD ID hg38_gnomAD \n0 \n1 \n2 \n3 \n4 \n... ... ... ... \n12626 g.? \n12627 g.? \n12628 g.? \n12629 g.? \n12630 g.? \n\n[12631 rows x 14 columns]", - "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
idtranscriptideffectidposition_c_startposition_c_start_intronposition_c_endposition_c_end_intronVariantOnTranscript/DNAVariantOnTranscript/RNAVariantOnTranscript/ProteinVariantOnTranscript/ExonVariantOnGenome/DNA/hg38gnomAD IDhg38_gnomAD
0170936732990-538015991c.(?_-538)_(1599+1_1600-1)delr.0?p.0?_1_10i<NA><NA>
1235579732999-332-17481c.(-333+1_-332-1)_(748+1_749-1)delr.?p.?2i_4i<NA><NA>
22355937329991300-114591c.(1299+1_1300-1)_(1459+1_1460-1)delr.?p.?8i_9i<NA><NA>
32355957329991300-114591c.(1299+1_1300-1)_(1459+1_1460-1)delr.(?)p.?8i_9i<NA><NA>
42356037329996572-167251c.(6571+1_6572-1)_(6725+1_6726-1)delr.?p.(Ser2191Thrfs*14)32i_33i<NA><NA>
.............................................
126268220527329701767-120231c.(1766+1_1767-1)_(2023+1_2024-1)delr.splp.(?)g.?<NA><NA>
126278227757329700000c.?r.(?)p.(Tyr2555fs)g.?<NA><NA>
126288227857329700000c.?r.(?)p.(Asp498fs)g.?<NA><NA>
126298228167329700000c.?r.(?)p.(Gln3101fs)g.?<NA><NA>
126308676487329700000c.?r.(?)p.?g.?<NA><NA>
\n

12631 rows × 14 columns

\n
" + "text/plain": " id transcriptid effectid position_c_start \\\n0 170936 7329 90 -538 \n1 235579 7329 99 -332 \n2 235593 7329 99 1300 \n3 235595 7329 99 1300 \n4 235603 7329 99 6572 \n... ... ... ... ... \n13272 822052 7329 70 1767 \n13273 822775 7329 70 0 \n13274 822785 7329 70 0 \n13275 822816 7329 70 0 \n13276 867648 7329 70 0 \n\n position_c_start_intron position_c_end position_c_end_intron \\\n0 0 1599 1 \n1 -1 748 1 \n2 -1 1459 1 \n3 -1 1459 1 \n4 -1 6725 1 \n... ... ... ... \n13272 -1 2023 1 \n13273 0 0 0 \n13274 0 0 0 \n13275 0 0 0 \n13276 0 0 0 \n\n VariantOnTranscript/DNA VariantOnTranscript/RNA \\\n0 c.(?_-538)_(1599+1_1600-1)del r.0? \n1 c.(-333+1_-332-1)_(748+1_749-1)del r.? \n2 c.(1299+1_1300-1)_(1459+1_1460-1)del r.? \n3 c.(1299+1_1300-1)_(1459+1_1460-1)del r.(?) \n4 c.(6571+1_6572-1)_(6725+1_6726-1)del r.? \n... ... ... \n13272 c.(1766+1_1767-1)_(2023+1_2024-1)del r.spl \n13273 c.? r.(?) \n13274 c.? r.(?) \n13275 c.? r.(?) \n13276 c.? r.(?) \n\n VariantOnTranscript/Protein VariantOnTranscript/Exon \\\n0 p.0? _1_10i \n1 p.? 2i_4i \n2 p.? 8i_9i \n3 p.? 8i_9i \n4 p.(Ser2191Thrfs*14) 32i_33i \n... ... ... \n13272 p.(?) \n13273 p.(Tyr2555fs) \n13274 p.(Asp498fs) \n13275 p.(Gln3101fs) \n13276 p.? \n\n VariantOnGenome/DNA/hg38 gnomAD ID_gnomad hg38_gnomAD \n0 \n1 \n2 \n3 \n4 \n... ... ... ... \n13272 g.? \n13273 g.? \n13274 g.? \n13275 g.? \n13276 g.? \n\n[13277 rows x 14 columns]", + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
idtranscriptideffectidposition_c_startposition_c_start_intronposition_c_endposition_c_end_intronVariantOnTranscript/DNAVariantOnTranscript/RNAVariantOnTranscript/ProteinVariantOnTranscript/ExonVariantOnGenome/DNA/hg38gnomAD ID_gnomadhg38_gnomAD
0170936732990-538015991c.(?_-538)_(1599+1_1600-1)delr.0?p.0?_1_10i<NA><NA>
1235579732999-332-17481c.(-333+1_-332-1)_(748+1_749-1)delr.?p.?2i_4i<NA><NA>
22355937329991300-114591c.(1299+1_1300-1)_(1459+1_1460-1)delr.?p.?8i_9i<NA><NA>
32355957329991300-114591c.(1299+1_1300-1)_(1459+1_1460-1)delr.(?)p.?8i_9i<NA><NA>
42356037329996572-167251c.(6571+1_6572-1)_(6725+1_6726-1)delr.?p.(Ser2191Thrfs*14)32i_33i<NA><NA>
.............................................
132728220527329701767-120231c.(1766+1_1767-1)_(2023+1_2024-1)delr.splp.(?)g.?<NA><NA>
132738227757329700000c.?r.(?)p.(Tyr2555fs)g.?<NA><NA>
132748227857329700000c.?r.(?)p.(Asp498fs)g.?<NA><NA>
132758228167329700000c.?r.(?)p.(Gln3101fs)g.?<NA><NA>
132768676487329700000c.?r.(?)p.?g.?<NA><NA>
\n

13277 rows × 14 columns

\n
" }, - "execution_count": 1, + "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "from api.data.refactoring import merge_gnomad_lovd, parse_gnomad, set_gnomad_dtypes, filter_lovd_for_eys\n", + "from api.data.refactoring import merge_gnomad_lovd, parse_gnomad, set_gnomad_dtypes\n", "import pandas as pd\n", "from api import (store_database_for_eys_gene,\n", " parse_lovd,\n", @@ -4036,12 +4036,12 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-08-21T17:21:32.692786Z", - "start_time": "2024-08-21T17:21:24.356784Z" + "end_time": "2024-08-21T18:22:09.068809Z", + "start_time": "2024-08-21T18:21:48.966115Z" } }, "id": "dd9b17623f26a07c", - "execution_count": 1 + "execution_count": 2 }, { "cell_type": "code", From b7daca31bd2269c3833c5ceba67fda44cec0dd64 Mon Sep 17 00:00:00 2001 From: Vladyslav Levchenko Date: Wed, 21 Aug 2024 21:36:52 +0300 Subject: [PATCH 3/3] fixed postfix --- api/data/refactoring.py | 6 +++--- tests/pipeline.ipynb | 14 +++++++------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/api/data/refactoring.py b/api/data/refactoring.py index ede8bf3..5154c44 100644 --- a/api/data/refactoring.py +++ b/api/data/refactoring.py @@ -183,7 +183,7 @@ def add_g_position_to_gnomad(gnomad): gnomAD dataframe. This function modifies it in-place. """ gnomad[['chromosome', 'position', 'ref', 'alt']] = gnomad['gnomAD ID'].str.split('-', expand=True) - gnomad['hg38_gnomAD'] = 'g.' + gnomad['position'] + gnomad['ref'] + '>' + gnomad['alt'] + gnomad['hg38'] = 'g.' + gnomad['position'] + gnomad['ref'] + '>' + gnomad['alt'] gnomad.drop(columns=['chromosome', 'position', 'ref', 'alt'], inplace=True) @@ -203,14 +203,14 @@ def merge_gnomad_lovd(lovd, gnomad): """ add_g_position_to_gnomad(gnomad) - gnomad.columns = [col + '_gnomad' if col != 'hg38_gnomAD' else col for col in gnomad.columns] + gnomad.columns = [col + '_gnomad' for col in gnomad.columns] main_frame = pd.merge( lovd, gnomad, how="outer", left_on="VariantOnGenome/DNA/hg38", - right_on="hg38_gnomAD") + right_on="hg38_gnomad") return main_frame diff --git a/tests/pipeline.ipynb b/tests/pipeline.ipynb index bfe78f4..5e51efc 100644 --- a/tests/pipeline.ipynb +++ b/tests/pipeline.ipynb @@ -3996,10 +3996,10 @@ }, { "data": { - "text/plain": " id transcriptid effectid position_c_start \\\n0 170936 7329 90 -538 \n1 235579 7329 99 -332 \n2 235593 7329 99 1300 \n3 235595 7329 99 1300 \n4 235603 7329 99 6572 \n... ... ... ... ... \n13272 822052 7329 70 1767 \n13273 822775 7329 70 0 \n13274 822785 7329 70 0 \n13275 822816 7329 70 0 \n13276 867648 7329 70 0 \n\n position_c_start_intron position_c_end position_c_end_intron \\\n0 0 1599 1 \n1 -1 748 1 \n2 -1 1459 1 \n3 -1 1459 1 \n4 -1 6725 1 \n... ... ... ... \n13272 -1 2023 1 \n13273 0 0 0 \n13274 0 0 0 \n13275 0 0 0 \n13276 0 0 0 \n\n VariantOnTranscript/DNA VariantOnTranscript/RNA \\\n0 c.(?_-538)_(1599+1_1600-1)del r.0? \n1 c.(-333+1_-332-1)_(748+1_749-1)del r.? \n2 c.(1299+1_1300-1)_(1459+1_1460-1)del r.? \n3 c.(1299+1_1300-1)_(1459+1_1460-1)del r.(?) \n4 c.(6571+1_6572-1)_(6725+1_6726-1)del r.? \n... ... ... \n13272 c.(1766+1_1767-1)_(2023+1_2024-1)del r.spl \n13273 c.? r.(?) \n13274 c.? r.(?) \n13275 c.? r.(?) \n13276 c.? r.(?) \n\n VariantOnTranscript/Protein VariantOnTranscript/Exon \\\n0 p.0? _1_10i \n1 p.? 2i_4i \n2 p.? 8i_9i \n3 p.? 8i_9i \n4 p.(Ser2191Thrfs*14) 32i_33i \n... ... ... \n13272 p.(?) \n13273 p.(Tyr2555fs) \n13274 p.(Asp498fs) \n13275 p.(Gln3101fs) \n13276 p.? \n\n VariantOnGenome/DNA/hg38 gnomAD ID_gnomad hg38_gnomAD \n0 \n1 \n2 \n3 \n4 \n... ... ... ... \n13272 g.? \n13273 g.? \n13274 g.? \n13275 g.? \n13276 g.? \n\n[13277 rows x 14 columns]", - "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
idtranscriptideffectidposition_c_startposition_c_start_intronposition_c_endposition_c_end_intronVariantOnTranscript/DNAVariantOnTranscript/RNAVariantOnTranscript/ProteinVariantOnTranscript/ExonVariantOnGenome/DNA/hg38gnomAD ID_gnomadhg38_gnomAD
0170936732990-538015991c.(?_-538)_(1599+1_1600-1)delr.0?p.0?_1_10i<NA><NA>
1235579732999-332-17481c.(-333+1_-332-1)_(748+1_749-1)delr.?p.?2i_4i<NA><NA>
22355937329991300-114591c.(1299+1_1300-1)_(1459+1_1460-1)delr.?p.?8i_9i<NA><NA>
32355957329991300-114591c.(1299+1_1300-1)_(1459+1_1460-1)delr.(?)p.?8i_9i<NA><NA>
42356037329996572-167251c.(6571+1_6572-1)_(6725+1_6726-1)delr.?p.(Ser2191Thrfs*14)32i_33i<NA><NA>
.............................................
132728220527329701767-120231c.(1766+1_1767-1)_(2023+1_2024-1)delr.splp.(?)g.?<NA><NA>
132738227757329700000c.?r.(?)p.(Tyr2555fs)g.?<NA><NA>
132748227857329700000c.?r.(?)p.(Asp498fs)g.?<NA><NA>
132758228167329700000c.?r.(?)p.(Gln3101fs)g.?<NA><NA>
132768676487329700000c.?r.(?)p.?g.?<NA><NA>
\n

13277 rows × 14 columns

\n
" + "text/plain": " id transcriptid effectid position_c_start \\\n0 170936 7329 90 -538 \n1 235579 7329 99 -332 \n2 235593 7329 99 1300 \n3 235595 7329 99 1300 \n4 235603 7329 99 6572 \n... ... ... ... ... \n13272 822052 7329 70 1767 \n13273 822775 7329 70 0 \n13274 822785 7329 70 0 \n13275 822816 7329 70 0 \n13276 867648 7329 70 0 \n\n position_c_start_intron position_c_end position_c_end_intron \\\n0 0 1599 1 \n1 -1 748 1 \n2 -1 1459 1 \n3 -1 1459 1 \n4 -1 6725 1 \n... ... ... ... \n13272 -1 2023 1 \n13273 0 0 0 \n13274 0 0 0 \n13275 0 0 0 \n13276 0 0 0 \n\n VariantOnTranscript/DNA VariantOnTranscript/RNA \\\n0 c.(?_-538)_(1599+1_1600-1)del r.0? \n1 c.(-333+1_-332-1)_(748+1_749-1)del r.? \n2 c.(1299+1_1300-1)_(1459+1_1460-1)del r.? \n3 c.(1299+1_1300-1)_(1459+1_1460-1)del r.(?) \n4 c.(6571+1_6572-1)_(6725+1_6726-1)del r.? \n... ... ... \n13272 c.(1766+1_1767-1)_(2023+1_2024-1)del r.spl \n13273 c.? r.(?) \n13274 c.? r.(?) \n13275 c.? r.(?) \n13276 c.? r.(?) \n\n VariantOnTranscript/Protein VariantOnTranscript/Exon \\\n0 p.0? _1_10i \n1 p.? 2i_4i \n2 p.? 8i_9i \n3 p.? 8i_9i \n4 p.(Ser2191Thrfs*14) 32i_33i \n... ... ... \n13272 p.(?) \n13273 p.(Tyr2555fs) \n13274 p.(Asp498fs) \n13275 p.(Gln3101fs) \n13276 p.? \n\n VariantOnGenome/DNA/hg38 gnomAD ID_gnomad hg38_gnomad \n0 \n1 \n2 \n3 \n4 \n... ... ... ... \n13272 g.? \n13273 g.? \n13274 g.? \n13275 g.? \n13276 g.? \n\n[13277 rows x 14 columns]", + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
idtranscriptideffectidposition_c_startposition_c_start_intronposition_c_endposition_c_end_intronVariantOnTranscript/DNAVariantOnTranscript/RNAVariantOnTranscript/ProteinVariantOnTranscript/ExonVariantOnGenome/DNA/hg38gnomAD ID_gnomadhg38_gnomad
0170936732990-538015991c.(?_-538)_(1599+1_1600-1)delr.0?p.0?_1_10i<NA><NA>
1235579732999-332-17481c.(-333+1_-332-1)_(748+1_749-1)delr.?p.?2i_4i<NA><NA>
22355937329991300-114591c.(1299+1_1300-1)_(1459+1_1460-1)delr.?p.?8i_9i<NA><NA>
32355957329991300-114591c.(1299+1_1300-1)_(1459+1_1460-1)delr.(?)p.?8i_9i<NA><NA>
42356037329996572-167251c.(6571+1_6572-1)_(6725+1_6726-1)delr.?p.(Ser2191Thrfs*14)32i_33i<NA><NA>
.............................................
132728220527329701767-120231c.(1766+1_1767-1)_(2023+1_2024-1)delr.splp.(?)g.?<NA><NA>
132738227757329700000c.?r.(?)p.(Tyr2555fs)g.?<NA><NA>
132748227857329700000c.?r.(?)p.(Asp498fs)g.?<NA><NA>
132758228167329700000c.?r.(?)p.(Gln3101fs)g.?<NA><NA>
132768676487329700000c.?r.(?)p.?g.?<NA><NA>
\n

13277 rows × 14 columns

\n
" }, - "execution_count": 2, + "execution_count": 1, "metadata": {}, "output_type": "execute_result" } @@ -4029,19 +4029,19 @@ " on='id',\n", " how='left')\n", "\n", - "gnomad_data = gnomad_data.copy()[[\"gnomAD ID\"]]\n", + "gnomad_data = gnomad_data.copy()\n", "final_data = merge_gnomad_lovd(lovd_data, gnomad_data)\n", "final_data" ], "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-08-21T18:22:09.068809Z", - "start_time": "2024-08-21T18:21:48.966115Z" + "end_time": "2024-08-21T18:35:42.249375Z", + "start_time": "2024-08-21T18:35:33.312752Z" } }, "id": "dd9b17623f26a07c", - "execution_count": 2 + "execution_count": 1 }, { "cell_type": "code",