diff --git a/tests/pipeline.ipynb b/tests/pipeline.ipynb index 06705ed..6cdacd5 100644 --- a/tests/pipeline.ipynb +++ b/tests/pipeline.ipynb @@ -3986,24 +3986,6 @@ }, { "cell_type": "code", - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The file at ../data/lovd/lovd_data.txt already exists.\n" - ] - }, - { - "data": { - "text/plain": " id transcriptid effectid position_c_start \\\n0 822823 7329 70 632 \n1 822787 7329 70 8391 \n2 822843 7329 70 5608 \n3 822771 7329 70 8206 \n4 \n... ... ... ... ... \n13218 959060 7329 70 9383 \n13219 959064 7329 50 0 \n13220 985494 7329 70 2137 \n13221 986425 7329 90 4361 \n13222 987322 7329 90 9299 \n\n position_c_start_intron position_c_end position_c_end_intron \\\n0 0 632 0 \n1 0 8391 0 \n2 0 5608 0 \n3 0 8206 0 \n4 \n... ... ... ... \n13218 0 9387 0 \n13219 0 0 0 \n13220 20590 3444 -29847 \n13221 0 4362 0 \n13222 0 9302 0 \n\n VariantOnTranscript/DNA VariantOnTranscript/RNA \\\n0 c.632G>A r.(?) \n1 c.8391del r.(?) \n2 c.5608C>T r.(?) \n3 c.8206G>C r.(?) \n4 \n... ... ... \n13218 c.9383_9387del r.(?) \n13219 c.-538_862+10652{1}inv r.? \n13220 c.2137+20590_3444-29847del r.? \n13221 c.4361_4362delinsAG r.(?) \n13222 c.9299_9302del r.(?) \n\n VariantOnTranscript/Protein ... Homozygote Count Amish_gnomad \\\n0 p.(Cys211Tyr) ... \n1 p.(Gly2799Valfs*31) ... \n2 p.(Arg1870Trp) ... \n3 p.(Ala2736Pro) ... \n4 ... 0 \n... ... ... ... \n13218 p.(Lys3128ArgfsTer7) ... \n13219 p.? ... \n13220 p.(Val713AspfsTer14) ... \n13221 p.(Ser1454Ter) ... \n13222 p.(Thr3100LysfsTer26) ... \n\n Hemizygote Count Amish_gnomad Allele Count South Asian_gnomad \\\n0 \n1 \n2 \n3 \n4 0 0 \n... ... ... \n13218 \n13219 \n13220 \n13221 \n13222 \n\n Allele Number South Asian_gnomad Homozygote Count South Asian_gnomad \\\n0 \n1 \n2 \n3 \n4 55362 0 \n... ... ... \n13218 \n13219 \n13220 \n13221 \n13222 \n\n Hemizygote Count South Asian_gnomad Allele Count Remaining_gnomad \\\n0 \n1 \n2 \n3 \n4 0 0 \n... ... ... 
\n13218 \n13219 \n13220 \n13221 \n13222 \n\n Allele Number Remaining_gnomad Homozygote Count Remaining_gnomad \\\n0 \n1 \n2 \n3 \n4 44082 0 \n... ... ... \n13218 \n13219 \n13220 \n13221 \n13222 \n\n Hemizygote Count Remaining_gnomad \n0 \n1 \n2 \n3 \n4 0 \n... ... \n13218 \n13219 \n13220 \n13221 \n13222 \n\n[13223 rows x 86 columns]", - "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
idtranscriptideffectidposition_c_startposition_c_start_intronposition_c_endposition_c_end_intronVariantOnTranscript/DNAVariantOnTranscript/RNAVariantOnTranscript/Protein...Homozygote Count Amish_gnomadHemizygote Count Amish_gnomadAllele Count South Asian_gnomadAllele Number South Asian_gnomadHomozygote Count South Asian_gnomadHemizygote Count South Asian_gnomadAllele Count Remaining_gnomadAllele Number Remaining_gnomadHomozygote Count Remaining_gnomadHemizygote Count Remaining_gnomad
082282373297063206320c.632G>Ar.(?)p.(Cys211Tyr)...<NA><NA><NA><NA><NA><NA><NA><NA><NA><NA>
18227877329708391083910c.8391delr.(?)p.(Gly2799Valfs*31)...<NA><NA><NA><NA><NA><NA><NA><NA><NA><NA>
28228437329705608056080c.5608C>Tr.(?)p.(Arg1870Trp)...<NA><NA><NA><NA><NA><NA><NA><NA><NA><NA>
38227717329708206082060c.8206G>Cr.(?)p.(Ala2736Pro)...<NA><NA><NA><NA><NA><NA><NA><NA><NA><NA>
4<NA><NA><NA><NA><NA><NA><NA><NA><NA><NA>...000553620004408200
..................................................................
132189590607329709383093870c.9383_9387delr.(?)p.(Lys3128ArgfsTer7)...<NA><NA><NA><NA><NA><NA><NA><NA><NA><NA>
132199590647329500000c.-538_862+10652{1}invr.?p.?...<NA><NA><NA><NA><NA><NA><NA><NA><NA><NA>
132209854947329702137205903444-29847c.2137+20590_3444-29847delr.?p.(Val713AspfsTer14)...<NA><NA><NA><NA><NA><NA><NA><NA><NA><NA>
132219864257329904361043620c.4361_4362delinsAGr.(?)p.(Ser1454Ter)...<NA><NA><NA><NA><NA><NA><NA><NA><NA><NA>
132229873227329909299093020c.9299_9302delr.(?)p.(Thr3100LysfsTer26)...<NA><NA><NA><NA><NA><NA><NA><NA><NA><NA>
\n

13223 rows × 86 columns

\n
" - }, - "execution_count": 1, - "metadata": {}, - "output_type": "execute_result" - } - ], "source": [ "from api.data.refactoring import merge_gnomad_lovd, parse_gnomad, set_gnomad_dtypes\n", "import pandas as pd\n", @@ -4036,11 +4018,459 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-08-31T16:18:56.746641Z", - "start_time": "2024-08-31T16:18:47.798219Z" + "end_time": "2024-09-02T18:03:53.205784Z", + "start_time": "2024-09-02T18:03:43.273240Z" } }, "id": "dd9b17623f26a07c", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The file at ../data/lovd/lovd_data.txt already exists.\n" + ] + }, + { + "data": { + "text/plain": [ + " id transcriptid effectid position_c_start \\\n", + "0 822823 7329 70 632 \n", + "1 822787 7329 70 8391 \n", + "2 822843 7329 70 5608 \n", + "3 822771 7329 70 8206 \n", + "4 \n", + "... ... ... ... ... \n", + "13218 959060 7329 70 9383 \n", + "13219 959064 7329 50 0 \n", + "13220 985494 7329 70 2137 \n", + "13221 986425 7329 90 4361 \n", + "13222 987322 7329 90 9299 \n", + "\n", + " position_c_start_intron position_c_end position_c_end_intron \\\n", + "0 0 632 0 \n", + "1 0 8391 0 \n", + "2 0 5608 0 \n", + "3 0 8206 0 \n", + "4 \n", + "... ... ... ... \n", + "13218 0 9387 0 \n", + "13219 0 0 0 \n", + "13220 20590 3444 -29847 \n", + "13221 0 4362 0 \n", + "13222 0 9302 0 \n", + "\n", + " VariantOnTranscript/DNA VariantOnTranscript/RNA \\\n", + "0 c.632G>A r.(?) \n", + "1 c.8391del r.(?) \n", + "2 c.5608C>T r.(?) \n", + "3 c.8206G>C r.(?) \n", + "4 \n", + "... ... ... \n", + "13218 c.9383_9387del r.(?) \n", + "13219 c.-538_862+10652{1}inv r.? \n", + "13220 c.2137+20590_3444-29847del r.? \n", + "13221 c.4361_4362delinsAG r.(?) \n", + "13222 c.9299_9302del r.(?) \n", + "\n", + " VariantOnTranscript/Protein ... Homozygote Count Amish_gnomad \\\n", + "0 p.(Cys211Tyr) ... \n", + "1 p.(Gly2799Valfs*31) ... \n", + "2 p.(Arg1870Trp) ... \n", + "3 p.(Ala2736Pro) ... \n", + "4 ... 0 \n", + "... ... 
... ... \n", + "13218 p.(Lys3128ArgfsTer7) ... \n", + "13219 p.? ... \n", + "13220 p.(Val713AspfsTer14) ... \n", + "13221 p.(Ser1454Ter) ... \n", + "13222 p.(Thr3100LysfsTer26) ... \n", + "\n", + " Hemizygote Count Amish_gnomad Allele Count South Asian_gnomad \\\n", + "0 \n", + "1 \n", + "2 \n", + "3 \n", + "4 0 0 \n", + "... ... ... \n", + "13218 \n", + "13219 \n", + "13220 \n", + "13221 \n", + "13222 \n", + "\n", + " Allele Number South Asian_gnomad Homozygote Count South Asian_gnomad \\\n", + "0 \n", + "1 \n", + "2 \n", + "3 \n", + "4 55362 0 \n", + "... ... ... \n", + "13218 \n", + "13219 \n", + "13220 \n", + "13221 \n", + "13222 \n", + "\n", + " Hemizygote Count South Asian_gnomad Allele Count Remaining_gnomad \\\n", + "0 \n", + "1 \n", + "2 \n", + "3 \n", + "4 0 0 \n", + "... ... ... \n", + "13218 \n", + "13219 \n", + "13220 \n", + "13221 \n", + "13222 \n", + "\n", + " Allele Number Remaining_gnomad Homozygote Count Remaining_gnomad \\\n", + "0 \n", + "1 \n", + "2 \n", + "3 \n", + "4 44082 0 \n", + "... ... ... \n", + "13218 \n", + "13219 \n", + "13220 \n", + "13221 \n", + "13222 \n", + "\n", + " Hemizygote Count Remaining_gnomad \n", + "0 \n", + "1 \n", + "2 \n", + "3 \n", + "4 0 \n", + "... ... \n", + "13218 \n", + "13219 \n", + "13220 \n", + "13221 \n", + "13222 \n", + "\n", + "[13223 rows x 86 columns]" + ], + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idtranscriptideffectidposition_c_startposition_c_start_intronposition_c_endposition_c_end_intronVariantOnTranscript/DNAVariantOnTranscript/RNAVariantOnTranscript/Protein...Homozygote Count Amish_gnomadHemizygote Count Amish_gnomadAllele Count South Asian_gnomadAllele Number South Asian_gnomadHomozygote Count South Asian_gnomadHemizygote Count South Asian_gnomadAllele Count Remaining_gnomadAllele Number Remaining_gnomadHomozygote Count Remaining_gnomadHemizygote Count Remaining_gnomad
082282373297063206320c.632G>Ar.(?)p.(Cys211Tyr)...<NA><NA><NA><NA><NA><NA><NA><NA><NA><NA>
18227877329708391083910c.8391delr.(?)p.(Gly2799Valfs*31)...<NA><NA><NA><NA><NA><NA><NA><NA><NA><NA>
28228437329705608056080c.5608C>Tr.(?)p.(Arg1870Trp)...<NA><NA><NA><NA><NA><NA><NA><NA><NA><NA>
38227717329708206082060c.8206G>Cr.(?)p.(Ala2736Pro)...<NA><NA><NA><NA><NA><NA><NA><NA><NA><NA>
4<NA><NA><NA><NA><NA><NA><NA><NA><NA><NA>...000553620004408200
..................................................................
132189590607329709383093870c.9383_9387delr.(?)p.(Lys3128ArgfsTer7)...<NA><NA><NA><NA><NA><NA><NA><NA><NA><NA>
132199590647329500000c.-538_862+10652{1}invr.?p.?...<NA><NA><NA><NA><NA><NA><NA><NA><NA><NA>
132209854947329702137205903444-29847c.2137+20590_3444-29847delr.?p.(Val713AspfsTer14)...<NA><NA><NA><NA><NA><NA><NA><NA><NA><NA>
132219864257329904361043620c.4361_4362delinsAGr.(?)p.(Ser1454Ter)...<NA><NA><NA><NA><NA><NA><NA><NA><NA><NA>
132229873227329909299093020c.9299_9302delr.(?)p.(Thr3100LysfsTer26)...<NA><NA><NA><NA><NA><NA><NA><NA><NA><NA>
\n", + "

13223 rows × 86 columns

\n", + "
" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], "execution_count": 1 }, { @@ -4050,7 +4480,8 @@ "metadata": { "collapsed": false }, - "id": "50b0e50e88fa0914" + "id": "50b0e50e88fa0914", + "execution_count": null } ], "metadata": { diff --git a/tests/test_lovd_fill_hg38.py b/tests/test_lovd_fill_hg38.py index 1b4b164..fa9db1d 100644 --- a/tests/test_lovd_fill_hg38.py +++ b/tests/test_lovd_fill_hg38.py @@ -82,6 +82,17 @@ def test_fill_hg38_no_variants(self): lovd_fill_hg38(self.df) self.assertEqual(self.df.shape[0], 0, "Empty dataframe should not add rows.") + def test_fill_hg38_NA_variants(self): + """Test filling hg38 values when there are pd.NA variants in the dataframe.""" + self.df = pd.DataFrame({ + 'VariantOnGenome/DNA': [pd.NA], + 'VariantOnGenome/DNA/hg38': [pd.NA] + }) + with self.assertRaises(TypeError) as context: + lovd_fill_hg38(self.df) + + self.assertEqual(str(context.exception), "Expected a string for 'variant', got NAType instead") + if __name__ == '__main__': unittest.main()