diff --git a/CHANGELOG.md b/CHANGELOG.md index d2fe368..56948d6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,24 @@ # Changelog +## 0.0.10 - 2022-10-11 + +### Added + + * FASTQ support ([#37]) + +### Changed + + * Repository now at https://github.com/ShawHahnLab/vquest ([#36]) + +### Fixed + + * Usage example now matches packaged structure ([#36]) + * Requests to the IMGT server now use HTTPS ([#34]) + +[#37]: https://github.com/ShawHahnLab/vquest/pull/37 +[#36]: https://github.com/ShawHahnLab/vquest/pull/36 +[#34]: https://github.com/ressy/vquest/pull/34 + ## 0.0.9 - 2021-07-20 ### Added diff --git a/README.md b/README.md index 1e33dfc..61c9cc0 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # Automate IMGT V-QUEST usage on imgt.org -[![vquest](https://circleci.com/gh/ressy/vquest.svg?style=shield)](https://circleci.com/gh/ressy/vquest) +[![vquest](https://circleci.com/gh/ShawHahnLab/vquest.svg?style=shield)](https://circleci.com/gh/ShawHahnLab/vquest) [IMGT](http://imgt.org)'s [V-QUEST](http://www.imgt.org/IMGT_vquest/analysis) is only available via a web interface. This Python package automates V-QUEST @@ -23,7 +23,7 @@ Here the aligned FASTA text is printed directly to standard output. 
Example Python usage: - >>> from vquest import * + >>> from vquest.vq import * >>> config = layer_configs(DEFAULTS, {"species": "rhesus-monkey", "receptorOrLocusType": "IG", "fileSequences": "seqs.fasta"}) >>> result = vquest(config) >>> result.keys() diff --git a/setup.py b/setup.py index fad8fea..b60b78d 100644 --- a/setup.py +++ b/setup.py @@ -12,7 +12,7 @@ description="Automate IMGT V-QUEST usage on imgt.org", long_description=long_description, long_description_content_type="text/markdown", - url="https://github.com/ressy/vquest", + url="https://github.com/shawhahnlab/vquest", packages=setuptools.find_packages(), package_data={"vquest": ["data/*"]}, entry_points={"console_scripts": [ diff --git a/test_vquest/data/test_vquest/TestVquestFasta/config.yml b/test_vquest/data/test_vquest/TestVquestFasta/config.yml new file mode 100644 index 0000000..3eaf7c4 --- /dev/null +++ b/test_vquest/data/test_vquest/TestVquestFasta/config.yml @@ -0,0 +1,2 @@ +species: rhesus-monkey +receptorOrLocusType: IG diff --git a/test_vquest/data/test_vquest/TestVquestFasta/config_inline.yml b/test_vquest/data/test_vquest/TestVquestFasta/config_inline.yml new file mode 100644 index 0000000..9f625d1 --- /dev/null +++ b/test_vquest/data/test_vquest/TestVquestFasta/config_inline.yml @@ -0,0 +1,10 @@ +species: rhesus-monkey +receptorOrLocusType: IG +resultType: excel +xv_outputtype: 3 +sequences: | + >IGKV2-ACR*02 + GACATTGTGATGACCCAGACTCCACTCTCCCTGCCCGTCACCCCTGGAGAGCCAGCCTCCATCTCCTGCAGGTCTAGTCA + GAGCCTCTTGGATAGTGACGGGTACACCTGTTTGGACTGGTACCTGCAGAAGCCAGGCCAGTCTCCACAGCTCCTGATCT + ATGAGGTTTCCAACCGGGTCTCTGGAGTCCCTGACAGGTTCAGTGGCAGTGGGTCAGNCACTGATTTCACACTGAAAATC + AGCCGGGTGGAAGCTGAGGATGTTGGGGTGTATTACTGTATGCAAAGTATAGAGTTTCCTCC diff --git a/test_vquest/data/test_vquest/TestVquestFasta/expected/Parameters.txt b/test_vquest/data/test_vquest/TestVquestFasta/expected/Parameters.txt new file mode 100644 index 0000000..2df2969 --- /dev/null +++ 
b/test_vquest/data/test_vquest/TestVquestFasta/expected/Parameters.txt @@ -0,0 +1,12 @@ +Date Tue Dec 01 22:08:11 CET 2020 +IMGT/V-QUEST program version 3.5.21 +IMGT/V-QUEST reference directory release 202049-2 +Species Macaca mulatta +Receptor type or locus IG +IMGT/V-QUEST reference directory set F+ORF+ in-frame P +Search for insertions and deletions no +Nb of nucleotides to add (or exclude) in 3' of the V-REGION for the evaluation of the alignment score 0 +Nb of nucleotides to exclude in 5' of the V-REGION for the evaluation of the nb of mutations 0 +Analysis of scFv no +Number of submitted sequences 1 + diff --git a/test_vquest/data/test_vquest/TestVquestFasta/expected/vquest_airr.tsv b/test_vquest/data/test_vquest/TestVquestFasta/expected/vquest_airr.tsv new file mode 100644 index 0000000..48c3345 --- /dev/null +++ b/test_vquest/data/test_vquest/TestVquestFasta/expected/vquest_airr.tsv @@ -0,0 +1,2 @@ +sequence_id sequence sequence_aa rev_comp productive complete_vdj vj_in_frame stop_codon locus v_call d_call j_call c_call sequence_alignment sequence_alignment_aa germline_alignment germline_alignment_aa junction junction_aa np1 np1_aa np2 np2_aa cdr1 cdr1_aa cdr2 cdr2_aa cdr3 cdr3_aa fwr1 fwr1_aa fwr2 fwr2_aa fwr3 fwr3_aa fwr4 fwr4_aa v_score v_identity v_support v_cigar d_score d_identity d_support d_cigar j_score j_identity j_support j_cigar c_score c_identity c_support c_cigar v_sequence_start v_sequence_end v_germline_start v_germline_end v_alignment_start v_alignment_end d_sequence_start d_sequence_end d_germline_start d_germline_end d_alignment_start d_alignment_end j_sequence_start j_sequence_end j_germline_start j_germline_end j_alignment_start j_alignment_end cdr1_start cdr1_end cdr2_start cdr2_end cdr3_start cdr3_end fwr1_start fwr1_end fwr2_start fwr2_end fwr3_start fwr3_end fwr4_start fwr4_end v_sequence_alignment v_sequence_alignment_aa d_sequence_alignment d_sequence_alignment_aa j_sequence_alignment j_sequence_alignment_aa c_sequence_alignment 
c_sequence_alignment_aa v_germline_alignment v_germline_alignment_aa d_germline_alignment d_germline_alignment_aa j_germline_alignment j_germline_alignment_aa c_germline_alignment c_germline_alignment_aa junction_length junction_aa_length np1_length np2_length n1_length n2_length p3v_length p5d_length p3d_length p5j_length consensus_count duplicate_count cell_id clone_id rearrangement_id repertoire_id rearrangement_set_id sequence_analysis_category d_number 5prime_trimmed_n_nb 3prime_trimmed_n_nb insertions deletions junction_decryption +IGKV2-ACR*02 gacattgtgatgacccagactccactctccctgcccgtcacccctggagagccagcctccatctcctgcaggtctagtcagagcctcttggatagtgacgggtacacctgtttggactggtacctgcagaagccaggccagtctccacagctcctgatctatgaggtttccaaccgggtctctggagtccctgacaggttcagtggcagtgggtcagncactgatttcacactgaaaatcagccgggtggaagctgaggatgttggggtgtattactgtatgcaaagtatagagtttcctcc F F IGK Macmul IGKV2S20*01 F gacattgtgatgacccagactccactctccctgcccgtcacccctggagagccagcctccatctcctgcaggtctagtcagagcctcttggatagt...gacgggtacacctgtttggactggtacctgcagaagccaggccagtctccacagctcctgatctatgaggtt.....................tccaaccgggtctctggagtccct...gacaggttcagtggcagtggg......tcagncactgatttcacactgaaaatcagccgggtggaagctgaggatgttggggtgtattactgtatgcaaagtatagagtttcctcc DIVMTQTPLSLPVTPGEPASISCRSSQSLLDS.DGYTCLDWYLQKPGQSPQLLIYEV.......SNRVSGVP.DRFSGSG..SXTDFTLKISRVEAEDVGVYYCMQSIEFP gatattgtgatgacccagactccactctccctgccagtcacccctggagagccggcctccatctcctgcaggtctagtcagagcctcttggatagtgaggatggaaacacctatttggaatggtacctgcagaagccaggccagtctccacagcccttgatttatgaggtt.....................tccaaccgggcctctggagtccca...gacaggttcagtggcagtggg......tcagacactgatttcacactgaaaatcagcagagtggaggctgaggatgttggggtttattactgcatgcaaggtatagagtatcctcc DIVMTQTPLSLPVTPGEPASISCRSSQSLLDSEDGNTYLEWYLQKPGQSPQPLIYEV.......SNRASGVP.DRFSGSG..SDTDFTLKISRVEAEDVGVYYCMQGIEYP cagagcctcttggatagtgacgggtacacctgt QSLLDSDGYTC gaggtttcc EVS atgcaaagtatagagtttcctcc MQSIEFP gacattgtgatgacccagactccactctccctgcccgtcacccctggagagccagcctccatctcctgcaggtctagt DIVMTQTPLSLPVTPGEPASISCRSS 
ttggactggtacctgcagaagccaggccagtctccacagctcctgatctat LDWYLQKPGQSPQLLIY aaccgggtctctggagtccctgacaggttcagtggcagtgggtcagncactgatttcacactgaaaatcagccgggtggaagctgaggatgttggggtgtattactgt NRVSGVPDRFSGSGSXTDFTLKISRVEAEDVGVYYC 1294 93.20 2=1X32=1X17=1X42=3D2=1X2=2X6=1X6=1X34=1X1=1X4=1X19=1X12=1X25=1M25=1X1=1X5=1X17=1X8=1X6=1X9=1X6= 1 302 1 335 1 335 79 111 163 171 280 302 1 78 112 162 172 279 gacattgtgatgacccagactccactctccctgcccgtcacccctggagagccagcctccatctcctgcaggtctagtcagagcctcttggatagt...gacgggtacacctgtttggactggtacctgcagaagccaggccagtctccacagctcctgatctatgaggtt.....................tccaaccgggtctctggagtccct...gacaggttcagtggcagtggg......tcagncactgatttcacactgaaaatcagccgggtggaagctgaggatgttggggtgtattactgtatgcaaagtatagagtttcctcc DIVMTQTPLSLPVTPGEPASISCRSSQSLLDS.DGYTCLDWYLQKPGQSPQLLIYEV.......SNRVSGVP.DRFSGSG..SXTDFTLKISRVEAEDVGVYYCMQSIEFP gatattgtgatgacccagactccactctccctgccagtcacccctggagagccggcctccatctcctgcaggtctagtcagagcctcttggatagtgaggatggaaacacctatttggaatggtacctgcagaagccaggccagtctccacagcccttgatttatgaggtt.....................tccaaccgggcctctggagtccca...gacaggttcagtggcagtggg......tcagacactgatttcacactgaaaatcagcagagtggaggctgaggatgttggggtttattactgcatgcaaggtatagagtatcctcc DIVMTQTPLSLPVTPGEPASISCRSSQSLLDSEDGNTYLEWYLQKPGQSPQPLIYEV.......SNRASGVP.DRFSGSG..SDTDFTLKISRVEAEDVGVYYCMQGIEYP 0 0 0 0 0 0 1 (noindelsearch) 0 0 0 diff --git a/test_vquest/data/test_vquest/TestVquestFasta/response.dat b/test_vquest/data/test_vquest/TestVquestFasta/response.dat new file mode 100644 index 0000000..c4ebadd Binary files /dev/null and b/test_vquest/data/test_vquest/TestVquestFasta/response.dat differ diff --git a/test_vquest/data/test_vquest/TestVquestFasta/seqs.fasta b/test_vquest/data/test_vquest/TestVquestFasta/seqs.fasta new file mode 100644 index 0000000..d1979fc --- /dev/null +++ b/test_vquest/data/test_vquest/TestVquestFasta/seqs.fasta @@ -0,0 +1,5 @@ +>IGKV2-ACR*02 +GACATTGTGATGACCCAGACTCCACTCTCCCTGCCCGTCACCCCTGGAGAGCCAGCCTCCATCTCCTGCAGGTCTAGTCA 
+GAGCCTCTTGGATAGTGACGGGTACACCTGTTTGGACTGGTACCTGCAGAAGCCAGGCCAGTCTCCACAGCTCCTGATCT +ATGAGGTTTCCAACCGGGTCTCTGGAGTCCCTGACAGGTTCAGTGGCAGTGGGTCAGNCACTGATTTCACACTGAAAATC +AGCCGGGTGGAAGCTGAGGATGTTGGGGTGTATTACTGTATGCAAAGTATAGAGTTTCCTCC diff --git a/test_vquest/data/test_vquest/TestVquestFastq/config.yml b/test_vquest/data/test_vquest/TestVquestFastq/config.yml new file mode 100644 index 0000000..3eaf7c4 --- /dev/null +++ b/test_vquest/data/test_vquest/TestVquestFastq/config.yml @@ -0,0 +1,2 @@ +species: rhesus-monkey +receptorOrLocusType: IG diff --git a/test_vquest/data/test_vquest/TestVquestFastq/config_inline.yml b/test_vquest/data/test_vquest/TestVquestFastq/config_inline.yml new file mode 100644 index 0000000..9f625d1 --- /dev/null +++ b/test_vquest/data/test_vquest/TestVquestFastq/config_inline.yml @@ -0,0 +1,10 @@ +species: rhesus-monkey +receptorOrLocusType: IG +resultType: excel +xv_outputtype: 3 +sequences: | + >IGKV2-ACR*02 + GACATTGTGATGACCCAGACTCCACTCTCCCTGCCCGTCACCCCTGGAGAGCCAGCCTCCATCTCCTGCAGGTCTAGTCA + GAGCCTCTTGGATAGTGACGGGTACACCTGTTTGGACTGGTACCTGCAGAAGCCAGGCCAGTCTCCACAGCTCCTGATCT + ATGAGGTTTCCAACCGGGTCTCTGGAGTCCCTGACAGGTTCAGTGGCAGTGGGTCAGNCACTGATTTCACACTGAAAATC + AGCCGGGTGGAAGCTGAGGATGTTGGGGTGTATTACTGTATGCAAAGTATAGAGTTTCCTCC diff --git a/test_vquest/data/test_vquest/TestVquestFastq/expected/Parameters.txt b/test_vquest/data/test_vquest/TestVquestFastq/expected/Parameters.txt new file mode 100644 index 0000000..2df2969 --- /dev/null +++ b/test_vquest/data/test_vquest/TestVquestFastq/expected/Parameters.txt @@ -0,0 +1,12 @@ +Date Tue Dec 01 22:08:11 CET 2020 +IMGT/V-QUEST program version 3.5.21 +IMGT/V-QUEST reference directory release 202049-2 +Species Macaca mulatta +Receptor type or locus IG +IMGT/V-QUEST reference directory set F+ORF+ in-frame P +Search for insertions and deletions no +Nb of nucleotides to add (or exclude) in 3' of the V-REGION for the evaluation of the alignment score 0 +Nb of nucleotides to exclude in 5' of the V-REGION for the 
evaluation of the nb of mutations 0 +Analysis of scFv no +Number of submitted sequences 1 + diff --git a/test_vquest/data/test_vquest/TestVquestFastq/expected/vquest_airr.tsv b/test_vquest/data/test_vquest/TestVquestFastq/expected/vquest_airr.tsv new file mode 100644 index 0000000..48c3345 --- /dev/null +++ b/test_vquest/data/test_vquest/TestVquestFastq/expected/vquest_airr.tsv @@ -0,0 +1,2 @@ +sequence_id sequence sequence_aa rev_comp productive complete_vdj vj_in_frame stop_codon locus v_call d_call j_call c_call sequence_alignment sequence_alignment_aa germline_alignment germline_alignment_aa junction junction_aa np1 np1_aa np2 np2_aa cdr1 cdr1_aa cdr2 cdr2_aa cdr3 cdr3_aa fwr1 fwr1_aa fwr2 fwr2_aa fwr3 fwr3_aa fwr4 fwr4_aa v_score v_identity v_support v_cigar d_score d_identity d_support d_cigar j_score j_identity j_support j_cigar c_score c_identity c_support c_cigar v_sequence_start v_sequence_end v_germline_start v_germline_end v_alignment_start v_alignment_end d_sequence_start d_sequence_end d_germline_start d_germline_end d_alignment_start d_alignment_end j_sequence_start j_sequence_end j_germline_start j_germline_end j_alignment_start j_alignment_end cdr1_start cdr1_end cdr2_start cdr2_end cdr3_start cdr3_end fwr1_start fwr1_end fwr2_start fwr2_end fwr3_start fwr3_end fwr4_start fwr4_end v_sequence_alignment v_sequence_alignment_aa d_sequence_alignment d_sequence_alignment_aa j_sequence_alignment j_sequence_alignment_aa c_sequence_alignment c_sequence_alignment_aa v_germline_alignment v_germline_alignment_aa d_germline_alignment d_germline_alignment_aa j_germline_alignment j_germline_alignment_aa c_germline_alignment c_germline_alignment_aa junction_length junction_aa_length np1_length np2_length n1_length n2_length p3v_length p5d_length p3d_length p5j_length consensus_count duplicate_count cell_id clone_id rearrangement_id repertoire_id rearrangement_set_id sequence_analysis_category d_number 5prime_trimmed_n_nb 3prime_trimmed_n_nb insertions deletions 
junction_decryption +IGKV2-ACR*02 gacattgtgatgacccagactccactctccctgcccgtcacccctggagagccagcctccatctcctgcaggtctagtcagagcctcttggatagtgacgggtacacctgtttggactggtacctgcagaagccaggccagtctccacagctcctgatctatgaggtttccaaccgggtctctggagtccctgacaggttcagtggcagtgggtcagncactgatttcacactgaaaatcagccgggtggaagctgaggatgttggggtgtattactgtatgcaaagtatagagtttcctcc F F IGK Macmul IGKV2S20*01 F gacattgtgatgacccagactccactctccctgcccgtcacccctggagagccagcctccatctcctgcaggtctagtcagagcctcttggatagt...gacgggtacacctgtttggactggtacctgcagaagccaggccagtctccacagctcctgatctatgaggtt.....................tccaaccgggtctctggagtccct...gacaggttcagtggcagtggg......tcagncactgatttcacactgaaaatcagccgggtggaagctgaggatgttggggtgtattactgtatgcaaagtatagagtttcctcc DIVMTQTPLSLPVTPGEPASISCRSSQSLLDS.DGYTCLDWYLQKPGQSPQLLIYEV.......SNRVSGVP.DRFSGSG..SXTDFTLKISRVEAEDVGVYYCMQSIEFP gatattgtgatgacccagactccactctccctgccagtcacccctggagagccggcctccatctcctgcaggtctagtcagagcctcttggatagtgaggatggaaacacctatttggaatggtacctgcagaagccaggccagtctccacagcccttgatttatgaggtt.....................tccaaccgggcctctggagtccca...gacaggttcagtggcagtggg......tcagacactgatttcacactgaaaatcagcagagtggaggctgaggatgttggggtttattactgcatgcaaggtatagagtatcctcc DIVMTQTPLSLPVTPGEPASISCRSSQSLLDSEDGNTYLEWYLQKPGQSPQPLIYEV.......SNRASGVP.DRFSGSG..SDTDFTLKISRVEAEDVGVYYCMQGIEYP cagagcctcttggatagtgacgggtacacctgt QSLLDSDGYTC gaggtttcc EVS atgcaaagtatagagtttcctcc MQSIEFP gacattgtgatgacccagactccactctccctgcccgtcacccctggagagccagcctccatctcctgcaggtctagt DIVMTQTPLSLPVTPGEPASISCRSS ttggactggtacctgcagaagccaggccagtctccacagctcctgatctat LDWYLQKPGQSPQLLIY aaccgggtctctggagtccctgacaggttcagtggcagtgggtcagncactgatttcacactgaaaatcagccgggtggaagctgaggatgttggggtgtattactgt NRVSGVPDRFSGSGSXTDFTLKISRVEAEDVGVYYC 1294 93.20 2=1X32=1X17=1X42=3D2=1X2=2X6=1X6=1X34=1X1=1X4=1X19=1X12=1X25=1M25=1X1=1X5=1X17=1X8=1X6=1X9=1X6= 1 302 1 335 1 335 79 111 163 171 280 302 1 78 112 162 172 279 
gacattgtgatgacccagactccactctccctgcccgtcacccctggagagccagcctccatctcctgcaggtctagtcagagcctcttggatagt...gacgggtacacctgtttggactggtacctgcagaagccaggccagtctccacagctcctgatctatgaggtt.....................tccaaccgggtctctggagtccct...gacaggttcagtggcagtggg......tcagncactgatttcacactgaaaatcagccgggtggaagctgaggatgttggggtgtattactgtatgcaaagtatagagtttcctcc DIVMTQTPLSLPVTPGEPASISCRSSQSLLDS.DGYTCLDWYLQKPGQSPQLLIYEV.......SNRVSGVP.DRFSGSG..SXTDFTLKISRVEAEDVGVYYCMQSIEFP gatattgtgatgacccagactccactctccctgccagtcacccctggagagccggcctccatctcctgcaggtctagtcagagcctcttggatagtgaggatggaaacacctatttggaatggtacctgcagaagccaggccagtctccacagcccttgatttatgaggtt.....................tccaaccgggcctctggagtccca...gacaggttcagtggcagtggg......tcagacactgatttcacactgaaaatcagcagagtggaggctgaggatgttggggtttattactgcatgcaaggtatagagtatcctcc DIVMTQTPLSLPVTPGEPASISCRSSQSLLDSEDGNTYLEWYLQKPGQSPQPLIYEV.......SNRASGVP.DRFSGSG..SDTDFTLKISRVEAEDVGVYYCMQGIEYP 0 0 0 0 0 0 1 (noindelsearch) 0 0 0 diff --git a/test_vquest/data/test_vquest/TestVquestFastq/response.dat b/test_vquest/data/test_vquest/TestVquestFastq/response.dat new file mode 100644 index 0000000..c4ebadd Binary files /dev/null and b/test_vquest/data/test_vquest/TestVquestFastq/response.dat differ diff --git a/test_vquest/data/test_vquest/TestVquestFastq/seqs.fastq b/test_vquest/data/test_vquest/TestVquestFastq/seqs.fastq new file mode 100644 index 0000000..81d9ba2 --- /dev/null +++ b/test_vquest/data/test_vquest/TestVquestFastq/seqs.fastqdiff --git a/test_vquest/test_vquest.py b/test_vquest/test_vquest.py index 89f460e..6d4e693 100644 --- a/test_vquest/test_vquest.py +++ b/test_vquest/test_vquest.py @@ -111,7 +111,7 @@ def test_vquest(self): self.assertEqual(self.post.call_count, 1) self.assertEqual( self.post.call_args.args, - ('http://www.imgt.org/IMGT_vquest/analysis', )) + ('https://www.imgt.org/IMGT_vquest/analysis', )) config_used = self.config.copy() # Whatever input type was given the actual type submitted to the form # will be "inline" to allow chunking of sequences if 
needed. The @@ -236,6 +236,114 @@ def test_vquest_main_alignment(self): self.check_missing_defaults_main(lambda: main(["--align"])) +class TestVquestFasta(TestVquestBase): + """File-based input with FASTA.""" + + def setUp(self): + super().setUp() + self.input_path = self.path/"seqs.fasta" + del self.config["sequences"] + self.config["fileSequences"] = self.input_path + + def test_vquest(self): + """Test that a basic request gives the expected response.""" + result = vquest(self.config) + # requests.post should have been called once, with this input. + self.assertEqual(self.post.call_count, 1) + self.assertEqual( + self.post.call_args.args, + ('https://www.imgt.org/IMGT_vquest/analysis', )) + config_used = self.config.copy() + # Whatever input type was given the actual type submitted to the form + # will be "inline" to allow chunking of sequences if needed. The + # sequences are also reformatted via Biopython when chunked. + config_used["inputType"] = "inline" + config_used["sequences"] = """>IGKV2-ACR*02 +GACATTGTGATGACCCAGACTCCACTCTCCCTGCCCGTCACCCCTGGAGAGCCAGCCTCC +ATCTCCTGCAGGTCTAGTCAGAGCCTCTTGGATAGTGACGGGTACACCTGTTTGGACTGG +TACCTGCAGAAGCCAGGCCAGTCTCCACAGCTCCTGATCTATGAGGTTTCCAACCGGGTC +TCTGGAGTCCCTGACAGGTTCAGTGGCAGTGGGTCAGNCACTGATTTCACACTGAAAATC +AGCCGGGTGGAAGCTGAGGATGTTGGGGTGTATTACTGTATGCAAAGTATAGAGTTTCCT +CC +""" + self.assertEqual( + self.post.call_args.kwargs, + {"data": config_used}) + self.assertEqual( + list(result.keys()), + ["Parameters.txt", "vquest_airr.tsv"]) + with open(self.path / "expected/Parameters.txt") as f_in: + parameters = f_in.read() + with open(self.path / "expected/vquest_airr.tsv") as f_in: + vquest_airr = f_in.read() + self.assertEqual(parameters, result["Parameters.txt"]) + self.assertEqual(vquest_airr, result["vquest_airr.tsv"]) + + def test_vquest_no_collapse(self): + """test_vquest but with vquest(..., collapse=False).""" + # Also try with collapse=False, for raw output + result = vquest(self.config, collapse=False) + 
self.assertEqual(self.post.call_count, 1) + self.assertEqual(len(result), 1) + self.assertEqual( + list(result[0].keys()), + ["Parameters.txt", "vquest_airr.tsv"]) + + def test_vquest_main(self): + """Test that the command-line interface gives the expected response.""" + with tempfile.TemporaryDirectory() as tempdir: + os.chdir(tempdir) + with open(self.path / "config.yml") as f_in, open("config.yml", "wt") as f_out: + f_out.write(f_in.read()) + f_out.write(f"fileSequences: {self.input_path}\n") + main(["config.yml"]) + self.assertTrue(Path("vquest_airr.tsv").exists()) + self.assertTrue(Path("Parameters.txt").exists()) + + def test_vquest_main_no_collapse(self): + """Test command-line interface with --no-collapse.""" + with tempfile.TemporaryDirectory() as tempdir: + os.chdir(tempdir) + with open(self.path / "config.yml") as f_in, open("config.yml", "wt") as f_out: + f_out.write(f_in.read()) + f_out.write(f"fileSequences: {self.input_path}\n") + main(["--no-collapse", "config.yml"]) + self.assertTrue(Path("001/vquest_airr.tsv").exists()) + self.assertTrue(Path("001/Parameters.txt").exists()) + + def test_vquest_main_alignment(self): + """Try using the --align feature. + + In this case the regular output files should not be created and instead + FASTA text should be written to stdout. 
+ """ + expected = """>IGKV2-ACR*02 +gacattgtgatgacccagactccactctccctgcccgtcacccctggagagccagcctccatctcctgcaggtctagtcagagcctcttggatagt...gacgggtacacctgtttggactggtacctgcagaagccaggccagtctccacagctcctgatctatgaggtt.....................tccaaccgggtctctggagtccct...gacaggttcagtggcagtggg......tcagncactgatttcacactgaaaatcagccgggtggaagctgaggatgttggggtgtattactgtatgcaaagtatagagtttcctcc +""" + out = StringIO() + err = StringIO() + with redirect_stdout(out), redirect_stderr(err): + with tempfile.TemporaryDirectory() as tempdir: + os.chdir(tempdir) + with open(self.path / "config.yml") as f_in, open("config.yml", "wt") as f_out: + f_out.write(f_in.read()) + f_out.write(f"fileSequences: {self.input_path}\n") + main(["config.yml", "--align"]) + self.assertFalse(Path("vquest_airr.tsv").exists()) + self.assertFalse(Path("Parameters.txt").exists()) + self.assertEqual(out.getvalue(), expected) + self.assertEqual(err.getvalue(), "") + + +class TestVquestFastq(TestVquestFasta): + """File-based input with FASTQ.""" + + def setUp(self): + super().setUp() + self.input_path = self.path/"seqs.fastq" + self.config["fileSequences"] = self.input_path + + class TestVquestCustom(TestVquestSimple): """Try changing one of the configuration options. @@ -274,7 +382,7 @@ def test_vquest(self): self.assertEqual(self.post.call_count, 1) self.assertEqual( self.post.call_args.args, - ('http://www.imgt.org/IMGT_vquest/analysis', )) + ('https://www.imgt.org/IMGT_vquest/analysis', )) def test_vquest_main(self): """Test that an html file with an error message is parsed correctly for cmd-line usage.""" diff --git a/vquest/__main__.py b/vquest/__main__.py index 02ea880..c10d908 100644 --- a/vquest/__main__.py +++ b/vquest/__main__.py @@ -6,12 +6,9 @@ import logging import argparse from pathlib import Path -import vquest +from vquest import __doc__ as main_doc from vquest import LOGGER -from . 
import request -from .config import DEFAULTS, OPTIONS, load_config, layer_configs -from .util import airr_to_fasta -from .version import __version__ +from vquest import vq def main(arglist=None): """Command-line interface for V-QUEST requests""" @@ -23,7 +20,7 @@ def main(arglist=None): args = parser.parse_args(arglist) LOGGER.setLevel(max(10, logging.WARNING - 10*args.verbose)) config_full = __setup_config(args, parser) - output = request.vquest(config_full, collapse=args.collapse) + output = vq.vquest(config_full, collapse=args.collapse) __process_output(args, output) LOGGER.info("Done.") @@ -32,7 +29,7 @@ def __setup_config(args, parser): # All the possible vquest options. They're grouped by section as the keys # to inner dictionaries. vquest_opts = [] - for opt_section in OPTIONS: + for opt_section in vq.OPTIONS: vquest_opts.extend(opt_section["options"].keys()) vquest_args = {k: v for k, v in args_set.items() if k in vquest_opts} # If no config file(s) and no vquest args were given, just print the help @@ -44,7 +41,7 @@ def __setup_config(args, parser): " ".join(["%s=%s" % (key, val) for key, val in args_set.items()])) # Overlay the default config, configs given as files, and then options # given as command line arguments - configs = [load_config(config) for config in args.config] + configs = [vq.load_config(config) for config in args.config] for filename, config in zip(args.config, configs): if config: msg = " ".join(["%s=%s" % (key, val) for key, val in config.items()]) @@ -54,7 +51,7 @@ def __setup_config(args, parser): configs = [config for config in configs if config] LOGGER.debug("overriding command-line options: %s", " ".join(["%s=%s" % (key, val) for key, val in vquest_args.items()])) - config_full = layer_configs(DEFAULTS, *configs, vquest_args) + config_full = vq.layer_configs(vq.DEFAULTS, *configs, vquest_args) LOGGER.debug("final config: %s", " ".join(["%s=%s" % (key, val) for key, val in config_full.items()])) LOGGER.info("Configuration prepared") 
@@ -63,7 +60,7 @@ def __setup_config(args, parser): def __process_output(args, output): if args.align: LOGGER.info("Writing FASTA to stdout") - print(airr_to_fasta(output["vquest_airr.tsv"]), end="") + print(vq.airr_to_fasta(output["vquest_airr.tsv"]), end="") else: args.outdir.mkdir(parents=True, exist_ok=True) if args.collapse: @@ -84,14 +81,14 @@ def __process_output(args, output): def __setup_arg_parser(): parser = argparse.ArgumentParser( - description=vquest.__doc__, + description=main_doc, formatter_class=argparse.RawTextHelpFormatter) parser.add_argument("config", nargs="*", help="YAML configuration file") parser.add_argument( "--verbose", "-v", action="count", default=0, help="increase logging verbosity") parser.add_argument( - "--version", "-V", action="version", version=__version__) + "--version", "-V", action="version", version=vq.__version__) parser.add_argument( "--outdir", "-o", default=".", type=Path, help="directory for output files (. by default)") # https://stackoverflow.com/a/52403318/4499968 @@ -108,7 +105,7 @@ def __setup_arg_parser(): "from AIRR results and print as FASTA. 
" "If there is no text in the sequence_alignment column " "for a given sequence the original sequence is used instead.")) - for opt_section in OPTIONS: + for opt_section in vq.OPTIONS: option_parser = parser.add_argument_group( title="V-QUEST options: \"%s\" section" % opt_section["section"], description=opt_section["description"]) diff --git a/vquest/request.py b/vquest/request.py index 06a3f65..314ac59 100644 --- a/vquest/request.py +++ b/vquest/request.py @@ -5,6 +5,7 @@ import time import logging from io import StringIO +from pathlib import Path import requests from requests_html import HTML from Bio import SeqIO @@ -12,19 +13,38 @@ LOGGER = logging.getLogger(__name__) -URL = "http://www.imgt.org/IMGT_vquest/analysis" +URL = "https://www.imgt.org/IMGT_vquest/analysis" DELAY = 1 # for rate-limiting multiple requests CHUNK_SIZE = 50 # to stay within V-QUEST's limit on sequences in one go +EXTS = { + ".fasta": "fasta", + ".fa": "fasta", + ".fna": "fasta", + ".fastq": "fastq", + ".fq": "fastq"} + def _parse_records(config): """Extract Seq records for sequences given in config""" records = [] if "sequences" in config and config["sequences"]: + if config["sequences"].startswith("@"): + fmt = "fastq" + elif config["sequences"].startswith(">"): + fmt = "fasta" + else: + raise ValueError("Sequence format not recognized") with StringIO(config["sequences"]) as seqs_stream: - records.extend(list(SeqIO.parse(seqs_stream, "fasta"))) + records.extend(list(SeqIO.parse(seqs_stream, fmt))) if "fileSequences" in config and config["fileSequences"]: - with open(config["fileSequences"]) as f_in: - records.extend(list(SeqIO.parse(f_in, "fasta"))) + path = Path(config["fileSequences"]) + ext = path.suffix.lower() + try: + fmt = EXTS[ext] + except KeyError as err: + raise ValueError(f"File format not recognized for {path}") from err + with open(path) as f_in: + records.extend(list(SeqIO.parse(f_in, fmt))) return records def vquest(config, collapse=True): @@ -94,4 +114,8 @@ def 
_collapse_outputs(outputs): else: airr = output_chunk["vquest_airr.tsv"].decode() output["vquest_airr.tsv"] += "\n".join(airr.splitlines()[1:]) + # I've seen cases where there may or may not be a final newline, so + # let's make sure there always is + if not output["vquest_airr.tsv"].endswith("\n"): + output["vquest_airr.tsv"] += "\n" return output diff --git a/vquest/version.py b/vquest/version.py index df89bd8..b37da1f 100644 --- a/vquest/version.py +++ b/vquest/version.py @@ -5,4 +5,4 @@ See https://www.python.org/dev/peps/pep-0396/ """ -__version__ = "0.0.9" +__version__ = "0.0.10" diff --git a/vquest/vq.py b/vquest/vq.py new file mode 100644 index 0000000..cb66e40 --- /dev/null +++ b/vquest/vq.py @@ -0,0 +1,7 @@ +""" +Common imports grouped here for convenience. +""" +from .request import vquest +from .config import DEFAULTS, OPTIONS, load_config, layer_configs +from .util import airr_to_fasta +from .version import __version__