From 046f166544ef4bcf312849ba4366e071b89742bd Mon Sep 17 00:00:00 2001 From: Kemal Eren Date: Sun, 31 Jul 2016 20:32:41 +0200 Subject: [PATCH] Fix fastq quality encoding inference (#243) * FASTQ parser actually uses given `quality_encoding` argument. `infer_quality_encoding()` was overwriting the given `encodings` argument. * FASTQParser requires exactly one quality encoding * make `quality_encoding` a required argument to `open(filename, FASTQ)` * update tests to use new fastq parsing api --- src/precompile.jl | 4 ++-- src/seq/fastq.jl | 11 +++++++++-- src/seq/quality.jl | 1 - test/seq/runtests.jl | 12 ++++++------ 4 files changed, 17 insertions(+), 11 deletions(-) diff --git a/src/precompile.jl b/src/precompile.jl index 7872899e0..fa04d2bc2 100644 --- a/src/precompile.jl +++ b/src/precompile.jl @@ -6,11 +6,11 @@ if VERSION < v"0.5-" precompile(Base.open, (ASCIIString, Type{Seq.FASTA},)) - precompile(Base.open, (ASCIIString, Type{Seq.FASTQ},)) + precompile(Base.open, (ASCIIString, Type{Seq.FASTQ}, Type{Seq.QualityEncoding},)) precompile(Base.open, (ASCIIString, Type{Intervals.BED},)) else precompile(Base.open, (String, Type{Seq.FASTA},)) - precompile(Base.open, (String, Type{Seq.FASTQ},)) + precompile(Base.open, (String, Type{Seq.FASTQ}, Type{Seq.QualityEncoding},)) precompile(Base.open, (String, Type{Intervals.BED},)) end precompile(Base.read, (Seq.FASTAParser{Seq.BioSequence},)) diff --git a/src/seq/fastq.jl b/src/seq/fastq.jl index 229c9bf00..0d860533b 100644 --- a/src/seq/fastq.jl +++ b/src/seq/fastq.jl @@ -59,7 +59,7 @@ function Base.open( ascii_offset::Integer=typemin(Int)) io = open(filepath, mode) if mode[1] == 'r' - return open(BufferedInputStream(io), FASTQ; quality_encoding=quality_encoding) + return open(BufferedInputStream(io), FASTQ, quality_encoding) elseif mode[1] ∈ ('w', 'a') return FASTQWriter(io, quality_header, ascii_offset) end @@ -67,8 +67,8 @@ function Base.open( end function Base.open{S}(input::BufferedInputStream, ::Type{FASTQ}, + quality_encoding::QualityEncoding, ::Type{S}=DNASequence; - quality_encoding::QualityEncoding=EMPTY_QUAL_ENCODING, # TODO: remove this option after v0.2 qualenc=quality_encoding) return FASTQParser{S}(input, quality_encoding) @@ -90,6 +90,13 @@ type FASTQParser{S<:Sequence} <: AbstractParser function FASTQParser(input::BufferedInputStream, quality_encodings::QualityEncoding) + if quality_encodings == EMPTY_QUAL_ENCODING + error("The `quality_encodings` argument is required when parsing FASTQ.") + elseif count_ones(convert(UInt16, quality_encodings)) > 1 + error("The `quality_encodings` argument must specify exactly one encoding.") + elseif count_ones(convert(UInt16, quality_encodings & ALL_QUAL_ENCODINGS)) != 1 + error("Unknown quality encoding.") + end return new(Ragel.State(fastqparser_start, input), BufferedOutputStream(), BufferedOutputStream(), StringField(), StringField(), 0, quality_encodings) diff --git a/src/seq/quality.jl b/src/seq/quality.jl index 8da4d44cf..b2ae380f9 100644 --- a/src/seq/quality.jl +++ b/src/seq/quality.jl @@ -90,7 +90,6 @@ giving the set of compatible encodings. function infer_quality_encoding(data::Vector{UInt8}, start, stop, encodings::QualityEncoding=ALL_QUAL_ENCODINGS, default::QualityEncoding=EMPTY_QUAL_ENCODING) - encodings = ALL_QUAL_ENCODINGS @inbounds for i in start:stop c = data[i] if '!' <= c <= '~' diff --git a/test/seq/runtests.jl b/test/seq/runtests.jl index 3789824d3..91b85d872 100644 --- a/test/seq/runtests.jl +++ b/test/seq/runtests.jl @@ -2764,15 +2764,15 @@ end function check_fastq_parse(filename) # Reading from a stream - for seqrec in open(filename, FASTQ) + for seqrec in open(filename, FASTQ, Seq.SANGER_QUAL_ENCODING) end # Reading from a memory mapped file - for seqrec in open(filename, FASTQ, memory_map=true) + for seqrec in open(filename, FASTQ, Seq.SANGER_QUAL_ENCODING, memory_map=true) end # in-place parsing - stream = open(filename, FASTQ) + stream = open(filename, FASTQ, Seq.SANGER_QUAL_ENCODING) entry = eltype(stream)() while !eof(stream) read!(stream, entry) @@ -2782,14 +2782,14 @@ end output = IOBuffer() writer = Seq.FASTQWriter(output, false, typemin(Int)) expected_entries = Any[] - for seqrec in open(filename, FASTQ) + for seqrec in open(filename, FASTQ, Seq.SANGER_QUAL_ENCODING) write(writer, seqrec) push!(expected_entries, seqrec) end flush(writer) read_entries = Any[] - for seqrec in open(takebuf_array(output), FASTQ) + for seqrec in open(takebuf_array(output), FASTQ, Seq.SANGER_QUAL_ENCODING) push!(read_entries, seqrec) end @@ -2822,7 +2822,7 @@ end """) for A in (DNAAlphabet{2}, DNAAlphabet{4}) seekstart(input) - seq = first(open(input, FASTQ, BioSequence{A})).seq + seq = first(open(input, FASTQ, Seq.SANGER_QUAL_ENCODING, BioSequence{A})).seq @test typeof(seq) == BioSequence{A} end end