Skip to content
This repository has been archived by the owner on Aug 26, 2023. It is now read-only.

Commit

Permalink
Fix fastq quality encoding inference (#243)
Browse files Browse the repository at this point in the history
* The FASTQ parser now actually uses the given `quality_encoding` argument.

Previously, `infer_quality_encoding()` overwrote the given `encodings`
argument with `ALL_QUAL_ENCODINGS`, so the caller's choice was ignored.

* FASTQParser requires exactly one quality encoding

* Make `quality_encoding` a required argument to `open(filename, FASTQ)`, i.e. `open(filename, FASTQ, quality_encoding)`.

* Update tests to use the new FASTQ parsing API.
  • Loading branch information
kemaleren authored and Ben J. Ward committed Jul 31, 2016
1 parent b8e991b commit 046f166
Show file tree
Hide file tree
Showing 4 changed files with 17 additions and 11 deletions.
4 changes: 2 additions & 2 deletions src/precompile.jl
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,11 @@

if VERSION < v"0.5-"
precompile(Base.open, (ASCIIString, Type{Seq.FASTA},))
precompile(Base.open, (ASCIIString, Type{Seq.FASTQ},))
precompile(Base.open, (ASCIIString, Type{Seq.FASTQ}, Type{Seq.QualityEncoding},))
precompile(Base.open, (ASCIIString, Type{Intervals.BED},))
else
precompile(Base.open, (String, Type{Seq.FASTA},))
precompile(Base.open, (String, Type{Seq.FASTQ},))
precompile(Base.open, (String, Type{Seq.FASTQ}, Type{Seq.QualityEncoding},))
precompile(Base.open, (String, Type{Intervals.BED},))
end
precompile(Base.read, (Seq.FASTAParser{Seq.BioSequence},))
Expand Down
11 changes: 9 additions & 2 deletions src/seq/fastq.jl
Original file line number Diff line number Diff line change
Expand Up @@ -59,16 +59,16 @@ function Base.open(
ascii_offset::Integer=typemin(Int))
io = open(filepath, mode)
if mode[1] == 'r'
return open(BufferedInputStream(io), FASTQ; quality_encoding=quality_encoding)
return open(BufferedInputStream(io), FASTQ, quality_encoding)
elseif mode[1] ∈ ('w', 'a')
return FASTQWriter(io, quality_header, ascii_offset)
end
error("invalid open mode")
end

function Base.open{S}(input::BufferedInputStream, ::Type{FASTQ},
quality_encoding::QualityEncoding,
::Type{S}=DNASequence;
quality_encoding::QualityEncoding=EMPTY_QUAL_ENCODING,
# TODO: remove this option after v0.2
qualenc=quality_encoding)
return FASTQParser{S}(input, quality_encoding)
Expand All @@ -90,6 +90,13 @@ type FASTQParser{S<:Sequence} <: AbstractParser

function FASTQParser(input::BufferedInputStream,
quality_encodings::QualityEncoding)
if quality_encodings == EMPTY_QUAL_ENCODING
error("The `quality_encodings` argument is required when parsing FASTQ.")
elseif count_ones(convert(UInt16, quality_encodings)) > 1
error("The `quality_encodings` argument must specify exactly one encoding.")
elseif count_ones(convert(UInt16, quality_encodings & ALL_QUAL_ENCODINGS)) != 1
error("Unknown quality encoding.")
end
return new(Ragel.State(fastqparser_start, input),
BufferedOutputStream(), BufferedOutputStream(),
StringField(), StringField(), 0, quality_encodings)
Expand Down
1 change: 0 additions & 1 deletion src/seq/quality.jl
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,6 @@ giving the set of compatible encodings.
function infer_quality_encoding(data::Vector{UInt8}, start, stop,
encodings::QualityEncoding=ALL_QUAL_ENCODINGS,
default::QualityEncoding=EMPTY_QUAL_ENCODING)
encodings = ALL_QUAL_ENCODINGS
@inbounds for i in start:stop
c = data[i]
if '!' <= c <= '~'
Expand Down
12 changes: 6 additions & 6 deletions test/seq/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2764,15 +2764,15 @@ end

function check_fastq_parse(filename)
# Reading from a stream
for seqrec in open(filename, FASTQ)
for seqrec in open(filename, FASTQ, Seq.SANGER_QUAL_ENCODING)
end

# Reading from a memory mapped file
for seqrec in open(filename, FASTQ, memory_map=true)
for seqrec in open(filename, FASTQ, Seq.SANGER_QUAL_ENCODING, memory_map=true)
end

# in-place parsing
stream = open(filename, FASTQ)
stream = open(filename, FASTQ, Seq.SANGER_QUAL_ENCODING)
entry = eltype(stream)()
while !eof(stream)
read!(stream, entry)
Expand All @@ -2782,14 +2782,14 @@ end
output = IOBuffer()
writer = Seq.FASTQWriter(output, false, typemin(Int))
expected_entries = Any[]
for seqrec in open(filename, FASTQ)
for seqrec in open(filename, FASTQ, Seq.SANGER_QUAL_ENCODING)
write(writer, seqrec)
push!(expected_entries, seqrec)
end
flush(writer)

read_entries = Any[]
for seqrec in open(takebuf_array(output), FASTQ)
for seqrec in open(takebuf_array(output), FASTQ, Seq.SANGER_QUAL_ENCODING)
push!(read_entries, seqrec)
end

Expand Down Expand Up @@ -2822,7 +2822,7 @@ end
""")
for A in (DNAAlphabet{2}, DNAAlphabet{4})
seekstart(input)
seq = first(open(input, FASTQ, BioSequence{A})).seq
seq = first(open(input, FASTQ, Seq.SANGER_QUAL_ENCODING, BioSequence{A})).seq
@test typeof(seq) == BioSequence{A}
end
end
Expand Down

0 comments on commit 046f166

Please sign in to comment.