Skip to content
This repository has been archived by the owner on Aug 26, 2023. It is now read-only.

Commit

Permalink
Sliding window distance computation (#338)
Browse files Browse the repository at this point in the history
* first draft of windowed distance computation

* Corrected typo

* Remembered to initialize integer matrices to 0

* distance now also outputs ranges

* Changed ends construction

* Trying linear indexing to achieve simd

* Tried simplifying inner loop

* Eliminated some repeated operations

* added window distances for Proportion{T} distances too

* Corrected error in proportion windowed distance

* Added sliding distance method for JukesCantor69 distances

* Added draft flagmutations for both transition and transversion mutations.

* Added draft of windowed distance for Count{T} <: TsTv

* Added draft of windowed k80 distance

* Added corrections to windowed distance measure function signatures

* Added tests for windowed counts of Any, Transition, and Transversion Mutations

* Edited tests

* Corrected tests

* Edited tests

* Edited tests again

* Added tests for windowed proportion distance

* Tests edit

* Made some tests use isapprox instead of isequal

* Corrected missing end statements

* Added docstrings

* Corrected type in docstrings

* changed approx tests to different tolerance

* Corrected a typo in a function

* Added tests for windowed JukesCantor69 distance

* Added simd to proportion distance function

* Implemented @BICYCLE's suggestions

* Corrected stupid typo mistake
  • Loading branch information
Ben J. Ward authored and Ben Ward committed Feb 27, 2017
1 parent aa0ee59 commit 485691f
Show file tree
Hide file tree
Showing 4 changed files with 269 additions and 5 deletions.
11 changes: 11 additions & 0 deletions src/util/windows.jl
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,17 @@ immutable EachWindowIterator{T <: ArrayOrStringOrSeq}
end


# Plain parameter record for a sliding window iteration: three integers and no
# reference to the data being windowed.
immutable EachWindow
# NOTE(review): presumably the last position that windows may extend to — confirm
# against the code that constructs EachWindow.
to::Int
# NOTE(review): presumably the number of positions covered by one window — confirm.
width::Int
# NOTE(review): presumably how far the window advances between iterations — confirm.
step::Int
end

# Return the number of windows described by `winitr`, as an `Int`.
#
# Window end positions are `width, width + step, ...` up to at most `to`, which is
# the same construction the windowed distance methods use (`width:step:nbases`).
# `EachWindow` has no `data` field, so the upper bound is taken from `to` directly.
function Base.size(winitr::EachWindow)
    window_ends = StepRange(winitr.width, winitr.step, winitr.to)
    return length(window_ends)
end


"""
Calculate the number of windows that will result from iterating across the container.
Expand Down
184 changes: 182 additions & 2 deletions src/var/distances.jl
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,10 @@ immutable Kimura80 <: TsTv end
# Distance computation internals
# ------------------------------

# p-distance for a `Proportion{T}` measure: the count `n` divided by the number of
# sites `l`. The division is floating point, so `l == 0` yields `NaN` or `Inf`
# rather than throwing.
@inline function expected_distance{T}(::Type{Proportion{T}}, n::Int64, l::Int64)
    proportion = n / l
    return proportion
end

## Jukes and Cantor 1969 distance computation.

@inline function expected_distance(::Type{JukesCantor69}, p::Float64)
Expand Down Expand Up @@ -129,6 +133,104 @@ function distance{T<:MutationType,A<:NucleotideAlphabet}(::Type{Count{T}}, seqs:
return count_mutations(T, seqs)
end

"""
    distance{T<:MutationType,A<:NucleotideAlphabet}(::Type{Count{T}}, seqs::Vector{BioSequence{A}}, width::Int, step::Int)

Count mutations of type `T` between pairs of sequences using a sliding window.

A window of `width` positions moves along the sequences in increments of `step`.
Windows end at positions `width:step:nbases`, where `nbases` is the number of rows
of the flag matrices returned by `flagmutations(T, seqs)`; a trailing partial
window is not produced.

Returns a tuple `(mcounts, wsizes, ranges)`:

* `mcounts`: `nwindows × npairs` matrix with the number of flagged mutations in
  each window, one column per sequence pair.
* `wsizes`: `nwindows × npairs` matrix with `width` minus the number of sites
  flagged as ambiguous in each window.
* `ranges`: vector holding the `UnitRange` of positions covered by each window.

Throws an `ArgumentError` if `width < 1`, `step < 1`, or `width > nbases`.
"""
function distance{T<:MutationType,A<:NucleotideAlphabet}(::Type{Count{T}}, seqs::Vector{BioSequence{A}}, width::Int, step::Int)
    mutation_flags, ambiguous_flags = flagmutations(T, seqs)
    nbases, npairs = size(mutation_flags)
    if width < 1
        throw(ArgumentError("`window` width must be ≥ 1."))
    end
    if step < 1
        throw(ArgumentError("`step` must be ≥ 1."))
    end
    if width > nbases
        throw(ArgumentError("The `window` size cannot be greater than number of data elements."))
    end
    starts = 1:step:nbases
    ends = width:step:nbases
    nwindows = length(ends)
    # The window ranges do not depend on the sequence pair, so build them once up
    # front. This also guarantees they are initialised when there are no pairs.
    ranges = UnitRange{Int}[starts[i]:ends[i] for i in 1:nwindows]
    mcounts = Matrix{Int}(nwindows, npairs)
    wsizes = Matrix{Int}(nwindows, npairs)

    @inbounds for pair in 1:npairs
        # Linear-index offsets of this pair's column in the output and flag matrices.
        pairoffset = pair - 1
        windowoffset = pairoffset * nwindows
        flagsoffset = pairoffset * nbases
        for i in 1:nwindows
            from = starts[i]
            to = ends[i]
            mcount = 0
            nsites = width
            @simd for j in from:to
                mcount += mutation_flags[flagsoffset + j]
                nsites -= ambiguous_flags[flagsoffset + j]
            end
            mcounts[windowoffset + i] = mcount
            wsizes[windowoffset + i] = nsites
        end
    end
    return mcounts, wsizes, ranges
end


"""
    distance{T<:TsTv,A<:NucleotideAlphabet}(::Type{Count{T}}, seqs::Vector{BioSequence{A}}, width::Int, step::Int)

Count transitions and transversions between pairs of sequences using a sliding
window.

A window of `width` positions moves along the sequences in increments of `step`.
Windows end at positions `width:step:nbases`, where `nbases` is the number of rows
of the flag matrices returned by
`flagmutations(TransitionMutation, TransversionMutation, seqs)`; a trailing partial
window is not produced.

Returns a tuple `(tscounts, tvcounts, wsizes, ranges)`:

* `tscounts`: `nwindows × npairs` matrix of flagged transitions per window.
* `tvcounts`: `nwindows × npairs` matrix of flagged transversions per window.
* `wsizes`: `nwindows × npairs` matrix with `width` minus the number of sites
  flagged as ambiguous in each window.
* `ranges`: vector holding the `UnitRange` of positions covered by each window.

Throws an `ArgumentError` if `width < 1`, `step < 1`, or `width > nbases`.
"""
function distance{T<:TsTv,A<:NucleotideAlphabet}(::Type{Count{T}}, seqs::Vector{BioSequence{A}}, width::Int, step::Int)
    transitionFlags, transversionFlags, ambiguous_flags = flagmutations(TransitionMutation, TransversionMutation, seqs)
    nbases, npairs = size(transitionFlags)
    if width < 1
        throw(ArgumentError("`window` width must be ≥ 1."))
    end
    if step < 1
        throw(ArgumentError("`step` must be ≥ 1."))
    end
    if width > nbases
        throw(ArgumentError("The `window` size cannot be greater than number of data elements."))
    end
    starts = 1:step:nbases
    ends = width:step:nbases
    nwindows = length(ends)
    # The window ranges do not depend on the sequence pair, so build them once up
    # front. This also guarantees they are initialised when there are no pairs.
    ranges = UnitRange{Int}[starts[i]:ends[i] for i in 1:nwindows]
    tscounts = Matrix{Int}(nwindows, npairs)
    tvcounts = Matrix{Int}(nwindows, npairs)
    wsizes = Matrix{Int}(nwindows, npairs)

    @inbounds for pair in 1:npairs
        # Linear-index offsets of this pair's column in the output and flag matrices.
        pairoffset = pair - 1
        windowoffset = pairoffset * nwindows
        flagsoffset = pairoffset * nbases
        for i in 1:nwindows
            from = starts[i]
            to = ends[i]
            tscount = 0
            tvcount = 0
            nsites = width
            @simd for j in from:to
                tscount += transitionFlags[flagsoffset + j]
                tvcount += transversionFlags[flagsoffset + j]
                nsites -= ambiguous_flags[flagsoffset + j]
            end
            tscounts[windowoffset + i] = tscount
            tvcounts[windowoffset + i] = tvcount
            wsizes[windowoffset + i] = nsites
        end
    end
    return tscounts, tvcounts, wsizes, ranges
end


"""
distance{T<:MutationType,N<:Nucleotide}(::Type{Count{T}}, seqs::Matrix{N})
Expand Down Expand Up @@ -164,7 +266,7 @@ vector of the number of valid (i.e. non-ambiguous sites) counted by the function
function distance{T<:MutationType,A<:NucleotideAlphabet}(::Type{Proportion{T}}, seqs::Vector{BioSequence{A}})
d, l = distance(Count{T}, seqs)
D = Vector{Float64}(length(d))
@inbounds for i in 1:length(D)
@inbounds @simd for i in 1:length(D)
D[i] = d[i] / l[i]
end
return D, l
Expand All @@ -174,7 +276,7 @@ end
distance{T<:MutationType,N<:Nucleotide}(::Type{Proportion{T}}, seqs::Matrix{N})
This method of distance returns a tuple of a vector of the p-distances, and a
vector of the number of valid (i.e. non-ambiguous sites) counted by the function.
vector of the number of valid (i.e. non-ambiguous) sites counted by the function.
**Note: This method assumes that the sequences are stored in the `Matrix{N}`
provided as `seqs` in sequence major order i.e. each column of the matrix is one
Expand All @@ -189,6 +291,27 @@ function distance{T<:MutationType,N<:Nucleotide}(::Type{Proportion{T}}, seqs::Ma
return D, l
end

"""
    distance{T<:MutationType,A<:NucleotideAlphabet}(::Type{Proportion{T}}, seqs::Vector{BioSequence{A}}, width::Int, step::Int)

Compute pairwise p-distances using a sliding window.

The windowed mutation counts are obtained from
`distance(Count{T}, seqs, width, step)`, and every count is then divided by the
number of valid sites of its window.

Returns a tuple of the matrix of p-distances for every window, the matrix of the
number of valid sites counted for every window, and the vector of position ranges
covered by the windows.
"""
function distance{T<:MutationType,A<:NucleotideAlphabet}(::Type{Proportion{T}}, seqs::Vector{BioSequence{A}}, width::Int, step::Int)
    nmutations, nvalid, windows = distance(Count{T}, seqs, width, step)
    pdists = Matrix{Float64}(size(nmutations))
    @inbounds for idx in eachindex(nmutations)
        pdists[idx] = expected_distance(Proportion{T}, nmutations[idx], nvalid[idx])
    end
    return pdists, nvalid, windows
end

"""
distance{A<:NucleotideAlphabet}(::Type{JukesCantor69}, seqs::Vector{BioSequence{A}})
Expand All @@ -206,6 +329,32 @@ function distance{A<:NucleotideAlphabet}(::Type{JukesCantor69}, seqs::Vector{Bio
return D, V
end

"""
    distance{A<:NucleotideAlphabet}(::Type{JukesCantor69}, seqs::Vector{BioSequence{A}}, width::Int, step::Int)

Compute pairwise JukesCantor69 distances using a sliding window.

The windowed p-distances are obtained from
`distance(Proportion{AnyMutation}, seqs, width, step)`. Each one is passed to
`expected_distance(JukesCantor69, p)` and, together with the number of valid sites
of its window, to `variance(JukesCantor69, p, l)`.

Returns a tuple of the matrix of distance estimates for every window, the matrix
of the corresponding variances, and the vector of position ranges covered by the
windows.
"""
function distance{A<:NucleotideAlphabet}(::Type{JukesCantor69}, seqs::Vector{BioSequence{A}}, width::Int, step::Int)
    pdists, nvalid, windows = distance(Proportion{AnyMutation}, seqs, width, step)
    nrows, ncols = size(pdists)
    estimates = Matrix{Float64}(nrows, ncols)
    variances = Matrix{Float64}(nrows, ncols)
    @inbounds for idx in eachindex(pdists)
        estimates[idx] = expected_distance(JukesCantor69, pdists[idx])
        variances[idx] = variance(JukesCantor69, pdists[idx], nvalid[idx])
    end
    return estimates, variances, windows
end

"""
distance{N<:Nucleotide}(::Type{JukesCantor69}, seqs::Matrix{N})
Expand Down Expand Up @@ -250,6 +399,37 @@ function distance{A<:NucleotideAlphabet}(::Type{Kimura80}, seqs::Vector{BioSeque
return D, V
end

"""
    distance{A<:NucleotideAlphabet}(::Type{Kimura80}, seqs::Vector{BioSequence{A}}, width::Int, step::Int)

Compute pairwise Kimura80 distances using a sliding window.

The windowed transition and transversion counts are obtained from
`distance(Count{Kimura80}, seqs, width, step)`. For every window the counts are
turned into the proportions `P` (transitions) and `Q` (transversions) of the
window's valid sites, which are then passed to `expected_distance(Kimura80, ...)`
and `variance(Kimura80, ...)`.

Returns a tuple of the matrix of distance estimates for every window, the matrix
of the corresponding variances, and the vector of position ranges covered by the
windows.
"""
function distance{A<:NucleotideAlphabet}(::Type{Kimura80}, seqs::Vector{BioSequence{A}}, width::Int, step::Int)
    tss, tvs, wsizes, ranges = distance(Count{Kimura80}, seqs, width, step)
    nrows, ncols = size(tss)
    est = Matrix{Float64}(nrows, ncols)
    var = Matrix{Float64}(nrows, ncols)
    @inbounds for i in 1:endof(tss)
        # Number of valid (non-ambiguous) sites in this window.
        L = wsizes[i]
        P = tss[i] / L
        Q = tvs[i] / L
        a1 = 1 - 2 * P - Q
        a2 = 1 - 2 * Q
        est[i] = expected_distance(Kimura80, a1, a2)
        var[i] = variance(Kimura80, P, Q, L, a1, a2)
    end
    return est, var, ranges
end

"""
distance{N<:Nucleotide}(::Type{Kimura80}, seqs::Matrix{N})
Expand Down
39 changes: 37 additions & 2 deletions src/var/mutation_counting.jl
Original file line number Diff line number Diff line change
Expand Up @@ -196,8 +196,9 @@ function flagmutations{M<:MutationType,N<:Nucleotide}(::Type{M}, seqs::Matrix{N}
return ismutant, isambiguous
end



# Convenience method: convert the sequences to a matrix with `seqmatrix(seqs, :seq)`
# and delegate to the matrix method of `flagmutations` for mutation type `M`.
function flagmutations{M<:MutationType,A<:NucleotideAlphabet}(::Type{M}, seqs::Vector{BioSequence{A}})
    seqmat = seqmatrix(seqs, :seq)
    return flagmutations(M, seqmat)
end

"""
count_mutations{T<:MutationType,N<:Nucleotide}(::Type{T}, seqs::Matrix{N})
Expand Down Expand Up @@ -263,6 +264,40 @@ function count_mutations{T<:MutationType,A<:NucleotideAlphabet}(::Type{T}, seque
return count_mutations(T, seqs)
end


# Flag transitions, transversions and ambiguous sites for every pair of columns
# of `seqs`.
#
# Returns three `Bool` matrices (transition flags, transversion flags, ambiguity
# flags), each with `size(seqs, 1)` rows and one column per pair of columns of
# `seqs`. Pairs are laid out in the order (1,2), (1,3), ..., (2,3), ...
function flagmutations{N<:Nucleotide}(::Type{TransitionMutation}, ::Type{TransversionMutation}, seqs::Matrix{N})
    seqlen, nseqs = size(seqs)
    npairs = binomial(nseqs, 2)
    tsflags = Matrix{Bool}(seqlen, npairs)
    tvflags = Matrix{Bool}(seqlen, npairs)
    ambflags = Matrix{Bool}(seqlen, npairs)
    # Linear-index offset of the current pair's column in the three output matrices.
    outoffset = 0
    @inbounds for a in 1:nseqs
        aoffset = (a - 1) * seqlen
        for b in (a + 1):nseqs
            boffset = (b - 1) * seqlen
            for pos in 1:seqlen
                x = seqs[aoffset + pos]
                y = seqs[boffset + pos]
                ambiguous = is_ambiguous_strict(x, y)
                # A site can only be flagged as a mutation when it is
                # unambiguous and the two symbols differ.
                countable = !ambiguous & (x != y)
                transition = is_mutation(TransitionMutation, x, y)
                ambflags[outoffset + pos] = ambiguous
                tsflags[outoffset + pos] = countable & transition
                tvflags[outoffset + pos] = countable & !transition
            end
            outoffset += seqlen
        end
    end
    return tsflags, tvflags, ambflags
end

# Convenience method: convert the sequences to a matrix with `seqmatrix(seqs, :seq)`
# and delegate to the matrix method that flags transitions and transversions.
function flagmutations{A<:NucleotideAlphabet}(::Type{TransitionMutation}, ::Type{TransversionMutation}, seqs::Vector{BioSequence{A}})
    seqmat = seqmatrix(seqs, :seq)
    return flagmutations(TransitionMutation, TransversionMutation, seqmat)
end



"""
count_mutations{N<:Nucleotide}(::Type{TransitionMutation}, ::Type{TransversionMutation}, sequences::Matrix{N})
Expand Down
40 changes: 39 additions & 1 deletion test/var/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,10 @@ end
# Fixtures for the distance tests: pairs of 20-symbol DNA sequences, plus matrix
# forms built with `seqmatrix(..., :seq)`.
dnas1 = [dna"ATTG-ACCTGGNTTTCCGAA", dna"A-ACAGAGTATACRGTCGTC"]
m1 = seqmatrix(dnas1, :seq)

# NOTE(review): `dnas2` is assigned twice with the same sequences (a single-line
# form followed by a two-line form); this looks like diff old/new residue — confirm
# that only one assignment exists in the real file.
dnas2 = [dna"attgaacctggntttccgaa", dna"atacagagtatacrgtcgtc"]
dnas2 = [dna"attgaacctggntttccgaa",
dna"atacagagtatacrgtcgtc"]
# Second windowed fixture; used below for the windowed JukesCantor69 test.
dnas3 = [dna"attgaacctgtntttccgaa",
dna"atagaacgtatatrgccgtc"]
m2 = seqmatrix(dnas2, :seq)

# Whole-sequence counts return (mutation counts, valid site counts).
@test distance(Count{AnyMutation}, dnas1) == ([12], [16])
Expand All @@ -51,6 +54,14 @@ end
# Whole-sequence counts on the matrix form of `dnas1`.
@test distance(Count{TransversionMutation}, m1) == ([8], [16])
@test distance(Count{Kimura80}, m1) == ([4], [8], [16])

# Windowed counts with width 5 and step 5 (four windows over 20 positions).
# Element [1] of the result is the matrix of mutation counts per window and
# element [2] the matrix of valid site counts per window; `[:]` flattens them.
@test distance(Count{AnyMutation}, dnas2, 5, 5)[1][:] == [2, 4, 3, 3]
@test distance(Count{AnyMutation}, dnas2, 5, 5)[2][:] == [5, 5, 3, 5]
@test distance(Count{TransitionMutation}, dnas2, 5, 5)[1][:] == [0, 2, 1, 1]
@test distance(Count{TransitionMutation}, dnas2, 5, 5)[2][:] == [5, 5, 3, 5]
@test distance(Count{TransversionMutation}, dnas2, 5, 5)[1][:] == [2, 2, 2, 2]
@test distance(Count{TransversionMutation}, dnas2, 5, 5)[2][:] == [5, 5, 3, 5]
# Non-windowed Kimura80 count: (transitions, transversions, valid sites).
@test distance(Count{Kimura80}, dnas1) == ([4], [8], [16])

# Whole-sequence counts on `dnas2`.
@test distance(Count{AnyMutation}, dnas2) == ([12], [18])
@test distance(Count{TransitionMutation}, dnas2) == ([4], [18])
@test distance(Count{TransversionMutation}, dnas2) == ([8], [18])
Expand All @@ -60,6 +71,25 @@ end
# Whole-sequence counts on the matrix form of `dnas2`.
@test distance(Count{TransversionMutation}, m2) == ([8], [18])
@test distance(Count{Kimura80}, m2) == ([4], [8], [18])

# Windowed p-distances with width 5 and step 5. `d[1]` holds the p-distances,
# compared element-wise against `a` with an absolute tolerance of 1e-4, and
# `d[2]` holds the valid site counts per window.
d = distance(Proportion{AnyMutation}, dnas2, 5, 5)
a = [0.4, 0.8, 1.0, 0.6]
for i in 1:length(d[1])
@test_approx_eq_eps d[1][i] a[i] 1e-4
end
@test d[2][:] == [5, 5, 3, 5]
# Same check restricted to transitions.
d = distance(Proportion{TransitionMutation}, dnas2, 5, 5)
a = [0.0, 0.4, 0.333333, 0.2]
for i in 1:length(d[1])
@test_approx_eq_eps d[1][i] a[i] 1e-4
end
@test d[2][:] == [5, 5, 3, 5]
# Same check restricted to transversions.
d = distance(Proportion{TransversionMutation}, dnas2, 5, 5)
a = [0.4, 0.4, 0.666667, 0.4]
for i in 1:length(d[1])
@test_approx_eq_eps d[1][i] a[i] 1e-4
end
@test d[2][:] == [5, 5, 3, 5]

# Whole-sequence p-distances return (p-distances, valid site counts).
@test distance(Proportion{AnyMutation}, dnas1) == ([(12 / 16)], [16])
@test distance(Proportion{TransitionMutation}, dnas1) == ([(4 / 16)], [16])
@test distance(Proportion{TransversionMutation}, dnas1) == ([(8 / 16)], [16])
Expand All @@ -81,6 +111,14 @@ end
# Whole-sequence JukesCantor69 results, rounded to 3 digits: element [1] is the
# distance estimate and element [2] the variance.
@test round(distance(JukesCantor69, dnas2)[2][1], 3) == 1
@test round(distance(JukesCantor69, m2)[1][1], 3) == 1.648
@test round(distance(JukesCantor69, m2)[2][1], 3) == 1
# The windowed JukesCantor69 computation is expected to throw a DomainError for
# `dnas2` with width 5 and step 5.
@test_throws DomainError distance(JukesCantor69, dnas2, 5, 5)
# For `dnas3` every window is computable: `d[1]` holds the estimates (expected
# values in `a`) and `d[2]` the variances (expected values in `v`), both compared
# with an absolute tolerance of 1e-5.
d = distance(JukesCantor69, dnas3, 5, 5)
a = [0.232616, 0.571605, 0.44084, 0.571605]
v = [0.0595041, 0.220408, 0.24, 0.220408]
for i in 1:length(d[1])
@test_approx_eq_eps d[1][i] a[i] 1e-5
@test_approx_eq_eps d[2][i] v[i] 1e-5
end

# Whole-sequence Kimura80 results, rounded to 3 digits.
@test round(distance(Kimura80, dnas2)[1][1], 3) == 1.648
@test round(distance(Kimura80, dnas2)[2][1], 3) == 1
Expand Down

0 comments on commit 485691f

Please sign in to comment.