Sliding window distance computation (#338)

* first draft of windowed distance computation * Corrected typo * Remembered to initialize integer matrices to 0 * distance now also outputs ranges * Changed ends construction * Trying linear indexing to achieve simd * Tried simplifying inner loop * Eliminated some repeated operations * added window distances for Proportion{T} distances too * Corrected error in proportion windowed distance * Added sliding distance method for JukesCantor69 distances * Added draft flagmutations for both transition and transversion mutations. * Added draft of windowed distance for Count{T} <: TsTv * Added draft of windowed k80 distance * Added corrections to windowed distance measure function signatures * Added tests for windowed counts of Any, Transition, and Transversion Mutations * Edited tests * Corrected tests * Edited tests * Edited tests again * Added tests for windowed proportion distance * Tests edit * Made some tests use isapprox instead of isequal * Corrected missing end statements * Added docstrings * Corrected type in docstrings * changed approx tests to different tolerance * Corrected a typo in a function * Added tests for windowed JukesCantor69 distance * Added simd to proportion distance function * Implemented @BICYCLE's suggestions * Corrected stupid typo mistake
BioJulia · Feb 27, 2017 · 485691f · 485691f
1 parent aa0ee59
commit 485691f
Show file tree

Hide file tree

Showing 4 changed files with 269 additions and 5 deletions.
diff --git a/src/util/windows.jl b/src/util/windows.jl
@@ -40,6 +40,17 @@ immutable EachWindowIterator{T <: ArrayOrStringOrSeq}
 end
 
 
+# Window specification made of three integers only; no data container is stored.
+# NOTE(review): the field meanings below are inferred from their names — confirm.
+immutable EachWindow
+    # presumably the last position the windows may reach
+    to::Int
+    # presumably the number of elements covered by each window
+    width::Int
+    # presumably the offset between the starts of consecutive windows
+    step::Int
+end
+
+# Return the length of the stepped range that starts at `width + 1`, advances by
+# `step`, and is bounded above by `to`.
+#
+# `EachWindow` has no `data` field, so the upper bound is read from `to`; the
+# previous `length(winitr.data)` raised a field-access error on every call.
+# NOTE(review): the range starts at `width + 1` rather than `width` — confirm
+# that this offset is intended for the number of windows.
+function Base.size(winitr::EachWindow)
+    return length(StepRange(winitr.width + 1, winitr.step, winitr.to))
+end
+
+
 """
 Calculate the number of windows that will result from iterating across the container.
 

diff --git a/src/var/distances.jl b/src/var/distances.jl
@@ -59,6 +59,10 @@ immutable Kimura80 <: TsTv end
 # Distance computation internals
 # ------------------------------
 
+# Proportion distance: the ratio `n / l`. The windowed `Proportion{T}` method
+# passes the mutation count as `n` and the number of valid sites as `l`.
+# Both arguments accept any `Integer` so that counts stored as `Int` still
+# dispatch here on platforms where `Int` is not `Int64`.
+@inline function expected_distance{T}(::Type{Proportion{T}}, n::Integer, l::Integer)
+    return n / l
+end
+
 ## Jukes and Cantor 1969 distance computation.
 
 @inline function expected_distance(::Type{JukesCantor69}, p::Float64)
@@ -129,6 +133,104 @@ function distance{T<:MutationType,A<:NucleotideAlphabet}(::Type{Count{T}}, seqs:
     return count_mutations(T, seqs)
 end
 
+"""
+    distance{T<:MutationType,A<:NucleotideAlphabet}(::Type{Count{T}}, seqs::Vector{BioSequence{A}}, width::Int, step::Int)
+
+Count mutations between pairs of sequences using a sliding window.
+
+As the window of `width` base pairs in size moves across a pair of sequences in
+increments of `step`, the sites flagged as mutations of type `T` inside the
+window are counted.
+
+Returns a tuple of three values:
+
+1. a matrix of mutation counts, with one row per window and one column per
+   pair of sequences,
+2. a matrix of the same shape holding `width` minus the number of sites flagged
+   as ambiguous in that window,
+3. a vector with the `UnitRange` of positions covered by each window.
+
+Throws an `ArgumentError` if `width < 1`, if `step < 1`, or if `width` is
+greater than the number of sites.
+"""
+function distance{T<:MutationType,A<:NucleotideAlphabet}(::Type{Count{T}}, seqs::Vector{BioSequence{A}}, width::Int, step::Int)
+    mutation_flags, ambiguous_flags = flagmutations(T, seqs)
+    nbases, npairs = size(mutation_flags)
+    if width < 1
+        throw(ArgumentError("`window` width must be ≥ 1."))
+    end
+    if step < 1
+        throw(ArgumentError("`step` must be ≥ 1."))
+    end
+    if width > nbases
+        throw(ArgumentError("The `window` size cannot be greater than number of data elements."))
+    end
+    starts = 1:step:nbases
+    ends = width:step:nbases
+    # Only windows that fit completely are used, hence the count comes from `ends`.
+    nwindows = length(ends)
+    mcounts = Matrix{Int}(nwindows, npairs)
+    wsizes = Matrix{Int}(nwindows, npairs)
+    # The window ranges are identical for every pair, so build them once here.
+    # This also keeps `ranges` fully initialised when there are no pairs at all.
+    ranges = UnitRange{Int}[UnitRange(starts[i], ends[i]) for i in 1:nwindows]
+
+    @inbounds for pair in 1:npairs
+        # Linear-index offsets of this pair's column in the result and flag matrices.
+        pairoffset = pair - 1
+        windowoffset = pairoffset * nwindows
+        flagsoffset = pairoffset * nbases
+        for i in 1:nwindows
+            from = starts[i]
+            to = ends[i]
+            mcount = 0
+            # Start from the full window and subtract one per ambiguous site.
+            nsites = width
+            @simd for j in from:to
+                mcount += mutation_flags[flagsoffset + j]
+                nsites -= ambiguous_flags[flagsoffset + j]
+            end
+            mcounts[windowoffset + i] = mcount
+            wsizes[windowoffset + i] = nsites
+        end
+    end
+    return mcounts, wsizes, ranges
+end
+
+
+"""
+    distance{T<:TsTv,A<:NucleotideAlphabet}(::Type{Count{T}}, seqs::Vector{BioSequence{A}}, width::Int, step::Int)
+
+Count transitions and transversions between pairs of sequences using a sliding
+window.
+
+As the window of `width` base pairs in size moves across a pair of sequences in
+increments of `step`, the sites flagged as transitions and as transversions
+inside the window are counted separately.
+
+Returns a tuple of four values:
+
+1. a matrix of transition counts, with one row per window and one column per
+   pair of sequences,
+2. a matrix of transversion counts of the same shape,
+3. a matrix of the same shape holding `width` minus the number of sites flagged
+   as ambiguous in that window,
+4. a vector with the `UnitRange` of positions covered by each window.
+
+Throws an `ArgumentError` if `width < 1`, if `step < 1`, or if `width` is
+greater than the number of sites.
+"""
+function distance{T<:TsTv,A<:NucleotideAlphabet}(::Type{Count{T}}, seqs::Vector{BioSequence{A}}, width::Int, step::Int)
+    transitionFlags, transversionFlags, ambiguous_flags = flagmutations(TransitionMutation, TransversionMutation, seqs)
+    nbases, npairs = size(transitionFlags)
+    if width < 1
+        throw(ArgumentError("`window` width must be ≥ 1."))
+    end
+    if step < 1
+        throw(ArgumentError("`step` must be ≥ 1."))
+    end
+    if width > nbases
+        throw(ArgumentError("The `window` size cannot be greater than number of data elements."))
+    end
+    starts = 1:step:nbases
+    ends = width:step:nbases
+    # Only windows that fit completely are used, hence the count comes from `ends`.
+    nwindows = length(ends)
+    tscounts = Matrix{Int}(nwindows, npairs)
+    tvcounts = Matrix{Int}(nwindows, npairs)
+    wsizes = Matrix{Int}(nwindows, npairs)
+    # The window ranges are identical for every pair, so build them once here.
+    # This also keeps `ranges` fully initialised when there are no pairs at all.
+    ranges = UnitRange{Int}[UnitRange(starts[i], ends[i]) for i in 1:nwindows]
+
+    @inbounds for pair in 1:npairs
+        # Linear-index offsets of this pair's column in the result and flag matrices.
+        pairoffset = pair - 1
+        windowoffset = pairoffset * nwindows
+        flagsoffset = pairoffset * nbases
+        for i in 1:nwindows
+            from = starts[i]
+            to = ends[i]
+            tscount = 0
+            tvcount = 0
+            # Start from the full window and subtract one per ambiguous site.
+            nsites = width
+            @simd for j in from:to
+                tscount += transitionFlags[flagsoffset + j]
+                tvcount += transversionFlags[flagsoffset + j]
+                nsites -= ambiguous_flags[flagsoffset + j]
+            end
+            tscounts[windowoffset + i] = tscount
+            tvcounts[windowoffset + i] = tvcount
+            wsizes[windowoffset + i] = nsites
+        end
+    end
+    return tscounts, tvcounts, wsizes, ranges
+end
+
+
 """
     distance{T<:MutationType,N<:Nucleotide}(::Type{Count{T}}, seqs::Matrix{N})
 
@@ -164,7 +266,7 @@ vector of the number of valid (i.e. non-ambiguous sites) counted by the function
 function distance{T<:MutationType,A<:NucleotideAlphabet}(::Type{Proportion{T}}, seqs::Vector{BioSequence{A}})
     d, l = distance(Count{T}, seqs)
     D = Vector{Float64}(length(d))
-    @inbounds for i in 1:length(D)
+    @inbounds @simd for i in 1:length(D)
         D[i] = d[i] / l[i]
     end
     return D, l
@@ -174,7 +276,7 @@ end
     distance{T<:MutationType,N<:Nucleotide}(::Type{Proportion{T}}, seqs::Matrix{N})
 
 This method of distance returns a tuple of a vector of the p-distances, and a
-vector of the number of valid (i.e. non-ambiguous sites) counted by the function.
+vector of the number of valid (i.e. non-ambiguous) sites counted by the function.
 
 **Note: This method assumes that the sequences are stored in the `Matrix{N}`
 provided as `seqs` in sequence major order i.e. each column of the matrix is one
@@ -189,6 +291,27 @@ function distance{T<:MutationType,N<:Nucleotide}(::Type{Proportion{T}}, seqs::Ma
     return D, l
 end
 
+"""
+    distance{T<:MutationType,A<:NucleotideAlphabet}(::Type{Proportion{T}}, seqs::Vector{BioSequence{A}}, width::Int, step::Int)
+
+Compute pairwise p-distances using a sliding window.
+
+The windowed mutation counts are obtained from the `Count{T}` method with the
+same `width` and `step`, and every count is then divided by the number of valid
+sites recorded for its window.
+
+Returns a tuple of the matrix of p-distances for every window, the matrix of
+the number of valid sites counted for each window, and the vector of ranges
+covered by the windows.
+"""
+function distance{T<:MutationType,A<:NucleotideAlphabet}(::Type{Proportion{T}}, seqs::Vector{BioSequence{A}}, width::Int, step::Int)
+    mutations, nsites, windows = distance(Count{T}, seqs, width, step)
+    pdists = Matrix{Float64}(size(mutations))
+    # `mutations` and `nsites` have the same shape, so one linear index serves both.
+    @inbounds for k in eachindex(mutations)
+        pdists[k] = expected_distance(Proportion{T}, mutations[k], nsites[k])
+    end
+    return pdists, nsites, windows
+end
+
 """
     distance{A<:NucleotideAlphabet}(::Type{JukesCantor69}, seqs::Vector{BioSequence{A}})
 
@@ -206,6 +329,32 @@ function distance{A<:NucleotideAlphabet}(::Type{JukesCantor69}, seqs::Vector{Bio
     return D, V
 end
 
+"""
+    distance{A<:NucleotideAlphabet}(::Type{JukesCantor69}, seqs::Vector{BioSequence{A}}, width::Int, step::Int)
+
+Compute pairwise JukesCantor69 distances using a sliding window.
+
+The windowed p-distances are obtained from the `Proportion{AnyMutation}` method
+with the same `width` and `step`; each one is then turned into a JukesCantor69
+distance estimate together with its variance.
+
+Returns a tuple of the matrix of distance estimates for every window, the
+matrix of the corresponding variances, and the vector of ranges covered by the
+windows.
+"""
+function distance{A<:NucleotideAlphabet}(::Type{JukesCantor69}, seqs::Vector{BioSequence{A}}, width::Int, step::Int)
+    pdists, nsites, windows = distance(Proportion{AnyMutation}, seqs, width, step)
+    nrows, ncols = size(pdists)
+    estimates = Matrix{Float64}(nrows, ncols)
+    variances = Matrix{Float64}(nrows, ncols)
+    # All matrices share one shape, so a single linear index addresses each of them.
+    @inbounds for k in eachindex(pdists)
+        estimates[k] = expected_distance(JukesCantor69, pdists[k])
+        variances[k] = variance(JukesCantor69, pdists[k], nsites[k])
+    end
+    return estimates, variances, windows
+end
+
 """
     distance{N<:Nucleotide}(::Type{JukesCantor69}, seqs::Matrix{N})
 
@@ -250,6 +399,37 @@ function distance{A<:NucleotideAlphabet}(::Type{Kimura80}, seqs::Vector{BioSeque
     return D, V
 end
 
+"""
+    distance{A<:NucleotideAlphabet}(::Type{Kimura80}, seqs::Vector{BioSequence{A}}, width::Int, step::Int)
+
+Compute pairwise Kimura80 distances using a sliding window.
+
+The windowed transition and transversion counts are obtained from the
+`Count{Kimura80}` method with the same `width` and `step`. For every window the
+counts are divided by the number of valid sites to give the proportions `P`
+(transitions) and `Q` (transversions), from which the distance estimate and its
+variance are computed.
+
+Returns a tuple of the matrix of distance estimates for every window, the
+matrix of the corresponding variances, and the vector of ranges covered by the
+windows.
+"""
+function distance{A<:NucleotideAlphabet}(::Type{Kimura80}, seqs::Vector{BioSequence{A}}, width::Int, step::Int)
+    tss, tvs, wsizes, ranges = distance(Count{Kimura80}, seqs, width, step)
+    a, b = size(tss)
+    est = Matrix{Float64}(a, b)
+    var = Matrix{Float64}(a, b)
+    # `tss`, `tvs` and `wsizes` share one shape, so a single linear index is enough.
+    @inbounds for i in 1:endof(tss)
+        # Number of valid sites in this window.
+        L = wsizes[i]
+        P = tss[i] / L
+        Q = tvs[i] / L
+        a1 = 1 - 2 * P - Q
+        a2 = 1 - 2 * Q
+        est[i] = expected_distance(Kimura80, a1, a2)
+        var[i] = variance(Kimura80, P, Q, L, a1, a2)
+    end
+    return est, var, ranges
+end
+
 """
     distance{N<:Nucleotide}(::Type{Kimura80}, seqs::Matrix{N})
 

diff --git a/src/var/mutation_counting.jl b/src/var/mutation_counting.jl
@@ -196,8 +196,9 @@ function flagmutations{M<:MutationType,N<:Nucleotide}(::Type{M}, seqs::Matrix{N}
     return ismutant, isambiguous
 end
 
-
-
+# Flag mutations of type `M` for sequences supplied as a vector: the sequences are
+# converted with `seqmatrix(seqs, :seq)` and passed on to the matrix method.
+function flagmutations{M<:MutationType,A<:NucleotideAlphabet}(::Type{M}, seqs::Vector{BioSequence{A}})
+    seqmat = seqmatrix(seqs, :seq)
+    return flagmutations(M, seqmat)
+end
 
 """
     count_mutations{T<:MutationType,N<:Nucleotide}(::Type{T}, seqs::Matrix{N})
@@ -263,6 +264,40 @@ function count_mutations{T<:MutationType,A<:NucleotideAlphabet}(::Type{T}, seque
     return count_mutations(T, seqs)
 end
 
+
+# Flag, for every pair of columns of `seqs` and every row, whether the two
+# nucleotides form a transition, a transversion, or an ambiguous comparison.
+#
+# Returns three `Bool` matrices (transitions, transversions, ambiguous), each with
+# `size(seqs, 1)` rows and one column per pair of columns of `seqs`. Pairs are
+# numbered in the order (1,2), (1,3), ..., (2,3), ...
+function flagmutations{N<:Nucleotide}(::Type{TransitionMutation}, ::Type{TransversionMutation}, seqs::Matrix{N})
+    seqsize, nseqs = size(seqs)
+    npairs = binomial(nseqs, 2)
+    istransition = Matrix{Bool}(seqsize, npairs)
+    istransversion = Matrix{Bool}(seqsize, npairs)
+    isambiguous = Matrix{Bool}(seqsize, npairs)
+    # Linear-index offset of the result column for the current pair.
+    dest = 0
+    @inbounds for a in 1:nseqs
+        aoffset = (a - 1) * seqsize
+        for b in (a + 1):nseqs
+            boffset = (b - 1) * seqsize
+            for site in 1:seqsize
+                x = seqs[aoffset + site]
+                y = seqs[boffset + site]
+                ambiguous = is_ambiguous_strict(x, y)
+                differs = x != y
+                transition = is_mutation(TransitionMutation, x, y)
+                # A site only counts when it is unambiguous and the bases differ.
+                counted = !ambiguous & differs
+                isambiguous[dest + site] = ambiguous
+                istransition[dest + site] = counted & transition
+                istransversion[dest + site] = counted & !transition
+            end
+            dest += seqsize
+        end
+    end
+    return istransition, istransversion, isambiguous
+end
+
+# Flag transitions and transversions for sequences supplied as a vector: the
+# sequences are converted with `seqmatrix(seqs, :seq)` and passed on to the
+# matrix method.
+function flagmutations{A<:NucleotideAlphabet}(::Type{TransitionMutation}, ::Type{TransversionMutation}, seqs::Vector{BioSequence{A}})
+    seqmat = seqmatrix(seqs, :seq)
+    return flagmutations(TransitionMutation, TransversionMutation, seqmat)
+end
+
+
+
 """
     count_mutations{N<:Nucleotide}(::Type{TransitionMutation}, ::Type{TransversionMutation}, sequences::Matrix{N})
 

diff --git a/test/var/runtests.jl b/test/var/runtests.jl
@@ -39,7 +39,10 @@ end
     dnas1 = [dna"ATTG-ACCTGGNTTTCCGAA", dna"A-ACAGAGTATACRGTCGTC"]
     m1 = seqmatrix(dnas1, :seq)
 
-    dnas2 = [dna"attgaacctggntttccgaa", dna"atacagagtatacrgtcgtc"]
+    dnas2 = [dna"attgaacctggntttccgaa",
+             dna"atacagagtatacrgtcgtc"]
+    dnas3 = [dna"attgaacctgtntttccgaa",
+             dna"atagaacgtatatrgccgtc"]
     m2 = seqmatrix(dnas2, :seq)
 
     @test distance(Count{AnyMutation}, dnas1) == ([12], [16])
@@ -51,6 +54,14 @@ end
     @test distance(Count{TransversionMutation}, m1) == ([8], [16])
     @test distance(Count{Kimura80}, m1) == ([4], [8], [16])
 
+    @test distance(Count{AnyMutation}, dnas2, 5, 5)[1][:] == [2, 4, 3, 3]
+    @test distance(Count{AnyMutation}, dnas2, 5, 5)[2][:] == [5, 5, 3, 5]
+    @test distance(Count{TransitionMutation}, dnas2, 5, 5)[1][:] == [0, 2, 1, 1]
+    @test distance(Count{TransitionMutation}, dnas2, 5, 5)[2][:] == [5, 5, 3, 5]
+    @test distance(Count{TransversionMutation}, dnas2, 5, 5)[1][:] == [2, 2, 2, 2]
+    @test distance(Count{TransversionMutation}, dnas2, 5, 5)[2][:] == [5, 5, 3, 5]
+    @test distance(Count{Kimura80}, dnas1) == ([4], [8], [16])
+
     @test distance(Count{AnyMutation}, dnas2) == ([12], [18])
     @test distance(Count{TransitionMutation}, dnas2) == ([4], [18])
     @test distance(Count{TransversionMutation}, dnas2) == ([8], [18])
@@ -60,6 +71,25 @@ end
     @test distance(Count{TransversionMutation}, m2) == ([8], [18])
     @test distance(Count{Kimura80}, m2) == ([4], [8], [18])
 
+    d = distance(Proportion{AnyMutation}, dnas2, 5, 5)
+    a = [0.4, 0.8, 1.0, 0.6]
+    for i in 1:length(d[1])
+        @test_approx_eq_eps d[1][i] a[i] 1e-4
+    end
+    @test d[2][:] == [5, 5, 3, 5]
+    d = distance(Proportion{TransitionMutation}, dnas2, 5, 5)
+    a = [0.0, 0.4, 0.333333, 0.2]
+    for i in 1:length(d[1])
+        @test_approx_eq_eps d[1][i] a[i] 1e-4
+    end
+    @test d[2][:] == [5, 5, 3, 5]
+    d = distance(Proportion{TransversionMutation}, dnas2, 5, 5)
+    a = [0.4, 0.4, 0.666667, 0.4]
+    for i in 1:length(d[1])
+        @test_approx_eq_eps d[1][i] a[i] 1e-4
+    end
+    @test d[2][:] == [5, 5, 3, 5]
+
     @test distance(Proportion{AnyMutation}, dnas1) == ([(12 / 16)], [16])
     @test distance(Proportion{TransitionMutation}, dnas1) == ([(4 / 16)], [16])
     @test distance(Proportion{TransversionMutation}, dnas1) == ([(8 / 16)], [16])
@@ -81,6 +111,14 @@ end
     @test round(distance(JukesCantor69, dnas2)[2][1], 3) == 1
     @test round(distance(JukesCantor69, m2)[1][1], 3) == 1.648
     @test round(distance(JukesCantor69, m2)[2][1], 3) == 1
+    @test_throws DomainError distance(JukesCantor69, dnas2, 5, 5)
+    d = distance(JukesCantor69, dnas3, 5, 5)
+    a = [0.232616, 0.571605, 0.44084, 0.571605]
+    v = [0.0595041, 0.220408, 0.24, 0.220408]
+    for i in 1:length(d[1])
+        @test_approx_eq_eps d[1][i] a[i] 1e-5
+        @test_approx_eq_eps d[2][i] v[i] 1e-5
+    end
 
     @test round(distance(Kimura80, dnas2)[1][1], 3) == 1.648
     @test round(distance(Kimura80, dnas2)[2][1], 3) == 1