From 36743a9f8962ebaf321af819638393d765ced439 Mon Sep 17 00:00:00 2001 From: Elliot Saba Date: Sat, 3 Feb 2024 18:19:08 -0800 Subject: [PATCH] Improve caching and dispatch of `LinearizingSavingCallback` This adds a new type, `LinearizingSavingCallbackCache` and some sub-types to allow for efficient re-use of memory as the callback executes over the course of a solve, as well as re-use of that memory in future solves when operating on a large ensemble simulation. The top-level `LinearizingSavingCallbackCache` creates thread-safe cache pool objects that are then used to acquire thread-unsafe cache pool objects to be used within a single solve. Those thread-unsafe cache pool objects can then be released and acquired anew by the next solve. The thread-unsafe pool objects allow for acquisition of pieces of memory such as temporary `u` vectors (the recusrive nature of the `LinearizingSavingCallback` means that we must allocate unknown numbers of temporary `u` vectors) and chunks of `u` blocks that are then compacted into a single large matrix in the finalize method of the callback. All these pieces of memory are stored within that set of thread-unsafe caches, and these are released back to the top-level thread-safe cache pool, for the next solve to acquire and make use of those pieces of memory in the cache pool. Using these techniques, the solve time of a large ensemble simulation with low per-simulation computation has reduced dramatically. The simulation solves a butterworth 3rd-order filter circuit over a certain timespan, swept across different simulus frequencies and circuit parameters. The parameter sweep results in a 13500-element ensemble simulation, that when run with 8 threads on a M1 Pro takes: ``` 48.364827 seconds (625.86 M allocations: 19.472 GiB, 41.81% gc time, 0.17% compilation time) ``` Now, after these caching optimizations, we solve the same ensemble in: ``` 13.208123 seconds (166.76 M allocations: 7.621 GiB, 22.21% gc time, 0.61% compilation time) ``` As a side note, the size requirements of the raw linearized solution data itself is `1.04 GB`. In general, we expect to allocate somewhere between 2-3x the final output data to account for temporaries and inefficient sharing, so while there is still some more work to be done, this gets us significantly closer to minimal overhead. This also adds a package extension on `Sundials`, as `IDA` requires that state vectors are `NVector` types, rather than `Vector{S}` types in order to not allocate. --- Project.toml | 3 + ext/DiffEqCallbacksSundialsExt.jl | 13 ++ src/independentlylinearizedutils.jl | 253 +++++++++++++++++++++++---- src/saving.jl | 238 +++++++++++++++++-------- test/independentlylinearizedtests.jl | 56 +++++- test/saving_tests.jl | 37 +++- 6 files changed, 485 insertions(+), 115 deletions(-) create mode 100644 ext/DiffEqCallbacksSundialsExt.jl diff --git a/Project.toml b/Project.toml index 05576122..3f6174a9 100644 --- a/Project.toml +++ b/Project.toml @@ -21,6 +21,9 @@ StaticArraysCore = "1e83bf80-4336-4d27-bf5d-d5a4f845583c" OrdinaryDiffEq = "1dea7af3-3e70-54e6-95c3-0bf5283fa5ed" Sundials = "c3572dad-4567-51f8-b174-8c6c989267f4" +[extensions] +DiffEqCallbacksSundialsExt = "Sundials" + [compat] Aqua = "0.8" DataInterpolations = "4" diff --git a/ext/DiffEqCallbacksSundialsExt.jl b/ext/DiffEqCallbacksSundialsExt.jl new file mode 100644 index 00000000..1206b1ba --- /dev/null +++ b/ext/DiffEqCallbacksSundialsExt.jl @@ -0,0 +1,13 @@ +module DiffEqCallbacksSundialsExt + +using Sundials: NVector, IDA +import DiffEqCallbacks: solver_state_alloc, solver_state_type + + +# Allocator; `U` is typically something like `Vector{Float64}` +solver_state_alloc(solver::IDA, U::DataType, num_us::Int) = () -> NVector(U(undef, num_us)) + +# Type of `solver_state_alloc`, which is just `NVector` +solver_state_type(solver::IDA, U::DataType) = NVector + +end # module diff --git a/src/independentlylinearizedutils.jl b/src/independentlylinearizedutils.jl index 2bd0b7f5..2c390248 100644 --- a/src/independentlylinearizedutils.jl +++ b/src/independentlylinearizedutils.jl @@ -2,6 +2,113 @@ using SciMLBase export IndependentlyLinearizedSolution + +""" + CachePool(T, alloc; thread_safe = true) + +Simple memory-reusing cache that allows us to grow a cache and keep +re-using those pieces of memory (in our case, typically `u` vectors) +until the solve is finished. By default, this datastructure is made +to be thread-safe by locking on every acquire and release, but it +can be made thread-unsafe (and correspondingly faster) by passing +`thread_safe = false` to the constructor. + +While manual usage with `acquire!()` and `release!()` is possible, +most users will want to use `@with_cache`, which provides lexically- +scoped `acquire!()` and `release!()` usage automatically. Example: + +```julia +us = CachePool(Vector{S}, () -> Vector{S}(undef, num_us); thread_safe=false) +@with_cache us u_prev begin + @with_cache us u_next begin + # perform tasks with these two `u` vectors + end +end +``` + +!!! warning "Escaping values" + You must not use an acquired value after you have released it; + the memory may be immediately re-used by some other consumer of + your cache pool. Do not allow the acquired value to escape + outside of the `@with_cache` block, or past a `release!()`. +""" +mutable struct CachePool{T, THREAD_SAFE} + const pool::Vector{T} + const alloc::Function + lock::ReentrantLock + num_allocated::Int + num_acquired::Int + + function CachePool(T, alloc::F; thread_safe::Bool = true) where {F} + return new{T,Val{thread_safe}}(T[], alloc, ReentrantLock(), 0, 0) + end +end +const ThreadSafeCachePool{T} = CachePool{T,Val{true}} +const ThreadUnsafeCachePool{T} = CachePool{T,Val{false}} + +""" + acquire!(cache::CachePool) + +Returns a cached element of the cache pool, calling `cache.alloc()` if none +are available. +""" +Base.@inline function acquire!(cache::CachePool{T}, _dummy = nothing) where {T} + cache.num_acquired += 1 + if isempty(cache.pool) + cache.num_allocated += 1 + return cache.alloc()::T + end + return pop!(cache.pool) +end + +""" + release!(cache::CachePool, val) + +Returns the value `val` to the cache pool. +""" +Base.@inline function release!(cache::CachePool, val, _dummy = nothing) + push!(cache.pool, val) + cache.num_acquired -= 1 +end + +function is_fully_released(cache::CachePool, _dummy = nothing) + return cache.num_acquired == 0 +end + +# Thread-safe versions just sub out to the other methods, using `_dummy` to force correct dispatch +acquire!(cache::ThreadSafeCachePool) = @lock cache.lock acquire!(cache, nothing) +release!(cache::ThreadSafeCachePool, val) = @lock cache.lock release!(cache, val, nothing) +is_fully_released(cache::ThreadSafeCachePool) = @lock cache.lock is_fully_released(cache, nothing) + +macro with_cache(cache, name, body) + return quote + $(esc(name)) = acquire!($(esc(cache))) + try + $(esc(body)) + finally + release!($(esc(cache)), $(esc(name))) + end + end +end + + +struct IndependentlyLinearizedSolutionChunksCache{T,S} + t_chunks::ThreadUnsafeCachePool{Vector{T}} + u_chunks::ThreadUnsafeCachePool{Matrix{S}} + time_masks::ThreadUnsafeCachePool{BitMatrix} + + function IndependentlyLinearizedSolutionChunksCache{T,S}(num_us::Int, num_derivatives::Int, chunk_size::Int) where {T,S} + t_chunks_alloc = () -> Vector{T}(undef, chunk_size) + u_chunks_alloc = () -> Matrix{S}(undef, num_derivatives+1, chunk_size) + time_masks_alloc = () -> BitMatrix(undef, num_us, chunk_size) + return new( + CachePool(Vector{T}, t_chunks_alloc; thread_safe=false), + CachePool(Matrix{S}, u_chunks_alloc; thread_safe=false), + CachePool(BitMatrix, time_masks_alloc; thread_safe=false), + ) + end +end + """ IndependentlyLinearizedSolutionChunks @@ -9,23 +116,30 @@ When constructing an `IndependentlyLinearizedSolution` via the `IndependentlyLin we use this indermediate structure to reduce allocations and collect the unknown number of timesteps that the solve will generate. """ -mutable struct IndependentlyLinearizedSolutionChunks{T, S} +mutable struct IndependentlyLinearizedSolutionChunks{T, S, N} t_chunks::Vector{Vector{T}} u_chunks::Vector{Vector{Matrix{S}}} time_masks::Vector{BitMatrix} + # Temporary array that gets used by `get_chunks` + last_chunks::Vector{Matrix{S}} + # Index of next write into the last chunk u_offsets::Vector{Int} t_offset::Int + cache::IndependentlyLinearizedSolutionChunksCache + function IndependentlyLinearizedSolutionChunks{T, S}(num_us::Int, num_derivatives::Int = 0, - chunk_size::Int = 100) where {T, S} - return new([Vector{T}(undef, chunk_size)], - [[Matrix{S}(undef, num_derivatives+1, chunk_size)] for _ in 1:num_us], - [BitMatrix(undef, num_us, chunk_size)], - [1 for _ in 1:num_us], - 1, - ) + chunk_size::Int = 512, + cache::IndependentlyLinearizedSolutionChunksCache = IndependentlyLinearizedSolutionChunksCache{T,S}(num_us, num_derivatives, chunk_size)) where {T, S} + t_chunks = [acquire!(cache.t_chunks)] + u_chunks = [[acquire!(cache.u_chunks)] for _ in 1:num_us] + time_masks = [acquire!(cache.time_masks)] + last_chunks = [u_chunks[u_idx][1] for u_idx in 1:num_us] + u_offsets = [1 for _ in 1:num_us] + t_offset = 1 + return new{T,S,num_derivatives}(t_chunks, u_chunks, time_masks, last_chunks, u_offsets, t_offset, cache) end end @@ -44,14 +158,8 @@ function num_us(ilsc::IndependentlyLinearizedSolutionChunks) end return length(ilsc.u_chunks) end +num_derivatives(ilsc::IndependentlyLinearizedSolutionChunks{T,S,N}) where {T,S,N} = N -function num_derivatives(ilsc::IndependentlyLinearizedSolutionChunks) - # If we've been finalized, just return `0` (which means only the primal) - if isempty(ilsc.t_chunks) - return 0 - end - return size(first(first(ilsc.u_chunks)), 1) - 1 -end function Base.isempty(ilsc::IndependentlyLinearizedSolutionChunks) return length(ilsc.t_chunks) == 1 && ilsc.t_offset == 1 @@ -61,24 +169,25 @@ function get_chunks(ilsc::IndependentlyLinearizedSolutionChunks{T, S}) where {T, # Check if we need to allocate new `t` chunk chunksize = chunk_size(ilsc) if ilsc.t_offset > chunksize - push!(ilsc.t_chunks, Vector{T}(undef, chunksize)) - push!(ilsc.time_masks, BitMatrix(undef, length(ilsc.u_offsets), chunksize)) + push!(ilsc.t_chunks, acquire!(ilsc.cache.t_chunks)) + push!(ilsc.time_masks, acquire!(ilsc.cache.time_masks)) ilsc.t_offset = 1 end # Check if we need to allocate any new `u` chunks (but only for those with `u_mask`) for (u_idx, u_chunks) in enumerate(ilsc.u_chunks) if ilsc.u_offsets[u_idx] > chunksize - push!(u_chunks, Matrix{S}(undef, num_derivatives(ilsc)+1, chunksize)) + push!(u_chunks, acquire!(ilsc.cache.u_chunks)) ilsc.u_offsets[u_idx] = 1 end + ilsc.last_chunks[u_idx] = u_chunks[end] end # return the last chunk for each return ( ilsc.t_chunks[end], ilsc.time_masks[end], - [u_chunks[end] for u_chunks in ilsc.u_chunks], + ilsc.last_chunks, ) end @@ -135,16 +244,18 @@ function store!(ilsc::IndependentlyLinearizedSolutionChunks{T, S}, ts, time_mask, us = get_chunks(ilsc) # Store into the chunks, gated by `u_mask` - for u_idx in 1:size(u, 2) + @inbounds for u_idx in 1:size(u, 2) if u_mask[u_idx] for deriv_idx in 1:size(u, 1) us[u_idx][deriv_idx, ilsc.u_offsets[u_idx]] = u[deriv_idx, u_idx] end ilsc.u_offsets[u_idx] += 1 end + + # Update our `time_mask` while we're at it + time_mask[u_idx, ilsc.t_offset] = u_mask[u_idx] end ts[ilsc.t_offset] = t - time_mask[:, ilsc.t_offset] .= u_mask ilsc.t_offset += 1 end @@ -161,7 +272,7 @@ efficient `iterate()` method that can be used to reconstruct coherent views of the state variables at all timepoints, as well as an efficient `sample!()` method that can sample at arbitrary timesteps. """ -mutable struct IndependentlyLinearizedSolution{T, S} +mutable struct IndependentlyLinearizedSolution{T, S, N} # All timepoints, shared by all `us` ts::Vector{T} @@ -173,33 +284,42 @@ mutable struct IndependentlyLinearizedSolution{T, S} time_mask::BitMatrix # Temporary object used during construction, will be set to `nothing` at the end. - ilsc::Union{Nothing,IndependentlyLinearizedSolutionChunks{T,S}} + ilsc::Union{Nothing,IndependentlyLinearizedSolutionChunks{T,S,N}} + ilsc_cache_pool::Union{Nothing,ThreadSafeCachePool{IndependentlyLinearizedSolutionChunksCache{T,S}}} end # Helper function to create an ILS wrapped around an in-progress ILSC -function IndependentlyLinearizedSolution(ilsc::IndependentlyLinearizedSolutionChunks{T,S}) where {T,S} - ils = IndependentlyLinearizedSolution( +function IndependentlyLinearizedSolution(ilsc::IndependentlyLinearizedSolutionChunks{T,S,N}, cache_pool = nothing) where {T,S,N} + return IndependentlyLinearizedSolution{T,S,N}( T[], Matrix{S}[], BitMatrix(undef, 0,0), ilsc, + cache_pool, ) - return ils end # Automatically create an ILS wrapped around an ILSC from a `prob` -function IndependentlyLinearizedSolution(prob::SciMLBase.AbstractDEProblem, num_derivatives = 0) +function IndependentlyLinearizedSolution(prob::SciMLBase.AbstractDEProblem, num_derivatives = 0; + cache_pool = nothing, + chunk_size::Int = 512) T = eltype(prob.tspan) + S = eltype(prob.u0) U = isnothing(prob.u0) ? Float64 : eltype(prob.u0) - N = isnothing(prob.u0) ? 0 : length(prob.u0) - chunks = IndependentlyLinearizedSolutionChunks{T,U}(N, num_derivatives) - return IndependentlyLinearizedSolution(chunks) + num_us = isnothing(prob.u0) ? 0 : length(prob.u0) + if cache_pool === nothing + cache = IndependentlyLinearizedSolutionChunksCache{T,S}(num_us, num_derivatives, chunk_size) + else + cache = acquire!(cache_pool) + end + chunks = IndependentlyLinearizedSolutionChunks{T,U}(num_us, num_derivatives, chunk_size, cache) + return IndependentlyLinearizedSolution(chunks, cache_pool) end -num_derivatives(ils::IndependentlyLinearizedSolution) = !isempty(ils.us) ? size(first(ils.us), 1) : 0 +num_derivatives(::IndependentlyLinearizedSolution{T,S,N}) where {T,S,N} = N num_us(ils::IndependentlyLinearizedSolution) = length(ils.us) Base.size(ils::IndependentlyLinearizedSolution) = size(ils.time_mask) Base.length(ils::IndependentlyLinearizedSolution) = length(ils.ts) -function finish!(ils::IndependentlyLinearizedSolution) +function finish!(ils::IndependentlyLinearizedSolution{T,S}) where {T,S} function trim_chunk(chunks::Vector, offset) chunks = [chunk for chunk in chunks] if eltype(chunks) <: AbstractVector @@ -216,10 +336,52 @@ function finish!(ils::IndependentlyLinearizedSolution) end ilsc = ils.ilsc::IndependentlyLinearizedSolutionChunks - ts = vcat(trim_chunk(ilsc.t_chunks, ilsc.t_offset)...) - time_mask = hcat(trim_chunk(ilsc.time_masks, ilsc.t_offset)...) - us = [hcat(trim_chunk(ilsc.u_chunks[u_idx], ilsc.u_offsets[u_idx])...) - for u_idx in 1:length(ilsc.u_chunks)] + + chunk_len(chunk) = size(chunk, ndims(chunk)) + function chunks_len(chunks::Vector, offset) + len = 0 + for chunk_idx in 1:length(chunks)-1 + len += chunk_len(chunks[chunk_idx]) + end + return len + offset - 1 + end + + function copy_chunk!(out::Vector, in::Vector, out_offset::Int, len=chunk_len(in)) + for idx in 1:len + out[idx+out_offset] = in[idx] + end + end + function copy_chunk!(out::AbstractMatrix, in::AbstractMatrix, out_offset::Int, len=chunk_len(in)) + for zdx in 1:size(in, 1) + for idx in 1:len + out[zdx, idx+out_offset] = in[zdx, idx] + end + end + end + + function collapse_chunks!(out, chunks, offset::Int) + write_offset = 0 + for chunk_idx in 1:(length(chunks)-1) + chunk = chunks[chunk_idx] + copy_chunk!(out, chunk, write_offset) + write_offset += chunk_len(chunk) + end + copy_chunk!(out, chunks[end], write_offset, offset-1) + end + + # Collapse t_chunks + ts = Vector{T}(undef, chunks_len(ilsc.t_chunks, ilsc.t_offset)) + collapse_chunks!(ts, ilsc.t_chunks, ilsc.t_offset) + + # Collapse u_chunks + us = Vector{Matrix{S}}(undef, length(ilsc.u_chunks)) + for u_idx in 1:length(ilsc.u_chunks) + us[u_idx] = Matrix{S}(undef, size(ilsc.u_chunks[u_idx][1],1), chunks_len(ilsc.u_chunks[u_idx], ilsc.u_offsets[u_idx])) + collapse_chunks!(us[u_idx], ilsc.u_chunks[u_idx], ilsc.u_offsets[u_idx]) + end + + time_mask = BitMatrix(undef, size(ilsc.time_masks[1], 1), chunks_len(ilsc.time_masks, ilsc.t_offset)) + collapse_chunks!(time_mask, ilsc.time_masks, ilsc.t_offset) # Sanity-check lengths if length(ts) != size(time_mask, 2) @@ -238,7 +400,24 @@ function finish!(ils::IndependentlyLinearizedSolution) throw(ArgumentError("Time mask must indicate same length as `us` ($(time_mask_lens) != $(us_lens))")) end - # Update our struct, release the `ilsc` + # Update our struct, release the `ilsc` and its caches + for t_chunk in ilsc.t_chunks + release!(ilsc.cache.t_chunks, t_chunk) + end + @assert is_fully_released(ilsc.cache.t_chunks) + for u_idx in 1:length(ilsc.u_chunks) + for u_chunk in ilsc.u_chunks[u_idx] + release!(ilsc.cache.u_chunks, u_chunk) + end + end + @assert is_fully_released(ilsc.cache.u_chunks) + for time_mask in ilsc.time_masks + release!(ilsc.cache.time_masks, time_mask) + end + @assert is_fully_released(ilsc.cache.time_masks) + if ils.ilsc_cache_pool !== nothing + release!(ils.ilsc_cache_pool, ilsc.cache) + end ils.ilsc = nothing ils.ts = ts ils.us = us diff --git a/src/saving.jl b/src/saving.jl index e1058358..831b5a9e 100644 --- a/src/saving.jl +++ b/src/saving.jl @@ -1,3 +1,5 @@ +import SciMLBase: AbstractSciMLAlgorithm + """ SavedValues{tType<:Real, savevalType} @@ -182,14 +184,19 @@ as_array(t::AbstractArray) = t function is_linear_enough!(caches, is_linear, t₀, t₁, u₀, u₁, integ, abstol, reltol) (; y_linear, y_interp, slopes) = caches tspread = t₁ - t₀ - slopes .= (u₁ .- u₀) ./ tspread + num_us = length(u₀) + @inbounds for u_idx in 1:num_us + slopes[u_idx] = (u₁[u_idx] - u₀[u_idx])/tspread + end t_quartile(t_idx) = t₀ + tspread * t_idx/4.0 # Calculate interpolated and linear samplings in our three quartiles - for t_idx in 1:3 + @inbounds for t_idx in 1:3 t = t_quartile(t_idx) # Linear interpolation - y_linear[:, t_idx] .= u₀ .+ (t - t₀) .* slopes + @inbounds for u_idx in 1:num_us + y_linear[u_idx, t_idx] = u₀[u_idx] .+ (t - t₀) .* slopes[u_idx] + end # Solver interpolation # We would like to use `integ(@view(y_interp[:, t_idx]))` here, @@ -197,14 +204,16 @@ function is_linear_enough!(caches, is_linear, t₀, t₁, u₀, u₁, integ, abs # memory that the view would have given us, so we instead use a # temporary array then copy it into `y_interp`, which loses very # little time and still prevents allocations from `integ(t)`. - with_cache(caches.us) do u_interp - integ(u_interp, t) - y_interp[:, t_idx] .= u_interp + @with_cache caches.us u_interp begin + integ(u_interp, t, Val{0}; idxs=nothing) + for u_idx in 1:num_us + y_interp[u_idx, t_idx] = u_interp[u_idx] + end end end # Return `is_linear` for each state - for u_idx in 1:length(u₀) + @inbounds for u_idx in 1:num_us is_linear[u_idx] = true for t_idx in 1:3 is_linear[u_idx] &= isapprox(y_linear[u_idx, t_idx], @@ -216,8 +225,8 @@ function is_linear_enough!(caches, is_linear, t₀, t₁, u₀, u₁, integ, abs # Find worst time index so that we split our period there t_max_idx = 1 e_max = 0.0 - for t_idx in 1:3 - for u_idx in 1:length(u₀) + @inbounds for t_idx in 1:3 + for u_idx in 1:num_us e = abs(y_linear[u_idx, t_idx] - y_interp[u_idx, t_idx]) if e > e_max t_max_idx = t_idx @@ -235,7 +244,7 @@ function linearize_period(t₀, t₁, u₀, u₁, integ, ilsc, caches, u_mask, d throw(ArgumentError("Linearization failed, fell below linearization subdivision threshold")) end - with_cache(caches.u_masks) do is_linear + @with_cache caches.u_masks is_linear begin tᵦ = is_linear_enough!(caches, is_linear, t₀, t₁, @@ -252,8 +261,8 @@ function linearize_period(t₀, t₁, u₀, u₁, integ, ilsc, caches, u_mask, d if any(is_nonlinear) # If it's not linear, split this period into two and recurse, altering our `u_mask`: - with_cache(caches.us) do uᵦ - integ(uᵦ, tᵦ) + @with_cache caches.us uᵦ begin + integ(uᵦ, tᵦ, Val{0}; idxs=nothing) linearize_period( t₀, tᵦ, u₀, uᵦ, @@ -283,50 +292,144 @@ function linearize_period(t₀, t₁, u₀, u₁, integ, ilsc, caches, u_mask, d end else # If everyone is linear, store this period, according to our `u_mask`! - store_u_block!(ilsc, integ, caches, t₁, u₁, u_mask) + store_u_block!(ilsc, Val(num_derivatives(ilsc)), integ, caches, t₁, u₁, u_mask) end end end -function store_u_block!(ilsc, integ, caches, t₁, u₁, u_mask) - with_cache(caches.us) do u - caches.u_block[1, :] .= u₁ - for deriv_idx in 1:num_derivatives(ilsc) - integ(u, t₁, Val{deriv_idx}) - caches.u_block[deriv_idx+1, :] .= u +function store_u_block!(ilsc, ::Val{num_derivatives}, integ, caches, t₁, u₁, u_mask) where {num_derivatives} + @with_cache caches.us u begin + for u_idx in 1:length(u) + caches.u_block[1, u_idx] = u₁[u_idx] + end + for deriv_idx in 1:num_derivatives + integ(u, t₁, Val{deriv_idx}; idxs=nothing) + for u_idx in 1:length(u) + caches.u_block[deriv_idx+1, u_idx] = u[u_idx] + end end store!(ilsc, t₁, caches.u_block, u_mask) end end -""" - CachePool +struct LinearizingSavingCallbackCacheType{S,U} + y_linear::Matrix{S} + y_interp::Matrix{S} + slopes::Vector{S} + # U is not necessarily a `Vector{S}` because it can be an `NVector` thanks to Sundials. + us::ThreadUnsafeCachePool{U} + u_block::Matrix{S} + u_masks::ThreadUnsafeCachePool{BitVector} + + function LinearizingSavingCallbackCacheType{S,U}(num_us::Int, num_derivatives::Int, U_alloc::Function) where {S,U} + y_linear = Matrix{S}(undef, (num_us, 3)) + y_interp = Matrix{S}(undef, (num_us, 3)) + slopes = Vector{S}(undef, num_us) + u_block = Matrix{S}(undef, (num_derivatives+1, num_us)) + F_umasks = () -> BitVector(undef, num_us) + u_masks = CachePool(BitVector, F_umasks; thread_safe=false) + + # Workaround for Sundials allocations; conversion from `Vector{S}` + # to `NVector()` allocates, so we require the caller to pass in a + # `U` and a `U_alloc` so that the default of `U = Vector{S}` and + # U_alloc = () -> Vector{S}(undef, num_us)` can be overridden. + # This is automatically done by `DiffEqCallbacksSundialsExt`, via + # the `solver_state_type()` and `solver_state_alloc()` hooks below. + us = CachePool(U, U_alloc; thread_safe=false) + return new{S,U}( + y_linear, + y_interp, + slopes, + us, + u_block, + u_masks, + ) + end +end + +# This exists purely so that different solver types can wrap/alter the +# type of the state vectors cached by the `LinearizingSavingCallbackCache`. +# U is typically something like `Vector{Float64}`. +solver_state_type(solver::AbstractSciMLAlgorithm, U::DataType) = U +solver_state_alloc(solver::AbstractSciMLAlgorithm, U::DataType, num_us::Int) = () -> U(undef, num_us) -Simple memory-reusing cache that allows us to grow a cache and keep -re-using those pieces of memory (in our case, typically `u` vectors) -until the solve is finished. Note that this datastructure is _not_ -thread-safe! """ -mutable struct CachePool{T,F} - pool::Vector{T} - alloc::F - write_idx::Int + LinearizingSavingCallbackCache(prob, solver; num_derivatives=0) - function CachePool(T, alloc::F) where {F} - return new{T,F}(T[], alloc, 0) - end +Top-level cache for the `LinearizingSavingCallback`. Typically used +to vastly reduce the number of allocations when performing an ensemble +solve, where allocations from one solution can be used by the next. + +Users must pass in `solver` to allow for solver-specific allocation +strategies. As an example, `IDA` requires allocation of `NVector` +objects rather than `Vector{S}` objects, and to automatically +determine this, the `LinearizingSavingCallbackCache` takes in the +solver as well. See the `DiffEqCallbacksSundialsExt` extension +for the details on how this type adjustment is made. + +This top-level cache creates two thread-safe cache pools that are then +used by each solve to allocate thread-unsafe cache pools. Those per- +solution cache pools are then re-used across solutions as the ensemble +finishes one trajectory and moves to another. + +Example usage: + +```julia +# Linearize the primal, and the first derivative +num_derivatives = 1 + +# Create a cache, to be used across all ensemble simulations +cache = LinearizingSavingCallbackCache(prob, solver; num_derivatives) + +# Store the results in this array of independently linearized solutions +ilss = Vector{IndependentlyLinearizedSolution}(undef, num_trajectories) + +# Create `prob_func` piece to remake `prob` to have the correct callback, +# hooking up the necessary caching pieces. +function linearizer_adding_remake(prob,i,_) + ilss[i] = IndependentlyLinearizedSolution(prob, num_derivatives; cache_pool=cache.ils_cache) + lsc = LinearizingSavingCallback(ilss[i]; cache_pool=cache.lsc_cache) + return remake(prob; callback=lsc) end -function with_cache(f::Function, cache::CachePool{T}) where {T} - cache.write_idx += 1 - if length(cache.pool) < cache.write_idx - push!(cache.pool, cache.alloc()) - end - try - f(cache.pool[cache.write_idx]) - finally - cache.write_idx -= 1 - end +ensembleprob = EnsembleProblem(prob; prob_func=linearizer_adding_remake) +solve(ensembleprob, solver, EnsembleThreads(); ...) +``` +""" +function LinearizingSavingCallbackCache(prob, solver; num_derivatives=0, chunk_size=512) + T = eltype(prob.tspan) + S = eltype(prob.u0) + U = solver_state_type(solver, typeof(prob.u0)) + num_us = length(prob.u0) + U_alloc = solver_state_alloc(solver, typeof(prob.u0), num_us) + return LinearizingSavingCallbackCache(T, S, U, U_alloc, num_us; num_derivatives, chunk_size) +end + +function LinearizingSavingCallbackCache(T, S, U, U_alloc, num_us; num_derivatives=0, chunk_size=512) + return (; + # This cache is used by the LinearizingSavingCallback, it creates `LinearizingSavingCallbackCacheType` + # objects, which is quite a mouthful, but contains all the temporary values needed for a single + # solve's linearization. Notably, it contains within itself cachepools for `u` vectors and whatnot, + # and it is _not_ thread-safe, because we assume that a single solve is single-threaded, so we use a + # single thread-safe cache pool (the `lsc_cache`) to spawn off a collection of these smaller, thread- + # unsafe (but faster to acquire/release) cache pools. + lsc_cache = CachePool( + LinearizingSavingCallbackCacheType{S,U}, + () -> LinearizingSavingCallbackCacheType{S,U}(num_us, num_derivatives, U_alloc); + thread_safe = true, + ), + # This cache is used by the `IndependentlyLinearizedSolutionChunks` to do things like allocate `u`, + # `t` and `time_mask` chunks. + ils_cache = CachePool( + IndependentlyLinearizedSolutionChunksCache{T,S}, + () -> IndependentlyLinearizedSolutionChunksCache{T,S}( + num_us, + num_derivatives, + chunk_size, + ), + thread_safe = true, + ), + ) end """ @@ -366,11 +469,16 @@ function LinearizingSavingCallback(ils::IndependentlyLinearizedSolution{T,S}; interpolate_mask = BitVector(true for _ in 1:length(ils.ilsc.u_chunks)), abstol::Union{S,Nothing} = nothing, reltol::Union{S,Nothing} = nothing, - ) where {T, S} + cache_pool::CachePool{C} = LinearizingSavingCallbackCache(T, S, Vector{S}, length(ils.ilsc.u_chunks); num_derivatives=num_derivatives(ils.ilsc)).lsc_cache, + ) where {T, S, C} ilsc = ils.ilsc full_mask = BitVector(true for _ in 1:length(ilsc.u_chunks)) - # caches will be allocated in `initialize()` - caches = nothing + num_derivatives_val = Val(num_derivatives(ilsc)) + + # `caches` is initialized in `initialize`, but we need to constrain + # its type here so that the closures in `DiscreteCallback` are stable + local caches::C + #caches = acquire!(cache_pool) return DiscreteCallback( # We will process every timestep (u, t, integ) -> begin @@ -380,17 +488,17 @@ function LinearizingSavingCallback(ils::IndependentlyLinearizedSolution{T,S}; integ -> begin t₀ = integ.tprev t₁ = integ.t - with_cache(caches.us) do u₀ - with_cache(caches.us) do u₁ + @with_cache caches.us u₀ begin + @with_cache caches.us u₁ begin # Get `u₀` and `u₁` from the integrator - integ(u₀, t₀) - integ(u₁, t₁) + integ(u₀, t₀, Val{0}; idxs=nothing) + integ(u₁, t₁, Val{0}; idxs=nothing) # Store first timepoints. Usually we'd do this in `initialize` # but `integ(u, t, deriv)` doesn't work that early, and so we # must wait until we've taken at least a single step. if isempty(ilsc) - store_u_block!(ilsc, integ, caches, t₀, u₀, full_mask) + store_u_block!(ilsc, num_derivatives_val, integ, caches, t₀, u₀, full_mask) end dtmin = eps(t₁ - t₀)*1000.0 @@ -406,38 +514,20 @@ function LinearizingSavingCallback(ils::IndependentlyLinearizedSolution{T,S}; end u_modified!(integ, false) end, - # In our `initialize`, we create some caches so we allocate less + # In our `initialize`, we create some caches so we allocate less. initialize = (c, u, t, integ) -> begin - u = as_array(u) - num_us = length(ilsc.u_chunks) - - # Workaround for Sundials allocations; `NVector()` allocates, - # so we first use `typeof(integ.u_nvec)` to pull out the `NVector` type, - # then teach our `CachePool` to create pre-wrapped `NVector`s rather - # than just `Vector{S}`'s. - if hasfield(typeof(integ), :u_nvec) - NVector = typeof(integ.u_nvec) - us = CachePool(NVector, () -> NVector(Vector{S}(undef, num_us))) - else - us = CachePool(Vector{S}, () -> Vector{S}(undef, num_us)) - end - caches = (; - y_linear = Matrix{S}(undef, (num_us, 3)), - y_interp = Matrix{S}(undef, (num_us, 3)), - slopes = Vector{S}(undef, num_us), - us = us, - u_block = Matrix{S}(undef, (num_derivatives(ilsc)+1, num_us)), - u_masks = CachePool(BitVector, () -> BitVector(undef, num_us)) - ) + caches = acquire!(cache_pool) u_modified!(integ, false) end, # We need to finalize the ils and free our caches finalize = (c, u, t, integ) -> begin finish!(ils) - caches = nothing + if cache_pool !== nothing + release!(cache_pool, caches) + end end, # Don't add tstops to the left and right. save_positions = (false, false)) end -export SavingCallback, SavedValues, LinearizingSavingCallback +export SavingCallback, SavedValues, LinearizingSavingCallback, LinearizingSavingCallbackCache diff --git a/test/independentlylinearizedtests.jl b/test/independentlylinearizedtests.jl index afc3c9c7..92694a75 100644 --- a/test/independentlylinearizedtests.jl +++ b/test/independentlylinearizedtests.jl @@ -1,8 +1,60 @@ using Test, DiffEqCallbacks using DiffEqCallbacks: sample, store!, IndependentlyLinearizedSolutionChunks, finish! +using DiffEqCallbacks: CachePool, acquire!, release!, @with_cache +@testset "CachePool" begin + num_us = 10 + for thread_safe in (false, true) + pool = CachePool(Vector{Float64}, () -> Vector{Float64}(undef, num_us); thread_safe) + + @with_cache pool u₀ begin + @with_cache pool u₁ begin + @test length(u₀) == num_us + @test length(u₁) == num_us + + # We get two separate arrays + u₀[1] = 0.0 + u₁[1] = 1.0 + @test u₀[1] == 0.0 + @test u₁[1] == 1.0 + end + + # Test that u₁ gets re-used + @with_cache pool u₁ begin + @test length(u₁) == num_us + @test u₁[1] == 1.0 + end + + previously_allocated = pool.num_allocated + # Test that asking for a vector while we have one free does not allocate + function foo(pool) + return @with_cache pool u begin + u[1] + end + end + @test foo(pool) == 1.0 + @test pool.num_allocated == previously_allocated + end + end + + # Test the threadsafe versions actually lock + pool = CachePool(Vector{Float64}, () -> Vector{Float64}(undef, num_us); thread_safe=true) + # elapsed acquire once to warm everything up + @elapsed acquire!(pool) + @test @elapsed(acquire!(pool)) < 0.001 + + t_locker = Threads.@spawn begin + @lock pool.lock begin + sleep(0.01) + end + end + yield() + @test @elapsed(acquire!(pool)) > 0.001 +end + + @testset "IndependentlyLinearizedSolution" begin - ils = IndependentlyLinearizedSolution{Float64,Float64}( + ils = IndependentlyLinearizedSolution{Float64,Float64,0}( # t [0.0, 0.5, 0.75, 1.0], # us (primal only, no derivatives) @@ -19,6 +71,8 @@ using DiffEqCallbacks: sample, store!, IndependentlyLinearizedSolutionChunks, fi ), # ilsc nothing, + # ilsc_cache_pool + nothing, ) # Test `iterate()` diff --git a/test/saving_tests.jl b/test/saving_tests.jl index 6854afa5..9a9ca139 100644 --- a/test/saving_tests.jl +++ b/test/saving_tests.jl @@ -158,7 +158,8 @@ cb = SavingCallback((u, t, integrator) -> integrator.EEst * integrator.dt, saved # Test that our `LinearizingSavingCallback` gives back something that when interpolated, # respects our `abstol`/`reltol` versus the actual solution: using DataInterpolations -import DiffEqCallbacks: as_array, finish! +import DiffEqCallbacks: as_array, finish!, + IndependentlyLinearizedSolution, LinearizingSavingCallbackCache as_array(T::Type{<:AbstractArray}) = T as_array(T::Type{<:Number}) = Vector{T} @@ -217,9 +218,39 @@ if VERSION >= v"1.9" # stack test_linearization(prob_ode_rigidbody, solver(); max_deriv) test_linearization(prob_ode_nonlinchem, solver(); max_deriv) test_linearization(prob_ode_lorenz, solver(); max_deriv) + + # We do not support 2d states yet. + #test_linearization(prob_ode_2Dlinear, solver(); max_deriv) end end - # We do not support 2d states yet. - #test_linearization(prob_ode_2Dlinear, Tsit5()) + + # Benchmark that shows how to use a cache to speed up linearization of large ensemble solutions: + #= + num_trajectories = 10000 + num_derivatives = 1 + cache = LinearizingSavingCallbackCache(prob_ode_lorenz, Tsit5(); num_derivatives) + function p_disturbing_remake(prob, i, repeat; kwargs...) + return remake(prob; + p=prob.p .+ 0.1*randn(size(prob.p)), + kwargs... + ) + end + ilss = Vector{IndependentlyLinearizedSolution}(undef, num_trajectories) + function linearizer_adding_remake(prob,i,repeat) + ilss[i] = IndependentlyLinearizedSolution(prob, num_derivatives; cache_pool=cache.ils_cache) + lsc = LinearizingSavingCallback(ilss[i]; cache_pool=cache.lsc_cache) + return p_disturbing_remake(prob, i, repeat; callback=lsc) + end + ensembleprob = EnsembleProblem( + prob_ode_lorenz; + prob_func=linearizer_adding_remake, + safetycopy=false, + ) + @time sol = solve(ensembleprob, + Tsit5(), + EnsembleThreads(); + trajectories=num_trajectories, + ) + =# end