From b757c47393f118cff7156ada90a89c2f5b51bae3 Mon Sep 17 00:00:00 2001 From: Chuck Yount Date: Wed, 26 Sep 2018 20:48:39 -0700 Subject: [PATCH] Fix bug in temporal blocking in 4D and more. --- src/common/combo.cpp | 97 +++++++++++++++++++++++++++++++++ src/common/combo.hpp | 41 ++++++++++++++ src/common/common.mk | 2 +- src/common/common_utils.cpp | 2 +- src/common/idiv.hpp | 4 +- src/kernel/Makefile | 2 +- src/kernel/lib/context.cpp | 103 ++++++++++++++++++++++++++---------- src/kernel/lib/context.hpp | 2 + src/kernel/lib/yask.hpp | 3 ++ 9 files changed, 221 insertions(+), 35 deletions(-) create mode 100644 src/common/combo.cpp create mode 100644 src/common/combo.hpp diff --git a/src/common/combo.cpp b/src/common/combo.cpp new file mode 100644 index 00000000..a263474a --- /dev/null +++ b/src/common/combo.cpp @@ -0,0 +1,97 @@ +/***************************************************************************** + +YASK: Yet Another Stencil Kernel +Copyright (c) 2014-2018, Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to +deal in the Software without restriction, including without limitation the +rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +sell copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +* The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +IN THE SOFTWARE. + +*****************************************************************************/ + +#include "combo.hpp" +#include + +using namespace std; + +namespace yask { + + // https://stackoverflow.com/questions/9330915/number-of-combinations-n-choose-r-in-c + int choose(int n, int k) { + if (k > n) + return 0; + if (k * 2 > n) + k = n-k; + if (k == 0) + return 1; + + int result = n; + for( int i = 2; i <= k; ++i ) { + result *= (n-i+1); + result /= i; + } + return result; + } + + // https://stackoverflow.com/questions/561/how-to-use-combinations-of-sets-as-test-data#794 + // Fixed to handle p==0 and p==1. + + // Get the 'x'th lexicographically ordered set of 'p' elements in 'n'. + // Returns 'p' values in 'c'. + // 'x' and values in 'c' are 1-based. + void combination(int* c, int n, int p, int x) { + assert(n > 0); + assert(p >= 0); + assert(p <= n); + assert(x >= 1); + assert(x <= choose(n, p)); + if (p == 0) + return; + if (p == 1) { + c[0] = x; + return; + } + int r, k = 0; + for(int i=0; i>(sizeof(a)*8-1) * b is equiv to (a >= 0) ? 0 : b; } -#endif diff --git a/src/kernel/Makefile b/src/kernel/Makefile index 0b69d995..7c86802e 100644 --- a/src/kernel/Makefile +++ b/src/kernel/Makefile @@ -531,7 +531,7 @@ BLOCK_LOOP_ORDER ?= 1 .. N-1 BLOCK_LOOP_CODE ?= $(BLOCK_LOOP_OUTER_MODS) loop($(BLOCK_LOOP_ORDER)) { \ $(BLOCK_LOOP_INNER_MODS) \ call(calc_mini_block(bp, nphases, phase, nshapes, shape, \ - region_idxs, block_idxs)); } + dims_to_bridge, region_idxs, block_idxs)); } # Mini-Block loops break up a mini-block into sub-blocks. The 'omp' modifier creates # a *nested* OpenMP loop so that each sub-block is assigned to a nested OpenMP diff --git a/src/kernel/lib/context.cpp b/src/kernel/lib/context.cpp index 320a463b..8769db53 100644 --- a/src/kernel/lib/context.cpp +++ b/src/kernel/lib/context.cpp @@ -860,6 +860,7 @@ namespace yask { assert(phase == 0); idx_t nshapes = 1; idx_t shape = 0; + int dims_to_bridge[phase]; idx_t shift_num = 0; ScanIndices adj_block_idxs = block_idxs; @@ -871,14 +872,17 @@ namespace yask { // If TB is active, loop thru each required shape. else { - // Determine number of shapes for this phase. First and last + // Recalc number of phases. + idx_t nphases = nddims + 1; // E.g., nphases = 3 for 2D. + assert(phase >= 0); + assert(phase < nphases); // E.g., phase = 0..2 for 2D. + + // Determine number of shapes for this 'phase'. First and last // phase need one shape. Other (bridge) phases need one shape - // for each domain dim. Example: need 'x' and 'y' bridges for 2D - // problem. - idx_t nphases = nsdims; - idx_t nshapes = (phase == 0) ? 1 : - (phase == nphases - 1) ? 1 : - nddims; + // for each combination of domain dims. E.g., need 'x' and + // 'y' bridges for 2D problem in phase 1. + idx_t nshapes = choose(nddims, phase); + int dims_to_bridge[phase]; // Make a copy of the original index span // since block_idxs will be modified. @@ -889,6 +893,10 @@ namespace yask { // Outer loop thru shapes. for (idx_t shape = 0; shape < nshapes; shape++) { + // Get 'shape'th combo of 'phase' things from 'nddims'. + // These will be used to create bridge shapes. + combination(dims_to_bridge, nddims, phase, shape + 1); + // Restore the original block_idxs. block_idxs = orig_block_idxs; @@ -956,6 +964,7 @@ namespace yask { void StencilContext::calc_mini_block(BundlePackPtr& sel_bp, idx_t nphases, idx_t phase, idx_t nshapes, idx_t shape, + int dims_to_bridge[], const ScanIndices& base_region_idxs, const ScanIndices& base_block_idxs, const ScanIndices& adj_block_idxs) { @@ -1058,6 +1067,7 @@ namespace yask { shift_num, nphases, phase, nshapes, shape, + dims_to_bridge, base_region_idxs.begin, base_region_idxs.end, shift_num, bp, @@ -1179,6 +1189,7 @@ namespace yask { idx_t block_shift_num, idx_t nphases, idx_t phase, idx_t nshapes, idx_t shape, + int dims_to_bridge[], const Indices& region_base_start, const Indices& region_base_stop, idx_t region_shift_num, @@ -1187,6 +1198,7 @@ namespace yask { auto step_posn = +Indices::step_posn; int ndims = _dims->_stencil_dims.size(); auto& step_dim = _dims->_step_dim; + auto& fpts = _dims->_fold_pts; // Set 'idxs' begin & end to region boundaries for // given shift. @@ -1207,22 +1219,46 @@ namespace yask { // Is this block first and/or last in region? bool is_first_blk = block_base_start[i] <= region_base_start[i]; bool is_last_blk = block_base_stop[i] >= region_base_stop[i]; - + // Initial start and stop point of phase-0 block. idx_t blk_start = block_base_start[i]; idx_t blk_stop = block_base_stop[i]; - + + // x-> + // ^ ---------------------- + // | / \ /^ + // t / phase 0 \ phase 1 / | + // / \ / | + // ---------------------- | + // ^ ^ ^ | + // |<-blk_width->| -->| |<--sa=shifts*angle + // | | next_blk_start + // blk_start blk_stop | + // |<-----blk_base------>| + // blk_width = blk_base/2 + sa. + + // When there is >1 phase, initial width is half of base plus + // one shift distance. This will make 'up' and 'down' + // trapezoids approx same size. + auto tb_angle = tb_angles[j]; + if (nphases > 1) { + auto sa = (num_tb_shifts+1) * tb_angle; + idx_t blk_width = ROUND_UP(CEIL_DIV(blk_stop - blk_start, idx_t(2)) + sa, + fpts[j]); + blk_width = max(blk_width, 2 * sa + fpts[j]); + blk_stop = min(blk_start + blk_width, block_base_stop[i]); + } + // Starting point of the *next* block. This is used to create // bridge shapes between blocks. Initially, the beginning of // the next block is the end of this block. // TODO: split these parts more evenly when not full triangles. - idx_t next_blk_start = blk_stop; + idx_t next_blk_start = block_base_stop[i]; // Adjust these based on current shift. Adjust by pts in one TB // step, reducing size on R & L sides. But if block is first // and/or last, clamp to region. TODO: have different R & L // angles. TODO: have different shifts for each pack. - auto tb_angle = tb_angles[j]; // Shift start to right unless first. First block will be a // parallelogram or trapezoid clamped to beginning of region. @@ -1232,6 +1268,8 @@ namespace yask { // Shift stop to left. blk_stop -= tb_angle * block_shift_num; + if (nphases == 1 && is_last_blk) + blk_stop = idxs.end[i]; // Shift start of next block. Last block will be // clamped to end of region. @@ -1244,22 +1282,29 @@ namespace yask { // For phase 0, limits are simply the base start and stop. idx_t shape_start = blk_start; idx_t shape_stop = blk_stop; - - // Starting w/phase 1, create a "bridge" in one additional dim - // at a time from RHS of base block to the LHS of the next block - // until all dims are bridged at last phase. Shape of last phase - // will be an "upside-down" version of the first one. The - // 'shape' var determines what dim to start with. - for (idx_t k = 1; k <= phase; k++) { - - // Select another dim based on shape and phase. - if (shape == (j + k - 1) % nshapes) { - - // Start at end of base block. - shape_start = blk_stop; - - // Stop at beginning of next block. - shape_stop = next_blk_start; + + // Depending on the phase and shape, create a bridge between + // from RHS of base block to the LHS of the next block + // until all dims are bridged at last phase. + if (phase > 0) { + + // Check list of dims to bridge for this shape, + // computed earlier. + for (int i = 0; i < phase; i++) { + auto dim = dims_to_bridge[i] - 1; + + // Bridge this dim? + if (dim == j) { + TRACE_MSG("shift_mini_block: phase " << phase << + ", shape " << shape << + ": bridging dim " << j); + + // Start at end of base block. + shape_start = blk_stop; + + // Stop at beginning of next block. + shape_stop = next_blk_start; + } } } // We now have bounds of this shape in shape_{start,stop} @@ -1300,8 +1345,8 @@ namespace yask { } // dims. - TRACE_MSG("shift_mini_block: phase " << phase << - ", shape " << shape << + TRACE_MSG("shift_mini_block: phase " << phase << "/" << nphases << + ", shape " << shape << "/" << nshapes << ", pack '" << bp->get_name() << ", updated span: [" << idxs.begin.makeValStr(ndims) << " ... " << diff --git a/src/kernel/lib/context.hpp b/src/kernel/lib/context.hpp index c29b4ed2..c1c3e109 100644 --- a/src/kernel/lib/context.hpp +++ b/src/kernel/lib/context.hpp @@ -519,6 +519,7 @@ namespace yask { void calc_mini_block(BundlePackPtr& sel_bp, idx_t nphases, idx_t phase, idx_t nshapes, idx_t shape, + int dims_to_bridge[], const ScanIndices& base_region_idxs, const ScanIndices& base_block_idxs, const ScanIndices& block_idxs); @@ -548,6 +549,7 @@ namespace yask { idx_t block_shift_num, idx_t nphases, idx_t phase, idx_t nshapes, idx_t shape, + int dims_to_bridge[], const Indices& region_base_start, const Indices& region_base_stop, idx_t region_shift_num, diff --git a/src/kernel/lib/yask.hpp b/src/kernel/lib/yask.hpp index ea24a882..d5d008ca 100644 --- a/src/kernel/lib/yask.hpp +++ b/src/kernel/lib/yask.hpp @@ -103,6 +103,9 @@ typedef std::uint64_t uidx_t; // Floored integer divide and mod. #include "idiv.hpp" +// Combinations. +#include "combo.hpp" + // Simple macros and stubs. #ifdef WIN32