From 8775765462c227fe1e83316c1d26fffa9c6c886e Mon Sep 17 00:00:00 2001 From: Chuck Yount Date: Tue, 17 Apr 2018 17:07:30 -0700 Subject: [PATCH 01/21] Change equation and stencil groups to bundles. Preliminary bundle API defns. --- include/yask_compiler_api.hpp | 213 +++- include/yask_kernel_api.hpp | 1609 +------------------------ include/yk_grid_api.hpp | 942 +++++++++++++++ include/yk_solution_api.hpp | 836 +++++++++++++ src/common/common_utils.cpp | 2 +- src/compiler/lib/Cpp.hpp | 14 +- src/compiler/lib/CppIntrin.hpp | 18 +- src/compiler/lib/Eqs.cpp | 184 ++- src/compiler/lib/Eqs.hpp | 120 +- src/compiler/lib/Grid.cpp | 2 +- src/compiler/lib/Grid.hpp | 6 +- src/compiler/lib/Print.cpp | 16 +- src/compiler/lib/Print.hpp | 18 +- src/compiler/lib/Soln.cpp | 41 +- src/compiler/lib/Soln.hpp | 4 +- src/compiler/lib/YaskKernel.cpp | 78 +- src/compiler/main.cpp | 30 +- src/compiler/swig/yask_compiler_api.i | 1 + src/kernel/Makefile | 4 +- src/kernel/lib/context.cpp | 120 +- src/kernel/lib/context.hpp | 26 +- src/kernel/lib/stencil_calc.cpp | 40 +- src/kernel/lib/stencil_calc.hpp | 40 +- src/kernel/swig/yask_kernel_api.i | 1 + 24 files changed, 2338 insertions(+), 2027 deletions(-) create mode 100644 include/yk_grid_api.hpp create mode 100644 include/yk_solution_api.hpp diff --git a/include/yask_compiler_api.hpp b/include/yask_compiler_api.hpp index cd1eccbe..a6c3bd81 100644 --- a/include/yask_compiler_api.hpp +++ b/include/yask_compiler_api.hpp @@ -47,6 +47,10 @@ namespace yask { /// Shared pointer to \ref yc_grid typedef yc_grid* yc_grid_ptr; + class yc_equation_group; + /// Shared pointer to \ref yc_equation_group; + typedef std::shared_ptr yc_equation_group_ptr; + // Forward declarations of expression nodes and their pointers. class yc_expr_node; @@ -166,7 +170,7 @@ namespace yask { At least one grid must be defined with at least one domain-index node. - @returns Pointer to the new grid. + @returns Pointer to the new \ref yc_grid object. 
*/ virtual yc_grid_ptr new_grid(const std::string& name @@ -181,7 +185,7 @@ namespace yask { /** C++ initializer-list version with same semantics as vector version. @note This version is not available (or needed) in SWIG-based APIs, e.g., Python. - @returns Pointer to the new grid. + @returns Pointer to the new \ref yc_grid object. */ virtual yc_grid_ptr new_grid(const std::string& name /**< [in] Unique name of the grid; must be @@ -206,7 +210,7 @@ namespace yask { See `TestScratchStencil*` classes in `src/stencils/SimpleTestStencils.hpp` for usage examples. - @returns Pointer to the new grid. + @returns Pointer to the new \ref yc_grid object. */ virtual yc_grid_ptr new_scratch_grid(const std::string& name @@ -221,7 +225,7 @@ namespace yask { /** C++ initializer-list version with same semantics as vector version. @note This version is not available (or needed) in SWIG-based APIs, e.g., Python. - @returns Pointer to the new grid. + @returns Pointer to the new \ref yc_grid object. */ virtual yc_grid_ptr new_scratch_grid(const std::string& name @@ -249,17 +253,48 @@ namespace yask { get_grid(const std::string& name /**< [in] Name of the grid. */ ) =0; /// Get the number of equations in the solution. - /** Equations are added when equation_nodes are created via new_equation_node(). + /** Equations are added when yc_node_factory::new_equation_node() is called. @returns Number of equations that have been created. */ virtual int get_num_equations() const =0; /// Get the specified equation. - /** @returns Pointer to equation_node of nth equation. */ + /** @returns Pointer to \ref yc_equation_node of nth equation. */ virtual yc_equation_node_ptr get_equation(int n /**< [in] Index of equation between zero (0) and get_num_equations()-1. */ ) =0; + /// Create a new equation group. + /** + In normal usage, equation groups are created automatically when + format() is called. 
Under automatic grouping, the YASK compiler + discovers dependencies between equations and places equations + together in a group if they do not depend upon one another. + Then, the YASK compiler schedules the resulting groups for + execution in the kernel based on the dependencies between groups. + + A \ref yc_equation_group object allows manual grouping of equations. + Under manual grouping, the YASK compiler does _not_ check + for illegal dependencies within the group. + In addition, if `do_schedule` is `false`, the YASK compiler + will not check for dependencies with other groups and + will not schedule the group for execution in the kernel. + Then, it will be the programmer's responsibility to run the + stencil group via yk_solution::run_stencil_group(). + + This capability is useful for processing equations that + the YASK compiler cannot currently handle, like equations + with dependencies between different points of a grid + at the same step index. + + @returns Pointer to the new \ref yc_equation_group object. + */ + virtual yc_equation_group_ptr + new_equation_group(const std::string& name + /**< [in] Name of the group. */, + bool do_schedule = true + /**< [in] Schedule the group for execution in the kernel. */ ) =0; + /// Set the vectorization length in given dimension. /** For YASK-code generation, the product of the fold lengths should be equal to the number of elements in a HW SIMD register. @@ -347,8 +382,8 @@ namespace yask { is a scalar, a 1-dim grid is an array, etc. A compile-time grid is a variable used for constructing equations. It does not contain any data. - Data is only stored during run-time, using a yk_grid. - Create new grids via yc_solution::new_grid(). */ + Data is only stored during run-time, using a \ref yk_grid. + Created via yc_solution::new_grid(). */ class yc_grid { public: virtual ~yc_grid() {} @@ -380,9 +415,9 @@ namespace yask { /** The indices are specified relative to the stencil-evaluation index. 
Each offset refers to the dimensions defined when the grid was created via stencil_solution::new_grid(). - Example: if g = new_grid("heat", {"t", "x", "y"}), then - g->new_relative_grid_point(1, -1, 0) refers to heat(t+1, x-1, y) - for some point t, x, y during stencil evaluation. + Example: if `g = new_grid("heat", {"t", "x", "y"})`, then + `g->new_relative_grid_point(1, -1, 0)` refers to `heat(t+1, x-1, y)` + for some point `t, x, y` dynamically defined during stencil evaluation. @warning This convenience function can only be used when every dimension of the grid is either the step dimension or a domain dimension. @note Offsets beyond the dimensions in the grid will be ignored. @@ -403,8 +438,8 @@ namespace yask { }; /// Factory to create AST nodes. - /** @note Grid-point reference nodes are created from a `yc_grid` object - instead of from this factory. */ + /** @note Grid-point reference nodes are created from a \ref yc_grid object + instead of from a \ref yc_node_factory. */ class yc_node_factory { public: virtual ~yc_node_factory() {} @@ -414,6 +449,7 @@ namespace yask { Create a variable to be used to index grids in the solution-step dimension. The name usually describes time, e.g. "t". + @returns Pointer to new \ref yc_index_node object. */ virtual yc_index_node_ptr new_step_index(const std::string& name @@ -426,6 +462,7 @@ namespace yask { The name usually describes spatial dimensions, e.g. "x" or "y". This should *not* include the step dimension, which is specified via new_step_index(). + @returns Pointer to new \ref yc_index_node object. */ virtual yc_index_node_ptr new_domain_index(const std::string& name @@ -436,6 +473,7 @@ namespace yask { Create an variable to be used to index grids in the some dimension that is not the step dimension or a domain dimension. Example: index into an array. + @returns Pointer to new \ref yc_index_node object. 
*/ virtual yc_index_node_ptr new_misc_index(const std::string& name @@ -447,49 +485,62 @@ namespace yask { created, it is automatically added to the list of equations for the yc_solution that contains the grid that is on the LHS. - @returns Pointer to new node. */ + @returns Pointer to new \ref yc_equation_node object. + */ virtual yc_equation_node_ptr new_equation_node(yc_grid_point_node_ptr lhs /**< [in] Grid-point before EQUALS operator. */, yc_number_node_ptr rhs /**< [in] Expression after EQUALS operator. */ ); /// Create a constant numerical value node. - /** This is unary negation. - Use new_subtraction_node() for binary '-'. - @returns Pointer to new node. */ + /** + This is unary negation. + Use new_subtraction_node() for binary '-'. + @returns Pointer to new \ref yc_const_number_node object. + */ virtual yc_const_number_node_ptr new_const_number_node(double val /**< [in] Value to store in node. */ ); /// Create a numerical negation operator node. - /** @returns Pointer to new node. */ + /** + @returns Pointer to new \ref yc_negate_node object. + */ virtual yc_negate_node_ptr new_negate_node(yc_number_node_ptr rhs /**< [in] Expression after '-' sign. */ ); /// Create an addition node. - /** Nodes must be created with at least two operands, and more can - be added by calling add_operand() on the returned node. - @returns Pointer to new node. */ + /** + Nodes must be created with at least two operands, and more can + be added by calling add_operand() on the returned node. + @returns Pointer to new \ref yc_add_node object. + */ virtual yc_add_node_ptr new_add_node(yc_number_node_ptr lhs /**< [in] Expression before '+' sign. */, yc_number_node_ptr rhs /**< [in] Expression after '+' sign. */ ); /// Create a multiplication node. - /** Nodes must be created with at least two operands, and more can - be added by calling add_operand() on the returned node. - @returns Pointer to new node. 
*/ + /** + Nodes must be created with at least two operands, and more can + be added by calling add_operand() on the returned node. + @returns Pointer to new \ref yc_multiply_node object. + */ virtual yc_multiply_node_ptr new_multiply_node(yc_number_node_ptr lhs /**< [in] Expression before '*' sign. */, yc_number_node_ptr rhs /**< [in] Expression after '*' sign. */ ); /// Create a subtraction node. - /** This is binary subtraction. - Use new_negation_node() for unary '-'. - @returns Pointer to new node. */ + /** + This is binary subtraction. + Use new_negation_node() for unary '-'. + @returns Pointer to new \ref yc_subtract_node object. + */ virtual yc_subtract_node_ptr new_subtract_node(yc_number_node_ptr lhs /**< [in] Expression before '-' sign. */, yc_number_node_ptr rhs /**< [in] Expression after '-' sign. */ ); /// Create a division node. - /** @returns Pointer to new node. */ + /** + @returns Pointer to new \ref yc_divide_node object. + */ virtual yc_divide_node_ptr new_divide_node(yc_number_node_ptr lhs /**< [in] Expression before '/' sign. */, yc_number_node_ptr rhs /**< [in] Expression after '/' sign. */ ); @@ -502,20 +553,25 @@ namespace yask { virtual ~yc_expr_node() {} /// Create a simple human-readable string. - /** Formats the expression starting at this node. - @returns String containing a single-line human-readable version of the expression. + /** + Formats the expression starting at this node. + @returns String containing a single-line human-readable version of the expression. */ virtual std::string format_simple() const =0; /// Count the size of the AST. - /** @returns Number of nodes in this tree, - including this node and all its descendants. */ + /** + @returns Number of nodes in this tree, + including this node and all its descendants. + */ virtual int get_num_nodes() const =0; }; /// Equation node. /** Indicates grid point on LHS is equivalent to expression - on RHS. This is NOT a test for equality. */ + on RHS. 
This is NOT a test for equality. + Created via yc_node_factory::new_equation_node(). + */ class yc_equation_node : public virtual yc_expr_node { public: @@ -537,45 +593,58 @@ namespace yask { class yc_bool_node : public virtual yc_expr_node { }; /// A dimension or an index in that dimension. - /** This is a leaf node in an AST. - Use a yask_solution object to create an object of this type. */ + /** + This is a leaf node in an AST. + Created via yc_node_factory::new_step_index(), + yc_node_factory::new_domain_index(), and + yc_node_factory::new_misc_index(). + */ class yc_index_node : public virtual yc_number_node { public: /// Get the dimension's name. /** @returns Name given at creation. */ - virtual const std::string& get_name() const =0; + virtual const std::string& + get_name() const =0; }; /// A reference to a point in a grid. + /** + Created via yc_grid::new_relative_grid_point(). + */ class yc_grid_point_node : public virtual yc_number_node { public: /// Get the grid this point is in. - /** @returns Pointer to grid. */ - virtual yc_grid_ptr get_grid() =0; + /** @returns Pointer to a \ref yc_grid object. */ + virtual yc_grid_ptr + get_grid() =0; }; /// A constant numerical value. /** All values are stored as doubles. This is a leaf node in an AST. - Use a yask_compiler_factory object to create an object of this type. */ + Created via yc_node_factory::new_const_number_node(). + */ class yc_const_number_node : public virtual yc_number_node { public: /// Set the value. /** The value is considered "constant" only when the compiler output is created. It can be changed in the AST. */ - virtual void set_value(double val /**< [in] Value to store in node. */ ) =0; + virtual void + set_value(double val /**< [in] Value to store in node. */ ) =0; /// Get the stored value. /** @returns Copy of stored value. */ - virtual double get_value() const =0; + virtual double + get_value() const =0; }; /// A numerical negation operator. /** Example: used to implement -(a*b). 
- Use a yask_compiler_factory object to create an object of this type. */ + Created via yc_node_factory::new_negate_node(). + */ class yc_negate_node : public virtual yc_number_node { public: @@ -583,7 +652,8 @@ namespace yask { /** This node implements unary negation only, not subtraction, so there is never a left-hand-side. @returns Expression node on right-hand-side of '-' sign. */ - virtual yc_number_node_ptr get_rhs() =0; + virtual yc_number_node_ptr + get_rhs() =0; }; /// Base class for commutative numerical operators. @@ -598,7 +668,8 @@ namespace yask { them. Example: for an add operator, if the operands are 'a', 'b', and 'c', then the expression is 'a + b + c'. @returns Number of operands. */ - virtual int get_num_operands() =0; + virtual int + get_num_operands() =0; /// Get the specified operand. /** @returns Pointer to node at given position or null pointer if out of bounds. */ @@ -612,35 +683,81 @@ namespace yask { }; /// An addition node. + /** Created via yc_node_factory::new_negate_node(). */ class yc_add_node : public virtual yc_commutative_number_node { }; /// A multiplication node. + /** Created via yc_node_factory::new_multiply_node(). */ class yc_multiply_node : public virtual yc_commutative_number_node { }; /// A subtraction node. + /** Created via yc_node_factory::new_subtract_node(). */ class yc_subtract_node : public virtual yc_number_node { public: /// Get the left-hand-side operand. /** @returns Pointer to expression node appearing before the '-' sign. */ - virtual yc_number_node_ptr get_lhs() =0; + virtual yc_number_node_ptr + get_lhs() =0; /// Get the right-hand-side operand. /** @returns Pointer to expression node appearing after the '-' sign. */ - virtual yc_number_node_ptr get_rhs() =0; + virtual yc_number_node_ptr + get_rhs() =0; }; /// A division node. + /** Created via yc_node_factory::new_divide_node(). */ class yc_divide_node : public virtual yc_number_node { public: /// Get the left-hand-side operand. 
/** @returns Pointer to expression node appearing before the '/' sign. */ - virtual yc_number_node_ptr get_lhs() =0; + virtual yc_number_node_ptr + get_lhs() =0; /// Get the right-hand-side operand. /** @returns Pointer to expression node appearing after the '/' sign. */ - virtual yc_number_node_ptr get_rhs() =0; + virtual yc_number_node_ptr + get_rhs() =0; + }; + + /// A manual grouping of stencil equations. + /** + Created via yc_solution::new_equation_group(). + See yc_solution::new_equation_group() for a description of + automatic versus manual grouping. + + After a \ref yc_equation_group is processed by the YASK + compiler and the resulting kernel is compiled, + it will be visible as a \ref yk_stencil_group + in the corresponding YASK kernel. + */ + class yc_equation_group { + public: + + /// Get the name of this group. + /** + @returns Name created via yc_solution::new_equation_group(). + */ + virtual const std::string& + get_name() const =0; + + /// Determine whether this group will be automatically scheduled. + /** + @returns `true` if this group will be run via yk_solution::run_solution() + or `false` if this group must be run via yk_solution::run_stencil_group(). + This is the `do_schedule` setting passed via yc_solution::new_equation_group(). + */ + virtual bool + get_do_schedule() const =0; + + /// Add an equation to this group. + virtual void + add_equation(yc_equation_node_ptr equation + /**< [in] Pointer to equation to be added. */ ) =0; + + public: }; } // namespace yask. diff --git a/include/yask_kernel_api.hpp b/include/yask_kernel_api.hpp index 481e50ce..e579cac1 100644 --- a/include/yask_kernel_api.hpp +++ b/include/yask_kernel_api.hpp @@ -46,48 +46,35 @@ namespace yask { typedef std::int64_t idx_t; #endif - /// Allocate grids on local NUMA node. - /** - This is used in yk_solution::set_default_numa_preferred - and yk_grid::set_numa_preferred. - In Python, specify as `yask_kernel.cvar.yask_numa_local`. 
- */ - const int yask_numa_local = -1; - - /// Allocate grids across all available NUMA nodes. - /** - This is used in yk_solution::set_default_numa_preferred - and yk_grid::set_numa_preferred. - In Python, specify as `yask_kernel.cvar.yask_numa_interleave`. - */ - const int yask_numa_interleave = -2; - - /// Do not specify any NUMA binding. - /** - This is used in yk_solution::set_default_numa_preferred - and yk_grid::set_numa_preferred. - In Python, specify as `yask_kernel.cvar.yask_numa_none`. - */ - const int yask_numa_none = -9; - // Forward declarations of classes and pointers. class yk_env; - /// Shared pointer to \ref yk_env + /// Shared pointer to \ref yk_env. typedef std::shared_ptr yk_env_ptr; class yk_solution; - /// Shared pointer to \ref yk_solution + /// Shared pointer to \ref yk_solution. typedef std::shared_ptr yk_solution_ptr; class yk_grid; - /// Shared pointer to \ref yk_grid + /// Shared pointer to \ref yk_grid. typedef std::shared_ptr yk_grid_ptr; + class yk_stencil_group; + /// Shared pointer to \ref yk_stencil_group. + typedef std::shared_ptr yk_stencil_group; + class yk_stats; - /// Shared pointer to \ref yk_stats + /// Shared pointer to \ref yk_stats. typedef std::shared_ptr yk_stats_ptr; +} // namespace yask. + +#include "yk_solution_api.hpp" +#include "yk_grid_api.hpp" + +namespace yask { + /// Factory to create a stencil solution. class yk_factory { public: @@ -161,1572 +148,6 @@ namespace yask { global_barrier() const =0; }; - /// Stencil solution as defined by the generated code from the YASK stencil compiler. - /** - Objects of this type contain all the grids and equations - that comprise a solution. - */ - class yk_solution { - public: - virtual ~yk_solution() {} - - /// Set object to receive debug output. - virtual void - set_debug_output(yask_output_ptr debug - /**< [out] Pointer to object to receive debug output. - See \ref yask_output_factory. */ ) =0; - - /// Get the name of the solution. 
- /** - @returns String containing the solution name provided during stencil compilation. - */ - virtual const std::string& - get_name() const =0; - - /// Get the floating-point precision size. - /** - @returns Number of bytes in each FP element: 4 or 8. - */ - virtual int - get_element_bytes() const =0; - - /// Get the solution step dimension. - /** - @returns String containing the step-dimension name. - */ - virtual std::string - get_step_dim_name() const =0; - - /// Get the number of domain dimensions used in this solution. - /** - The domain dimensions are those over which the stencil is - applied in each step. - Does *not* include the step dimension or any miscellaneous dimensions. - @returns Number of dimensions that define the problem domain. - */ - virtual int - get_num_domain_dims() const =0; - - /// Get all the domain dimension names. - /** - @returns List of all domain-dimension names. - */ - virtual std::vector - get_domain_dim_names() const =0; - - /// Get all the miscellaneous dimension names. - /** - @returns List of all dimension names used in the solution - that are not step or domain dimensions. - */ - virtual std::vector - get_misc_dim_names() const =0; - - /// Set the size of the solution domain for this rank. - /** - The domain defines the number of elements that will be evaluated with the stencil(s). - If MPI is not enabled, this is the entire problem domain. - If MPI is enabled, this is the domain for the current rank only, - and the problem domain consists of the sum of all rank domains - in each dimension (weak-scaling). - The domain size in each rank does not have to be the same, but - all domains in the same column must have the same width, - all domains in the same row must have the same height, - and so forth, for each domain dimension. - The domain size does *not* include the halo region or any padding. 
- For best performance, set the rank domain - size to a multiple of the number of elements in a vector-cluster in - each dimension whenever possible. - See the "Detailed Description" for \ref yk_grid for more information on grid sizes. - There is no domain-size setting allowed in the - solution-step dimension (usually "t"). - */ - virtual void - set_rank_domain_size(const std::string& dim - /**< [in] Name of dimension to set. Must be one of - the names from get_domain_dim_names(). */, - idx_t size /**< [in] Elements in the domain in this `dim`. */ ) =0; - - /// Get the domain size for this rank. - /** - @returns Current setting of rank domain size in specified dimension. - */ - virtual idx_t - get_rank_domain_size(const std::string& dim - /**< [in] Name of dimension to get. Must be one of - the names from get_domain_dim_names(). */) const =0; - - /// Set the minimum amount of grid padding for all grids. - /** - This sets the minimum number of elements in each grid that is - reserved outside of the rank domain in the given dimension. - This padding area can be used for required halo regions. At - least the specified number of elements will be added to both - sides, i.e., both "before" and "after" the domain. - - The *actual* padding size will be the largest of the following values, - additionally rounded up based on the vector-folding dimensions - and/or cache-line alignment: - - Halo size. - - Value provided by any of the pad-size setting functions. - - The padding size cannot be changed after data storage - has been allocated for a given grid; attempted changes to the pad size for such - grids will be ignored. - In addition, once a grid's padding is set, it cannot be reduced, only increased. - Call yk_grid::get_pad_size() to determine the actual padding size for a given grid. - See the "Detailed Description" for \ref yk_grid for more information on grid sizes. - There is no padding allowed in the solution-step dimension (usually "t"). 
- */ - virtual void - set_min_pad_size(const std::string& dim - /**< [in] Name of dimension to set. Must - be one of the names from get_domain_dim_names(). */, - idx_t size - /**< [in] Elements in this `dim` applied - to both sides of the domain. */ ) =0; - - /// Get the minimum amount of grid padding for all grids. - /** - @returns Current setting of minimum amount of grid padding for all grids. - */ - virtual idx_t - get_min_pad_size(const std::string& dim - /**< [in] Name of dimension to get. Must be one of - the names from get_domain_dim_names(). */) const =0; - - /// Set the block size in the given dimension. - /** - This sets the approximate number of elements that are evaluated in - each "block". - This is a performance setting and should not affect the functional - correctness or total number of elements evaluated. - A block is typically the unit of work done by a - top-level OpenMP thread. The actual number of elements evaluated - in a block may be greater than the specified size due to rounding - up to fold-cluster sizes. The number of elements in a block may - also be smaller than the specified size when the block is at the - edge of the domain. The block size cannot be set in the - solution-step dimension (because temporal blocking is not yet enabled). - - Unless auto-tuning is disabled, the block size will be used as - a starting point for an automated search for a higher-performing - block size. - */ - virtual void - set_block_size(const std::string& dim - /**< [in] Name of dimension to set. Must be one of - the names from get_domain_dim_names(). */, - idx_t size - /**< [in] Elements in a block in this `dim`. */ ) =0; - - /// Get the block size. - /** - Returned value may be slightly larger than the value provided - via set_block_size() due to rounding. - @returns Current settings of block size. - */ - virtual idx_t - get_block_size(const std::string& dim - /**< [in] Name of dimension to get. Must be one of - the names from get_domain_dim_names(). 
*/) const =0; - - /// Set the number of MPI ranks in the given dimension. - /** - The *product* of the number of ranks across all dimensions must - equal yk_env::get_num_ranks(). - The curent MPI rank will be assigned a unique location - within the overall problem domain based on its MPI rank index. - The same number of MPI ranks must be set via this API on each - constituent MPI rank to ensure a consistent overall configuration. - The number of ranks in each dimension must be properly set - before calling yk_solution::prepare_solution(). - There is no rank setting allowed in the - solution-step dimension (usually "t"). - */ - virtual void - set_num_ranks(const std::string& dim - /**< [in] Name of dimension to set. Must be one of - the names from get_domain_dim_names(). */, - idx_t num /**< [in] Number of ranks in `dim`. */ ) =0; - - /// Get the number of MPI ranks in the given dimension. - /** - @returns Current setting of rank size. - */ - virtual idx_t - get_num_ranks(const std::string& dim - /**< [in] Name of dimension to get. Must be one of - the names from get_domain_dim_names(). */) const =0; - - /// Get the rank index in the specified dimension. - /** - The overall rank indices in the specified dimension will range from - zero (0) to get_num_ranks() - 1, inclusive. - @returns Zero-based index of this rank. - */ - virtual idx_t - get_rank_index(const std::string& dim - /**< [in] Name of dimension to get. Must be one of - the names from get_domain_dim_names(). */ ) const =0; - - /// Get the number of grids in the solution. - /** - Grids may be pre-defined by the stencil compiler - (e.g., via yc_solution::new_grid()) - or created explicitly via yk_solution::new_grid(). - @returns Number of grids that have been created. - */ - virtual int - get_num_grids() const =0; - - /// Get the specified grid. - /** - This cannot be used to access scratch grids. - @returns Pointer to the specified grid or null pointer if it does not exist. 
- */ - virtual yk_grid_ptr - get_grid(const std::string& name /**< [in] Name of the grid. */ ) =0; - - /// Get all the grids. - /** - @returns List of all non-scratch grids in the solution. - */ - virtual std::vector - get_grids() =0; - - /// Prepare the solution for stencil application. - /** - Allocates data in grids that do not already have storage allocated. - Calculates the position of each rank in the overall problem domain. - Sets many other data structures needed for proper stencil application. - Since this function initiates MPI communication, it must be called - on all MPI ranks, and it will block until all ranks have completed. - Must be called before applying any stencils. - */ - virtual void - prepare_solution() =0; - - /// Get the first index of the sub-domain in this rank in the specified dimension. - /** - This returns the first *overall* index at the beginning of the domain. - Elements within the domain in this rank lie between the values returned by - get_first_rank_domain_index() and get_last_rank_domain_index(), inclusive. - If there is only one MPI rank, this is typically zero (0). - If there is more than one MPI rank, the value depends - on the the rank's position within the overall problem domain. - - @note This function should be called only *after* calling prepare_solution() - because prepare_solution() assigns this rank's position in the problem domain. - @returns First domain index in this rank. - */ - virtual idx_t - get_first_rank_domain_index(const std::string& dim - /**< [in] Name of dimension to get. Must be one of - the names from get_domain_dim_names(). */ ) const =0; - - /// Get the last index of the sub-domain in this rank the specified dimension. - /** - This returns the last *overall* index within the domain in this rank - (*not* one past the end). - If there is only one MPI rank, this is typically one less than the value - provided by set_rank_domain_size(). 
- If there is more than one MPI rank, the value depends - on the the rank's position within the overall problem domain. - See get_first_rank_domain_index() for more information. - - @note This function should be called only *after* calling prepare_solution() - because prepare_solution() assigns this rank's position in the problem domain. - @returns Last index in this rank. - */ - virtual idx_t - get_last_rank_domain_index(const std::string& dim - /**< [in] Name of dimension to get. Must be one of - the names from get_domain_dim_names(). */ ) const =0; - - /// Get the overall problem size in the specified dimension. - /** - The overall domain indices in the specified dimension will range from - zero (0) to get_overall_domain_size() - 1, inclusive. - Call get_first_rank_domain_index() and get_last_rank_domain_index() - to find the subset of this domain in each rank. - - @note This function should be called only *after* calling prepare_solution() - because prepare_solution() obtains the sub-domain sizes from other ranks. - @returns Sum of all ranks' domain sizes in the given dimension. - */ - virtual idx_t - get_overall_domain_size(const std::string& dim - /**< [in] Name of dimension to get. Must be one of - the names from get_domain_dim_names(). */ ) const =0; - - /// Run the stencil solution for the specified steps. - /** - The stencil(s) in the solution are applied to the grid data, setting the - index variables as follows: - 1. If temporal wave-fronts are *not* used (the default): - - The step index (e.g., `t` for "time") will be sequentially set to values - from `first_step_index` to `last_step_index`, inclusive. - + If the stencil equations were defined with dependencies on lower-valued steps, - e.g., `t+1` depends on `t`, then `last_step_index` should be greater than or equal to - `first_step_index` (forward solution). 
- + If the stencil equations were defined with dependencies on higher-valued steps, - e.g., `t-1` depends on `t`, then `last_step_index` should be less than or equal to - `first_step_index` (reverse solution). - - For each step index, the domain indices will be set - to values across the entire domain as returned by yk_solution::get_overall_domain_size() - (not necessarily sequentially). - - MPI halo exchanges will occur as necessary before, after, or during a step. - - Since this function initiates MPI communication, it must be called - on all MPI ranks, and it will block until all ranks have completed. - 2. **[Advanced]** If temporal wave-fronts *are* enabled (currently only possible via apply_command_line_options()): - - The step index (e.g., `t` for "time") will be sequentially set to values - from `first_step_index` to `last_step_index`, inclusive, within each wave-front tile. - + The number of steps in a wave-front tile may also be restricted by the size - of the tile in the step dimension. In that case, tiles will be done in slices of that size. - + Reverse solutions are not allowed with wave-front tiling. - - For each step index within each wave-front tile, the domain indices will be set - to values across the entire tile (not necessarily sequentially). - - Ultimately, the stencil(s) will be applied to same the elements in both the step - and domain dimensions as when wave-front tiling is not used. - - MPI is not supported with wave-front tiling. - - This function should be called only *after* calling prepare_solution(). - */ - virtual void - run_solution(idx_t first_step_index /**< [in] First index in the step dimension */, - idx_t last_step_index /**< [in] Last index in the step dimension */ ) =0; - - /// Run the stencil solution for the specified step. - /** - This function is simply an alias for `run_solution(step_index, step_index)`, i.e., - the solution will be applied for exactly one step across the domain. 
- - Typical C++ usage: - - ~~~{.cpp} - for (idx_t t = 1; t <= num_steps; t++) - run_solution(t); - ~~~ - - As written, the above loop is identical to - - ~~~{.cpp} - run_solution(1, num_steps); - ~~~ - - @note The parameter is *not* the number of steps to run. - @note Since only one step is taken per call, using this function effectively disables - wave-front tiling. - */ - virtual void - run_solution(idx_t step_index /**< [in] Index in the step dimension */ ) =0; - - /// Finish using a solution. - /** - Releases shared ownership of memory used by the grids. This will - result in deallocating each memory block whose ownership is not - shared by another shared pointer. - */ - virtual void - end_solution() =0; - - - /// Get performance statistics associated with preceding calls to run_solution(). - /** - Side effect: resets all statistics, so a subsequent call will - measure performance after the current call. - @returns Pointer to statistics object. - */ - virtual yk_stats_ptr - get_stats() =0; - - /// Determine whether the auto-tuner is enabled on this rank. - /** - The auto-tuner is enabled by default. - It will become disabled after it has converged or after reset_auto_tuner(false) has been called. - @returns Whether the auto-tuner is still searching. - */ - virtual bool - is_auto_tuner_enabled() =0; - - /* Advanced APIs for yk_solution found below are not needed for most applications. */ - - /// **[Advanced]** Restart or disable the auto-tuner on this rank. - /** - Under normal operation, an auto-tuner is invoked automatically during calls to - run_solution(). - Currently, only the block size is set by the auto-tuner, and the search begins from the - sizes set via set_block_size() or the default size if set_block_size() has - not been called. - This function is used to apply the current best-known settings if the tuner has - been running, reset the state of the auto-tuner, and either - restart its search or disable it from running. 
- This call must be made on each rank where the change is desired. - */ - virtual void - reset_auto_tuner(bool enable - /**< [in] If _true_, start or restart the auto-tuner search. - If _false_, disable the auto-tuner from running. */, - bool verbose = false - /**< [in] If _true_, print progress information to the debug object - set via set_debug_output(). */ ) =0; - - /// **[Advanced]** Automatically tune selected settings immediately. - /** - Executes a search algorithm to find [locally] optimum values for some of the - settings. - Under normal operation, an auto-tuner is invoked during calls to - run_solution(). - See reset_auto_tuner() for more information. - This function causes the stencil solution to be run immediately - until the auto-tuner converges on all ranks. - It is useful for benchmarking, where performance is to be timed - for a given number of steps after the best settings are found. - This function should be called only *after* calling prepare_solution(). - This call must be made on each rank. - @warning Modifies the contents of the grids by calling run_solution() - an arbitrary number of times, but without halo exchange. - (See run_solution() for other restrictions and warnings.) - Thus, grid data should be set *after* calling this function when - used in a production or test setting where correct results are expected. - */ - virtual void - run_auto_tuner_now(bool verbose = true - /**< [in] If _true_, print progress information to the debug object - set via set_debug_output(). */ ) =0; - - /// **[Advanced]** Add a new grid to the solution. - /** - This is typically not needed because grids used by the stencils are pre-defined - by the solution itself via the stencil compiler. - However, a grid may be created explicitly via this function - in order to use it for purposes other than by the - pre-defined stencils within the current solution. - - Grids created by this function will be treated like a pre-defined grid. 
- For example, - - For each domain dimension of the grid, - the new grid's domain size will be the same as that returned by - get_rank_domain_size(). - - Calls to set_rank_domain_size() will resize the corresponding domain - size in this grid. - - This grid's first domain index in this rank will be determined - by the position of this rank. - - This grid's initial padding size will be the same as that returned by - get_min_pad_size(). - - After creating a new grid, you can increase its padding - sizes in the domain dimensions via yk_grid::set_min_pad_size(), etc. - - For step and misc dimensions, you can change the allocation via - yk_grid::set_alloc_size(). - - If you want a grid that is not automatically resized based on the - solution settings, use new_fixed_size_grid() instead. - - @note A new grid contains only the meta-data for the grid; data storage - is not yet allocated. - Storage may be allocated in any of the methods listed - in the "Detailed Description" for \ref yk_grid. - @returns Pointer to the new grid. - */ - virtual yk_grid_ptr - new_grid(const std::string& name - /**< [in] Name of the grid; must be unique - within the solution. */, - const std::vector& dims - /**< [in] List of names of all dimensions. - Names must be valid C++ identifiers and - not repeated within this grid. */ ) =0; - -#ifndef SWIG - /// **[Advanced]** Add a new grid to the solution. - /** - See documentation for the version of new_grid() with a vector of dimension names - as a parameter. - @note This version is not available (or needed) in SWIG-based APIs, e.g., Python. - @returns Pointer to the new grid. - */ - virtual yk_grid_ptr - new_grid(const std::string& name - /**< [in] Name of the grid; must be unique - within the solution. */, - const std::initializer_list& dims - /**< [in] List of names of all dimensions. - Names must be valid C++ identifiers and - not repeated within this grid. 
*/ ) =0; -#endif - - /// **[Advanced]** Add a new grid to the solution with a specified size. - /** - This is typically not needed because grids used by the stencils are pre-defined - by the solution itself via the stencil compiler. - However, a grid may be created explicitly via this function - in order to use it for purposes other than by the - pre-defined stencils within the current solution. - - Unlike new_grid(), - grids created by this function will *not* be treated like a pre-defined grid. - For example, - - For each domain dimension of the grid, - the new grid's domain size is provided during creation and cannot be changed. - - Calls to set_rank_domain_size() will *not* resize the corresponding domain - size in this grid. - - This grid's first domain index in this rank will be fixed at zero (0) - regardless of this rank's position. - - This grid's padding size will be affected only by calls to - yk_grid::set_min_pad_size(), etc. - - For step and misc dimensions, you can still change the allocation via - yk_grid::set_alloc_size(). - - @note A new grid contains only the meta-data for the grid; data storage - is not yet allocated. - Storage may be allocated in any of the methods listed - in the "Detailed Description" for \ref yk_grid. - @returns Pointer to the new grid. - */ - virtual yk_grid_ptr - new_fixed_size_grid(const std::string& name - /**< [in] Name of the grid; must be unique - within the solution. */, - const std::vector& dims - /**< [in] List of names of all dimensions. - Names must be valid C++ identifiers and - not repeated within this grid. */, - const std::vector& dim_sizes - /**< [in] Initial allocation in each dimension. - Must be exatly one size for each dimension. */ ) =0; - -#ifndef SWIG - /// **[Advanced]** Add a new grid to the solution with a specified size. - /** - See documentation for the version of new_fixed_size_grid() with a vector of dimension names - as a parameter. 
- @note This version is not available (or needed) in SWIG-based APIs, e.g., Python. - @returns Pointer to the new grid. - */ - virtual yk_grid_ptr - new_fixed_size_grid(const std::string& name - /**< [in] Name of the grid; must be unique - within the solution. */, - const std::initializer_list& dims - /**< [in] List of names of all dimensions. - Names must be valid C++ identifiers and - not repeated within this grid. */, - const std::initializer_list& dim_sizes - /**< [in] Initial allocation in each dimension. - Must be exatly one size for each dimension. */ ) =0; -#endif - - /// **[Advanced]** Set the default preferred NUMA node on which to allocate data. - /** - This value is used when allocating grids and MPI buffers. - The NUMA "preferred node allocation" policy is used, meaning that - memory will be allocated in an alternative node if the preferred one - doesn't have enough space available or is otherwise restricted. - Instead of specifying a NUMA node, a special value may be used - to specify another policy as listed. - This setting may be overridden for any specific grid. - */ - virtual void - set_default_numa_preferred(int numa_node - /**< [in] Preferred NUMA node for data - allocation. Alternatively, use - `yask_numa_local` for explicit - local-node allocation, - `yask_numa_interleave` for - interleaving pages across all nodes, - or `yask_numa_none` for no explicit NUMA - policy. These constants are defined in - the _Variable Documentation_ section of - \ref yask_kernel_api.hpp. */) =0; - - /// **[Advanced]** Get the default preferred NUMA node on which to allocate data. - /** - @returns Current setting of preferred NUMA node. - */ - virtual int - get_default_numa_preferred() const =0; - - /// **[Advanced]** Set performance parameters from an option string. - /** - Parses the string for options as if from a command-line. 
- Example: "-bx 64 -block_threads 4" sets the block-size in the *x* - dimension to 64 and the number of threads used to process each - block to 4. - See the help message from the YASK kernel binary for documentation - on the command-line options. - - @returns Any strings that were not recognized by the parser as options. - */ - virtual std::string - apply_command_line_options(const std::string& args - /**< [in] String of arguments to parse. */ ) =0; - - /// **[Advanced]** Use data-storage from existing grids in specified solution. - /** - Calls yk_grid::share_storage() for each pair of grids that have the same name - in this solution and the source solution. - All conditions listed in yk_grid::share_storage() must hold for each pair. - */ - virtual void - share_grid_storage(yk_solution_ptr source - /**< [in] Solution from which grid storage will be shared. */) =0; - }; - - /// Statistics from calls to run_solution(). - /** - A throughput rate may be calculated by multiplying an - amount-of-work-per-step quantity by the number of steps done and - dividing by the number of seconds elapsed. - */ - class yk_stats { - public: - virtual ~yk_stats() {} - - /// Get the number of elements in the overall domain. - /** - @returns Product of all the overal domain sizes across all domain dimensions. - */ - virtual idx_t - get_num_elements() =0; - - /// Get the number of elements written in each step. - /** - @returns Number of elements written to each output grid. - This is the same value as get_num_elements() if there is only one output grid. - */ - virtual idx_t - get_num_writes() =0; - - /// Get the estimated number of floating-point operations required for each step. - /** - @returns Number of FP ops created by the stencil compiler. - It may be slightly more or less than the actual number of FP ops executed - by the CPU due to C++ compiler transformations. - */ - virtual idx_t - get_est_fp_ops() =0; - - /// Get the number of steps calculated via run_solution(). 
- /** - @returns A positive number, regardless of whether run_solution() steps were executed - forward or backward. - */ - virtual idx_t - get_num_steps_done() =0; - - /// Get the number of seconds elapsed during calls to run_solution(). - /** - @returns Only the time spent in run_solution(), not in any other code in your - application between calls. - */ - virtual double - get_elapsed_run_secs() =0; - }; - - /// A run-time grid. - /** - "Grid" is a generic term for any n-dimensional array. A 0-dim grid - is a scalar, a 1-dim grid is an array, etc. A run-time grid contains - data, unlike yc_grid, a compile-time grid variable. - - Typically, access to each grid is obtained via yk_solution::get_grid(). - You may also use yk_solution::new_grid() or yk_solution::new_fixed_size_grid() - if you need a grid that is not part of the pre-defined solution. - - Each dimension of a grid is one of the following: - - The *step* dimension, typically time ("t"), as identified via yk_solution::get_step_dim_name(). - - A *domain* dimension, typically a spatial dimension such as "x" or "y", - as identified via yk_solution:get_domain_dim_names(). - - A *miscellaneous* dimension, which is any dimension that is not a domain or step dimension, - as identified via yk_solution:get_misc_dim_names(). - - In the step dimension, there is no fixed domain size, and no - specified first or last index. - However, there is an allocation size, which is the number of values in the step - dimension that are stored in memory. - Step-dimension indices "wrap-around" within this allocation to reuse memory. - For example, if the step dimension is "t", and the t-dimension allocation size is 3, - then t=-2, t=0, t=3, t=6, ..., t=303, etc. would all alias to the same spatial values in memory. - - In each domain dimension, - grid sizes include the following components: - - The *domain* is the elements to which the stencils are applied. 
- - The *left padding* is all the elements before the domain and includes the left halo. - - The *right padding* is all the elements before the domain and includes the right halo. - - The *left halo* is the elements just before the domain which must be - copied between preceding ranks during halo exchanges. The left halo is contained within the left padding. - - The *right halo* is the elements just after the domain which must be - copied between following ranks during halo exchanges. The right halo is contained within the right padding. - - The *extra left padding* is the elements before the domain and left halo - and thus does not include the left halo. - - The *extra right padding* is the elements after the domain and right halo - and thus does not include the right halo. - - The *allocation* includes the left padding, domain, and right padding. - - Domain sizes specified via yk_solution::set_rank_domain_size() apply to each MPI rank. - Visually, in each of the domain dimensions, these sizes are related as follows - in each rank: - -
extra left padding left halo domain right halo extra right padding -
left padding
right padding
-
allocation
-
- - If MPI is not enabled, a rank's domain is equivalent to the entire problem size. - If MPI is enabled, the domains of the ranks are logically abutted to create the - overall problem domain in each dimension: - -
extra left padding of rank A halo of rank A domain of rank A domain of rank B - ... domain of rank Z halo of rank Z extra right padding of rank Z -
left padding of rank A
-
overall problem domain
-
right padding of rank Z
-
- The intermediate halos and paddings also exist, but are not shown in the above diagram. - The halos overlap the domains of adjacent ranks. - For example, the left halo of rank B in the diagram would overlap the domain of rank A. - Data in these overlapped regions is exchanged as needed during stencil application - to maintain a consistent values as if there was only one rank. - - In each miscellaneous dimension, there is only an allocation size, - and there is no wrap-around as in the step dimension. - Each index must be between its first and last allowed value. - - All sizes are expressed in numbers of elements. - Each element may be a 4-byte (single precision) - or 8-byte (double precision) floating-point value as returned by - yk_solution::get_element_bytes(). - - Initially, a grid is not assigned any allocated storage. - This is done to allow modification of domain, padding, and other allocation sizes - before allocation. - Once the allocation sizes have been set in all dimensions, the data storage itself may - be allocated. - This can be done in any of the following ways: - - Storage for all grids without data storage will be automatically allocated when - prepare_solution() is called. - - Storage for a specific grid may be allocated before calling prepare_solution() - via yk_grid::alloc_storage(). - - **[Advanced]** Storage for a specific grid may be shared with another grid with - existing storage via yk_grid::share_storage(). - - @note The domain index arguments to the \ref yk_grid functions that require indices - are *always* relative to the overall problem; they are *not* relative to the current rank. - The first and last overall-problem index that lies within a rank can be - retrieved via yk_solution::get_first_rank_domain_index() and - yk_solution::get_last_rank_domain_index(), respectively. 
- The first and last accessible index that lies within a rank for a given grid can be - retrieved via yk_grid::get_first_rank_alloc_index() and - yk_grid::get_last_rank_alloc_index(), respectively. - Also, index arguments are always inclusive. - Specifically, for functions that return or require a "last" index, that - index indicates the last one in the relevant range, i.e., *not* one past the last value - (this is more like Fortran and Perl than Python and Lisp). - */ - class yk_grid { - public: - virtual ~yk_grid() {} - - /// Get the name of the grid. - /** - @returns String containing name provided via yc_solution::new_grid(). - */ - virtual const std::string& get_name() const =0; - - /// Determine whether this grid is automatically resized based on the solution. - /** - @returns `true` if this grid was created via yk_solution::new_fixed_size_grid() - or `false` otherwise. - */ - virtual bool is_fixed_size() const =0; - - /// Get the number of dimensions used in this grid. - /** - This may include domain, step, and/or miscellaneous dimensions. - @returns Number of dimensions created via yc_solution::new_grid(), - yk_solution::new_grid(), or yk_solution::new_fixed_size_grid(). - */ - virtual int get_num_dims() const =0; - - /// Get all the dimensions in this grid. - /** - This may include domain, step, and/or miscellaneous dimensions. - @returns List of names of all the dimensions. - */ - virtual std::vector - get_dim_names() const =0; - - /// Determine whether specified dimension exists in this grid. - /** - @returns `true` if dimension exists (including step-dimension), - `false` otherwise. - */ - virtual bool - is_dim_used(const std::string& dim) const =0; - - /// Get the domain size for this rank. - /** - @returns The same value as yk_solution::get_rank_domain_size() if - is_fixed_size() returns `false` or the fixed sized provided via - yk_solution::new_fixed_size_grid() otherwise. 
- */ - virtual idx_t - get_rank_domain_size(const std::string& dim - /**< [in] Name of dimension to get. Must be one of - the names from yk_solution::get_domain_dim_names(). */) const =0; - - /// Get the first index of the sub-domain in this rank in the specified dimension. - /** - @note This function should be called only *after* calling prepare_solution() - because prepare_solution() assigns this rank's position in the problem domain. - @returns The same value as yk_solution::get_first_rank_domain_index() if - is_fixed_size() returns `false` or zero (0) otherwise. - */ - virtual idx_t - get_first_rank_domain_index(const std::string& dim - /**< [in] Name of dimension to get. Must be one of - the names from yk_solution::get_domain_dim_names(). */ ) const =0; - - /// Get the last index of the sub-domain in this rank in the specified dimension. - /** - @note This function should be called only *after* calling prepare_solution() - because prepare_solution() assigns this rank's position in the problem domain. - @returns The same value as yk_solution::get_last_rank_domain_index() if - is_fixed_size() returns `false` or one less than the fixed sized provided via - yk_solution::new_fixed_size_grid() otherwise. - */ - virtual idx_t - get_last_rank_domain_index(const std::string& dim - /**< [in] Name of dimension to get. Must be one of - the names from yk_solution::get_domain_dim_names(). */ ) const =0; - - /// Get the left halo size in the specified dimension. - /** - This value is typically set by the stencil compiler. - @returns Elements in halo in given dimension before the domain. - */ - virtual idx_t - get_left_halo_size(const std::string& dim - /**< [in] Name of dimension to get. - Must be one of - the names from yk_solution::get_domain_dim_names(). */ ) const =0; - - /// Get the right halo size in the specified dimension. - /** - This value is typically set by the stencil compiler. - @returns Elements in halo in given dimension after the domain. 
- */ - virtual idx_t - get_right_halo_size(const std::string& dim - /**< [in] Name of dimension to get. - Must be one of - the names from yk_solution::get_domain_dim_names(). */ ) const =0; - - /// Get the first index of the left halo in this rank in the specified dimension. - /** - @note This function should be called only *after* calling prepare_solution() - because prepare_solution() assigns this rank's position in the problem domain. - @returns The first index of left halo in this rank or - the same value as yk_grid::get_first_rank_domain_index() - if the left halo has zero size. - */ - virtual idx_t - get_first_rank_halo_index(const std::string& dim - /**< [in] Name of dimension to get. Must be one of - the names from yk_solution::get_domain_dim_names(). */ ) const =0; - - /// Get the last index of the right halo in this rank in the specified dimension. - /** - @note This function should be called only *after* calling prepare_solution() - because prepare_solution() assigns this rank's position in the problem domain. - @returns The last index of right halo in this rank or - the same value as yk_grid::get_last_rank_domain_index() - if the right halo has zero size. - */ - virtual idx_t - get_last_rank_halo_index(const std::string& dim - /**< [in] Name of dimension to get. Must be one of - the names from yk_solution::get_domain_dim_names(). */ ) const =0; - - /// Get the left padding in the specified dimension. - /** - The left padding is the memory allocated before - the domain in a given dimension. - The left padding size includes the left halo size. - The value may be slightly - larger than that provided via set_min_pad_size(), etc. due to rounding. - @returns Elements in left padding in given dimension. - */ - virtual idx_t - get_left_pad_size(const std::string& dim - /**< [in] Name of dimension to get. - Must be one of - the names from yk_solution::get_domain_dim_names(). */ ) const =0; - - /// Get the right padding in the specified dimension. 
- /** - The right padding is the memory allocated after - the domain in a given dimension. - The right padding size includes the right halo size. - The value may be slightly - larger than that provided via set_min_pad_size(), etc. due to rounding. - @returns Elements in right padding in given dimension. - */ - virtual idx_t - get_right_pad_size(const std::string& dim - /**< [in] Name of dimension to get. - Must be one of - the names from yk_solution::get_domain_dim_names(). */ ) const =0; - - /// Get the extra left padding in the specified dimension. - /** - The *extra* padding size is the left padding size minus the left halo size. - @returns Elements in padding in given dimension before the - left halo region. - */ - virtual idx_t - get_left_extra_pad_size(const std::string& dim - /**< [in] Name of dimension to get. - Must be one of - the names from yk_solution::get_domain_dim_names(). */ ) const =0; - - /// Get the extra right padding in the specified dimension. - /** - The *extra* padding size is the right padding size minus the right halo size. - @returns Elements in padding in given dimension after the - right halo region. - */ - virtual idx_t - get_right_extra_pad_size(const std::string& dim - /**< [in] Name of dimension to get. - Must be one of - the names from yk_solution::get_domain_dim_names(). */ ) const =0; - - /// Set the padding in the specified dimension. - /** - This sets the minimum number of elements in this grid - in both left and right pads. - This padding area can be used for required halo regions. - - The *actual* padding size will be the largest of the following values, - additionally rounded up based on the vector-folding dimensions - and/or cache-line alignment: - - Halo size. - - Value provided by any of the pad-size setting functions. - - The padding size cannot be changed after data storage - has been allocated for this grid; attempted changes to the pad size - will be ignored. 
- In addition, once a grid's padding is set, it cannot be reduced, only increased. - Call get_pad_size() to determine the actual padding size for the grid. - See the "Detailed Description" for \ref yk_grid for information on grid sizes. - */ - virtual void - set_min_pad_size(const std::string& dim - /**< [in] Name of dimension to set. - Must be one of - the names from yk_solution::get_domain_dim_names(). */, - idx_t size - /**< [in] Minimum number of elements to allocate beyond the domain size. */ ) =0; - - /// Get the storage allocation in the specified dimension. - /** - For the step dimension, this is the specified allocation and - does not typically depend on the number of steps evaluated. - For the non-step dimensions, this includes the domain and padding sizes. - See the "Detailed Description" for \ref yk_grid for information on grid sizes. - @returns allocation in number of elements (not bytes). - */ - virtual idx_t - get_alloc_size(const std::string& dim - /**< [in] Name of dimension to get. */ ) const =0; - - /// Get the first index of a specified miscellaneous dimension. - /** - @returns the first allowed index in a non-step and non-domain dimension. - */ - virtual idx_t - get_first_misc_index(const std::string& dim - /**< [in] Name of dimension to get. Must be one of - the names from yk_solution::get_misc_dim_names(). */ ) const =0; - - /// Get the last index of a specified miscellaneous dimension. - /** - @returns the last allowed index in a non-step and non-domain dimension. - */ - virtual idx_t - get_last_misc_index(const std::string& dim - /**< [in] Name of dimension to get. Must be one of - the names from yk_solution::get_misc_dim_names(). */ ) const =0; - - /// Determine whether the given indices are allocated in this rank. - /** - Provide indices in a list in the same order returned by get_dim_names(). - Indices are relative to the *overall* problem domain. 
- @returns `true` if index values fall within the allocated space as returned by - get_first_rank_alloc_index() and get_last_rank_alloc_index() for - each dimension; `false` otherwise. - */ - virtual bool - is_element_allocated(const std::vector& indices - /**< [in] List of indices, one for each grid dimension. */ ) const =0; - -#ifndef SWIG - /// Determine whether the given indices are allocated in this rank. - /** - Provide indices in a list in the same order returned by get_dim_names(). - Indices are relative to the *overall* problem domain. - @note This version is not available (or needed) in SWIG-based APIs, e.g., Python. - @returns `true` if index values fall within the allocated space as returned by - get_first_rank_alloc_index() and get_last_rank_alloc_index() for - each dimension; `false` otherwise. - */ - virtual bool - is_element_allocated(const std::initializer_list& indices - /**< [in] List of indices, one for each grid dimension. */ ) const =0; -#endif - - /// Get the value of one grid element. - /** - Provide indices in a list in the same order returned by get_dim_names(). - Indices are relative to the *overall* problem domain. - Index values must fall within the allocated space as returned by - get_first_rank_alloc_index() and get_last_rank_alloc_index() for - each dimension. - @returns value in grid at given multi-dimensional location. - */ - virtual double - get_element(const std::vector& indices - /**< [in] List of indices, one for each grid dimension. */ ) const =0; - -#ifndef SWIG - /// Get the value of one grid element. - /** - Provide indices in a list in the same order returned by get_dim_names(). - Indices are relative to the *overall* problem domain. - Index values must fall within the allocated space as returned by - get_first_rank_alloc_index() and get_last_rank_alloc_index() for - each dimension. 
- @note The return value is a double-precision floating-point value, but - it will be converted from a single-precision if - yk_solution::get_element_bytes() returns 4. - @note This version is not available (or needed) in SWIG-based APIs, e.g., Python. - @returns value in grid at given multi-dimensional location. - */ - virtual double - get_element(const std::initializer_list& indices - /**< [in] List of indices, one for each grid dimension. */ ) const =0; -#endif - - /// Get grid elements within specified subset of the grid. - /** - Reads all elements from `first_indices` to `last_indices` in each dimension - and writes them to consecutive memory locations in the buffer. - Indices in the buffer progress in row-major order. - The buffer pointed to must contain the number of bytes equal to - yk_solution::get_element_bytes() multiplied by the number of - elements in the specified slice. - Since the reads proceed in row-major order, the last index is "unit-stride" - in the buffer. - Provide indices in two lists in the same order returned by get_dim_names(). - Indices are relative to the *overall* problem domain. - Index values must fall within the allocated space as returned by - get_first_rank_alloc_index() and get_last_rank_alloc_index() for - each dimension. - @returns Number of elements read. - */ - virtual idx_t - get_elements_in_slice(void* buffer_ptr - /**< [out] Pointer to buffer where values will be written. */, - const std::vector& first_indices - /**< [in] List of initial indices, one for each grid dimension. */, - const std::vector& last_indices - /**< [in] List of final indices, one for each grid dimension. */ ) const =0; - - /// Set the value of one grid element. - /** - Provide indices in a list in the same order returned by get_dim_names(). - Indices are relative to the *overall* problem domain. - Index values must fall within the allocated space as returned by - get_first_rank_alloc_index() and get_last_rank_alloc_index() for - each dimension. 
- @note The parameter value is a double-precision floating-point value, but - it will be converted to single-precision if - yk_solution::get_element_bytes() returns 4. - If storage has not been allocated for this grid, this will have no effect. - @returns Number of elements set. - */ - virtual idx_t - set_element(double val /**< [in] Element in grid will be set to this. */, - const std::vector& indices - /**< [in] List of indices, one for each grid dimension. */, - bool strict_indices = false - /**< [in] If true, indices must be within domain or padding. - If false, indices outside of domain and padding result - in no change to grid. */ ) =0; - -#ifndef SWIG - /// Set the value of one grid element. - /** - Provide the number of indices equal to the number of dimensions in the grid. - Indices beyond that will be ignored. - Indices are relative to the *overall* problem domain. - If any index values fall outside of the allocated space as returned by - get_first_rank_alloc_index() and get_last_rank_alloc_index() for - each dimension, this will have no effect. - @note The parameter value is a double-precision floating-point value, but - it will be converted to single-precision if - yk_solution::get_element_bytes() returns 4. - If storage has not been allocated for this grid, this will have no effect. - @note This version is not available (or needed) in SWIG-based APIs, e.g., Python. - @returns Number of elements set. - */ - virtual idx_t - set_element(double val /**< [in] Element in grid will be set to this. */, - const std::initializer_list& indices - /**< [in] List of indices, one for each grid dimension. */, - bool strict_indices = false - /**< [in] If true, indices must be within domain or padding. - If false, indices outside of domain and padding result - in no change to grid. */ ) =0; -#endif - /// Atomically add to the value of one grid element. - /** - Provide indices in a list in the same order returned by get_dim_names(). 
- Indices are relative to the *overall* problem domain. - Index values must fall within the allocated space as returned by - get_first_rank_alloc_index() and get_last_rank_alloc_index() for - each dimension if `strict_indices` is set to true. - Updates are OpenMP atomic, meaning that this function can be called by - several OpenMP threads without causing a race condition. - @note The parameter value is a double-precision floating-point value, but - it will be converted to single-precision if - yk_solution::get_element_bytes() returns 4. - If storage has not been allocated for this grid, this will have no effect. - @returns Number of elements updated. - */ - virtual idx_t - add_to_element(double val /**< [in] This value will be added to element in grid. */, - const std::vector& indices - /**< [in] List of indices, one for each grid dimension. */, - bool strict_indices = false - /**< [in] If true, indices must be within domain or padding. - If false, indices outside of domain and padding result - in no change to grid. */ ) =0; - -#ifndef SWIG - /// Atomically add to the value of one grid element. - /** - Provide the number of indices equal to the number of dimensions in the grid. - Indices beyond that will be ignored. - Indices are relative to the *overall* problem domain. - Index values must fall within the allocated space as returned by - get_first_rank_alloc_index() and get_last_rank_alloc_index() for - each dimension if `strict_indices` is set to true. - Updates are OpenMP atomic, meaning that this function can be called by - several OpenMP threads without causing a race condition. - @note The parameter value is a double-precision floating-point value, but - it will be converted to single-precision if - yk_solution::get_element_bytes() returns 4. - If storage has not been allocated for this grid, this will have no effect. - @note This version is not available (or needed) in SWIG-based APIs, e.g., Python. - @returns Number of elements set. 
- */ - virtual idx_t - add_to_element(double val /**< [in] This value will be added to element in grid. */, - const std::initializer_list& indices - /**< [in] List of indices, one for each grid dimension. */, - bool strict_indices = false - /**< [in] If true, indices must be within domain or padding. - If false, indices outside of domain and padding result - in no change to grid. */ ) =0; -#endif - - /// Initialize all grid elements to the same value. - /** - Sets all allocated elements, including those in the domain and padding - area to the same specified value. - @note The parameter is a double-precision floating-point value, but - it will be converted to single-precision if - yk_solution::get_element_bytes() returns 4. - @note If storage has not been allocated via yk_solution::prepare_solution(), - this will have no effect. - */ - virtual void - set_all_elements_same(double val /**< [in] All elements will be set to this. */ ) =0; - - /// Initialize grid elements within specified subset of the grid to the same value. - /** - Sets all elements from `first_indices` to `last_indices` in each dimension to the - specified value. - Provide indices in two lists in the same order returned by get_dim_names(). - Indices are relative to the *overall* problem domain. - Index values must fall within the allocated space as returned by - get_first_rank_alloc_index() and get_last_rank_alloc_index() for - each dimension. - Indices are relative to the *overall* problem domain. - If storage has not been allocated for this grid, this will have no effect. - @returns Number of elements set. - */ - virtual idx_t - set_elements_in_slice_same(double val /**< [in] All elements in the slice will be set to this. */, - const std::vector& first_indices - /**< [in] List of initial indices, one for each grid dimension. */, - const std::vector& last_indices - /**< [in] List of final indices, one for each grid dimension. 
*/, - bool strict_indices = false - /**< [in] If true, indices must be within domain or padding. - If false, only elements within the allocation of this grid - will be set, and elements outside will be ignored. */ ) =0; - - /// Set grid elements within specified subset of the grid. - /** - Reads elements from consecutive memory locations, - starting at `buffer_ptr` - and writes them from `first_indices` to `last_indices` in each dimension. - Indices in the buffer progress in row-major order. - The buffer pointed to must contain either 4 or 8 byte FP values per element in the - subset, depending on the FP precision of the solution. - The buffer pointed to must contain the number of FP values in the specified slice, - where each FP value is the size of yk_solution::get_element_bytes(). - Since the writes proceed in row-major order, the last index is "unit-stride" - in the buffer. - Provide indices in two lists in the same order returned by get_dim_names(). - Indices are relative to the *overall* problem domain. - Index values must fall within the allocated space as returned by - get_first_rank_alloc_index() and get_last_rank_alloc_index() for - each dimension. - Indices are relative to the *overall* problem domain. - If storage has not been allocated for this grid, this will have no effect. - @returns Number of elements written. - */ - virtual idx_t - set_elements_in_slice(const void* buffer_ptr - /**< [out] Pointer to buffer where values will be read. */, - const std::vector& first_indices - /**< [in] List of initial indices, one for each grid dimension. */, - const std::vector& last_indices - /**< [in] List of final indices, one for each grid dimension. */ ) =0; - - /// Format the indices for pretty-printing. - /** - Provide indices in a list in the same order returned by get_dim_names(). - @returns A string containing the grid name and the index values. 
- */ - virtual std::string - format_indices(const std::vector& indices - /**< [in] List of indices, one for each grid dimension. */ ) const =0; - -#ifndef SWIG - /// Format the indices for pretty-printing. - /** - Provide indices in a list in the same order returned by get_dim_names(). - @note This version is not available (or needed) in SWIG-based APIs, e.g., Python. - @returns A string containing the grid name and the index values. - */ - virtual std::string - format_indices(const std::initializer_list& indices - /**< [in] List of indices, one for each grid dimension. */ ) const =0; -#endif - - /// Determine whether storage has been allocated. - /** - @returns `true` if storage has been allocated, - `false` otherwise. - */ - virtual bool - is_storage_allocated() const =0; - - /// Determine size of raw storage in bytes. - /** - @returns Minimum number of bytes required for - storage given the current domain size and padding settings. - */ - virtual idx_t - get_num_storage_bytes() const =0; - - /// Determine size of raw storage in elements. - /** - @returns get_num_storage_bytes() / yk_solution.get_element_bytes(). - */ - virtual idx_t - get_num_storage_elements() const =0; - - /* Advanced APIs for yk_grid found below are not needed for most applications. */ - - /// **[Advanced]** Set the default preferred NUMA node on which to allocate data. - /** - This value is used when allocating data for this grid. - Thus, the desired NUMA policy must be set before calling alloc_data() - or yk_solution::prepare_solution(). - */ - virtual void - set_numa_preferred(int numa_node - /**< [in] Preferred NUMA node. - See yk_solution::set_default_numa_preferred() for other options. */) =0; - - /// **[Advanced]** Get the default preferred NUMA node on which to allocate data. - /** - @returns Current setting of preferred NUMA node for this grid. - */ - virtual int - get_numa_preferred() const =0; - - /// **[Advanced]** Set the left halo size in the specified dimension. 
- /** - This value is typically set by the stencil compiler, but - this function allows you to override that value. - If the left halo is set to a value larger than the left padding size, the - left padding size will be automatically increase to accomodate it. - @note After data storage has been allocated, the left halo size - can only be set to a value less than or equal to the left padding size - in the given dimension. - */ - virtual void - set_left_halo_size(const std::string& dim - /**< [in] Name of dimension to get. - Must be one of - the names from yk_solution::get_domain_dim_names(). */, - idx_t size - /**< [in] Number of elements in the left halo. */ ) =0; - - /// **[Advanced]** Set the right halo size in the specified dimension. - /** - This value is typically set by the stencil compiler, but - this function allows you to override that value. - If the right halo is set to a value larger than the right padding size, the - right padding size will be automatically increase to accomodate it. - @note After data storage has been allocated, the right halo size - can only be set to a value less than or equal to the right padding size - in the given dimension. - */ - virtual void - set_right_halo_size(const std::string& dim - /**< [in] Name of dimension to get. - Must be one of - the names from yk_solution::get_domain_dim_names(). */, - idx_t size - /**< [in] Number of elements in the right halo. */ ) =0; - - /// **[Advanced]** Set the left and right halo sizes in the specified dimension. - /** - Alias for set_left_halo_size(dim, size); set_right_halo_size(dim, size). - */ - virtual void - set_halo_size(const std::string& dim - /**< [in] Name of dimension to get. - Must be one of - the names from yk_solution::get_domain_dim_names(). */, - idx_t size - /**< [in] Number of elements in the halo. */ ) =0; - - - /// **[Advanced]** Set the number of elements to allocate in the specified dimension. - /** - This setting is only allowed in the step dimension. 
- Typically, the allocation in the step dimension is determined by the - stencil compiler, but - this function allows you to override that value. - Allocations in other dimensions should be set indirectly - via the domain and padding sizes. - The allocation size cannot be changed after data storage - has been allocated for this grid. - */ - virtual void - set_alloc_size(const std::string& dim - /**< [in] Name of dimension to set. - Must *not* be one of - the names from yk_solution::get_domain_dim_names(). */, - idx_t size /**< [in] Number of elements to allocate. */ ) =0; - - /// **[Advanced]** Set the first index of a specified miscellaneous dimension. - /** - Sets the first allowed index in a non-step and non-domain dimension. - After calling this function, the last allowed index will be the first index - as set by this function plus the allocation size set by set_alloc_size() - minus one. - */ - virtual void - set_first_misc_index(const std::string& dim - /**< [in] Name of dimension to get. Must be one of - the names from yk_solution::get_misc_dim_names(). */, - idx_t idx /**< [in] New value for first index. - May be negative. */ ) =0; - - /// **[Advanced]** Get the first accessible index in this grid in this rank in the specified dimension. - /** - This returns the first *overall* index allowed in this grid. - This element may be in the domain, left halo, or extra left padding area. - This function is only for checking the legality of an index. - @returns First allowed index in this grid. - */ - virtual idx_t - get_first_rank_alloc_index(const std::string& dim - /**< [in] Name of dimension to get. - Must be one of - the names from yk_solution::get_domain_dim_names(). */ ) const =0; - - /// **[Advanced]** Get the last accessible index in this grid in this rank in the specified dimension. - /** - This returns the last *overall* index allowed in this grid. - This element may be in the domain, right halo, or extra right padding area. 
- This function is only for checking the legality of an index. - @returns Last allowed index in this grid. - */ - virtual idx_t - get_last_rank_alloc_index(const std::string& dim - /**< [in] Name of dimension to get. - Must be one of - the names from yk_solution::get_domain_dim_names(). */ ) const =0; - - /// **[Advanced]** Explicitly allocate data-storage memory for this grid. - /** - Amount of allocation is calculated based on domain, padding, and - step-dimension allocation sizes. - Any pre-existing storage will be released before allocation as via release_storage(). - See allocation options in the "Detailed Description" for \ref yk_grid. - */ - virtual void - alloc_storage() =0; - - /// **[Advanced]** Explicitly release any allocated data-storage for this grid. - /** - This will release storage allocated via any of the options - described in the "Detailed Description" for \ref yk_grid. - If the data was shared between two or more grids, the data will - be retained by the remaining grids. - */ - virtual void - release_storage() =0; - - /// **[Advanced]** Determines whether storage layout is the same as another grid. - /** - In order for the storage layout to be identical, the following - must be the same: - - Number of dimensions. - - Name of each dimension, in the same order. - - Allocation size in each dimension. - - Rank domain size in each domain dimension. - - Padding size in each domain dimension. - - The following do not have to be identical: - - Halo size. - - @returns `true` if storage for this grid has the same layout as - `other` or `false` otherwise. - */ - virtual bool - is_storage_layout_identical(const yk_grid_ptr other) const =0; - - /// **[Advanced]** Use existing data-storage from specified grid. - /** - This is an alternative to allocating data storage via - yk_solution::prepare_solution() or alloc_storage(). - In this case, data from a grid in this or another solution will be shared with - this grid. 
- In order to successfully share storage, the following conditions must hold: - - The source grid must already have storage allocated. - - The two grids must have the same dimensions in the same order. - - The two grids must have the same domain sizes in all domain dimensions. - - The two grids must have the same allocation sizes in non-domain dimensions. - - The required padding size of this grid must be less than or - equal to the actual padding size of the source grid in all domain - dimensions. The required padding size of this grid will be equal to - or greater than its halo size. It is not strictly necessary that the - two grids have the same halo sizes, but that is a sufficient condition. - - Any pre-existing storage will be released before allocation as via release_storage(). - The padding size(s) of this grid will be set to that of the source grid. - After calling share_storage(), changes in one grid via set_all_elements() - or set_element() will be visible in the other grid. - - See allocation options and more information about grid sizes - in the "Detailed Description" for \ref yk_grid. - */ - virtual void - share_storage(yk_grid_ptr source - /**< [in] Grid from which storage will be shared. */) =0; - - /// **[Advanced]** Get pointer to raw data storage buffer. - /** - The following assumptions about the contents of data are safe: - - Each FP element starts at a number of bytes from the beginning - of the buffer which is a multiple of yk_solution::get_element_bytes(). - - All the FP elements will be located within get_num_storage_bytes() - bytes from the beginning of the buffer. - - A call to set_all_elements_same() will initialize all elements - within get_num_storage_bytes() bytes from the beginning of the buffer. - - If is_storage_layout_identical() returns `true` between this - and some other grid, any given element index applied to both grids - will refer to an element at the same offset into their respective - data buffers. 
- - Thus, - - You can perform element-wise unary mathematical operations on - all elements of a grid via its raw buffer, e.g., add some constant - value to all elements. - - If the layouts of two grids are identical, you can use their - raw buffers to copy or compare the grid contents for equality or - perform element-wise binary mathematical operations on them, - e.g., add all elements from one grid to another. - - The following assumptions are not safe: - - Any expectations regarding the relationship between an element - index and that element's offset from the beginning of the buffer - such as row-major or column-major layout. - - All elements in the buffer are part of the rank domain or halo. - - Thus, - - You should not perform any operations dependent on - the logical indices of any element via raw buffer, e.g., matrix - multiply. - - @returns Pointer to raw data storage if is_storage_allocated() - returns `true` or NULL otherwise. - */ - virtual void* get_raw_storage_buffer() =0; - - /* Deprecated APIs for yk_grid found below should be avoided. - Use the more explicit form found in the documentation. */ - - /// **[Deprecated]** Get the left halo size in the specified dimension. - /** - Alias for get_left_halo_size(dim, size). - @returns Elements in halo in given dimension before the domain. - */ - virtual idx_t - get_halo_size(const std::string& dim - /**< [in] Name of dimension to get. - Must be one of - the names from yk_solution::get_domain_dim_names(). */ ) const =0; - - /// **[Deprecated]** Get the left padding in the specified dimension. - /** - Alias for get_left_pad_size(dim). - @returns Elements in left padding in given dimension. - */ - virtual idx_t - get_pad_size(const std::string& dim - /**< [in] Name of dimension to get. - Must be one of - the names from yk_solution::get_domain_dim_names(). */ ) const =0; - - /// **[Deprecated]** Get the extra left padding in the specified dimension. - /** - Alias for get_extra_left_pad_size(dim). 
- @returns Elements in padding in given dimension before the - left halo region. - */ - virtual idx_t - get_extra_pad_size(const std::string& dim - /**< [in] Name of dimension to get. - Must be one of - the names from yk_solution::get_domain_dim_names(). */ ) const =0; - - }; - - } // namespace yask. #endif diff --git a/include/yk_grid_api.hpp b/include/yk_grid_api.hpp new file mode 100644 index 00000000..d6245e9e --- /dev/null +++ b/include/yk_grid_api.hpp @@ -0,0 +1,942 @@ +/***************************************************************************** + +YASK: Yet Another Stencil Kernel +Copyright (c) 2014-2018, Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to +deal in the Software without restriction, including without limitation the +rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +sell copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +* The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +IN THE SOFTWARE. + +*****************************************************************************/ + +///////// API for the YASK stencil kernel grid. //////////// + +// This file uses Doxygen 1.8 markup for API documentation-generation. +// See http://www.stack.nl/~dimitri/doxygen. 
+/** @file yk_grid_api.hpp */ + +#ifndef YK_GRID_API +#define YK_GRID_API + +#include "yask_kernel_api.hpp" + +namespace yask { + + /// A run-time grid. + /** + "Grid" is a generic term for any n-dimensional array. A 0-dim grid + is a scalar, a 1-dim grid is an array, etc. A run-time grid contains + data, unlike yc_grid, a compile-time grid variable. + + Typically, access to each grid is obtained via yk_solution::get_grid(). + You may also use yk_solution::new_grid() or yk_solution::new_fixed_size_grid() + if you need a grid that is not part of the pre-defined solution. + + Each dimension of a grid is one of the following: + - The *step* dimension, typically time ("t"), as identified via yk_solution::get_step_dim_name(). + - A *domain* dimension, typically a spatial dimension such as "x" or "y", + as identified via yk_solution:get_domain_dim_names(). + - A *miscellaneous* dimension, which is any dimension that is not a domain or step dimension, + as identified via yk_solution:get_misc_dim_names(). + + In the step dimension, there is no fixed domain size, and no + specified first or last index. + However, there is an allocation size, which is the number of values in the step + dimension that are stored in memory. + Step-dimension indices "wrap-around" within this allocation to reuse memory. + For example, if the step dimension is "t", and the t-dimension allocation size is 3, + then t=-2, t=0, t=3, t=6, ..., t=303, etc. would all alias to the same spatial values in memory. + + In each domain dimension, + grid sizes include the following components: + - The *domain* is the elements to which the stencils are applied. + - The *left padding* is all the elements before the domain and includes the left halo. + - The *right padding* is all the elements before the domain and includes the right halo. + - The *left halo* is the elements just before the domain which must be + copied between preceding ranks during halo exchanges. 
The left halo is contained within the left padding. + - The *right halo* is the elements just after the domain which must be + copied between following ranks during halo exchanges. The right halo is contained within the right padding. + - The *extra left padding* is the elements before the domain and left halo + and thus does not include the left halo. + - The *extra right padding* is the elements after the domain and right halo + and thus does not include the right halo. + - The *allocation* includes the left padding, domain, and right padding. + + Domain sizes specified via yk_solution::set_rank_domain_size() apply to each MPI rank. + Visually, in each of the domain dimensions, these sizes are related as follows + in each rank: + +
extra left padding left halo domain right halo extra right padding +
left padding
right padding
+
allocation
+
+ + If MPI is not enabled, a rank's domain is equivalent to the entire problem size. + If MPI is enabled, the domains of the ranks are logically abutted to create the + overall problem domain in each dimension: + +
extra left padding of rank A halo of rank A domain of rank A domain of rank B + ... domain of rank Z halo of rank Z extra right padding of rank Z +
left padding of rank A
+
overall problem domain
+
right padding of rank Z
+
+ The intermediate halos and paddings also exist, but are not shown in the above diagram. + The halos overlap the domains of adjacent ranks. + For example, the left halo of rank B in the diagram would overlap the domain of rank A. + Data in these overlapped regions is exchanged as needed during stencil application + to maintain a consistent values as if there was only one rank. + + In each miscellaneous dimension, there is only an allocation size, + and there is no wrap-around as in the step dimension. + Each index must be between its first and last allowed value. + + All sizes are expressed in numbers of elements. + Each element may be a 4-byte (single precision) + or 8-byte (double precision) floating-point value as returned by + yk_solution::get_element_bytes(). + + Initially, a grid is not assigned any allocated storage. + This is done to allow modification of domain, padding, and other allocation sizes + before allocation. + Once the allocation sizes have been set in all dimensions, the data storage itself may + be allocated. + This can be done in any of the following ways: + - Storage for all grids without data storage will be automatically allocated when + prepare_solution() is called. + - Storage for a specific grid may be allocated before calling prepare_solution() + via yk_grid::alloc_storage(). + - **[Advanced]** Storage for a specific grid may be shared with another grid with + existing storage via yk_grid::share_storage(). + + @note The domain index arguments to the \ref yk_grid functions that require indices + are *always* relative to the overall problem; they are *not* relative to the current rank. + The first and last overall-problem index that lies within a rank can be + retrieved via yk_solution::get_first_rank_domain_index() and + yk_solution::get_last_rank_domain_index(), respectively. 
+ The first and last accessible index that lies within a rank for a given grid can be + retrieved via yk_grid::get_first_rank_alloc_index() and + yk_grid::get_last_rank_alloc_index(), respectively. + Also, index arguments are always inclusive. + Specifically, for functions that return or require a "last" index, that + index indicates the last one in the relevant range, i.e., *not* one past the last value + (this is more like Fortran and Perl than Python and Lisp). + */ + class yk_grid { + public: + virtual ~yk_grid() {} + + /// Get the name of the grid. + /** + @returns String containing name provided via yc_solution::new_grid(). + */ + virtual const std::string& get_name() const =0; + + /// Determine whether this grid is automatically resized based on the solution. + /** + @returns `true` if this grid was created via yk_solution::new_fixed_size_grid() + or `false` otherwise. + */ + virtual bool is_fixed_size() const =0; + + /// Get the number of dimensions used in this grid. + /** + This may include domain, step, and/or miscellaneous dimensions. + @returns Number of dimensions created via yc_solution::new_grid(), + yk_solution::new_grid(), or yk_solution::new_fixed_size_grid(). + */ + virtual int get_num_dims() const =0; + + /// Get all the dimensions in this grid. + /** + This may include domain, step, and/or miscellaneous dimensions. + @returns List of names of all the dimensions. + */ + virtual std::vector + get_dim_names() const =0; + + /// Determine whether specified dimension exists in this grid. + /** + @returns `true` if dimension exists (including step-dimension), + `false` otherwise. + */ + virtual bool + is_dim_used(const std::string& dim) const =0; + + /// Get the domain size for this rank. + /** + @returns The same value as yk_solution::get_rank_domain_size() if + is_fixed_size() returns `false` or the fixed sized provided via + yk_solution::new_fixed_size_grid() otherwise. 
+ */ + virtual idx_t + get_rank_domain_size(const std::string& dim + /**< [in] Name of dimension to get. Must be one of + the names from yk_solution::get_domain_dim_names(). */) const =0; + + /// Get the first index of the sub-domain in this rank in the specified dimension. + /** + @note This function should be called only *after* calling prepare_solution() + because prepare_solution() assigns this rank's position in the problem domain. + @returns The same value as yk_solution::get_first_rank_domain_index() if + is_fixed_size() returns `false` or zero (0) otherwise. + */ + virtual idx_t + get_first_rank_domain_index(const std::string& dim + /**< [in] Name of dimension to get. Must be one of + the names from yk_solution::get_domain_dim_names(). */ ) const =0; + + /// Get the last index of the sub-domain in this rank in the specified dimension. + /** + @note This function should be called only *after* calling prepare_solution() + because prepare_solution() assigns this rank's position in the problem domain. + @returns The same value as yk_solution::get_last_rank_domain_index() if + is_fixed_size() returns `false` or one less than the fixed sized provided via + yk_solution::new_fixed_size_grid() otherwise. + */ + virtual idx_t + get_last_rank_domain_index(const std::string& dim + /**< [in] Name of dimension to get. Must be one of + the names from yk_solution::get_domain_dim_names(). */ ) const =0; + + /// Get the left halo size in the specified dimension. + /** + This value is typically set by the stencil compiler. + @returns Elements in halo in given dimension before the domain. + */ + virtual idx_t + get_left_halo_size(const std::string& dim + /**< [in] Name of dimension to get. + Must be one of + the names from yk_solution::get_domain_dim_names(). */ ) const =0; + + /// Get the right halo size in the specified dimension. + /** + This value is typically set by the stencil compiler. + @returns Elements in halo in given dimension after the domain. 
+ */ + virtual idx_t + get_right_halo_size(const std::string& dim + /**< [in] Name of dimension to get. + Must be one of + the names from yk_solution::get_domain_dim_names(). */ ) const =0; + + /// Get the first index of the left halo in this rank in the specified dimension. + /** + @note This function should be called only *after* calling prepare_solution() + because prepare_solution() assigns this rank's position in the problem domain. + @returns The first index of left halo in this rank or + the same value as yk_grid::get_first_rank_domain_index() + if the left halo has zero size. + */ + virtual idx_t + get_first_rank_halo_index(const std::string& dim + /**< [in] Name of dimension to get. Must be one of + the names from yk_solution::get_domain_dim_names(). */ ) const =0; + + /// Get the last index of the right halo in this rank in the specified dimension. + /** + @note This function should be called only *after* calling prepare_solution() + because prepare_solution() assigns this rank's position in the problem domain. + @returns The last index of right halo in this rank or + the same value as yk_grid::get_last_rank_domain_index() + if the right halo has zero size. + */ + virtual idx_t + get_last_rank_halo_index(const std::string& dim + /**< [in] Name of dimension to get. Must be one of + the names from yk_solution::get_domain_dim_names(). */ ) const =0; + + /// Get the left padding in the specified dimension. + /** + The left padding is the memory allocated before + the domain in a given dimension. + The left padding size includes the left halo size. + The value may be slightly + larger than that provided via set_min_pad_size(), etc. due to rounding. + @returns Elements in left padding in given dimension. + */ + virtual idx_t + get_left_pad_size(const std::string& dim + /**< [in] Name of dimension to get. + Must be one of + the names from yk_solution::get_domain_dim_names(). */ ) const =0; + + /// Get the right padding in the specified dimension. 
+ /** + The right padding is the memory allocated after + the domain in a given dimension. + The right padding size includes the right halo size. + The value may be slightly + larger than that provided via set_min_pad_size(), etc. due to rounding. + @returns Elements in right padding in given dimension. + */ + virtual idx_t + get_right_pad_size(const std::string& dim + /**< [in] Name of dimension to get. + Must be one of + the names from yk_solution::get_domain_dim_names(). */ ) const =0; + + /// Get the extra left padding in the specified dimension. + /** + The *extra* padding size is the left padding size minus the left halo size. + @returns Elements in padding in given dimension before the + left halo region. + */ + virtual idx_t + get_left_extra_pad_size(const std::string& dim + /**< [in] Name of dimension to get. + Must be one of + the names from yk_solution::get_domain_dim_names(). */ ) const =0; + + /// Get the extra right padding in the specified dimension. + /** + The *extra* padding size is the right padding size minus the right halo size. + @returns Elements in padding in given dimension after the + right halo region. + */ + virtual idx_t + get_right_extra_pad_size(const std::string& dim + /**< [in] Name of dimension to get. + Must be one of + the names from yk_solution::get_domain_dim_names(). */ ) const =0; + + /// Set the padding in the specified dimension. + /** + This sets the minimum number of elements in this grid + in both left and right pads. + This padding area can be used for required halo regions. + + The *actual* padding size will be the largest of the following values, + additionally rounded up based on the vector-folding dimensions + and/or cache-line alignment: + - Halo size. + - Value provided by any of the pad-size setting functions. + + The padding size cannot be changed after data storage + has been allocated for this grid; attempted changes to the pad size + will be ignored. 
+ In addition, once a grid's padding is set, it cannot be reduced, only increased. + Call get_pad_size() to determine the actual padding size for the grid. + See the "Detailed Description" for \ref yk_grid for information on grid sizes. + */ + virtual void + set_min_pad_size(const std::string& dim + /**< [in] Name of dimension to set. + Must be one of + the names from yk_solution::get_domain_dim_names(). */, + idx_t size + /**< [in] Minimum number of elements to allocate beyond the domain size. */ ) =0; + + /// Get the storage allocation in the specified dimension. + /** + For the step dimension, this is the specified allocation and + does not typically depend on the number of steps evaluated. + For the non-step dimensions, this includes the domain and padding sizes. + See the "Detailed Description" for \ref yk_grid for information on grid sizes. + @returns allocation in number of elements (not bytes). + */ + virtual idx_t + get_alloc_size(const std::string& dim + /**< [in] Name of dimension to get. */ ) const =0; + + /// Get the first index of a specified miscellaneous dimension. + /** + @returns the first allowed index in a non-step and non-domain dimension. + */ + virtual idx_t + get_first_misc_index(const std::string& dim + /**< [in] Name of dimension to get. Must be one of + the names from yk_solution::get_misc_dim_names(). */ ) const =0; + + /// Get the last index of a specified miscellaneous dimension. + /** + @returns the last allowed index in a non-step and non-domain dimension. + */ + virtual idx_t + get_last_misc_index(const std::string& dim + /**< [in] Name of dimension to get. Must be one of + the names from yk_solution::get_misc_dim_names(). */ ) const =0; + + /// Determine whether the given indices are allocated in this rank. + /** + Provide indices in a list in the same order returned by get_dim_names(). + Indices are relative to the *overall* problem domain. 
+ @returns `true` if index values fall within the allocated space as returned by + get_first_rank_alloc_index() and get_last_rank_alloc_index() for + each dimension; `false` otherwise. + */ + virtual bool + is_element_allocated(const std::vector& indices + /**< [in] List of indices, one for each grid dimension. */ ) const =0; + +#ifndef SWIG + /// Determine whether the given indices are allocated in this rank. + /** + Provide indices in a list in the same order returned by get_dim_names(). + Indices are relative to the *overall* problem domain. + @note This version is not available (or needed) in SWIG-based APIs, e.g., Python. + @returns `true` if index values fall within the allocated space as returned by + get_first_rank_alloc_index() and get_last_rank_alloc_index() for + each dimension; `false` otherwise. + */ + virtual bool + is_element_allocated(const std::initializer_list& indices + /**< [in] List of indices, one for each grid dimension. */ ) const =0; +#endif + + /// Get the value of one grid element. + /** + Provide indices in a list in the same order returned by get_dim_names(). + Indices are relative to the *overall* problem domain. + Index values must fall within the allocated space as returned by + get_first_rank_alloc_index() and get_last_rank_alloc_index() for + each dimension. + @returns value in grid at given multi-dimensional location. + */ + virtual double + get_element(const std::vector& indices + /**< [in] List of indices, one for each grid dimension. */ ) const =0; + +#ifndef SWIG + /// Get the value of one grid element. + /** + Provide indices in a list in the same order returned by get_dim_names(). + Indices are relative to the *overall* problem domain. + Index values must fall within the allocated space as returned by + get_first_rank_alloc_index() and get_last_rank_alloc_index() for + each dimension. 
+ @note The return value is a double-precision floating-point value, but + it will be converted from a single-precision if + yk_solution::get_element_bytes() returns 4. + @note This version is not available (or needed) in SWIG-based APIs, e.g., Python. + @returns value in grid at given multi-dimensional location. + */ + virtual double + get_element(const std::initializer_list& indices + /**< [in] List of indices, one for each grid dimension. */ ) const =0; +#endif + + /// Get grid elements within specified subset of the grid. + /** + Reads all elements from `first_indices` to `last_indices` in each dimension + and writes them to consecutive memory locations in the buffer. + Indices in the buffer progress in row-major order. + The buffer pointed to must contain the number of bytes equal to + yk_solution::get_element_bytes() multiplied by the number of + elements in the specified slice. + Since the reads proceed in row-major order, the last index is "unit-stride" + in the buffer. + Provide indices in two lists in the same order returned by get_dim_names(). + Indices are relative to the *overall* problem domain. + Index values must fall within the allocated space as returned by + get_first_rank_alloc_index() and get_last_rank_alloc_index() for + each dimension. + @returns Number of elements read. + */ + virtual idx_t + get_elements_in_slice(void* buffer_ptr + /**< [out] Pointer to buffer where values will be written. */, + const std::vector& first_indices + /**< [in] List of initial indices, one for each grid dimension. */, + const std::vector& last_indices + /**< [in] List of final indices, one for each grid dimension. */ ) const =0; + + /// Set the value of one grid element. + /** + Provide indices in a list in the same order returned by get_dim_names(). + Indices are relative to the *overall* problem domain. + Index values must fall within the allocated space as returned by + get_first_rank_alloc_index() and get_last_rank_alloc_index() for + each dimension. 
+ @note The parameter value is a double-precision floating-point value, but + it will be converted to single-precision if + yk_solution::get_element_bytes() returns 4. + If storage has not been allocated for this grid, this will have no effect. + @returns Number of elements set. + */ + virtual idx_t + set_element(double val /**< [in] Element in grid will be set to this. */, + const std::vector& indices + /**< [in] List of indices, one for each grid dimension. */, + bool strict_indices = false + /**< [in] If true, indices must be within domain or padding. + If false, indices outside of domain and padding result + in no change to grid. */ ) =0; + +#ifndef SWIG + /// Set the value of one grid element. + /** + Provide the number of indices equal to the number of dimensions in the grid. + Indices beyond that will be ignored. + Indices are relative to the *overall* problem domain. + If any index values fall outside of the allocated space as returned by + get_first_rank_alloc_index() and get_last_rank_alloc_index() for + each dimension, this will have no effect. + @note The parameter value is a double-precision floating-point value, but + it will be converted to single-precision if + yk_solution::get_element_bytes() returns 4. + If storage has not been allocated for this grid, this will have no effect. + @note This version is not available (or needed) in SWIG-based APIs, e.g., Python. + @returns Number of elements set. + */ + virtual idx_t + set_element(double val /**< [in] Element in grid will be set to this. */, + const std::initializer_list& indices + /**< [in] List of indices, one for each grid dimension. */, + bool strict_indices = false + /**< [in] If true, indices must be within domain or padding. + If false, indices outside of domain and padding result + in no change to grid. */ ) =0; +#endif + /// Atomically add to the value of one grid element. + /** + Provide indices in a list in the same order returned by get_dim_names(). 
+ Indices are relative to the *overall* problem domain. + Index values must fall within the allocated space as returned by + get_first_rank_alloc_index() and get_last_rank_alloc_index() for + each dimension if `strict_indices` is set to true. + Updates are OpenMP atomic, meaning that this function can be called by + several OpenMP threads without causing a race condition. + @note The parameter value is a double-precision floating-point value, but + it will be converted to single-precision if + yk_solution::get_element_bytes() returns 4. + If storage has not been allocated for this grid, this will have no effect. + @returns Number of elements updated. + */ + virtual idx_t + add_to_element(double val /**< [in] This value will be added to element in grid. */, + const std::vector& indices + /**< [in] List of indices, one for each grid dimension. */, + bool strict_indices = false + /**< [in] If true, indices must be within domain or padding. + If false, indices outside of domain and padding result + in no change to grid. */ ) =0; + +#ifndef SWIG + /// Atomically add to the value of one grid element. + /** + Provide the number of indices equal to the number of dimensions in the grid. + Indices beyond that will be ignored. + Indices are relative to the *overall* problem domain. + Index values must fall within the allocated space as returned by + get_first_rank_alloc_index() and get_last_rank_alloc_index() for + each dimension if `strict_indices` is set to true. + Updates are OpenMP atomic, meaning that this function can be called by + several OpenMP threads without causing a race condition. + @note The parameter value is a double-precision floating-point value, but + it will be converted to single-precision if + yk_solution::get_element_bytes() returns 4. + If storage has not been allocated for this grid, this will have no effect. + @note This version is not available (or needed) in SWIG-based APIs, e.g., Python. + @returns Number of elements set. 
+ */ + virtual idx_t + add_to_element(double val /**< [in] This value will be added to element in grid. */, + const std::initializer_list& indices + /**< [in] List of indices, one for each grid dimension. */, + bool strict_indices = false + /**< [in] If true, indices must be within domain or padding. + If false, indices outside of domain and padding result + in no change to grid. */ ) =0; +#endif + + /// Initialize all grid elements to the same value. + /** + Sets all allocated elements, including those in the domain and padding + area to the same specified value. + @note The parameter is a double-precision floating-point value, but + it will be converted to single-precision if + yk_solution::get_element_bytes() returns 4. + @note If storage has not been allocated via yk_solution::prepare_solution(), + this will have no effect. + */ + virtual void + set_all_elements_same(double val /**< [in] All elements will be set to this. */ ) =0; + + /// Initialize grid elements within specified subset of the grid to the same value. + /** + Sets all elements from `first_indices` to `last_indices` in each dimension to the + specified value. + Provide indices in two lists in the same order returned by get_dim_names(). + Indices are relative to the *overall* problem domain. + Index values must fall within the allocated space as returned by + get_first_rank_alloc_index() and get_last_rank_alloc_index() for + each dimension. + If storage has not been allocated for this grid, this will have no effect. + @returns Number of elements set. + */ + virtual idx_t + set_elements_in_slice_same(double val /**< [in] All elements in the slice will be set to this. */, + const std::vector& first_indices + /**< [in] List of initial indices, one for each grid dimension. */, + const std::vector& last_indices + /**< [in] List of final indices, one for each grid dimension. */, + bool strict_indices = false + /**< [in] If true, indices must be within domain or padding. 
+ If false, only elements within the allocation of this grid
+ will be set, and elements outside will be ignored. */ ) =0;
+
+ /// Set grid elements within specified subset of the grid.
+ /**
+ Reads elements from consecutive memory locations,
+ starting at `buffer_ptr`
+ and writes them from `first_indices` to `last_indices` in each dimension.
+ Indices in the buffer progress in row-major order.
+ The buffer pointed to must contain either 4 or 8 byte FP values per element in the
+ subset, depending on the FP precision of the solution.
+ The buffer pointed to must contain the number of FP values in the specified slice,
+ where each FP value is the size of yk_solution::get_element_bytes().
+ Since the writes proceed in row-major order, the last index is "unit-stride"
+ in the buffer.
+ Provide indices in two lists in the same order returned by get_dim_names().
+ Indices are relative to the *overall* problem domain.
+ Index values must fall within the allocated space as returned by
+ get_first_rank_alloc_index() and get_last_rank_alloc_index() for
+ each dimension.
+ If storage has not been allocated for this grid, this will have no effect.
+ @returns Number of elements written.
+ */
+ virtual idx_t
+ set_elements_in_slice(const void* buffer_ptr
+ /**< [in] Pointer to buffer from which values will be read. */,
+ const std::vector<idx_t>& first_indices
+ /**< [in] List of initial indices, one for each grid dimension. */,
+ const std::vector<idx_t>& last_indices
+ /**< [in] List of final indices, one for each grid dimension. */ ) =0;
+
+ /// Format the indices for pretty-printing.
+ /**
+ Provide indices in a list in the same order returned by get_dim_names().
+ @returns A string containing the grid name and the index values.
+ */
+ virtual std::string
+ format_indices(const std::vector<idx_t>& indices
+ /**< [in] List of indices, one for each grid dimension. */ ) const =0;
+
+#ifndef SWIG
+ /// Format the indices for pretty-printing.
+ /** + Provide indices in a list in the same order returned by get_dim_names(). + @note This version is not available (or needed) in SWIG-based APIs, e.g., Python. + @returns A string containing the grid name and the index values. + */ + virtual std::string + format_indices(const std::initializer_list& indices + /**< [in] List of indices, one for each grid dimension. */ ) const =0; +#endif + + /// Determine whether storage has been allocated. + /** + @returns `true` if storage has been allocated, + `false` otherwise. + */ + virtual bool + is_storage_allocated() const =0; + + /// Determine size of raw storage in bytes. + /** + @returns Minimum number of bytes required for + storage given the current domain size and padding settings. + */ + virtual idx_t + get_num_storage_bytes() const =0; + + /// Determine size of raw storage in elements. + /** + @returns get_num_storage_bytes() / yk_solution.get_element_bytes(). + */ + virtual idx_t + get_num_storage_elements() const =0; + + /* Advanced APIs for yk_grid found below are not needed for most applications. */ + + /// **[Advanced]** Set the default preferred NUMA node on which to allocate data. + /** + This value is used when allocating data for this grid. + Thus, the desired NUMA policy must be set before calling alloc_data() + or yk_solution::prepare_solution(). + */ + virtual void + set_numa_preferred(int numa_node + /**< [in] Preferred NUMA node. + See yk_solution::set_default_numa_preferred() for other options. */) =0; + + /// **[Advanced]** Get the default preferred NUMA node on which to allocate data. + /** + @returns Current setting of preferred NUMA node for this grid. + */ + virtual int + get_numa_preferred() const =0; + + /// **[Advanced]** Set the left halo size in the specified dimension. + /** + This value is typically set by the stencil compiler, but + this function allows you to override that value. 
+ If the left halo is set to a value larger than the left padding size, the
+ left padding size will be automatically increased to accommodate it.
+ @note After data storage has been allocated, the left halo size
+ can only be set to a value less than or equal to the left padding size
+ in the given dimension.
+ */
+ virtual void
+ set_left_halo_size(const std::string& dim
+ /**< [in] Name of dimension to set.
+ Must be one of
+ the names from yk_solution::get_domain_dim_names(). */,
+ idx_t size
+ /**< [in] Number of elements in the left halo. */ ) =0;
+
+ /// **[Advanced]** Set the right halo size in the specified dimension.
+ /**
+ This value is typically set by the stencil compiler, but
+ this function allows you to override that value.
+ If the right halo is set to a value larger than the right padding size, the
+ right padding size will be automatically increased to accommodate it.
+ @note After data storage has been allocated, the right halo size
+ can only be set to a value less than or equal to the right padding size
+ in the given dimension.
+ */
+ virtual void
+ set_right_halo_size(const std::string& dim
+ /**< [in] Name of dimension to set.
+ Must be one of
+ the names from yk_solution::get_domain_dim_names(). */,
+ idx_t size
+ /**< [in] Number of elements in the right halo. */ ) =0;
+
+ /// **[Advanced]** Set the left and right halo sizes in the specified dimension.
+ /**
+ Alias for set_left_halo_size(dim, size); set_right_halo_size(dim, size).
+ */
+ virtual void
+ set_halo_size(const std::string& dim
+ /**< [in] Name of dimension to set.
+ Must be one of
+ the names from yk_solution::get_domain_dim_names(). */,
+ idx_t size
+ /**< [in] Number of elements in the halo. */ ) =0;
+
+
+ /// **[Advanced]** Set the number of elements to allocate in the specified dimension.
+ /**
+ This setting is only allowed in the step dimension.
+ Typically, the allocation in the step dimension is determined by the + stencil compiler, but + this function allows you to override that value. + Allocations in other dimensions should be set indirectly + via the domain and padding sizes. + The allocation size cannot be changed after data storage + has been allocated for this grid. + */ + virtual void + set_alloc_size(const std::string& dim + /**< [in] Name of dimension to set. + Must *not* be one of + the names from yk_solution::get_domain_dim_names(). */, + idx_t size /**< [in] Number of elements to allocate. */ ) =0; + + /// **[Advanced]** Set the first index of a specified miscellaneous dimension. + /** + Sets the first allowed index in a non-step and non-domain dimension. + After calling this function, the last allowed index will be the first index + as set by this function plus the allocation size set by set_alloc_size() + minus one. + */ + virtual void + set_first_misc_index(const std::string& dim + /**< [in] Name of dimension to get. Must be one of + the names from yk_solution::get_misc_dim_names(). */, + idx_t idx /**< [in] New value for first index. + May be negative. */ ) =0; + + /// **[Advanced]** Get the first accessible index in this grid in this rank in the specified dimension. + /** + This returns the first *overall* index allowed in this grid. + This element may be in the domain, left halo, or extra left padding area. + This function is only for checking the legality of an index. + @returns First allowed index in this grid. + */ + virtual idx_t + get_first_rank_alloc_index(const std::string& dim + /**< [in] Name of dimension to get. + Must be one of + the names from yk_solution::get_domain_dim_names(). */ ) const =0; + + /// **[Advanced]** Get the last accessible index in this grid in this rank in the specified dimension. + /** + This returns the last *overall* index allowed in this grid. + This element may be in the domain, right halo, or extra right padding area. 
+ This function is only for checking the legality of an index. + @returns Last allowed index in this grid. + */ + virtual idx_t + get_last_rank_alloc_index(const std::string& dim + /**< [in] Name of dimension to get. + Must be one of + the names from yk_solution::get_domain_dim_names(). */ ) const =0; + + /// **[Advanced]** Explicitly allocate data-storage memory for this grid. + /** + Amount of allocation is calculated based on domain, padding, and + step-dimension allocation sizes. + Any pre-existing storage will be released before allocation as via release_storage(). + See allocation options in the "Detailed Description" for \ref yk_grid. + */ + virtual void + alloc_storage() =0; + + /// **[Advanced]** Explicitly release any allocated data-storage for this grid. + /** + This will release storage allocated via any of the options + described in the "Detailed Description" for \ref yk_grid. + If the data was shared between two or more grids, the data will + be retained by the remaining grids. + */ + virtual void + release_storage() =0; + + /// **[Advanced]** Determines whether storage layout is the same as another grid. + /** + In order for the storage layout to be identical, the following + must be the same: + - Number of dimensions. + - Name of each dimension, in the same order. + - Allocation size in each dimension. + - Rank domain size in each domain dimension. + - Padding size in each domain dimension. + + The following do not have to be identical: + - Halo size. + + @returns `true` if storage for this grid has the same layout as + `other` or `false` otherwise. + */ + virtual bool + is_storage_layout_identical(const yk_grid_ptr other) const =0; + + /// **[Advanced]** Use existing data-storage from specified grid. + /** + This is an alternative to allocating data storage via + yk_solution::prepare_solution() or alloc_storage(). + In this case, data from a grid in this or another solution will be shared with + this grid. 
+ In order to successfully share storage, the following conditions must hold: + - The source grid must already have storage allocated. + - The two grids must have the same dimensions in the same order. + - The two grids must have the same domain sizes in all domain dimensions. + - The two grids must have the same allocation sizes in non-domain dimensions. + - The required padding size of this grid must be less than or + equal to the actual padding size of the source grid in all domain + dimensions. The required padding size of this grid will be equal to + or greater than its halo size. It is not strictly necessary that the + two grids have the same halo sizes, but that is a sufficient condition. + + Any pre-existing storage will be released before allocation as via release_storage(). + The padding size(s) of this grid will be set to that of the source grid. + After calling share_storage(), changes in one grid via set_all_elements() + or set_element() will be visible in the other grid. + + See allocation options and more information about grid sizes + in the "Detailed Description" for \ref yk_grid. + */ + virtual void + share_storage(yk_grid_ptr source + /**< [in] Grid from which storage will be shared. */) =0; + + /// **[Advanced]** Get pointer to raw data storage buffer. + /** + The following assumptions about the contents of data are safe: + - Each FP element starts at a number of bytes from the beginning + of the buffer which is a multiple of yk_solution::get_element_bytes(). + - All the FP elements will be located within get_num_storage_bytes() + bytes from the beginning of the buffer. + - A call to set_all_elements_same() will initialize all elements + within get_num_storage_bytes() bytes from the beginning of the buffer. + - If is_storage_layout_identical() returns `true` between this + and some other grid, any given element index applied to both grids + will refer to an element at the same offset into their respective + data buffers. 
+
+ Thus,
+ - You can perform element-wise unary mathematical operations on
+ all elements of a grid via its raw buffer, e.g., add some constant
+ value to all elements.
+ - If the layouts of two grids are identical, you can use their
+ raw buffers to copy or compare the grid contents for equality or
+ perform element-wise binary mathematical operations on them,
+ e.g., add all elements from one grid to another.
+
+ The following assumptions are not safe:
+ - Any expectations regarding the relationship between an element
+ index and that element's offset from the beginning of the buffer
+ such as row-major or column-major layout.
+ - All elements in the buffer are part of the rank domain or halo.
+
+ Thus,
+ - You should not perform any operations dependent on
+ the logical indices of any element via raw buffer, e.g., matrix
+ multiply.
+
+ @returns Pointer to raw data storage if is_storage_allocated()
+ returns `true` or NULL otherwise.
+ */
+ virtual void* get_raw_storage_buffer() =0;
+
+ /* Deprecated APIs for yk_grid found below should be avoided.
+ Use the more explicit form found in the documentation. */
+
+ /// **[Deprecated]** Get the left halo size in the specified dimension.
+ /**
+ Alias for get_left_halo_size(dim).
+ @returns Elements in halo in given dimension before the domain.
+ */
+ virtual idx_t
+ get_halo_size(const std::string& dim
+ /**< [in] Name of dimension to get.
+ Must be one of
+ the names from yk_solution::get_domain_dim_names(). */ ) const =0;
+
+ /// **[Deprecated]** Get the left padding in the specified dimension.
+ /**
+ Alias for get_left_pad_size(dim).
+ @returns Elements in left padding in given dimension.
+ */
+ virtual idx_t
+ get_pad_size(const std::string& dim
+ /**< [in] Name of dimension to get.
+ Must be one of
+ the names from yk_solution::get_domain_dim_names(). */ ) const =0;
+
+ /// **[Deprecated]** Get the extra left padding in the specified dimension.
+ /**
+ Alias for get_left_extra_pad_size(dim).
+ @returns Elements in padding in given dimension before the + left halo region. + */ + virtual idx_t + get_extra_pad_size(const std::string& dim + /**< [in] Name of dimension to get. + Must be one of + the names from yk_solution::get_domain_dim_names(). */ ) const =0; + + }; + + +} // namespace yask. + +#endif diff --git a/include/yk_solution_api.hpp b/include/yk_solution_api.hpp new file mode 100644 index 00000000..130b2858 --- /dev/null +++ b/include/yk_solution_api.hpp @@ -0,0 +1,836 @@ +/***************************************************************************** + +YASK: Yet Another Stencil Kernel +Copyright (c) 2014-2018, Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to +deal in the Software without restriction, including without limitation the +rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +sell copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +* The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +IN THE SOFTWARE. + +*****************************************************************************/ + +///////// API for the YASK stencil kernel solution. //////////// + +// This file uses Doxygen 1.8 markup for API documentation-generation. +// See http://www.stack.nl/~dimitri/doxygen. 
+/** @file yk_solution_api.hpp */ + +#ifndef YK_SOLN_API +#define YK_SOLN_API + +#include "yask_kernel_api.hpp" + +namespace yask { + + /// Allocate grids on local NUMA node. + /** + This is used in yk_solution::set_default_numa_preferred + and yk_grid::set_numa_preferred. + In Python, specify as `yask_kernel.cvar.yask_numa_local`. + */ + const int yask_numa_local = -1; + + /// Allocate grids across all available NUMA nodes. + /** + This is used in yk_solution::set_default_numa_preferred + and yk_grid::set_numa_preferred. + In Python, specify as `yask_kernel.cvar.yask_numa_interleave`. + */ + const int yask_numa_interleave = -2; + + /// Do not specify any NUMA binding. + /** + This is used in yk_solution::set_default_numa_preferred + and yk_grid::set_numa_preferred. + In Python, specify as `yask_kernel.cvar.yask_numa_none`. + */ + const int yask_numa_none = -9; + + /// Stencil solution as defined by the generated code from the YASK stencil compiler. + /** + Objects of this type contain all the grids and equations + that comprise a solution. + */ + class yk_solution { + public: + virtual ~yk_solution() {} + + /// Set object to receive debug output. + virtual void + set_debug_output(yask_output_ptr debug + /**< [out] Pointer to object to receive debug output. + See \ref yask_output_factory. */ ) =0; + + /// Get the name of the solution. + /** + @returns String containing the solution name provided during stencil compilation. + */ + virtual const std::string& + get_name() const =0; + + /// Get the floating-point precision size. + /** + @returns Number of bytes in each FP element: 4 or 8. + */ + virtual int + get_element_bytes() const =0; + + /// Get the solution step dimension. + /** + @returns String containing the step-dimension name. + */ + virtual std::string + get_step_dim_name() const =0; + + /// Get the number of domain dimensions used in this solution. + /** + The domain dimensions are those over which the stencil is + applied in each step. 
+ Does *not* include the step dimension or any miscellaneous dimensions. + @returns Number of dimensions that define the problem domain. + */ + virtual int + get_num_domain_dims() const =0; + + /// Get all the domain dimension names. + /** + @returns List of all domain-dimension names. + */ + virtual std::vector + get_domain_dim_names() const =0; + + /// Get all the miscellaneous dimension names. + /** + @returns List of all dimension names used in the solution + that are not step or domain dimensions. + */ + virtual std::vector + get_misc_dim_names() const =0; + + /// Set the size of the solution domain for this rank. + /** + The domain defines the number of elements that will be evaluated with the stencil(s). + If MPI is not enabled, this is the entire problem domain. + If MPI is enabled, this is the domain for the current rank only, + and the problem domain consists of the sum of all rank domains + in each dimension (weak-scaling). + The domain size in each rank does not have to be the same, but + all domains in the same column must have the same width, + all domains in the same row must have the same height, + and so forth, for each domain dimension. + The domain size does *not* include the halo region or any padding. + For best performance, set the rank domain + size to a multiple of the number of elements in a vector-cluster in + each dimension whenever possible. + See the "Detailed Description" for \ref yk_grid for more information on grid sizes. + There is no domain-size setting allowed in the + solution-step dimension (usually "t"). + */ + virtual void + set_rank_domain_size(const std::string& dim + /**< [in] Name of dimension to set. Must be one of + the names from get_domain_dim_names(). */, + idx_t size /**< [in] Elements in the domain in this `dim`. */ ) =0; + + /// Get the domain size for this rank. + /** + @returns Current setting of rank domain size in specified dimension. 
+ */
+ virtual idx_t
+ get_rank_domain_size(const std::string& dim
+ /**< [in] Name of dimension to get. Must be one of
+ the names from get_domain_dim_names(). */) const =0;
+
+ /// Set the block size in the given dimension.
+ /**
+ This sets the approximate number of elements that are evaluated in
+ each "block".
+ This is a performance setting and should not affect the functional
+ correctness or total number of elements evaluated.
+ A block is typically the unit of work done by a
+ top-level OpenMP thread. The actual number of elements evaluated
+ in a block may be greater than the specified size due to rounding
+ up to fold-cluster sizes. The number of elements in a block may
+ also be smaller than the specified size when the block is at the
+ edge of the domain. The block size cannot be set in the
+ solution-step dimension (because temporal blocking is not yet enabled).
+
+ Unless auto-tuning is disabled, the block size will be used as
+ a starting point for an automated search for a higher-performing
+ block size.
+ */
+ virtual void
+ set_block_size(const std::string& dim
+ /**< [in] Name of dimension to set. Must be one of
+ the names from get_domain_dim_names(). */,
+ idx_t size
+ /**< [in] Elements in a block in this `dim`. */ ) =0;
+
+ /// Get the block size.
+ /**
+ Returned value may be slightly larger than the value provided
+ via set_block_size() due to rounding.
+ @returns Current setting of block size.
+ */
+ virtual idx_t
+ get_block_size(const std::string& dim
+ /**< [in] Name of dimension to get. Must be one of
+ the names from get_domain_dim_names(). */) const =0;
+
+ /// Set the number of MPI ranks in the given dimension.
+ /**
+ The *product* of the number of ranks across all dimensions must
+ equal yk_env::get_num_ranks().
+ The current MPI rank will be assigned a unique location
+ within the overall problem domain based on its MPI rank index.
+ The same number of MPI ranks must be set via this API on each + constituent MPI rank to ensure a consistent overall configuration. + The number of ranks in each dimension must be properly set + before calling yk_solution::prepare_solution(). + There is no rank setting allowed in the + solution-step dimension (usually "t"). + */ + virtual void + set_num_ranks(const std::string& dim + /**< [in] Name of dimension to set. Must be one of + the names from get_domain_dim_names(). */, + idx_t num /**< [in] Number of ranks in `dim`. */ ) =0; + + /// Get the number of MPI ranks in the given dimension. + /** + @returns Current setting of rank size. + */ + virtual idx_t + get_num_ranks(const std::string& dim + /**< [in] Name of dimension to get. Must be one of + the names from get_domain_dim_names(). */) const =0; + + /// Get the rank index in the specified dimension. + /** + The overall rank indices in the specified dimension will range from + zero (0) to get_num_ranks() - 1, inclusive. + @returns Zero-based index of this rank. + */ + virtual idx_t + get_rank_index(const std::string& dim + /**< [in] Name of dimension to get. Must be one of + the names from get_domain_dim_names(). */ ) const =0; + + /// Get the number of grids in the solution. + /** + Grids may be pre-defined by the stencil compiler + (e.g., via yc_solution::new_grid()) + or created explicitly via yk_solution::new_grid(). + @returns Number of grids that have been created. + */ + virtual int + get_num_grids() const =0; + + /// Get the specified grid. + /** + This cannot be used to access scratch grids. + @returns Pointer to the specified grid or null pointer if it does not exist. + */ + virtual yk_grid_ptr + get_grid(const std::string& name /**< [in] Name of the grid. */ ) =0; + + /// Get all the grids. + /** + @returns List of all non-scratch grids in the solution. + */ + virtual std::vector + get_grids() =0; + + /// Prepare the solution for stencil application. 
+ /**
+ Allocates data in grids that do not already have storage allocated.
+ Calculates the position of each rank in the overall problem domain.
+ Sets many other data structures needed for proper stencil application.
+ Since this function initiates MPI communication, it must be called
+ on all MPI ranks, and it will block until all ranks have completed.
+ Must be called before applying any stencils.
+ */
+ virtual void
+ prepare_solution() =0;
+
+ /// Get the first index of the sub-domain in this rank in the specified dimension.
+ /**
+ This returns the first *overall* index at the beginning of the domain.
+ Elements within the domain in this rank lie between the values returned by
+ get_first_rank_domain_index() and get_last_rank_domain_index(), inclusive.
+ If there is only one MPI rank, this is typically zero (0).
+ If there is more than one MPI rank, the value depends
+ on the rank's position within the overall problem domain.
+
+ @note This function should be called only *after* calling prepare_solution()
+ because prepare_solution() assigns this rank's position in the problem domain.
+ @returns First domain index in this rank.
+ */
+ virtual idx_t
+ get_first_rank_domain_index(const std::string& dim
+ /**< [in] Name of dimension to get. Must be one of
+ the names from get_domain_dim_names(). */ ) const =0;
+
+ /// Get the last index of the sub-domain in this rank in the specified dimension.
+ /**
+ This returns the last *overall* index within the domain in this rank
+ (*not* one past the end).
+ If there is only one MPI rank, this is typically one less than the value
+ provided by set_rank_domain_size().
+ If there is more than one MPI rank, the value depends
+ on the rank's position within the overall problem domain.
+ See get_first_rank_domain_index() for more information.
+
+ @note This function should be called only *after* calling prepare_solution()
+ because prepare_solution() assigns this rank's position in the problem domain. 
+ @returns Last index in this rank. + */ + virtual idx_t + get_last_rank_domain_index(const std::string& dim + /**< [in] Name of dimension to get. Must be one of + the names from get_domain_dim_names(). */ ) const =0; + + /// Get the overall problem size in the specified dimension. + /** + The overall domain indices in the specified dimension will range from + zero (0) to get_overall_domain_size() - 1, inclusive. + Call get_first_rank_domain_index() and get_last_rank_domain_index() + to find the subset of this domain in each rank. + + @note This function should be called only *after* calling prepare_solution() + because prepare_solution() obtains the sub-domain sizes from other ranks. + @returns Sum of all ranks' domain sizes in the given dimension. + */ + virtual idx_t + get_overall_domain_size(const std::string& dim + /**< [in] Name of dimension to get. Must be one of + the names from get_domain_dim_names(). */ ) const =0; + + /// Run the stencil solution for the specified steps. + /** + The stencil(s) in the solution are applied to the grid data, setting the + index variables as follows: + 1. If temporal wave-fronts are *not* used (the default): + - The step index (e.g., `t` for "time") will be sequentially set to values + from `first_step_index` to `last_step_index`, inclusive. + + If the stencil equations were defined with dependencies on lower-valued steps, + e.g., `t+1` depends on `t`, then `last_step_index` should be greater than or equal to + `first_step_index` (forward solution). + + If the stencil equations were defined with dependencies on higher-valued steps, + e.g., `t-1` depends on `t`, then `last_step_index` should be less than or equal to + `first_step_index` (reverse solution). + - For each step index, the domain indices will be set + to values across the entire domain as returned by yk_solution::get_overall_domain_size() + (not necessarily sequentially). + - MPI halo exchanges will occur as necessary before, after, or during a step. 
+ - Since this function initiates MPI communication, it must be called
+ on all MPI ranks, and it will block until all ranks have completed.
+ 2. **[Advanced]** If temporal wave-fronts *are* enabled (currently only possible via apply_command_line_options()):
+ - The step index (e.g., `t` for "time") will be sequentially set to values
+ from `first_step_index` to `last_step_index`, inclusive, within each wave-front tile.
+ + The number of steps in a wave-front tile may also be restricted by the size
+ of the tile in the step dimension. In that case, tiles will be done in slices of that size.
+ + Reverse solutions are not allowed with wave-front tiling.
+ - For each step index within each wave-front tile, the domain indices will be set
+ to values across the entire tile (not necessarily sequentially).
+ - Ultimately, the stencil(s) will be applied to the same elements in both the step
+ and domain dimensions as when wave-front tiling is not used.
+ - MPI is not supported with wave-front tiling.
+
+ This function should be called only *after* calling prepare_solution().
+ */
+ virtual void
+ run_solution(idx_t first_step_index /**< [in] First index in the step dimension */,
+ idx_t last_step_index /**< [in] Last index in the step dimension */ ) =0;
+
+ /// Run the stencil solution for the specified step.
+ /**
+ This function is simply an alias for `run_solution(step_index, step_index)`, i.e.,
+ the solution will be applied for exactly one step across the domain.
+
+ Typical C++ usage:
+
+ \code{.cpp}
+ soln->prepare_solution();
+ for (idx_t t = 1; t <= num_steps; t++)
+ soln->run_solution(t);
+ soln->end_solution();
+ \endcode
+
+ As written, the above loop is identical to
+
+ \code{.cpp}
+ soln->prepare_solution();
+ soln->run_solution(1, num_steps);
+ soln->end_solution();
+ \endcode
+
+ @note The parameter is *not* the number of steps to run.
+ @note Since only one step is taken per call, using this function effectively disables
+ wave-front tiling. 
+ */ + virtual void + run_solution(idx_t step_index /**< [in] Index in the step dimension */ ) =0; + + /// Finish using a solution. + /** + Performs a final MPI halo exchange. + Releases shared ownership of memory used by the grids. This will + result in deallocating each memory block that is not + referenced by another shared pointer. + */ + virtual void + end_solution() =0; + + + /// Get performance statistics associated with preceding calls to run_solution(). + /** + Side effect: resets all statistics, so a subsequent call will + measure performance after the current call. + @returns Pointer to statistics object. + */ + virtual yk_stats_ptr + get_stats() =0; + + /// Determine whether the auto-tuner is enabled on this rank. + /** + The auto-tuner is enabled by default. + It will become disabled after it has converged or after reset_auto_tuner(false) has been called. + @returns Whether the auto-tuner is still searching. + */ + virtual bool + is_auto_tuner_enabled() =0; + + /* Advanced APIs for yk_solution found below are not needed for most applications. */ + + /// **[Advanced]** Set the minimum amount of grid padding for all grids. + /** + This sets the minimum number of elements in each grid that is + reserved outside of the rank domain in the given dimension. + This padding area can be used for required halo regions. At + least the specified number of elements will be added to both + sides, i.e., both "before" and "after" the domain. + + The *actual* padding size will be the largest of the following values, + additionally rounded up based on the vector-folding dimensions + and/or cache-line alignment: + - Halo size. + - Value provided by any of the pad-size setting functions. + + The padding size cannot be changed after data storage + has been allocated for a given grid; attempted changes to the pad size for such + grids will be ignored. + In addition, once a grid's padding is set, it cannot be reduced, only increased. 
+ Call yk_grid::get_pad_size() to determine the actual padding size for a given grid. + See the "Detailed Description" for \ref yk_grid for more information on grid sizes. + There is no padding allowed in the solution-step dimension (usually "t"). + */ + virtual void + set_min_pad_size(const std::string& dim + /**< [in] Name of dimension to set. Must + be one of the names from get_domain_dim_names(). */, + idx_t size + /**< [in] Elements in this `dim` applied + to both sides of the domain. */ ) =0; + + /// **[Advanced]** Get the minimum amount of grid padding for all grids. + /** + @returns Current setting of minimum amount of grid padding for all grids. + */ + virtual idx_t + get_min_pad_size(const std::string& dim + /**< [in] Name of dimension to get. Must be one of + the names from get_domain_dim_names(). */) const =0; + + /// **[Advanced]** Restart or disable the auto-tuner on this rank. + /** + Under normal operation, an auto-tuner is invoked automatically during calls to + run_solution(). + Currently, only the block size is set by the auto-tuner, and the search begins from the + sizes set via set_block_size() or the default size if set_block_size() has + not been called. + This function is used to apply the current best-known settings if the tuner has + been running, reset the state of the auto-tuner, and either + restart its search or disable it from running. + This call must be made on each rank where the change is desired. + */ + virtual void + reset_auto_tuner(bool enable + /**< [in] If _true_, start or restart the auto-tuner search. + If _false_, disable the auto-tuner from running. */, + bool verbose = false + /**< [in] If _true_, print progress information to the debug object + set via set_debug_output(). */ ) =0; + + /// **[Advanced]** Automatically tune selected settings immediately. + /** + Executes a search algorithm to find [locally] optimum values for some of the + settings. 
+ Under normal operation, an auto-tuner is invoked during calls to + run_solution(). + See reset_auto_tuner() for more information. + This function causes the stencil solution to be run immediately + until the auto-tuner converges on all ranks. + It is useful for benchmarking, where performance is to be timed + for a given number of steps after the best settings are found. + This function should be called only *after* calling prepare_solution(). + This call must be made on each rank. + @warning Modifies the contents of the grids by calling run_solution() + an arbitrary number of times, but without halo exchange. + (See run_solution() for other restrictions and warnings.) + Thus, grid data should be set *after* calling this function when + used in a production or test setting where correct results are expected. + */ + virtual void + run_auto_tuner_now(bool verbose = true + /**< [in] If _true_, print progress information to the debug object + set via set_debug_output(). */ ) =0; + + /// **[Advanced]** Add a new grid to the solution. + /** + This is typically not needed because grids used by the stencils are pre-defined + by the solution itself via the stencil compiler. + However, a grid may be created explicitly via this function + in order to use it for purposes other than by the + pre-defined stencils within the current solution. + + Grids created by this function will be treated like a pre-defined grid. + For example, + - For each domain dimension of the grid, + the new grid's domain size will be the same as that returned by + get_rank_domain_size(). + - Calls to set_rank_domain_size() will resize the corresponding domain + size in this grid. + - This grid's first domain index in this rank will be determined + by the position of this rank. + - This grid's initial padding size will be the same as that returned by + get_min_pad_size(). + - After creating a new grid, you can increase its padding + sizes in the domain dimensions via yk_grid::set_min_pad_size(), etc. 
+ - For step and misc dimensions, you can change the allocation via + yk_grid::set_alloc_size(). + + If you want a grid that is not automatically resized based on the + solution settings, use new_fixed_size_grid() instead. + + @note A new grid contains only the meta-data for the grid; data storage + is not yet allocated. + Storage may be allocated in any of the methods listed + in the "Detailed Description" for \ref yk_grid. + @returns Pointer to the new grid. + */ + virtual yk_grid_ptr + new_grid(const std::string& name + /**< [in] Name of the grid; must be unique + within the solution. */, + const std::vector& dims + /**< [in] List of names of all dimensions. + Names must be valid C++ identifiers and + not repeated within this grid. */ ) =0; + +#ifndef SWIG + /// **[Advanced]** Add a new grid to the solution. + /** + See documentation for the version of new_grid() with a vector of dimension names + as a parameter. + @note This version is not available (or needed) in SWIG-based APIs, e.g., Python. + @returns Pointer to the new grid. + */ + virtual yk_grid_ptr + new_grid(const std::string& name + /**< [in] Name of the grid; must be unique + within the solution. */, + const std::initializer_list& dims + /**< [in] List of names of all dimensions. + Names must be valid C++ identifiers and + not repeated within this grid. */ ) =0; +#endif + + /// **[Advanced]** Add a new grid to the solution with a specified size. + /** + This is typically not needed because grids used by the stencils are pre-defined + by the solution itself via the stencil compiler. + However, a grid may be created explicitly via this function + in order to use it for purposes other than by the + pre-defined stencils within the current solution. + + Unlike new_grid(), + grids created by this function will *not* be treated like a pre-defined grid. + For example, + - For each domain dimension of the grid, + the new grid's domain size is provided during creation and cannot be changed. 
+ - Calls to set_rank_domain_size() will *not* resize the corresponding domain
+ size in this grid.
+ - This grid's first domain index in this rank will be fixed at zero (0)
+ regardless of this rank's position.
+ - This grid's padding size will be affected only by calls to
+ yk_grid::set_min_pad_size(), etc.
+ - For step and misc dimensions, you can still change the allocation via
+ yk_grid::set_alloc_size().
+
+ @note A new grid contains only the meta-data for the grid; data storage
+ is not yet allocated.
+ Storage may be allocated in any of the methods listed
+ in the "Detailed Description" for \ref yk_grid.
+ @returns Pointer to the new grid.
+ */
+ virtual yk_grid_ptr
+ new_fixed_size_grid(const std::string& name
+ /**< [in] Name of the grid; must be unique
+ within the solution. */,
+ const std::vector<std::string>& dims
+ /**< [in] List of names of all dimensions.
+ Names must be valid C++ identifiers and
+ not repeated within this grid. */,
+ const std::vector<idx_t>& dim_sizes
+ /**< [in] Initial allocation in each dimension.
+ Must be exactly one size for each dimension. */ ) =0;
+
+#ifndef SWIG
+ /// **[Advanced]** Add a new grid to the solution with a specified size.
+ /**
+ See documentation for the version of new_fixed_size_grid() with a vector of dimension names
+ as a parameter.
+ @note This version is not available (or needed) in SWIG-based APIs, e.g., Python.
+ @returns Pointer to the new grid.
+ */
+ virtual yk_grid_ptr
+ new_fixed_size_grid(const std::string& name
+ /**< [in] Name of the grid; must be unique
+ within the solution. */,
+ const std::initializer_list<std::string>& dims
+ /**< [in] List of names of all dimensions.
+ Names must be valid C++ identifiers and
+ not repeated within this grid. */,
+ const std::initializer_list<idx_t>& dim_sizes
+ /**< [in] Initial allocation in each dimension.
+ Must be exactly one size for each dimension. */ ) =0;
+#endif
+
+ /// **[Advanced]** Set the default preferred NUMA node on which to allocate data. 
+ /** + This value is used when allocating grids and MPI buffers. + The NUMA "preferred node allocation" policy is used, meaning that + memory will be allocated in an alternative node if the preferred one + doesn't have enough space available or is otherwise restricted. + Instead of specifying a NUMA node, a special value may be used + to specify another policy as listed. + This setting may be overridden for any specific grid. + */ + virtual void + set_default_numa_preferred(int numa_node + /**< [in] Preferred NUMA node for data + allocation. Alternatively, use + `yask_numa_local` for explicit + local-node allocation, + `yask_numa_interleave` for + interleaving pages across all nodes, + or `yask_numa_none` for no explicit NUMA + policy. These constants are defined in + the _Variable Documentation_ section of + \ref yk_solution_api.hpp. */) =0; + + /// **[Advanced]** Get the default preferred NUMA node on which to allocate data. + /** + @returns Current setting of preferred NUMA node. + */ + virtual int + get_default_numa_preferred() const =0; + + /// **[Advanced]** Set performance parameters from an option string. + /** + Parses the string for options as if from a command-line. + Example: "-bx 64 -block_threads 4" sets the block-size in the *x* + dimension to 64 and the number of threads used to process each + block to 4. + See the help message from the YASK kernel binary for documentation + on the command-line options. + + @returns Any strings that were not recognized by the parser as options. + */ + virtual std::string + apply_command_line_options(const std::string& args + /**< [in] String of arguments to parse. */ ) =0; + + /// **[Advanced]** Get the specified stencil group. + /** + @returns Pointer to the specified \ref yk_stencil_group + or null pointer if it does not exist. + */ + virtual yk_stencil_group_ptr + get_stencil_group(const std::string& name + /**< [in] Name of the group. */ ) =0; + + /// **[Advanced]** Get all the stencil groups. 
+ /**
+ @returns List of all stencil groups in the solution.
+ */
+ virtual std::vector<yk_stencil_group_ptr>
+ get_stencil_groups() =0;
+
+ /// **[Advanced]** Run the specified stencil group over the given sub-domain.
+ /**
+ Applies all the stencil kernels in the given group
+ from `first_domain_indices` at `first_step_index`
+ to `last_domain_indices` at `last_step_index` (inclusive) in each dimension.
+ Each list of domain indices should contain the indices for the
+ dimensions returned by get_domain_dim_names() in the same order.
+
+ Indices are relative to the *overall* problem domain and
+ need not be limited to fall within the domain of the current MPI rank.
+ The actual points to which the group is applied on each rank will be
+ limited internally as needed.
+
+ Example C++ usage:
+
+ \code{.cpp}
+ // Find my custom stencil group created in the YASK compiler.
+ auto my_group = soln->get_stencil_group("my_group");
+ ...
+ soln->prepare_solution();
+ ...
+ // Set first_indices and last_indices to apply my_group
+ // to only the first slice in the "z" dimension.
+ std::vector<idx_t> first_indices, last_indices;
+ for (auto dim : soln->get_domain_dim_names()) {
+ auto overall_size = soln->get_overall_domain_size(dim);
+ first_indices.push_back(0);
+ if (dim == "z")
+ last_indices.push_back(0);
+ else
+ last_indices.push_back(overall_size - 1);
+ }
+ ...
+ // Execute the time-steps.
+ for (idx_t t = 0; t < num_steps; t++) {
+
+ // Apply the automatically-scheduled stencils.
+ soln->run_solution(t);
+
+ // Apply my custom stencil group.
+ soln->run_stencil_group(my_group,
+ t, first_indices,
+ t, last_indices);
+ }
+ soln->end_solution();
+ \endcode
+
+ @returns Number of points to which the group was applied.
+ */
+ virtual idx_t
+ run_stencil_group(yk_stencil_group_ptr stencil_group
+ /**< [in] Pointer to the stencil group obtained from
+ get_stencil_groups() or get_stencil_group(). */,
+ const std::vector<idx_t>& first_domain_indices
+ /**< [in] List of initial domain indices. 
*/,
+ const std::vector<idx_t>& last_domain_indices
+ /**< [in] List of final domain indices. */ ) =0;
+
+ /// **[Advanced]** Use data-storage from existing grids in specified solution.
+ /**
+ Calls yk_grid::share_storage() for each pair of grids that have the same name
+ in this solution and the source solution.
+ All conditions listed in yk_grid::share_storage() must hold for each pair.
+ */
+ virtual void
+ share_grid_storage(yk_solution_ptr source
+ /**< [in] Solution from which grid storage will be shared. */) =0;
+ };
+
+ /// Statistics from calls to run_solution().
+ /**
+ A throughput rate may be calculated by multiplying an
+ amount-of-work-per-step quantity by the number of steps done and
+ dividing by the number of seconds elapsed.
+ */
+ class yk_stats {
+ public:
+ virtual ~yk_stats() {}
+
+ /// Get the number of elements in the overall domain.
+ /**
+ @returns Product of all the overall domain sizes across all domain dimensions.
+ */
+ virtual idx_t
+ get_num_elements() =0;
+
+ /// Get the number of elements written in each step.
+ /**
+ @returns Number of elements written to each output grid.
+ This is the same value as get_num_elements() if there is only one output grid.
+ */
+ virtual idx_t
+ get_num_writes() =0;
+
+ /// Get the estimated number of floating-point operations required for each step.
+ /**
+ @returns Number of FP ops created by the stencil compiler.
+ It may be slightly more or less than the actual number of FP ops executed
+ by the CPU due to C++ compiler transformations.
+ */
+ virtual idx_t
+ get_est_fp_ops() =0;
+
+ /// Get the number of steps calculated via run_solution().
+ /**
+ @returns A positive number, regardless of whether run_solution() steps were executed
+ forward or backward.
+ */
+ virtual idx_t
+ get_num_steps_done() =0;
+
+ /// Get the number of seconds elapsed during calls to run_solution().
+ /**
+ @returns Only the time spent in run_solution(), not in any other code in your
+ application between calls. 
+ */ + virtual double + get_elapsed_run_secs() =0; + }; + + /// A group of stencil kernels. + /** + Groups of stencils are created automatically by the YASK stencil compiler + or manually via yc_solution::new_equation_group(). See the latter for + more information. + */ + class yk_stencil_group { + public: + virtual ~yk_stencil_group() {} + + /// Get the name of this group. + /** + @returns Default name given by the YASK stencil compiler + or the name provided via yc_solution::new_equation_group(). + */ + virtual const std::string& + get_name() const =0; + + /// Determine whether this group will be automatically scheduled. + /** + @returns `true` if this group will be run via yk_solution::run_solution() + or `false` if this group must be run via yk_solution::run_stencil_group(). + This is the `do_schedule` setting passed via yc_solution::new_equation_group(). + */ + virtual bool + is_scheduled() const =0; + + }; + +} // namespace yask. + +#endif diff --git a/src/common/common_utils.cpp b/src/common/common_utils.cpp index 28550e4b..1d683006 100644 --- a/src/common/common_utils.cpp +++ b/src/common/common_utils.cpp @@ -41,7 +41,7 @@ namespace yask { // for numbers above 9 (at least up to 99). // Format: "major.minor.patch". - const string version = "2.05.07"; + const string version = "2.06.00"; string yask_get_version_string() { return version; diff --git a/src/compiler/lib/Cpp.hpp b/src/compiler/lib/Cpp.hpp index 89bc299a..46e7f597 100644 --- a/src/compiler/lib/Cpp.hpp +++ b/src/compiler/lib/Cpp.hpp @@ -238,12 +238,12 @@ namespace yask { // Print out a stencil in C++ form for YASK. class YASKCppPrinter : public PrinterBase { protected: - EqGroups& _clusterEqGroups; + EqBundles& _clusterEqBundles; const Dimensions* _dims; string _context, _context_base; // Print an expression as a one-line C++ comment. - void addComment(ostream& os, EqGroup& eq); + void addComment(ostream& os, EqBundle& eq); // A factory method to create a new PrintHelper. 
// This can be overridden in derived classes to provide @@ -260,17 +260,17 @@ namespace yask { // Print pieces of YASK output. virtual void printMacros(ostream& os); virtual void printData(ostream& os); - virtual void printEqGroups(ostream& os); + virtual void printEqBundles(ostream& os); virtual void printContext(ostream& os); public: YASKCppPrinter(StencilSolution& stencil, - EqGroups& eqGroups, - EqGroups& clusterEqGroups, + EqBundles& eqBundles, + EqBundles& clusterEqBundles, const Dimensions* dims) : - PrinterBase(stencil, eqGroups), - _clusterEqGroups(clusterEqGroups), + PrinterBase(stencil, eqBundles), + _clusterEqBundles(clusterEqBundles), _dims(dims) { // name of C++ struct. diff --git a/src/compiler/lib/CppIntrin.hpp b/src/compiler/lib/CppIntrin.hpp index e9b48f76..86693c93 100644 --- a/src/compiler/lib/CppIntrin.hpp +++ b/src/compiler/lib/CppIntrin.hpp @@ -202,10 +202,10 @@ namespace yask { public: YASKKncPrinter(StencilSolution& stencil, - EqGroups& eqGroups, - EqGroups& clusterEqGroups, + EqBundles& eqBundles, + EqBundles& clusterEqBundles, const Dimensions* dims) : - YASKCppPrinter(stencil, eqGroups, clusterEqGroups, + YASKCppPrinter(stencil, eqBundles, clusterEqBundles, dims) { } virtual int num_vec_elems() const { return 64 / _settings._elem_bytes; } @@ -225,10 +225,10 @@ namespace yask { public: YASKAvx256Printer(StencilSolution& stencil, - EqGroups& eqGroups, - EqGroups& clusterEqGroups, + EqBundles& eqBundles, + EqBundles& clusterEqBundles, const Dimensions* dims) : - YASKCppPrinter(stencil, eqGroups, clusterEqGroups, dims) { } + YASKCppPrinter(stencil, eqBundles, clusterEqBundles, dims) { } virtual int num_vec_elems() const { return 32 / _settings._elem_bytes; } }; @@ -244,10 +244,10 @@ namespace yask { public: YASKAvx512Printer(StencilSolution& stencil, - EqGroups& eqGroups, - EqGroups& clusterEqGroups, + EqBundles& eqBundles, + EqBundles& clusterEqBundles, const Dimensions* dims) : - YASKCppPrinter(stencil, eqGroups, clusterEqGroups, + 
YASKCppPrinter(stencil, eqBundles, clusterEqBundles, dims) { } virtual int num_vec_elems() const { return 64 / _settings._elem_bytes; } diff --git a/src/compiler/lib/Eqs.cpp b/src/compiler/lib/Eqs.cpp index 0a8acac3..94e14f0a 100644 --- a/src/compiler/lib/Eqs.cpp +++ b/src/compiler/lib/Eqs.cpp @@ -23,7 +23,7 @@ IN THE SOFTWARE. *****************************************************************************/ -///////// Methods for equations and equation groups //////////// +///////// Methods for equations and equation bundles //////////// #include "Print.hpp" #include "ExprUtils.hpp" @@ -182,14 +182,12 @@ namespace yask { } // Find dependencies based on all eqs. - // If 'eq_deps' is set, save dependencies between eqs. // Side effect: sets _stepDir in dims. // Throws exceptions on illegal dependencies. // TODO: split this into smaller functions. // BIG-TODO: replace dependency algorithms with integration of a polyhedral // library. void Eqs::findDeps(Dimensions& dims, - EqDepMap* eq_deps, ostream& os) { auto& stepDim = dims._stepDim; @@ -380,21 +378,15 @@ namespace yask { } // Save dependency. - if (eq_deps) { #ifdef DEBUG_DEP - cout << " Exact match found to " << op1->makeQuotedStr() << ".\n"; + cout << " Exact match found to " << op1->makeQuotedStr() << ".\n"; #endif - (*eq_deps)[cur_step_dep].set_imm_dep_on(eq2, eq1); - } + _eq_deps[cur_step_dep].set_imm_dep_on(eq2, eq1); // Move along to next eq2. continue; } - // Check more only if saving dependencies. - if (!eq_deps) - continue; - // Next dep check: inexact matches on LHS of eq1 to RHS of eq2. // Does eq1 define *any* point in a grid that eq2 inputs // at the same step index? If so, they *might* have a @@ -443,12 +435,10 @@ namespace yask { } // Save dependency. 
- if (eq_deps) { #ifdef DEBUG_DEP - cout << " Likely match found to " << op1->makeQuotedStr() << ".\n"; + cout << " Likely match found to " << op1->makeQuotedStr() << ".\n"; #endif - (*eq_deps)[cur_step_dep].set_imm_dep_on(eq2, eq1); - } + _eq_deps[cur_step_dep].set_imm_dep_on(eq2, eq1); // Move along to next equation. break; @@ -463,11 +453,9 @@ namespace yask { } // for all eqs (eq1). // Resolve indirect dependencies. - if (eq_deps) { - os << " Resolving indirect dependencies...\n"; - for (DepType dt = DepType(0); dt < num_deps; dt = DepType(dt+1)) - (*eq_deps)[dt].find_all_deps(); - } + os << " Resolving indirect dependencies...\n"; + for (DepType dt = DepType(0); dt < num_deps; dt = DepType(dt+1)) + _eq_deps[dt].find_all_deps(); os << " Done with dependency analysis.\n"; } @@ -607,7 +595,7 @@ namespace yask { // Update access stats for the grids. // Also finds scratch-grid eqs needed for each non-scratch eq. - void Eqs::updateGridStats(EqDepMap& eq_deps) { + void Eqs::updateGridStats() { // Find all LHS and RHS points and grids for all eqs. PointVisitor pv; @@ -638,7 +626,7 @@ namespace yask { // 'eq1'. It's important to visit the eqs in dep order to // properly propagate halos sizes thru chains of scratch grids. if (!og1->isScratch()) { - eq_deps[cur_step_dep].visitDeps + _eq_deps[cur_step_dep].visitDeps // 'eq1' is 'b' or depends on 'b', immediately or indirectly. (eq1, [&](EqualsExprPtr b, EqDeps::EqVecSet& path) { @@ -719,9 +707,9 @@ namespace yask { } - // Get the full name of an eq-group. + // Get the full name of an eq-bundle. // Must be unique. - string EqGroup::getName() const { + string EqBundle::getName() const { // Add index to base name. ostringstream oss; @@ -729,11 +717,11 @@ namespace yask { return oss.str(); } - // Make a human-readable description of this eq group. - string EqGroup::getDescription(bool show_cond, + // Make a human-readable description of this eq bundle. 
+ string EqBundle::getDescription(bool show_cond, string quote) const { - string des = "equation-group " + quote + getName() + quote; + string des = "equation-bundle " + quote + getName() + quote; if (show_cond) { if (cond.get()) des += " w/condition " + cond->makeQuotedStr(quote); @@ -743,11 +731,11 @@ namespace yask { return des; } - // Add an equation to an EqGroup. - void EqGroup::addEq(EqualsExprPtr ee) + // Add an equation to an EqBundle. + void EqBundle::addEq(EqualsExprPtr ee) { -#ifdef DEBUG_EQ_GROUP - cout << "EqGroup: adding " << ee->makeQuotedStr() << endl; +#ifdef DEBUG_EQ_BUNDLE + cout << "EqBundle: adding " << ee->makeQuotedStr() << endl; #endif _eqs.insert(ee); @@ -755,7 +743,7 @@ namespace yask { PointVisitor pv; ee->accept(&pv); - // update list of input and output grids for this group. + // update list of input and output grids for this bundle. auto* outGrid = pv.getOutputGrids().at(ee.get()); _outGrids.insert(outGrid); auto& inGrids = pv.getInputGrids().at(ee.get()); @@ -764,8 +752,10 @@ namespace yask { } // Check for and set dependencies on eg2. - void EqGroup::checkDeps(Eqs& allEqs, EqDepMap& eq_deps, const EqGroup& eg2) + void EqBundle::checkDeps(Eqs& allEqs, const EqBundle& eg2) { + auto& eq_deps = allEqs.getDeps(); + // Eqs in this. for (auto& eq1 : getEqs()) { auto& sdeps1 = allEqs.getScratchDeps(eq1); @@ -776,13 +766,13 @@ namespace yask { for (DepType dt = DepType(0); dt < num_deps; dt = DepType(dt+1)) { // Immediate dep. - if (eq_deps[dt].is_imm_dep_on(eq1, eq2)) { + if (eq_deps.at(dt).is_imm_dep_on(eq1, eq2)) { _imm_dep_on[dt].insert(eg2.getName()); _dep_on[dt].insert(eg2.getName()); } // Indirect dep. - else if (eq_deps[dt].is_dep_on(eq1, eq2)) { + else if (eq_deps.at(dt).is_dep_on(eq1, eq2)) { _dep_on[dt].insert(eg2.getName()); } } @@ -795,8 +785,8 @@ namespace yask { } - // Print stats from eqGroup. - void EqGroup::printStats(ostream& os, const string& msg) + // Print stats from eqBundle. 
+ void EqBundle::printStats(ostream& os, const string& msg) { CounterVisitor cv; visitEqs(&cv); @@ -823,10 +813,10 @@ namespace yask { // Replicate each equation at the non-zero offsets for // each vector in a cluster. - void EqGroup::replicateEqsInCluster(Dimensions& dims) + void EqBundle::replicateEqsInCluster(Dimensions& dims) { // Make a copy of the original equations so we can iterate through - // them while adding to the group. + // them while adding to the bundle. EqList eqs(_eqs); // Loop thru points in cluster. @@ -853,7 +843,7 @@ namespace yask { OffsetVisitor ov(clusterOffset); eq2->accept(&ov); - // Put new equation into group. + // Put new equation into bundle. addEq(eq2); } } @@ -864,13 +854,13 @@ namespace yask { assert(_eqs.size() == eqs.size() * dims._clusterMults.product()); } - // Reorder groups based on dependencies. - void EqGroups::sort() + // Reorder bundles based on dependencies. + void EqBundles::sort() { if (size() < 2) return; - cout << " Sorting " << size() << " eq-group(s)...\n"; + cout << " Sorting " << size() << " eq-bundle(s)...\n"; // Want to keep original order as much as possible. // Only reorder if dependencies are in conflict. @@ -881,7 +871,7 @@ namespace yask { bool done = false; while (!done) { - // Does eq-group[i] depend on any eq-group after it? + // Does eq-bundle[i] depend on any eq-bundle after it? auto& egi = at(i); for (size_t j = i+1; j < size(); j++) { @@ -892,13 +882,13 @@ namespace yask { // Error if also back-dep. if (egj.isDepOn(cur_step_dep, egi)) { - THROW_YASK_EXCEPTION("Error: circular dependency between eq-groups " << + THROW_YASK_EXCEPTION("Error: circular dependency between eq-bundles " << egi.getDescription() << " and " << egj.getDescription()); } // Swap them. 
- EqGroup temp(egi); + EqBundle temp(egi); egi = egj; egj = temp; @@ -912,24 +902,29 @@ namespace yask { } } - // Add expression 'eq' with condition 'cond' to eq-group with 'baseName' + // Add expression 'eq' from 'eqs' to eq-bundle with 'baseName' // unless alread added. The corresponding index in '_indices' will be - // incremented if a new group is created. + // incremented if a new bundle is created. // 'eq_deps': pre-computed dependencies between equations. - // Returns whether a new group was created. - bool EqGroups::addExprToGroup(EqualsExprPtr eq, - BoolExprPtr cond, - const string& baseName, - bool is_scratch, - EqDepMap& eq_deps) + // Returns whether a new bundle was created. + bool EqBundles::addExprToBundle(Eqs& eqs, + EqualsExprPtr eq, + const string& baseName, + bool is_scratch) { // Equation already added? - if (_eqs_in_groups.count(eq)) + if (_eqs_in_bundles.count(eq)) return false; - // Loop through existing groups, looking for one that + // Get condition, if any. + auto cond = eqs.getCond(eq); + + // Get deps. + auto& eq_deps = eqs.getDeps(); + + // Loop through existing bundles, looking for one that // 'eq' can be added to. - EqGroup* target = 0; + EqBundle* target = 0; for (auto& eg : *this) { // Must match name and condition. @@ -942,7 +937,7 @@ namespace yask { for (auto& eq2 : eg.getEqs()) { for (DepType dt = DepType(0); dt < num_deps; dt = DepType(dt+1)) { - if (eq_deps[dt].is_dep(eq, eq2)) { + if (eq_deps.at(dt).is_dep(eq, eq2)) { #if DEBUG_ADD_EXPRS cout << "addExprsFromGrid: not adding equation " << eq->makeQuotedStr() << " to " << eg.getDescription() << @@ -957,7 +952,7 @@ namespace yask { break; } - // Remember target group if found and no deps. + // Remember target bundle if found and no deps. if (!is_dep) { target = ⪚ break; @@ -965,23 +960,23 @@ namespace yask { } } - // Make new group if no target group found. - bool newGroup = false; + // Make new bundle if no target bundle found. 
+ bool newBundle = false; if (!target) { - EqGroup ne(*_dims, is_scratch); + EqBundle ne(*_dims, is_scratch); push_back(ne); target = &back(); target->baseName = baseName; target->index = _indices[baseName]++; target->cond = cond; - newGroup = true; + newBundle = true; #if DEBUG_ADD_EXPRS cout << "Creating new " << target->getDescription() << endl; #endif } - // Add eq to target eq-group. + // Add eq to target eq-bundle. assert(target); #if DEBUG_ADD_EXPRS cout << "Adding " << eq->makeQuotedStr() << @@ -990,27 +985,26 @@ namespace yask { target->addEq(eq); // Remember eq and updated grid. - _eqs_in_groups.insert(eq); + _eqs_in_bundles.insert(eq); _outGrids.insert(eq->getGrid()); - return newGroup; + return newBundle; } - // Divide all equations into eqGroups. + // Divide all equations into eqBundles. // Only process updates to grids in 'gridRegex'. - // 'targets': string provided by user to specify grouping. + // 'targets': string provided by user to specify bundleing. // 'eq_deps': pre-computed dependencies between equations. - void EqGroups::makeEqGroups(Eqs& allEqs, - const string& gridRegex, - const string& targets, - EqDepMap& eq_deps, - ostream& os) + void EqBundles::makeEqBundles(Eqs& allEqs, + const string& gridRegex, + const string& targets, + ostream& os) { - os << "Partitioning " << allEqs.getNumEqs() << " equation(s) into groups...\n"; + os << "Partitioning " << allEqs.getNumEqs() << " equation(s) into bundles...\n"; //auto& stepDim = _dims->_stepDim; - // Add each scratch equation to a separate group. - // TODO: Allow multiple scratch eqs in a group with same conds & halos. + // Add each scratch equation to a separate bundle. + // TODO: Allow multiple scratch eqs in a bundle with same conds & halos. // TODO: Only add scratch eqs that are needed by grids in 'gridRegex'. for (auto eq : allEqs.getEqs()) { @@ -1021,7 +1015,7 @@ namespace yask { string gname = gp->getName(); // Add equation. 
- addExprToGroup(eq, allEqs.getCond(eq), gname, true, eq_deps); + addExprToBundle(allEqs, eq, gname, true); } } @@ -1029,7 +1023,7 @@ namespace yask { regex gridx(gridRegex); // Handle each key-value pair in 'targets' string. - // Key is eq-group name (with possible format strings); value is regex pattern. + // Key is eq-bundle name (with possible format strings); value is regex pattern. ArgParser ap; ap.parseKeyValuePairs (targets, [&](const string& egfmt, const string& pattern) { @@ -1058,7 +1052,7 @@ namespace yask { string egname = mr.format(egfmt); // Add equation. - addExprToGroup(eq, allEqs.getCond(eq), egname, false, eq_deps); + addExprToBundle(allEqs, eq, egname, false); } }); @@ -1075,11 +1069,11 @@ namespace yask { continue; // Add equation. - addExprToGroup(eq, allEqs.getCond(eq), _basename_default, false, eq_deps); + addExprToBundle(allEqs, eq, _basename_default, false); } - os << "Created " << size() << " equation group(s):\n"; + os << "Created " << size() << " equation bundle(s):\n"; - // Find dependencies between eq-groups based on deps between their eqs. + // Find dependencies between eq-bundles based on deps between their eqs. for (auto& eg1 : *this) { os << " " << eg1.getDescription() << ":\n" " Contains " << eg1.getNumEqs() << " equation(s).\n" @@ -1092,25 +1086,25 @@ namespace yask { } os << ".\n"; - // Check to see if eg1 depends on other eq-groups. + // Check to see if eg1 depends on other eq-bundles. for (auto& eg2 : *this) { // Don't check against self. 
if (eg1.getName() == eg2.getName()) continue; - eg1.checkDeps(allEqs, eq_deps, eg2); + eg1.checkDeps(allEqs, eg2); DepType dt = cur_step_dep; if (eg1.isImmDepOn(dt, eg2)) - os << " Immediately dependent on group " << + os << " Immediately dependent on bundle " << eg2.getName() << ".\n"; else if (eg1.isDepOn(dt, eg2)) - os << " Indirectly dependent on group " << + os << " Indirectly dependent on bundle " << eg2.getName() << ".\n"; } auto& sdeps = eg1.getScratchDeps(); if (sdeps.size()) { - os << " Requires evaluation of the following scratch-grid group(s):"; + os << " Requires evaluation of the following scratch-grid bundle(s):"; for (auto& sname : sdeps) os << " " << sname; os << ".\n"; @@ -1121,8 +1115,8 @@ namespace yask { sort(); } - // Print stats from eqGroups. - void EqGroups::printStats(ostream& os, const string& msg) { + // Print stats from eqBundles. + void EqBundles::printStats(ostream& os, const string& msg) { CounterVisitor cv; for (auto& eq : *this) { CounterVisitor ecv; @@ -1133,15 +1127,15 @@ namespace yask { } // Apply optimizations according to the 'settings'. - void EqGroups::optimizeEqGroups(CompilerSettings& settings, + void EqBundles::optimizeEqBundles(CompilerSettings& settings, const string& descr, bool printSets, ostream& os) { // print stats. - string edescr = "for " + descr + " equation-group(s)"; + string edescr = "for " + descr + " equation-bundle(s)"; printStats(os, edescr); - // Make a list of optimizations to apply to eqGroups. + // Make a list of optimizations to apply to eqBundles. vector opts; // CSE. @@ -1164,7 +1158,7 @@ namespace yask { visitEqs(optimizer); int numChanges = optimizer->getNumChanges(); string odescr = "after applying " + optimizer->getName() + " to " + - descr + " equation-group(s)"; + descr + " equation-bundle(s)"; // Get new stats. if (numChanges) @@ -1173,9 +1167,9 @@ namespace yask { os << "No changes " << odescr << '.' << endl; } - // Final stats per equation group. + // Final stats per equation bundle. 
if (printSets && size() > 1) { - os << "Stats per equation-group:\n"; + os << "Stats per equation-bundle:\n"; for (auto eg : *this) eg.printStats(os, "for " + eg.getDescription()); } diff --git a/src/compiler/lib/Eqs.hpp b/src/compiler/lib/Eqs.hpp index 07f507ea..e005b07e 100644 --- a/src/compiler/lib/Eqs.hpp +++ b/src/compiler/lib/Eqs.hpp @@ -23,7 +23,7 @@ IN THE SOFTWARE. *****************************************************************************/ -///////// Classes for equations and equation groups //////////// +///////// Classes for equations and equation bundles //////////// #ifndef EQS_HPP #define EQS_HPP @@ -123,6 +123,7 @@ namespace yask { EqList _eqs; // just equations w/o conditions. CondMap _conds; // map from equations to their conditions, if any. + EqDepMap _eq_deps; // dependencies between all eqs. EqDeps::DepMap _scratch_deps; // dependencies through scratch grids. public: @@ -158,7 +159,12 @@ namespace yask { return nullptr; } - // Get the scratch-grid eqs that contribute to this eq. + // Get all the deps. + virtual const EqDepMap& getDeps() const { + return _eq_deps; + } + + // Get the scratch-grid eqs that contribute to 'eq'. virtual const EqDeps::EqSet& getScratchDeps(EqualsExprPtr ep) const { return _scratch_deps.at(ep); } @@ -174,7 +180,6 @@ namespace yask { // Find dependencies based on all eqs. If 'eq_deps' is // set, save dependencies between eqs in referent. virtual void findDeps(Dimensions& dims, - EqDepMap* eq_deps, std::ostream& os); // Determine which grid points can be vectorized. @@ -184,37 +189,37 @@ namespace yask { virtual void analyzeLoop(const Dimensions& dims); // Update grid access stats. - virtual void updateGridStats(EqDepMap& eq_deps); + virtual void updateGridStats(); }; - // A named equation group, which contains one or more grid-update equations. - // All equations in a group must have the same condition. + // A named equation bundle, which contains one or more grid-update equations. 
+ // All equations in a bundle must have the same condition. // Equations should not have inter-dependencies because they will be // combined into a single expression. - class EqGroup { + class EqBundle { protected: - EqList _eqs; // expressions in this eqGroup (not including conditions). - Grids _outGrids; // grids updated by this eqGroup. - Grids _inGrids; // grids read from by this eqGroup. + EqList _eqs; // expressions in this eqBundle (not including conditions). + Grids _outGrids; // grids updated by this eqBundle. + Grids _inGrids; // grids read from by this eqBundle. const Dimensions* _dims = 0; bool _isScratch = false; // true if updating temp grid(s). - // Other eq-groups that this group depends on. This means that an - // equation in this group has a grid value on the RHS that appears in + // Other eq-bundles that this bundle depends on. This means that an + // equation in this bundle has a grid value on the RHS that appears in // the LHS of the dependency. map> _imm_dep_on; // immediate deps. map> _dep_on; // immediate and indirect deps. - set _scratch_deps; // scratch groups needed for this group. + set _scratch_deps; // scratch bundles needed for this bundle. public: // TODO: move these into protected section and make accessors. - string baseName; // base name of this eqGroup. + string baseName; // base name of this eqBundle. int index; // index to distinguish repeated names. BoolExprPtr cond; // condition (default is null). // Ctor. - EqGroup(const Dimensions& dims, bool is_scratch) : + EqBundle(const Dimensions& dims, bool is_scratch) : _dims(&dims), _isScratch(is_scratch) { // Create empty map entries. @@ -223,16 +228,16 @@ namespace yask { _dep_on[dt]; } } - virtual ~EqGroup() {} + virtual ~EqBundle() {} - // Add an equation to this group. + // Add an equation to this bundle. virtual void addEq(EqualsExprPtr ee); // Visit all the equations. 
virtual void visitEqs(ExprVisitor* ev) { for (auto& ep : _eqs) { -#ifdef DEBUG_EQ_GROUP - cout << "EqGroup: visiting " << ep->makeQuotedStr() << endl; +#ifdef DEBUG_EQ_BUNDLE + cout << "EqBundle: visiting " << ep->makeQuotedStr() << endl; #endif ep->accept(ev); } @@ -277,16 +282,16 @@ namespace yask { return _inGrids; } - // Get whether this eq-group depends on eg2. + // Get whether this eq-bundle depends on eg2. // Must have already been set via checkDeps(). - virtual bool isImmDepOn(DepType dt, const EqGroup& eq2) const { + virtual bool isImmDepOn(DepType dt, const EqBundle& eq2) const { return _imm_dep_on.at(dt).count(eq2.getName()) > 0; } - virtual bool isDepOn(DepType dt, const EqGroup& eq2) const { + virtual bool isDepOn(DepType dt, const EqBundle& eq2) const { return _dep_on.at(dt).count(eq2.getName()) > 0; } - // Get dependencies on this eq-group. + // Get dependencies on this eq-bundle. virtual const set& getImmDeps(DepType dt) const { return _imm_dep_on.at(dt); } @@ -294,24 +299,24 @@ namespace yask { return _dep_on.at(dt); } - // Get scratch-group dependencies. + // Get scratch-bundle dependencies. virtual const set& getScratchDeps() const { return _scratch_deps; } // Check for and set dependencies on eg2. - virtual void checkDeps(Eqs& allEqs, EqDepMap& eq_deps, const EqGroup& eg2); + virtual void checkDeps(Eqs& allEqs, const EqBundle& eg2); // Replicate each equation at the non-zero offsets for // each vector in a cluster. virtual void replicateEqsInCluster(Dimensions& dims); - // Print stats for the equation(s) in this group. + // Print stats for the equation(s) in this bundle. virtual void printStats(ostream& os, const string& msg); }; - // Container for multiple equation groups. - class EqGroups : public vector { + // Container for multiple equation bundles. + class EqBundles : public vector { protected: // Copy of some global data. @@ -321,29 +326,27 @@ namespace yask { // Track grids that are udpated. 
Grids _outGrids; - // Map to track indices per eq-group name. + // Map to track indices per eq-bundle name. map _indices; // Track equations that have been added already. - set _eqs_in_groups; + set _eqs_in_bundles; - // Add expression 'eq' with condition 'cond' to eq-group with 'baseName' + // Add expression 'eq' from 'eqs' to eq-bundle with 'baseName' // unless alread added. The corresponding index in '_indices' will be - // incremented if a new group is created. - // 'eq_deps': pre-computed dependencies between equations. - // Returns whether a new group was created. - virtual bool addExprToGroup(EqualsExprPtr eq, - BoolExprPtr cond, // may be nullptr. - const string& baseName, - bool is_scratch, - EqDepMap& eq_deps); + // incremented if a new bundle is created. + // Returns whether a new bundle was created. + virtual bool addExprToBundle(Eqs& eqs, + EqualsExprPtr eq, + const string& baseName, + bool is_scratch); public: - EqGroups() {} - EqGroups(const string& basename_default, Dimensions& dims) : + EqBundles() {} + EqBundles(const string& basename_default, Dimensions& dims) : _basename_default(basename_default), _dims(&dims) {} - virtual ~EqGroups() {} + virtual ~EqBundles() {} virtual void set_basename_default(const string& basename_default) { _basename_default = basename_default; @@ -352,24 +355,23 @@ namespace yask { _dims = &dims; } - // Separate a set of equations into eqGroups based + // Separate a set of equations into eqBundles based // on the target string. // Target string is a comma-separated list of key-value pairs, e.g., - // "eqGroup1=foo,eqGroup2=bar". - // In this example, all eqs updating grid names containing 'foo' go in eqGroup1, - // all eqs updating grid names containing 'bar' go in eqGroup2, and - // each remaining eq goes into a separate eqGroup. - void makeEqGroups(Eqs& eqs, - const string& gridRegex, - const string& targets, - EqDepMap& eq_deps, - std::ostream& os); + // "eqBundle1=foo,eqBundle2=bar". 
+ // In this example, all eqs updating grid names containing 'foo' go in eqBundle1, + // all eqs updating grid names containing 'bar' go in eqBundle2, and + // each remaining eq goes into a separate eqBundle. + void makeEqBundles(Eqs& eqs, + const string& gridRegex, + const string& targets, + std::ostream& os); virtual const Grids& getOutputGrids() const { return _outGrids; } - // Visit all the equations in all eqGroups. + // Visit all the equations in all eqBundles. // This will not visit the conditions. virtual void visitEqs(ExprVisitor* ev) { for (auto& eg : *this) @@ -383,26 +385,26 @@ namespace yask { eg.replicateEqsInCluster(dims); } - // Reorder groups based on dependencies. + // Reorder bundles based on dependencies. virtual void sort(); - // Print a list of eqGroups. + // Print a list of eqBundles. virtual void printInfo(ostream& os) const { - os << "Identified stencil equation-groups:" << endl; + os << "Identified stencil equation-bundles:" << endl; for (auto& eq : *this) { for (auto gp : eq.getOutputGrids()) { string eqName = eq.getName(); - os << " Equation group '" << eqName << "' updates grid '" << + os << " Equation bundle '" << eqName << "' updates grid '" << gp->getName() << "'." << endl; } } } - // Print stats for the equation(s) in all groups. + // Print stats for the equation(s) in all bundles. virtual void printStats(ostream& os, const string& msg); // Apply optimizations requested in settings. 
- void optimizeEqGroups(CompilerSettings& settings, + void optimizeEqBundles(CompilerSettings& settings, const string& descr, bool printSets, ostream& os); diff --git a/src/compiler/lib/Grid.cpp b/src/compiler/lib/Grid.cpp index 00522efc..a4bd9e42 100644 --- a/src/compiler/lib/Grid.cpp +++ b/src/compiler/lib/Grid.cpp @@ -304,7 +304,7 @@ namespace yask { if (sz > 1 && first_max_halo == 0 && last_max_halo == 0) sz--; - // TODO: recognize that reading in one eq-group and then writing in + // TODO: recognize that reading in one eq-bundle and then writing in // another can also reuse storage. return sz; diff --git a/src/compiler/lib/Grid.hpp b/src/compiler/lib/Grid.hpp index b253df40..332c2f05 100644 --- a/src/compiler/lib/Grid.hpp +++ b/src/compiler/lib/Grid.hpp @@ -66,7 +66,7 @@ namespace yask { // various step-index values. // bool key: true=left, false=right. // int key: step-dim offset or 0 if no step-dim. - // TODO: keep separate halos for each equation group. + // TODO: keep separate halos for each equation bundle. map> _halos; public: @@ -293,7 +293,7 @@ namespace yask { IntTuple _foldOptions; // vector fold. IntTuple _clusterOptions; // cluster multipliers. bool _firstInner = true; // first dimension of fold is unit step. - string _eq_group_basename_default = "stencil"; + string _eq_bundle_basename_default = "stencil_bundle"; bool _allowUnalignedLoads = false; int _haloSize = 0; // 0 => calculate each halo separately and automatically. int _stepAlloc = 0; // 0 => calculate step allocation automatically. @@ -302,7 +302,7 @@ namespace yask { bool _doCse = true; // do common-subexpr elim. bool _doComb = true; // combine commutative operations. bool _doOptCluster = true; // apply optimizations also to cluster. - string _eqGroupTargets; // how to group equations. + string _eqBundleTargets; // how to bundle equations. string _gridRegex; // grids to update. 
}; diff --git a/src/compiler/lib/Print.cpp b/src/compiler/lib/Print.cpp index 4ab2e5ae..e1164195 100644 --- a/src/compiler/lib/Print.cpp +++ b/src/compiler/lib/Print.cpp @@ -538,11 +538,11 @@ namespace yask { os << "Stencil '" << _stencil.getName() << "' pseudo-code:" << endl; - // Loop through all eqGroups. - for (auto& eq : _eqGroups) { + // Loop through all eqBundles. + for (auto& eq : _eqBundles) { string egName = eq.getName(); - os << endl << " ////// Equation group '" << egName << + os << endl << " ////// Equation bundle '" << egName << "' //////" << endl; CounterVisitor cv; @@ -579,9 +579,9 @@ namespace yask { os << "digraph \"Stencil " << _stencil.getName() << "\" {\n" "rankdir=LR; ranksep=1.5;\n"; - // Loop through all eqGroups. - for (auto& eq : _eqGroups) { - os << "subgraph \"Equation-group " << eq.getName() << "\" {" << endl; + // Loop through all eqBundles. + for (auto& eq : _eqBundles) { + os << "subgraph \"Equation-bundle " << eq.getName() << "\" {" << endl; eq.visitEqs(pv); os << "}" << endl; } @@ -602,8 +602,8 @@ namespace yask { " look_at <0, 0, 0>" << endl << "}" << endl; - // Loop through all eqGroups. - for (auto& eq : _eqGroups) { + // Loop through all eqBundles. + for (auto& eq : _eqBundles) { // TODO: separate mutiple grids. 
POVRayPrintVisitor pv(os); diff --git a/src/compiler/lib/Print.hpp b/src/compiler/lib/Print.hpp index efafd83d..606f3b21 100644 --- a/src/compiler/lib/Print.hpp +++ b/src/compiler/lib/Print.hpp @@ -434,15 +434,15 @@ namespace yask { protected: StencilSolution& _stencil; Grids& _grids; - EqGroups& _eqGroups; + EqBundles& _eqBundles; CompilerSettings& _settings; public: PrinterBase(StencilSolution& stencil, - EqGroups& eqGroups) : + EqBundles& eqBundles) : _stencil(stencil), _grids(stencil.getGrids()), - _eqGroups(eqGroups), + _eqBundles(eqBundles), _settings(stencil.getSettings()) { } virtual ~PrinterBase() { } @@ -476,8 +476,8 @@ namespace yask { public: PseudoPrinter(StencilSolution& stencil, - EqGroups& eqGroups) : - PrinterBase(stencil, eqGroups) { } + EqBundles& eqBundles) : + PrinterBase(stencil, eqBundles) { } virtual ~PseudoPrinter() { } virtual void print(ostream& os); @@ -489,9 +489,9 @@ namespace yask { bool _isSimple; public: - DOTPrinter(StencilSolution& stencil, EqGroups& eqGroups, + DOTPrinter(StencilSolution& stencil, EqBundles& eqBundles, bool isSimple) : - PrinterBase(stencil, eqGroups), + PrinterBase(stencil, eqBundles), _isSimple(isSimple) { } virtual ~DOTPrinter() { } @@ -502,8 +502,8 @@ namespace yask { class POVRayPrinter : public PrinterBase { public: - POVRayPrinter(StencilSolution& stencil, EqGroups& eqGroups) : - PrinterBase(stencil, eqGroups) { } + POVRayPrinter(StencilSolution& stencil, EqBundles& eqBundles) : + PrinterBase(stencil, eqBundles) { } virtual ~POVRayPrinter() { } virtual void print(ostream& os); diff --git a/src/compiler/lib/Soln.cpp b/src/compiler/lib/Soln.cpp index e544a984..a296dcb4 100644 --- a/src/compiler/lib/Soln.cpp +++ b/src/compiler/lib/Soln.cpp @@ -85,27 +85,26 @@ namespace yask { _eqs.analyzeLoop(_dims); // Find dependencies between equations. - EqDepMap eq_deps; - _eqs.findDeps(_dims, &eq_deps, *_dos); + _eqs.findDeps(_dims, *_dos); // Update access stats for the grids. 
- _eqs.updateGridStats(eq_deps); + _eqs.updateGridStats(); - // Create equation groups based on dependencies and/or target strings. - _eqGroups.set_basename_default(_settings._eq_group_basename_default); - _eqGroups.set_dims(_dims); - _eqGroups.makeEqGroups(_eqs, _settings._gridRegex, - _settings._eqGroupTargets, eq_deps, *_dos); - _eqGroups.optimizeEqGroups(_settings, "scalar & vector", false, *_dos); + // Create equation bundles based on dependencies and/or target strings. + _eqBundles.set_basename_default(_settings._eq_bundle_basename_default); + _eqBundles.set_dims(_dims); + _eqBundles.makeEqBundles(_eqs, _settings._gridRegex, + _settings._eqBundleTargets, *_dos); + _eqBundles.optimizeEqBundles(_settings, "scalar & vector", false, *_dos); // Make a copy of each equation at each cluster offset. // We will use these for inter-cluster optimizations and code generation. *_dos << "Constructing cluster of equations containing " << _dims._clusterMults.product() << " vector(s)...\n"; - _clusterEqGroups = _eqGroups; - _clusterEqGroups.replicateEqsInCluster(_dims); + _clusterEqBundles = _eqBundles; + _clusterEqBundles.replicateEqsInCluster(_dims); if (_settings._doOptCluster) - _clusterEqGroups.optimizeEqGroups(_settings, "cluster", true, *_dos); + _clusterEqBundles.optimizeEqBundles(_settings, "cluster", true, *_dos); } // Format in given format-type. @@ -117,21 +116,21 @@ namespace yask { // Data itself will be created in analyze_solution(). 
PrinterBase* printer = 0; if (format_type == "cpp") - printer = new YASKCppPrinter(*this, _eqGroups, _clusterEqGroups, &_dims); + printer = new YASKCppPrinter(*this, _eqBundles, _clusterEqBundles, &_dims); else if (format_type == "knc") - printer = new YASKKncPrinter(*this, _eqGroups, _clusterEqGroups, &_dims); + printer = new YASKKncPrinter(*this, _eqBundles, _clusterEqBundles, &_dims); else if (format_type == "avx" || format_type == "avx2") - printer = new YASKAvx256Printer(*this, _eqGroups, _clusterEqGroups, &_dims); + printer = new YASKAvx256Printer(*this, _eqBundles, _clusterEqBundles, &_dims); else if (format_type == "avx512" || format_type == "avx512f") - printer = new YASKAvx512Printer(*this, _eqGroups, _clusterEqGroups, &_dims); + printer = new YASKAvx512Printer(*this, _eqBundles, _clusterEqBundles, &_dims); else if (format_type == "dot") - printer = new DOTPrinter(*this, _clusterEqGroups, false); + printer = new DOTPrinter(*this, _clusterEqBundles, false); else if (format_type == "dot-lite") - printer = new DOTPrinter(*this, _clusterEqGroups, true); + printer = new DOTPrinter(*this, _clusterEqBundles, true); else if (format_type == "pseudo") - printer = new PseudoPrinter(*this, _clusterEqGroups); + printer = new PseudoPrinter(*this, _clusterEqBundles); else if (format_type == "pov-ray") // undocumented. - printer = new POVRayPrinter(*this, _clusterEqGroups); + printer = new POVRayPrinter(*this, _clusterEqBundles); else { THROW_YASK_EXCEPTION("Error: format-type '" << format_type << "' is not recognized"); @@ -140,7 +139,7 @@ namespace yask { int vlen = printer->num_vec_elems(); bool is_folding_efficient = printer->is_folding_efficient(); - // Set data for equation groups, dims, etc. + // Set data for equation bundles, dims, etc. analyze_solution(vlen, is_folding_efficient); // Create the output. 
diff --git a/src/compiler/lib/Soln.hpp b/src/compiler/lib/Soln.hpp index 0139fafe..20142e69 100644 --- a/src/compiler/lib/Soln.hpp +++ b/src/compiler/lib/Soln.hpp @@ -72,8 +72,8 @@ namespace yask { // Intermediate data needed to format output. Dimensions _dims; // various dimensions. - EqGroups _eqGroups; // eq-groups for scalar and vector. - EqGroups _clusterEqGroups; // eq-groups for scalar and vector. + EqBundles _eqBundles; // eq-bundles for scalar and vector. + EqBundles _clusterEqBundles; // eq-bundles for scalar and vector. // Create the intermediate data. void analyze_solution(int vlen, diff --git a/src/compiler/lib/YaskKernel.cpp b/src/compiler/lib/YaskKernel.cpp index ba1a204b..4088fd9b 100644 --- a/src/compiler/lib/YaskKernel.cpp +++ b/src/compiler/lib/YaskKernel.cpp @@ -41,7 +41,7 @@ namespace yask { } // Print an expression as a one-line C++ comment. - void YASKCppPrinter::addComment(ostream& os, EqGroup& eq) { + void YASKCppPrinter::addComment(ostream& os, EqBundle& eq) { // Use a simple human-readable visitor to create a comment. PrintHelper ph(_dims, NULL, "temp", "", " // ", ".\n"); @@ -68,8 +68,8 @@ namespace yask { // First, create a class to hold the data (grids). printData(os); - // A struct for each equation group. - printEqGroups(os); + // A struct for each equation bundle. + printEqBundles(os); // Finish the context. printContext(os); @@ -153,7 +153,7 @@ namespace yask { // get stats. 
CounterVisitor cve; - _eqGroups.visitEqs(&cve); + _eqBundles.visitEqs(&cve); os << endl << " ////// Stencil-specific data //////" << endl << "class " << _context_base << " : public StencilContext {\n" @@ -185,7 +185,7 @@ namespace yask { os << " '" << grid << "', which is "; if (gp->isScratch()) os << " a scratch variable.\n"; - else if (_eqGroups.getOutputGrids().count(gp)) + else if (_eqBundles.getOutputGrids().count(gp)) os << "updated by one or more equations.\n"; else os << "not updated by any equation (read-only).\n"; @@ -361,7 +361,7 @@ namespace yask { ctorCode += initCode; ctorCode += " " + grid + " = " + grid + "_ptr.get();\n"; ctorCode += " addGrid(" + grid + "_ptr, "; - if (_eqGroups.getOutputGrids().count(gp)) + if (_eqBundles.getOutputGrids().count(gp)) ctorCode += "true /* is an output grid */"; else ctorCode += "false /* is not an output grid */"; @@ -431,25 +431,25 @@ namespace yask { os << "}; // " << _context_base << endl; } - // Print YASK equation groups. - void YASKCppPrinter::printEqGroups(ostream& os) { + // Print YASK equation bundles. + void YASKCppPrinter::printEqBundles(ostream& os) { - for (size_t ei = 0; ei < _eqGroups.size(); ei++) { + for (size_t ei = 0; ei < _eqBundles.size(); ei++) { - // Scalar eqGroup. - auto& eq = _eqGroups.at(ei); + // Scalar eqBundle. + auto& eq = _eqBundles.at(ei); string egName = eq.getName(); string egDesc = eq.getDescription(); - string egsName = "StencilGroup_" + egName; + string egsName = "StencilBundle_" + egName; os << endl << " ////// Stencil " << egDesc << " //////\n" << - "\n class " << egsName << " : public StencilGroupBase {\n" + "\n class " << egsName << " : public StencilBundleBase {\n" " protected:\n" " typedef " << _context_base << " _context_type;\n" " _context_type* _context = 0;\n" " public:\n"; - // Stats for this eqGroup. + // Stats for this eqBundle. 
CounterVisitor stats; eq.visitEqs(&stats); @@ -457,10 +457,10 @@ namespace yask { os << endl << " // " << stats.getNumOps() << " FP operation(s) per point:" << endl; addComment(os, eq); - // Stencil-group ctor. + // Stencil-bundle ctor. { os << " " << egsName << "(" << _context_base << "* context) :\n" - " StencilGroupBase(context),\n" + " StencilBundleBase(context),\n" " _context(context) {\n" " _name = \"" << egName << "\";\n" " _scalar_fp_ops = " << stats.getNumOps() << ";\n" @@ -529,13 +529,13 @@ namespace yask { // Vector/Cluster code. for (int do_cluster = 0; do_cluster <= 1; do_cluster++) { - // Cluster eqGroup at same 'ei' index. - // This should be the same eq-group because it was copied from the + // Cluster eqBundle at same 'ei' index. + // This should be the same eq-bundle because it was copied from the // scalar one. - auto& vceq = do_cluster ? _clusterEqGroups.at(ei) : _eqGroups.at(ei); + auto& vceq = do_cluster ? _clusterEqBundles.at(ei) : _eqBundles.at(ei); assert(egDesc == vceq.getDescription()); - // Create vector info for this eqGroup. + // Create vector info for this eqBundle. // The visitor is accepted at all nodes in the cluster AST; // for each grid access node in the AST, the vectors // needed are determined and saved in the visitor. @@ -638,7 +638,7 @@ namespace yask { os << "}; // " << egsName << ".\n"; // end of class. - } // stencil eqGroups. + } // stencil eqBundles. } // Print final YASK context. @@ -647,36 +647,36 @@ namespace yask { os << endl << " ////// Overall stencil-specific context //////" << endl << "struct " << _context << " : public " << _context_base << " {" << endl; - // Stencil eqGroup objects. - os << endl << " // Stencil equation-groups." << endl; - for (auto& eg : _eqGroups) { + // Stencil eqBundle objects. + os << endl << " // Stencil equation-bundles." 
<< endl; + for (auto& eg : _eqBundles) { string egName = eg.getName(); - string sgName = "stencilGroup_" + egName; - os << " StencilGroup_" << egName << " " << sgName << ";" << endl; + string sgName = "stencilBundle_" + egName; + os << " StencilBundle_" << egName << " " << sgName << ";" << endl; } // Ctor. os << "\n // Constructor.\n" << " " << _context << "(KernelEnvPtr env, KernelSettingsPtr settings) : " << _context_base << "(env, settings)"; - for (auto& eg : _eqGroups) { + for (auto& eg : _eqBundles) { string egName = eg.getName(); - string sgName = "stencilGroup_" + egName; + string sgName = "stencilBundle_" + egName; os << ",\n " << sgName << "(this)"; } os << " {\n"; - // Push eq-group pointers to list. - os << "\n // Stencil groups.\n"; - for (auto& eg : _eqGroups) { + // Push eq-bundle pointers to list. + os << "\n // Stencil bundles.\n"; + for (auto& eg : _eqBundles) { string egName = eg.getName(); - string sgName = "stencilGroup_" + egName; - os << " stGroups.push_back(&" << sgName << ");\n"; + string sgName = "stencilBundle_" + egName; + os << " stBundles.push_back(&" << sgName << ");\n"; - // Add other-group deps. + // Add other-bundle deps. for (DepType dt = DepType(0); dt < num_deps; dt = DepType(dt+1)) { for (auto& dep : eg.getDeps(dt)) { - string depName = "stencilGroup_" + dep; + string depName = "stencilBundle_" + dep; string dtName = (dt == cur_step_dep) ? "cur_step_dep" : (dt == prev_step_dep) ? "prev_step_dep" : "internal_error"; @@ -686,17 +686,17 @@ namespace yask { } } - // Add scratch-group deps in proper order. + // Add scratch-bundle deps in proper order. auto& sdeps = eg.getScratchDeps(); - for (auto& eg2 : _eqGroups) { + for (auto& eg2 : _eqBundles) { string eg2Name = eg2.getName(); - string sg2Name = "stencilGroup_" + eg2Name; + string sg2Name = "stencilBundle_" + eg2Name; if (sdeps.count(eg2Name)) os << " " << sgName << ".add_scratch_dep(&" << sg2Name << ");\n"; } - } // eq-groups. + } // eq-bundles. 
os << " } // Ctor.\n"; // Dims creator. diff --git a/src/compiler/main.cpp b/src/compiler/main.cpp index 06905a96..a1c43ea4 100644 --- a/src/compiler/main.cpp +++ b/src/compiler/main.cpp @@ -81,21 +81,21 @@ void usage(const string& cmd) { " -grids \n" " Only process updates to grids whose names match .\n" " This can be used to generate code for a subset of the stencil equations.\n" - " -eq-groups =,...\n" - " Put updates to grids matching in equation-group with base-name .\n" - " By default, eq-groups are created as needed based on dependencies between equations:\n" - " equations that do not depend on each other are grouped together into groups with the\n" - " base-name '" << settings._eq_group_basename_default << "'.\n" - " Each eq-group base-name is appended with a unique index number, so the default group\n" - " names are '" << settings._eq_group_basename_default << "_0', " << - settings._eq_group_basename_default << "_1', etc.\n" - " This option allows more control over this grouping.\n" - " Example: \"-eq-groups a=foo,b=b[aeiou]r\" creates one or more eq-groups named 'a_0', 'a_1', etc.\n" - " containing updates to each grid whose name contains 'foo' and one or more eq-groups\n" + " -eq-bundles =,...\n" + " Put updates to grids matching in equation-bundle with base-name .\n" + " By default, eq-bundles are created as needed based on dependencies between equations:\n" + " equations that do not depend on each other are bundled together into bundles with the\n" + " base-name '" << settings._eq_bundle_basename_default << "'.\n" + " Each eq-bundle base-name is appended with a unique index number, so the default bundle\n" + " names are '" << settings._eq_bundle_basename_default << "_0', " << + settings._eq_bundle_basename_default << "_1', etc.\n" + " This option allows more control over this bundling.\n" + " Example: \"-eq-bundles a=foo,b=b[aeiou]r\" creates one or more eq-bundles named 'a_0', 'a_1', etc.\n" + " containing updates to each grid whose name contains 
'foo' and one or more eq-bundles\n" " named 'b_0', 'b_1', etc. containing updates to each grid whose name matches 'b[aeiou]r'.\n" " Standard regex-format tokens in will be replaced based on matches to .\n" - " Example: \"-eq-groups 'g_$&=b[aeiou]r'\" with grids 'bar_x', 'bar_y', 'ber_x', and 'ber_y'\n" - " would create eq-group 'g_bar_0' for grids 'bar_x' and 'bar_y' and eq-group 'g_ber_0' for\n" + " Example: \"-eq-bundles 'g_$&=b[aeiou]r'\" with grids 'bar_x', 'bar_y', 'ber_x', and 'ber_y'\n" + " would create eq-bundle 'g_bar_0' for grids 'bar_x' and 'bar_y' and eq-bundle 'g_ber_0' for\n" " grids 'ber_x' and 'ber_y' because '$&' is substituted by the string that matches the regex.\n" " -step-alloc \n" " Specify the size of the step-dimension memory allocation.\n" @@ -202,8 +202,8 @@ void parseOpts(int argc, const char* argv[]) solutionName = argop; else if (opt == "-grids") settings._gridRegex = argop; - else if (opt == "-eq-groups") - settings._eqGroupTargets = argop; + else if (opt == "-eq-bundles") + settings._eqBundleTargets = argop; else if (opt == "-fold" || opt == "-cluster") { // example: x=4,y=2 diff --git a/src/compiler/swig/yask_compiler_api.i b/src/compiler/swig/yask_compiler_api.i index a5d1f0e5..e9c3c5e1 100644 --- a/src/compiler/swig/yask_compiler_api.i +++ b/src/compiler/swig/yask_compiler_api.i @@ -40,6 +40,7 @@ IN THE SOFTWARE. // Must declare shared_ptrs for the entire expr_node hierarchy! %shared_ptr(yask::yc_solution) //%shared_ptr(yask::yc_grid) +%shared_ptr(yask::yc_equation_group) %shared_ptr(yask::yc_expr_node) %shared_ptr(yask::yc_index_node) %shared_ptr(yask::yc_equation_node) diff --git a/src/kernel/Makefile b/src/kernel/Makefile index 4e37b3ef..fb130302 100644 --- a/src/kernel/Makefile +++ b/src/kernel/Makefile @@ -269,7 +269,7 @@ NDIMS_OPT := `cat $(YK_DIMS_FILE)` RANK_LOOP_OPTS ?= $(NDIMS_OPT) -inVar rank_idxs RANK_LOOP_ORDER ?= 1 .. 
N-1 RANK_LOOP_CODE ?= $(RANK_LOOP_OUTER_MODS) loop($(RANK_LOOP_ORDER)) \ - { $(RANK_LOOP_INNER_MODS) call(calc_region(stGroup_ptr)); } + { $(RANK_LOOP_INNER_MODS) call(calc_region(stBundle_ptr)); } # Region loops break up a region using OpenMP threading into blocks. The # 'omp' modifier creates an outer OpenMP loop so that each block is assigned @@ -326,7 +326,7 @@ MISC_LOOP_CODE ?= $(MISC_LOOP_OUTER_MODS) loop($(MISC_LOOP_ORDER)) \ # Flags passed to stencil compiler. YC_FLAGS += -stencil $(stencil) -elem-bytes $(real_bytes) -cluster $(cluster) -fold $(fold) ifneq ($(eqs),) - YC_FLAGS += -eq-groups $(eqs) + YC_FLAGS += -eq-bundles $(eqs) endif ifneq ($(radius),) YC_FLAGS += -radius $(radius) diff --git a/src/kernel/lib/context.cpp b/src/kernel/lib/context.cpp index 66c2c957..7524b41f 100644 --- a/src/kernel/lib/context.cpp +++ b/src/kernel/lib/context.cpp @@ -146,7 +146,7 @@ namespace yask { ///// Top-level methods for evaluating reference and optimized stencils. - // Eval stencil group(s) over grid(s) using reference scalar code. + // Eval stencil bundle(s) over grid(s) using reference scalar code. void StencilContext::calc_rank_ref() { run_time.start(); @@ -223,11 +223,11 @@ namespace yask { rank_idxs.stop[step_posn] = stop_t; rank_idxs.step[step_posn] = step_t; - // Loop thru groups. + // Loop thru bundles. // For this reference-code implementation, we - // will do all stencil groups at this level, + // will do all stencil bundles at this level, // even scratch-grid ones. - for (auto* sg : stGroups) { + for (auto* sg : stBundles) { // Exchange all dirty halos. exchange_halos_all(); @@ -239,7 +239,7 @@ namespace yask { // Define misc-loop function. Since step is always 1, we // ignore misc_stop. If point is in sub-domain for this - // group, then evaluate the reference scalar code. + // bundle, then evaluate the reference scalar code. 
#define misc_fn(misc_idxs) do { \ if (sg->is_in_valid_domain(misc_idxs.start)) \ sg->calc_scalar(scratch_grid_idx, misc_idxs.start); \ @@ -247,17 +247,17 @@ namespace yask { // Scan through n-D space. TRACE_MSG("calc_rank_ref: step " << start_t << - " in group '" << sg->get_name() << "': " << + " in bundle '" << sg->get_name() << "': " << misc_idxs.begin.makeValStr(ndims) << " ... (end before) " << misc_idxs.end.makeValStr(ndims)); #include "yask_misc_loops.hpp" #undef misc_fn - // Remember grids that have been written to by this group, + // Remember grids that have been written to by this bundle, // updated at next step (+/- 1). mark_grids_dirty(start_t + step_t, stop_t + step_t, *sg); - } // groups. + } // bundles. } // iterations. // Final halo exchange. @@ -266,7 +266,7 @@ namespace yask { run_time.stop(); } - // Eval stencil group(s) over grid(s) using optimized code. + // Eval stencil bundle(s) over grid(s) using optimized code. void StencilContext::run_solution(idx_t first_step_index, idx_t last_step_index) { @@ -320,8 +320,8 @@ namespace yask { // Extend end points for overlapping regions due to wavefront angle. // For each subsequent time step in a region, the spatial location // of each block evaluation is shifted by the angle for each - // stencil-group. So, the total shift in a region is the angle * num - // stencils * num timesteps. This assumes all groups + // stencil-bundle. So, the total shift in a region is the angle * num + // stencils * num timesteps. This assumes all bundles // are inter-dependent to find maximum extension. Actual required // extension may be less, but this will just result in some calls to // calc_region() that do nothing. @@ -388,44 +388,44 @@ namespace yask { rank_idxs.stop[step_posn] = stop_t; rank_idxs.step[step_posn] = step_t; - // If no wave-fronts (default), loop through groups here, and do - // only one group at a time in calc_region(). 
This is similar to + // If no wave-fronts (default), loop through bundles here, and do + // only one bundle at a time in calc_region(). This is similar to // loop in calc_rank_ref(). if (step_t == 1) { - for (auto* sg : stGroups) { + for (auto* sg : stBundles) { // Don't do scratch updates here. if (sg->is_scratch()) continue; - // Exchange halo(s) needed for this group. + // Exchange halo(s) needed for this bundle. exchange_halos(start_t, stop_t, *sg); - // Eval this group in calc_region(). - StencilGroupSet stGroup_set; - stGroup_set.insert(sg); - StencilGroupSet* stGroup_ptr = &stGroup_set; + // Eval this bundle in calc_region(). + StencilBundleSet stBundle_set; + stBundle_set.insert(sg); + StencilBundleSet* stBundle_ptr = &stBundle_set; // Include automatically-generated loop code that calls // calc_region() for each region. TRACE_MSG("run_solution: step " << start_t << - " in group '" << sg->get_name() << "'"); + " in bundle '" << sg->get_name() << "'"); #include "yask_rank_loops.hpp" } } - // If doing wave-fronts, must loop through all groups in + // If doing wave-fronts, must loop through all bundles in // calc_region(). - // TODO: make this the only case, allowing all groups to be done + // TODO: make this the only case, allowing all bundles to be done // between MPI exchanges, even w/o wave-fronts. else { // Exchange all dirty halo(s). exchange_halos_all(); - // Eval all stencil groups. - StencilGroupSet* stGroup_ptr = NULL; + // Eval all stencil bundles. + StencilBundleSet* stBundle_ptr = NULL; // Include automatically-generated loop code that calls calc_region() for each region. TRACE_MSG("run_solution: steps " << start_t << " ... (end before) " << stop_t); @@ -442,9 +442,6 @@ namespace yask { } // step loop. - // Final halo exchange. - exchange_halos_all(); - #ifdef MODEL_CACHE // Print cache stats, then disable. // Thus, cache is only modeled for first call. 
@@ -477,7 +474,7 @@ namespace yask { // Each region is typically computed in a separate OpenMP 'for' region. // In it, we loop over the time steps and the stencils // and evaluate the blocks in the region. - void StencilContext::calc_region(StencilGroupSet* stGroup_set, + void StencilContext::calc_region(StencilBundleSet* stBundle_set, const ScanIndices& rank_idxs) { int ndims = _dims->_stencil_dims.size(); @@ -523,18 +520,18 @@ namespace yask { region_idxs.start[step_posn] = start_t; region_idxs.stop[step_posn] = stop_t; - // Stencil groups to evaluate at this time step. - for (auto* sg : stGroups) { + // Stencil bundles to evaluate at this time step. + for (auto* sg : stBundles) { // Don't do scratch updates here. if (sg->is_scratch()) continue; - // Group not selected. - if (stGroup_set && !stGroup_set->count(sg)) + // Bundle not selected. + if (stBundle_set && !stBundle_set->count(sg)) continue; - TRACE_MSG("calc_region: stencil-group '" << sg->get_name() << "' w/BB " << + TRACE_MSG("calc_region: stencil-bundle '" << sg->get_name() << "' w/BB " << sg->bb_begin.makeDimValStr() << " ... (end before) " << sg->bb_end.makeDimValStr()); @@ -544,7 +541,7 @@ namespace yask { // each time-step, the parallelogram may be trimmed // based on the BB and WF extensions outside of the rank-BB. - // Actual region boundaries must stay within [extended] BB for this group. + // Actual region boundaries must stay within [extended] BB for this bundle. bool ok = true; for (int i = 0; i < ndims; i++) { if (i == step_posn) continue; @@ -587,7 +584,7 @@ namespace yask { // contains the outer OpenMP loop(s). #include "yask_region_loops.hpp" - // Remember grids that have been written to by this group, + // Remember grids that have been written to by this bundle, // updated at next step (+/- 1). mark_grids_dirty(start_t + step_t, stop_t + step_t, *sg); } @@ -596,7 +593,7 @@ namespace yask { // implement temporal wavefront. 
Between regions, we only shift // backward, so region loops must strictly increment. They may do // so in any order. TODO: shift only what is needed by - // this group, not the global max. + // this bundle, not the global max. for (int i = 0; i < ndims; i++) { if (i == step_posn) continue; auto& dname = _dims->_stencil_dims.getDimName(i); @@ -606,7 +603,7 @@ namespace yask { stop[i] -= angle; } - } // stencil groups. + } // stencil bundles. } // time. } // calc_region. @@ -1145,7 +1142,7 @@ namespace yask { // based on the grids' halos. update_grids(); - // Determine bounding-boxes for all groups. + // Determine bounding-boxes for all bundles. // This must be done after finding WF extensions. find_bounding_boxes(); @@ -1324,7 +1321,7 @@ namespace yask { // Determine size of exchange. This will be the actual halo size // plus any wave-front extensions. In the current implementation, // we need the wave-front extensions regardless of whether there - // is a halo on a given grid. This is because each stencil-group + // is a halo on a given grid. This is because each stencil-bundle // gets shifted by the WF angles at each step in the WF. // Neighbor is to the left. @@ -1810,7 +1807,7 @@ namespace yask { if (wf_steps > 1) // TODO: don't shift for scratch grids. - num_wf_shifts = max((idx_t(stGroups.size()) * wf_steps) - 1, idx_t(0)); + num_wf_shifts = max((idx_t(stBundles.size()) * wf_steps) - 1, idx_t(0)); for (auto& dim : _dims->_domain_dims.getDims()) { auto& dname = dim.getName(); auto rksize = _opts->_rank_sizes[dname]; @@ -1941,13 +1938,11 @@ namespace yask { } } #endif - // Some grid stats. os << endl; os << "Num grids: " << gridPtrs.size() << endl; os << "Num grids to be updated: " << outputGridPtrs.size() << endl; - os << "Num stencil groups: " << stGroups.size() << endl; // Set up data based on MPI rank, including grid positions. // Update all the grid sizes. @@ -2013,12 +2008,12 @@ namespace yask { } os << endl; - // sums across groups for this rank. 
+ // sums across bundles for this rank. rank_numWrites_1t = 0; rank_reads_1t = 0; rank_numFpOps_1t = 0; - os << "Num equation-groups: " << stGroups.size() << endl; - for (auto* sg : stGroups) { + os << "Num stencil bundles: " << stBundles.size() << endl; + for (auto* sg : stBundles) { idx_t updates1 = sg->get_scalar_points_written(); idx_t updates_domain = updates1 * sg->bb_num_points; rank_numWrites_1t += updates_domain; @@ -2028,7 +2023,7 @@ namespace yask { idx_t fpops1 = sg->get_scalar_fp_ops(); idx_t fpops_domain = fpops1 * sg->bb_num_points; rank_numFpOps_1t += fpops_domain; - os << "Stats for equation-group '" << sg->get_name() << "':\n" << + os << "Stats for bundle '" << sg->get_name() << "':\n" << " sub-domain: " << sg->bb_begin.makeDimValStr() << " ... " << sg->bb_end.subElements(1).makeDimValStr() << endl << " sub-domain size: " << sg->bb_len.makeDimValStr(" * ") << endl << @@ -2110,9 +2105,9 @@ namespace yask { "Notes:\n" " Domain-sizes and overall-problem-sizes are based on rank-domain sizes\n" " and number of ranks regardless of number of grids or sub-domains.\n" - " Num-writes-required is based on sum of grid-updates in sub-domain across stencil-group(s).\n" - " Num-reads-required is based on sum of grid-reads in sub-domain across stencil-group(s).\n" - " Est-FP-ops are based on sum of est-FP-ops in sub-domain across stencil-group(s).\n" + " Num-writes-required is based on sum of grid-updates in sub-domain across stencil-bundle(s).\n" + " Num-reads-required is based on sum of grid-reads in sub-domain across stencil-bundle(s).\n" + " Est-FP-ops are based on sum of est-FP-ops in sub-domain across stencil-bundle(s).\n" "\n"; } @@ -2167,6 +2162,9 @@ namespace yask { // Dealloc grids, etc. void StencilContext::end_solution() { + // Final halo exchange. + exchange_halos_all(); + // Release any MPI data. mpiData.clear(); @@ -2268,7 +2266,7 @@ namespace yask { bb_valid = true; } - // Set the bounding-box for each stencil-group and whole domain. 
+ // Set the bounding-box for each stencil-bundle and whole domain. void StencilContext::find_bounding_boxes() { ostream& os = get_ostr(); @@ -2283,13 +2281,13 @@ namespace yask { ext_bb.bb_end = rank_bb.bb_end.addElements(right_wf_exts); ext_bb.update_bb(os, "extended-rank", *this, true); - // Find BB for each group. - for (auto sg : stGroups) + // Find BB for each bundle. + for (auto sg : stBundles) sg->find_bounding_box(); } // Exchange dirty halo data for all grids and all steps, regardless - // of their stencil-group. + // of their stencil-bundle. void StencilContext::exchange_halos_all() { #ifdef USE_MPI @@ -2305,8 +2303,8 @@ namespace yask { } } - // Initial halo exchange for each group. - for (auto* sg : stGroups) { + // Initial halo exchange for each bundle. + for (auto* sg : stBundles) { // Do exchange over max steps. exchange_halos(start, stop, *sg); @@ -2314,17 +2312,17 @@ namespace yask { #endif } - // Exchange halo data needed by stencil-group 'sg' at the given time. + // Exchange halo data needed by stencil-bundle 'sg' at the given time. // Data is needed for input grids that have not already been updated. // [BIG] TODO: overlap halo exchange with computation. - void StencilContext::exchange_halos(idx_t start, idx_t stop, StencilGroupBase& sg) + void StencilContext::exchange_halos(idx_t start, idx_t stop, StencilBundleBase& sg) { #ifdef USE_MPI if (!enable_halo_exchange || _env->num_ranks < 2) return; mpi_time.start(); TRACE_MSG("exchange_halos: " << start << " ... (end before) " << stop << - " for eq-group '" << sg.get_name() << "'"); + " for stencil-bundle '" << sg.get_name() << "'"); auto opts = get_settings(); auto& sd = _dims->_step_dim; @@ -2358,7 +2356,7 @@ namespace yask { else if (halo_step == halo_unpack) TRACE_MSG("exchange_halos: unpacking data for step " << t << "..."); - // Loop thru all input grids in this group. + // Loop thru all input grids in this bundle. 
for (size_t gi = 0; gi < sg.inputGridPtrs.size(); gi++) { auto gp = sg.inputGridPtrs[gi]; MPI_Request* grid_recv_reqs = recv_reqs[gi]; @@ -2519,10 +2517,10 @@ namespace yask { #endif } - // Mark grids that have been written to by stencil-group 'sg'. + // Mark grids that have been written to by stencil-bundle 'sg'. // TODO: only mark grids that are written to in their halo-read area. // TODO: add index for misc dim(s). - void StencilContext::mark_grids_dirty(idx_t start, idx_t stop, StencilGroupBase& sg) { + void StencilContext::mark_grids_dirty(idx_t start, idx_t stop, StencilBundleBase& sg) { idx_t step = (start < stop) ? 1 : -1; for (auto gp : sg.outputGridPtrs) { for (idx_t t = start; t != stop; t += step) { diff --git a/src/kernel/lib/context.hpp b/src/kernel/lib/context.hpp index a329dfe7..7e436776 100644 --- a/src/kernel/lib/context.hpp +++ b/src/kernel/lib/context.hpp @@ -94,9 +94,9 @@ namespace yask { }; // Collections of things in a context. - class StencilGroupBase; - typedef std::vector StencilGroupList; - typedef std::set StencilGroupSet; + class StencilBundleBase; + typedef std::vector StencilBundleList; + typedef std::set StencilBundleSet; typedef std::map GridPtrMap; // Data and hierarchical sizes. @@ -155,10 +155,10 @@ namespace yask { // If WFs are not used, this is the same as rank_bb; BoundingBox ext_bb; - // List of all stencil groups in the order in which + // List of all stencil bundles in the order in which // they should be evaluated within a step. // TODO: use dependency info, allowing more parallelism. - StencilGroupList stGroups; + StencilBundleList stBundles; // All grids. GridPtrs gridPtrs; @@ -189,7 +189,7 @@ namespace yask { // 'tot_' prefix indicates over all ranks. // 'domain' indicates points in domain-size specified on cmd-line. // 'numpts' indicates points actually calculated in sub-domains. - // 'reads' indicates points actually read by stencil-groups. + // 'reads' indicates points actually read by stencil-bundles. 
// 'numFpOps' indicates est. number of FP ops. // 'nbytes' indicates number of bytes allocated. // '_1t' suffix indicates work for one time-step. @@ -518,20 +518,20 @@ namespace yask { // rank-domain loops; the actual begin_r* and end_r* values for the // region are derived from these. TODO: create a public interface // w/a more logical index ordering. - virtual void calc_region(StencilGroupSet* stGroup_set, + virtual void calc_region(StencilBundleSet* stBundle_set, const ScanIndices& rank_idxs); - // Exchange all dirty halo data for all stencil groups + // Exchange all dirty halo data for all stencil bundles // and max number of steps for each grid. virtual void exchange_halos_all(); - // Exchange halo data needed by stencil-group 'sg' at the given step(s). - virtual void exchange_halos(idx_t start, idx_t stop, StencilGroupBase& sg); + // Exchange halo data needed by stencil-bundle 'sg' at the given step(s). + virtual void exchange_halos(idx_t start, idx_t stop, StencilBundleBase& sg); - // Mark grids that have been written to by group 'sg'. - virtual void mark_grids_dirty(idx_t start, idx_t stop, StencilGroupBase& sg); + // Mark grids that have been written to by bundle 'sg'. + virtual void mark_grids_dirty(idx_t start, idx_t stop, StencilBundleBase& sg); - // Set the bounding-box around all eq groups. + // Set the bounding-box around all stencil bundles. virtual void find_bounding_boxes(); // Make new scratch grids. diff --git a/src/kernel/lib/stencil_calc.cpp b/src/kernel/lib/stencil_calc.cpp index fddf9e88..c3c6b287 100644 --- a/src/kernel/lib/stencil_calc.cpp +++ b/src/kernel/lib/stencil_calc.cpp @@ -31,8 +31,8 @@ namespace yask { // Calculate results within a block. // Typically called by a top-level OMP thread. // It is here that any required scratch-grid stencils are evaluated - // first and then the non-scratch stencils in the stencil group. 
- void StencilGroupBase::calc_block(const ScanIndices& region_idxs) { + // first and then the non-scratch stencils in the stencil bundle. + void StencilBundleBase::calc_block(const ScanIndices& region_idxs) { auto opts = _generic_context->get_settings(); auto dims = _generic_context->get_dims(); @@ -40,7 +40,7 @@ namespace yask { auto& step_dim = dims->_step_dim; int thread_idx = omp_get_thread_num(); // used to index the scratch grids. TRACE_MSG3("calc_block:" << - " in non-scratch group '" << get_name() << "': " << + " in non-scratch bundle '" << get_name() << "': " << region_idxs.start.makeValStr(ndims) << " ... (end before) " << region_idxs.stop.makeValStr(ndims) << " by thread " << thread_idx); @@ -56,12 +56,12 @@ namespace yask { // Groups in block loops are based on sub-block-group sizes. def_block_idxs.group_size = opts->_sub_block_group_sizes; - // Update offsets of scratch grids based on this group's location. + // Update offsets of scratch grids based on this bundle's location. _generic_context->update_scratch_grids(thread_idx, def_block_idxs); - // Define the groups that need to be processed in + // Define the bundles that need to be processed in // this block. This will be the prerequisite scratch-grid - // groups plus this non-scratch group. + // bundles plus this non-scratch bundle. auto sg_list = get_scratch_deps(); sg_list.push_back(this); @@ -70,7 +70,7 @@ namespace yask { // This should be nested within a top-level OpenMP task. _generic_context->set_block_threads(); - // Loop through all the needed groups. + // Loop through all the needed bundles. for (auto* sg : sg_list) { // Indices needed for the generated loops. Will normally be a @@ -78,7 +78,7 @@ namespace yask { ScanIndices block_idxs = sg->adjust_scan(thread_idx, def_block_idxs); TRACE_MSG3("calc_block: " << - " in group '" << sg->get_name() << "': " << + " in bundle '" << sg->get_name() << "': " << block_idxs.begin.makeValStr(ndims) << " ... 
(end before) " << block_idxs.end.makeValStr(ndims) << " by thread " << thread_idx); @@ -93,7 +93,7 @@ namespace yask { // Normalize the indices, i.e., divide by vector len in each dim. // Ranks offsets must already be subtracted. // Each dim in 'orig' must be a multiple of corresponding vec len. - void StencilGroupBase::normalize_indices(const Indices& orig, Indices& norm) const { + void StencilBundleBase::normalize_indices(const Indices& orig, Indices& norm) const { auto* cp = _generic_context; auto dims = cp->get_dims(); int nsdims = dims->_stencil_dims.size(); @@ -124,7 +124,7 @@ namespace yask { // The index ranges in 'block_idxs' are sub-divided // into full vector-clusters, full vectors, and sub-vectors // and finally evaluated by the YASK-compiler-generated loops. - void StencilGroupBase::calc_sub_block(int thread_idx, + void StencilBundleBase::calc_sub_block(int thread_idx, const ScanIndices& block_idxs) { auto* cp = _generic_context; auto opts = cp->get_settings(); @@ -134,7 +134,7 @@ namespace yask { auto& step_dim = dims->_step_dim; auto step_posn = Indices::step_posn; TRACE_MSG3("calc_sub_block:" << - " in group '" << get_name() << "': " << + " in bundle '" << get_name() << "': " << block_idxs.start.makeValStr(nsdims) << " ... (end before) " << block_idxs.stop.makeValStr(nsdims)); @@ -467,7 +467,7 @@ namespace yask { // Define misc-loop function. // If point is in sub-domain for this - // group, then evaluate the reference scalar code. + // bundle, then evaluate the reference scalar code. // If no holes, don't need to check each point in domain. // Since step is always 1, we ignore misc_idxs.stop. #define misc_fn(misc_idxs) do { \ @@ -505,7 +505,7 @@ namespace yask { // The 'loop_idxs' must specify a range only in the inner dim. // Indices must be rank-relative. // Indices must be normalized, i.e., already divided by VLEN_*. 
- void StencilGroupBase::calc_loop_of_clusters(int thread_idx, + void StencilBundleBase::calc_loop_of_clusters(int thread_idx, const ScanIndices& loop_idxs) { auto* cp = _generic_context; auto dims = cp->get_dims(); @@ -541,7 +541,7 @@ namespace yask { // The 'loop_idxs' must specify a range only in the inner dim. // Indices must be rank-relative. // Indices must be normalized, i.e., already divided by VLEN_*. - void StencilGroupBase::calc_loop_of_vectors(int thread_idx, + void StencilBundleBase::calc_loop_of_vectors(int thread_idx, const ScanIndices& loop_idxs, idx_t write_mask) { auto* cp = _generic_context; @@ -571,11 +571,11 @@ namespace yask { calc_loop_of_vectors(thread_idx, start_idxs, stop_inner, write_mask); } - // If this group is updating scratch grid(s), + // If this bundle is updating scratch grid(s), // expand indices to calculate values in halo. // This will often change vec-len aligned indices to non-aligned. // Return adjusted indices. - ScanIndices StencilGroupBase::adjust_scan(int thread_idx, const ScanIndices& idxs) const { + ScanIndices StencilBundleBase::adjust_scan(int thread_idx, const ScanIndices& idxs) const { ScanIndices adj_idxs(idxs); auto* cp = _generic_context; @@ -583,7 +583,7 @@ namespace yask { int nsdims = dims->_stencil_dims.size(); auto step_posn = Indices::step_posn; - // Loop thru vecs of scratch grids for this group. + // Loop thru vecs of scratch grids for this bundle. for (auto* sv : outputScratchVecs) { assert(sv); @@ -626,8 +626,8 @@ namespace yask { return adj_idxs; } - // Set the bounding-box vars for this group in this rank. - void StencilGroupBase::find_bounding_box() { + // Set the bounding-box vars for this bundle in this rank. + void StencilBundleBase::find_bounding_box() { StencilContext& context = *_generic_context; ostream& os = context.get_ostr(); auto settings = context.get_settings(); @@ -658,7 +658,7 @@ namespace yask { misc_idxs.end = end; // Define misc-loop function. 
Since step is always 1, we ignore - // misc_stop. Update only if point is in domain for this group. + // misc_stop. Update only if point is in domain for this bundle. #define misc_fn(misc_idxs) do { \ if (is_in_valid_domain(misc_idxs.start)) { \ min_pts = min_pts.minElements(misc_idxs.start); \ diff --git a/src/kernel/lib/stencil_calc.hpp b/src/kernel/lib/stencil_calc.hpp index 374ebd8b..830d3746 100644 --- a/src/kernel/lib/stencil_calc.hpp +++ b/src/kernel/lib/stencil_calc.hpp @@ -27,11 +27,11 @@ IN THE SOFTWARE. namespace yask { - /// Classes that support evaluation of one stencil group. - /// A stencil context contains one or more groups. + /// Classes that support evaluation of one stencil bundle. + /// A stencil context contains one or more bundles. - // A pure-virtual class base for a stencil group. - class StencilGroupBase : public BoundingBox { + // A pure-virtual class base for a stencil bundle. + class StencilBundleBase : public BoundingBox { protected: StencilContext* _generic_context = 0; std::string _name; @@ -42,12 +42,12 @@ namespace yask { // Position of inner dim in stencil-dims tuple. int _inner_posn = 0; - // Other groups that this one depends on. - std::map _depends_on; + // Other bundles that this one depends on. + std::map _depends_on; - // List of scratch-grid groups that need to be evaluated - // before this group. Listed in eval order first-to-last. - StencilGroupList _scratch_deps; + // List of scratch-grid bundles that need to be evaluated + // before this bundle. Listed in eval order first-to-last. + StencilBundleList _scratch_deps; // Whether this updates scratch grid(s); bool _is_scratch = false; @@ -71,7 +71,7 @@ namespace yask { ScratchVecs inputScratchVecs; // ctor, dtor. - StencilGroupBase(StencilContext* context) : + StencilBundleBase(StencilContext* context) : _generic_context(context) { // Make sure map entries exist. 
@@ -91,7 +91,7 @@ namespace yask { } } - virtual ~StencilGroupBase() { } + virtual ~StencilBundleBase() { } // Access to dims and MPI info. virtual DimsPtr get_dims() const { @@ -101,7 +101,7 @@ namespace yask { return _generic_context->get_mpi_info(); } - // Get name of this group. + // Get name of this bundle. virtual const std::string& get_name() const { return _name; } // Get estimated number of FP ops done for one scalar eval. @@ -116,32 +116,32 @@ namespace yask { virtual void set_scratch(bool is_scratch) { _is_scratch = is_scratch; } // Add dependency. - virtual void add_dep(DepType dt, StencilGroupBase* eg) { + virtual void add_dep(DepType dt, StencilBundleBase* eg) { _depends_on.at(dt).insert(eg); } // Get dependencies. - virtual const StencilGroupSet& get_deps(DepType dt) const { + virtual const StencilBundleSet& get_deps(DepType dt) const { return _depends_on.at(dt); } - // Add needed scratch-group. - virtual void add_scratch_dep(StencilGroupBase* eg) { + // Add needed scratch-bundle. + virtual void add_scratch_dep(StencilBundleBase* eg) { _scratch_deps.push_back(eg); } - // Get needed scratch-group(s). - virtual const StencilGroupList& get_scratch_deps() const { + // Get needed scratch-bundle(s). + virtual const StencilBundleList& get_scratch_deps() const { return _scratch_deps; } - // If this group is updating scratch grid(s), + // If this bundle is updating scratch grid(s), // expand indices to calculate values in halo. // Adjust offsets in grids based on original idxs. // Return adjusted indices. virtual ScanIndices adjust_scan(int thread_idx, const ScanIndices& idxs) const; - // Set the bounding-box vars for this group in this rank. + // Set the bounding-box vars for this bundle in this rank. virtual void find_bounding_box(); // Determine whether indices are in [sub-]domain. 
diff --git a/src/kernel/swig/yask_kernel_api.i b/src/kernel/swig/yask_kernel_api.i index a4a200aa..2da88f25 100644 --- a/src/kernel/swig/yask_kernel_api.i +++ b/src/kernel/swig/yask_kernel_api.i @@ -43,6 +43,7 @@ IN THE SOFTWARE. %shared_ptr(yask::yk_settings) %shared_ptr(yask::yk_solution) %shared_ptr(yask::yk_grid) +%shared_ptr(yask::yk_stencil_group) %shared_ptr(yask::yk_stats) // Mutable buffer to access raw data. From f73854289eb2c214089603184c0384f0b5a9e292 Mon Sep 17 00:00:00 2001 From: Chuck Yount Date: Tue, 17 Apr 2018 17:45:29 -0700 Subject: [PATCH 02/21] Fix grid test to work with recent numa change. --- src/kernel/tests/grid_test.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/kernel/tests/grid_test.cpp b/src/kernel/tests/grid_test.cpp index adf2c488..57158efa 100644 --- a/src/kernel/tests/grid_test.cpp +++ b/src/kernel/tests/grid_test.cpp @@ -66,10 +66,10 @@ int main(int argc, char** argv) { os << "0-D test...\n"; GridDimNames gdims; string name = "test grid"; - YkGridPtr g0 = make_shared>(dims, name, gdims, settings, &osp); + YkGridPtr g0 = make_shared>(dims, name, gdims, &settings, &osp); g0->alloc_storage(); os << g0->make_info_string() << endl; - YkGridPtr g1 = make_shared>(dims, name, gdims, settings, &osp); + YkGridPtr g1 = make_shared>(dims, name, gdims, &settings, &osp); g1->alloc_storage(); os << g1->make_info_string() << endl; @@ -87,8 +87,8 @@ int main(int argc, char** argv) { os << "3-D test...\n"; GridDimNames gdims = {"x", "y", "z"}; string name = "test grid"; - YkGridPtr g3 = make_shared>(dims, name, gdims, settings, &osp); - YkGridPtr g3f = make_shared>(dims, name, gdims, settings, &osp); + YkGridPtr g3 = make_shared>(dims, name, gdims, &settings, &osp); + YkGridPtr g3f = make_shared>(dims, name, gdims, &settings, &osp); int i = 0; int min_pad = 3; for (auto dname : gdims) { From 3a628e074fff3b3a68aedf72bf97fbd940c0bcde Mon Sep 17 00:00:00 2001 From: Chuck Yount Date: Fri, 20 Apr 2018 08:43:40 -0700 
Subject: [PATCH 03/21] Re-enable surface conditions in AWE Elastic stencil. v2.05.09. Fix some issues with combos of sub-domains, scratch grids, and MPI. --- src/common/common_utils.cpp | 2 +- src/compiler/lib/Eqs.cpp | 19 +- src/kernel/lib/context.cpp | 250 ++++++++++++------- src/kernel/lib/context.hpp | 1 - src/kernel/lib/settings.hpp | 1 + src/kernel/yask_main.cpp | 4 +- src/stencils/AwpElasticStencil.hpp | 379 ++++++++++++++++------------- 7 files changed, 379 insertions(+), 277 deletions(-) diff --git a/src/common/common_utils.cpp b/src/common/common_utils.cpp index ef7d44bd..0e64ad0f 100644 --- a/src/common/common_utils.cpp +++ b/src/common/common_utils.cpp @@ -41,7 +41,7 @@ namespace yask { // for numbers above 9 (at least up to 99). // Format: "major.minor.patch". - const string version = "2.05.08"; + const string version = "2.05.09"; string yask_get_version_string() { return version; diff --git a/src/compiler/lib/Eqs.cpp b/src/compiler/lib/Eqs.cpp index 0a8acac3..f50285e7 100644 --- a/src/compiler/lib/Eqs.cpp +++ b/src/compiler/lib/Eqs.cpp @@ -634,6 +634,8 @@ namespace yask { g->updateConstIndices(ap->getArgConsts()); } + // We want to start with non-scratch eqs and walk the dep + // tree to find all dependent scratch eqs. // If 'eq1' has a non-scratch output, visit all dependencies of // 'eq1'. It's important to visit the eqs in dep order to // properly propagate halos sizes thru chains of scratch grids. @@ -643,18 +645,15 @@ namespace yask { // 'eq1' is 'b' or depends on 'b', immediately or indirectly. (eq1, [&](EqualsExprPtr b, EqDeps::EqVecSet& path) { - // Only check if conditions are same. - auto cond1 = getCond(eq1); - auto cond2 = getCond(b); - bool same_cond = areExprsSame(cond1, cond2); - // Does 'b' have a scratch-grid output? + // NB: scratch eqs don't have conditions, so + // we don't need to check them. 
auto* og2 = pv.getOutputGrids().at(b.get()); - if (same_cond && og2->isScratch()) { + if (og2->isScratch()) { // Get halos from the output scratch grid. // These are the points that are read from - // in dependent eq(s). + // the dependent eq(s). // For scratch grids, the halo areas must also be written to. auto _left_ohalo = og2->getHaloSizes(true); auto _right_ohalo = og2->getHaloSizes(false); @@ -679,12 +678,6 @@ namespace yask { EqualsExprPtr prev; for (auto eq2 : path) { - // Only continue if conditions are same. - auto cond1 = getCond(eq1); - auto cond2 = getCond(eq2); - if (!areExprsSame(cond1, cond2)) - break; - // Look for scratch-grid dep from 'prev' to 'eq2'. if (prev) { diff --git a/src/kernel/lib/context.cpp b/src/kernel/lib/context.cpp index 3d724692..ee358044 100644 --- a/src/kernel/lib/context.cpp +++ b/src/kernel/lib/context.cpp @@ -224,40 +224,56 @@ namespace yask { rank_idxs.step[step_posn] = step_t; // Loop thru groups. - // For this reference-code implementation, we - // will do all stencil groups at this level, - // even scratch-grid ones. - for (auto* sg : stGroups) { + for (auto* asg : stGroups) { + // Don't do scratch updates here. + if (asg->is_scratch()) + continue; + + // Scan through n-D space. + TRACE_MSG("calc_rank_ref: step " << start_t << + " in non-scratch group '" << asg->get_name()); + // Exchange all dirty halos. exchange_halos_all(); - // Indices needed for the generated misc loops. Will normally be a - // copy of rank_idxs except when updating scratch-grids. - ScanIndices misc_idxs = sg->adjust_scan(scratch_grid_idx, rank_idxs); - misc_idxs.step.setFromConst(1); // ensure unit step. + // Find the groups that need to be processed. + // This will be the prerequisite scratch-grid + // groups plus this non-scratch group. + auto sg_list = asg->get_scratch_deps(); + sg_list.push_back(asg); + + // Loop through all the needed groups. + for (auto* sg : sg_list) { + + // Indices needed for the generated misc loops. 
Will normally be a + // copy of rank_idxs except when updating scratch-grids. + ScanIndices misc_idxs = sg->adjust_scan(scratch_grid_idx, rank_idxs); + misc_idxs.step.setFromConst(1); // ensure unit step. - // Define misc-loop function. Since step is always 1, we - // ignore misc_stop. If point is in sub-domain for this - // group, then evaluate the reference scalar code. + // Define misc-loop function. Since step is always 1, we + // ignore misc_stop. If point is in sub-domain for this + // group, then evaluate the reference scalar code. + // TODO: fix domain of scratch grids. #define misc_fn(misc_idxs) do { \ - if (sg->is_in_valid_domain(misc_idxs.start)) \ - sg->calc_scalar(scratch_grid_idx, misc_idxs.start); \ - } while(0) + if (sg->is_in_valid_domain(misc_idxs.start)) \ + sg->calc_scalar(scratch_grid_idx, misc_idxs.start); \ + } while(0) - // Scan through n-D space. - TRACE_MSG("calc_rank_ref: step " << start_t << - " in group '" << sg->get_name() << "': " << - misc_idxs.begin.makeValStr(ndims) << - " ... (end before) " << misc_idxs.end.makeValStr(ndims)); + // Scan through n-D space. + TRACE_MSG("calc_rank_ref: step " << start_t << + " in group '" << sg->get_name() << "': " << + misc_idxs.begin.makeValStr(ndims) << + " ... (end before) " << misc_idxs.end.makeValStr(ndims)); #include "yask_misc_loops.hpp" #undef misc_fn - + } // groups in chain. + // Remember grids that have been written to by this group, // updated at next step (+/- 1). - mark_grids_dirty(start_t + step_t, stop_t + step_t, *sg); + mark_grids_dirty(start_t + step_t, stop_t + step_t, *asg); - } // groups. + } // all groups. } // iterations. // Final halo exchange. @@ -471,6 +487,9 @@ namespace yask { } run_solution(first_t, last_t); + + // Final halo exchange. + exchange_halos_all(); } // Calculate results within a region. @@ -1403,62 +1422,64 @@ namespace yask { // Adjust along domain dims in this grid. 
for (auto& dim : _dims->_domain_dims.getDims()) { auto& dname = dim.getName(); + if (gp->is_dim_used(dname)) { - // Init range to whole rank domain (including - // outer halos). These may be changed below - // depending on the neighbor's direction. - copy_begin[dname] = first_outer_idx[dname]; - copy_end[dname] = last_outer_idx[dname] + 1; // end = last + 1. + // Init range to whole rank domain (including + // outer halos). These may be changed below + // depending on the neighbor's direction. + copy_begin[dname] = first_outer_idx[dname]; + copy_end[dname] = last_outer_idx[dname] + 1; // end = last + 1. - // Neighbor direction in this dim. - auto neigh_ofs = neigh_offsets[dname]; + // Neighbor direction in this dim. + auto neigh_ofs = neigh_offsets[dname]; - // Region to read from, i.e., data from inside - // this rank's domain to be put into neighbor's - // halo. - if (bd == MPIBufs::bufSend) { + // Region to read from, i.e., data from inside + // this rank's domain to be put into neighbor's + // halo. + if (bd == MPIBufs::bufSend) { - // Neighbor is to the left. - if (neigh_ofs == idx_t(MPIInfo::rank_prev)) { + // Neighbor is to the left. + if (neigh_ofs == idx_t(MPIInfo::rank_prev)) { - // Only read slice as wide as halo from beginning. - copy_end[dname] = first_inner_idx[dname] + neigh_halo_sizes[dname]; - } + // Only read slice as wide as halo from beginning. + copy_end[dname] = first_inner_idx[dname] + neigh_halo_sizes[dname]; + } - // Neighbor is to the right. - else if (neigh_ofs == idx_t(MPIInfo::rank_next)) { + // Neighbor is to the right. + else if (neigh_ofs == idx_t(MPIInfo::rank_next)) { - // Only read slice as wide as halo before end. - copy_begin[dname] = last_inner_idx[dname] + 1 - neigh_halo_sizes[dname]; - } + // Only read slice as wide as halo before end. + copy_begin[dname] = last_inner_idx[dname] + 1 - neigh_halo_sizes[dname]; + } - // Else, this neighbor is in same posn as I am in this dim, - // so we leave the default begin/end settings. 
- } + // Else, this neighbor is in same posn as I am in this dim, + // so we leave the default begin/end settings. + } - // Region to write to, i.e., into this rank's halo. - else if (bd == MPIBufs::bufRecv) { + // Region to write to, i.e., into this rank's halo. + else if (bd == MPIBufs::bufRecv) { - // Neighbor is to the left. - if (neigh_ofs == idx_t(MPIInfo::rank_prev)) { + // Neighbor is to the left. + if (neigh_ofs == idx_t(MPIInfo::rank_prev)) { - // Only read slice as wide as halo before beginning. - copy_begin[dname] = first_inner_idx[dname] - my_halo_sizes[dname]; - copy_end[dname] = first_inner_idx[dname]; - } + // Only read slice as wide as halo before beginning. + copy_begin[dname] = first_inner_idx[dname] - my_halo_sizes[dname]; + copy_end[dname] = first_inner_idx[dname]; + } - // Neighbor is to the right. - else if (neigh_ofs == idx_t(MPIInfo::rank_next)) { + // Neighbor is to the right. + else if (neigh_ofs == idx_t(MPIInfo::rank_next)) { - // Only read slice as wide as halo after end. - copy_begin[dname] = last_inner_idx[dname] + 1; - copy_end[dname] = last_inner_idx[dname] + 1 + my_halo_sizes[dname]; - } + // Only read slice as wide as halo after end. + copy_begin[dname] = last_inner_idx[dname] + 1; + copy_end[dname] = last_inner_idx[dname] + 1 + my_halo_sizes[dname]; + } - // Else, this neighbor is in same posn as I am in this dim, - // so we leave the default begin/end settings. - } - } // domain dims in this grid. + // Else, this neighbor is in same posn as I am in this dim, + // so we leave the default begin/end settings. + } + } // domain dims in this grid. + } // domain dims. // Sizes of buffer in all dims of this grid. // Also, set begin/end value for non-domain dims. @@ -2286,6 +2307,7 @@ namespace yask { // Exchange dirty halo data for all grids and all steps, regardless // of their stencil-group. + // TODO: loop through all grids in exchange_halos() instead. 
void StencilContext::exchange_halos_all() { #ifdef USE_MPI @@ -2305,6 +2327,8 @@ namespace yask { for (auto* sg : stGroups) { // Do exchange over max steps. + // Steps that don't exist in a particular grid or + // steps that are clean will be skipped. exchange_halos(start, stop, *sg); } #endif @@ -2318,20 +2342,17 @@ namespace yask { #ifdef USE_MPI if (!enable_halo_exchange || _env->num_ranks < 2) return; + + // Don't exchange for scratch groups. + if (sg.is_scratch()) + return; + mpi_time.start(); TRACE_MSG("exchange_halos: " << start << " ... (end before) " << stop << " for eq-group '" << sg.get_name() << "'"); auto opts = get_settings(); auto& sd = _dims->_step_dim; - // 1D array to store send request handles. - // We use a 1D array so we can call MPI_Waitall(). - MPI_Request send_reqs[sg.inputGridPtrs.size() * _mpiInfo->neighborhood_size]; - - // 2D array for receive request handles. - // We use a 2D array to simplify individual indexing. - MPI_Request recv_reqs[sg.inputGridPtrs.size()][_mpiInfo->neighborhood_size]; - // Loop through steps. This loop has to be outside halo-step loop // because we only have one buffer per step. Normally, we only // exchange one step; in that case, it doesn't matter. It would be more @@ -2340,10 +2361,57 @@ namespace yask { assert(start != stop); idx_t step = (start < stop) ? 1 : -1; for (idx_t t = start; t != stop; t += step) { - int num_send_reqs = 0; + + // Get list of grids that need to be swapped. + // Use an ordered map to make sure grids are in + // same order on all ranks. + GridPtrMap gridsToSwap; + + // Find the groups that need to be processed. + // This will be the prerequisite scratch-grid + // groups plus this non-scratch group. + auto sg_list = sg.get_scratch_deps(); + sg_list.push_back(&sg); + + // Loop through all the needed groups. + for (auto* csg : sg_list) { + + // Loop thru all *input* grids in this group. + for (auto gp : csg->inputGridPtrs) { + + // Don't swap scratch grids. 
+ if (gp->is_scratch()) + continue; + + // Only need to swap grids whose halos are not up-to-date + // for this step. + if (!gp->is_dirty(t)) + continue; + + // Only need to swap grids that have MPI buffers. + auto& gname = gp->get_name(); + if (mpiData.count(gname) == 0) + continue; + + // Swap this grid. + gridsToSwap[gname] = gp; + } + } + TRACE_MSG("exchange_halos: need to exchange halos for " << + gridsToSwap.size() << " grid(s)"); + + // 1D array to store send request handles. + // We use a 1D array so we can call MPI_Waitall(). + MPI_Request send_reqs[gridsToSwap.size() * _mpiInfo->neighborhood_size]; + + // 2D array for receive request handles. + // We use a 2D array to simplify individual indexing. + MPI_Request recv_reqs[gridsToSwap.size()][_mpiInfo->neighborhood_size]; // Sequence of things to do for each grid's neighbors // (isend includes packing). + int num_send_reqs = 0; + int num_recv_reqs = 0; enum halo_steps { halo_irecv, halo_pack_isend, halo_unpack, halo_nsteps }; for (int halo_step = 0; halo_step < halo_nsteps; halo_step++) { @@ -2353,21 +2421,15 @@ namespace yask { TRACE_MSG("exchange_halos: packing and sending data for step " << t << "..."); else if (halo_step == halo_unpack) TRACE_MSG("exchange_halos: unpacking data for step " << t << "..."); - - // Loop thru all input grids in this group. - for (size_t gi = 0; gi < sg.inputGridPtrs.size(); gi++) { - auto gp = sg.inputGridPtrs[gi]; - MPI_Request* grid_recv_reqs = recv_reqs[gi]; - // Only need to swap grids whose halos are not up-to-date - // for this step. - if (!gp->is_dirty(t)) - continue; - - // Only need to swap grids that have MPI buffers. - auto& gname = gp->get_name(); - if (mpiData.count(gname) == 0) - continue; + // Loop thru all grids to swap. + // Use 'gi' as a unique MPI index. 
+ int gi = -1; + for (auto gtsi : gridsToSwap) { + auto& gname = gtsi.first; + auto gp = gtsi.second; + gi++; + MPI_Request* grid_recv_reqs = recv_reqs[gi]; TRACE_MSG(" for grid '" << gname << "'..."); // Visit all this rank's neighbors. @@ -2375,7 +2437,7 @@ namespace yask { grid_mpi_data.visitNeighbors ([&](const IdxTuple& offsets, // NeighborOffset. int neighbor_rank, - int ni, // 1D index. + int ni, // unique neighbor index. MPIBufs& bufs) { auto& sendBuf = bufs.bufs[MPIBufs::bufSend]; auto& recvBuf = bufs.bufs[MPIBufs::bufRecv]; @@ -2400,6 +2462,7 @@ namespace yask { TRACE_MSG(" requesting " << makeByteStr(nbytes) << "..."); MPI_Irecv(buf, nbytes, MPI_BYTE, neighbor_rank, int(gi), _env->comm, &grid_recv_reqs[ni]); + num_recv_reqs++; } } @@ -2454,7 +2517,7 @@ namespace yask { if (nbytes) { // Wait for data from neighbor before unpacking it. - TRACE_MSG(" waiting for MPI data..."); + TRACE_MSG(" waiting for " << makeByteStr(nbytes) << "..."); MPI_Wait(&grid_recv_reqs[ni], MPI_STATUS_IGNORE); // Vec ok? @@ -2491,16 +2554,19 @@ namespace yask { } // exchange sequence. // Mark grids as up-to-date. - for (size_t gi = 0; gi < sg.inputGridPtrs.size(); gi++) { - auto gp = sg.inputGridPtrs[gi]; + for (auto gtsi : gridsToSwap) { + auto& gname = gtsi.first; + auto gp = gtsi.second; if (gp->is_dirty(t)) { gp->set_dirty(false, t); - TRACE_MSG("grid '" << gp->get_name() << + TRACE_MSG("grid '" << gname << "' marked as clean at step " << t); } } // Wait for all send requests to complete. 
+ TRACE_MSG("exchange_halos: " << num_recv_reqs << + " MPI receive request(s) completed"); if (num_send_reqs) { TRACE_MSG("exchange_halos: waiting for " << num_send_reqs << " MPI send request(s) to complete..."); diff --git a/src/kernel/lib/context.hpp b/src/kernel/lib/context.hpp index 16591148..900fc8a3 100644 --- a/src/kernel/lib/context.hpp +++ b/src/kernel/lib/context.hpp @@ -97,7 +97,6 @@ namespace yask { class StencilGroupBase; typedef std::vector StencilGroupList; typedef std::set StencilGroupSet; - typedef std::map GridPtrMap; // Data and hierarchical sizes. // This is a pure-virtual class that must be implemented diff --git a/src/kernel/lib/settings.hpp b/src/kernel/lib/settings.hpp index a197e8f6..f9d305e6 100644 --- a/src/kernel/lib/settings.hpp +++ b/src/kernel/lib/settings.hpp @@ -340,6 +340,7 @@ namespace yask { typedef std::shared_ptr YkGridPtr; typedef std::set GridPtrSet; typedef std::vector GridPtrs; + typedef std::map GridPtrMap; typedef std::vector ScratchVecs; // Environmental settings. diff --git a/src/kernel/yask_main.cpp b/src/kernel/yask_main.cpp index d5b96658..04063de2 100644 --- a/src/kernel/yask_main.cpp +++ b/src/kernel/yask_main.cpp @@ -343,7 +343,7 @@ int main(int argc, char** argv) best_elapsed_time = stats->get_elapsed_run_secs(); } } - + os << divLine << "best-elapsed-time (sec): " << makeNumStr(best_elapsed_time) << endl << "best-throughput (num-writes/sec): " << makeNumStr(best_apps) << endl << @@ -407,9 +407,11 @@ int main(int argc, char** argv) cerr << "This is not uncommon for low-precision FP; try with 8-byte reals." << endl; ok = false; } + ref_soln->end_solution(); } else os << "\nRESULTS NOT VERIFIED.\n"; + ksoln->end_solution(); kenv->global_barrier(); if (!ok) diff --git a/src/stencils/AwpElasticStencil.hpp b/src/stencils/AwpElasticStencil.hpp index 8ecb9b6e..888a03b5 100644 --- a/src/stencils/AwpElasticStencil.hpp +++ b/src/stencils/AwpElasticStencil.hpp @@ -35,8 +35,10 @@ IN THE SOFTWARE. 
//#define FULL_SPONGE_GRID // Set the following macro to calculate free-surface boundary values. -// This feature is currently under development. -//#define DO_SURFACE +#define DO_ABOVE_SURFACE + +// Set the following macro to use intermediate scratch grids. +#define USE_SCRATCH_GRIDS #include "Soln.hpp" @@ -86,6 +88,25 @@ class AwpElasticStencil : public StencilBase { MAKE_SCALAR(delta_t); MAKE_SCALAR(h); + // For the surface stress conditions, we need to write into 2 points + // above the surface. Since we can only write into the "domain", we + // will define the surface index to be 2 points before the last domain + // index. Thus, there will be two layers in the domain above the surface. +#define SURFACE_IDX (last_index(z) - 2) + + // Define some sub-domains related to the surface. +#define IF_BELOW_SURFACE IF (z < SURFACE_IDX) +#define IF_AT_SURFACE IF (z == SURFACE_IDX) +#define IF_AT_OR_BELOW_SURFACE IF (z <= SURFACE_IDX) +#define IF_ONE_ABOVE_SURFACE IF (z == SURFACE_IDX + 1) +#define IF_TWO_ABOVE_SURFACE IF (z == SURFACE_IDX + 2) + +#ifdef USE_SCRATCH_GRIDS + MAKE_SCRATCH_GRID(tmp_vel_x, x, y, z); + MAKE_SCRATCH_GRID(tmp_vel_y, x, y, z); + MAKE_SCRATCH_GRID(tmp_vel_z, x, y, z); +#endif + public: AwpElasticStencil(StencilList& stencils) : @@ -107,7 +128,7 @@ class AwpElasticStencil : public StencilBase { // time or space, so half-steps due to staggered grids are adjusted // appropriately. - void define_vel_x(Condition at_last_z) { + GridValue get_next_vel_x(GridIndex x, GridIndex y, GridIndex z) { GridValue rho_val = (rho(x, y, z ) + rho(x, y-1, z ) + rho(x, y, z-1) + @@ -122,10 +143,10 @@ class AwpElasticStencil : public StencilBase { GridValue next_vel_x = vel_x(t, x, y, z) + (delta_t / (h * rho_val)) * d_val; adjust_for_sponge(next_vel_x); - // define the value at t+1. - vel_x(t+1, x, y, z) EQUALS next_vel_x; + // Return the value at t+1. 
+ return next_vel_x; } - void define_vel_y(Condition at_last_z) { + GridValue get_next_vel_y(GridIndex x, GridIndex y, GridIndex z) { GridValue rho_val = (rho(x, y, z ) + rho(x+1, y, z ) + rho(x, y, z-1) + @@ -140,10 +161,10 @@ class AwpElasticStencil : public StencilBase { GridValue next_vel_y = vel_y(t, x, y, z) + (delta_t / (h * rho_val)) * d_val; adjust_for_sponge(next_vel_y); - // define the value at t+1. - vel_y(t+1, x, y, z) EQUALS next_vel_y; + // Return the value at t+1. + return next_vel_y; } - void define_vel_z(Condition at_last_z) { + GridValue get_next_vel_z(GridIndex x, GridIndex y, GridIndex z) { GridValue rho_val = (rho(x, y, z) + rho(x+1, y, z) + rho(x, y-1, z) + @@ -158,42 +179,100 @@ class AwpElasticStencil : public StencilBase { GridValue next_vel_z = vel_z(t, x, y, z) + (delta_t / (h * rho_val)) * d_val; adjust_for_sponge(next_vel_z); - // define the value at t+1. - vel_z(t+1, x, y, z) EQUALS next_vel_z; + // Return the value at t+1. + return next_vel_z; } // Free-surface boundary equations for velocity. - void define_free_surface_vel(Condition at_last_z) { + void define_free_surface_vel() { + + // Since we're defining points when z == surface + 1, + // the surface itself will be at z - 1; + GridIndex surf = z - 1; + +#ifdef USE_SCRATCH_GRIDS + + // The values for velocity at t+1 will be needed + // in multiple free-surface calculations. + // Thus, it will reduce the number of FP ops + // required if we pre-compute them and store them + // in scratch grids. +#define VEL_X tmp_vel_x +#define VEL_Y tmp_vel_y +#define VEL_Z tmp_vel_z + VEL_X(x, y, z) EQUALS get_next_vel_x(x, y, z); + VEL_Y(x, y, z) EQUALS get_next_vel_y(x, y, z); + VEL_Z(x, y, z) EQUALS get_next_vel_z(x, y, z); + +#else + + // If not using scratch grids, just call the + // functions to calculate each value of velocity + // at t+1 every time it's needed. 
+#define VEL_X get_next_vel_x +#define VEL_Y get_next_vel_y +#define VEL_Z get_next_vel_z +#endif - // Following expressions are valid only when z == last value in domain. - // Note that values beyond the last index are updated, i.e., in the halo. - // A couple of intermediate values. - GridValue d_x_val = vel_x(t+1, x+1, y, z) - - (vel_z(t+1, x+1, y, z) - vel_z(t+1, x, y, z)); - GridValue d_y_val = vel_y(t+1, x, y-1, z) - - (vel_z(t+1, x, y, z) - vel_z(t+1, x, y-1, z)); + GridValue d_x_val = VEL_X(x+1, y, surf) - + (VEL_Z(x+1, y, surf) - VEL_Z(x, y, surf)); + GridValue d_y_val = VEL_Y(x, y-1, surf) - + (VEL_Z(x, y, surf) - VEL_Z(x, y-1, surf)); - // Following values are valid at the free surface. - GridValue plus1_vel_x = vel_x(t+1, x, y, z) - - (vel_z(t+1, x, y, z) - vel_z(t+1, x-1, y, z)); - GridValue plus1_vel_y = vel_y(t+1, x, y, z) - - (vel_z(t+1, x, y+1, z) - vel_z(t+1, x, y, z)); - GridValue plus1_vel_z = vel_z(t+1, x, y, z) - + // Following values are valid one layer above the free surface. + GridValue plus1_vel_x = VEL_X(x, y, surf) - + (VEL_Z(x, y, surf) - VEL_Z(x-1, y, surf)); + GridValue plus1_vel_y = VEL_Y(x, y, surf) - + (VEL_Z(x, y+1, surf) - VEL_Z(x, y, surf)); + GridValue plus1_vel_z = VEL_Z(x, y, surf) - ((d_x_val - plus1_vel_x) + - (vel_x(t+1, x+1, y, z) - vel_x(t+1, x, y, z)) + + (VEL_X(x+1, y, surf) - VEL_X(x, y, surf)) + (plus1_vel_y - d_y_val) + - (vel_y(t+1, x, y, z) - vel_y(t+1, x, y-1, z))) / - ((mu(x, y, z) * - (2.0 / mu(x, y, z) + 1.0 / lambda(x, y, z)))); - - // Define equivalencies to be valid only when z == last value in domain. - vel_x(t+1, x, y, z+1) EQUALS plus1_vel_x - IF at_last_z; - vel_y(t+1, x, y, z+1) EQUALS plus1_vel_y - IF at_last_z; - vel_z(t+1, x, y, z+1) EQUALS plus1_vel_z - IF at_last_z; + (VEL_Y(x, y, surf) - VEL_Y(x, y-1, surf))) / + ((mu(x, y, surf) * + (2.0 / mu(x, y, surf) + 1.0 / lambda(x, y, surf)))); +#undef VEL_X +#undef VEL_Y +#undef VEL_Z + + // Define layer at one point above surface. 
+ vel_x(t+1, x, y, z) EQUALS plus1_vel_x IF_ONE_ABOVE_SURFACE; + vel_y(t+1, x, y, z) EQUALS plus1_vel_y IF_ONE_ABOVE_SURFACE; + vel_z(t+1, x, y, z) EQUALS plus1_vel_z IF_ONE_ABOVE_SURFACE; + + // Define layer two points above surface for completeness, even + // though these aren't input to any stencils. + vel_x(t+1, x, y, z) EQUALS 0.0 IF_TWO_ABOVE_SURFACE; + vel_y(t+1, x, y, z) EQUALS 0.0 IF_TWO_ABOVE_SURFACE; + vel_z(t+1, x, y, z) EQUALS 0.0 IF_TWO_ABOVE_SURFACE; + } + + // Compute average of 8 neighbors. + GridValue ave8(Grid& g, GridIndex x, GridIndex y, GridIndex z) { + + return 8.0 / + (g(x, y, z ) + g(x+1, y, z ) + + g(x, y-1, z ) + g(x+1, y-1, z ) + + g(x, y, z-1) + g(x+1, y, z-1) + + g(x, y-1, z-1) + g(x+1, y-1, z-1)); + } + + // Some common velocity calculations. + GridValue d_x_val(GridIndex x, GridIndex y, GridIndex z) { + return + c1 * (vel_x(t+1, x+1, y, z ) - vel_x(t+1, x, y, z )) + + c2 * (vel_x(t+1, x+2, y, z ) - vel_x(t+1, x-1, y, z )); + } + GridValue d_y_val(GridIndex x, GridIndex y, GridIndex z) { + return + c1 * (vel_y(t+1, x, y, z ) - vel_y(t+1, x, y-1, z )) + + c2 * (vel_y(t+1, x, y+1, z ) - vel_y(t+1, x, y-2, z )); + } + GridValue d_z_val(GridIndex x, GridIndex y, GridIndex z) { + return + c1 * (vel_z(t+1, x, y, z ) - vel_z(t+1, x, y, z-1)) + + c2 * (vel_z(t+1, x, y, z+1) - vel_z(t+1, x, y, z-2)); } // Stress-grid define functions. For each D in xx, yy, zz, xy, xz, yz, @@ -204,33 +283,43 @@ class AwpElasticStencil : public StencilBase { // space, so half-steps due to staggered grids are adjusted // appropriately. 
- void define_stress_xx(Condition at_last_z, - GridValue lambda_val, GridValue mu_val, - GridValue d_x_val, GridValue d_y_val, GridValue d_z_val) { + GridValue get_next_stress_xx(GridIndex x, GridIndex y, GridIndex z) { GridValue next_stress_xx = stress_xx(t, x, y, z) + - ((delta_t / h) * ((2 * mu_val * d_x_val) + - (lambda_val * (d_x_val + d_y_val + d_z_val)))); + ((delta_t / h) * ((2 * ave8(mu, x, y, z) * d_x_val(x, y, z)) + + (ave8(lambda, x, y, z) * + (d_x_val(x, y, z) + d_y_val(x, y, z) + d_z_val(x, y, z))))); adjust_for_sponge(next_stress_xx); - // define the value at t+1. - stress_xx(t+1, x, y, z) EQUALS next_stress_xx; + // Return the value at t+1. + return next_stress_xx; } - void define_stress_yy(Condition at_last_z, - GridValue lambda_val, GridValue mu_val, - GridValue d_x_val, GridValue d_y_val, GridValue d_z_val) { + GridValue get_next_stress_yy(GridIndex x, GridIndex y, GridIndex z) { GridValue next_stress_yy = stress_yy(t, x, y, z) + - ((delta_t / h) * ((2 * mu_val * d_y_val) + - (lambda_val * (d_x_val + d_y_val + d_z_val)))); + ((delta_t / h) * ((2 * ave8(mu, x, y, z) * d_y_val(x, y, z)) + + (ave8(lambda, x, y, z) * + (d_x_val(x, y, z) + d_y_val(x, y, z) + d_z_val(x, y, z))))); adjust_for_sponge(next_stress_yy); - // define the value at t+1. - stress_yy(t+1, x, y, z) EQUALS next_stress_yy; + // Return the value at t+1. + return next_stress_yy; } - void define_stress_xy(Condition at_last_z) { + GridValue get_next_stress_zz(GridIndex x, GridIndex y, GridIndex z) { + + GridValue next_stress_zz = stress_zz(t, x, y, z) + + ((delta_t / h) * ((2 * ave8(mu, x, y, z) * d_z_val(x, y, z)) + + (ave8(lambda, x, y, z) * + (d_x_val(x, y, z) + d_y_val(x, y, z) + d_z_val(x, y, z))))); + adjust_for_sponge(next_stress_zz); - GridValue mu_val = 2.0 / + // return the value at t+1. + return next_stress_zz; + } + GridValue get_next_stress_xy(GridIndex x, GridIndex y, GridIndex z) { + + // Compute average of 2 neighbors. 
+ GridValue mu2 = 2.0 / (mu(x, y, z ) + mu(x, y, z-1)); // Note that we are using the velocity values at t+1. @@ -242,15 +331,16 @@ class AwpElasticStencil : public StencilBase { c2 * (vel_y(t+1, x+1, y, z ) - vel_y(t+1, x-2, y, z )); GridValue next_stress_xy = stress_xy(t, x, y, z) + - ((mu_val * delta_t / h) * (d_xy_val + d_yx_val)); + ((mu2 * delta_t / h) * (d_xy_val + d_yx_val)); adjust_for_sponge(next_stress_xy); - // define the value at t+1. - stress_xy(t+1, x, y, z) EQUALS next_stress_xy; + // return the value at t+1. + return next_stress_xy; } - void define_stress_xz(Condition at_last_z) { + GridValue get_next_stress_xz(GridIndex x, GridIndex y, GridIndex z) { - GridValue mu_val = 2.0 / + // Compute average of 2 neighbors. + GridValue mu2 = 2.0 / (mu(x, y, z ) + mu(x, y-1, z )); // Note that we are using the velocity values at t+1. @@ -262,22 +352,16 @@ class AwpElasticStencil : public StencilBase { c2 * (vel_z(t+1, x+1, y, z ) - vel_z(t+1, x-2, y, z )); GridValue next_stress_xz = stress_xz(t, x, y, z) + - ((mu_val * delta_t / h) * (d_xz_val + d_zx_val)); + ((mu2 * delta_t / h) * (d_xz_val + d_zx_val)); adjust_for_sponge(next_stress_xz); - // define the value at t+1 (special case: zero at surface). -#ifdef DO_SURFACE - stress_xz(t+1, x, y, z) EQUALS next_stress_xz - IF !at_last_z; - stress_xz(t+1, x, y, z) EQUALS 0.0 - IF at_last_z; -#else - stress_xz(t+1, x, y, z) EQUALS next_stress_xz; -#endif + // return the value at t+1. + return next_stress_xz; } - void define_stress_yz(Condition at_last_z) { + GridValue get_next_stress_yz(GridIndex x, GridIndex y, GridIndex z) { - GridValue mu_val = 2.0 / + // Compute average of 2 neighbors. + GridValue mu2 = 2.0 / (mu(x, y, z ) + mu(x+1, y, z )); // Note that we are using the velocity values at t+1. 
@@ -289,122 +373,79 @@ class AwpElasticStencil : public StencilBase { c2 * (vel_z(t+1, x, y+2, z ) - vel_z(t+1, x, y-1, z )); GridValue next_stress_yz = stress_yz(t, x, y, z) + - ((mu_val * delta_t / h) * (d_yz_val + d_zy_val)); + ((mu2 * delta_t / h) * (d_yz_val + d_zy_val)); adjust_for_sponge(next_stress_yz); - // define the value at t+1 (special case: zero at surface). -#ifdef DO_SURFACE - stress_yz(t+1, x, y, z) EQUALS next_stress_yz - IF !at_last_z; - stress_yz(t+1, x, y, z) EQUALS 0.0 - IF at_last_z; -#else - stress_yz(t+1, x, y, z) EQUALS next_stress_yz; -#endif - } - void define_stress_zz(Condition at_last_z, - GridValue lambda_val, GridValue mu_val, - GridValue d_x_val, GridValue d_y_val, GridValue d_z_val) { - - GridValue next_stress_zz = stress_zz(t, x, y, z) + - ((delta_t / h) * ((2 * mu_val * d_z_val) + - (lambda_val * (d_x_val + d_y_val + d_z_val)))); - adjust_for_sponge(next_stress_zz); - - // define the value at t+1 (special case: zero at surface). -#ifdef DO_SURFACE - stress_zz(t+1, x, y, z) EQUALS next_stress_zz - IF !at_last_z; - stress_zz(t+1, x, y, z) EQUALS 0.0 - IF at_last_z; -#else - stress_zz(t+1, x, y, z) EQUALS next_stress_zz; -#endif + // return the value at t+1. + return next_stress_yz; } // Free-surface boundary equations for stress. - void define_free_surface_stress(Condition at_last_z) { + void define_free_surface_stress() { - // Define equivalencies to be valid only when z == last value in domain. - // Note that values beyond the last index are updated, i.e., in the halo. 
+ // When z == surface + 1, the surface will be at z - 1; + GridIndex surf = z - 1; - stress_zz(t+1, x, y, z+1) EQUALS -stress_zz(t+1, x, y, z) - IF at_last_z; - stress_zz(t+1, x, y, z+2) EQUALS -stress_zz(t+1, x, y, z-1) - IF at_last_z; + stress_zz(t+1, x, y, z) EQUALS -get_next_stress_zz(x, y, surf) IF_ONE_ABOVE_SURFACE; + stress_xz(t+1, x, y, z) EQUALS -get_next_stress_xz(x, y, surf-1) IF_ONE_ABOVE_SURFACE; + stress_yz(t+1, x, y, z) EQUALS -get_next_stress_yz(x, y, surf-1) IF_ONE_ABOVE_SURFACE; - stress_xz(t+1, x, y, z+1) EQUALS -stress_xz(t+1, x, y, z-1) - IF at_last_z; - stress_xz(t+1, x, y, z+2) EQUALS -stress_xz(t+1, x, y, z-2) - IF at_last_z; - - stress_yz(t+1, x, y, z+1) EQUALS -stress_yz(t+1, x, y, z-1) - IF at_last_z; - stress_yz(t+1, x, y, z+2) EQUALS -stress_yz(t+1, x, y, z-2) - IF at_last_z; + // Define other 3 stress values for completeness, even + // though these aren't input to any stencils. + stress_xx(t+1, x, y, z) EQUALS 0.0 IF_ONE_ABOVE_SURFACE; + stress_yy(t+1, x, y, z) EQUALS 0.0 IF_ONE_ABOVE_SURFACE; + stress_xy(t+1, x, y, z) EQUALS 0.0 IF_ONE_ABOVE_SURFACE; + + // When z == surface + 2, the surface will be at z - 2; + surf = z - 2; + + stress_zz(t+1, x, y, z) EQUALS -get_next_stress_zz(x, y, surf-1) IF_TWO_ABOVE_SURFACE; + stress_xz(t+1, x, y, z) EQUALS -get_next_stress_xz(x, y, surf-2) IF_TWO_ABOVE_SURFACE; + stress_yz(t+1, x, y, z) EQUALS -get_next_stress_yz(x, y, surf-2) IF_TWO_ABOVE_SURFACE; + + // Define other 3 stress values for completeness, even + // though these aren't input to any stencils. + stress_xx(t+1, x, y, z) EQUALS 0.0 IF_TWO_ABOVE_SURFACE; + stress_yy(t+1, x, y, z) EQUALS 0.0 IF_TWO_ABOVE_SURFACE; + stress_xy(t+1, x, y, z) EQUALS 0.0 IF_TWO_ABOVE_SURFACE; } - // Call all the define_* functions. + // Define the t+1 values for all velocity and stress grids. virtual void define() { - // A condition that is true when index 'z' is at the free-surface boundary. 
- Condition at_last_z = (z == last_index(z)); - // Define velocity components. - define_vel_x(at_last_z); - define_vel_y(at_last_z); - define_vel_z(at_last_z); + vel_x(t+1, x, y, z) EQUALS get_next_vel_x(x, y, z) IF_AT_OR_BELOW_SURFACE; + vel_y(t+1, x, y, z) EQUALS get_next_vel_y(x, y, z) IF_AT_OR_BELOW_SURFACE; + vel_z(t+1, x, y, z) EQUALS get_next_vel_z(x, y, z) IF_AT_OR_BELOW_SURFACE; + + // Define stress components. Use non-overlapping sub-domains only, + // i.e. AT and BELOW but not AT_OR_BELOW, even though there are some + // repeated stencils. This allows the YASK compiler to bundle all + // the stress equations together. + stress_xx(t+1, x, y, z) EQUALS get_next_stress_xx(x, y, z) IF_BELOW_SURFACE; + stress_yy(t+1, x, y, z) EQUALS get_next_stress_yy(x, y, z) IF_BELOW_SURFACE; + stress_xy(t+1, x, y, z) EQUALS get_next_stress_xy(x, y, z) IF_BELOW_SURFACE; + stress_xz(t+1, x, y, z) EQUALS get_next_stress_xz(x, y, z) IF_BELOW_SURFACE; + stress_yz(t+1, x, y, z) EQUALS get_next_stress_yz(x, y, z) IF_BELOW_SURFACE; + stress_zz(t+1, x, y, z) EQUALS get_next_stress_zz(x, y, z) IF_BELOW_SURFACE; + + stress_xx(t+1, x, y, z) EQUALS get_next_stress_xx(x, y, z) IF_AT_SURFACE; + stress_yy(t+1, x, y, z) EQUALS get_next_stress_yy(x, y, z) IF_AT_SURFACE; + stress_xy(t+1, x, y, z) EQUALS get_next_stress_xy(x, y, z) IF_AT_SURFACE; + stress_xz(t+1, x, y, z) EQUALS 0.0 IF_AT_SURFACE; + stress_yz(t+1, x, y, z) EQUALS 0.0 IF_AT_SURFACE; + stress_zz(t+1, x, y, z) EQUALS get_next_stress_zz(x, y, z) IF_AT_SURFACE; // Boundary conditions. -#ifdef DO_SURFACE - define_free_surface_vel(at_last_z); -#endif - - // Define some values common to the diagonal stress equations. -#ifdef PRECOMPUTED_LAMBDA - // Use this the lambda values are pre-computed once before - // all time-steps. 
- GridValue lambda_val = lambda(x, y, z); -#else - GridValue lambda_val = 8.0 / - (lambda(x, y, z ) + lambda(x+1, y, z ) + - lambda(x, y-1, z ) + lambda(x+1, y-1, z ) + - lambda(x, y, z-1) + lambda(x+1, y, z-1) + - lambda(x, y-1, z-1) + lambda(x+1, y-1, z-1)); -#endif - GridValue mu_val = 8.0 / - (mu(x, y, z ) + mu(x+1, y, z ) + - mu(x, y-1, z ) + mu(x+1, y-1, z ) + - mu(x, y, z-1) + mu(x+1, y, z-1) + - mu(x, y-1, z-1) + mu(x+1, y-1, z-1)); - - // Note that we are using the velocity values at t+1. - GridValue d_x_val = - c1 * (vel_x(t+1, x+1, y, z ) - vel_x(t+1, x, y, z )) + - c2 * (vel_x(t+1, x+2, y, z ) - vel_x(t+1, x-1, y, z )); - GridValue d_y_val = - c1 * (vel_y(t+1, x, y, z ) - vel_y(t+1, x, y-1, z )) + - c2 * (vel_y(t+1, x, y+1, z ) - vel_y(t+1, x, y-2, z )); - GridValue d_z_val = - c1 * (vel_z(t+1, x, y, z ) - vel_z(t+1, x, y, z-1)) + - c2 * (vel_z(t+1, x, y, z+1) - vel_z(t+1, x, y, z-2)); - - // Define stress components. - define_stress_xx(at_last_z, - lambda_val, mu_val, d_x_val, d_y_val, d_z_val); - define_stress_yy(at_last_z, - lambda_val, mu_val, d_x_val, d_y_val, d_z_val); - define_stress_zz(at_last_z, - lambda_val, mu_val, d_x_val, d_y_val, d_z_val); - define_stress_xy(at_last_z); - define_stress_xz(at_last_z); - define_stress_yz(at_last_z); - - // Boundary conditions. -#ifdef DO_SURFACE - define_free_surface_stress(at_last_z); +#ifdef DO_ABOVE_SURFACE + define_free_surface_vel(); + define_free_surface_stress(); #endif } }; REGISTER_STENCIL(AwpElasticStencil); + +#undef DO_SURFACE +#undef FULL_SPONGE_GRID +#undef USE_SCRATCH_GRIDS From 2f1ec50ea39ab7a0bfd643821af2ae972826c571 Mon Sep 17 00:00:00 2001 From: Chuck Yount Date: Fri, 20 Apr 2018 08:56:18 -0700 Subject: [PATCH 04/21] Remove some deprecated compiler APIs. 
--- include/yask_compiler_api.hpp | 148 ++++++-------------------- include/yask_kernel_api.hpp | 6 +- include/yk_solution_api.hpp | 103 ------------------ src/compiler/lib/Expr.hpp | 11 +- src/compiler/lib/Soln.hpp | 8 +- src/compiler/swig/yask_compiler_api.i | 5 +- src/kernel/swig/yask_kernel_api.i | 3 +- src/stencils/AwpElasticStencil.hpp | 38 +++---- 8 files changed, 65 insertions(+), 257 deletions(-) diff --git a/include/yask_compiler_api.hpp b/include/yask_compiler_api.hpp index a6c3bd81..f865e167 100644 --- a/include/yask_compiler_api.hpp +++ b/include/yask_compiler_api.hpp @@ -44,13 +44,9 @@ namespace yask { typedef std::shared_ptr yc_solution_ptr; class yc_grid; - /// Shared pointer to \ref yc_grid + /// Pointer to \ref yc_grid typedef yc_grid* yc_grid_ptr; - class yc_equation_group; - /// Shared pointer to \ref yc_equation_group; - typedef std::shared_ptr yc_equation_group_ptr; - // Forward declarations of expression nodes and their pointers. class yc_expr_node; @@ -105,7 +101,7 @@ namespace yask { /// Shared pointer to \ref yc_bool_node typedef std::shared_ptr yc_bool_node_ptr; - /// Factory to create objects needed to define a stencil solution. + /// Bootstrap factory to create objects needed to define a stencil solution. class yc_factory { public: virtual ~yc_factory() {} @@ -157,6 +153,16 @@ namespace yask { set_name(std::string name /**< [in] Name; must be a valid C++ identifier. */ ) =0; + /// Get current floating-point precision setting. + /** @returns Number of bytes in a FP number. */ + virtual int + get_element_bytes() const =0; + + /// Set floating-point precision. + virtual void + set_element_bytes(int nbytes /**< [in] Number of bytes in a FP number. + Should be 4 or 8. */ ) =0; + /// Create an n-dimensional grid variable in the solution. /** "Grid" is a generic term for any n-dimensional variable. A 0-dim @@ -237,64 +243,21 @@ namespace yask { Each dimension is identified by an associated index. 
*/ ) =0; #endif - /// Get all the grids in the solution. - /** @returns Vector containing pointer to all grids. */ - virtual std::vector - get_grids() =0; - /// Get the number of grids in the solution. /** @returns Number of grids that have been created via new_grid(). */ virtual int get_num_grids() const =0; + /// Get all the grids in the solution. + /** @returns Vector containing pointer to all grids. */ + virtual std::vector + get_grids() =0; + /// Get the specified grid. /** @returns Pointer to the specified grid or null pointer if it does not exist. */ virtual yc_grid_ptr get_grid(const std::string& name /**< [in] Name of the grid. */ ) =0; - /// Get the number of equations in the solution. - /** Equations are added when yc_node_factory::new_equation_node() is called. - @returns Number of equations that have been created. */ - virtual int - get_num_equations() const =0; - - /// Get the specified equation. - /** @returns Pointer to \ref yc_equation_node of nth equation. */ - virtual yc_equation_node_ptr - get_equation(int n /**< [in] Index of equation between zero (0) - and get_num_equations()-1. */ ) =0; - - /// Create a new equation group. - /** - In normal usage, equation groups are created automatically when - format() is called. Under automatic grouping, the YASK compiler - discovers dependencies between equations and places equations - together in a group if they do not depend upon one another. - Then, the YASK compiler schedules the resulting groups for - execution in the kernel based on the dependencies between groups. - - A \ref yc_equation_group object allows manual grouping of equations. - Under manual grouping, the YASK compiler does _not_ check - for illegal dependencies within the group. - In addition, if `do_schedule` is `false`, the YASK compiler - will not check for dependencies with other groups and - will not schedule the group for execution in the kernel. 
- Then, it will be the programmer's responsibility to run the - stencil group via yk_solution::run_stencil_group(). - - This capability is useful for processing equations that - the YASK compiler cannot currently handle, like equations - with dependencies between different points of a grid - at the same step index. - - @returns Pointer to the new \ref yc_equation_group object. - */ - virtual yc_equation_group_ptr - new_equation_group(const std::string& name - /**< [in] Name of the group. */, - bool do_schedule = true - /**< [in] Schedule the group for execution in the kernel. */ ) =0; - /// Set the vectorization length in given dimension. /** For YASK-code generation, the product of the fold lengths should be equal to the number of elements in a HW SIMD register. @@ -321,16 +284,6 @@ namespace yask { virtual void clear_folding() =0; - /// Get current floating-point precision setting. - /** @returns Number of bytes in a FP number. */ - virtual int - get_element_bytes() const =0; - - /// Set floating-point precision. - virtual void - set_element_bytes(int nbytes /**< [in] Number of bytes in a FP number. - Should be 4 or 8. */ ) =0; - /// Set the cluster multiplier (unroll factor) in given dimension. /** For YASK-code generation, this will have the effect of creating N vectors of output for each equation, where N is the product of @@ -349,6 +302,18 @@ namespace yask { virtual void clear_clustering() =0; + /// Get the number of equations in the solution. + /** Equations are added when yc_node_factory::new_equation_node() is called. + @returns Number of equations that have been created. */ + virtual int + get_num_equations() const =0; + + /// Get a list of all the defined equations. + /** @returns Vector of containing pointers to all + equations that have been created. */ + virtual std::vector + get_equations() =0; + /// Format the current equation(s) and write to given output object. 
/** Currently supported format types: Type | Output @@ -396,12 +361,6 @@ namespace yask { /** @returns Number of dimensions created via new_grid(). */ virtual int get_num_dims() const =0; - /// Get the name of the specified dimension. - /** @returns String containing name of dimension created via new_grid(). */ - virtual const std::string& - get_dim_name(int n /**< [in] Index of dimension between zero (0) - and get_num_dims()-1. */ ) const =0; - /// Get all the dimensions in this grid. /** Includes step dimension if it is a dimension of this grid. @@ -584,7 +543,7 @@ namespace yask { virtual yc_number_node_ptr get_rhs() =0; }; - /// Base class for all real or integer AST nodes. + /// Base class for all numerical AST nodes. /** An object of this abstract type cannot be created. */ class yc_number_node : public virtual yc_expr_node { }; @@ -671,11 +630,10 @@ namespace yask { virtual int get_num_operands() =0; - /// Get the specified operand. - /** @returns Pointer to node at given position or null pointer if out of bounds. */ - virtual yc_number_node_ptr - get_operand(int i /**< [in] Index between zero (0) - and get_num_operands()-1. */ ) =0; + /// Get a list of the operands. + /** @returns Vector of pointers to all operand nodes. */ + virtual std::vector + get_operands() =0; /// Add an operand. virtual void @@ -722,44 +680,6 @@ namespace yask { get_rhs() =0; }; - /// A manual grouping of stencil equations. - /** - Created via yc_solution::new_equation_group(). - See yc_solution::new_equation_group() for a description of - automatic versus manual grouping. - - After a \ref yc_equation_group is processed by the YASK - compiler and the resulting kernel is compiled, - it will be visible as a \ref yk_stencil_group - in the corresponding YASK kernel. - */ - class yc_equation_group { - public: - - /// Get the name of this group. - /** - @returns Name created via yc_solution::new_equation_group(). 
- */ - virtual const std::string& - get_name() const =0; - - /// Determine whether this group will be automatically scheduled. - /** - @returns `true` if this group will be run via yk_solution::run_solution() - or `false` if this group must be run via yk_solution::run_stencil_group(). - This is the `do_schedule` setting passed via yc_solution::new_equation_group(). - */ - virtual bool - get_do_schedule() const =0; - - /// Add an equation to this group. - virtual void - add_equation(yc_equation_node_ptr equation - /**< [in] Pointer to equation to be added. */ ) =0; - - public: - }; - } // namespace yask. #endif diff --git a/include/yask_kernel_api.hpp b/include/yask_kernel_api.hpp index e579cac1..8d407962 100644 --- a/include/yask_kernel_api.hpp +++ b/include/yask_kernel_api.hpp @@ -60,10 +60,6 @@ namespace yask { /// Shared pointer to \ref yk_grid. typedef std::shared_ptr yk_grid_ptr; - class yk_stencil_group; - /// Shared pointer to \ref yk_stencil_group. - typedef std::shared_ptr yk_stencil_group; - class yk_stats; /// Shared pointer to \ref yk_stats. typedef std::shared_ptr yk_stats_ptr; @@ -75,7 +71,7 @@ namespace yask { namespace yask { - /// Factory to create a stencil solution. + /// Bootstrap factory to create a stencil solution. class yk_factory { public: virtual ~yk_factory() {} diff --git a/include/yk_solution_api.hpp b/include/yk_solution_api.hpp index d23f335c..d7c554a8 100644 --- a/include/yk_solution_api.hpp +++ b/include/yk_solution_api.hpp @@ -668,80 +668,6 @@ namespace yask { apply_command_line_options(const std::string& args /**< [in] String of arguments to parse. */ ) =0; - /// **[Advanced]** Get the specified stencil group. - /** - @returns Pointer to the specified \ref yk_stencil_group - or null pointer if it does not exist. - */ - virtual yk_stencil_group_ptr - get_stencil_group(const std::string& name - /**< [in] Name of the group. */ ) =0; - - /// **[Advanced]** Get all the stencil groups. 
- /** - @returns List of all stencil groups in the solution. - */ - virtual std::vector - get_stencil_groups() =0; - - /// **[Advanced]** Run the specified stencil group over the given sub-domain. - /** - Applies all the stencil kernels in the given group - from `first_domain_indices` at `first_step_index` - to `last_domain_indices` at `last_domain_index` (inclusive) in each dimension. - Each list of domain indices should contain the indices for the - dimensions returned by get_domain_dim_names() in the same order. - - Indices are relative to the *overall* problem domain and - need not be limited to fall within the domain of the current MPI rank. - The actual points to which the group is applied on each rank will be - limited internally as needed. - - Example C++ usage: - - \code{.cpp} - // Find my custom stencil group created in the YASK compiler. - auto my_group = soln->get_stencil_group("my_group"); - ... - soln->prepare_solution(); - ... - // Set first_indices and last_indices to apply my_group - // to only the first slice in the "z" dimension. - std::vector first_indices, last_indices; - for (auto dim : soln->get_domain_dim_names()) { - auto overall_size = soln->get_overall_domain_size(dim); - first_indices.push_back(0); - if (dim == "z") - last_indices.push_back(0); - else - last_indices.push_back(overall_size - 1); - } - ... - // Execute the time-steps. - for (idx_t t = 0; t < num_steps; t++) { - - // Apply the automatically-scheduled stencils. - soln->run_solution(t); - - // Apply my custom stencil group. - soln->run_stencil_group(my_group, - t, first_indices, - t, last_indices); - } - soln->end_solution(); - \endcode - - @returns Number of points to which the group was applied. - */ - virtual idx_t - run_stencil_group(yk_stencil_group_ptr stencil_group - /**< [in] Pointer to the stencil group obtained from - get_stencil_groups() or get_stencil_group(). */, - const std::vector& first_domain_indices - /**< [in] List of initial domain indices. 
*/, - const std::vector& last_domain_indices - /**< [in] List of final domain indices. */ ) =0; - /// **[Advanced]** Use data-storage from existing grids in specified solution. /** Calls yk_grid::share_storage() for each pair of grids that have the same name @@ -804,35 +730,6 @@ namespace yask { get_elapsed_run_secs() =0; }; - /// A group of stencil kernels. - /** - Groups of stencils are created automatically by the YASK stencil compiler - or manually via yc_solution::new_equation_group(). See the latter for - more information. - */ - class yk_stencil_group { - public: - virtual ~yk_stencil_group() {} - - /// Get the name of this group. - /** - @returns Default name given by the YASK stencil compiler - or the name provided via yc_solution::new_equation_group(). - */ - virtual const std::string& - get_name() const =0; - - /// Determine whether this group will be automatically scheduled. - /** - @returns `true` if this group will be run via yk_solution::run_solution() - or `false` if this group must be run via yk_solution::run_stencil_group(). - This is the `do_schedule` setting passed via yc_solution::new_equation_group(). - */ - virtual bool - is_scheduled() const =0; - - }; - } // namespace yask. 
#endif diff --git a/src/compiler/lib/Expr.hpp b/src/compiler/lib/Expr.hpp index de6d9810..b83db317 100644 --- a/src/compiler/lib/Expr.hpp +++ b/src/compiler/lib/Expr.hpp @@ -662,12 +662,11 @@ namespace yask { virtual int get_num_operands() { return _ops.size(); } - virtual yc_number_node_ptr get_operand(int i) { - if (i >= 0 && - size_t(i) < _ops.size()) - return _ops.at(size_t(i)); - else - return nullptr; + virtual std::vector get_operands() { + std::vector nv; + for (int i = 0; i < get_num_operands(); i++) + nv.push_back(_ops.at(i)); + return nv; } virtual void add_operand(yc_number_node_ptr node) { auto p = dynamic_pointer_cast(node); diff --git a/src/compiler/lib/Soln.hpp b/src/compiler/lib/Soln.hpp index 20142e69..b91a50da 100644 --- a/src/compiler/lib/Soln.hpp +++ b/src/compiler/lib/Soln.hpp @@ -164,9 +164,11 @@ namespace yask { virtual int get_num_equations() const { return _eqs.getNumEqs(); } - virtual yc_equation_node_ptr get_equation(int n) { - assert(n >= 0 && n < get_num_equations()); - return _eqs.getEqs().at(n); + virtual std::vector get_equations() { + std::vector ev; + for (int i = 0; i < get_num_equations(); i++) + ev.push_back(_eqs.getEqs().at(i)); + return ev; } virtual void set_fold(const std::string& dim, int len) { auto& fold = _settings._foldOptions; diff --git a/src/compiler/swig/yask_compiler_api.i b/src/compiler/swig/yask_compiler_api.i index e9c3c5e1..9b36ae0e 100644 --- a/src/compiler/swig/yask_compiler_api.i +++ b/src/compiler/swig/yask_compiler_api.i @@ -40,7 +40,6 @@ IN THE SOFTWARE. // Must declare shared_ptrs for the entire expr_node hierarchy! %shared_ptr(yask::yc_solution) //%shared_ptr(yask::yc_grid) -%shared_ptr(yask::yc_equation_group) %shared_ptr(yask::yc_expr_node) %shared_ptr(yask::yc_index_node) %shared_ptr(yask::yc_equation_node) @@ -63,7 +62,9 @@ IN THE SOFTWARE. // All vector types used in API. 
%template(vector_int) std::vector; %template(vector_str) std::vector; -%template(vector_index_ptr) std::vector>; +%template(vector_index) std::vector>; +%template(vector_num) std::vector>; +%template(vector_eq) std::vector>; %template(vector_grid) std::vector; %exception { diff --git a/src/kernel/swig/yask_kernel_api.i b/src/kernel/swig/yask_kernel_api.i index 2da88f25..421f79ad 100644 --- a/src/kernel/swig/yask_kernel_api.i +++ b/src/kernel/swig/yask_kernel_api.i @@ -43,7 +43,6 @@ IN THE SOFTWARE. %shared_ptr(yask::yk_settings) %shared_ptr(yask::yk_solution) %shared_ptr(yask::yk_grid) -%shared_ptr(yask::yk_stencil_group) %shared_ptr(yask::yk_stats) // Mutable buffer to access raw data. @@ -70,3 +69,5 @@ IN THE SOFTWARE. %include "yask_common_api.hpp" %include "yask_kernel_api.hpp" +%include "yk_solution_api.hpp" +%include "yk_grid_api.hpp" diff --git a/src/stencils/AwpElasticStencil.hpp b/src/stencils/AwpElasticStencil.hpp index 8ecb9b6e..5dabd289 100644 --- a/src/stencils/AwpElasticStencil.hpp +++ b/src/stencils/AwpElasticStencil.hpp @@ -188,6 +188,7 @@ class AwpElasticStencil : public StencilBase { (2.0 / mu(x, y, z) + 1.0 / lambda(x, y, z)))); // Define equivalencies to be valid only when z == last value in domain. + // This writes into the halo region. vel_x(t+1, x, y, z+1) EQUALS plus1_vel_x IF at_last_z; vel_y(t+1, x, y, z+1) EQUALS plus1_vel_y @@ -265,15 +266,12 @@ class AwpElasticStencil : public StencilBase { ((mu_val * delta_t / h) * (d_xz_val + d_zx_val)); adjust_for_sponge(next_stress_xz); - // define the value at t+1 (special case: zero at surface). -#ifdef DO_SURFACE + // define the value at t+1. 
stress_xz(t+1, x, y, z) EQUALS next_stress_xz - IF !at_last_z; - stress_xz(t+1, x, y, z) EQUALS 0.0 - IF at_last_z; -#else - stress_xz(t+1, x, y, z) EQUALS next_stress_xz; +#ifdef DO_SURFACE + IF !at_last_z #endif + ; } void define_stress_yz(Condition at_last_z) { @@ -292,15 +290,12 @@ class AwpElasticStencil : public StencilBase { ((mu_val * delta_t / h) * (d_yz_val + d_zy_val)); adjust_for_sponge(next_stress_yz); - // define the value at t+1 (special case: zero at surface). -#ifdef DO_SURFACE + // define the value at t+1. stress_yz(t+1, x, y, z) EQUALS next_stress_yz - IF !at_last_z; - stress_yz(t+1, x, y, z) EQUALS 0.0 - IF at_last_z; -#else - stress_yz(t+1, x, y, z) EQUALS next_stress_yz; +#ifdef DO_SURFACE + IF !at_last_z #endif + ; } void define_stress_zz(Condition at_last_z, GridValue lambda_val, GridValue mu_val, @@ -311,33 +306,30 @@ class AwpElasticStencil : public StencilBase { (lambda_val * (d_x_val + d_y_val + d_z_val)))); adjust_for_sponge(next_stress_zz); - // define the value at t+1 (special case: zero at surface). -#ifdef DO_SURFACE - stress_zz(t+1, x, y, z) EQUALS next_stress_zz - IF !at_last_z; - stress_zz(t+1, x, y, z) EQUALS 0.0 - IF at_last_z; -#else + // define the value at t+1 (no special case at surface). stress_zz(t+1, x, y, z) EQUALS next_stress_zz; -#endif } // Free-surface boundary equations for stress. void define_free_surface_stress(Condition at_last_z) { // Define equivalencies to be valid only when z == last value in domain. - // Note that values beyond the last index are updated, i.e., in the halo. + // This writes into the halo region. 
stress_zz(t+1, x, y, z+1) EQUALS -stress_zz(t+1, x, y, z) IF at_last_z; stress_zz(t+1, x, y, z+2) EQUALS -stress_zz(t+1, x, y, z-1) IF at_last_z; + stress_xz(t+1, x, y, z) EQUALS 0.0 + IF at_last_z; stress_xz(t+1, x, y, z+1) EQUALS -stress_xz(t+1, x, y, z-1) IF at_last_z; stress_xz(t+1, x, y, z+2) EQUALS -stress_xz(t+1, x, y, z-2) IF at_last_z; + stress_yz(t+1, x, y, z) EQUALS 0.0 + IF at_last_z; stress_yz(t+1, x, y, z+1) EQUALS -stress_yz(t+1, x, y, z-1) IF at_last_z; stress_yz(t+1, x, y, z+2) EQUALS -stress_yz(t+1, x, y, z-2) From 4224e47954efd4b73296d83b1946ae9eb5057713 Mon Sep 17 00:00:00 2001 From: Chuck Yount Date: Mon, 23 Apr 2018 17:42:01 -0700 Subject: [PATCH 05/21] Add API to turn off dependency checker. Improved docs by adding separate modules. Add operator overloading for numerical ops. Still need to do bool ones. Partial work on issues #93 and #96. --- bin/gen_nodes.pl | 159 ++++++++++--- bin/yask_compiler_api_test.py | 35 +-- docs/api/mainpage.txt | 79 ++++--- include/yask_common_api.hpp | 8 + include/yask_compiler_api.hpp | 38 +++- include/yask_kernel_api.hpp | 13 ++ include/{yc_nodes.hpp => yc_node_api.hpp} | 209 +++++++++++------- include/yk_grid_api.hpp | 8 +- include/yk_solution_api.hpp | 6 + src/compiler/lib/Eqs.cpp | 29 ++- src/compiler/lib/Eqs.hpp | 5 +- src/compiler/lib/Expr.cpp | 59 ++++- src/compiler/lib/Expr.hpp | 8 +- src/compiler/lib/Grid.hpp | 1 + src/compiler/lib/Soln.cpp | 2 +- src/compiler/lib/Soln.hpp | 2 + src/compiler/main.cpp | 17 +- src/compiler/swig/yask_compiler_api.i | 63 +++++- src/compiler/tests/yask_compiler_api_test.cpp | 13 +- 19 files changed, 564 insertions(+), 190 deletions(-) rename include/{yc_nodes.hpp => yc_node_api.hpp} (81%) diff --git a/bin/gen_nodes.pl b/bin/gen_nodes.pl index fc6c3abd..897fa461 100755 --- a/bin/gen_nodes.pl +++ b/bin/gen_nodes.pl @@ -1,6 +1,10 @@ #! /usr/bin/env perl #-*-Perl-*- This line forces emacs to use Perl mode. +# This utility is used to generate some API code for the compiler. 
+# The code generated may require additional editing, so it is only +# used for one-time generation. + use strict; use File::Basename; use File::Path; @@ -14,12 +18,30 @@ $| = 1; # autoflush. -my @nbops = qw(equals not_equals less_than greater_than not_less_than not_greater_than); -my @bbops = qw(and or); -my @ubops = qw(not); +# num to num. +my %nops = ("add" => "+", + "subtract" => "-", + "multiply" => "*", + "divide" => "/"); +my %bnops = ("negate" => "-"); + +# num to bool. +my %nbops = ("equals" => "==", + "not_equals" => "!=", + "less_than" => "<", + "greater_than" => ">", + "not_less_than" => ">=", + "not_greater_than" => "<="); + +# bool to bool. +my %bbops = ("and" => "&&", + "or" => "||"); +my %ubops = ("not" => "!"); # decls. -for my $node (@nbops, @bbops, @ubops) { +for my $node (sort keys %nbops, + sort keys %bbops, + sort keys %ubops) { my $n2 = "yc_${node}_node"; print " class $n2;\n". @@ -28,14 +50,17 @@ } # swig decls. -for my $node (@nbops, @bbops, @ubops) { +for my $node (sort keys %nbops, + sort keys %bbops, + sort keys %ubops) { my $n2 = "yc_${node}_node"; print "\%shared_ptr(yask::$n2)\n"; } # binary ops. -for my $node (@bbops) { - my $n2 = "yc_${node}_node"; +for my $node (sort keys %bbops) { + my $n2 = "yc_${node}_node"; + my $oper = $bbops{$node}; print <<"END"; /// Create a boolean $node node. @@ -43,14 +68,15 @@ \@returns Pointer to new \\ref $n2 object. */ virtual ${n2}_ptr - new_${node}_node(yc_bool_node_ptr lhs /**< [in] Expression before '?' sign. */, - yc_bool_node_ptr rhs /**< [in] Expression after '?' sign. */ ); + new_${node}_node(yc_bool_node_ptr lhs /**< [in] Expression before '$oper' sign. */, + yc_bool_node_ptr rhs /**< [in] Expression after '$oper' sign. */ ); END } # comparison ops. -for my $node (@nbops) { - my $n2 = "yc_${node}_node"; +for my $node (sort keys %nbops) { + my $n2 = "yc_${node}_node"; + my $oper = $nbops{$node}; print <<"END"; @@ -59,31 +85,32 @@ END \@returns Pointer to new \\ref $n2 object. 
*/ virtual ${n2}_ptr - new_${node}_node(yc_number_node_ptr lhs /**< [in] Expression before '?' sign. */, - yc_number_node_ptr rhs /**< [in] Expression after '?' sign. */ ); + new_${node}_node(yc_number_node_ptr lhs /**< [in] Expression before '$oper' sign. */, + yc_number_node_ptr rhs /**< [in] Expression after '$oper' sign. */ ); END } # binary ops. -for my $node (@bbops) { - my $n2 = "yc_${node}_node"; +for my $node (sort keys %bbops) { + my $n2 = "yc_${node}_node"; + my $oper = $bbops{$node}; print <<"END"; /// A boolean '$node' operator. - /** Example: used to implement `a ?? b`. + /** Example: used to implement `a $oper b`. Created via yc_node_factory::new_${node}_node(). */ class $n2 : public virtual yc_bool_node { public: /// Get the left-hand-side operand. - /** \@returns Expression node on left-hand-side of '?' sign. */ + /** \@returns Expression node on left-hand-side of '$oper' sign. */ virtual yc_bool_node_ptr get_lhs() =0; /// Get the right-hand-size operand. - /** \@returns Expression node on right-hand-side of '?' sign. */ + /** \@returns Expression node on right-hand-side of '$oper' sign. */ virtual yc_bool_node_ptr get_rhs() =0; }; @@ -91,25 +118,26 @@ END } # comparison ops. -for my $node (@nbops) { - my $n2 = "yc_${node}_node"; +for my $node (sort keys %nbops) { + my $n2 = "yc_${node}_node"; + my $oper = $nbops{$node}; print <<"END"; /// A numerical-comparison '$node' operator. - /** Example: used to implement `a ?? b`. + /** Example: used to implement `a $oper b`. Created via yc_node_factory::new_${node}_node(). */ class $n2 : public virtual yc_bool_node { public: /// Get the left-hand-side operand. - /** \@returns Expression node on left-hand-side of '?' sign. */ + /** \@returns Expression node on left-hand-side of '$oper' sign. */ virtual yc_bool_node_ptr get_lhs() =0; /// Get the right-hand-size operand. - /** \@returns Expression node on right-hand-side of '?' sign. */ + /** \@returns Expression node on right-hand-side of '$oper' sign. 
*/ virtual yc_bool_node_ptr get_rhs() =0; }; @@ -117,11 +145,13 @@ END } # binary ops. -for my $node (@bbops) { - my $n2 = "yc_${node}_node"; - my $n3 = $node; - $n3 =~ s/(\w+)/\u\L$1/g; - $n3 .= 'Expr'; +for my $node (sort keys %bbops) { + my $n2 = "yc_${node}_node"; + my $oper = $bbops{$node}; + my $n3 = $node; + $n3 =~ s/([a-z]+)/\u\L$1/g; + $n3 =~ s/_//g; + $n3 .= 'Expr'; print <<"END"; ${n2}_ptr @@ -137,13 +167,14 @@ END END } -# comp. ops. -for my $node (@nbops) { - my $n2 = "yc_${node}_node"; - my $n3 = $node; - $n3 =~ s/(\w+)/\u\L$1/g; - $n3 =~ s/_//g; - $n3 .= 'Expr'; +# comparison ops. +for my $node (sort keys %nbops) { + my $n2 = "yc_${node}_node"; + my $oper = $nbops{$node}; + my $n3 = $node; + $n3 =~ s/([a-z]+)/\u\L$1/g; + $n3 =~ s/_//g; + $n3 .= 'Expr'; print <<"END"; ${n2}_ptr @@ -157,3 +188,61 @@ END } END } + +# binary num ops. +for my $node (sort keys %nops) { + my $n2 = "yc_${node}_node"; + my $n2p = $n2.'_ptr'; + my $oper = $nops{$node}; + print "$n2p operator$oper(yc_number_node_ptr lhs, yc_number_node_ptr rhs);\n"; + print "$n2p operator$oper(double lhs, yc_number_node_ptr rhs);\n"; + print "$n2p operator$oper(yc_number_node_ptr lhs, double);\n"; +} + +# binary num ops. +for my $node (sort keys %nops) { + my $n2 = "yc_${node}_node"; + my $n2p = $n2.'_ptr'; + my $oper = $nops{$node}; + my $n3 = $node; + $n3 =~ s/([a-z]+)/\u\L$1/g; + $n3 =~ s/_//g; + $n3 .= 'Expr'; + + print <<"END"; + $n2p operator$oper(yc_number_node_ptr lhs, yc_number_node_ptr rhs) { + auto lp = dynamic_pointer_cast(lhs); + assert(lp); + auto rp = dynamic_pointer_cast(rhs); + assert(rp); + return make_shared<$n3>(lp, rp); + } + $n2p operator$oper(double lhs, yc_number_node_ptr rhs) { + return operator$oper(constNum(lhs), rhs); + } + $n2p operator$oper(yc_number_node_ptr lhs, double rhs) { + return operator$oper(lhs, constNum(rhs)); + } +END +} + +# binary num ops. 
+for my $node (sort keys %nops) { + my $n2 = "yc_${node}_node"; + my $n2p = $n2.'_ptr'; + my $oper = $nops{$node}; + my $n3 = $node; + $n3 =~ s/([a-z]+)/\u\L$1/g; + $n3 =~ s/_//g; + $n3 .= 'Expr'; + + print <<"END"; +%extend yask::yc_number_node { + yask::yc_number_node_ptr __${node}__(yask::yc_number_node* rhs) { + auto lp = \$self->clone_ast(); + auto rp = rhs->clone_ast(); + return yask::operator$oper(lp, rp); + } + }; +END +} diff --git a/bin/yask_compiler_api_test.py b/bin/yask_compiler_api_test.py index 1e9979b0..1932ce21 100755 --- a/bin/yask_compiler_api_test.py +++ b/bin/yask_compiler_api_test.py @@ -54,28 +54,31 @@ # Create an expression for the new value. # This will average some of the neighboring points around the # current stencil application point in the current timestep. - n0 = g1.new_relative_grid_point([0, 0, 0, 0]) # center-point at this timestep. - n1 = nfac.new_add_node(n0, g1.new_relative_grid_point([0, -1, 0, 0])) # left. - n1 = nfac.new_add_node(n1, g1.new_relative_grid_point([0, 1, 0, 0])) # right. - n1 = nfac.new_add_node(n1, g1.new_relative_grid_point([0, 0, -1, 0])) # above. - n1 = nfac.new_add_node(n1, g1.new_relative_grid_point([0, 0, 1, 0])) # below. - n1 = nfac.new_add_node(n1, g1.new_relative_grid_point([0, 0, 0, -1])) # in front. - n1 = nfac.new_add_node(n1, g1.new_relative_grid_point([0, 0, 0, 1])) # behind. - n2 = nfac.new_divide_node(n1, nfac.new_const_number_node(7)) # div by 7. - - # Define value in scratch grid. - sn0 = sg1.new_relative_grid_point([0, 0, 0]) # center-point. + n1 = (g1.new_relative_grid_point([0, 0, 0, 0]) + # center-point at this timestep. + g1.new_relative_grid_point([0, -1, 0, 0]) + # left. + g1.new_relative_grid_point([0, 1, 0, 0]) + # right. + g1.new_relative_grid_point([0, 0, -1, 0]) + # above. + g1.new_relative_grid_point([0, 0, 1, 0]) + # below. + g1.new_relative_grid_point([0, 0, 0, -1]) + # in front. + g1.new_relative_grid_point([0, 0, 0, 1])) # behind. + n2 = n1 / 7 # ave of the 7 points. 
+ + # Define value in scratch grid to be the above equation, i.e., + # this is a temporary 3-D variable that holds the average + # values of each point. + sn0 = sg1.new_relative_grid_point([0, 0, 0]) # LHS of eq is just a point on scratch-grid sn1 = nfac.new_equation_node(sn0, n2) # equate to expr n2. print("Scratch-grid equation before formatting: " + sn1.format_simple()) - # Use values in scratch grid. - sn2 = sg1.new_relative_grid_point([1, 0, 0]) - sn3 = nfac.new_add_node(sn2, sg1.new_relative_grid_point([0, 1, 0])) - sn4 = nfac.new_add_node(sn3, sg1.new_relative_grid_point([0, 0, 1])) + # Use values in scratch grid to make a new eq. + sn2 = (sg1.new_relative_grid_point([1, 0, 0]) + + sg1.new_relative_grid_point([0, 1, 0]) + + sg1.new_relative_grid_point([0, 0, 1])) + sn5 = -sn2 * 2.5 - 9.1 # Create an equation to define the value at the next timestep. n3 = g1.new_relative_grid_point([1, 0, 0, 0]) # center-point at next timestep. - n4 = nfac.new_equation_node(n3, sn4) # equate to expr from scratch grid. + n4 = nfac.new_equation_node(n3, sn5) # equate to expr from scratch grid. print("Main-grid equation before formatting: " + n4.format_simple()) print("Solution '" + soln.get_name() + "' contains " + str(soln.get_num_grids()) + " grid(s), and " + diff --git a/docs/api/mainpage.txt b/docs/api/mainpage.txt index c4afffd1..d6d6ddf1 100644 --- a/docs/api/mainpage.txt +++ b/docs/api/mainpage.txt @@ -7,17 +7,22 @@ namespace yask { /** @mainpage +\tableofcontents + @section intro Introduction The typical high-level YASK workflow is as follows: --# Define a stencil solution. - Use the YASK stencil compiler to generate C++ kernel code from the stencil solution. --# Compile the generated kernel code to create a YASK kernel library. - Create and use a stencil-based application using the kernel library. - -There are two sets of APIs provided by YASK corresponding to these tasks: --# The YASK Stencil Compiler API (available in C++ and Python). 
--# The YASK Stencil Kernel API (available in C++ and Python). +-# \ref yc_intro + - Define a stencil solution using the YASK domain-specific language (DSL). + - Use the YASK stencil compiler to generate C++ kernel code from the stencil solution. +-# \ref yk_intro + - Compile the generated C++ kernel code to create a YASK kernel library. + - Create and use a stencil-based application using the kernel library. +-# Test and deploy your new YASK-enabled application. + +There are two sets of APIs provided by YASK corresponding to the first two tasks: +-# The \ref sec_yc (available in C++ and Python). +-# The \ref sec_yk (available in C++ and Python). For each of the tasks, you can either use the YASK-provided application or create your own application built with the corresponding API. @@ -48,15 +53,16 @@ A new stencil solution may be defined in one of the following ways: - See example stencils in `src/stencils`. These stencils are written as classes inherited from the `StencilBase` class. -# Use the YASK compiler API to create another application that defines stencils. - - This approach is typically taken when a front-end tool will be creating stencils - from a higher-level description, e.g., applying finite-difference methods to differential equations. + - This approach is typically taken when a 3rd-party front-end tool will be creating stencils + from another, possibly higher-level, description, + e.g., applying finite-difference methods to differential equations. - In this case, the equations are built up programmatically into an abstract syntax tree (AST). - The mechanism to generate the kernel code in this approach depends on the application created from the APIs. The application might be run from a command-prompt, or the user might control it interactively. - See `src/compiler/tests/yask_compiler_api_test.cpp` for an example stencil definition in C++. - See `bin/yask_compiler_api_test.py` for an example stencil definition in Python. 
- - See \ref yc for documentation on the compiler API. + - See \ref sec_yc for documentation on the compiler API. In either case, the resulting generated code should written to the C++ stencil-code file, `src/kernel/gen/yask_stencil_code.hpp`. @@ -88,31 +94,13 @@ This may be done in one of the following ways: final-state data for analysis or further processing. - See `src/kernel/tests/yask_kernel_api_test.cpp` for an example kernel usage in C++. - See `bin/yask_kernel_api_test.py` for an example kernel usage in Python. - - See \ref yk for documentation on the kernel API. + - See \ref sec_yk for documentation on the kernel API. -@note Anytime you want to change the name or compile-time properties of the kernel, be sure to run +@note Anytime you want to change the name or any compile-time properties of the kernel, be sure to run `make clean` to force the removal of all kernel-specific intermediate code. Otherwise, you will likely see some unexpected errors when building the new kernel. -@subsection examples Example Tests - -The following examples illustrate possible combinations of compilers and kernels. -- You can substitute `snb` for one of the other architectures listed in the `Makefile` if desired. -- Run `make clean` before all of the example commands to ensure consistent builds. 
- -Stencil Compiler | Stencil Application | Test Command ---------------------|---------------------|------------- -YASK-provided | YASK-provided | `make -j stencil=iso3dfd arch=snb yc-and-yk-test` -YASK-provided | C++ test example | `make -j stencil=iso3dfd arch=snb yc-and-cxx-yk-api-test` -YASK-provided | Python test example | `make -j stencil=iso3dfd arch=snb yc-and-py-yk-api-test` -C++ test example | YASK-provided | `make -j stencil=test arch=snb cxx-yc-api-and-yk-test` -C++ test example | C++ test example | `make -j stencil=test arch=snb cxx-yc-api-and-cxx-yk-api-test` -C++ test example | Python test example | `make -j stencil=test arch=snb cxx-yc-api-and-py-yk-api-test` -Python test example | YASK-provided | `make -j stencil=test arch=snb py-yc-api-and-yk-test` -Python test example | C++ test example | `make -j stencil=test arch=snb py-yc-api-and-cxx-yk-api-test` -Python test example | Python test example | `make -j stencil=test arch=snb py-yc-api-and-py-yk-api-test` - -@section yc YASK Stencil Compiler API +@section sec_yc YASK Stencil Compiler API @subsection yc_oview Compiler Overview This section provides usage information for the YASK stencil compiler @@ -121,6 +109,8 @@ The API is available for C++ and for Python via SWIG. Type names are prefixed with 'yc_' to indicate "YASK compiler"; this distinguishes them from the 'yk_'-prefixed types used in the "YASK kernel" API. +The types, classes, and functions are listed in \ref yc. + @subsection yc_usage Typical Program Flow using the Compiler API - Create a yc_factory. This is the "bootstrap" object that will be used to create others. - Create a yc_solution object via yc_factory::new_solution(). @@ -149,7 +139,7 @@ this distinguishes them from the 'yk_'-prefixed types used in the "YASK kernel" yc_solution::set_fold_len() and/or yc_solution::set_cluster_mult(). - Format the equations for additional processing via yc_solution::format(). 
-@section yk YASK Stencil Kernel API +@section sec_yk YASK Stencil Kernel API @subsection yk_oview Kernel Overview This section provides usage information for the YASK stencil kernel @@ -158,6 +148,8 @@ The API is available for C++ and for Python via SWIG. Type names are prefixed with 'yk_' to indicate "YASK kernel"; this distinguishes them from the 'yc_'-prefixed types used in the "YASK compiler" API. +The types, classes, and functions are listed in \ref yk. + @subsection yk_usage Typical Program Flow using the Kernel API - Create a \ref yk_factory. This is the "bootstrap" object that will be used to create others. - Create a \ref yk_env object via yk_factory::new_env(). This initializes MPI if you have enabled it. @@ -177,6 +169,25 @@ this distinguishes them from the 'yc_'-prefixed types used in the "YASK compiler - Apply the stencil(s) to the grids via yk_solution::run_solution(). There are versions for advancing one or more steps. - Retrieve the final results via yk_grid::get_element(). -*/ +@section sec_tests Example Tests + +The following examples illustrate possible combinations of compilers and kernels. +- You can add `stencil=`_stencil-name_ to use a specific stencil for testing. +- You can add `arch=`_arch-name_ to target one of the architectures listed in the `Makefile` if desired. +- Run `make clean` before all of the example commands to ensure consistent builds. 
+ +Stencil Compiler | Stencil Application | Test Command +--------------------|---------------------|------------- +YASK-provided | YASK-provided | `make -j yc-and-yk-test` +YASK-provided | C++ test example | `make -j yc-and-cxx-yk-api-test` +YASK-provided | Python test example | `make -j yc-and-py-yk-api-test` +C++ test example | YASK-provided | `make -j cxx-yc-api-and-yk-test` +C++ test example | C++ test example | `make -j cxx-yc-api-and-cxx-yk-api-test` +C++ test example | Python test example | `make -j cxx-yc-api-and-py-yk-api-test` +Python test example | YASK-provided | `make -j py-yc-api-and-yk-test` +Python test example | C++ test example | `make -j py-yc-api-and-cxx-yk-api-test` +Python test example | Python test example | `make -j py-yc-api-and-py-yk-api-test` + +*/ } diff --git a/include/yask_common_api.hpp b/include/yask_common_api.hpp index cce4c80a..7443c897 100644 --- a/include/yask_common_api.hpp +++ b/include/yask_common_api.hpp @@ -39,6 +39,12 @@ IN THE SOFTWARE. namespace yask { + /** + * \defgroup yask YASK Commmon Utilities + * Types, clases, and functions used in both the \ref sec_yc and \ref sec_yk. + * @{ + */ + /// Version information. /** @returns String describing the current version. @@ -181,6 +187,8 @@ namespace yask { virtual ~yask_null_output() {} }; + /** @}*/ + } // namespace yask. #endif diff --git a/include/yask_compiler_api.hpp b/include/yask_compiler_api.hpp index 41614756..e3ab4951 100644 --- a/include/yask_compiler_api.hpp +++ b/include/yask_compiler_api.hpp @@ -37,6 +37,12 @@ IN THE SOFTWARE. namespace yask { + /** + * \defgroup yc YASK Compiler + * Types, clases, and functions used in the \ref sec_yc. + * @{ + */ + // Forward declarations of classes and their pointers. // See yask_compiler_api.hpp for more. 
@@ -65,12 +71,19 @@ namespace yask { class yc_grid_point_node; /// Shared pointer to \ref yc_grid_point_node typedef std::shared_ptr yc_grid_point_node_ptr; + + /** @}*/ } -#include "yc_nodes.hpp" +#include "yc_node_api.hpp" namespace yask { + /** + * \addtogroup yc + * @{ + */ + /// Bootstrap factory to create objects needed to define a stencil solution. class yc_factory { public: @@ -310,6 +323,27 @@ namespace yask { yask_output_ptr output /**< [out] Pointer to object to receive formatted output. See \ref yask_output_factory. */) =0; + + /// **[Advanced]** Enable or disable automatic dependency checker. + /** + This should be used whenever the built-in dependency checker is + insufficient. Currently, the provided checker does not allow + stencils in which points in one sub-domain depend on points + in another sub-domain within the same value of the step index. + + @warning If dependency checker is disabled, *all* dependencies + must be set via the APIs. + */ + virtual void + set_dependency_checker_enabled(bool enable + /**< [in] `true` to enable or `false` to disable. */) =0; + + /// **[Advanced]** Determine whether automatic dependency checker is enabled. + /** + @returns Current setting. + */ + virtual bool + is_dependency_checker_enabled() const =0; }; /// A compile-time grid. @@ -366,6 +400,8 @@ namespace yask { #endif }; + /** @}*/ + } // namespace yask. #endif diff --git a/include/yask_kernel_api.hpp b/include/yask_kernel_api.hpp index 8d407962..c1579f50 100644 --- a/include/yask_kernel_api.hpp +++ b/include/yask_kernel_api.hpp @@ -38,6 +38,12 @@ IN THE SOFTWARE. namespace yask { + /** + * \defgroup yk YASK Kernel + * Types, clases, and functions used in both the \ref sec_yk. + * @{ + */ + /// Type to use for indexing grids. /** Index types are signed to allow negative indices in padding/halos. */ #ifdef SWIG @@ -64,6 +70,7 @@ namespace yask { /// Shared pointer to \ref yk_stats. typedef std::shared_ptr yk_stats_ptr; + /** @}*/ } // namespace yask. 
#include "yk_solution_api.hpp" @@ -71,6 +78,11 @@ namespace yask { namespace yask { + /** + * \addtogroup yk + * @{ + */ + /// Bootstrap factory to create a stencil solution. class yk_factory { public: @@ -144,6 +156,7 @@ namespace yask { global_barrier() const =0; }; + /** @}*/ } // namespace yask. #endif diff --git a/include/yc_nodes.hpp b/include/yc_node_api.hpp similarity index 81% rename from include/yc_nodes.hpp rename to include/yc_node_api.hpp index b6c3079e..86bf2eb4 100644 --- a/include/yc_nodes.hpp +++ b/include/yc_node_api.hpp @@ -27,13 +27,18 @@ IN THE SOFTWARE. // This file uses Doxygen 1.8 markup for API documentation-generation. // See http://www.stack.nl/~dimitri/doxygen. -/** @file yask_compiler_api.hpp */ +/** @file yc_node_api.hpp */ #ifndef YC_NODES #define YC_NODES namespace yask { + /** + * \addtogroup yc + * @{ + */ + // Forward declarations of expression nodes and their pointers. // See yask_compiler_api.hpp for more. @@ -125,7 +130,7 @@ namespace yask { */ virtual yc_index_node_ptr new_step_index(const std::string& name - /**< [in] Step dimension name. */ ); + /**< [in] Step dimension name. */ ); /// Create a domain-index node. /** @@ -135,10 +140,10 @@ namespace yask { This should *not* include the step dimension, which is specified via new_step_index(). @returns Pointer to new \ref yc_index_node object. - */ + */ virtual yc_index_node_ptr new_domain_index(const std::string& name - /**< [in] Domain index name. */ ); + /**< [in] Domain index name. */ ); /// Create a new miscellaneous index. /** @@ -146,7 +151,7 @@ namespace yask { some dimension that is not the step dimension or a domain dimension. Example: index into an array. @returns Pointer to new \ref yc_index_node object. - */ + */ virtual yc_index_node_ptr new_misc_index(const std::string& name /**< [in] Index name. */ ); @@ -179,7 +184,7 @@ namespace yask { /// Create a constant numerical value node. /** This is unary negation. - Use new_subtraction_node() for binary '-'. 
+ Use new_subtraction_node() for binary `-`. @returns Pointer to new \ref yc_const_number_node object. */ virtual yc_const_number_node_ptr @@ -187,48 +192,56 @@ namespace yask { /// Create a numerical negation operator node. /** - @returns Pointer to new \ref yc_negate_node object. + New negation nodes can also be created via the overloaded unary `-` operator. + @returns Pointer to new \ref yc_negate_node object. */ virtual yc_negate_node_ptr - new_negate_node(yc_number_node_ptr rhs /**< [in] Expression after '-' sign. */ ); + new_negate_node(yc_number_node_ptr rhs /**< [in] Expression after `-` sign. */ ); /// Create an addition node. /** Nodes must be created with at least two operands, and more can be added by calling add_operand() on the returned node. + + New addition nodes can also be created via the overloaded `+` operator. @returns Pointer to new \ref yc_add_node object. */ virtual yc_add_node_ptr - new_add_node(yc_number_node_ptr lhs /**< [in] Expression before '+' sign. */, - yc_number_node_ptr rhs /**< [in] Expression after '+' sign. */ ); + new_add_node(yc_number_node_ptr lhs /**< [in] Expression before `+` sign. */, + yc_number_node_ptr rhs /**< [in] Expression after `+` sign. */ ); /// Create a multiplication node. /** Nodes must be created with at least two operands, and more can be added by calling add_operand() on the returned node. + + New multiplication nodes can also be created via the overloaded `*` operator. @returns Pointer to new \ref yc_multiply_node object. */ virtual yc_multiply_node_ptr - new_multiply_node(yc_number_node_ptr lhs /**< [in] Expression before '*' sign. */, - yc_number_node_ptr rhs /**< [in] Expression after '*' sign. */ ); + new_multiply_node(yc_number_node_ptr lhs /**< [in] Expression before `*` sign. */, + yc_number_node_ptr rhs /**< [in] Expression after `*` sign. */ ); /// Create a subtraction node. /** This is binary subtraction. - Use new_negation_node() for unary '-'. + Use new_negation_node() for unary `-`. 
+ + New subtraction nodes can also be created via the overloaded `-` operator. @returns Pointer to new \ref yc_subtract_node object. */ virtual yc_subtract_node_ptr - new_subtract_node(yc_number_node_ptr lhs /**< [in] Expression before '-' sign. */, - yc_number_node_ptr rhs /**< [in] Expression after '-' sign. */ ); + new_subtract_node(yc_number_node_ptr lhs /**< [in] Expression before `-` sign. */, + yc_number_node_ptr rhs /**< [in] Expression after `-` sign. */ ); /// Create a division node. /** + New division nodes can also be created via the overloaded `/` operator. @returns Pointer to new \ref yc_divide_node object. */ virtual yc_divide_node_ptr - new_divide_node(yc_number_node_ptr lhs /**< [in] Expression before '/' sign. */, - yc_number_node_ptr rhs /**< [in] Expression after '/' sign. */ ); + new_divide_node(yc_number_node_ptr lhs /**< [in] Expression before `/` sign. */, + yc_number_node_ptr rhs /**< [in] Expression after `/` sign. */ ); /// Create a symbol for the first index value in a given dimension. /** @@ -244,14 +257,14 @@ namespace yask { // Create expression for "first_x + 10". auto left10 = node_fac.new_add_node(first_x, - node_fac.new_const_number_node(10)); + node_fac.new_const_number_node(10)); // Create boolean expression for "x > first_x + 10". auto expr = node_fac.new_greater_than_node(x, left10); \endcode @returns Pointer to new \ref yc_index_node object. - */ + */ virtual yc_number_node_ptr new_first_domain_index(yc_index_node_ptr idx /**< [in] Domain index. */ ); @@ -270,14 +283,14 @@ namespace yask { // Create expression for "last_x - 10". auto right10 = node_fac.new_subtract_node(last_x, - node_fac.new_const_number_node(10)); + node_fac.new_const_number_node(10)); // Create boolean expression for "x < first_x - 10". auto expr = node_fac.new_less_than_node(x, right10); \endcode @returns Pointer to new \ref yc_index_node object. - */ + */ virtual yc_number_node_ptr new_last_domain_index(yc_index_node_ptr idx /**< [in] Domain index. 
*/ ); @@ -287,71 +300,71 @@ namespace yask { @returns Pointer to new \ref yc_not_node object. */ virtual yc_not_node_ptr - new_not_node(yc_bool_node_ptr rhs /**< [in] Expression after '!' sign. */ ); + new_not_node(yc_bool_node_ptr rhs /**< [in] Expression after `!` sign. */ ); /// Create a boolean 'and' node. /** @returns Pointer to new \ref yc_and_node object. */ virtual yc_and_node_ptr - new_and_node(yc_bool_node_ptr lhs /**< [in] Expression before '&&' sign. */, - yc_bool_node_ptr rhs /**< [in] Expression after '&&' sign. */ ); + new_and_node(yc_bool_node_ptr lhs /**< [in] Expression before `&&` sign. */, + yc_bool_node_ptr rhs /**< [in] Expression after `&&` sign. */ ); /// Create a boolean 'or' node. /** @returns Pointer to new \ref yc_or_node object. */ virtual yc_or_node_ptr - new_or_node(yc_bool_node_ptr lhs /**< [in] Expression before '||' sign. */, - yc_bool_node_ptr rhs /**< [in] Expression after '||' sign. */ ); + new_or_node(yc_bool_node_ptr lhs /**< [in] Expression before `||` sign. */, + yc_bool_node_ptr rhs /**< [in] Expression after `||` sign. */ ); /// Create a numerical-comparison 'equals' node. /** @returns Pointer to new \ref yc_equals_node object. */ virtual yc_equals_node_ptr - new_equals_node(yc_number_node_ptr lhs /**< [in] Expression before '==' sign. */, - yc_number_node_ptr rhs /**< [in] Expression after '==' sign. */ ); + new_equals_node(yc_number_node_ptr lhs /**< [in] Expression before `==` sign. */, + yc_number_node_ptr rhs /**< [in] Expression after `==` sign. */ ); /// Create a numerical-comparison 'not-equals' node. /** @returns Pointer to new \ref yc_not_equals_node object. */ virtual yc_not_equals_node_ptr - new_not_equals_node(yc_number_node_ptr lhs /**< [in] Expression before '!=' sign. */, - yc_number_node_ptr rhs /**< [in] Expression after '!=' sign. */ ); + new_not_equals_node(yc_number_node_ptr lhs /**< [in] Expression before `!=` sign. */, + yc_number_node_ptr rhs /**< [in] Expression after `!=` sign. 
*/ ); /// Create a numerical-comparison 'less-than' node. /** @returns Pointer to new \ref yc_less_than_node object. */ virtual yc_less_than_node_ptr - new_less_than_node(yc_number_node_ptr lhs /**< [in] Expression before '<' sign. */, - yc_number_node_ptr rhs /**< [in] Expression after '<' sign. */ ); + new_less_than_node(yc_number_node_ptr lhs /**< [in] Expression before `<` sign. */, + yc_number_node_ptr rhs /**< [in] Expression after `<` sign. */ ); /// Create a numerical-comparison 'greater-than' node. /** @returns Pointer to new \ref yc_greater_than_node object. */ virtual yc_greater_than_node_ptr - new_greater_than_node(yc_number_node_ptr lhs /**< [in] Expression before '>' sign. */, - yc_number_node_ptr rhs /**< [in] Expression after '>' sign. */ ); + new_greater_than_node(yc_number_node_ptr lhs /**< [in] Expression before `>` sign. */, + yc_number_node_ptr rhs /**< [in] Expression after `>` sign. */ ); /// Create a numerical-comparison 'greater-than or equals' node. /** @returns Pointer to new \ref yc_not_less_than_node object. */ virtual yc_not_less_than_node_ptr - new_not_less_than_node(yc_number_node_ptr lhs /**< [in] Expression before '>=' sign. */, - yc_number_node_ptr rhs /**< [in] Expression after '>=' sign. */ ); + new_not_less_than_node(yc_number_node_ptr lhs /**< [in] Expression before `>=` sign. */, + yc_number_node_ptr rhs /**< [in] Expression after `>=` sign. */ ); /// Create a numerical-comparison 'less-than or equals' node. /** @returns Pointer to new \ref yc_not_greater_than_node object. */ virtual yc_not_greater_than_node_ptr - new_not_greater_than_node(yc_number_node_ptr lhs /**< [in] Expression before '<=' sign. */, - yc_number_node_ptr rhs /**< [in] Expression after '<=' sign. */ ); + new_not_greater_than_node(yc_number_node_ptr lhs /**< [in] Expression before `<=` sign. */, + yc_number_node_ptr rhs /**< [in] Expression after `<=` sign. */ ); }; @@ -365,7 +378,7 @@ namespace yask { /** Formats the expression starting at this node. 
@returns String containing a single-line human-readable version of the expression. - */ + */ virtual std::string format_simple() const =0; /// Count the size of the AST. @@ -406,7 +419,12 @@ namespace yask { /// Base class for all numerical AST nodes. /** An object of this abstract type cannot be created. */ - class yc_number_node : public virtual yc_expr_node { }; + class yc_number_node : public virtual yc_expr_node { + public: + + /// Create a deep copy of AST starting with this node. + virtual yc_number_node_ptr clone_ast() const =0; + }; /// Base class for all boolean AST nodes. /** An object of this abstract type cannot be created. */ @@ -432,7 +450,7 @@ namespace yask { /** Created via yc_grid::new_relative_grid_point(). */ - class yc_grid_point_node : public virtual yc_number_node { + class yc_grid_point_node : public virtual yc_number_node { public: /// Get the grid this point is in. @@ -464,14 +482,14 @@ namespace yask { /// A numerical negation operator. /** Example: used to implement -(a*b). Created via yc_node_factory::new_negate_node(). - */ + */ class yc_negate_node : public virtual yc_number_node { public: /// Get the [only] operand. /** This node implements unary negation only, not subtraction, so there is - never a left-hand-side. - @returns Expression node on right-hand-side of '-' sign. */ + never a left-hand-side. + @returns Expression node on right-hand-side of `-` sign. */ virtual yc_number_node_ptr get_rhs() =0; }; @@ -485,8 +503,8 @@ namespace yask { /// Get the number of operands. /** If there is just one operand, the operation itself is moot. If there are more than one operand, the operation applies between - them. Example: for an add operator, if the operands are 'a', - 'b', and 'c', then the expression is 'a + b + c'. + them. Example: for an add operator, if the operands are `a`, + `b`, and `c`, then the expression is `a + b + c`. @returns Number of operands. 
*/ virtual int get_num_operands() =0; @@ -515,12 +533,12 @@ namespace yask { public: /// Get the left-hand-side operand. - /** @returns Pointer to expression node appearing before the '-' sign. */ + /** @returns Pointer to expression node appearing before the `-` sign. */ virtual yc_number_node_ptr get_lhs() =0; /// Get the right-hand-side operand. - /** @returns Pointer to expression node appearing after the '-' sign. */ + /** @returns Pointer to expression node appearing after the `-` sign. */ virtual yc_number_node_ptr get_rhs() =0; }; @@ -531,12 +549,12 @@ namespace yask { public: /// Get the left-hand-side operand. - /** @returns Pointer to expression node appearing before the '/' sign. */ + /** @returns Pointer to expression node appearing before the `/` sign. */ virtual yc_number_node_ptr get_lhs() =0; /// Get the right-hand-side operand. - /** @returns Pointer to expression node appearing after the '/' sign. */ + /** @returns Pointer to expression node appearing after the `/` sign. */ virtual yc_number_node_ptr get_rhs() =0; }; @@ -544,12 +562,12 @@ namespace yask { /// A boolean inversion operator. /** Example: used to implement `!(a || b)`. Created via yc_node_factory::new_not_node(). - */ + */ class yc_not_node : public virtual yc_bool_node { public: /// Get the [only] operand. - /** @returns Expression node on right-hand-side of '!' sign. */ + /** @returns Expression node on right-hand-side of `!` sign. */ virtual yc_bool_node_ptr get_rhs() =0; }; @@ -557,17 +575,17 @@ namespace yask { /// A boolean 'and' operator. /** Example: used to implement `a && b`. Created via yc_node_factory::new_and_node(). - */ + */ class yc_and_node : public virtual yc_bool_node { public: /// Get the left-hand-side operand. - /** @returns Expression node on left-hand-side of '&&' sign. */ + /** @returns Expression node on left-hand-side of `&&` sign. */ virtual yc_bool_node_ptr get_lhs() =0; /// Get the right-hand-size operand. 
- /** @returns Expression node on right-hand-side of '&&' sign. */ + /** @returns Expression node on right-hand-side of `&&` sign. */ virtual yc_bool_node_ptr get_rhs() =0; }; @@ -575,17 +593,17 @@ namespace yask { /// A boolean 'or' operator. /** Example: used to implement `a || b`. Created via yc_node_factory::new_or_node(). - */ + */ class yc_or_node : public virtual yc_bool_node { public: /// Get the left-hand-side operand. - /** @returns Expression node on left-hand-side of '||' sign. */ + /** @returns Expression node on left-hand-side of `||` sign. */ virtual yc_bool_node_ptr get_lhs() =0; /// Get the right-hand-size operand. - /** @returns Expression node on right-hand-side of '||' sign. */ + /** @returns Expression node on right-hand-side of `||` sign. */ virtual yc_bool_node_ptr get_rhs() =0; }; @@ -593,17 +611,17 @@ namespace yask { /// A numerical-comparison 'equals' operator. /** Example: used to implement `a == b`. Created via yc_node_factory::new_equals_node(). - */ + */ class yc_equals_node : public virtual yc_bool_node { public: /// Get the left-hand-side operand. - /** @returns Expression node on left-hand-side of '==' sign. */ + /** @returns Expression node on left-hand-side of `==` sign. */ virtual yc_number_node_ptr get_lhs() =0; /// Get the right-hand-size operand. - /** @returns Expression node on right-hand-side of '==' sign. */ + /** @returns Expression node on right-hand-side of `==` sign. */ virtual yc_number_node_ptr get_rhs() =0; }; @@ -611,17 +629,17 @@ namespace yask { /// A numerical-comparison 'not_equals' operator. /** Example: used to implement `a != b`. Created via yc_node_factory::new_not_equals_node(). - */ + */ class yc_not_equals_node : public virtual yc_bool_node { public: /// Get the left-hand-side operand. - /** @returns Expression node on left-hand-side of '!=' sign. */ + /** @returns Expression node on left-hand-side of `!=` sign. */ virtual yc_number_node_ptr get_lhs() =0; /// Get the right-hand-size operand. 
- /** @returns Expression node on right-hand-side of '!=' sign. */ + /** @returns Expression node on right-hand-side of `!=` sign. */ virtual yc_number_node_ptr get_rhs() =0; }; @@ -629,17 +647,17 @@ namespace yask { /// A numerical-comparison 'less_than' operator. /** Example: used to implement `a < b`. Created via yc_node_factory::new_less_than_node(). - */ + */ class yc_less_than_node : public virtual yc_bool_node { public: /// Get the left-hand-side operand. - /** @returns Expression node on left-hand-side of '<' sign. */ + /** @returns Expression node on left-hand-side of `<` sign. */ virtual yc_number_node_ptr get_lhs() =0; /// Get the right-hand-size operand. - /** @returns Expression node on right-hand-side of '<' sign. */ + /** @returns Expression node on right-hand-side of `<` sign. */ virtual yc_number_node_ptr get_rhs() =0; }; @@ -647,17 +665,17 @@ namespace yask { /// A numerical-comparison 'greater_than' operator. /** Example: used to implement `a > b`. Created via yc_node_factory::new_greater_than_node(). - */ + */ class yc_greater_than_node : public virtual yc_bool_node { public: /// Get the left-hand-side operand. - /** @returns Expression node on left-hand-side of '>' sign. */ + /** @returns Expression node on left-hand-side of `>` sign. */ virtual yc_number_node_ptr get_lhs() =0; /// Get the right-hand-size operand. - /** @returns Expression node on right-hand-side of '>' sign. */ + /** @returns Expression node on right-hand-side of `>` sign. */ virtual yc_number_node_ptr get_rhs() =0; }; @@ -666,17 +684,17 @@ namespace yask { /// A numerical-comparison 'not_less_than' operator. /** Example: used to implement `a >= b`. Created via yc_node_factory::new_not_less_than_node(). - */ + */ class yc_not_less_than_node : public virtual yc_bool_node { public: /// Get the left-hand-side operand. - /** @returns Expression node on left-hand-side of '>=' sign. */ + /** @returns Expression node on left-hand-side of `>=` sign. 
*/ virtual yc_number_node_ptr get_lhs() =0; /// Get the right-hand-size operand. - /** @returns Expression node on right-hand-side of '>=' sign. */ + /** @returns Expression node on right-hand-side of `>=` sign. */ virtual yc_number_node_ptr get_rhs() =0; }; @@ -684,23 +702,62 @@ namespace yask { /// A numerical-comparison 'not_greater_than' operator. /** Example: used to implement `a <= b`. Created via yc_node_factory::new_not_greater_than_node(). - */ + */ class yc_not_greater_than_node : public virtual yc_bool_node { public: /// Get the left-hand-side operand. - /** @returns Expression node on left-hand-side of '<=' sign. */ + /** @returns Expression node on left-hand-side of `<=` sign. */ virtual yc_number_node_ptr get_lhs() =0; /// Get the right-hand-size operand. - /** @returns Expression node on right-hand-side of '<=' sign. */ + /** @returns Expression node on right-hand-side of `<=` sign. */ virtual yc_number_node_ptr get_rhs() =0; }; + // Non-class operators. + // These are only defined if the older "internal DSL" is not used. + // The internal version will eventually be deprecated and + // perhaps removed in favor of this API. + +#ifndef USE_INTERNAL_DSL + /// Operator version of yc_node_factory::new_negation_node(). + yc_negate_node_ptr operator-(yc_number_node_ptr rhs); + + /// Operator version of yc_node_factory::new_addition_node(). + yc_add_node_ptr operator+(yc_number_node_ptr lhs, yc_number_node_ptr rhs); + /// Operator version of yc_node_factory::new_addition_node(). + yc_add_node_ptr operator+(double lhs, yc_number_node_ptr rhs); + /// Operator version of yc_node_factory::new_addition_node(). + yc_add_node_ptr operator+(yc_number_node_ptr lhs, double rhs); + + /// Operator version of yc_node_factory::new_division_node(). + yc_divide_node_ptr operator/(yc_number_node_ptr lhs, yc_number_node_ptr rhs); + /// Operator version of yc_node_factory::new_division_node(). 
+ yc_divide_node_ptr operator/(double lhs, yc_number_node_ptr rhs); + /// Operator version of yc_node_factory::new_division_node(). + yc_divide_node_ptr operator/(yc_number_node_ptr lhs, double rhs); + + /// Operator version of yc_node_factory::new_multiplication_node(). + yc_multiply_node_ptr operator*(yc_number_node_ptr lhs, yc_number_node_ptr rhs); + /// Operator version of yc_node_factory::new_multiplication_node(). + yc_multiply_node_ptr operator*(double lhs, yc_number_node_ptr rhs); + /// Operator version of yc_node_factory::new_multiplication_node(). + yc_multiply_node_ptr operator*(yc_number_node_ptr lhs, double rhs); + + /// Operator version of yc_node_factory::new_subtraction_node(). + yc_subtract_node_ptr operator-(yc_number_node_ptr lhs, yc_number_node_ptr rhs); + /// Operator version of yc_node_factory::new_subtraction_node(). + yc_subtract_node_ptr operator-(double lhs, yc_number_node_ptr rhs); + /// Operator version of yc_node_factory::new_subtraction_node(). + yc_subtract_node_ptr operator-(yc_number_node_ptr lhs, double rhs); +#endif + /** @}*/ + } // namespace yask. #endif diff --git a/include/yk_grid_api.hpp b/include/yk_grid_api.hpp index c0219109..d06dd6b4 100644 --- a/include/yk_grid_api.hpp +++ b/include/yk_grid_api.hpp @@ -36,6 +36,11 @@ IN THE SOFTWARE. namespace yask { + /** + * \addtogroup yk + * @{ + */ + /// A run-time grid. /** "Grid" is a generic term for any n-dimensional array. A 0-dim grid @@ -935,10 +940,9 @@ namespace yask { /**< [in] Name of dimension to get. Must be one of the names from yk_solution::get_domain_dim_names(). */ ) const =0; - }; - + /** @}*/ } // namespace yask. #endif diff --git a/include/yk_solution_api.hpp b/include/yk_solution_api.hpp index d7c554a8..b66feb02 100644 --- a/include/yk_solution_api.hpp +++ b/include/yk_solution_api.hpp @@ -36,6 +36,11 @@ IN THE SOFTWARE. namespace yask { + /** + * \addtogroup yk + * @{ + */ + /// Allocate grids on local NUMA node. 
/** This is used in yk_solution::set_default_numa_preferred @@ -730,6 +735,7 @@ namespace yask { get_elapsed_run_secs() =0; }; + /** @}*/ } // namespace yask. #endif diff --git a/src/compiler/lib/Eqs.cpp b/src/compiler/lib/Eqs.cpp index 27af5e44..94955543 100644 --- a/src/compiler/lib/Eqs.cpp +++ b/src/compiler/lib/Eqs.cpp @@ -180,22 +180,24 @@ namespace yask { }); _done = true; } - - // Find dependencies based on all eqs. - // Side effect: sets _stepDir in dims. + + // Analyze group of equations. + // Sets _stepDir in dims. + // Finds dependencies based on all eqs if 'settings._findDeps'. // Throws exceptions on illegal dependencies. // TODO: split this into smaller functions. // BIG-TODO: replace dependency algorithms with integration of a polyhedral // library. - void Eqs::findDeps(Dimensions& dims, - ostream& os) { + void Eqs::analyzeEqs(CompilerSettings& settings, + Dimensions& dims, + ostream& os) { auto& stepDim = dims._stepDim; // Gather points from all eqs in all grids. PointVisitor pt_vis; // Gather initial stats from all eqs. - os << "Scanning " << getEqs().size() << " equation(s) for dependencies...\n"; + os << "Scanning " << getEqs().size() << " stencil equation(s) for dependencies...\n"; for (auto eq1 : getEqs()) eq1->accept(&pt_vis); auto& outGrids = pt_vis.getOutputGrids(); @@ -315,6 +317,9 @@ namespace yask { "' on LHS"); } } + + // TODO: check that domain indices are simple offsets and + // misc indices are consts. } // TODO: check to make sure cond1 depends only on indices. @@ -363,6 +368,7 @@ namespace yask { // dependencies by looking for exact matches. // We do this check first because it's quicker than the // detailed scan done later if this one doesn't find a dep. + // Also, this is always illegal, even if not finding deps. // // Example: // eq1: a(t+1, x, ...) EQUALS ... @@ -380,13 +386,18 @@ namespace yask { // Save dependency. 
#ifdef DEBUG_DEP cout << " Exact match found to " << op1->makeQuotedStr() << ".\n"; -#endif - _eq_deps[cur_step_dep].set_imm_dep_on(eq2, eq1); +#endif + if (settings._findDeps) + _eq_deps[cur_step_dep].set_imm_dep_on(eq2, eq1); // Move along to next eq2. continue; } + // Don't do more conservative checks if not looking for deps. + if (!settings._findDeps) + continue; + // Next dep check: inexact matches on LHS of eq1 to RHS of eq2. // Does eq1 define *any* point in a grid that eq2 inputs // at the same step index? If so, they *might* have a @@ -453,6 +464,8 @@ namespace yask { } // for all eqs (eq1). // Resolve indirect dependencies. + // Do this even if not finding deps because we want to + // resolve deps provided by the user. os << " Resolving indirect dependencies...\n"; for (DepType dt = DepType(0); dt < num_deps; dt = DepType(dt+1)) _eq_deps[dt].find_all_deps(); diff --git a/src/compiler/lib/Eqs.hpp b/src/compiler/lib/Eqs.hpp index ae942798..d403ea0a 100644 --- a/src/compiler/lib/Eqs.hpp +++ b/src/compiler/lib/Eqs.hpp @@ -162,8 +162,9 @@ namespace yask { // Find dependencies based on all eqs. If 'eq_deps' is // set, save dependencies between eqs in referent. - virtual void findDeps(Dimensions& dims, - std::ostream& os); + virtual void analyzeEqs(CompilerSettings& settings, + Dimensions& dims, + std::ostream& os); // Determine which grid points can be vectorized. 
virtual void analyzeVec(const Dimensions& dims); diff --git a/src/compiler/lib/Expr.cpp b/src/compiler/lib/Expr.cpp index 198113b0..361ccf43 100644 --- a/src/compiler/lib/Expr.cpp +++ b/src/compiler/lib/Expr.cpp @@ -203,7 +203,64 @@ namespace yask { if (!p) THROW_YASK_EXCEPTION("Error: new_last_domain_index() called without index-node argument"); return last_index(p); - } + } + yc_negate_node_ptr operator-(yc_number_node_ptr rhs) { + auto p = dynamic_pointer_cast(rhs); + assert(p); + return make_shared(p); + } + yc_add_node_ptr operator+(yc_number_node_ptr lhs, yc_number_node_ptr rhs) { + auto lp = dynamic_pointer_cast(lhs); + assert(lp); + auto rp = dynamic_pointer_cast(rhs); + assert(rp); + return make_shared(lp, rp); + } + yc_add_node_ptr operator+(double lhs, yc_number_node_ptr rhs) { + return operator+(constNum(lhs), rhs); + } + yc_add_node_ptr operator+(yc_number_node_ptr lhs, double rhs) { + return operator+(lhs, constNum(rhs)); + } + yc_divide_node_ptr operator/(yc_number_node_ptr lhs, yc_number_node_ptr rhs) { + auto lp = dynamic_pointer_cast(lhs); + assert(lp); + auto rp = dynamic_pointer_cast(rhs); + assert(rp); + return make_shared(lp, rp); + } + yc_divide_node_ptr operator/(double lhs, yc_number_node_ptr rhs) { + return operator/(constNum(lhs), rhs); + } + yc_divide_node_ptr operator/(yc_number_node_ptr lhs, double rhs) { + return operator/(lhs, constNum(rhs)); + } + yc_multiply_node_ptr operator*(yc_number_node_ptr lhs, yc_number_node_ptr rhs) { + auto lp = dynamic_pointer_cast(lhs); + assert(lp); + auto rp = dynamic_pointer_cast(rhs); + assert(rp); + return make_shared(lp, rp); + } + yc_multiply_node_ptr operator*(double lhs, yc_number_node_ptr rhs) { + return operator*(constNum(lhs), rhs); + } + yc_multiply_node_ptr operator*(yc_number_node_ptr lhs, double rhs) { + return operator*(lhs, constNum(rhs)); + } + yc_subtract_node_ptr operator-(yc_number_node_ptr lhs, yc_number_node_ptr rhs) { + auto lp = dynamic_pointer_cast(lhs); + assert(lp); + auto rp 
= dynamic_pointer_cast(rhs); + assert(rp); + return make_shared(lp, rp); + } + yc_subtract_node_ptr operator-(double lhs, yc_number_node_ptr rhs) { + return operator-(constNum(lhs), rhs); + } + yc_subtract_node_ptr operator-(yc_number_node_ptr lhs, double rhs) { + return operator-(lhs, constNum(rhs)); + } // Compare 2 expr pointers and return whether the expressions are // equivalent. diff --git a/src/compiler/lib/Expr.hpp b/src/compiler/lib/Expr.hpp index 3b45743c..f10c1ebd 100644 --- a/src/compiler/lib/Expr.hpp +++ b/src/compiler/lib/Expr.hpp @@ -207,7 +207,8 @@ namespace yask { } // Real or int value. - class NumExpr : public Expr, public virtual yc_number_node { + class NumExpr : public Expr, + public virtual yc_number_node { public: // Return 'true' if this is a compile-time constant. @@ -245,6 +246,9 @@ namespace yask { // For this to work properly, each derived type // should also implement a deep-copy copy ctor. virtual NumExprPtr clone() const =0; + virtual yc_number_node_ptr clone_ast() const { + return clone(); + } }; // Grid index types. @@ -306,7 +310,7 @@ namespace yask { // A free function to create a constant expression. // Usually not needed due to operator overloading. - NumExprPtr constNum(double rhs); + NumExprPtr constNum(double val); // Free functions to create boundary indices, e.g., 'first_index(x)'. NumExprPtr first_index(IndexExprPtr dim); diff --git a/src/compiler/lib/Grid.hpp b/src/compiler/lib/Grid.hpp index 332c2f05..399f856d 100644 --- a/src/compiler/lib/Grid.hpp +++ b/src/compiler/lib/Grid.hpp @@ -304,6 +304,7 @@ namespace yask { bool _doOptCluster = true; // apply optimizations also to cluster. string _eqBundleTargets; // how to bundle equations. string _gridRegex; // grids to update. + bool _findDeps = true; }; // Stencil dimensions. 
diff --git a/src/compiler/lib/Soln.cpp b/src/compiler/lib/Soln.cpp index a296dcb4..53f95bfe 100644 --- a/src/compiler/lib/Soln.cpp +++ b/src/compiler/lib/Soln.cpp @@ -85,7 +85,7 @@ namespace yask { _eqs.analyzeLoop(_dims); // Find dependencies between equations. - _eqs.findDeps(_dims, *_dos); + _eqs.analyzeEqs(_settings, _dims, *_dos); // Update access stats for the grids. _eqs.updateGridStats(); diff --git a/src/compiler/lib/Soln.hpp b/src/compiler/lib/Soln.hpp index b91a50da..4cefe9f5 100644 --- a/src/compiler/lib/Soln.hpp +++ b/src/compiler/lib/Soln.hpp @@ -184,6 +184,8 @@ namespace yask { virtual void clear_clustering() { _settings._clusterOptions.clear(); } virtual void set_element_bytes(int nbytes) { _settings._elem_bytes = nbytes; } virtual int get_element_bytes() const { return _settings._elem_bytes; } + virtual bool is_dependency_checker_enabled() const { return _settings._findDeps; } + virtual void set_dependency_checker_enabled(bool enable) { _settings._findDeps = enable; } virtual void format(const std::string& format_type, yask_output_ptr output); }; diff --git a/src/compiler/main.cpp b/src/compiler/main.cpp index a1c43ea4..1fd99f35 100644 --- a/src/compiler/main.cpp +++ b/src/compiler/main.cpp @@ -25,6 +25,11 @@ IN THE SOFTWARE. /////////////// Main vector-folding code-generation code. ///////////// +// This macro blocks the operator overloads in the API. +// This is temporary until the "internal DSL" gets completely +// replaced by the APIs. +#define USE_INTERNAL_DSL + // Generation code. 
#include "ExprUtils.hpp" #include "Grid.hpp" @@ -121,11 +126,11 @@ void usage(const string& cmd) { " [-no]-opt-cluster\n" " Do [not] apply optimizations across the cluster (default=" << settings._doOptCluster << ").\n" " -max-es \n" - " Set heuristic for max single expression-size (default=" << - settings._maxExprSize << ").\n" + " Set heuristic for max single expression-size (default=" << settings._maxExprSize << ").\n" " -min-es \n" - " Set heuristic for min expression-size for reuse (default=" << - settings._minExprSize << ").\n" + " Set heuristic for min expression-size for reuse (default=" << settings._minExprSize << ").\n" + " [-no]-find-deps\n" + " Find dependencies between stencil equations (default=" << settings._findDeps << ").\n" "\n" " -p \n" " Format output per and write to .\n" @@ -184,6 +189,10 @@ void parseOpts(int argc, const char* argv[]) settings._doOptCluster = true; else if (opt == "-no-opt-cluster") settings._doOptCluster = false; + else if (opt == "-find-deps") + settings._findDeps = true; + else if (opt == "-no-find-deps") + settings._findDeps = false; // add any more options w/o values above. diff --git a/src/compiler/swig/yask_compiler_api.i b/src/compiler/swig/yask_compiler_api.i index a9b154d8..715632cc 100644 --- a/src/compiler/swig/yask_compiler_api.i +++ b/src/compiler/swig/yask_compiler_api.i @@ -76,6 +76,7 @@ IN THE SOFTWARE. %template(vector_eq) std::vector>; %template(vector_grid) std::vector; +// Tell SWIG how to catch a YASK exception and rethrow it in Python. %exception { try { $action @@ -85,6 +86,66 @@ IN THE SOFTWARE. } } +// Tell SWIG how to handle non-class overloaded operators in Python. 
+%extend yask::yc_number_node { + yask::yc_number_node_ptr __neg__() { + auto p = $self->clone_ast(); + return yask::operator-(p); + } + }; +%extend yask::yc_number_node { + yask::yc_number_node_ptr __add__(yask::yc_number_node* rhs) { + auto lp = $self->clone_ast(); + auto rp = rhs->clone_ast(); + return yask::operator+(lp, rp); + } + }; +%extend yask::yc_number_node { + yask::yc_number_node_ptr __add__(double rhs) { + auto lp = $self->clone_ast(); + return yask::operator+(lp, rhs); + } + }; +%extend yask::yc_number_node { + yask::yc_number_node_ptr __truediv__(yask::yc_number_node* rhs) { + auto lp = $self->clone_ast(); + auto rp = rhs->clone_ast(); + return yask::operator/(lp, rp); + } + }; +%extend yask::yc_number_node { + yask::yc_number_node_ptr __truediv__(double rhs) { + auto lp = $self->clone_ast(); + return yask::operator/(lp, rhs); + } + }; +%extend yask::yc_number_node { + yask::yc_number_node_ptr __mul__(yask::yc_number_node* rhs) { + auto lp = $self->clone_ast(); + auto rp = rhs->clone_ast(); + return yask::operator*(lp, rp); + } + }; +%extend yask::yc_number_node { + yask::yc_number_node_ptr __mul__(double rhs) { + auto lp = $self->clone_ast(); + return yask::operator*(lp, rhs); + } + }; +%extend yask::yc_number_node { + yask::yc_number_node_ptr __sub__(yask::yc_number_node* rhs) { + auto lp = $self->clone_ast(); + auto rp = rhs->clone_ast(); + return yask::operator-(lp, rp); + } + }; +%extend yask::yc_number_node { + yask::yc_number_node_ptr __sub__(double rhs) { + auto lp = $self->clone_ast(); + return yask::operator-(lp, rhs); + } + }; + %include "yask_common_api.hpp" %include "yask_compiler_api.hpp" -%include "yc_nodes.hpp" +%include "yc_node_api.hpp" diff --git a/src/compiler/tests/yask_compiler_api_test.cpp b/src/compiler/tests/yask_compiler_api_test.cpp index 7bf77a66..910e105b 100644 --- a/src/compiler/tests/yask_compiler_api_test.cpp +++ b/src/compiler/tests/yask_compiler_api_test.cpp @@ -61,20 +61,19 @@ int main() { auto n1 = 
fac.new_const_number_node(3.14); cout << n1->format_simple() << endl; - auto n2 = fac.new_negate_node(n1); + auto n2 = g1->new_relative_grid_point({0, +1, 0, -2}); cout << n2->format_simple() << endl; - auto n3 = g1->new_relative_grid_point({0, +1, 0, -2}); + auto n3 = n1 + n2; cout << n3->format_simple() << endl; - auto n4a = fac.new_add_node(n2, n3); - auto n4b = fac.new_add_node(n4a, n1); - cout << n4b->format_simple() << endl; + auto n4 = n2 * -n3 * 0.9; + cout << n4->format_simple() << endl; auto n5 = g1->new_relative_grid_point({0, +1, -1, 0}); cout << n5->format_simple() << endl; - auto n6 = fac.new_multiply_node(n4b, n5); + auto n6 = n4 / n5; cout << n6->format_simple() << endl; // Define scratch grid value. @@ -87,7 +86,7 @@ int main() { // Use scratch grid value. auto n7a = sg1->new_relative_grid_point({-1, 0, +2}); auto n7b = sg1->new_relative_grid_point({+1, -1, -2}); - auto n8 = fac.new_add_node(n7a, n7b); + auto n8 = n7a + n7b; cout << n8->format_simple() << endl; // Define main grid value at t+1. From d9b5e4032eb87475860e84ea113c997b742efd72 Mon Sep 17 00:00:00 2001 From: Chuck Yount Date: Tue, 24 Apr 2018 09:30:39 -0700 Subject: [PATCH 06/21] Add explicit API for integer consts. --- bin/yask_compiler_api_test.py | 2 +- include/yask_common_api.hpp | 8 ++++++++ include/yask_kernel_api.hpp | 8 -------- include/yc_node_api.hpp | 8 ++++++++ src/compiler/lib/Expr.cpp | 4 ++++ src/compiler/lib/Expr.hpp | 5 +++++ 6 files changed, 26 insertions(+), 9 deletions(-) diff --git a/bin/yask_compiler_api_test.py b/bin/yask_compiler_api_test.py index 1932ce21..adcde90e 100755 --- a/bin/yask_compiler_api_test.py +++ b/bin/yask_compiler_api_test.py @@ -74,7 +74,7 @@ sn2 = (sg1.new_relative_grid_point([1, 0, 0]) + sg1.new_relative_grid_point([0, 1, 0]) + sg1.new_relative_grid_point([0, 0, 1])) - sn5 = -sn2 * 2.5 - 9.1 + sn5 = -sn2 * 2.5 - 9 # Create an equation to define the value at the next timestep. 
n3 = g1.new_relative_grid_point([1, 0, 0, 0]) # center-point at next timestep. diff --git a/include/yask_common_api.hpp b/include/yask_common_api.hpp index 7443c897..6d8050b4 100644 --- a/include/yask_common_api.hpp +++ b/include/yask_common_api.hpp @@ -51,6 +51,14 @@ namespace yask { */ std::string yask_get_version_string(); + /// Type to use for indexing grids. + /** Index types are signed to allow negative indices in padding/halos. */ +#ifdef SWIG + typedef long int idx_t; // SWIG doesn't seem to understand int64_t. +#else + typedef std::int64_t idx_t; +#endif + // Forward declarations of class-pointers. class yask_output; diff --git a/include/yask_kernel_api.hpp b/include/yask_kernel_api.hpp index c1579f50..93b734c8 100644 --- a/include/yask_kernel_api.hpp +++ b/include/yask_kernel_api.hpp @@ -44,14 +44,6 @@ namespace yask { * @{ */ - /// Type to use for indexing grids. - /** Index types are signed to allow negative indices in padding/halos. */ -#ifdef SWIG - typedef long int idx_t; // SWIG doesn't seem to understand int64_t. -#else - typedef std::int64_t idx_t; -#endif - // Forward declarations of classes and pointers. class yk_env; diff --git a/include/yc_node_api.hpp b/include/yc_node_api.hpp index 86bf2eb4..eeb02311 100644 --- a/include/yc_node_api.hpp +++ b/include/yc_node_api.hpp @@ -190,6 +190,14 @@ namespace yask { virtual yc_const_number_node_ptr new_const_number_node(double val /**< [in] Value to store in node. */ ); + /// + /** + Integer version of new_const_number_node(double). + @returns Pointer to new \ref yc_const_number_node object. + */ + virtual yc_const_number_node_ptr + new_const_number_node(idx_t val /**< [in] Value to store in node. */ ); + /// Create a numerical negation operator node. /** New negation nodes can also be created via the overloaded unary `-` operator. 
diff --git a/src/compiler/lib/Expr.cpp b/src/compiler/lib/Expr.cpp index 361ccf43..e6191f62 100644 --- a/src/compiler/lib/Expr.cpp +++ b/src/compiler/lib/Expr.cpp @@ -70,6 +70,10 @@ namespace yask { yc_node_factory::new_const_number_node(double val) { return make_shared(val); } + yc_const_number_node_ptr + yc_node_factory::new_const_number_node(idx_t val) { + return make_shared(val); + } yc_negate_node_ptr yc_node_factory::new_negate_node(yc_number_node_ptr rhs) { auto p = dynamic_pointer_cast(rhs); diff --git a/src/compiler/lib/Expr.hpp b/src/compiler/lib/Expr.hpp index f10c1ebd..89023f95 100644 --- a/src/compiler/lib/Expr.hpp +++ b/src/compiler/lib/Expr.hpp @@ -358,6 +358,11 @@ namespace yask { public: ConstExpr(double f) : _f(f) { } + ConstExpr(idx_t i) : _f(i) { + if (idx_t(_f) != i) + THROW_YASK_EXCEPTION("Error: integer value " << i << + " cannot be stored accurately as a double"); + } ConstExpr(const ConstExpr& src) : _f(src._f) { } virtual ~ConstExpr() { } From c24a2987df428026820da9fa439e8056f197d99b Mon Sep 17 00:00:00 2001 From: Chuck Yount Date: Tue, 24 Apr 2018 19:13:18 -0700 Subject: [PATCH 07/21] Add API to specify flow dependencies. Closes #96. Closes #85. Improve some other API docs. --- include/yask_compiler_api.hpp | 75 +++++++++++++++++++++++++++++--- include/yc_node_api.hpp | 80 +++++++++++++++++++---------------- src/compiler/lib/Eqs.cpp | 7 +-- src/compiler/lib/Eqs.hpp | 34 +++++++++++---- src/compiler/lib/Soln.cpp | 2 +- src/compiler/lib/Soln.hpp | 26 ++++++++---- 6 files changed, 162 insertions(+), 62 deletions(-) diff --git a/include/yask_compiler_api.hpp b/include/yask_compiler_api.hpp index e3ab4951..251049a8 100644 --- a/include/yask_compiler_api.hpp +++ b/include/yask_compiler_api.hpp @@ -268,9 +268,10 @@ namespace yask { clear_folding() =0; /// Set the cluster multiplier (unroll factor) in given dimension. 
- /** For YASK-code generation, this will have the effect of creating + /** For YASK kernel-code generation, this will have the effect of creating N vectors of output for each equation, where N is the product of the cluster multipliers. + @note A multiplier >1 cannot be applied to the step dimension. @note Default is one (1) in each dimension. */ @@ -326,13 +327,13 @@ namespace yask { /// **[Advanced]** Enable or disable automatic dependency checker. /** - This should be used whenever the built-in dependency checker is - insufficient. Currently, the provided checker does not allow - stencils in which points in one sub-domain depend on points + Disabling the built-in dependency checker may be done when it is + overly conservative. Currently, the provided checker does not + allow stencils in which points in one sub-domain depend on points in another sub-domain within the same value of the step index. @warning If dependency checker is disabled, *all* dependencies - must be set via the APIs. + must be set via add_flow_dependency(). */ virtual void set_dependency_checker_enabled(bool enable @@ -344,6 +345,70 @@ namespace yask { */ virtual bool is_dependency_checker_enabled() const =0; + + /// **[Advanced]** Add a dependency between two equations. + /** + This function adds an arc in the data dependency graph `from` one + equation `to` another one, + indicating that the `from` equation depends on the `to` equation. + In other words, the `to` expression must be evaluated _before_ + the `from` equation. + In compiler-theory terms, this is a _flow_ dependency, also + known as a _true_ or _read-after-write_ (RAW) dependency. + (Strictly speaking, however, equations in the YASK compiler + are declarative instead of imperative, so they describe + equalities rather than assignments with reads and writes.) + + Additional considerations: + - Only _immediate_ dependencies should be added. 
+ For example, if **A** depends on **B** and **B** depends on **C**, + it is not necessary to add a derived dependence from **A** to **C**. + + - Only dependencies at a given step-index value should + be added. + For example, given + equation **A**: `A(t+1, x) EQUALS B(t+1, x) + 5` and + equation **B**: `B(t+1, x) EQUALS A(t, x) / 2`, + **A** depends on **B** at some value of the step-index `t`. + It is true that `B(t+2)` depends on `A(t+1)`, but that + inter-step dependency should not be added with this function. + + - If a cycle of dependencies is created, the YASK compiler + will throw an exception containing an error message + about a circular dependency. This exception may not be + thrown until format() is called. + + - If using scratch grids, dependencies among scratch grids + and between scratch-grid equations and non-scratch-grid + equations should also be added. Each scratch grid equation + should ultimately depend on non-scratch-grid values. + + - This function can be used in cooperation with or instead of + the built-in automatic dependency checker. + When used in cooperation with the built-in checker, + both dependencies from this function and the built-in checker + will be considered. + When the built-in checker is disabled via + `set_dependency_checker_enabled(false)`, only dependencies + from this function will be considered. + In this case, it is imperative that all immediate + dependencies are added. + If the dependency graph is incomplete, the resulting generated + stencil code will contain illegal race conditions, + and it will most likely produce incorrect results. + */ + virtual void + add_flow_dependency(yc_equation_node_ptr from + /**< [in] Equation that must be evaluated _after_ `to`. */, + yc_equation_node_ptr to + /**< [in] Equation that must be evaluated _before_ `from`. */) =0; + + /// **[Advanced]** Remove all existing dependencies. + /** + Removes dependencies added via add_flow_dependency(). 
+ */ + virtual void + clear_dependencies() =0; }; /// A compile-time grid. diff --git a/include/yc_node_api.hpp b/include/yc_node_api.hpp index eeb02311..57ab4e32 100644 --- a/include/yc_node_api.hpp +++ b/include/yc_node_api.hpp @@ -136,8 +136,12 @@ namespace yask { /** Create a variable to be used to index grids in the solution-domain dimension. - The name usually describes spatial dimensions, e.g. "x" or "y". - This should *not* include the step dimension, which is specified via + The name usually describes spatial dimensions, e.g. "x" or "y", + but it can be any dimension that is specified at run-time, + such as an index into a number of parallel problems + being solved simultaneously. + + @note This should *not* include the step dimension, which is specified via new_step_index(). @returns Pointer to new \ref yc_index_node object. */ @@ -149,7 +153,9 @@ namespace yask { /** Create an variable to be used to index grids in the some dimension that is not the step dimension - or a domain dimension. Example: index into an array. + or a domain dimension. + The value of these indices are normally compile-time + constants, e.g., a fixed index into an array. @returns Pointer to new \ref yc_index_node object. */ virtual yc_index_node_ptr @@ -164,12 +170,15 @@ namespace yask { LHS. An optional condition may be provided to define the sub-domain - to which this equation applies. Example: `x > 10`. + to which this equation applies. See new_first_domain_index() + for more information and an example. Conditions are always evaluated with respect to the overall - problem domain independent of any MPI domain decomposition - that might occur at run-time. + problem domain, i.e., independent of any specific + MPI domain decomposition that might occur at run-time. If a condition is not provided, the equation applies to the entire problem domain. + A condition can be added to an equation after its creation + via yc_equation_node.set_cond(). 
@returns Pointer to new \ref yc_equation_node object. */ @@ -179,7 +188,8 @@ namespace yask { yc_number_node_ptr rhs /**< [in] Expression after EQUALS operator. */, yc_bool_node_ptr cond = nullptr - /**< [in] Expression defining sub-domain. */ ); + /**< [in] Optional expression defining sub-domain + where `lhs EQUALS rhs` is valid. */ ); /// Create a constant numerical value node. /** @@ -261,16 +271,28 @@ namespace yask { \code{.cpp} auto x = node_fac.new_domain_index("x"); - auto first_x = node_fac.new_first_domain_index(x); - // Create expression for "first_x + 10". - auto left10 = node_fac.new_add_node(first_x, - node_fac.new_const_number_node(10)); + // Create boolean expression for the + // boundary sub-domain "x < first_x + 10". + auto first_x = node_fac.new_first_domain_index(x); + auto left_bc_cond = node_fac.new_less_than_node(x, first_x + 10); - // Create boolean expression for "x > first_x + 10". - auto expr = node_fac.new_greater_than_node(x, left10); + // Create a new equation that is valid in this range. + auto left_bc_eq = + node_fac.new_equation_node(grid_pt_expr, left_bc_expr, left_bc_cond); \endcode + Specification of the "interior" part of a 2-D domain could be + represented by an expression similar to + `x >= new_first_domain_index(x) + 20 && + x <= new_last_domain_index(x) - 20 && + y >= new_first_domain_index(y) + 20 && + y <= new_last_domain_index(y) - 20`. + + @note The entire domain in dimension "x" would be represented by + `x >= new_first_domain_index(x) && x <= new_last_domain_index(x)`, but + that is the default condition so does not need to be specified. + @returns Pointer to new \ref yc_index_node object. */ virtual yc_number_node_ptr @@ -283,20 +305,6 @@ namespace yask { domain in `dim` dimension. The `dim` argument is created via new_domain_index(). - Typical C++ usage: - - \code{.cpp} - auto x = node_fac.new_domain_index("x"); - auto last_x = node_fac.new_last_domain_index(x); - - // Create expression for "last_x - 10". 
- auto right10 = node_fac.new_subtract_node(last_x, - node_fac.new_const_number_node(10)); - - // Create boolean expression for "x < first_x - 10". - auto expr = node_fac.new_less_than_node(x, right10); - \endcode - @returns Pointer to new \ref yc_index_node object. */ virtual yc_number_node_ptr @@ -734,34 +742,34 @@ namespace yask { /// Operator version of yc_node_factory::new_negation_node(). yc_negate_node_ptr operator-(yc_number_node_ptr rhs); - + + //@{ /// Operator version of yc_node_factory::new_addition_node(). yc_add_node_ptr operator+(yc_number_node_ptr lhs, yc_number_node_ptr rhs); - /// Operator version of yc_node_factory::new_addition_node(). yc_add_node_ptr operator+(double lhs, yc_number_node_ptr rhs); - /// Operator version of yc_node_factory::new_addition_node(). yc_add_node_ptr operator+(yc_number_node_ptr lhs, double rhs); + //@} + //@{ /// Operator version of yc_node_factory::new_division_node(). yc_divide_node_ptr operator/(yc_number_node_ptr lhs, yc_number_node_ptr rhs); - /// Operator version of yc_node_factory::new_division_node(). yc_divide_node_ptr operator/(double lhs, yc_number_node_ptr rhs); - /// Operator version of yc_node_factory::new_division_node(). yc_divide_node_ptr operator/(yc_number_node_ptr lhs, double rhs); + //@} + //@{ /// Operator version of yc_node_factory::new_multiplication_node(). yc_multiply_node_ptr operator*(yc_number_node_ptr lhs, yc_number_node_ptr rhs); - /// Operator version of yc_node_factory::new_multiplication_node(). yc_multiply_node_ptr operator*(double lhs, yc_number_node_ptr rhs); - /// Operator version of yc_node_factory::new_multiplication_node(). yc_multiply_node_ptr operator*(yc_number_node_ptr lhs, double rhs); + //@} + //@{ /// Operator version of yc_node_factory::new_subtraction_node(). yc_subtract_node_ptr operator-(yc_number_node_ptr lhs, yc_number_node_ptr rhs); - /// Operator version of yc_node_factory::new_subtraction_node(). 
yc_subtract_node_ptr operator-(double lhs, yc_number_node_ptr rhs); - /// Operator version of yc_node_factory::new_subtraction_node(). yc_subtract_node_ptr operator-(yc_number_node_ptr lhs, double rhs); + //@} #endif /** @}*/ diff --git a/src/compiler/lib/Eqs.cpp b/src/compiler/lib/Eqs.cpp index 94955543..d554dcc5 100644 --- a/src/compiler/lib/Eqs.cpp +++ b/src/compiler/lib/Eqs.cpp @@ -183,7 +183,8 @@ namespace yask { // Analyze group of equations. // Sets _stepDir in dims. - // Finds dependencies based on all eqs if 'settings._findDeps'. + // Finds dependencies based on all eqs if 'settings._findDeps', setting + // _imm_dep_on and _dep_on. // Throws exceptions on illegal dependencies. // TODO: split this into smaller functions. // BIG-TODO: replace dependency algorithms with integration of a polyhedral @@ -606,7 +607,7 @@ namespace yask { visitEqs(&slv); } - // Update access stats for the grids. + // Update access stats for the grids, i.e., halos and const indices. // Also finds scratch-grid eqs needed for each non-scratch eq. void Eqs::updateGridStats() { @@ -647,7 +648,7 @@ namespace yask { (eq1, [&](EqualsExprPtr b, EqDeps::EqVecSet& path) { // Does 'b' have a scratch-grid output? - // NB: scratch eqs don't have conditions, so + // NB: scratch eqs don't have their own conditions, so // we don't need to check them. auto* og2 = pv.getOutputGrids().at(b.get()); if (og2->isScratch()) { diff --git a/src/compiler/lib/Eqs.hpp b/src/compiler/lib/Eqs.hpp index d403ea0a..5399f432 100644 --- a/src/compiler/lib/Eqs.hpp +++ b/src/compiler/lib/Eqs.hpp @@ -67,6 +67,14 @@ namespace yask { _all.insert(b); _done = false; } + + // Clear all deps. + virtual void clear_deps() { + _imm_deps.clear(); + _full_deps.clear(); + _all.clear(); + _done = false; + } // Check whether eq a directly depends on b. virtual bool is_imm_dep_on(EqualsExprPtr a, EqualsExprPtr b) const { @@ -120,14 +128,21 @@ namespace yask { protected: // Equations(s) describing how values in this grid are computed. 
- EqList _eqs; // just equations w/o conditions. + EqList _eqs; + + // Dependencies between all eqs. + EqDepMap _eq_deps; - EqDepMap _eq_deps; // dependencies between all eqs. - EqDeps::DepMap _scratch_deps; // dependencies through scratch grids. + // Dependencies through scratch grids. + EqDeps::DepMap _scratch_deps; public: - Eqs() {} + Eqs() { + // Make sure map keys exist. + for (DepType dt = DepType(0); dt < num_deps; dt = DepType(dt+1)) + _eq_deps[dt]; + } virtual ~Eqs() {} // Equation accessors. @@ -146,6 +161,9 @@ namespace yask { virtual const EqDepMap& getDeps() const { return _eq_deps; } + virtual EqDepMap& getDeps() { + return _eq_deps; + } // Get the scratch-grid eqs that contribute to 'eq'. virtual const EqDeps::EqSet& getScratchDeps(EqualsExprPtr ep) const { @@ -176,10 +194,10 @@ namespace yask { virtual void updateGridStats(); }; - // A named equation bundle, which contains one or more grid-update equations. - // All equations in a bundle must have the same condition. - // Equations should not have inter-dependencies because they will be - // combined into a single expression. + // A named equation bundle, which contains one or more grid-update + // equations. All equations in a bundle must have the same condition. + // Equations in a bundle should not have inter-dependencies because they + // will be combined into a single expression. class EqBundle { protected: EqList _eqs; // expressions in this eqBundle (not including conditions). diff --git a/src/compiler/lib/Soln.cpp b/src/compiler/lib/Soln.cpp index 53f95bfe..5c21a9c6 100644 --- a/src/compiler/lib/Soln.cpp +++ b/src/compiler/lib/Soln.cpp @@ -70,7 +70,7 @@ namespace yask { bool is_folding_efficient) { // Call the stencil 'define' method to create ASTs. - // ASTs can also be created via the APIs. + // ASTs and grids can also be created via the APIs. define(); // Find all the stencil dimensions from the grids. 
diff --git a/src/compiler/lib/Soln.hpp b/src/compiler/lib/Soln.hpp index 4cefe9f5..46e05939 100644 --- a/src/compiler/lib/Soln.hpp +++ b/src/compiler/lib/Soln.hpp @@ -53,10 +53,10 @@ namespace yask { // Debug output. yask_output_ptr _debug_output; - ostream* _dos = &std::cout; + ostream* _dos = &std::cout; // just a handy pointer to an ostream. // All vars accessible by the kernel. - Grids _grids; // keep track of all registered grids. + Grids _grids; // All equations defined in this solution. Eqs _eqs; @@ -170,22 +170,30 @@ namespace yask { ev.push_back(_eqs.getEqs().at(i)); return ev; } - virtual void set_fold(const std::string& dim, int len) { - auto& fold = _settings._foldOptions; - auto* p = fold.lookup(dim); - if (p) - *p = len; - else - fold.addDimBack(dim, len); + virtual void add_flow_dependency(yc_equation_node_ptr from, + yc_equation_node_ptr to) { + auto fp = dynamic_pointer_cast(from); + assert(fp); + auto tp = dynamic_pointer_cast(to); + assert(tp); + _eqs.getDeps().at(cur_step_dep).set_imm_dep_on(fp, tp); } + virtual void clear_dependencies() { + for (DepType dt = DepType(0); dt < num_deps; dt = DepType(dt+1)) + _eqs.getDeps().at(dt).clear_deps(); + } + virtual void set_fold_len(const yc_index_node_ptr, int len); virtual void clear_folding() { _settings._foldOptions.clear(); } virtual void set_cluster_mult(const yc_index_node_ptr, int mult); virtual void clear_clustering() { _settings._clusterOptions.clear(); } + virtual void set_element_bytes(int nbytes) { _settings._elem_bytes = nbytes; } virtual int get_element_bytes() const { return _settings._elem_bytes; } + virtual bool is_dependency_checker_enabled() const { return _settings._findDeps; } virtual void set_dependency_checker_enabled(bool enable) { _settings._findDeps = enable; } + virtual void format(const std::string& format_type, yask_output_ptr output); }; From b2e16f330dfbbc0f35c58762e587dec98216fb56 Mon Sep 17 00:00:00 2001 From: Chuck Yount Date: Wed, 25 Apr 2018 13:02:54 -0700 Subject: 
[PATCH 08/21] Restructure code to reduce compile time. Turns off prefetching at O0 and O1. Only builds grids to max size in stencil. --- src/kernel/Makefile | 166 +++-- src/kernel/lib/context.cpp | 1230 +------------------------------ src/kernel/lib/grid_apis.cpp | 387 ++++++++++ src/kernel/lib/realv_grids.cpp | 354 --------- src/kernel/lib/setup.cpp | 1260 ++++++++++++++++++++++++++++++++ src/kernel/lib/yask.hpp | 5 + 6 files changed, 1740 insertions(+), 1662 deletions(-) create mode 100644 src/kernel/lib/grid_apis.cpp create mode 100644 src/kernel/lib/setup.cpp diff --git a/src/kernel/Makefile b/src/kernel/Makefile index 3ab5bf9c..3d5dccdf 100644 --- a/src/kernel/Makefile +++ b/src/kernel/Makefile @@ -219,7 +219,6 @@ def_pad_args ?= -ep 1 cluster ?= x=1 pfd_l1 ?= 0 pfd_l2 ?= 2 -max_dims ?= 5 # max grid dims >= max stencil dims. # default folding depends on HW vector size. ifneq ($(findstring INTRIN512,$(MACROS)),) # 512 bits. @@ -248,76 +247,6 @@ endif # not 512 bits. # Select fold based on size of reals. fold = $(fold_$(real_bytes)byte) # e.g., fold_4byte -######## Loop-compiler configuration: -# The loop indices range from 0..N-1. -# Dim 0 is the step dim, usually time. -# The step loop is handled outside of the generated loops, -# so the following loop codes do not scan over dim 0. -# Dims 1..N-1 are the domain dims, usually spatial. -# Thus, N-1 is the inner-most dim. -# For best perf, this should be the unit-stride dim in the grids. - -# File with number of dims extracted from YASK compiler output. -YK_DIMS_FILE := num_dims.$(stencil).txt -NDIMS_OPT := `cat $(YK_DIMS_FILE)` - -# Rank loops break up the whole rank into smaller regions. In order for -# temporal wavefronts to operate properly, the order of spatial dimensions -# may be changed, but the scanning paths must have strictly incrementing -# indices. Those that do not (e.g., grouped, serpentine, square-wave) may -# *not* be used here when using temporal wavefronts. 
The time loop may be -# found in StencilEquations::run_solution(). -RANK_LOOP_OPTS ?= $(NDIMS_OPT) -inVar rank_idxs -RANK_LOOP_ORDER ?= 1 .. N-1 -RANK_LOOP_CODE ?= $(RANK_LOOP_OUTER_MODS) loop($(RANK_LOOP_ORDER)) \ - { $(RANK_LOOP_INNER_MODS) call(calc_region(stBundle_ptr)); } - -# Region loops break up a region using OpenMP threading into blocks. The -# 'omp' modifier creates an outer OpenMP loop so that each block is assigned -# to a top-level OpenMP thread. The region time loops are not coded here to -# allow for proper spatial skewing for temporal wavefronts. The time loop -# may be found in StencilEquations::calc_region(). -REGION_LOOP_OPTS ?= $(NDIMS_OPT) -inVar region_idxs \ - -ompConstruct '$(omp_par_for) schedule($(omp_region_schedule)) proc_bind(spread)' \ - -callPrefix 'sg->' -REGION_LOOP_OUTER_MODS ?= grouped omp -REGION_LOOP_ORDER ?= 1 .. N-1 -REGION_LOOP_CODE ?= $(REGION_LOOP_OUTER_MODS) loop($(REGION_LOOP_ORDER)) { \ - $(REGION_LOOP_INNER_MODS) call(calc_block); } - -# Block loops break up a block into sub-blocks. The 'omp' modifier creates -# a *nested* OpenMP loop so that each sub-block is assigned to a nested OpenMP -# thread. There is no time loop because threaded temporal blocking is -# not yet supported. -BLOCK_LOOP_OPTS ?= $(NDIMS_OPT) -inVar block_idxs \ - -ompConstruct '$(omp_par_for) schedule($(omp_block_schedule)) proc_bind(close)' \ - -callPrefix 'sg->' -BLOCK_LOOP_OUTER_MODS ?= grouped omp -BLOCK_LOOP_ORDER ?= 1 .. N-1 -BLOCK_LOOP_CODE ?= $(BLOCK_LOOP_OUTER_MODS) loop($(BLOCK_LOOP_ORDER)) { \ - $(BLOCK_LOOP_INNER_MODS) call(calc_sub_block(thread_idx)); } - -# Sub-block loops break up a sub-block into clusters or vectors. These loops -# are run by a single OMP thread. The N-1 (inner) loop is generated by the -# stencil compiler. There is no time loop because threaded temporal -# blocking is not yet supported. The indexes in this loop are 'normalized', -# i.e., vector units and rank-relative. 
-SUB_BLOCK_LOOP_OPTS ?= $(NDIMS_OPT) -inVar norm_sub_block_idxs -SUB_BLOCK_LOOP_OUTER_MODS ?= -SUB_BLOCK_LOOP_ORDER ?= 1 .. N-2 -SUB_BLOCK_LOOP_CODE ?= $(SUB_BLOCK_LOOP_OUTER_MODS) loop($(SUB_BLOCK_LOOP_ORDER)) { \ - $(SUB_BLOCK_LOOP_INNER_MODS) call(calc_inner_loop(thread_idx)); } - -# General-purpose parallel loop. -# Nested OpenMP is not used here because there is no sharing between threads. -# TODO: Consider using nested OpenMP to hide more latency. -MISC_LOOP_OPTS ?= $(NDIMS_OPT) -inVar misc_idxs \ - -ompConstruct '$(omp_par_for) schedule($(omp_misc_schedule)) proc_bind(spread)' -MISC_LOOP_OUTER_MODS ?= omp -MISC_LOOP_ORDER ?= 1 .. N-1 -MISC_LOOP_CODE ?= $(MISC_LOOP_OUTER_MODS) loop($(MISC_LOOP_ORDER)) \ - $(MISC_LOOP_INNER_MODS) { call(misc_fn); } - ######## End of vars that control the function and performance of the kernel. # The remainder of this file specifies how to build and test the kernel. @@ -370,6 +299,7 @@ YK_PY_MOD := $(YASK_DIR)/$(YK_MODULE).py YK_API_TEST_EXEC := $(BIN_DIR)/$(YK_BASE)_api_test.exe YK_GRID_TEST_EXEC := $(BIN_DIR)/$(YK_BASE)_grid_test.exe YK_API_TEST_EXEC_WITH_EXCEPTION := $(BIN_DIR)/$(YK_BASE)_api_exception_test.exe +YK_DIMS_FILE := num_dims.$(stencil).txt MAKE_REPORT_FILE:= make-report.$(YK_TAG).txt @@ -382,7 +312,7 @@ COMM_SRC_BASES := $(addprefix $(COMM_DIR)/,$(COMM_SRC_NAMES)) YK_SWIG_DIR := ./swig YK_GEN_DIR := ./gen YK_LIB_DIR := ./lib -YK_SRC_NAMES := factory new_grid generic_grids realv_grids utils settings context stencil_calc +YK_SRC_NAMES := factory context setup realv_grids grid_apis new_grid generic_grids utils settings stencil_calc YK_SRC_BASES := $(addprefix $(YK_LIB_DIR)/,$(YK_SRC_NAMES)) YK_OBJS := $(addsuffix .$(YK_TAG).o,$(YK_SRC_BASES) $(COMM_SRC_BASES)) YK_MACRO_FILE := $(YK_GEN_DIR)/yask_macros.hpp @@ -462,10 +392,15 @@ PYINC := $(addprefix -I,$(shell $(PYTHON) -c 'import distutils.sysconfig; prin RUN_PYTHON := $(RUN_PREFIX) env PYTHONPATH=$(LIB_DIR):$(YASK_DIR):$(PYTHONPATH) $(PYTHON) +# Turn off 
prefetching at O0 or O1. +ifneq ($(filter -O0 -O1,$(YK_CXXOPT)),) + pfd_l1 = 0 + pfd_l2 = 0 +endif + # Set MACROS based on individual makefile vars. # MACROS and EXTRA_MACROS will be written to a header file. MACROS += PFD_L1=$(pfd_l1) PFD_L2=$(pfd_l2) -MACROS += MAX_DIMS=$(max_dims) ifeq ($(streaming_stores),1) MACROS += USE_STREAMING_STORE endif @@ -539,6 +474,75 @@ endif # Add in final flags and user-added flags. YK_CXXFLAGS += $(YK_CXXOPT) $(OMPFLAGS) $(EXTRA_YK_CXXFLAGS) +# Number of dims extracted from YASK compiler output. +NDIMS := `cat $(YK_DIMS_FILE)` + +######## Loop-compiler configuration: +# The loop indices range from 0..N-1. +# Dim 0 is the step dim, usually time. +# The step loop is handled outside of the generated loops, +# so the following loop codes do not scan over dim 0. +# Dims 1..N-1 are the domain dims, usually spatial. +# Thus, N-1 is the inner-most dim. +# For best perf, this should be the unit-stride dim in the grids. + +# Rank loops break up the whole rank into smaller regions. In order for +# temporal wavefronts to operate properly, the order of spatial dimensions +# may be changed, but the scanning paths must have strictly incrementing +# indices. Those that do not (e.g., grouped, serpentine, square-wave) may +# *not* be used here when using temporal wavefronts. The time loop may be +# found in StencilEquations::run_solution(). +RANK_LOOP_OPTS ?= -ndims $(NDIMS) -inVar rank_idxs +RANK_LOOP_ORDER ?= 1 .. N-1 +RANK_LOOP_CODE ?= $(RANK_LOOP_OUTER_MODS) loop($(RANK_LOOP_ORDER)) \ + { $(RANK_LOOP_INNER_MODS) call(calc_region(stBundle_ptr)); } + +# Region loops break up a region using OpenMP threading into blocks. The +# 'omp' modifier creates an outer OpenMP loop so that each block is assigned +# to a top-level OpenMP thread. The region time loops are not coded here to +# allow for proper spatial skewing for temporal wavefronts. The time loop +# may be found in StencilEquations::calc_region(). 
+REGION_LOOP_OPTS ?= -ndims $(NDIMS) -inVar region_idxs \ + -ompConstruct '$(omp_par_for) schedule($(omp_region_schedule)) proc_bind(spread)' \ + -callPrefix 'sg->' +REGION_LOOP_OUTER_MODS ?= grouped omp +REGION_LOOP_ORDER ?= 1 .. N-1 +REGION_LOOP_CODE ?= $(REGION_LOOP_OUTER_MODS) loop($(REGION_LOOP_ORDER)) { \ + $(REGION_LOOP_INNER_MODS) call(calc_block); } + +# Block loops break up a block into sub-blocks. The 'omp' modifier creates +# a *nested* OpenMP loop so that each sub-block is assigned to a nested OpenMP +# thread. There is no time loop because threaded temporal blocking is +# not yet supported. +BLOCK_LOOP_OPTS ?= -ndims $(NDIMS) -inVar block_idxs \ + -ompConstruct '$(omp_par_for) schedule($(omp_block_schedule)) proc_bind(close)' \ + -callPrefix 'sg->' +BLOCK_LOOP_OUTER_MODS ?= grouped omp +BLOCK_LOOP_ORDER ?= 1 .. N-1 +BLOCK_LOOP_CODE ?= $(BLOCK_LOOP_OUTER_MODS) loop($(BLOCK_LOOP_ORDER)) { \ + $(BLOCK_LOOP_INNER_MODS) call(calc_sub_block(thread_idx)); } + +# Sub-block loops break up a sub-block into clusters or vectors. These loops +# are run by a single OMP thread. The N-1 (inner) loop is generated by the +# stencil compiler. There is no time loop because threaded temporal +# blocking is not yet supported. The indexes in this loop are 'normalized', +# i.e., vector units and rank-relative. +SUB_BLOCK_LOOP_OPTS ?= -ndims $(NDIMS) -inVar norm_sub_block_idxs +SUB_BLOCK_LOOP_OUTER_MODS ?= +SUB_BLOCK_LOOP_ORDER ?= 1 .. N-2 +SUB_BLOCK_LOOP_CODE ?= $(SUB_BLOCK_LOOP_OUTER_MODS) loop($(SUB_BLOCK_LOOP_ORDER)) { \ + $(SUB_BLOCK_LOOP_INNER_MODS) call(calc_inner_loop(thread_idx)); } + +# General-purpose parallel loop. +# Nested OpenMP is not used here because there is no sharing between threads. +# TODO: Consider using nested OpenMP to hide more latency. +MISC_LOOP_OPTS ?= -ndims $(NDIMS) -inVar misc_idxs \ + -ompConstruct '$(omp_par_for) schedule($(omp_misc_schedule)) proc_bind(spread)' +MISC_LOOP_OUTER_MODS ?= omp +MISC_LOOP_ORDER ?= 1 .. 
N-1 +MISC_LOOP_CODE ?= $(MISC_LOOP_OUTER_MODS) loop($(MISC_LOOP_ORDER)) \ + $(MISC_LOOP_INNER_MODS) { call(misc_fn); } + ######## Primary targets & rules # NB: must set stencil and arch make vars to generate the desired YASK kernel. @@ -588,28 +592,28 @@ $(YK_GEN_DIR)/yask_misc_loops.hpp: $(GEN_LOOPS) $(YK_DIMS_FILE) $(YK_MK_GEN_DIR) $< -output $@ $(MISC_LOOP_OPTS) $(EXTRA_LOOP_OPTS) $(EXTRA_MISC_LOOP_OPTS) "$(MISC_LOOP_CODE)" -$(YK_GEN_DIR)/yask_layout_macros.hpp: $(GEN_LAYOUTS) +$(YK_GEN_DIR)/yask_layout_macros.hpp: $(GEN_LAYOUTS) $(YK_DIMS_FILE) $(YK_MK_GEN_DIR) - $(PERL) $< -m $(max_dims) > $@ + $(PERL) $< -m $(NDIMS) > $@ @- gindent -fca $@ || \ indent -fca $@ || \ echo "note:" $@ "is not properly indented because indent program failed or was not found." -$(YK_GEN_DIR)/yask_layouts.hpp: $(GEN_LAYOUTS) +$(YK_GEN_DIR)/yask_layouts.hpp: $(GEN_LAYOUTS) $(YK_DIMS_FILE) $(YK_MK_GEN_DIR) - $(PERL) $< -d $(max_dims) > $@ + $(PERL) $< -d $(NDIMS) > $@ @- gindent -fca $@ || \ indent -fca $@ || \ echo "note:" $@ "is not properly indented because indent program failed or was not found." -$(YK_GEN_DIR)/yask_grid_code.hpp: $(GEN_LAYOUTS) +$(YK_GEN_DIR)/yask_grid_code.hpp: $(GEN_LAYOUTS) $(YK_DIMS_FILE) $(YK_MK_GEN_DIR) - $(PERL) $< -g $(max_dims) > $@ + $(PERL) $< -g $(NDIMS) > $@ # Extract the number of stencil dims from the compiler output. # Use this to create an option to pass to the loop generator script. $(YK_DIMS_FILE): $(YK_CODE_FILE) - awk '/NUM_STENCIL_DIMS/ {print "-ndims",$$NF}' $< > $@ + awk '/NUM_STENCIL_DIMS/ {print $$NF}' $< > $@ $(YK_CODE_FILE): $(YC_EXEC) $(YK_MK_GEN_DIR) @@ -633,7 +637,7 @@ headers: $(YK_GEN_HEADERS) # NB: must set stencil and arch to generate the desired kernel API. # Build C++ and Python kernel API libs. -api: $(YK_LIB) $(YK_PY_LIB) $(MAKE_REPORT_FILE) +api: $(YK_LIB) $(YK_PY_LIB) # Build python kernel API lib. # TODO: consider adding $(YK_TAG) to [some of] these targets. 
diff --git a/src/kernel/lib/context.cpp b/src/kernel/lib/context.cpp index 5188ed1a..2a47ca33 100644 --- a/src/kernel/lib/context.cpp +++ b/src/kernel/lib/context.cpp @@ -23,6 +23,9 @@ IN THE SOFTWARE. *****************************************************************************/ +// This file contains implementations of StencilContext methods. +// Also see context_setup.cpp. + #include "yask.hpp" using namespace std; @@ -963,769 +966,6 @@ namespace yask { outputGridMap[gname] = gp; } } - - // Init MPI-related vars and other vars related to my rank's place in - // the global problem: rank index, offset, etc. Need to call this even - // if not using MPI to properly init these vars. Called from - // prepare_solution(), so it doesn't normally need to be called from user code. - void StencilContext::setupRank() { - ostream& os = get_ostr(); - auto& step_dim = _dims->_step_dim; - auto me = _env->my_rank; - - // Check ranks. - idx_t req_ranks = _opts->_num_ranks.product(); - if (req_ranks != _env->num_ranks) { - THROW_YASK_EXCEPTION("error: " << req_ranks << " rank(s) requested (" << - _opts->_num_ranks.makeDimValStr(" * ") << "), but " << - _env->num_ranks << " rank(s) are active"); - } - assertEqualityOverRanks(_opts->_rank_sizes[step_dim], _env->comm, "num steps"); - - // Determine my coordinates if not provided already. - // TODO: do this more intelligently based on proximity. - if (_opts->find_loc) - _opts->_rank_indices = _opts->_num_ranks.unlayout(me); - - // A table of rank-coordinates for everyone. - auto num_ddims = _opts->_rank_indices.size(); // domain-dims only! - idx_t coords[_env->num_ranks][num_ddims]; - - // Init coords for this rank. - for (int i = 0; i < num_ddims; i++) - coords[me][i] = _opts->_rank_indices[i]; - - // A table of rank-domain sizes for everyone. - idx_t rsizes[_env->num_ranks][num_ddims]; - - // Init sizes for this rank. 
- for (int di = 0; di < num_ddims; di++) { - auto& dname = _opts->_rank_indices.getDimName(di); - rsizes[me][di] = _opts->_rank_sizes[dname]; - } - -#ifdef USE_MPI - // Exchange coord and size info between all ranks. - for (int rn = 0; rn < _env->num_ranks; rn++) { - MPI_Bcast(&coords[rn][0], num_ddims, MPI_INTEGER8, - rn, _env->comm); - MPI_Bcast(&rsizes[rn][0], num_ddims, MPI_INTEGER8, - rn, _env->comm); - } - // Now, the tables are filled in for all ranks. -#endif - - // Init offsets and total sizes. - rank_domain_offsets.setValsSame(0); - overall_domain_sizes.setValsSame(0); - - // Loop over all ranks, including myself. - int num_neighbors = 0; - for (int rn = 0; rn < _env->num_ranks; rn++) { - - // Coord offset of rn from me: prev => negative, self => 0, next => positive. - IdxTuple rcoords(_dims->_domain_dims); - IdxTuple rdeltas(_dims->_domain_dims); - for (int di = 0; di < num_ddims; di++) { - rcoords[di] = coords[rn][di]; - rdeltas[di] = coords[rn][di] - _opts->_rank_indices[di]; - } - - // Manhattan distance from rn (sum of abs deltas in all dims). - // Max distance in any dim. - int mandist = 0; - int maxdist = 0; - for (int di = 0; di < num_ddims; di++) { - mandist += abs(rdeltas[di]); - maxdist = max(maxdist, abs(int(rdeltas[di]))); - } - - // Myself. - if (rn == me) { - if (mandist != 0) - THROW_YASK_EXCEPTION("Internal error: distance to own rank == " << mandist); - } - - // Someone else. - else { - if (mandist == 0) - THROW_YASK_EXCEPTION("Error: ranks " << me << - " and " << rn << " at same coordinates"); - } - - // Loop through domain dims. - for (int di = 0; di < num_ddims; di++) { - auto& dname = _opts->_rank_indices.getDimName(di); - - // Is rank 'rn' in-line with my rank in 'dname' dim? - // True when deltas in other dims are zero. - bool is_inline = true; - for (int dj = 0; dj < num_ddims; dj++) { - if (di != dj && rdeltas[dj] != 0) { - is_inline = false; - break; - } - } - - // Process ranks that are in-line in 'dname', including self. 
- if (is_inline) { - - // Accumulate total problem size in each dim for ranks that - // intersect with this rank, including myself. - overall_domain_sizes[dname] += rsizes[rn][di]; - - // Adjust my offset in the global problem by adding all domain - // sizes from prev ranks only. - if (rdeltas[di] < 0) - rank_domain_offsets[dname] += rsizes[rn][di]; - - // Make sure all the other dims are the same size. - // This ensures that all the ranks' domains line up - // properly along their edges and at their corners. - for (int dj = 0; dj < num_ddims; dj++) { - if (di != dj) { - auto mysz = rsizes[me][dj]; - auto rnsz = rsizes[rn][dj]; - if (mysz != rnsz) { - auto& dnamej = _opts->_rank_indices.getDimName(dj); - THROW_YASK_EXCEPTION("Error: rank " << rn << " and " << me << - " are both at rank-index " << coords[me][di] << - " in the '" << dname << - "' dimension , but their rank-domain sizes are " << - rnsz << " and " << mysz << - " (resp.) in the '" << dj << - "' dimension, making them unaligned"); - } - } - } - } - } - - // Rank rn is myself or my immediate neighbor if its distance <= 1 in - // every dim. Assume we do not need to exchange halos except - // with immediate neighbor. We validate this assumption below by - // making sure that the rank domain size is at least as big as the - // largest halo. - if (maxdist <= 1) { - - // At this point, rdeltas contains only -1..+1 for each domain dim. - // Add one to -1..+1 to get 0..2 range for my_neighbors offsets. - IdxTuple roffsets = rdeltas.addElements(1); - assert(rdeltas.min() >= -1); - assert(rdeltas.max() <= 1); - assert(roffsets.min() >= 0); - assert(roffsets.max() <= 2); - - // Convert the offsets into a 1D index. 
- auto rn_ofs = _mpiInfo->getNeighborIndex(roffsets); - TRACE_MSG("neighborhood size = " << _mpiInfo->neighborhood_sizes.makeDimValStr() << - " & roffsets of rank " << rn << " = " << roffsets.makeDimValStr() << - " => " << rn_ofs); - assert(idx_t(rn_ofs) < _mpiInfo->neighborhood_size); - - // Save rank of this neighbor into the MPI info object. - _mpiInfo->my_neighbors.at(rn_ofs) = rn; - if (rn != me) { - num_neighbors++; - os << "Neighbor #" << num_neighbors << " is rank " << rn << - " at absolute rank indices " << rcoords.makeDimValStr() << - " (" << rdeltas.makeDimValOffsetStr() << " relative to rank " << - me << ")\n"; - } - - // Save manhattan dist. - _mpiInfo->man_dists.at(rn_ofs) = mandist; - - // Loop through domain dims. - bool vlen_mults = true; - for (int di = 0; di < num_ddims; di++) { - auto& dname = _opts->_rank_indices.getDimName(di); - - // Does rn have all VLEN-multiple sizes? - auto rnsz = rsizes[rn][di]; - auto vlen = _dims->_fold_pts[di]; - if (rnsz % vlen != 0) { - TRACE_MSG("cannot use vector halo exchange with rank " << rn << - " because its size in '" << dname << "' is " << rnsz); - vlen_mults = false; - } - } - - // Save vec-mult flag. - _mpiInfo->has_all_vlen_mults.at(rn_ofs) = vlen_mults; - - } // self or immediate neighbor in any direction. - - } // ranks. - - // Set offsets in grids and find WF extensions - // based on the grids' halos. - update_grids(); - - // Determine bounding-boxes for all bundles. - // This must be done after finding WF extensions. - find_bounding_boxes(); - - } // setupRank. - - // Alloc 'nbytes' on each requested NUMA node. - // Map keys are preferred NUMA nodes or -1 for local. - // Pointers are returned in '_data_buf'. - // 'ngrids' and 'type' are only used for debug msg. 
- void StencilContext::_alloc_data(const map & nbytes, - const map & ngrids, - map >& data_buf, - const std::string& type) { - ostream& os = get_ostr(); - - for (const auto& i : nbytes) { - int numa_pref = i.first; - size_t nb = i.second; - size_t ng = ngrids.at(numa_pref); - - // Don't need pad after last one. - if (nb >= _data_buf_pad) - nb -= _data_buf_pad; - - // Allocate data. - os << "Allocating " << makeByteStr(nb) << - " for " << ng << " " << type << "(s)"; -#ifdef USE_NUMA - if (numa_pref >= 0) - os << " preferring NUMA node " << numa_pref; - else - os << " using NUMA policy " << numa_pref; -#endif - os << "...\n" << flush; - auto p = shared_numa_alloc(nb, numa_pref); - TRACE_MSG("Got memory at " << static_cast(p.get())); - - // Save using original key. - data_buf[numa_pref] = p; - } - } - - // Allocate memory for grids that do not already have storage. - void StencilContext::allocGridData(ostream& os) { - - // Base ptrs for all default-alloc'd data. - // These pointers will be shared by the ones in the grid - // objects, which will take over ownership when these go - // out of scope. - // Key is preferred numa node or -1 for local. - map > _grid_data_buf; - - // Pass 0: count required size for each NUMA node, allocate chunk of memory at end. - // Pass 1: distribute parts of already-allocated memory chunk. - for (int pass = 0; pass < 2; pass++) { - TRACE_MSG("allocGridData pass " << pass << " for " << - gridPtrs.size() << " grid(s)"); - - // Count bytes needed and number of grids for each NUMA node. - map npbytes, ngrids; - - // Grids. - for (auto gp : gridPtrs) { - if (!gp) - continue; - auto& gname = gp->get_name(); - - // Grid data. - // Don't alloc if already done. - if (!gp->is_storage_allocated()) { - int numa_pref = gp->get_numa_preferred(); - - // Set storage if buffer has been allocated in pass 0. 
- if (pass == 1) { - auto p = _grid_data_buf[numa_pref]; - assert(p); - gp->set_storage(p, npbytes[numa_pref]); - os << gp->make_info_string() << endl; - } - - // Determine padded size (also offset to next location). - size_t nbytes = gp->get_num_storage_bytes(); - npbytes[numa_pref] += ROUND_UP(nbytes + _data_buf_pad, - CACHELINE_BYTES); - ngrids[numa_pref]++; - if (pass == 0) - TRACE_MSG(" grid '" << gname << "' needs " << makeByteStr(nbytes) << - " on NUMA node " << numa_pref); - } - } - - // Alloc for each node. - if (pass == 0) - _alloc_data(npbytes, ngrids, _grid_data_buf, "grid"); - - } // grid passes. - }; - - // Create MPI and allocate buffers. - void StencilContext::allocMpiData(ostream& os) { - - // Remove any old MPI data. - freeMpiData(os); - -#ifdef USE_MPI - - int num_exchanges = 0; - auto me = _env->my_rank; - - // Need to determine the size and shape of all MPI buffers. - // Visit all neighbors of this rank. - _mpiInfo->visitNeighbors - ([&](const IdxTuple& neigh_offsets, int neigh_rank, int neigh_idx) { - if (neigh_rank == MPI_PROC_NULL) - return; // from lambda fn. - - // Determine max dist needed. TODO: determine max dist - // automatically from stencils; may not be same for all - // grids. -#ifndef MAX_EXCH_DIST -#define MAX_EXCH_DIST (NUM_STENCIL_DIMS - 1) -#endif - // Always use max dist with WF. - // TODO: determine if this is overkill. - int maxdist = MAX_EXCH_DIST; - if (num_wf_shifts > 0) - maxdist = NUM_STENCIL_DIMS - 1; - - // Manhattan dist. - int mandist = _mpiInfo->man_dists.at(neigh_idx); - - // Check distance. - // TODO: calculate and use exch dist for each grid. - if (mandist > maxdist) { - TRACE_MSG("no halo exchange needed with rank " << neigh_rank << - " because L1-norm = " << mandist); - return; // from lambda fn. - } - - // Determine size of MPI buffers between neigh_rank and my rank - // for each grid and create those that are needed. 
- for (auto gp : gridPtrs) { - if (!gp) - continue; - auto& gname = gp->get_name(); - - // Lookup first & last domain indices and calc exchange sizes - // for this grid. - bool found_delta = false; - IdxTuple my_halo_sizes, neigh_halo_sizes; - IdxTuple first_inner_idx, last_inner_idx; - IdxTuple first_outer_idx, last_outer_idx; - for (auto& dim : _dims->_domain_dims.getDims()) { - auto& dname = dim.getName(); - if (gp->is_dim_used(dname)) { - - // Get domain indices for this grid. - // If there are no more ranks in the given direction, extend - // the index into the outer halo to make sure all data are sync'd. - // This is critical for WFs. - idx_t fidx = gp->get_first_rank_domain_index(dname); - idx_t lidx = gp->get_last_rank_domain_index(dname); - first_inner_idx.addDimBack(dname, fidx); - last_inner_idx.addDimBack(dname, lidx); - if (_opts->is_first_rank(dname)) - fidx -= gp->get_left_halo_size(dname); - if (_opts->is_last_rank(dname)) - lidx += gp->get_right_halo_size(dname); - first_outer_idx.addDimBack(dname, fidx); - last_outer_idx.addDimBack(dname, lidx); - - // Determine size of exchange. This will be the actual halo size - // plus any wave-front extensions. In the current implementation, - // we need the wave-front extensions regardless of whether there - // is a halo on a given grid. This is because each stencil-bundle - // gets shifted by the WF angles at each step in the WF. - - // Neighbor is to the left. - if (neigh_offsets[dname] == MPIInfo::rank_prev) { - auto ext = left_wf_exts[dname]; - - // my halo. - auto halo_size = gp->get_left_halo_size(dname); - halo_size += ext; - my_halo_sizes.addDimBack(dname, halo_size); - - // neighbor halo. - halo_size = gp->get_right_halo_size(dname); // their right is on my left. - halo_size += ext; - neigh_halo_sizes.addDimBack(dname, halo_size); - } - - // Neighbor is to the right. - else if (neigh_offsets[dname] == MPIInfo::rank_next) { - auto ext = right_wf_exts[dname]; - - // my halo. 
- auto halo_size = gp->get_right_halo_size(dname); - halo_size += ext; - my_halo_sizes.addDimBack(dname, halo_size); - - // neighbor halo. - halo_size = gp->get_left_halo_size(dname); // their left is on my right. - halo_size += ext; - neigh_halo_sizes.addDimBack(dname, halo_size); - } - - // Neighbor in-line. - else { - my_halo_sizes.addDimBack(dname, 0); - neigh_halo_sizes.addDimBack(dname, 0); - } - - // Vectorized exchange allowed based on domain sizes? - // Both my rank and neighbor rank must have all domain sizes - // of vector multiples. - bool vec_ok = allow_vec_exchange && - _mpiInfo->has_all_vlen_mults[_mpiInfo->my_neighbor_index] && - _mpiInfo->has_all_vlen_mults[neigh_idx]; - - // Round up halo sizes if vectorized exchanges allowed. - // TODO: add a heuristic to avoid increasing by a large factor. - if (vec_ok) { - auto vec_size = _dims->_fold_pts[dname]; - my_halo_sizes.setVal(dname, ROUND_UP(my_halo_sizes[dname], vec_size)); - neigh_halo_sizes.setVal(dname, ROUND_UP(neigh_halo_sizes[dname], vec_size)); - } - - // Is this neighbor before or after me in this domain direction? - if (neigh_offsets[dname] != MPIInfo::rank_self) - found_delta = true; - } - } - - // Is buffer needed? - // Example: if this grid is 2D in y-z, but only neighbors are in - // x-dim, we don't need any exchange. - if (!found_delta) { - TRACE_MSG("no halo exchange needed for grid '" << gname << - "' with rank " << neigh_rank << - " because the neighbor is not in a direction" - " corresponding to a grid dim"); - continue; // to next grid. - } - - // Make a buffer in both directions (send & receive). - for (int bd = 0; bd < MPIBufs::nBufDirs; bd++) { - - // Begin/end vars to indicate what part - // of main grid to read from or write to based on - // the current neighbor being processed. - IdxTuple copy_begin = gp->get_allocs(); - IdxTuple copy_end = gp->get_allocs(); - - // Adjust along domain dims in this grid. 
- for (auto& dim : _dims->_domain_dims.getDims()) { - auto& dname = dim.getName(); - if (gp->is_dim_used(dname)) { - - // Init range to whole rank domain (including - // outer halos). These may be changed below - // depending on the neighbor's direction. - copy_begin[dname] = first_outer_idx[dname]; - copy_end[dname] = last_outer_idx[dname] + 1; // end = last + 1. - - // Neighbor direction in this dim. - auto neigh_ofs = neigh_offsets[dname]; - - // Region to read from, i.e., data from inside - // this rank's domain to be put into neighbor's - // halo. - if (bd == MPIBufs::bufSend) { - - // Neighbor is to the left. - if (neigh_ofs == idx_t(MPIInfo::rank_prev)) { - - // Only read slice as wide as halo from beginning. - copy_end[dname] = first_inner_idx[dname] + neigh_halo_sizes[dname]; - } - - // Neighbor is to the right. - else if (neigh_ofs == idx_t(MPIInfo::rank_next)) { - - // Only read slice as wide as halo before end. - copy_begin[dname] = last_inner_idx[dname] + 1 - neigh_halo_sizes[dname]; - } - - // Else, this neighbor is in same posn as I am in this dim, - // so we leave the default begin/end settings. - } - - // Region to write to, i.e., into this rank's halo. - else if (bd == MPIBufs::bufRecv) { - - // Neighbor is to the left. - if (neigh_ofs == idx_t(MPIInfo::rank_prev)) { - - // Only read slice as wide as halo before beginning. - copy_begin[dname] = first_inner_idx[dname] - my_halo_sizes[dname]; - copy_end[dname] = first_inner_idx[dname]; - } - - // Neighbor is to the right. - else if (neigh_ofs == idx_t(MPIInfo::rank_next)) { - - // Only read slice as wide as halo after end. - copy_begin[dname] = last_inner_idx[dname] + 1; - copy_end[dname] = last_inner_idx[dname] + 1 + my_halo_sizes[dname]; - } - - // Else, this neighbor is in same posn as I am in this dim, - // so we leave the default begin/end settings. - } - } // domain dims in this grid. - } // domain dims. - - // Sizes of buffer in all dims of this grid. 
- // Also, set begin/end value for non-domain dims. - IdxTuple buf_sizes = gp->get_allocs(); - bool vlen_mults = true; - for (auto& dname : gp->get_dim_names()) { - idx_t dsize = 1; - - // domain dim? - if (_dims->_domain_dims.lookup(dname)) { - dsize = copy_end[dname] - copy_begin[dname]; - - // Check whether size is multiple of vlen. - auto vlen = _dims->_fold_pts[dname]; - if (dsize % vlen != 0) - vlen_mults = false; - } - - // step dim? - // Allowing only one step to be exchanged. - // TODO: consider exchanging mutiple steps at once for WFs. - else if (dname == _dims->_step_dim) { - - // Use 0..1 as a place-holder range. - // The actual values will be supplied during - // halo exchange. - copy_begin[dname] = 0; - copy_end[dname] = 1; - } - - // misc? - // Copy over entire range. - // TODO: make dirty flags for misc dims in grids. - else { - dsize = gp->get_alloc_size(dname); - copy_begin[dname] = gp->get_first_misc_index(dname); - copy_end[dname] = gp->get_last_misc_index(dname) + 1; - } - - // Save computed size. - buf_sizes[dname] = dsize; - - } // all dims in this grid. - - // Does buffer have non-zero size? - if (buf_sizes.size() == 0 || buf_sizes.product() == 0) { - TRACE_MSG("no halo exchange needed for grid '" << gname << - "' with rank " << neigh_rank << - " because there is no data to exchange"); - continue; - } - - // At this point, buf_sizes, copy_begin, and copy_end - // should be set for each dim in this grid. - // Convert end to last. - IdxTuple copy_last = copy_end.subElements(1); - - // Unique name for buffer based on grid name, direction, and ranks. - ostringstream oss; - oss << gname; - if (bd == MPIBufs::bufSend) - oss << "_send_halo_from_" << me << "_to_" << neigh_rank; - else if (bd == MPIBufs::bufRecv) - oss << "_recv_halo_from_" << neigh_rank << "_to_" << me; - string bufname = oss.str(); - - // Make MPI data entry for this grid. 
- auto gbp = mpiData.emplace(gname, _mpiInfo); - auto& gbi = gbp.first; // iterator from pair returned by emplace(). - auto& gbv = gbi->second; // value from iterator. - auto& buf = gbv.getBuf(MPIBufs::BufDir(bd), neigh_offsets); - - // Config buffer for this grid. - // (But don't allocate storage yet.) - buf.begin_pt = copy_begin; - buf.last_pt = copy_last; - buf.num_pts = buf_sizes; - buf.name = bufname; - buf.has_all_vlen_mults = vlen_mults; - - TRACE_MSG("configured MPI buffer object '" << buf.name << - "' for rank at relative offsets " << - neigh_offsets.subElements(1).makeDimValStr() << " with " << - buf.num_pts.makeDimValStr(" * ") << " = " << buf.get_size() << - " element(s) at " << buf.begin_pt.makeDimValStr() << - " ... " << buf.last_pt.makeDimValStr()); - num_exchanges++; - - } // send, recv. - } // grids. - }); // neighbors. - TRACE_MSG("number of halo-exchanges needed on this rank: " << num_exchanges); - - // Base ptrs for all alloc'd data. - // These pointers will be shared by the ones in the grid - // objects, which will take over ownership when these go - // out of scope. - map > _mpi_data_buf; - - // Allocate MPI buffers. - // Pass 0: count required size, allocate chunk of memory at end. - // Pass 1: distribute parts of already-allocated memory chunk. - for (int pass = 0; pass < 2; pass++) { - TRACE_MSG("allocMpiData pass " << pass << " for " << - mpiData.size() << " MPI buffer set(s)"); - - // Count bytes needed and number of buffers for each NUMA node. - map npbytes, nbufs; - - // Grids. - for (auto gp : gridPtrs) { - if (!gp) - continue; - auto& gname = gp->get_name(); - int numa_pref = gp->get_numa_preferred(); - - // MPI bufs for this grid. - if (mpiData.count(gname)) { - auto& grid_mpi_data = mpiData.at(gname); - - // Visit buffers for each neighbor for this grid. - grid_mpi_data.visitNeighbors - ([&](const IdxTuple& roffsets, - int rank, - int idx, - MPIBufs& bufs) { - - // Send and recv. 
- for (int bd = 0; bd < MPIBufs::nBufDirs; bd++) { - auto& buf = grid_mpi_data.getBuf(MPIBufs::BufDir(bd), roffsets); - if (buf.get_size() == 0) - continue; - - // Set storage if buffer has been allocated in pass 0. - if (pass == 1) { - auto p = _mpi_data_buf[numa_pref]; - assert(p); - buf.set_storage(p, npbytes[numa_pref]); - } - - // Determine padded size (also offset to next location). - auto sbytes = buf.get_bytes(); - npbytes[numa_pref] += ROUND_UP(sbytes + _data_buf_pad, - CACHELINE_BYTES); - nbufs[numa_pref]++; - if (pass == 0) - TRACE_MSG(" MPI buf '" << buf.name << "' needs " << - makeByteStr(sbytes) << - " on NUMA node " << numa_pref); - } - } ); - } - } - - // Alloc for each node. - if (pass == 0) - _alloc_data(npbytes, nbufs, _mpi_data_buf, "MPI buffer"); - - } // MPI passes. -#endif - } - - // Allocate memory for scratch grids based on number of threads and - // block sizes. - void StencilContext::allocScratchData(ostream& os) { - - // Remove any old scratch data. - freeScratchData(os); - - // Base ptrs for all alloc'd data. - // This pointer will be shared by the ones in the grid - // objects, which will take over ownership when it goes - // out of scope. - map > _scratch_data_buf; - - // Make sure the right number of threads are set so we - // have the right number of scratch grids. - int rthreads = set_region_threads(); - - // Delete any existing scratch grids. - // Create new scratch grids. - makeScratchGrids(rthreads); - - // Pass 0: count required size, allocate chunk of memory at end. - // Pass 1: distribute parts of already-allocated memory chunk. - for (int pass = 0; pass < 2; pass++) { - TRACE_MSG("allocScratchData pass " << pass << " for " << - scratchVecs.size() << " set(s) of scratch grids"); - - // Count bytes needed and number of grids for each NUMA node. - map npbytes, ngrids; - - // Loop through each scratch grid vector. - for (auto* sgv : scratchVecs) { - assert(sgv); - - // Loop through each scratch grid in this vector. 
- // There will be one for each region thread. - assert(int(sgv->size()) == rthreads); - int thr_num = 0; - for (auto gp : *sgv) { - assert(gp); - auto& gname = gp->get_name(); - int numa_pref = gp->get_numa_preferred(); - - // Loop through each domain dim. - for (auto& dim : _dims->_domain_dims.getDims()) { - auto& dname = dim.getName(); - - if (gp->is_dim_used(dname)) { - - // Set domain size of grid to block size. - gp->_set_domain_size(dname, _opts->_block_sizes[dname]); - - // Pads. - // Set via both 'extra' and 'min'; larger result will be used. - gp->set_extra_pad_size(dname, _opts->_extra_pad_sizes[dname]); - gp->set_min_pad_size(dname, _opts->_min_pad_sizes[dname]); - } - } // dims. - - // Set storage if buffer has been allocated. - if (pass == 1) { - auto p = _scratch_data_buf[numa_pref]; - assert(p); - gp->set_storage(p, npbytes[numa_pref]); - TRACE_MSG(gp->make_info_string()); - } - - // Determine size used (also offset to next location). - size_t nbytes = gp->get_num_storage_bytes(); - npbytes[numa_pref] += ROUND_UP(nbytes + _data_buf_pad, - CACHELINE_BYTES); - ngrids[numa_pref]++; - if (pass == 0) - TRACE_MSG(" scratch grid '" << gname << "' for thread " << - thr_num << " needs " << makeByteStr(nbytes) << - " on NUMA node " << numa_pref); - thr_num++; - } // scratch grids. - } // scratch-grid vecs. - - // Alloc for each node. - if (pass == 0) - _alloc_data(npbytes, ngrids, _scratch_data_buf, "scratch grid"); - - } // scratch-grid passes. - } // Adjust offsets of scratch grids based // on thread and scan indices. @@ -1773,362 +1013,6 @@ namespace yask { } } - - // Set non-scratch grid sizes and offsets based on settings. - // Set wave-front settings. - // This should be called anytime a setting or rank offset is changed. - void StencilContext::update_grids() - { - assert(_opts); - - // Reset halos to zero. - max_halos = _dims->_domain_dims; - - // Loop through each non-scratch grid. 
- for (auto gp : gridPtrs) { - assert(gp); - - // Ignore manually-sized grid. - if (gp->is_fixed_size()) - continue; - - // Loop through each domain dim. - for (auto& dim : _dims->_domain_dims.getDims()) { - auto& dname = dim.getName(); - - if (gp->is_dim_used(dname)) { - - // Rank domains. - gp->_set_domain_size(dname, _opts->_rank_sizes[dname]); - - // Pads. - // Set via both 'extra' and 'min'; larger result will be used. - gp->set_extra_pad_size(dname, _opts->_extra_pad_sizes[dname]); - gp->set_min_pad_size(dname, _opts->_min_pad_sizes[dname]); - - // Offsets. - gp->_set_offset(dname, rank_domain_offsets[dname]); - - // Update max halo across grids, used for wavefront angles. - max_halos[dname] = max(max_halos[dname], gp->get_left_halo_size(dname)); - max_halos[dname] = max(max_halos[dname], gp->get_right_halo_size(dname)); - } - } - } // grids. - - // Calculate wave-front settings based on max halos. - // See the wavefront diagram in run_solution() for description - // of angles and extensions. - auto& step_dim = _dims->_step_dim; - auto wf_steps = _opts->_region_sizes[step_dim]; - num_wf_shifts = 0; - if (wf_steps > 1) - - // TODO: don't shift for scratch grids. - num_wf_shifts = max((idx_t(stBundles.size()) * wf_steps) - 1, idx_t(0)); - for (auto& dim : _dims->_domain_dims.getDims()) { - auto& dname = dim.getName(); - auto rksize = _opts->_rank_sizes[dname]; - auto nranks = _opts->_num_ranks[dname]; - - // Determine the max spatial skewing angles for temporal - // wave-fronts based on the max halos. We only need non-zero - // angles if the region size is less than the rank size and - // there are no other ranks in this dim, i.e., if the region - // covers the global domain in a given dim, no wave-front is - // needed in that dim. TODO: make rounding-up an option. 
- idx_t angle = 0; - if (_opts->_region_sizes[dname] < rksize || nranks > 0) - angle = ROUND_UP(max_halos[dname], _dims->_cluster_pts[dname]); - wf_angles[dname] = angle; - - // Determine the total WF shift to be added in each dim. - idx_t shifts = angle * num_wf_shifts; - wf_shifts[dname] = shifts; - - // Is domain size at least as large as halo + wf_ext in direction - // when there are multiple ranks? - auto min_size = max_halos[dname] + shifts; - if (_opts->_num_ranks[dname] > 1 && rksize < min_size) { - THROW_YASK_EXCEPTION("Error: rank-domain size of " << rksize << " in '" << - dname << "' dim is less than minimum size of " << min_size << - ", which is based on stencil halos and temporal wave-front sizes"); - } - - // If there is another rank to the left, set wave-front - // extension on the left. - left_wf_exts[dname] = _opts->is_first_rank(dname) ? 0 : shifts; - - // If there is another rank to the right, set wave-front - // extension on the right. - right_wf_exts[dname] = _opts->is_last_rank(dname) ? 0 : shifts; - } - - // Now that wave-front settings are known, we can push this info - // back to the grids. It's useful to store this redundant info - // in the grids, because there it's indexed by grid dims instead - // of domain dims. This makes it faster to do grid indexing. - for (auto gp : gridPtrs) { - assert(gp); - - // Ignore manually-sized grid. - if (gp->is_fixed_size()) - continue; - - // Loop through each domain dim. - for (auto& dim : _dims->_domain_dims.getDims()) { - auto& dname = dim.getName(); - if (gp->is_dim_used(dname)) { - - // Set extensions to be the same as the global ones. - gp->_set_left_wf_ext(dname, left_wf_exts[dname]); - gp->_set_right_wf_ext(dname, right_wf_exts[dname]); - } - } - } - } - - // Allocate grids and MPI bufs. - // Initialize some data structures. - void StencilContext::prepare_solution() { - auto& step_dim = _dims->_step_dim; - - // Don't continue until all ranks are this far. 
- _env->global_barrier(); - - ostream& os = get_ostr(); -#ifdef DEBUG - os << "*** WARNING: YASK compiled with DEBUG; ignore performance results.\n"; -#endif -#if defined(NO_INTRINSICS) && (VLEN > 1) - os << "*** WARNING: YASK compiled with NO_INTRINSICS; ignore performance results.\n"; -#endif -#ifdef MODEL_CACHE - os << "*** WARNING: YASK compiled with MODEL_CACHE; ignore performance results.\n"; -#endif -#ifdef TRACE_MEM - os << "*** WARNING: YASK compiled with TRACE_MEM; ignore performance results.\n"; -#endif -#ifdef TRACE_INTRINSICS - os << "*** WARNING: YASK compiled with TRACE_INTRINSICS; ignore performance results.\n"; -#endif - - // reset time keepers. - clear_timers(); - - // Init auto-tuner to run silently during normal operation. - _at.clear(false, false); - - // Adjust all settings before setting MPI buffers or sizing grids. - // Prints final settings. - // TODO: print settings again after auto-tuning. - _opts->adjustSettings(os, _env); - - // Report ranks. - os << endl; - os << "Num ranks: " << _env->get_num_ranks() << endl; - os << "This rank index: " << _env->get_rank_index() << endl; - - // report threads. - os << "Num OpenMP procs: " << omp_get_num_procs() << endl; - set_all_threads(); - os << "Num OpenMP threads: " << omp_get_max_threads() << endl; - set_region_threads(); // Temporary; just for reporting. - os << " Num threads per region: " << omp_get_max_threads() << endl; - set_block_threads(); // Temporary; just for reporting. - os << " Num threads per block: " << omp_get_max_threads() << endl; - - // Set the number of threads for a region. It should stay this - // way for top-level OpenMP parallel sections. - int rthreads = set_region_threads(); - - // Run a dummy nested OMP loop to make sure nested threading is - // initialized. 
-#ifdef _OPENMP -#pragma omp parallel for - for (int i = 0; i < rthreads * 100; i++) { - - idx_t dummy = 0; - set_block_threads(); -#pragma omp parallel for reduction(+:dummy) - for (int j = 0; j < i * 100; j++) { - dummy += j; - } - } -#endif - - // Some grid stats. - os << endl; - os << "Num grids: " << gridPtrs.size() << endl; - os << "Num grids to be updated: " << outputGridPtrs.size() << endl; - - // Set up data based on MPI rank, including grid positions. - // Update all the grid sizes. - setupRank(); - - // Alloc grids, scratch grids, MPI bufs. - // This is the order in which preferred NUMA nodes (e.g., HBW mem) - // will be used. - // We free the scratch and MPI data first to give grids preference. - freeScratchData(os); - freeMpiData(os); - allocGridData(os); - allocScratchData(os); - allocMpiData(os); - - // Report total allocation. - rank_nbytes = get_num_bytes(); - os << "Total allocation in this rank: " << - makeByteStr(rank_nbytes) << "\n"; - tot_nbytes = sumOverRanks(rank_nbytes, _env->comm); - os << "Total overall allocation in " << _env->num_ranks << " rank(s): " << - makeByteStr(tot_nbytes) << "\n"; - - // Report some stats. 
- idx_t dt = _opts->_rank_sizes[step_dim]; - os << "\nProblem sizes in points (from smallest to largest):\n" - " vector-size: " << _dims->_fold_pts.makeDimValStr(" * ") << endl << - " cluster-size: " << _dims->_cluster_pts.makeDimValStr(" * ") << endl << - " sub-block-size: " << _opts->_sub_block_sizes.makeDimValStr(" * ") << endl << - " sub-block-group-size: " << _opts->_sub_block_group_sizes.makeDimValStr(" * ") << endl << - " block-size: " << _opts->_block_sizes.makeDimValStr(" * ") << endl << - " block-group-size: " << _opts->_block_group_sizes.makeDimValStr(" * ") << endl << - " region-size: " << _opts->_region_sizes.makeDimValStr(" * ") << endl << - " rank-domain-size: " << _opts->_rank_sizes.makeDimValStr(" * ") << endl << - " overall-problem-size: " << overall_domain_sizes.makeDimValStr(" * ") << endl << - endl << - "Other settings:\n" - " yask-version: " << yask_get_version_string() << endl << - " stencil-name: " << get_name() << endl << - " element-size: " << makeByteStr(get_element_bytes()) << endl << -#ifdef USE_MPI - " num-ranks: " << _opts->_num_ranks.makeDimValStr(" * ") << endl << - " rank-indices: " << _opts->_rank_indices.makeDimValStr() << endl << - " rank-domain-offsets: " << rank_domain_offsets.makeDimValOffsetStr() << endl << -#endif - " rank-domain: " << rank_bb.bb_begin.makeDimValStr() << - " ... 
" << rank_bb.bb_end.subElements(1).makeDimValStr() << endl << - " vector-len: " << VLEN << endl << - " extra-padding: " << _opts->_extra_pad_sizes.makeDimValStr() << endl << - " minimum-padding: " << _opts->_min_pad_sizes.makeDimValStr() << endl << - " L1-prefetch-distance: " << PFD_L1 << endl << - " L2-prefetch-distance: " << PFD_L2 << endl << - " max-halos: " << max_halos.makeDimValStr() << endl; - if (num_wf_shifts > 0) { - os << - " wave-front-angles: " << wf_angles.makeDimValStr() << endl << - " num-wave-front-shifts: " << num_wf_shifts << endl << - " wave-front-shift-lens: " << wf_shifts.makeDimValStr() << endl << - " left-wave-front-exts: " << left_wf_exts.makeDimValStr() << endl << - " right-wave-front-exts: " << right_wf_exts.makeDimValStr() << endl << - " ext-rank-domain: " << ext_bb.bb_begin.makeDimValStr() << - " ... " << ext_bb.bb_end.subElements(1).makeDimValStr() << endl; - } - os << endl; - - // sums across bundles for this rank. - rank_numWrites_1t = 0; - rank_reads_1t = 0; - rank_numFpOps_1t = 0; - os << "Num stencil bundles: " << stBundles.size() << endl; - for (auto* sg : stBundles) { - idx_t updates1 = sg->get_scalar_points_written(); - idx_t updates_domain = updates1 * sg->bb_num_points; - rank_numWrites_1t += updates_domain; - idx_t reads1 = sg->get_scalar_points_read(); - idx_t reads_domain = reads1 * sg->bb_num_points; - rank_reads_1t += reads_domain; - idx_t fpops1 = sg->get_scalar_fp_ops(); - idx_t fpops_domain = fpops1 * sg->bb_num_points; - rank_numFpOps_1t += fpops_domain; - os << "Stats for bundle '" << sg->get_name() << "':\n" << - " sub-domain: " << sg->bb_begin.makeDimValStr() << - " ... 
" << sg->bb_end.subElements(1).makeDimValStr() << endl << - " sub-domain size: " << sg->bb_len.makeDimValStr(" * ") << endl << - " valid points in sub domain: " << makeNumStr(sg->bb_num_points) << endl << - " grid-updates per point: " << updates1 << endl << - " grid-updates in sub-domain: " << makeNumStr(updates_domain) << endl << - " grid-reads per point: " << reads1 << endl << - " grid-reads in sub-domain: " << makeNumStr(reads_domain) << endl << - " est FP-ops per point: " << fpops1 << endl << - " est FP-ops in sub-domain: " << makeNumStr(fpops_domain) << endl; - } - - // Various metrics for amount of work. - rank_numWrites_dt = rank_numWrites_1t * dt; - tot_numWrites_1t = sumOverRanks(rank_numWrites_1t, _env->comm); - tot_numWrites_dt = tot_numWrites_1t * dt; - - rank_reads_dt = rank_reads_1t * dt; - tot_reads_1t = sumOverRanks(rank_reads_1t, _env->comm); - tot_reads_dt = tot_reads_1t * dt; - - rank_numFpOps_dt = rank_numFpOps_1t * dt; - tot_numFpOps_1t = sumOverRanks(rank_numFpOps_1t, _env->comm); - tot_numFpOps_dt = tot_numFpOps_1t * dt; - - rank_domain_1t = rank_bb.bb_num_points; - rank_domain_dt = rank_domain_1t * dt; // same as _opts->_rank_sizes.product(); - tot_domain_1t = sumOverRanks(rank_domain_1t, _env->comm); - tot_domain_dt = tot_domain_1t * dt; - - // Print some more stats. 
- os << endl << - "Amount-of-work stats:\n" << - " domain-size in this rank for one time-step: " << - makeNumStr(rank_domain_1t) << endl << - " overall-problem-size in all ranks for one time-step: " << - makeNumStr(tot_domain_1t) << endl << - endl << - " num-writes-required in this rank for one time-step: " << - makeNumStr(rank_numWrites_1t) << endl << - " num-writes-required in all ranks for one time-step: " << - makeNumStr(tot_numWrites_1t) << endl << - endl << - " num-reads-required in this rank for one time-step: " << - makeNumStr(rank_reads_1t) << endl << - " num-reads-required in all ranks for one time-step: " << - makeNumStr(tot_reads_1t) << endl << - endl << - " est-FP-ops in this rank for one time-step: " << - makeNumStr(rank_numFpOps_1t) << endl << - " est-FP-ops in all ranks for one time-step: " << - makeNumStr(tot_numFpOps_1t) << endl << - endl; - - if (dt > 1) { - os << - " domain-size in this rank for all time-steps: " << - makeNumStr(rank_domain_dt) << endl << - " overall-problem-size in all ranks for all time-steps: " << - makeNumStr(tot_domain_dt) << endl << - endl << - " num-writes-required in this rank for all time-steps: " << - makeNumStr(rank_numWrites_dt) << endl << - " num-writes-required in all ranks for all time-steps: " << - makeNumStr(tot_numWrites_dt) << endl << - endl << - " num-reads-required in this rank for all time-steps: " << - makeNumStr(rank_reads_dt) << endl << - " num-reads-required in all ranks for all time-steps: " << - makeNumStr(tot_reads_dt) << endl << - endl << - " est-FP-ops in this rank for all time-steps: " << - makeNumStr(rank_numFpOps_dt) << endl << - " est-FP-ops in all ranks for all time-steps: " << - makeNumStr(tot_numFpOps_dt) << endl << - endl; - } - os << - "Notes:\n" - " Domain-sizes and overall-problem-sizes are based on rank-domain sizes\n" - " and number of ranks regardless of number of grids or sub-domains.\n" - " Num-writes-required is based on sum of grid-updates in sub-domain across 
stencil-bundle(s).\n" - " Num-reads-required is based on sum of grid-reads in sub-domain across stencil-bundle(s).\n" - " Est-FP-ops are based on sum of est-FP-ops in sub-domain across stencil-bundle(s).\n" - "\n"; - } - /// Get statistics associated with preceding calls to run_solution(). yk_stats_ptr StencilContext::get_stats() { ostream& os = get_ostr(); @@ -2177,38 +1061,6 @@ namespace yask { return p; } - // Dealloc grids, etc. - void StencilContext::end_solution() { - - // Final halo exchange. - exchange_halos_all(); - - // Release any MPI data. - mpiData.clear(); - - // Release grid data. - for (auto gp : gridPtrs) { - if (!gp) - continue; - gp->release_storage(); - } - - // Reset threads to original value. - set_max_threads(); - } - - // Init all grids & params by calling initFn. - void StencilContext::initValues(function realInitFn) { - ostream& os = get_ostr(); - real_t v = 0.1; - os << "Initializing grids..." << endl; - for (auto gp : gridPtrs) { - realInitFn(gp, v); - v += 0.01; - } - } - // Compare grids in contexts. // Return number of mis-compares. idx_t StencilContext::compareData(const StencilContext& ref) const { @@ -2228,82 +1080,6 @@ namespace yask { return errs; } - // Compute convenience values for a bounding-box. - void BoundingBox::update_bb(ostream& os, - const string& name, - StencilContext& context, - bool force_full) { - - auto dims = context.get_dims(); - auto& domain_dims = dims->_domain_dims; - bb_len = bb_end.subElements(bb_begin); - bb_size = bb_len.product(); - if (force_full) - bb_num_points = bb_size; - - // Solid rectangle? - bb_is_full = true; - if (bb_num_points != bb_size) { - os << "Warning: '" << name << "' domain has only " << - makeNumStr(bb_num_points) << - " valid point(s) inside its bounding-box of " << - makeNumStr(bb_size) << - " point(s); slower scalar calculations will be used.\n"; - bb_is_full = false; - } - - // Does everything start on a vector-length boundary? 
- bb_is_aligned = true; - for (auto& dim : domain_dims.getDims()) { - auto& dname = dim.getName(); - if ((bb_begin[dname] - context.rank_domain_offsets[dname]) % - dims->_fold_pts[dname] != 0) { - os << "Note: '" << name << "' domain" - " has one or more starting edges not on vector boundaries;" - " masked calculations will be used in peel and remainder sub-blocks.\n"; - bb_is_aligned = false; - break; - } - } - - // Lengths are cluster-length multiples? - bb_is_cluster_mult = true; - for (auto& dim : domain_dims.getDims()) { - auto& dname = dim.getName(); - if (bb_len[dname] % dims->_cluster_pts[dname] != 0) { - if (bb_is_full && bb_is_aligned) - os << "Note: '" << name << "' domain" - " has one or more sizes that are not vector-cluster multiples;" - " masked calculations will be used in peel and remainder sub-blocks.\n"; - bb_is_cluster_mult = false; - break; - } - } - - // All done. - bb_valid = true; - } - - // Set the bounding-box for each stencil-bundle and whole domain. - void StencilContext::find_bounding_boxes() - { - ostream& os = get_ostr(); - - // Rank BB is based only on rank offsets and rank domain sizes. - rank_bb.bb_begin = rank_domain_offsets; - rank_bb.bb_end = rank_domain_offsets.addElements(_opts->_rank_sizes, false); - rank_bb.update_bb(os, "rank", *this, true); - - // Overall BB may be extended for wave-fronts. - ext_bb.bb_begin = rank_bb.bb_begin.subElements(left_wf_exts); - ext_bb.bb_end = rank_bb.bb_end.addElements(right_wf_exts); - ext_bb.update_bb(os, "extended-rank", *this, true); - - // Find BB for each bundle. - for (auto sg : stBundles) - sg->find_bounding_box(); - } - // Exchange dirty halo data for all grids and all steps, regardless // of their stencil-bundle. // TODO: loop through all grids in exchange_halos() instead. 
diff --git a/src/kernel/lib/grid_apis.cpp b/src/kernel/lib/grid_apis.cpp new file mode 100644 index 00000000..d0e163ba --- /dev/null +++ b/src/kernel/lib/grid_apis.cpp @@ -0,0 +1,387 @@ +/***************************************************************************** + +YASK: Yet Another Stencil Kernel +Copyright (c) 2014-2018, Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to +deal in the Software without restriction, including without limitation the +rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +sell copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +* The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +IN THE SOFTWARE. + +*****************************************************************************/ + +// Implement methods for yk_grid APIs. + +#include "yask.hpp" +using namespace std; + +namespace yask { + + // APIs to get info from vars. 
+#define GET_GRID_API(api_name, expr, step_ok, domain_ok, misc_ok, prep_req) \ + idx_t YkGridBase::api_name(const string& dim) const { \ + checkDimType(dim, #api_name, step_ok, domain_ok, misc_ok); \ + int posn = get_dim_posn(dim, true, #api_name); \ + if (prep_req && _offsets[posn] < 0) \ + THROW_YASK_EXCEPTION("Error: '" #api_name "()' called on grid '" << \ + get_name() << "' before calling 'prepare_solution()'"); \ + return expr; \ + } \ + idx_t YkGridBase::api_name(int posn) const { \ + return expr; \ + } + GET_GRID_API(get_rank_domain_size, _domains[posn], false, true, false, false) + GET_GRID_API(get_left_pad_size, _left_pads[posn], false, true, false, false) // _left_pads is actual size. + GET_GRID_API(get_right_pad_size, _allocs[posn] - _left_pads[posn], false, true, false, false) // _right_pads is request only. + GET_GRID_API(get_pad_size, _left_pads[posn], false, true, false, false) + GET_GRID_API(get_left_halo_size, _left_halos[posn], false, true, false, false) + GET_GRID_API(get_right_halo_size, _right_halos[posn], false, true, false, false) + GET_GRID_API(get_halo_size, _left_halos[posn], false, true, false, false) + GET_GRID_API(get_first_misc_index, _offsets[posn], false, false, true, false) + GET_GRID_API(get_last_misc_index, _offsets[posn] + _domains[posn] - 1, false, false, true, false) + GET_GRID_API(get_left_extra_pad_size, _left_pads[posn] - _left_halos[posn], false, true, false, false) + GET_GRID_API(get_right_extra_pad_size, (_allocs[posn] - _left_pads[posn] - _domains[posn]) - + _right_halos[posn], false, true, false, false) + GET_GRID_API(get_extra_pad_size, _left_pads[posn] - _left_halos[posn], false, true, false, false) + GET_GRID_API(get_alloc_size, _allocs[posn], true, true, true, false) + GET_GRID_API(get_first_rank_domain_index, _offsets[posn] - _local_offsets[posn], false, true, false, true) + GET_GRID_API(get_last_rank_domain_index, _offsets[posn] - _local_offsets[posn] + _domains[posn] - 1; + assert(!_is_scratch), false, true, 
false, true) + GET_GRID_API(get_first_rank_halo_index, _offsets[posn] - _left_halos[posn], false, false, true, true) + GET_GRID_API(get_last_rank_halo_index, _offsets[posn] + _domains[posn] + _right_halos[posn] - 1, false, false, true, true) + GET_GRID_API(get_first_rank_alloc_index, _offsets[posn] - _left_pads[posn], false, true, false, true) + GET_GRID_API(get_last_rank_alloc_index, _offsets[posn] - _left_pads[posn] + _allocs[posn] - 1, false, true, false, true) + GET_GRID_API(_get_left_wf_ext, _left_wf_exts[posn], true, true, true, false) + GET_GRID_API(_get_right_wf_ext, _right_wf_exts[posn], true, true, true, false) + GET_GRID_API(_get_offset, _offsets[posn], true, true, true, true) + GET_GRID_API(_get_local_offset, _local_offsets[posn], true, true, true, false) + GET_GRID_API(_get_first_alloc_index, _offsets[posn] - _left_pads[posn], true, true, true, true) + GET_GRID_API(_get_last_alloc_index, _offsets[posn] - _left_pads[posn] + _allocs[posn] - 1, true, true, true, true) +#undef GET_GRID_API + + // APIs to set vars. +#define COMMA , +#define SET_GRID_API(api_name, expr, step_ok, domain_ok, misc_ok) \ + void YkGridBase::api_name(const string& dim, idx_t n) { \ + TRACE_MSG0(get_ostr(), "grid '" << get_name() << "'." 
\ + #api_name "('" << dim << "', " << n << ")"); \ + checkDimType(dim, #api_name, step_ok, domain_ok, misc_ok); \ + int posn = get_dim_posn(dim, true, #api_name); \ + expr; \ + } \ + void YkGridBase::api_name(int posn, idx_t n) { \ + int dim = posn; \ + expr; \ + } + SET_GRID_API(_set_offset, _offsets[posn] = n, true, true, true) + SET_GRID_API(_set_local_offset, _local_offsets[posn] = n; + _vec_local_offsets[posn] = n / _vec_lens[posn], true, true, true) + SET_GRID_API(_set_domain_size, _domains[posn] = n; resize(), true, true, true) + SET_GRID_API(_set_left_pad_size, _left_pads[posn] = n; resize(), true, true, true) + SET_GRID_API(_set_right_pad_size, _right_pads[posn] = n; resize(), true, true, true) + SET_GRID_API(_set_left_wf_ext, _left_wf_exts[posn] = n; resize(), true, true, true) + SET_GRID_API(_set_right_wf_ext, _right_wf_exts[posn] = n; resize(), true, true, true) + SET_GRID_API(set_left_halo_size, _left_halos[posn] = n; resize(), false, true, false) + SET_GRID_API(set_right_halo_size, _right_halos[posn] = n; resize(), false, true, false) + SET_GRID_API(set_halo_size, _left_halos[posn] = _right_halos[posn] = n; resize(), false, true, false) + + SET_GRID_API(set_alloc_size, _set_domain_size(posn, n), true, false, true) + SET_GRID_API(set_left_min_pad_size, + if (!get_raw_storage_buffer() && n > _left_pads[posn]) + _set_left_pad_size(posn, n), + false, true, false) + SET_GRID_API(set_right_min_pad_size, + if (!get_raw_storage_buffer() && n > _right_pads[posn]) + _set_right_pad_size(posn, n), + false, true, false) + SET_GRID_API(set_min_pad_size, + if (!get_raw_storage_buffer() && n > _left_pads[posn]) + _set_left_pad_size(posn, n); + if (!get_raw_storage_buffer() && n > _right_pads[posn]) + _set_right_pad_size(posn, n), + false, true, false) + SET_GRID_API(set_left_extra_pad_size, + set_left_min_pad_size(posn, _left_halos[posn] + _left_wf_exts[posn] + n), false, true, false) + SET_GRID_API(set_right_extra_pad_size, + set_right_min_pad_size(posn, 
_right_halos[posn] + _right_wf_exts[posn] + n), false, true, false) + SET_GRID_API(set_extra_pad_size, set_left_extra_pad_size(posn, n); + set_right_extra_pad_size(posn, n), false, true, false) + SET_GRID_API(set_first_misc_index, _offsets[posn] = n, false, false, true) +#undef COMMA +#undef SET_GRID_API + + bool YkGridBase::is_storage_layout_identical(const yk_grid_ptr other) const { + auto op = dynamic_pointer_cast(other); + assert(op); + + // Same size? + if (get_num_storage_bytes() != op->get_num_storage_bytes()) + return false; + + // Same dims? + if (get_num_dims() != op->get_num_dims()) + return false; + for (int i = 0; i < get_num_dims(); i++) { + auto dname = get_dim_name(i); + + // Same dims? + if (dname != op->get_dim_name(i)) + return false; + + // Same sizes? + // NB: not checking right pads because actual values + // are determined as function of other 3. + if (_allocs[i] != op->_allocs[i]) + return false; + if (_domains[i] != op->_domains[i]) + return false; + if (_left_pads[i] != op->_left_pads[i]) + return false; + } + return true; + } + + void YkGridBase::share_storage(yk_grid_ptr source) { + auto sp = dynamic_pointer_cast(source); + assert(sp); + + if (!sp->get_raw_storage_buffer()) { + THROW_YASK_EXCEPTION("Error: share_storage() called without source storage allocated"); + } + + // Determine required padding from halos. + Indices left_pads2 = getReqdPad(_left_halos, _left_wf_exts); + Indices right_pads2 = getReqdPad(_right_halos, _left_wf_exts); + + // NB: requirements to successful share_storage() is not as strict as + // is_storage_layout_identical(). See note on pad & halo below and API docs. + for (int i = 0; i < get_num_dims(); i++) { + auto dname = get_dim_name(i); + + // Same dims? + if (sp->get_num_dims() != get_num_dims() || + sp->get_dim_name(i) != dname) + THROW_YASK_EXCEPTION("Error: share_storage() called with incompatible grids: " << + make_info_string() << " and " << sp->make_info_string()); + + + // Check folding. 
+ if (_vec_lens[i] != sp->_vec_lens[i]) { + THROW_YASK_EXCEPTION("Error: attempt to share storage from grid '" << sp->get_name() << + "' of fold-length " << sp->_vec_lens[i] << " with grid '" << get_name() << + "' of fold-length " << _vec_lens[i] << " in '" << dname << "' dim"); + } + + // Not a domain dim? + bool is_domain = _dims->_domain_dims.lookup(dname) != 0; + if (!is_domain) { + auto tas = get_alloc_size(dname); + auto sas = sp->get_alloc_size(dname); + if (tas != sas) { + THROW_YASK_EXCEPTION("Error: attempt to share storage from grid '" << sp->get_name() << + "' of alloc-size " << sas << " with grid '" << get_name() << + "' of alloc-size " << tas << " in '" << dname << "' dim"); + } + } + + // Domain dim. + else { + auto tdom = get_rank_domain_size(i); + auto sdom = sp->get_rank_domain_size(i); + if (tdom != sdom) { + THROW_YASK_EXCEPTION("Error: attempt to share storage from grid '" << sp->get_name() << + "' of domain-size " << sdom << " with grid '" << get_name() << + "' of domain-size " << tdom << " in '" << dname << "' dim"); + } + + // Halo and pad sizes don't have to be the same. + // Requirement is that halo (reqd pad) of target fits inside of pad of source. + auto spad = sp->get_left_pad_size(i); + if (left_pads2[i] > spad) { + THROW_YASK_EXCEPTION("Error: attempt to share storage from grid '" << sp->get_name() << + "' of left padding-size " << spad << + ", which is insufficient for grid '" << get_name() << + "' requiring " << left_pads2[i] << " in '" << dname << "' dim"); + } + spad = sp->get_right_pad_size(i); + if (right_pads2[i] > spad) { + THROW_YASK_EXCEPTION("Error: attempt to share storage from grid '" << sp->get_name() << + "' of right padding-size " << spad << + ", which is insufficient for grid '" << get_name() << + "' requiring " << right_pads2[i] << " in '" << dname << "' dim"); + } + } + } + + // Copy pad sizes. 
+ for (int i = 0; i < get_num_dims(); i++) { + auto dname = get_dim_name(i); + bool is_domain = _dims->_domain_dims.lookup(dname) != 0; + if (is_domain) { + _left_pads[i] = sp->_left_pads[i]; + _right_pads[i] = sp->_right_pads[i]; + } + } + + // Copy data. + release_storage(); + resize(); + if (!share_data(sp.get(), true)) { + THROW_YASK_EXCEPTION("Error: unexpected failure in data sharing"); + } + } + + // API get, set, setc. + bool YkGridBase::is_element_allocated(const Indices& indices) const { + if (!is_storage_allocated()) + return false; + return checkIndices(indices, "is_element_allocated", false, false); + } + double YkGridBase::get_element(const Indices& indices) const { + if (!is_storage_allocated()) { + THROW_YASK_EXCEPTION("Error: call to 'get_element' with no data allocated for grid '" << + get_name() << "'"); + } + checkIndices(indices, "get_element", true, false); + idx_t asi = get_alloc_step_index(indices); + real_t val = readElem(indices, asi, __LINE__); + return double(val); + } + idx_t YkGridBase::set_element(double val, + const Indices& indices, + bool strict_indices) { + idx_t nup = 0; + if (get_raw_storage_buffer() && + checkIndices(indices, "set_element", strict_indices, false)) { + idx_t asi = get_alloc_step_index(indices); + writeElem(real_t(val), indices, asi, __LINE__); + nup++; + + // Set appropriate dirty flag. + set_dirty_using_alloc_index(true, asi); + } + return nup; + } + idx_t YkGridBase::add_to_element(double val, + const Indices& indices, + bool strict_indices) { + idx_t nup = 0; + if (get_raw_storage_buffer() && + checkIndices(indices, "add_to_element", strict_indices, false)) { + idx_t asi = get_alloc_step_index(indices); + addToElem(real_t(val), indices, asi, __LINE__); + nup++; + + // Set appropriate dirty flag. 
+ set_dirty_using_alloc_index(true, asi); + } + return nup; + } + + idx_t YkGridBase::get_elements_in_slice(void* buffer_ptr, + const Indices& first_indices, + const Indices& last_indices) const { + if (!is_storage_allocated()) { + THROW_YASK_EXCEPTION("Error: call to 'get_elements_in_slice' with no data allocated for grid '" << + get_name() << "'"); + } + checkIndices(first_indices, "get_elements_in_slice", true, false); + checkIndices(last_indices, "get_elements_in_slice", true, false); + + // Find range. + IdxTuple numElemsTuple = get_slice_range(first_indices, last_indices); + + // Visit points in slice. + numElemsTuple.visitAllPointsInParallel + ([&](const IdxTuple& ofs, size_t idx) { + Indices pt = first_indices.addElements(ofs); + + // TODO: move this outside of loop for const step index. + idx_t asi = get_alloc_step_index(pt); + + real_t val = readElem(pt, asi, __LINE__); + ((real_t*)buffer_ptr)[idx] = val; + return true; // keep going. + }); + return numElemsTuple.product(); + } + idx_t YkGridBase::set_elements_in_slice_same(double val, + const Indices& first_indices, + const Indices& last_indices, + bool strict_indices) { + if (!is_storage_allocated()) + return 0; + + // 'Fixed' copy of indices. + Indices first, last; + checkIndices(first_indices, "set_elements_in_slice_same", + strict_indices, false, &first); + checkIndices(last_indices, "set_elements_in_slice_same", + strict_indices, false, &last); + + // Find range. + IdxTuple numElemsTuple = get_slice_range(first, last); + + // Visit points in slice. + numElemsTuple.visitAllPointsInParallel([&](const IdxTuple& ofs, + size_t idx) { + Indices pt = first.addElements(ofs); + + // TODO: move this outside of loop for const step index. + idx_t asi = get_alloc_step_index(pt); + + writeElem(real_t(val), pt, asi, __LINE__); + return true; // keep going. + }); + + // Set appropriate dirty flag(s). 
+ set_dirty_in_slice(first, last); + + return numElemsTuple.product(); + } + idx_t YkGridBase::set_elements_in_slice(const void* buffer_ptr, + const Indices& first_indices, + const Indices& last_indices) { + if (!is_storage_allocated()) + return 0; + checkIndices(first_indices, "set_elements_in_slice", true, false); + checkIndices(last_indices, "set_elements_in_slice", true, false); + + // Find range. + IdxTuple numElemsTuple = get_slice_range(first_indices, last_indices); + + // Visit points in slice. + numElemsTuple.visitAllPointsInParallel + ([&](const IdxTuple& ofs, + size_t idx) { + Indices pt = first_indices.addElements(ofs); + + // TODO: move this outside of loop for const step index. + idx_t asi = get_alloc_step_index(pt); + + real_t val = ((real_t*)buffer_ptr)[idx]; + writeElem(val, pt, asi, __LINE__); + return true; // keep going. + }); + + // Set appropriate dirty flag(s). + set_dirty_in_slice(first_indices, last_indices); + + return numElemsTuple.product(); + } + +} // namespace. + diff --git a/src/kernel/lib/realv_grids.cpp b/src/kernel/lib/realv_grids.cpp index 1a4faa87..51ba3086 100644 --- a/src/kernel/lib/realv_grids.cpp +++ b/src/kernel/lib/realv_grids.cpp @@ -30,99 +30,6 @@ using namespace std; namespace yask { - // APIs to get info from vars. -#define GET_GRID_API(api_name, expr, step_ok, domain_ok, misc_ok, prep_req) \ - idx_t YkGridBase::api_name(const string& dim) const { \ - checkDimType(dim, #api_name, step_ok, domain_ok, misc_ok); \ - int posn = get_dim_posn(dim, true, #api_name); \ - if (prep_req && _offsets[posn] < 0) \ - THROW_YASK_EXCEPTION("Error: '" #api_name "()' called on grid '" << \ - get_name() << "' before calling 'prepare_solution()'"); \ - return expr; \ - } \ - idx_t YkGridBase::api_name(int posn) const { \ - return expr; \ - } - GET_GRID_API(get_rank_domain_size, _domains[posn], false, true, false, false) - GET_GRID_API(get_left_pad_size, _left_pads[posn], false, true, false, false) // _left_pads is actual size. 
- GET_GRID_API(get_right_pad_size, _allocs[posn] - _left_pads[posn], false, true, false, false) // _right_pads is request only. - GET_GRID_API(get_pad_size, _left_pads[posn], false, true, false, false) - GET_GRID_API(get_left_halo_size, _left_halos[posn], false, true, false, false) - GET_GRID_API(get_right_halo_size, _right_halos[posn], false, true, false, false) - GET_GRID_API(get_halo_size, _left_halos[posn], false, true, false, false) - GET_GRID_API(get_first_misc_index, _offsets[posn], false, false, true, false) - GET_GRID_API(get_last_misc_index, _offsets[posn] + _domains[posn] - 1, false, false, true, false) - GET_GRID_API(get_left_extra_pad_size, _left_pads[posn] - _left_halos[posn], false, true, false, false) - GET_GRID_API(get_right_extra_pad_size, (_allocs[posn] - _left_pads[posn] - _domains[posn]) - - _right_halos[posn], false, true, false, false) - GET_GRID_API(get_extra_pad_size, _left_pads[posn] - _left_halos[posn], false, true, false, false) - GET_GRID_API(get_alloc_size, _allocs[posn], true, true, true, false) - GET_GRID_API(get_first_rank_domain_index, _offsets[posn] - _local_offsets[posn], false, true, false, true) - GET_GRID_API(get_last_rank_domain_index, _offsets[posn] - _local_offsets[posn] + _domains[posn] - 1; - assert(!_is_scratch), false, true, false, true) - GET_GRID_API(get_first_rank_halo_index, _offsets[posn] - _left_halos[posn], false, false, true, true) - GET_GRID_API(get_last_rank_halo_index, _offsets[posn] + _domains[posn] + _right_halos[posn] - 1, false, false, true, true) - GET_GRID_API(get_first_rank_alloc_index, _offsets[posn] - _left_pads[posn], false, true, false, true) - GET_GRID_API(get_last_rank_alloc_index, _offsets[posn] - _left_pads[posn] + _allocs[posn] - 1, false, true, false, true) - GET_GRID_API(_get_left_wf_ext, _left_wf_exts[posn], true, true, true, false) - GET_GRID_API(_get_right_wf_ext, _right_wf_exts[posn], true, true, true, false) - GET_GRID_API(_get_offset, _offsets[posn], true, true, true, true) - 
GET_GRID_API(_get_local_offset, _local_offsets[posn], true, true, true, false) - GET_GRID_API(_get_first_alloc_index, _offsets[posn] - _left_pads[posn], true, true, true, true) - GET_GRID_API(_get_last_alloc_index, _offsets[posn] - _left_pads[posn] + _allocs[posn] - 1, true, true, true, true) -#undef GET_GRID_API - - // APIs to set vars. -#define COMMA , -#define SET_GRID_API(api_name, expr, step_ok, domain_ok, misc_ok) \ - void YkGridBase::api_name(const string& dim, idx_t n) { \ - TRACE_MSG0(get_ostr(), "grid '" << get_name() << "'." \ - #api_name "('" << dim << "', " << n << ")"); \ - checkDimType(dim, #api_name, step_ok, domain_ok, misc_ok); \ - int posn = get_dim_posn(dim, true, #api_name); \ - expr; \ - } \ - void YkGridBase::api_name(int posn, idx_t n) { \ - int dim = posn; \ - expr; \ - } - SET_GRID_API(_set_offset, _offsets[posn] = n, true, true, true) - SET_GRID_API(_set_local_offset, _local_offsets[posn] = n; - _vec_local_offsets[posn] = n / _vec_lens[posn], true, true, true) - SET_GRID_API(_set_domain_size, _domains[posn] = n; resize(), true, true, true) - SET_GRID_API(_set_left_pad_size, _left_pads[posn] = n; resize(), true, true, true) - SET_GRID_API(_set_right_pad_size, _right_pads[posn] = n; resize(), true, true, true) - SET_GRID_API(_set_left_wf_ext, _left_wf_exts[posn] = n; resize(), true, true, true) - SET_GRID_API(_set_right_wf_ext, _right_wf_exts[posn] = n; resize(), true, true, true) - SET_GRID_API(set_left_halo_size, _left_halos[posn] = n; resize(), false, true, false) - SET_GRID_API(set_right_halo_size, _right_halos[posn] = n; resize(), false, true, false) - SET_GRID_API(set_halo_size, _left_halos[posn] = _right_halos[posn] = n; resize(), false, true, false) - - SET_GRID_API(set_alloc_size, _set_domain_size(posn, n), true, false, true) - SET_GRID_API(set_left_min_pad_size, - if (!get_raw_storage_buffer() && n > _left_pads[posn]) - _set_left_pad_size(posn, n), - false, true, false) - SET_GRID_API(set_right_min_pad_size, - if 
(!get_raw_storage_buffer() && n > _right_pads[posn]) - _set_right_pad_size(posn, n), - false, true, false) - SET_GRID_API(set_min_pad_size, - if (!get_raw_storage_buffer() && n > _left_pads[posn]) - _set_left_pad_size(posn, n); - if (!get_raw_storage_buffer() && n > _right_pads[posn]) - _set_right_pad_size(posn, n), - false, true, false) - SET_GRID_API(set_left_extra_pad_size, - set_left_min_pad_size(posn, _left_halos[posn] + _left_wf_exts[posn] + n), false, true, false) - SET_GRID_API(set_right_extra_pad_size, - set_right_min_pad_size(posn, _right_halos[posn] + _right_wf_exts[posn] + n), false, true, false) - SET_GRID_API(set_extra_pad_size, set_left_extra_pad_size(posn, n); - set_right_extra_pad_size(posn, n), false, true, false) - SET_GRID_API(set_first_misc_index, _offsets[posn] = n, false, false, true) -#undef COMMA -#undef SET_GRID_API - // Ctor. YkGridBase::YkGridBase(GenericGridBase* ggb, size_t ndims, @@ -150,7 +57,6 @@ namespace yask { _vec_local_offsets.setFromConst(0, n); } - // Convenience function to format indices like // "x=5, y=3". std::string YkGridBase::makeIndexString(const Indices& idxs, @@ -304,127 +210,6 @@ namespace yask { _dims->checkDimType(dim, fn_name, step_ok, domain_ok, misc_ok); } - bool YkGridBase::is_storage_layout_identical(const yk_grid_ptr other) const { - auto op = dynamic_pointer_cast(other); - assert(op); - - // Same size? - if (get_num_storage_bytes() != op->get_num_storage_bytes()) - return false; - - // Same dims? - if (get_num_dims() != op->get_num_dims()) - return false; - for (int i = 0; i < get_num_dims(); i++) { - auto dname = get_dim_name(i); - - // Same dims? - if (dname != op->get_dim_name(i)) - return false; - - // Same sizes? - // NB: not checking right pads because actual values - // are determined as function of other 3. 
- if (_allocs[i] != op->_allocs[i]) - return false; - if (_domains[i] != op->_domains[i]) - return false; - if (_left_pads[i] != op->_left_pads[i]) - return false; - } - return true; - } - - void YkGridBase::share_storage(yk_grid_ptr source) { - auto sp = dynamic_pointer_cast(source); - assert(sp); - - if (!sp->get_raw_storage_buffer()) { - THROW_YASK_EXCEPTION("Error: share_storage() called without source storage allocated"); - } - - // Determine required padding from halos. - Indices left_pads2 = getReqdPad(_left_halos, _left_wf_exts); - Indices right_pads2 = getReqdPad(_right_halos, _left_wf_exts); - - // NB: requirements to successful share_storage() is not as strict as - // is_storage_layout_identical(). See note on pad & halo below and API docs. - for (int i = 0; i < get_num_dims(); i++) { - auto dname = get_dim_name(i); - - // Same dims? - if (sp->get_num_dims() != get_num_dims() || - sp->get_dim_name(i) != dname) - THROW_YASK_EXCEPTION("Error: share_storage() called with incompatible grids: " << - make_info_string() << " and " << sp->make_info_string()); - - - // Check folding. - if (_vec_lens[i] != sp->_vec_lens[i]) { - THROW_YASK_EXCEPTION("Error: attempt to share storage from grid '" << sp->get_name() << - "' of fold-length " << sp->_vec_lens[i] << " with grid '" << get_name() << - "' of fold-length " << _vec_lens[i] << " in '" << dname << "' dim"); - } - - // Not a domain dim? - bool is_domain = _dims->_domain_dims.lookup(dname) != 0; - if (!is_domain) { - auto tas = get_alloc_size(dname); - auto sas = sp->get_alloc_size(dname); - if (tas != sas) { - THROW_YASK_EXCEPTION("Error: attempt to share storage from grid '" << sp->get_name() << - "' of alloc-size " << sas << " with grid '" << get_name() << - "' of alloc-size " << tas << " in '" << dname << "' dim"); - } - } - - // Domain dim. 
- else { - auto tdom = get_rank_domain_size(i); - auto sdom = sp->get_rank_domain_size(i); - if (tdom != sdom) { - THROW_YASK_EXCEPTION("Error: attempt to share storage from grid '" << sp->get_name() << - "' of domain-size " << sdom << " with grid '" << get_name() << - "' of domain-size " << tdom << " in '" << dname << "' dim"); - } - - // Halo and pad sizes don't have to be the same. - // Requirement is that halo (reqd pad) of target fits inside of pad of source. - auto spad = sp->get_left_pad_size(i); - if (left_pads2[i] > spad) { - THROW_YASK_EXCEPTION("Error: attempt to share storage from grid '" << sp->get_name() << - "' of left padding-size " << spad << - ", which is insufficient for grid '" << get_name() << - "' requiring " << left_pads2[i] << " in '" << dname << "' dim"); - } - spad = sp->get_right_pad_size(i); - if (right_pads2[i] > spad) { - THROW_YASK_EXCEPTION("Error: attempt to share storage from grid '" << sp->get_name() << - "' of right padding-size " << spad << - ", which is insufficient for grid '" << get_name() << - "' requiring " << right_pads2[i] << " in '" << dname << "' dim"); - } - } - } - - // Copy pad sizes. - for (int i = 0; i < get_num_dims(); i++) { - auto dname = get_dim_name(i); - bool is_domain = _dims->_domain_dims.lookup(dname) != 0; - if (is_domain) { - _left_pads[i] = sp->_left_pads[i]; - _right_pads[i] = sp->_right_pads[i]; - } - } - - // Copy data. - release_storage(); - resize(); - if (!share_data(sp.get(), true)) { - THROW_YASK_EXCEPTION("Error: unexpected failure in data sharing"); - } - } - // Check for equality. // Return number of mismatches greater than epsilon. idx_t YkGridBase::compare(const YkGridBase* ref, @@ -587,145 +372,6 @@ namespace yask { return numElemsTuple; } - // API get, set, setc. 
- bool YkGridBase::is_element_allocated(const Indices& indices) const { - if (!is_storage_allocated()) - return false; - return checkIndices(indices, "is_element_allocated", false, false); - } - double YkGridBase::get_element(const Indices& indices) const { - if (!is_storage_allocated()) { - THROW_YASK_EXCEPTION("Error: call to 'get_element' with no data allocated for grid '" << - get_name() << "'"); - } - checkIndices(indices, "get_element", true, false); - idx_t asi = get_alloc_step_index(indices); - real_t val = readElem(indices, asi, __LINE__); - return double(val); - } - idx_t YkGridBase::set_element(double val, - const Indices& indices, - bool strict_indices) { - idx_t nup = 0; - if (get_raw_storage_buffer() && - checkIndices(indices, "set_element", strict_indices, false)) { - idx_t asi = get_alloc_step_index(indices); - writeElem(real_t(val), indices, asi, __LINE__); - nup++; - - // Set appropriate dirty flag. - set_dirty_using_alloc_index(true, asi); - } - return nup; - } - idx_t YkGridBase::add_to_element(double val, - const Indices& indices, - bool strict_indices) { - idx_t nup = 0; - if (get_raw_storage_buffer() && - checkIndices(indices, "add_to_element", strict_indices, false)) { - idx_t asi = get_alloc_step_index(indices); - addToElem(real_t(val), indices, asi, __LINE__); - nup++; - - // Set appropriate dirty flag. - set_dirty_using_alloc_index(true, asi); - } - return nup; - } - - idx_t YkGridBase::get_elements_in_slice(void* buffer_ptr, - const Indices& first_indices, - const Indices& last_indices) const { - if (!is_storage_allocated()) { - THROW_YASK_EXCEPTION("Error: call to 'get_elements_in_slice' with no data allocated for grid '" << - get_name() << "'"); - } - checkIndices(first_indices, "get_elements_in_slice", true, false); - checkIndices(last_indices, "get_elements_in_slice", true, false); - - // Find range. - IdxTuple numElemsTuple = get_slice_range(first_indices, last_indices); - - // Visit points in slice. 
- numElemsTuple.visitAllPointsInParallel - ([&](const IdxTuple& ofs, size_t idx) { - Indices pt = first_indices.addElements(ofs); - - // TODO: move this outside of loop for const step index. - idx_t asi = get_alloc_step_index(pt); - - real_t val = readElem(pt, asi, __LINE__); - ((real_t*)buffer_ptr)[idx] = val; - return true; // keep going. - }); - return numElemsTuple.product(); - } - idx_t YkGridBase::set_elements_in_slice_same(double val, - const Indices& first_indices, - const Indices& last_indices, - bool strict_indices) { - if (!is_storage_allocated()) - return 0; - - // 'Fixed' copy of indices. - Indices first, last; - checkIndices(first_indices, "set_elements_in_slice_same", - strict_indices, false, &first); - checkIndices(last_indices, "set_elements_in_slice_same", - strict_indices, false, &last); - - // Find range. - IdxTuple numElemsTuple = get_slice_range(first, last); - - // Visit points in slice. - numElemsTuple.visitAllPointsInParallel([&](const IdxTuple& ofs, - size_t idx) { - Indices pt = first.addElements(ofs); - - // TODO: move this outside of loop for const step index. - idx_t asi = get_alloc_step_index(pt); - - writeElem(real_t(val), pt, asi, __LINE__); - return true; // keep going. - }); - - // Set appropriate dirty flag(s). - set_dirty_in_slice(first, last); - - return numElemsTuple.product(); - } - idx_t YkGridBase::set_elements_in_slice(const void* buffer_ptr, - const Indices& first_indices, - const Indices& last_indices) { - if (!is_storage_allocated()) - return 0; - checkIndices(first_indices, "set_elements_in_slice", true, false); - checkIndices(last_indices, "set_elements_in_slice", true, false); - - // Find range. - IdxTuple numElemsTuple = get_slice_range(first_indices, last_indices); - - // Visit points in slice. - numElemsTuple.visitAllPointsInParallel - ([&](const IdxTuple& ofs, - size_t idx) { - Indices pt = first_indices.addElements(ofs); - - // TODO: move this outside of loop for const step index. 
- idx_t asi = get_alloc_step_index(pt); - - real_t val = ((real_t*)buffer_ptr)[idx]; - writeElem(val, pt, asi, __LINE__); - return true; // keep going. - }); - - // Set appropriate dirty flag(s). - set_dirty_in_slice(first_indices, last_indices); - - return numElemsTuple.product(); - } - // Print one element like // "message: mygrid[x=4, y=7] = 3.14 at line 35". void YkGridBase::printElem(const std::string& msg, diff --git a/src/kernel/lib/setup.cpp b/src/kernel/lib/setup.cpp new file mode 100644 index 00000000..0cc0b503 --- /dev/null +++ b/src/kernel/lib/setup.cpp @@ -0,0 +1,1260 @@ +/***************************************************************************** + +YASK: Yet Another Stencil Kernel +Copyright (c) 2014-2018, Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to +deal in the Software without restriction, including without limitation the +rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +sell copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +* The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +IN THE SOFTWARE. + +*****************************************************************************/ + +// This file contains implementations of StencilContext methods +// specific to the preparation steps. 
+ +#include "yask.hpp" +using namespace std; + +namespace yask { + + // Init MPI-related vars and other vars related to my rank's place in + // the global problem: rank index, offset, etc. Need to call this even + // if not using MPI to properly init these vars. Called from + // prepare_solution(), so it doesn't normally need to be called from user code. + void StencilContext::setupRank() { + ostream& os = get_ostr(); + auto& step_dim = _dims->_step_dim; + auto me = _env->my_rank; + + // Check ranks. + idx_t req_ranks = _opts->_num_ranks.product(); + if (req_ranks != _env->num_ranks) { + THROW_YASK_EXCEPTION("error: " << req_ranks << " rank(s) requested (" << + _opts->_num_ranks.makeDimValStr(" * ") << "), but " << + _env->num_ranks << " rank(s) are active"); + } + assertEqualityOverRanks(_opts->_rank_sizes[step_dim], _env->comm, "num steps"); + + // Determine my coordinates if not provided already. + // TODO: do this more intelligently based on proximity. + if (_opts->find_loc) + _opts->_rank_indices = _opts->_num_ranks.unlayout(me); + + // A table of rank-coordinates for everyone. + auto num_ddims = _opts->_rank_indices.size(); // domain-dims only! + idx_t coords[_env->num_ranks][num_ddims]; + + // Init coords for this rank. + for (int i = 0; i < num_ddims; i++) + coords[me][i] = _opts->_rank_indices[i]; + + // A table of rank-domain sizes for everyone. + idx_t rsizes[_env->num_ranks][num_ddims]; + + // Init sizes for this rank. + for (int di = 0; di < num_ddims; di++) { + auto& dname = _opts->_rank_indices.getDimName(di); + rsizes[me][di] = _opts->_rank_sizes[dname]; + } + +#ifdef USE_MPI + // Exchange coord and size info between all ranks. + for (int rn = 0; rn < _env->num_ranks; rn++) { + MPI_Bcast(&coords[rn][0], num_ddims, MPI_INTEGER8, + rn, _env->comm); + MPI_Bcast(&rsizes[rn][0], num_ddims, MPI_INTEGER8, + rn, _env->comm); + } + // Now, the tables are filled in for all ranks. +#endif + + // Init offsets and total sizes. 
+ rank_domain_offsets.setValsSame(0); + overall_domain_sizes.setValsSame(0); + + // Loop over all ranks, including myself. + int num_neighbors = 0; + for (int rn = 0; rn < _env->num_ranks; rn++) { + + // Coord offset of rn from me: prev => negative, self => 0, next => positive. + IdxTuple rcoords(_dims->_domain_dims); + IdxTuple rdeltas(_dims->_domain_dims); + for (int di = 0; di < num_ddims; di++) { + rcoords[di] = coords[rn][di]; + rdeltas[di] = coords[rn][di] - _opts->_rank_indices[di]; + } + + // Manhattan distance from rn (sum of abs deltas in all dims). + // Max distance in any dim. + int mandist = 0; + int maxdist = 0; + for (int di = 0; di < num_ddims; di++) { + mandist += abs(rdeltas[di]); + maxdist = max(maxdist, abs(int(rdeltas[di]))); + } + + // Myself. + if (rn == me) { + if (mandist != 0) + THROW_YASK_EXCEPTION("Internal error: distance to own rank == " << mandist); + } + + // Someone else. + else { + if (mandist == 0) + THROW_YASK_EXCEPTION("Error: ranks " << me << + " and " << rn << " at same coordinates"); + } + + // Loop through domain dims. + for (int di = 0; di < num_ddims; di++) { + auto& dname = _opts->_rank_indices.getDimName(di); + + // Is rank 'rn' in-line with my rank in 'dname' dim? + // True when deltas in other dims are zero. + bool is_inline = true; + for (int dj = 0; dj < num_ddims; dj++) { + if (di != dj && rdeltas[dj] != 0) { + is_inline = false; + break; + } + } + + // Process ranks that are in-line in 'dname', including self. + if (is_inline) { + + // Accumulate total problem size in each dim for ranks that + // intersect with this rank, including myself. + overall_domain_sizes[dname] += rsizes[rn][di]; + + // Adjust my offset in the global problem by adding all domain + // sizes from prev ranks only. + if (rdeltas[di] < 0) + rank_domain_offsets[dname] += rsizes[rn][di]; + + // Make sure all the other dims are the same size. + // This ensures that all the ranks' domains line up + // properly along their edges and at their corners. 
+                        for (int dj = 0; dj < num_ddims; dj++) {
+                            if (di != dj) {
+                                auto mysz = rsizes[me][dj];
+                                auto rnsz = rsizes[rn][dj];
+                                if (mysz != rnsz) {
+                                    auto& dnamej = _opts->_rank_indices.getDimName(dj);
+                                    THROW_YASK_EXCEPTION("Error: rank " << rn << " and " << me <<
+                                                         " are both at rank-index " << coords[me][di] <<
+                                                         " in the '" << dname <<
+                                                         "' dimension, but their rank-domain sizes are " <<
+                                                         rnsz << " and " << mysz <<
+                                                         " (resp.) in the '" << dnamej <<
+                                                         "' dimension, making them unaligned");
+                                }
+                            }
+                        }
+                    }
+                }
+
+                // Rank rn is myself or my immediate neighbor if its distance <= 1 in
+                // every dim. Assume we do not need to exchange halos except
+                // with immediate neighbor. We validate this assumption below by
+                // making sure that the rank domain size is at least as big as the
+                // largest halo.
+                if (maxdist <= 1) {
+
+                    // At this point, rdeltas contains only -1..+1 for each domain dim.
+                    // Add one to -1..+1 to get 0..2 range for my_neighbors offsets.
+                    IdxTuple roffsets = rdeltas.addElements(1);
+                    assert(rdeltas.min() >= -1);
+                    assert(rdeltas.max() <= 1);
+                    assert(roffsets.min() >= 0);
+                    assert(roffsets.max() <= 2);
+
+                    // Convert the offsets into a 1D index.
+                    auto rn_ofs = _mpiInfo->getNeighborIndex(roffsets);
+                    TRACE_MSG("neighborhood size = " << _mpiInfo->neighborhood_sizes.makeDimValStr() <<
+                              " & roffsets of rank " << rn << " = " << roffsets.makeDimValStr() <<
+                              " => " << rn_ofs);
+                    assert(idx_t(rn_ofs) < _mpiInfo->neighborhood_size);
+
+                    // Save rank of this neighbor into the MPI info object.
+                    _mpiInfo->my_neighbors.at(rn_ofs) = rn;
+                    if (rn != me) {
+                        num_neighbors++;
+                        os << "Neighbor #" << num_neighbors << " is rank " << rn <<
+                            " at absolute rank indices " << rcoords.makeDimValStr() <<
+                            " (" << rdeltas.makeDimValOffsetStr() << " relative to rank " <<
+                            me << ")\n";
+                    }
+
+                    // Save manhattan dist.
+                    _mpiInfo->man_dists.at(rn_ofs) = mandist;
+
+                    // Loop through domain dims.
+ bool vlen_mults = true; + for (int di = 0; di < num_ddims; di++) { + auto& dname = _opts->_rank_indices.getDimName(di); + + // Does rn have all VLEN-multiple sizes? + auto rnsz = rsizes[rn][di]; + auto vlen = _dims->_fold_pts[di]; + if (rnsz % vlen != 0) { + TRACE_MSG("cannot use vector halo exchange with rank " << rn << + " because its size in '" << dname << "' is " << rnsz); + vlen_mults = false; + } + } + + // Save vec-mult flag. + _mpiInfo->has_all_vlen_mults.at(rn_ofs) = vlen_mults; + + } // self or immediate neighbor in any direction. + + } // ranks. + + // Set offsets in grids and find WF extensions + // based on the grids' halos. + update_grids(); + + // Determine bounding-boxes for all bundles. + // This must be done after finding WF extensions. + find_bounding_boxes(); + + } // setupRank. + + // Alloc 'nbytes' on each requested NUMA node. + // Map keys are preferred NUMA nodes or -1 for local. + // Pointers are returned in '_data_buf'. + // 'ngrids' and 'type' are only used for debug msg. + void StencilContext::_alloc_data(const map & nbytes, + const map & ngrids, + map >& data_buf, + const std::string& type) { + ostream& os = get_ostr(); + + for (const auto& i : nbytes) { + int numa_pref = i.first; + size_t nb = i.second; + size_t ng = ngrids.at(numa_pref); + + // Don't need pad after last one. + if (nb >= _data_buf_pad) + nb -= _data_buf_pad; + + // Allocate data. + os << "Allocating " << makeByteStr(nb) << + " for " << ng << " " << type << "(s)"; +#ifdef USE_NUMA + if (numa_pref >= 0) + os << " preferring NUMA node " << numa_pref; + else + os << " using NUMA policy " << numa_pref; +#endif + os << "...\n" << flush; + auto p = shared_numa_alloc(nb, numa_pref); + TRACE_MSG("Got memory at " << static_cast(p.get())); + + // Save using original key. + data_buf[numa_pref] = p; + } + } + + // Allocate memory for grids that do not already have storage. + void StencilContext::allocGridData(ostream& os) { + + // Base ptrs for all default-alloc'd data. 
+ // These pointers will be shared by the ones in the grid + // objects, which will take over ownership when these go + // out of scope. + // Key is preferred numa node or -1 for local. + map > _grid_data_buf; + + // Pass 0: count required size for each NUMA node, allocate chunk of memory at end. + // Pass 1: distribute parts of already-allocated memory chunk. + for (int pass = 0; pass < 2; pass++) { + TRACE_MSG("allocGridData pass " << pass << " for " << + gridPtrs.size() << " grid(s)"); + + // Count bytes needed and number of grids for each NUMA node. + map npbytes, ngrids; + + // Grids. + for (auto gp : gridPtrs) { + if (!gp) + continue; + auto& gname = gp->get_name(); + + // Grid data. + // Don't alloc if already done. + if (!gp->is_storage_allocated()) { + int numa_pref = gp->get_numa_preferred(); + + // Set storage if buffer has been allocated in pass 0. + if (pass == 1) { + auto p = _grid_data_buf[numa_pref]; + assert(p); + gp->set_storage(p, npbytes[numa_pref]); + os << gp->make_info_string() << endl; + } + + // Determine padded size (also offset to next location). + size_t nbytes = gp->get_num_storage_bytes(); + npbytes[numa_pref] += ROUND_UP(nbytes + _data_buf_pad, + CACHELINE_BYTES); + ngrids[numa_pref]++; + if (pass == 0) + TRACE_MSG(" grid '" << gname << "' needs " << makeByteStr(nbytes) << + " on NUMA node " << numa_pref); + } + } + + // Alloc for each node. + if (pass == 0) + _alloc_data(npbytes, ngrids, _grid_data_buf, "grid"); + + } // grid passes. + }; + + // Create MPI and allocate buffers. + void StencilContext::allocMpiData(ostream& os) { + + // Remove any old MPI data. + freeMpiData(os); + +#ifdef USE_MPI + + int num_exchanges = 0; + auto me = _env->my_rank; + + // Need to determine the size and shape of all MPI buffers. + // Visit all neighbors of this rank. + _mpiInfo->visitNeighbors + ([&](const IdxTuple& neigh_offsets, int neigh_rank, int neigh_idx) { + if (neigh_rank == MPI_PROC_NULL) + return; // from lambda fn. 
+ + // Determine max dist needed. TODO: determine max dist + // automatically from stencils; may not be same for all + // grids. +#ifndef MAX_EXCH_DIST +#define MAX_EXCH_DIST (NUM_STENCIL_DIMS - 1) +#endif + // Always use max dist with WF. + // TODO: determine if this is overkill. + int maxdist = MAX_EXCH_DIST; + if (num_wf_shifts > 0) + maxdist = NUM_STENCIL_DIMS - 1; + + // Manhattan dist. + int mandist = _mpiInfo->man_dists.at(neigh_idx); + + // Check distance. + // TODO: calculate and use exch dist for each grid. + if (mandist > maxdist) { + TRACE_MSG("no halo exchange needed with rank " << neigh_rank << + " because L1-norm = " << mandist); + return; // from lambda fn. + } + + // Determine size of MPI buffers between neigh_rank and my rank + // for each grid and create those that are needed. + for (auto gp : gridPtrs) { + if (!gp) + continue; + auto& gname = gp->get_name(); + + // Lookup first & last domain indices and calc exchange sizes + // for this grid. + bool found_delta = false; + IdxTuple my_halo_sizes, neigh_halo_sizes; + IdxTuple first_inner_idx, last_inner_idx; + IdxTuple first_outer_idx, last_outer_idx; + for (auto& dim : _dims->_domain_dims.getDims()) { + auto& dname = dim.getName(); + if (gp->is_dim_used(dname)) { + + // Get domain indices for this grid. + // If there are no more ranks in the given direction, extend + // the index into the outer halo to make sure all data are sync'd. + // This is critical for WFs. + idx_t fidx = gp->get_first_rank_domain_index(dname); + idx_t lidx = gp->get_last_rank_domain_index(dname); + first_inner_idx.addDimBack(dname, fidx); + last_inner_idx.addDimBack(dname, lidx); + if (_opts->is_first_rank(dname)) + fidx -= gp->get_left_halo_size(dname); + if (_opts->is_last_rank(dname)) + lidx += gp->get_right_halo_size(dname); + first_outer_idx.addDimBack(dname, fidx); + last_outer_idx.addDimBack(dname, lidx); + + // Determine size of exchange. This will be the actual halo size + // plus any wave-front extensions. 
In the current implementation, + // we need the wave-front extensions regardless of whether there + // is a halo on a given grid. This is because each stencil-bundle + // gets shifted by the WF angles at each step in the WF. + + // Neighbor is to the left. + if (neigh_offsets[dname] == MPIInfo::rank_prev) { + auto ext = left_wf_exts[dname]; + + // my halo. + auto halo_size = gp->get_left_halo_size(dname); + halo_size += ext; + my_halo_sizes.addDimBack(dname, halo_size); + + // neighbor halo. + halo_size = gp->get_right_halo_size(dname); // their right is on my left. + halo_size += ext; + neigh_halo_sizes.addDimBack(dname, halo_size); + } + + // Neighbor is to the right. + else if (neigh_offsets[dname] == MPIInfo::rank_next) { + auto ext = right_wf_exts[dname]; + + // my halo. + auto halo_size = gp->get_right_halo_size(dname); + halo_size += ext; + my_halo_sizes.addDimBack(dname, halo_size); + + // neighbor halo. + halo_size = gp->get_left_halo_size(dname); // their left is on my right. + halo_size += ext; + neigh_halo_sizes.addDimBack(dname, halo_size); + } + + // Neighbor in-line. + else { + my_halo_sizes.addDimBack(dname, 0); + neigh_halo_sizes.addDimBack(dname, 0); + } + + // Vectorized exchange allowed based on domain sizes? + // Both my rank and neighbor rank must have all domain sizes + // of vector multiples. + bool vec_ok = allow_vec_exchange && + _mpiInfo->has_all_vlen_mults[_mpiInfo->my_neighbor_index] && + _mpiInfo->has_all_vlen_mults[neigh_idx]; + + // Round up halo sizes if vectorized exchanges allowed. + // TODO: add a heuristic to avoid increasing by a large factor. + if (vec_ok) { + auto vec_size = _dims->_fold_pts[dname]; + my_halo_sizes.setVal(dname, ROUND_UP(my_halo_sizes[dname], vec_size)); + neigh_halo_sizes.setVal(dname, ROUND_UP(neigh_halo_sizes[dname], vec_size)); + } + + // Is this neighbor before or after me in this domain direction? + if (neigh_offsets[dname] != MPIInfo::rank_self) + found_delta = true; + } + } + + // Is buffer needed? 
+ // Example: if this grid is 2D in y-z, but only neighbors are in + // x-dim, we don't need any exchange. + if (!found_delta) { + TRACE_MSG("no halo exchange needed for grid '" << gname << + "' with rank " << neigh_rank << + " because the neighbor is not in a direction" + " corresponding to a grid dim"); + continue; // to next grid. + } + + // Make a buffer in both directions (send & receive). + for (int bd = 0; bd < MPIBufs::nBufDirs; bd++) { + + // Begin/end vars to indicate what part + // of main grid to read from or write to based on + // the current neighbor being processed. + IdxTuple copy_begin = gp->get_allocs(); + IdxTuple copy_end = gp->get_allocs(); + + // Adjust along domain dims in this grid. + for (auto& dim : _dims->_domain_dims.getDims()) { + auto& dname = dim.getName(); + if (gp->is_dim_used(dname)) { + + // Init range to whole rank domain (including + // outer halos). These may be changed below + // depending on the neighbor's direction. + copy_begin[dname] = first_outer_idx[dname]; + copy_end[dname] = last_outer_idx[dname] + 1; // end = last + 1. + + // Neighbor direction in this dim. + auto neigh_ofs = neigh_offsets[dname]; + + // Region to read from, i.e., data from inside + // this rank's domain to be put into neighbor's + // halo. + if (bd == MPIBufs::bufSend) { + + // Neighbor is to the left. + if (neigh_ofs == idx_t(MPIInfo::rank_prev)) { + + // Only read slice as wide as halo from beginning. + copy_end[dname] = first_inner_idx[dname] + neigh_halo_sizes[dname]; + } + + // Neighbor is to the right. + else if (neigh_ofs == idx_t(MPIInfo::rank_next)) { + + // Only read slice as wide as halo before end. + copy_begin[dname] = last_inner_idx[dname] + 1 - neigh_halo_sizes[dname]; + } + + // Else, this neighbor is in same posn as I am in this dim, + // so we leave the default begin/end settings. + } + + // Region to write to, i.e., into this rank's halo. + else if (bd == MPIBufs::bufRecv) { + + // Neighbor is to the left. 
+                            if (neigh_ofs == idx_t(MPIInfo::rank_prev)) {
+
+                                // Only read slice as wide as halo before beginning.
+                                copy_begin[dname] = first_inner_idx[dname] - my_halo_sizes[dname];
+                                copy_end[dname] = first_inner_idx[dname];
+                            }
+
+                            // Neighbor is to the right.
+                            else if (neigh_ofs == idx_t(MPIInfo::rank_next)) {
+
+                                // Only read slice as wide as halo after end.
+                                copy_begin[dname] = last_inner_idx[dname] + 1;
+                                copy_end[dname] = last_inner_idx[dname] + 1 + my_halo_sizes[dname];
+                            }
+
+                            // Else, this neighbor is in same posn as I am in this dim,
+                            // so we leave the default begin/end settings.
+                        }
+                    } // domain dims in this grid.
+                } // domain dims.
+
+                // Sizes of buffer in all dims of this grid.
+                // Also, set begin/end value for non-domain dims.
+                IdxTuple buf_sizes = gp->get_allocs();
+                bool vlen_mults = true;
+                for (auto& dname : gp->get_dim_names()) {
+                    idx_t dsize = 1;
+
+                    // domain dim?
+                    if (_dims->_domain_dims.lookup(dname)) {
+                        dsize = copy_end[dname] - copy_begin[dname];
+
+                        // Check whether size is multiple of vlen.
+                        auto vlen = _dims->_fold_pts[dname];
+                        if (dsize % vlen != 0)
+                            vlen_mults = false;
+                    }
+
+                    // step dim?
+                    // Allowing only one step to be exchanged.
+                    // TODO: consider exchanging multiple steps at once for WFs.
+                    else if (dname == _dims->_step_dim) {
+
+                        // Use 0..1 as a place-holder range.
+                        // The actual values will be supplied during
+                        // halo exchange.
+                        copy_begin[dname] = 0;
+                        copy_end[dname] = 1;
+                    }
+
+                    // misc?
+                    // Copy over entire range.
+                    // TODO: make dirty flags for misc dims in grids.
+                    else {
+                        dsize = gp->get_alloc_size(dname);
+                        copy_begin[dname] = gp->get_first_misc_index(dname);
+                        copy_end[dname] = gp->get_last_misc_index(dname) + 1;
+                    }
+
+                    // Save computed size.
+                    buf_sizes[dname] = dsize;
+
+                } // all dims in this grid.
+
+                // Does buffer have non-zero size?
+ if (buf_sizes.size() == 0 || buf_sizes.product() == 0) { + TRACE_MSG("no halo exchange needed for grid '" << gname << + "' with rank " << neigh_rank << + " because there is no data to exchange"); + continue; + } + + // At this point, buf_sizes, copy_begin, and copy_end + // should be set for each dim in this grid. + // Convert end to last. + IdxTuple copy_last = copy_end.subElements(1); + + // Unique name for buffer based on grid name, direction, and ranks. + ostringstream oss; + oss << gname; + if (bd == MPIBufs::bufSend) + oss << "_send_halo_from_" << me << "_to_" << neigh_rank; + else if (bd == MPIBufs::bufRecv) + oss << "_recv_halo_from_" << neigh_rank << "_to_" << me; + string bufname = oss.str(); + + // Make MPI data entry for this grid. + auto gbp = mpiData.emplace(gname, _mpiInfo); + auto& gbi = gbp.first; // iterator from pair returned by emplace(). + auto& gbv = gbi->second; // value from iterator. + auto& buf = gbv.getBuf(MPIBufs::BufDir(bd), neigh_offsets); + + // Config buffer for this grid. + // (But don't allocate storage yet.) + buf.begin_pt = copy_begin; + buf.last_pt = copy_last; + buf.num_pts = buf_sizes; + buf.name = bufname; + buf.has_all_vlen_mults = vlen_mults; + + TRACE_MSG("configured MPI buffer object '" << buf.name << + "' for rank at relative offsets " << + neigh_offsets.subElements(1).makeDimValStr() << " with " << + buf.num_pts.makeDimValStr(" * ") << " = " << buf.get_size() << + " element(s) at " << buf.begin_pt.makeDimValStr() << + " ... " << buf.last_pt.makeDimValStr()); + num_exchanges++; + + } // send, recv. + } // grids. + }); // neighbors. + TRACE_MSG("number of halo-exchanges needed on this rank: " << num_exchanges); + + // Base ptrs for all alloc'd data. + // These pointers will be shared by the ones in the grid + // objects, which will take over ownership when these go + // out of scope. + map > _mpi_data_buf; + + // Allocate MPI buffers. + // Pass 0: count required size, allocate chunk of memory at end. 
+ // Pass 1: distribute parts of already-allocated memory chunk. + for (int pass = 0; pass < 2; pass++) { + TRACE_MSG("allocMpiData pass " << pass << " for " << + mpiData.size() << " MPI buffer set(s)"); + + // Count bytes needed and number of buffers for each NUMA node. + map npbytes, nbufs; + + // Grids. + for (auto gp : gridPtrs) { + if (!gp) + continue; + auto& gname = gp->get_name(); + int numa_pref = gp->get_numa_preferred(); + + // MPI bufs for this grid. + if (mpiData.count(gname)) { + auto& grid_mpi_data = mpiData.at(gname); + + // Visit buffers for each neighbor for this grid. + grid_mpi_data.visitNeighbors + ([&](const IdxTuple& roffsets, + int rank, + int idx, + MPIBufs& bufs) { + + // Send and recv. + for (int bd = 0; bd < MPIBufs::nBufDirs; bd++) { + auto& buf = grid_mpi_data.getBuf(MPIBufs::BufDir(bd), roffsets); + if (buf.get_size() == 0) + continue; + + // Set storage if buffer has been allocated in pass 0. + if (pass == 1) { + auto p = _mpi_data_buf[numa_pref]; + assert(p); + buf.set_storage(p, npbytes[numa_pref]); + } + + // Determine padded size (also offset to next location). + auto sbytes = buf.get_bytes(); + npbytes[numa_pref] += ROUND_UP(sbytes + _data_buf_pad, + CACHELINE_BYTES); + nbufs[numa_pref]++; + if (pass == 0) + TRACE_MSG(" MPI buf '" << buf.name << "' needs " << + makeByteStr(sbytes) << + " on NUMA node " << numa_pref); + } + } ); + } + } + + // Alloc for each node. + if (pass == 0) + _alloc_data(npbytes, nbufs, _mpi_data_buf, "MPI buffer"); + + } // MPI passes. +#endif + } + + // Allocate memory for scratch grids based on number of threads and + // block sizes. + void StencilContext::allocScratchData(ostream& os) { + + // Remove any old scratch data. + freeScratchData(os); + + // Base ptrs for all alloc'd data. + // This pointer will be shared by the ones in the grid + // objects, which will take over ownership when it goes + // out of scope. 
+ map > _scratch_data_buf; + + // Make sure the right number of threads are set so we + // have the right number of scratch grids. + int rthreads = set_region_threads(); + + // Delete any existing scratch grids. + // Create new scratch grids. + makeScratchGrids(rthreads); + + // Pass 0: count required size, allocate chunk of memory at end. + // Pass 1: distribute parts of already-allocated memory chunk. + for (int pass = 0; pass < 2; pass++) { + TRACE_MSG("allocScratchData pass " << pass << " for " << + scratchVecs.size() << " set(s) of scratch grids"); + + // Count bytes needed and number of grids for each NUMA node. + map npbytes, ngrids; + + // Loop through each scratch grid vector. + for (auto* sgv : scratchVecs) { + assert(sgv); + + // Loop through each scratch grid in this vector. + // There will be one for each region thread. + assert(int(sgv->size()) == rthreads); + int thr_num = 0; + for (auto gp : *sgv) { + assert(gp); + auto& gname = gp->get_name(); + int numa_pref = gp->get_numa_preferred(); + + // Loop through each domain dim. + for (auto& dim : _dims->_domain_dims.getDims()) { + auto& dname = dim.getName(); + + if (gp->is_dim_used(dname)) { + + // Set domain size of grid to block size. + gp->_set_domain_size(dname, _opts->_block_sizes[dname]); + + // Pads. + // Set via both 'extra' and 'min'; larger result will be used. + gp->set_extra_pad_size(dname, _opts->_extra_pad_sizes[dname]); + gp->set_min_pad_size(dname, _opts->_min_pad_sizes[dname]); + } + } // dims. + + // Set storage if buffer has been allocated. + if (pass == 1) { + auto p = _scratch_data_buf[numa_pref]; + assert(p); + gp->set_storage(p, npbytes[numa_pref]); + TRACE_MSG(gp->make_info_string()); + } + + // Determine size used (also offset to next location). 
+ size_t nbytes = gp->get_num_storage_bytes(); + npbytes[numa_pref] += ROUND_UP(nbytes + _data_buf_pad, + CACHELINE_BYTES); + ngrids[numa_pref]++; + if (pass == 0) + TRACE_MSG(" scratch grid '" << gname << "' for thread " << + thr_num << " needs " << makeByteStr(nbytes) << + " on NUMA node " << numa_pref); + thr_num++; + } // scratch grids. + } // scratch-grid vecs. + + // Alloc for each node. + if (pass == 0) + _alloc_data(npbytes, ngrids, _scratch_data_buf, "scratch grid"); + + } // scratch-grid passes. + } + + // Set non-scratch grid sizes and offsets based on settings. + // Set wave-front settings. + // This should be called anytime a setting or rank offset is changed. + void StencilContext::update_grids() + { + assert(_opts); + + // Reset halos to zero. + max_halos = _dims->_domain_dims; + + // Loop through each non-scratch grid. + for (auto gp : gridPtrs) { + assert(gp); + + // Ignore manually-sized grid. + if (gp->is_fixed_size()) + continue; + + // Loop through each domain dim. + for (auto& dim : _dims->_domain_dims.getDims()) { + auto& dname = dim.getName(); + + if (gp->is_dim_used(dname)) { + + // Rank domains. + gp->_set_domain_size(dname, _opts->_rank_sizes[dname]); + + // Pads. + // Set via both 'extra' and 'min'; larger result will be used. + gp->set_extra_pad_size(dname, _opts->_extra_pad_sizes[dname]); + gp->set_min_pad_size(dname, _opts->_min_pad_sizes[dname]); + + // Offsets. + gp->_set_offset(dname, rank_domain_offsets[dname]); + + // Update max halo across grids, used for wavefront angles. + max_halos[dname] = max(max_halos[dname], gp->get_left_halo_size(dname)); + max_halos[dname] = max(max_halos[dname], gp->get_right_halo_size(dname)); + } + } + } // grids. + + // Calculate wave-front settings based on max halos. + // See the wavefront diagram in run_solution() for description + // of angles and extensions. 
+ auto& step_dim = _dims->_step_dim; + auto wf_steps = _opts->_region_sizes[step_dim]; + num_wf_shifts = 0; + if (wf_steps > 1) + + // TODO: don't shift for scratch grids. + num_wf_shifts = max((idx_t(stBundles.size()) * wf_steps) - 1, idx_t(0)); + for (auto& dim : _dims->_domain_dims.getDims()) { + auto& dname = dim.getName(); + auto rksize = _opts->_rank_sizes[dname]; + auto nranks = _opts->_num_ranks[dname]; + + // Determine the max spatial skewing angles for temporal + // wave-fronts based on the max halos. We only need non-zero + // angles if the region size is less than the rank size and + // there are no other ranks in this dim, i.e., if the region + // covers the global domain in a given dim, no wave-front is + // needed in that dim. TODO: make rounding-up an option. + idx_t angle = 0; + if (_opts->_region_sizes[dname] < rksize || nranks > 0) + angle = ROUND_UP(max_halos[dname], _dims->_cluster_pts[dname]); + wf_angles[dname] = angle; + + // Determine the total WF shift to be added in each dim. + idx_t shifts = angle * num_wf_shifts; + wf_shifts[dname] = shifts; + + // Is domain size at least as large as halo + wf_ext in direction + // when there are multiple ranks? + auto min_size = max_halos[dname] + shifts; + if (_opts->_num_ranks[dname] > 1 && rksize < min_size) { + THROW_YASK_EXCEPTION("Error: rank-domain size of " << rksize << " in '" << + dname << "' dim is less than minimum size of " << min_size << + ", which is based on stencil halos and temporal wave-front sizes"); + } + + // If there is another rank to the left, set wave-front + // extension on the left. + left_wf_exts[dname] = _opts->is_first_rank(dname) ? 0 : shifts; + + // If there is another rank to the right, set wave-front + // extension on the right. + right_wf_exts[dname] = _opts->is_last_rank(dname) ? 0 : shifts; + } + + // Now that wave-front settings are known, we can push this info + // back to the grids. 
It's useful to store this redundant info + // in the grids, because there it's indexed by grid dims instead + // of domain dims. This makes it faster to do grid indexing. + for (auto gp : gridPtrs) { + assert(gp); + + // Ignore manually-sized grid. + if (gp->is_fixed_size()) + continue; + + // Loop through each domain dim. + for (auto& dim : _dims->_domain_dims.getDims()) { + auto& dname = dim.getName(); + if (gp->is_dim_used(dname)) { + + // Set extensions to be the same as the global ones. + gp->_set_left_wf_ext(dname, left_wf_exts[dname]); + gp->_set_right_wf_ext(dname, right_wf_exts[dname]); + } + } + } + } + + // Allocate grids and MPI bufs. + // Initialize some data structures. + void StencilContext::prepare_solution() { + auto& step_dim = _dims->_step_dim; + + // Don't continue until all ranks are this far. + _env->global_barrier(); + + ostream& os = get_ostr(); +#ifdef DEBUG + os << "*** WARNING: YASK compiled with DEBUG; ignore performance results.\n"; +#endif +#if defined(NO_INTRINSICS) && (VLEN > 1) + os << "*** WARNING: YASK compiled with NO_INTRINSICS; ignore performance results.\n"; +#endif +#ifdef MODEL_CACHE + os << "*** WARNING: YASK compiled with MODEL_CACHE; ignore performance results.\n"; +#endif +#ifdef TRACE_MEM + os << "*** WARNING: YASK compiled with TRACE_MEM; ignore performance results.\n"; +#endif +#ifdef TRACE_INTRINSICS + os << "*** WARNING: YASK compiled with TRACE_INTRINSICS; ignore performance results.\n"; +#endif + + // reset time keepers. + clear_timers(); + + // Init auto-tuner to run silently during normal operation. + _at.clear(false, false); + + // Adjust all settings before setting MPI buffers or sizing grids. + // Prints final settings. + // TODO: print settings again after auto-tuning. + _opts->adjustSettings(os, _env); + + // Report ranks. + os << endl; + os << "Num ranks: " << _env->get_num_ranks() << endl; + os << "This rank index: " << _env->get_rank_index() << endl; + + // report threads. 
+ os << "Num OpenMP procs: " << omp_get_num_procs() << endl; + set_all_threads(); + os << "Num OpenMP threads: " << omp_get_max_threads() << endl; + set_region_threads(); // Temporary; just for reporting. + os << " Num threads per region: " << omp_get_max_threads() << endl; + set_block_threads(); // Temporary; just for reporting. + os << " Num threads per block: " << omp_get_max_threads() << endl; + + // Set the number of threads for a region. It should stay this + // way for top-level OpenMP parallel sections. + int rthreads = set_region_threads(); + + // Run a dummy nested OMP loop to make sure nested threading is + // initialized. +#ifdef _OPENMP +#pragma omp parallel for + for (int i = 0; i < rthreads * 100; i++) { + + idx_t dummy = 0; + set_block_threads(); +#pragma omp parallel for reduction(+:dummy) + for (int j = 0; j < i * 100; j++) { + dummy += j; + } + } +#endif + + // Some grid stats. + os << endl; + os << "Num grids: " << gridPtrs.size() << endl; + os << "Num grids to be updated: " << outputGridPtrs.size() << endl; + + // Set up data based on MPI rank, including grid positions. + // Update all the grid sizes. + setupRank(); + + // Alloc grids, scratch grids, MPI bufs. + // This is the order in which preferred NUMA nodes (e.g., HBW mem) + // will be used. + // We free the scratch and MPI data first to give grids preference. + freeScratchData(os); + freeMpiData(os); + allocGridData(os); + allocScratchData(os); + allocMpiData(os); + + // Report total allocation. + rank_nbytes = get_num_bytes(); + os << "Total allocation in this rank: " << + makeByteStr(rank_nbytes) << "\n"; + tot_nbytes = sumOverRanks(rank_nbytes, _env->comm); + os << "Total overall allocation in " << _env->num_ranks << " rank(s): " << + makeByteStr(tot_nbytes) << "\n"; + + // Report some stats. 
+ idx_t dt = _opts->_rank_sizes[step_dim]; + os << "\nProblem sizes in points (from smallest to largest):\n" + " vector-size: " << _dims->_fold_pts.makeDimValStr(" * ") << endl << + " cluster-size: " << _dims->_cluster_pts.makeDimValStr(" * ") << endl << + " sub-block-size: " << _opts->_sub_block_sizes.makeDimValStr(" * ") << endl << + " sub-block-group-size: " << _opts->_sub_block_group_sizes.makeDimValStr(" * ") << endl << + " block-size: " << _opts->_block_sizes.makeDimValStr(" * ") << endl << + " block-group-size: " << _opts->_block_group_sizes.makeDimValStr(" * ") << endl << + " region-size: " << _opts->_region_sizes.makeDimValStr(" * ") << endl << + " rank-domain-size: " << _opts->_rank_sizes.makeDimValStr(" * ") << endl << + " overall-problem-size: " << overall_domain_sizes.makeDimValStr(" * ") << endl << + endl << + "Other settings:\n" + " yask-version: " << yask_get_version_string() << endl << + " stencil-name: " << get_name() << endl << + " element-size: " << makeByteStr(get_element_bytes()) << endl << +#ifdef USE_MPI + " num-ranks: " << _opts->_num_ranks.makeDimValStr(" * ") << endl << + " rank-indices: " << _opts->_rank_indices.makeDimValStr() << endl << + " rank-domain-offsets: " << rank_domain_offsets.makeDimValOffsetStr() << endl << +#endif + " rank-domain: " << rank_bb.bb_begin.makeDimValStr() << + " ... 
" << rank_bb.bb_end.subElements(1).makeDimValStr() << endl << + " vector-len: " << VLEN << endl << + " extra-padding: " << _opts->_extra_pad_sizes.makeDimValStr() << endl << + " minimum-padding: " << _opts->_min_pad_sizes.makeDimValStr() << endl << + " L1-prefetch-distance: " << PFD_L1 << endl << + " L2-prefetch-distance: " << PFD_L2 << endl << + " max-halos: " << max_halos.makeDimValStr() << endl; + if (num_wf_shifts > 0) { + os << + " wave-front-angles: " << wf_angles.makeDimValStr() << endl << + " num-wave-front-shifts: " << num_wf_shifts << endl << + " wave-front-shift-lens: " << wf_shifts.makeDimValStr() << endl << + " left-wave-front-exts: " << left_wf_exts.makeDimValStr() << endl << + " right-wave-front-exts: " << right_wf_exts.makeDimValStr() << endl << + " ext-rank-domain: " << ext_bb.bb_begin.makeDimValStr() << + " ... " << ext_bb.bb_end.subElements(1).makeDimValStr() << endl; + } + os << endl; + + // sums across bundles for this rank. + rank_numWrites_1t = 0; + rank_reads_1t = 0; + rank_numFpOps_1t = 0; + os << "Num stencil bundles: " << stBundles.size() << endl; + for (auto* sg : stBundles) { + idx_t updates1 = sg->get_scalar_points_written(); + idx_t updates_domain = updates1 * sg->bb_num_points; + rank_numWrites_1t += updates_domain; + idx_t reads1 = sg->get_scalar_points_read(); + idx_t reads_domain = reads1 * sg->bb_num_points; + rank_reads_1t += reads_domain; + idx_t fpops1 = sg->get_scalar_fp_ops(); + idx_t fpops_domain = fpops1 * sg->bb_num_points; + rank_numFpOps_1t += fpops_domain; + os << "Stats for bundle '" << sg->get_name() << "':\n" << + " sub-domain: " << sg->bb_begin.makeDimValStr() << + " ... 
" << sg->bb_end.subElements(1).makeDimValStr() << endl << + " sub-domain size: " << sg->bb_len.makeDimValStr(" * ") << endl << + " valid points in sub domain: " << makeNumStr(sg->bb_num_points) << endl << + " grid-updates per point: " << updates1 << endl << + " grid-updates in sub-domain: " << makeNumStr(updates_domain) << endl << + " grid-reads per point: " << reads1 << endl << + " grid-reads in sub-domain: " << makeNumStr(reads_domain) << endl << + " est FP-ops per point: " << fpops1 << endl << + " est FP-ops in sub-domain: " << makeNumStr(fpops_domain) << endl; + } + + // Various metrics for amount of work. + rank_numWrites_dt = rank_numWrites_1t * dt; + tot_numWrites_1t = sumOverRanks(rank_numWrites_1t, _env->comm); + tot_numWrites_dt = tot_numWrites_1t * dt; + + rank_reads_dt = rank_reads_1t * dt; + tot_reads_1t = sumOverRanks(rank_reads_1t, _env->comm); + tot_reads_dt = tot_reads_1t * dt; + + rank_numFpOps_dt = rank_numFpOps_1t * dt; + tot_numFpOps_1t = sumOverRanks(rank_numFpOps_1t, _env->comm); + tot_numFpOps_dt = tot_numFpOps_1t * dt; + + rank_domain_1t = rank_bb.bb_num_points; + rank_domain_dt = rank_domain_1t * dt; // same as _opts->_rank_sizes.product(); + tot_domain_1t = sumOverRanks(rank_domain_1t, _env->comm); + tot_domain_dt = tot_domain_1t * dt; + + // Print some more stats. 
+ os << endl << + "Amount-of-work stats:\n" << + " domain-size in this rank for one time-step: " << + makeNumStr(rank_domain_1t) << endl << + " overall-problem-size in all ranks for one time-step: " << + makeNumStr(tot_domain_1t) << endl << + endl << + " num-writes-required in this rank for one time-step: " << + makeNumStr(rank_numWrites_1t) << endl << + " num-writes-required in all ranks for one time-step: " << + makeNumStr(tot_numWrites_1t) << endl << + endl << + " num-reads-required in this rank for one time-step: " << + makeNumStr(rank_reads_1t) << endl << + " num-reads-required in all ranks for one time-step: " << + makeNumStr(tot_reads_1t) << endl << + endl << + " est-FP-ops in this rank for one time-step: " << + makeNumStr(rank_numFpOps_1t) << endl << + " est-FP-ops in all ranks for one time-step: " << + makeNumStr(tot_numFpOps_1t) << endl << + endl; + + if (dt > 1) { + os << + " domain-size in this rank for all time-steps: " << + makeNumStr(rank_domain_dt) << endl << + " overall-problem-size in all ranks for all time-steps: " << + makeNumStr(tot_domain_dt) << endl << + endl << + " num-writes-required in this rank for all time-steps: " << + makeNumStr(rank_numWrites_dt) << endl << + " num-writes-required in all ranks for all time-steps: " << + makeNumStr(tot_numWrites_dt) << endl << + endl << + " num-reads-required in this rank for all time-steps: " << + makeNumStr(rank_reads_dt) << endl << + " num-reads-required in all ranks for all time-steps: " << + makeNumStr(tot_reads_dt) << endl << + endl << + " est-FP-ops in this rank for all time-steps: " << + makeNumStr(rank_numFpOps_dt) << endl << + " est-FP-ops in all ranks for all time-steps: " << + makeNumStr(tot_numFpOps_dt) << endl << + endl; + } + os << + "Notes:\n" + " Domain-sizes and overall-problem-sizes are based on rank-domain sizes\n" + " and number of ranks regardless of number of grids or sub-domains.\n" + " Num-writes-required is based on sum of grid-updates in sub-domain across 
stencil-bundle(s).\n" + " Num-reads-required is based on sum of grid-reads in sub-domain across stencil-bundle(s).\n" + " Est-FP-ops are based on sum of est-FP-ops in sub-domain across stencil-bundle(s).\n" + "\n"; + } + + // Dealloc grids, etc. + void StencilContext::end_solution() { + + // Final halo exchange. + exchange_halos_all(); + + // Release any MPI data. + mpiData.clear(); + + // Release grid data. + for (auto gp : gridPtrs) { + if (!gp) + continue; + gp->release_storage(); + } + + // Reset threads to original value. + set_max_threads(); + } + + // Init all grids & params by calling initFn. + void StencilContext::initValues(function realInitFn) { + ostream& os = get_ostr(); + real_t v = 0.1; + os << "Initializing grids..." << endl; + for (auto gp : gridPtrs) { + realInitFn(gp, v); + v += 0.01; + } + } + + // Compute convenience values for a bounding-box. + void BoundingBox::update_bb(ostream& os, + const string& name, + StencilContext& context, + bool force_full) { + + auto dims = context.get_dims(); + auto& domain_dims = dims->_domain_dims; + bb_len = bb_end.subElements(bb_begin); + bb_size = bb_len.product(); + if (force_full) + bb_num_points = bb_size; + + // Solid rectangle? + bb_is_full = true; + if (bb_num_points != bb_size) { + os << "Warning: '" << name << "' domain has only " << + makeNumStr(bb_num_points) << + " valid point(s) inside its bounding-box of " << + makeNumStr(bb_size) << + " point(s); slower scalar calculations will be used.\n"; + bb_is_full = false; + } + + // Does everything start on a vector-length boundary? 
+ bb_is_aligned = true; + for (auto& dim : domain_dims.getDims()) { + auto& dname = dim.getName(); + if ((bb_begin[dname] - context.rank_domain_offsets[dname]) % + dims->_fold_pts[dname] != 0) { + os << "Note: '" << name << "' domain" + " has one or more starting edges not on vector boundaries;" + " masked calculations will be used in peel and remainder sub-blocks.\n"; + bb_is_aligned = false; + break; + } + } + + // Lengths are cluster-length multiples? + bb_is_cluster_mult = true; + for (auto& dim : domain_dims.getDims()) { + auto& dname = dim.getName(); + if (bb_len[dname] % dims->_cluster_pts[dname] != 0) { + if (bb_is_full && bb_is_aligned) + os << "Note: '" << name << "' domain" + " has one or more sizes that are not vector-cluster multiples;" + " masked calculations will be used in peel and remainder sub-blocks.\n"; + bb_is_cluster_mult = false; + break; + } + } + + // All done. + bb_valid = true; + } + + // Set the bounding-box for each stencil-bundle and whole domain. + void StencilContext::find_bounding_boxes() + { + ostream& os = get_ostr(); + + // Rank BB is based only on rank offsets and rank domain sizes. + rank_bb.bb_begin = rank_domain_offsets; + rank_bb.bb_end = rank_domain_offsets.addElements(_opts->_rank_sizes, false); + rank_bb.update_bb(os, "rank", *this, true); + + // Overall BB may be extended for wave-fronts. + ext_bb.bb_begin = rank_bb.bb_begin.subElements(left_wf_exts); + ext_bb.bb_end = rank_bb.bb_end.addElements(right_wf_exts); + ext_bb.update_bb(os, "extended-rank", *this, true); + + // Find BB for each bundle. + for (auto sg : stBundles) + sg->find_bounding_box(); + } + +} // namespace yask. diff --git a/src/kernel/lib/yask.hpp b/src/kernel/lib/yask.hpp index ec2ef99b..06c08636 100644 --- a/src/kernel/lib/yask.hpp +++ b/src/kernel/lib/yask.hpp @@ -140,6 +140,11 @@ inline void omp_set_nested(int n) { } #include "yask_stencil_code.hpp" #undef DEFINE_MACROS +// Max number of dims allowed in grids. 
+#ifndef MAX_DIMS +#define MAX_DIMS NUM_STENCIL_DIMS +#endif + // Default cmd-line arguments. #ifndef DEF_ARGS #define DEF_ARGS "" From 102a27393f93d55e941872b691c7953ed31dca9e Mon Sep 17 00:00:00 2001 From: Chuck Yount Date: Wed, 25 Apr 2018 13:43:23 -0700 Subject: [PATCH 09/21] Change DEBUG macro to CHECK. Turn on by default at -O0. This turns on lots of assertions in the kernel code. Add py-kernel-api target. --- Makefile | 3 +++ src/kernel/Makefile | 15 +++++++++++---- src/kernel/lib/generic_grids.hpp | 4 ++-- src/kernel/lib/realv.hpp | 4 ++-- src/kernel/lib/setup.cpp | 4 ++-- src/kernel/lib/stencil_calc.cpp | 4 ++-- src/kernel/lib/yask.hpp | 4 ++-- 7 files changed, 24 insertions(+), 14 deletions(-) diff --git a/Makefile b/Makefile index 5b12591e..f334a7bc 100644 --- a/Makefile +++ b/Makefile @@ -138,6 +138,9 @@ compiler-api: kernel-api: $(YK_MAKE) api +py-kernel-api: + $(YK_MAKE) py-api + api: $(YC_MAKE) $@ $(YK_MAKE) $@ diff --git a/src/kernel/Makefile b/src/kernel/Makefile index 3d5dccdf..d1e6c502 100644 --- a/src/kernel/Makefile +++ b/src/kernel/Makefile @@ -398,6 +398,11 @@ ifneq ($(filter -O0 -O1,$(YK_CXXOPT)),) pfd_l2 = 0 endif +# Turn on checking at O0. +ifneq ($(filter -O0,$(YK_CXXOPT)),) + MACROS += CHECK +endif + # Set MACROS based on individual makefile vars. # MACROS and EXTRA_MACROS will be written to a header file. MACROS += PFD_L1=$(pfd_l1) PFD_L2=$(pfd_l2) @@ -639,6 +644,8 @@ headers: $(YK_GEN_HEADERS) # Build C++ and Python kernel API libs. api: $(YK_LIB) $(YK_PY_LIB) +py-api: $(YK_PY_LIB) + # Build python kernel API lib. # TODO: consider adding $(YK_TAG) to [some of] these targets. 
$(YK_SWIG_DIR)/yask_kernel_api_wrap.cpp: $(YK_SWIG_DIR)/yask*.i $(INC_DIR)/*.hpp @@ -873,12 +880,12 @@ help: @echo " $(MAKE) clean; $(MAKE) -j arch=skl stencil=awp yk-api" @echo " " @echo "Example debug builds of kernel cmd-line tool:" - @echo " $(MAKE) clean; $(MAKE) -j stencil=iso3dfd mpi=0 OMPFLAGS='-qopenmp-stubs' YK_CXXOPT='-O0' EXTRA_MACROS='DEBUG'" - @echo " $(MAKE) clean; $(MAKE) -j arch=intel64 stencil=3axis mpi=0 OMPFLAGS='-qopenmp-stubs' YK_CXXOPT='-O0' EXTRA_MACROS='DEBUG TRACE' # TRACE is a useful debug setting!" - @echo " $(MAKE) clean; $(MAKE) -j arch=intel64 stencil=3axis radius=0 fold='x=1,y=1,z=1' mpi=0 YK_CXX=g++ OMPFLAGS='' YK_CXXOPT='-O0' EXTRA_MACROS='DEBUG TRACE TRACE_MEM TRACE_INTRINSICS'" + @echo " $(MAKE) clean; $(MAKE) -j stencil=iso3dfd mpi=0 OMPFLAGS='-qopenmp-stubs' YK_CXXOPT='-O0' EXTRA_MACROS='CHECK'" + @echo " $(MAKE) clean; $(MAKE) -j arch=intel64 stencil=3axis mpi=0 OMPFLAGS='-qopenmp-stubs' YK_CXXOPT='-O0' EXTRA_MACROS='CHECK TRACE' # TRACE is a useful debug setting!" + @echo " $(MAKE) clean; $(MAKE) -j arch=intel64 stencil=3axis radius=0 fold='x=1,y=1,z=1' mpi=0 YK_CXX=g++ OMPFLAGS='' YK_CXXOPT='-O0' EXTRA_MACROS='CHECK TRACE TRACE_MEM TRACE_INTRINSICS'" @echo " " @echo "Example builds with test runs:" @echo " $(MAKE) -j all" @echo " $(MAKE) -j all ranks=2" @echo " $(MAKE) -j all YK_CXX=g++ YK_CXXOPT=-O2 mpi=0" - @echo " $(MAKE) -j all YK_CXX=mpigxx YK_CXXOPT=-O2 ranks=3 EXTRA_MACROS='DEBUG'" + @echo " $(MAKE) -j all YK_CXX=mpigxx YK_CXXOPT=-O2 ranks=3 EXTRA_MACROS='CHECK'" diff --git a/src/kernel/lib/generic_grids.hpp b/src/kernel/lib/generic_grids.hpp index 7833a1d2..fd57fc5d 100644 --- a/src/kernel/lib/generic_grids.hpp +++ b/src/kernel/lib/generic_grids.hpp @@ -341,7 +341,7 @@ namespace yask { // Get 1D index using layout. 
virtual idx_t get_index(const Indices& idxs, bool check=true) const final { -#ifdef DEBUG +#ifdef CHECK if (check) { for (int i = 0; i < this->_dims.size(); i++) { idx_t j = idxs[i]; @@ -351,7 +351,7 @@ namespace yask { } #endif idx_t ai = _layout.layout(idxs); -#ifdef DEBUG +#ifdef CHECK if (check) assert(ai < this->get_num_elems()); #endif diff --git a/src/kernel/lib/realv.hpp b/src/kernel/lib/realv.hpp index a1b81bad..427660ab 100644 --- a/src/kernel/lib/realv.hpp +++ b/src/kernel/lib/realv.hpp @@ -99,7 +99,7 @@ namespace yask { #undef VEC_ELEMS // Macro for looping through an aligned real_vec_t. -#if defined(DEBUG) || (VLEN==1) || !defined(__INTEL_COMPILER) +#if defined(CHECK) || (VLEN==1) || !defined(__INTEL_COMPILER) #define REAL_VEC_LOOP(i) \ for (int i=0; iglobal_barrier(); ostream& os = get_ostr(); -#ifdef DEBUG - os << "*** WARNING: YASK compiled with DEBUG; ignore performance results.\n"; +#ifdef CHECK + os << "*** WARNING: YASK compiled with CHECK; ignore performance results.\n"; #endif #if defined(NO_INTRINSICS) && (VLEN > 1) os << "*** WARNING: YASK compiled with NO_INTRINSICS; ignore performance results.\n"; diff --git a/src/kernel/lib/stencil_calc.cpp b/src/kernel/lib/stencil_calc.cpp index c3c6b287..f6d2bf5d 100644 --- a/src/kernel/lib/stencil_calc.cpp +++ b/src/kernel/lib/stencil_calc.cpp @@ -515,7 +515,7 @@ namespace yask { loop_idxs.start.makeValStr(nsdims) << " ... (end before) " << loop_idxs.stop.makeValStr(nsdims)); -#ifdef DEBUG +#ifdef CHECK // Check that only the inner dim has a range greater than one cluster. for (int i = 0, j = 0; i < nsdims; i++) { if (i != step_posn) { @@ -553,7 +553,7 @@ namespace yask { " ... (end before) " << loop_idxs.stop.makeValStr(nsdims) << " w/write-mask = 0x" << hex << write_mask << dec); -#ifdef DEBUG +#ifdef CHECK // Check that only the inner dim has a range greater than one vector. 
for (int i = 0; i < nsdims; i++) { if (i != step_posn && i != _inner_posn) diff --git a/src/kernel/lib/yask.hpp b/src/kernel/lib/yask.hpp index 06c08636..abaa3dfa 100644 --- a/src/kernel/lib/yask.hpp +++ b/src/kernel/lib/yask.hpp @@ -43,9 +43,9 @@ typedef std::uint64_t uidx_t; // Settings from makefile. #include "yask_macros.hpp" -// Control assert() by turning on with DEBUG instead of turning off with +// Control assert() by turning on with CHECK instead of turning off with // NDEBUG. This makes it off by default. -#ifndef DEBUG +#ifndef CHECK #define NDEBUG #endif From fd20e788f900fb1112393d37bb54c06f3b90b1f2 Mon Sep 17 00:00:00 2001 From: Chuck Yount Date: Thu, 26 Apr 2018 11:53:29 -0700 Subject: [PATCH 10/21] Calculate max grid dims correctly. Needed when grid dims > stencil dims. --- src/compiler/lib/YaskKernel.cpp | 10 ++++++- src/kernel/Makefile | 49 +++++++++++++++++++-------------- 2 files changed, 37 insertions(+), 22 deletions(-) diff --git a/src/compiler/lib/YaskKernel.cpp b/src/compiler/lib/YaskKernel.cpp index f26151a0..b0aa9b83 100644 --- a/src/compiler/lib/YaskKernel.cpp +++ b/src/compiler/lib/YaskKernel.cpp @@ -92,7 +92,15 @@ namespace yask { os << "\n// Number of stencil dimensions (step and domain):\n" "#define NUM_STENCIL_DIMS " << _dims->_stencilDims.size() << endl; - + + int gdims = 0; + for (auto gp : _grids) { + int ndims = gp->get_num_dims(); + gdims = max(gdims, ndims); + } + os << "\n// Max number of grid dimensions:\n" + "#define NUM_GRID_DIMS " << gdims << endl; + // Vec/cluster lengths. 
auto nvec = _dims->_foldGT1.getNumDims(); os << "\n// One vector fold: " << _dims->_fold.makeDimValStr(" * ") << endl; diff --git a/src/kernel/Makefile b/src/kernel/Makefile index d1e6c502..6caeb23d 100644 --- a/src/kernel/Makefile +++ b/src/kernel/Makefile @@ -299,7 +299,8 @@ YK_PY_MOD := $(YASK_DIR)/$(YK_MODULE).py YK_API_TEST_EXEC := $(BIN_DIR)/$(YK_BASE)_api_test.exe YK_GRID_TEST_EXEC := $(BIN_DIR)/$(YK_BASE)_grid_test.exe YK_API_TEST_EXEC_WITH_EXCEPTION := $(BIN_DIR)/$(YK_BASE)_api_exception_test.exe -YK_DIMS_FILE := num_dims.$(stencil).txt +YK_STENCIL_DIMS_FILE := num_stencil_dims.$(stencil).txt +YK_GRID_DIMS_FILE := num_grid_dims.$(stencil).txt MAKE_REPORT_FILE:= make-report.$(YK_TAG).txt @@ -479,8 +480,9 @@ endif # Add in final flags and user-added flags. YK_CXXFLAGS += $(YK_CXXOPT) $(OMPFLAGS) $(EXTRA_YK_CXXFLAGS) -# Number of dims extracted from YASK compiler output. -NDIMS := `cat $(YK_DIMS_FILE)` +# Number of stencil/grid dims extracted from YASK compiler output. +NSDIMS := `cat $(YK_STENCIL_DIMS_FILE)` +NGDIMS := `cat $(YK_GRID_DIMS_FILE)` ######## Loop-compiler configuration: # The loop indices range from 0..N-1. @@ -497,7 +499,7 @@ NDIMS := `cat $(YK_DIMS_FILE)` # indices. Those that do not (e.g., grouped, serpentine, square-wave) may # *not* be used here when using temporal wavefronts. The time loop may be # found in StencilEquations::run_solution(). -RANK_LOOP_OPTS ?= -ndims $(NDIMS) -inVar rank_idxs +RANK_LOOP_OPTS ?= -ndims $(NSDIMS) -inVar rank_idxs RANK_LOOP_ORDER ?= 1 .. N-1 RANK_LOOP_CODE ?= $(RANK_LOOP_OUTER_MODS) loop($(RANK_LOOP_ORDER)) \ { $(RANK_LOOP_INNER_MODS) call(calc_region(stBundle_ptr)); } @@ -507,7 +509,7 @@ RANK_LOOP_CODE ?= $(RANK_LOOP_OUTER_MODS) loop($(RANK_LOOP_ORDER)) \ # to a top-level OpenMP thread. The region time loops are not coded here to # allow for proper spatial skewing for temporal wavefronts. The time loop # may be found in StencilEquations::calc_region(). 
-REGION_LOOP_OPTS ?= -ndims $(NDIMS) -inVar region_idxs \ +REGION_LOOP_OPTS ?= -ndims $(NSDIMS) -inVar region_idxs \ -ompConstruct '$(omp_par_for) schedule($(omp_region_schedule)) proc_bind(spread)' \ -callPrefix 'sg->' REGION_LOOP_OUTER_MODS ?= grouped omp @@ -519,7 +521,7 @@ REGION_LOOP_CODE ?= $(REGION_LOOP_OUTER_MODS) loop($(REGION_LOOP_ORDER)) { \ # a *nested* OpenMP loop so that each sub-block is assigned to a nested OpenMP # thread. There is no time loop because threaded temporal blocking is # not yet supported. -BLOCK_LOOP_OPTS ?= -ndims $(NDIMS) -inVar block_idxs \ +BLOCK_LOOP_OPTS ?= -ndims $(NSDIMS) -inVar block_idxs \ -ompConstruct '$(omp_par_for) schedule($(omp_block_schedule)) proc_bind(close)' \ -callPrefix 'sg->' BLOCK_LOOP_OUTER_MODS ?= grouped omp @@ -532,7 +534,7 @@ BLOCK_LOOP_CODE ?= $(BLOCK_LOOP_OUTER_MODS) loop($(BLOCK_LOOP_ORDER)) { \ # stencil compiler. There is no time loop because threaded temporal # blocking is not yet supported. The indexes in this loop are 'normalized', # i.e., vector units and rank-relative. -SUB_BLOCK_LOOP_OPTS ?= -ndims $(NDIMS) -inVar norm_sub_block_idxs +SUB_BLOCK_LOOP_OPTS ?= -ndims $(NSDIMS) -inVar norm_sub_block_idxs SUB_BLOCK_LOOP_OUTER_MODS ?= SUB_BLOCK_LOOP_ORDER ?= 1 .. N-2 SUB_BLOCK_LOOP_CODE ?= $(SUB_BLOCK_LOOP_OUTER_MODS) loop($(SUB_BLOCK_LOOP_ORDER)) { \ @@ -541,7 +543,7 @@ SUB_BLOCK_LOOP_CODE ?= $(SUB_BLOCK_LOOP_OUTER_MODS) loop($(SUB_BLOCK_LOOP_ORDER # General-purpose parallel loop. # Nested OpenMP is not used here because there is no sharing between threads. # TODO: Consider using nested OpenMP to hide more latency. -MISC_LOOP_OPTS ?= -ndims $(NDIMS) -inVar misc_idxs \ +MISC_LOOP_OPTS ?= -ndims $(NSDIMS) -inVar misc_idxs \ -ompConstruct '$(omp_par_for) schedule($(omp_misc_schedule)) proc_bind(spread)' MISC_LOOP_OUTER_MODS ?= omp MISC_LOOP_ORDER ?= 1 .. N-1 @@ -577,49 +579,54 @@ $(MAKE_REPORT_FILE): $(YK_LIB) #$(MAKE) code-stats | tee -a $@ # Generated source files. 
-$(YK_GEN_DIR)/yask_rank_loops.hpp: $(GEN_LOOPS) $(YK_DIMS_FILE) +$(YK_GEN_DIR)/yask_rank_loops.hpp: $(GEN_LOOPS) $(YK_STENCIL_DIMS_FILE) $(YK_MK_GEN_DIR) $(PERL) $< -output $@ $(RANK_LOOP_OPTS) $(EXTRA_LOOP_OPTS) $(EXTRA_RANK_LOOP_OPTS) "$(RANK_LOOP_CODE)" -$(YK_GEN_DIR)/yask_region_loops.hpp: $(GEN_LOOPS) $(YK_DIMS_FILE) +$(YK_GEN_DIR)/yask_region_loops.hpp: $(GEN_LOOPS) $(YK_STENCIL_DIMS_FILE) $(YK_MK_GEN_DIR) $(PERL) $< -output $@ $(REGION_LOOP_OPTS) $(EXTRA_LOOP_OPTS) $(EXTRA_REGION_LOOP_OPTS) "$(REGION_LOOP_CODE)" -$(YK_GEN_DIR)/yask_block_loops.hpp: $(GEN_LOOPS) $(YK_DIMS_FILE) +$(YK_GEN_DIR)/yask_block_loops.hpp: $(GEN_LOOPS) $(YK_STENCIL_DIMS_FILE) $(YK_MK_GEN_DIR) $(PERL) $< -output $@ $(BLOCK_LOOP_OPTS) $(EXTRA_LOOP_OPTS) $(EXTRA_BLOCK_LOOP_OPTS) "$(BLOCK_LOOP_CODE)" -$(YK_GEN_DIR)/yask_sub_block_loops.hpp: $(GEN_LOOPS) $(YK_DIMS_FILE) +$(YK_GEN_DIR)/yask_sub_block_loops.hpp: $(GEN_LOOPS) $(YK_STENCIL_DIMS_FILE) $(YK_MK_GEN_DIR) $(PERL) $< -output $@ $(SUB_BLOCK_LOOP_OPTS) $(EXTRA_LOOP_OPTS) $(EXTRA_SUB_BLOCK_LOOP_OPTS) "$(SUB_BLOCK_LOOP_CODE)" -$(YK_GEN_DIR)/yask_misc_loops.hpp: $(GEN_LOOPS) $(YK_DIMS_FILE) +$(YK_GEN_DIR)/yask_misc_loops.hpp: $(GEN_LOOPS) $(YK_STENCIL_DIMS_FILE) $(YK_MK_GEN_DIR) $< -output $@ $(MISC_LOOP_OPTS) $(EXTRA_LOOP_OPTS) $(EXTRA_MISC_LOOP_OPTS) "$(MISC_LOOP_CODE)" -$(YK_GEN_DIR)/yask_layout_macros.hpp: $(GEN_LAYOUTS) $(YK_DIMS_FILE) +$(YK_GEN_DIR)/yask_layout_macros.hpp: $(GEN_LAYOUTS) $(YK_GRID_DIMS_FILE) $(YK_MK_GEN_DIR) - $(PERL) $< -m $(NDIMS) > $@ + $(PERL) $< -m $(NGDIMS) > $@ @- gindent -fca $@ || \ indent -fca $@ || \ echo "note:" $@ "is not properly indented because indent program failed or was not found." 
-$(YK_GEN_DIR)/yask_layouts.hpp: $(GEN_LAYOUTS) $(YK_DIMS_FILE) +$(YK_GEN_DIR)/yask_layouts.hpp: $(GEN_LAYOUTS) $(YK_GRID_DIMS_FILE) $(YK_MK_GEN_DIR) - $(PERL) $< -d $(NDIMS) > $@ + $(PERL) $< -d $(NGDIMS) > $@ @- gindent -fca $@ || \ indent -fca $@ || \ echo "note:" $@ "is not properly indented because indent program failed or was not found." -$(YK_GEN_DIR)/yask_grid_code.hpp: $(GEN_LAYOUTS) $(YK_DIMS_FILE) +$(YK_GEN_DIR)/yask_grid_code.hpp: $(GEN_LAYOUTS) $(YK_GRID_DIMS_FILE) $(YK_MK_GEN_DIR) - $(PERL) $< -g $(NDIMS) > $@ + $(PERL) $< -g $(NGDIMS) > $@ # Extract the number of stencil dims from the compiler output. # Use this to create an option to pass to the loop generator script. -$(YK_DIMS_FILE): $(YK_CODE_FILE) +$(YK_STENCIL_DIMS_FILE): $(YK_CODE_FILE) awk '/NUM_STENCIL_DIMS/ {print $$NF}' $< > $@ +# Extract the number of grid dims from the compiler output. +# Use this to create an option to pass to the layout generator script. +$(YK_GRID_DIMS_FILE): $(YK_CODE_FILE) + awk '/NUM_GRID_DIMS/ {print $$NF}' $< > $@ + $(YK_CODE_FILE): $(YC_EXEC) $(YK_MK_GEN_DIR) $(RUN_PREFIX) $< $(YC_FLAGS) $(EXTRA_YC_FLAGS) -p $(YC_TARGET) $@ @@ -781,7 +788,7 @@ all: # Make this target before rebuilding YASK with any new parameters. clean: rm -fv *.s - rm -fv num_dims.*.txt + rm -fv num_*dims.*.txt rm -fr $(YK_SWIG_DIR)/build $(YK_GEN_DIR) rm -fv $(YK_SWIG_DIR)/*_api_wrap.* rm -fv $(YK_OBJS) From 0be535ff8414e0fdfaf1d3b573297c40ae4bee40 Mon Sep 17 00:00:00 2001 From: Chuck Yount Date: Thu, 26 Apr 2018 14:46:26 -0700 Subject: [PATCH 11/21] Reorganize tests. 
--- Makefile | 44 +++++++++++++++----------------------------- src/kernel/Makefile | 32 +++++++++++++++----------------- 2 files changed, 30 insertions(+), 46 deletions(-) diff --git a/Makefile b/Makefile index f334a7bc..6c357999 100644 --- a/Makefile +++ b/Makefile @@ -196,39 +196,21 @@ py-yc-api-and-cxx-yk-api-test: $(YK_MAKE) py-yc-api-test $(YK_MAKE) cxx-yk-api-test -# Run C++ compiler API test with exception, then run C++ kernel API test with exception. -cxx-yc-api-and-cxx-yk-api-test-with-exception: - $(YK_MAKE) cxx-yc-api-test-with-exception - $(YK_MAKE) cxx-yk-api-test-with-exception - -# Run python compiler API test with exception, then run python kernel API test with exception. -py-yc-api-and-py-yk-api-test-with-exception: - $(YK_MAKE) py-yc-api-test-with-exception - $(YK_MAKE) py-yk-api-test-with-exception - -# Run C++ compiler API test with exception, then run python kernel API test with exception. -cxx-yc-api-and-py-yk-api-test-with-exception: - $(YK_MAKE) cxx-yc-api-test-with-exception - $(YK_MAKE) py-yk-api-test-with-exception - -# Run python compiler API test with exception, then run C++ kernel API test with exception. -py-yc-api-and-cxx-yk-api-test-with-exception: - $(YK_MAKE) py-yc-api-test-with-exception - $(YK_MAKE) cxx-yk-api-test-with-exception - -api-tests: - $(MAKE) yc-and-cxx-yk-api-test - $(MAKE) yc-and-py-yk-api-test +# Run 8 out of 9 combos of (built-in, C++, Python)^2 +# API tests. The 9th one is built-in with built-in, +# which is tested more extensively in the kernel tests. +# When the built-in stencil examples aren't being used, +# "stencil=test" in the commands below is simply used to +# create file names. 
+combo-api-tests: + $(MAKE) stencil=iso3dfd yc-and-cxx-yk-api-test + $(MAKE) stencil=iso3dfd yc-and-py-yk-api-test $(MAKE) stencil=test cxx-yc-api-and-yk-test $(MAKE) stencil=test py-yc-api-and-yk-test $(MAKE) stencil=test cxx-yc-api-and-cxx-yk-api-test $(MAKE) stencil=test py-yc-api-and-py-yk-api-test $(MAKE) stencil=test cxx-yc-api-and-py-yk-api-test $(MAKE) stencil=test py-yc-api-and-cxx-yk-api-test - $(MAKE) stencil=test cxx-yc-api-and-cxx-yk-api-test-with-exception - $(MAKE) stencil=test py-yc-api-and-py-yk-api-test-with-exception - $(MAKE) stencil=test cxx-yc-api-and-py-yk-api-test-with-exception - $(MAKE) stencil=test py-yc-api-and-cxx-yk-api-test-with-exception ######## Misc targets @@ -248,10 +230,14 @@ tuple-test: $(TUPLE_TEST_EXEC) @echo '*** Running the C++ YASK tuple test...' $(RUN_PREFIX) $< -all-tests: compiler +api-tests: compiler-api + $(MAKE) combo-api-tests + $(YK_MAKE) $@ + +all-tests: compiler-api $(MAKE) tuple-test + $(MAKE) combo-api-tests $(YK_MAKE) $@ - $(MAKE) api-tests docs: api-docs diff --git a/src/kernel/Makefile b/src/kernel/Makefile index 6caeb23d..06301110 100644 --- a/src/kernel/Makefile +++ b/src/kernel/Makefile @@ -723,18 +723,6 @@ cxx-yc-api-test: $(YK_MK_GEN_DIR) mv $(YC_SRC_DIR)/yc-api-test-cxx.hpp $(YK_CODE_FILE) -# Run Python compiler API test with exceptions to create stencil-code file. -py-yc-api-test-with-exception: - $(MAKE) -C $(YC_SRC_DIR) $@ - $(YK_MK_GEN_DIR) - mv $(YC_SRC_DIR)/yc-api-test-with-exception-py.hpp $(YK_CODE_FILE) - -# Run C++ compiler API test with exceptions to create stencil-code file. -cxx-yc-api-test-with-exception: - $(MAKE) -C $(YC_SRC_DIR) $@ - $(YK_MK_GEN_DIR) - mv $(YC_SRC_DIR)/yc-api-test-with-exception-cxx.hpp $(YK_CODE_FILE) - ######## Misc targets # Run the default YASK compiler and kernel. 
@@ -755,15 +743,20 @@ kernel-only: yk-test-no-yc: kernel-only $(BIN_DIR)/yask.sh -stencil $(stencil) -arch $(arch) -ranks $(ranks) -v $(v_args) +# Run the kernel API tests for C++ and Python with and w/o expected exceptions. +api-tests: + $(MAKE) clean; $(MAKE) cxx-yk-api-test real_bytes=8 stencil=iso3dfd + $(MAKE) clean; $(MAKE) py-yk-api-test stencil=iso3dfd + $(MAKE) clean; $(MAKE) cxx-yk-api-test-with-exception real_bytes=8 stencil=iso3dfd + $(MAKE) clean; $(MAKE) py-yk-api-test-with-exception stencil=iso3dfd + +# Run several stencils using built-in validation. # NB: set arch var if applicable. # NB: save some time by using YK_CXXOPT=-O2. # These tests are focused on the kernel and not the compiler. # For testing both the kernel and compiler in various combinations, # run the tests from the top-level Makefile. -all-tests: - $(MAKE) clean; $(MAKE) cxx-yk-grid-test stencil=test_3d fold=x=4,y=2 - $(MAKE) clean; $(MAKE) cxx-yk-api-test real_bytes=8 stencil=iso3dfd - $(MAKE) clean; $(MAKE) py-yk-api-test stencil=iso3dfd +stencil-tests: $(MAKE) clean; $(MAKE) yc-and-yk-test real_bytes=8 stencil=test_1d $(MAKE) clean; $(MAKE) yc-and-yk-test real_bytes=8 stencil=3axis fold=x=2,y=2 $(MAKE) clean; $(MAKE) yc-and-yk-test real_bytes=8 stencil=9axis fold=x=2,z=2 @@ -778,6 +771,11 @@ all-tests: $(MAKE) clean; $(MAKE) yc-and-yk-test real_bytes=8 stencil=fsg_abc $(MAKE) clean; $(MAKE) yc-and-yk-test real_bytes=8 stencil=fsg2 +all-tests: + $(MAKE) clean; $(MAKE) cxx-yk-grid-test stencil=test_3d fold=x=4,y=2 + $(MAKE) api-tests + $(MAKE) stencil-tests + all: $(MAKE) kernel $(MAKE) api @@ -895,4 +893,4 @@ help: @echo " $(MAKE) -j all" @echo " $(MAKE) -j all ranks=2" @echo " $(MAKE) -j all YK_CXX=g++ YK_CXXOPT=-O2 mpi=0" - @echo " $(MAKE) -j all YK_CXX=mpigxx YK_CXXOPT=-O2 ranks=3 EXTRA_MACROS='CHECK'" + @echo " $(MAKE) -j all YK_CXX=mpigxx YK_CXXOPT=-O1 ranks=3 EXTRA_MACROS='CHECK'" From 777259da0e9dedb7137eac6dfbf56347711e489a Mon Sep 17 00:00:00 2001 From: Chuck Yount Date: 
Thu, 26 Apr 2018 14:48:23 -0700 Subject: [PATCH 12/21] Add check for gcc version. --- src/compiler/lib/Expr.hpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/compiler/lib/Expr.hpp b/src/compiler/lib/Expr.hpp index 89023f95..6fd375ec 100644 --- a/src/compiler/lib/Expr.hpp +++ b/src/compiler/lib/Expr.hpp @@ -39,6 +39,14 @@ IN THE SOFTWARE. #include #include #include + +// Need g++ >= 4.9 for regex. +#define GCC_VERSION (__GNUC__ * 10000 \ + + __GNUC_MINOR__ * 100 \ + + __GNUC_PATCHLEVEL__) +#if GCC_VERSION < 40900 +#error G++ 4.9.0 or later is required +#endif #include // Common utilities. From a2078372ceb5abfab6ab3f4b444af19e93bec940 Mon Sep 17 00:00:00 2001 From: Chuck Yount Date: Fri, 27 Apr 2018 14:42:53 -0700 Subject: [PATCH 13/21] Fix alignment of blocks in ranks with non-aligned starting offsets. Fix padding and offsets of non-vectorized grids. --- bin/gen_loops.pl | 17 ++++++- src/kernel/lib/context.cpp | 55 ++++++++++++++++------- src/kernel/lib/context.hpp | 5 +-- src/kernel/lib/grid_apis.cpp | 1 + src/kernel/lib/realv_grids.cpp | 33 ++++++++++++++ src/kernel/lib/realv_grids.hpp | 19 +------- src/kernel/lib/settings.hpp | 34 +++++++++----- src/kernel/lib/stencil_calc.cpp | 80 +++++++++++++++++++++++++-------- 8 files changed, 174 insertions(+), 70 deletions(-) diff --git a/bin/gen_loops.pl b/bin/gen_loops.pl index ff62a008..50b75f40 100755 --- a/bin/gen_loops.pl +++ b/bin/gen_loops.pl @@ -92,6 +92,9 @@ sub stepVar { sub alignVar { return inVar("align", @_); } +sub alignOfsVar { + return inVar("align_ofs", @_); +} sub groupSizeVar { return inVar("group_size", @_); } @@ -201,6 +204,7 @@ ($$$) my $evar = endVar($dim); my $svar = stepVar($dim); my $avar = alignVar($dim); + my $aovar = alignOfsVar($dim); my $aavar = adjAlignVar($dim); my $abvar = alignBeginVar($dim); my $nvar = numItersVar($dim); @@ -208,11 +212,20 @@ ($$$) my $tsvar = groupSizeVar($dim); my $ntivar = numFullGroupItersVar($dim); + # Example alignment: + # bvar = 20. + # svar = 8. 
+ # avar = 4. + # aovar = 15. + # Then, + # aavar = min(4, 8) = 4. + # abvar = round_down_flr(20 - 15, 4) + 15 = 4 + 15 = 19. + push @$code, " // Alignment must be less than or equal to step size.", " const $itype $aavar = std::min($avar, $svar);", - " // Aligned beginning point. May be at or before $bvar.", - " const $itype $abvar = yask::round_down_flr($bvar, $aavar);", + " // Aligned beginning point such that ($bvar - $svar) < $abvar <= $bvar.", + " const $itype $abvar = yask::round_down_flr($bvar - $aovar, $aavar) + $aovar;", " // Number of iterations to get from $abvar to (but not including) $evar, stepping by $svar.". " This value is rounded up because the last iteration may cover fewer than $svar steps.", " const $itype $nvar = yask::ceil_idiv_flr($evar - $abvar, $svar);"; diff --git a/src/kernel/lib/context.cpp b/src/kernel/lib/context.cpp index 2a47ca33..668696d1 100644 --- a/src/kernel/lib/context.cpp +++ b/src/kernel/lib/context.cpp @@ -193,14 +193,14 @@ namespace yask { // Indices to loop through. // Init from begin & end tuples. - ScanIndices rank_idxs(*_dims, false); + ScanIndices rank_idxs(*_dims, false, &rank_domain_offsets); rank_idxs.begin = begin; rank_idxs.end = end; // Set offsets in scratch grids. // Requires scratch grids to be allocated for whole // rank instead of smaller grid size. - update_scratch_grids(scratch_grid_idx, rank_idxs); + update_scratch_grids(scratch_grid_idx, rank_idxs.begin); // Initial halo exchange. // (Needed in case there are 0 time-steps). @@ -376,7 +376,7 @@ namespace yask { } // Indices needed for the 'rank' loops. - ScanIndices rank_idxs(*_dims, true); + ScanIndices rank_idxs(*_dims, true, &rank_domain_offsets); rank_idxs.begin = begin; rank_idxs.end = end; rank_idxs.step = step; @@ -507,7 +507,7 @@ namespace yask { " ... (end before) " << rank_idxs.stop.makeValStr(ndims)); // Init region begin & end from rank start & stop indices. 
- ScanIndices region_idxs(*_dims, true); + ScanIndices region_idxs(*_dims, true, &rank_domain_offsets); region_idxs.initFromOuter(rank_idxs); // Make a copy of the original start and stop indices because @@ -967,12 +967,13 @@ namespace yask { } } - // Adjust offsets of scratch grids based - // on thread and scan indices. - // Each scratch-grid is assigned to a thread, so it must - // "move around" as the thread is assigned to each block. + // Adjust offsets of scratch grids based on thread number 'thread_idx' + // and beginning point of block 'idxs'. Each scratch-grid is assigned + // to a thread, so it must "move around" as the thread is assigned to + // each block. This move is accomplished by changing the grids' global + // and local offsets. void StencilContext::update_scratch_grids(int thread_idx, - const ScanIndices& idxs) { + const Indices& idxs) { auto dims = get_dims(); int nsdims = dims->_stencil_dims.size(); auto step_posn = Indices::step_posn; @@ -981,7 +982,7 @@ namespace yask { for (auto* sv : scratchVecs) { assert(sv); - // Get the one for this thread. + // Get ptr to the scratch grid for this thread. auto gp = sv->at(thread_idx); assert(gp); assert(gp->is_scratch()); @@ -996,16 +997,32 @@ namespace yask { int posn = gp->get_dim_posn(dname); if (posn >= 0) { + // | +------+ | + // | loc | | | + // | ofs | | | + // |<------>| | | + // | +------+ | + // ^ ^ + // | | + // | start of grid/0-idx of block + // first rank-domain index + // Set offset of grid based on starting point of block. - // This is global, so it will include the rank offset. - gp->_set_offset(posn, idxs.begin[i]); + // This is a global index, so it will include the rank offset. + gp->_set_offset(posn, idxs[i]); + // Local offset is the offset of this grid + // relative to the current rank. // Set local offset to diff between global offset - // and rank offset. Must be vec-multiple. + // and rank offset. 
auto rofs = rank_domain_offsets[j]; - auto lofs = idxs.begin[i] - rofs; + auto lofs = idxs[i] - rofs; gp->_set_local_offset(posn, lofs); - assert(imod_flr(lofs, dims->_fold_pts[j]) == 0); + + // For a vectorized grid, the local offset must + // be a vector multiple. This is necessary for + // vector and cluster operations to work properly. + assert(imod_flr(lofs, gp->_get_vec_lens(posn)) == 0); } j++; } @@ -1148,10 +1165,14 @@ namespace yask { auto sg_list = sg.get_scratch_deps(); sg_list.push_back(&sg); - // Loop through all the needed groups. + // Loop through all the needed bundles. for (auto* csg : sg_list) { - // Loop thru all *input* grids in this group. + TRACE_MSG("exchange_halos: checking " << csg->inputGridPtrs.size() << + " input grid(s) to bundle '" << csg->get_name() << + "' that is needed for bundle '" << sg.get_name() << "'"); + + // Loop thru all *input* grids in this bundle. for (auto gp : csg->inputGridPtrs) { // Don't swap scratch grids. diff --git a/src/kernel/lib/context.hpp b/src/kernel/lib/context.hpp index d156bf3d..418959fb 100644 --- a/src/kernel/lib/context.hpp +++ b/src/kernel/lib/context.hpp @@ -295,9 +295,6 @@ namespace yask { // Dump stats if get_stats() hasn't been called yet. if (steps_done) get_stats(); - - // Free mem, reset threads, etc. - end_solution(); } // Set debug output to cout if my_rank == msg_rank @@ -386,7 +383,7 @@ namespace yask { // Adjust offsets of scratch grids based // on thread and scan indices. virtual void update_scratch_grids(int thread_idx, - const ScanIndices& idxs); + const Indices& idxs); // Get total memory allocation required by grids. // Does not include MPI buffers. 
diff --git a/src/kernel/lib/grid_apis.cpp b/src/kernel/lib/grid_apis.cpp index d0e163ba..3b2a148d 100644 --- a/src/kernel/lib/grid_apis.cpp +++ b/src/kernel/lib/grid_apis.cpp @@ -66,6 +66,7 @@ namespace yask { GET_GRID_API(get_last_rank_alloc_index, _offsets[posn] - _left_pads[posn] + _allocs[posn] - 1, false, true, false, true) GET_GRID_API(_get_left_wf_ext, _left_wf_exts[posn], true, true, true, false) GET_GRID_API(_get_right_wf_ext, _right_wf_exts[posn], true, true, true, false) + GET_GRID_API(_get_vec_lens, _vec_lens[posn], true, true, true, true) GET_GRID_API(_get_offset, _offsets[posn], true, true, true, true) GET_GRID_API(_get_local_offset, _local_offsets[posn], true, true, true, false) GET_GRID_API(_get_first_alloc_index, _offsets[posn] - _left_pads[posn], true, true, true, true) diff --git a/src/kernel/lib/realv_grids.cpp b/src/kernel/lib/realv_grids.cpp index 51ba3086..43726f97 100644 --- a/src/kernel/lib/realv_grids.cpp +++ b/src/kernel/lib/realv_grids.cpp @@ -109,6 +109,36 @@ namespace yask { return posn; } + // Determine required padding from halos. + // Does not include user-specified min padding or + // final rounding for left pad. + Indices YkGridBase::getReqdPad(const Indices& halos, const Indices& wf_exts) const { + + // Start with halos plus WF exts. + Indices mp = halos.addElements(wf_exts); + + + // For scratch grids, halo area must be written to. Halo is sum + // of dependent's write halo and depender's read halo, but these + // two components are not stored individually. Write halo will + // be expanded to full vec len during computation, requiring + // load from read halo beyond full vec len. Worst case is when + // write halo is one and rest is read halo. So if there is a + // halo and/or wf-ext, padding should be that plus all but one + // element of a vector. In addition, this vec-len should be the + // global one, not the one for this grid to handle the case where + // this grid is not vectorized. 
+ for (int i = 0; i < get_num_dims(); i++) { + if (mp[i] >= 1) { + auto& dname = get_dim_name(i); + auto* p = _dims->_domain_dims.lookup(dname); + if (p) + mp[i] += *p - 1; + } + } + return mp; + } + // Resizes the underlying generic grid. // Modifies _pads and _allocs. // Fails if mem different and already alloc'd. @@ -151,6 +181,9 @@ namespace yask { left_pads2[i] = ROUND_UP(left_pads2[i], _vec_lens[i]); _left_pads[i] = left_pads2[i]; _vec_left_pads[i] = left_pads2[i] / _vec_lens[i]; + + // For the right pad, we will round it up below when + // we calculate alloc. } // New allocation in each dim. diff --git a/src/kernel/lib/realv_grids.hpp b/src/kernel/lib/realv_grids.hpp index 8c185a8b..929376a6 100644 --- a/src/kernel/lib/realv_grids.hpp +++ b/src/kernel/lib/realv_grids.hpp @@ -92,23 +92,7 @@ namespace yask { // Determine required padding from halos. // Does not include user-specified min padding or // final rounding for left pad. - virtual Indices getReqdPad(const Indices& halos, const Indices& wf_exts) const { - Indices mp = halos.addElements(wf_exts); - for (int i = 0; i < get_num_dims(); i++) { - - // For scratch grids, halo area must be written to. Halo is sum - // of dependent's write halo and dependency's read halo, but - // these two components are not stored individually. Write halo - // will be expanded to full vec len during computation, - // requiring load from read halo beyond full vec len. Worst - // case is when write halo is one and rest is read halo. So if - // there is a halo and/or wf-ext, padding should be that plus - // all but one element of a vector. - if (mp[i] >= 1) - mp[i] += _vec_lens[i] - 1; - } - return mp; - } + virtual Indices getReqdPad(const Indices& halos, const Indices& wf_exts) const; // Check whether dim exists and is of allowed type. 
virtual void checkDimType(const std::string& dim, @@ -382,6 +366,7 @@ namespace yask { GET_GRID_API(_get_last_alloc_index) GET_GRID_API(_get_left_wf_ext) GET_GRID_API(_get_right_wf_ext) + GET_GRID_API(_get_vec_lens) SET_GRID_API(_set_domain_size) SET_GRID_API(_set_left_pad_size) SET_GRID_API(_set_right_pad_size) diff --git a/src/kernel/lib/settings.hpp b/src/kernel/lib/settings.hpp index f9d305e6..5545de7e 100644 --- a/src/kernel/lib/settings.hpp +++ b/src/kernel/lib/settings.hpp @@ -460,8 +460,12 @@ namespace yask { Indices begin, end; // first and end (beyond last) range of each index. Indices step; // step value within range. Indices align; // alignment of steps after first one. + Indices align_ofs; // adjustment for alignment (see below). Indices group_size; // proximity grouping within range. + // Alignment: when possible, each step will be aligned + // such that ((start - align_ofs) % align) == 0. + // Values that differ for each sub-range. Indices start, stop; // first and last+1 for this sub-range. Indices index; // 0-based unique index for each sub-range. @@ -475,26 +479,30 @@ namespace yask { // start stop (index = 2) // Default init. - ScanIndices(const Dims& dims, bool use_vec_align) : + ScanIndices(const Dims& dims, bool use_vec_align, IdxTuple* ofs) : ndims(dims._stencil_dims.size()), begin(idx_t(0), ndims), end(idx_t(0), ndims), step(idx_t(1), ndims), align(idx_t(1), ndims), + align_ofs(idx_t(0), ndims), group_size(idx_t(1), ndims), start(idx_t(0), ndims), stop(idx_t(0), ndims), index(idx_t(0), ndims) { - // Set alignment to vector lengths. - if (use_vec_align) { - - // i: index for stencil dims, j: index for domain dims. - for (int i = 0, j = 0; i < ndims; i++) { - if (i != Indices::step_posn) { - align[i] = dims._fold_pts[j]; - j++; - } + // i: index for stencil dims, j: index for domain dims. + for (int i = 0, j = 0; i < ndims; i++) { + if (i == Indices::step_posn) continue; + + // Set alignment to vector lengths. 
+ if (use_vec_align) + align[i] = dims._fold_pts[j]; + + // Set alignment offset. + if (ofs) { + assert(ofs->getNumDims() == ndims - 1); + align_ofs[i] = ofs->getVal(j); } } } @@ -508,7 +516,11 @@ namespace yask { begin = outer.start; end = outer.stop; - // Pass output values through by default. + // Pass other values through by default. + step = outer.step; + align = outer.align; + align_ofs = outer.align_ofs; + group_size = outer.group_size; start = outer.start; stop = outer.stop; index = outer.index; diff --git a/src/kernel/lib/stencil_calc.cpp b/src/kernel/lib/stencil_calc.cpp index f6d2bf5d..1f26ee92 100644 --- a/src/kernel/lib/stencil_calc.cpp +++ b/src/kernel/lib/stencil_calc.cpp @@ -36,18 +36,19 @@ namespace yask { auto opts = _generic_context->get_settings(); auto dims = _generic_context->get_dims(); - int ndims = dims->_stencil_dims.size(); + int nsdims = dims->_stencil_dims.size(); auto& step_dim = dims->_step_dim; + auto step_posn = Indices::step_posn; int thread_idx = omp_get_thread_num(); // used to index the scratch grids. TRACE_MSG3("calc_block:" << " in non-scratch bundle '" << get_name() << "': " << - region_idxs.start.makeValStr(ndims) << - " ... (end before) " << region_idxs.stop.makeValStr(ndims) << + region_idxs.start.makeValStr(nsdims) << + " ... (end before) " << region_idxs.stop.makeValStr(nsdims) << " by thread " << thread_idx); assert(!is_scratch()); // Init default block begin & end from region start & stop indices. - ScanIndices def_block_idxs(*dims, true); + ScanIndices def_block_idxs(*dims, true, 0); def_block_idxs.initFromOuter(region_idxs); // Steps within a block are based on sub-block sizes. @@ -57,7 +58,7 @@ namespace yask { def_block_idxs.group_size = opts->_sub_block_group_sizes; // Update offsets of scratch grids based on this bundle's location. 
- _generic_context->update_scratch_grids(thread_idx, def_block_idxs); + _generic_context->update_scratch_grids(thread_idx, def_block_idxs.begin); // Define the bundles that need to be processed in // this block. This will be the prerequisite scratch-grid @@ -79,8 +80,8 @@ namespace yask { TRACE_MSG3("calc_block: " << " in bundle '" << sg->get_name() << "': " << - block_idxs.begin.makeValStr(ndims) << - " ... (end before) " << block_idxs.end.makeValStr(ndims) << + block_idxs.begin.makeValStr(nsdims) << + " ... (end before) " << block_idxs.end.makeValStr(nsdims) << " by thread " << thread_idx); // Include automatically-generated loop code that calls @@ -91,7 +92,8 @@ namespace yask { } // Normalize the indices, i.e., divide by vector len in each dim. - // Ranks offsets must already be subtracted. + // Ranks offsets must already be subtracted because rank offsets + // are not necessarily vec-multiples. // Each dim in 'orig' must be a multiple of corresponding vec len. void StencilBundleBase::normalize_indices(const Indices& orig, Indices& norm) const { auto* cp = _generic_context; @@ -155,7 +157,7 @@ namespace yask { // Init sub-block begin & end from block start & stop indices. // These indices are in element units and global (NOT rank-relative). - ScanIndices sub_block_idxs(*dims, true); + ScanIndices sub_block_idxs(*dims, true, 0); sub_block_idxs.initFromOuter(block_idxs); // Sub block indices in element units and rank-relative. @@ -169,10 +171,16 @@ namespace yask { // These indices are in element units and rank-relative. ScanIndices sub_block_fvidxs(sub_block_idxs); - // Superset of sub-block that is full or partial vectors. + // Superset of sub-block that is full or partial (masked) vectors. // These indices are in element units and rank-relative. ScanIndices sub_block_vidxs(sub_block_idxs); + // These will be set to rank-relative, so set ofs to zero. 
+ sub_block_eidxs.align_ofs.setFromConst(0); + sub_block_fcidxs.align_ofs.setFromConst(0); + sub_block_fvidxs.align_ofs.setFromConst(0); + sub_block_vidxs.align_ofs.setFromConst(0); + // Masks for computing partial vectors in each dim. // Init to all-ones (no masking). Indices peel_masks(nsdims), rem_masks(nsdims); @@ -209,9 +217,9 @@ namespace yask { sub_block_vidxs.end.setFromConst(0); } + // Adjust indices to be rank-relative. // Determine the subset of this sub-block that is - // clusters, vectors, and partial vectors. TODO: pre-calc this info - // for each block. + // clusters, vectors, and partial vectors. else { do_clusters = true; do_vectors = false; @@ -233,6 +241,7 @@ namespace yask { // Find range of full clusters. // Note that fcend <= eend because we round // down to get whole clusters only. + // Similarly, fcbgn >= ebgn. auto cpts = dims->_cluster_pts[j]; auto fcbgn = round_up_flr(ebgn, cpts); auto fcend = round_down_flr(eend, cpts); @@ -255,6 +264,8 @@ namespace yask { // Similar but opposite for begin vars. // We make a vector mask to pick the // right elements. + // TODO: use compile-time consts instead + // of _fold_pts for more efficiency. auto vpts = dims->_fold_pts[j]; auto fvbgn = round_up_flr(ebgn, vpts); auto fvend = round_down_flr(eend, vpts); @@ -282,6 +293,26 @@ namespace yask { // Calculate masks in this dim for partial vectors. // All such masks will be ANDed together to form the // final masks over all domain dims. + // Example: assume folding is x=4*y=4. + // Possible 'x' peel mask to exclude 1st 2 cols: + // 0 0 1 1 + // 0 0 1 1 + // 0 0 1 1 + // 0 0 1 1 + // Possible 'y' peel mask to exclude 1st row: + // 0 0 0 0 + // 1 1 1 1 + // 1 1 1 1 + // 1 1 1 1 + // Along 'x' face, the 'x' peel mask is used. + // Along 'y' face, the 'y' peel mask is used. + // Along an 'x-y' edge, they are ANDed to make this mask: + // 0 0 0 0 + // 0 0 1 1 + // 0 0 1 1 + // 0 0 1 1 + // so that the 6 corner elements are updated. 
+ if (vbgn < fvbgn || vend > fvend) { idx_t pmask = 0, rmask = 0; @@ -327,6 +358,15 @@ namespace yask { scalar_for_peel_rem = true; } } + + // If no peel or rem, just set vec indices to same as + // full cluster. + else { + sub_block_fvidxs.begin[i] = fcbgn; + sub_block_fvidxs.end[i] = fcend; + sub_block_vidxs.begin[i] = fcbgn; + sub_block_vidxs.end[i] = fcend; + } // Next domain index. j++; @@ -348,7 +388,7 @@ namespace yask { norm_sub_block_idxs.stop = norm_sub_block_idxs.end; norm_sub_block_idxs.align.setFromConst(1); // one vector. - // Full rectangular polytope of aligned clusters: use optimized code. + // Full rectilinear polytope of aligned clusters: use optimized code. if (do_clusters) { TRACE_MSG3("calc_sub_block: using cluster code for " << sub_block_fcidxs.begin.makeValStr(nsdims) << @@ -380,7 +420,7 @@ namespace yask { TRACE_MSG3("calc_sub_block: using vector code for " << sub_block_vidxs.begin.makeValStr(nsdims) << " ... (end before) " << sub_block_vidxs.end.makeValStr(nsdims) << - " before and/or after full vector-clusters in " << + " *not* within full vector-clusters at " << sub_block_fcidxs.begin.makeValStr(nsdims) << " ... (end before) " << sub_block_fcidxs.end.makeValStr(nsdims)); @@ -413,7 +453,7 @@ namespace yask { // Also normalize the *full* vector indices to determine if // we need a mask at each vector index. - // We don't need start, stop, or step for this. + // We just need begin and end indices for this. ScanIndices norm_sub_block_fvidxs(sub_block_eidxs); normalize_indices(sub_block_fvidxs.begin, norm_sub_block_fvidxs.begin); normalize_indices(sub_block_fvidxs.end, norm_sub_block_fvidxs.end); @@ -424,6 +464,8 @@ namespace yask { // range (before the cluster) and/or remainder // range (after the clusters). If so, call the // loop-of-vectors function w/appropriate mask. + // See the mask diagrams above that show how the + // masks are ANDed together. // Since step is always 1, we ignore loop_idxs.stop. 
#define calc_inner_loop(thread_idx, loop_idxs) \ bool ok = false; \ @@ -635,11 +677,11 @@ namespace yask { auto& domain_dims = dims->_domain_dims; auto& step_dim = dims->_step_dim; auto& stencil_dims = dims->_stencil_dims; - auto ndims = stencil_dims.size(); + auto nsdims = stencil_dims.size(); // Init min vars w/max val and vice-versa. - Indices min_pts(idx_max, ndims); - Indices max_pts(idx_min, ndims); + Indices min_pts(idx_max, nsdims); + Indices max_pts(idx_min, nsdims); idx_t npts = 0; // Begin, end tuples. @@ -653,7 +695,7 @@ namespace yask { end[step_dim] = 1; // one time-step only. // Indices needed for the generated 'misc' loops. - ScanIndices misc_idxs(*dims, false); + ScanIndices misc_idxs(*dims, false, 0); misc_idxs.begin = begin; misc_idxs.end = end; From d4f01fc0a7fea1d8ff9c2e38b4134b43e94b1d42 Mon Sep 17 00:00:00 2001 From: Chuck Yount Date: Fri, 27 Apr 2018 17:40:00 -0700 Subject: [PATCH 14/21] Fix bug in grid compare() due to use of deprecated API. Add deprecation warning. --- src/kernel/lib/grid_apis.cpp | 10 +++++++--- src/kernel/lib/realv_grids.cpp | 17 ++++++++++------- 2 files changed, 17 insertions(+), 10 deletions(-) diff --git a/src/kernel/lib/grid_apis.cpp b/src/kernel/lib/grid_apis.cpp index 3b2a148d..3d8e8d90 100644 --- a/src/kernel/lib/grid_apis.cpp +++ b/src/kernel/lib/grid_apis.cpp @@ -30,6 +30,9 @@ using namespace std; namespace yask { +#define DEPRECATED(api_name) cerr << "\n*** WARNING: deprecated YASK API '" \ + #api_name "' will be removed in a future release ***\n" + // APIs to get info from vars. #define GET_GRID_API(api_name, expr, step_ok, domain_ok, misc_ok, prep_req) \ idx_t YkGridBase::api_name(const string& dim) const { \ @@ -46,16 +49,13 @@ namespace yask { GET_GRID_API(get_rank_domain_size, _domains[posn], false, true, false, false) GET_GRID_API(get_left_pad_size, _left_pads[posn], false, true, false, false) // _left_pads is actual size. 
GET_GRID_API(get_right_pad_size, _allocs[posn] - _left_pads[posn], false, true, false, false) // _right_pads is request only. - GET_GRID_API(get_pad_size, _left_pads[posn], false, true, false, false) GET_GRID_API(get_left_halo_size, _left_halos[posn], false, true, false, false) GET_GRID_API(get_right_halo_size, _right_halos[posn], false, true, false, false) - GET_GRID_API(get_halo_size, _left_halos[posn], false, true, false, false) GET_GRID_API(get_first_misc_index, _offsets[posn], false, false, true, false) GET_GRID_API(get_last_misc_index, _offsets[posn] + _domains[posn] - 1, false, false, true, false) GET_GRID_API(get_left_extra_pad_size, _left_pads[posn] - _left_halos[posn], false, true, false, false) GET_GRID_API(get_right_extra_pad_size, (_allocs[posn] - _left_pads[posn] - _domains[posn]) - _right_halos[posn], false, true, false, false) - GET_GRID_API(get_extra_pad_size, _left_pads[posn] - _left_halos[posn], false, true, false, false) GET_GRID_API(get_alloc_size, _allocs[posn], true, true, true, false) GET_GRID_API(get_first_rank_domain_index, _offsets[posn] - _local_offsets[posn], false, true, false, true) GET_GRID_API(get_last_rank_domain_index, _offsets[posn] - _local_offsets[posn] + _domains[posn] - 1; @@ -71,6 +71,10 @@ namespace yask { GET_GRID_API(_get_local_offset, _local_offsets[posn], true, true, true, false) GET_GRID_API(_get_first_alloc_index, _offsets[posn] - _left_pads[posn], true, true, true, true) GET_GRID_API(_get_last_alloc_index, _offsets[posn] - _left_pads[posn] + _allocs[posn] - 1, true, true, true, true) + + GET_GRID_API(get_pad_size, (DEPRECATED(get_pad_size), _left_pads[posn]), false, true, false, false) + GET_GRID_API(get_halo_size, (DEPRECATED(get_halo_size), _left_halos[posn]), false, true, false, false) + GET_GRID_API(get_extra_pad_size, (DEPRECATED(get_extra_pad_size), _left_pads[posn] - _left_halos[posn]), false, true, false, false) #undef GET_GRID_API // APIs to set vars. 
diff --git a/src/kernel/lib/realv_grids.cpp b/src/kernel/lib/realv_grids.cpp index 43726f97..d94b659f 100644 --- a/src/kernel/lib/realv_grids.cpp +++ b/src/kernel/lib/realv_grids.cpp @@ -273,23 +273,26 @@ namespace yask { auto allocs = get_allocs(); // This will loop over the entire allocation. - // Indices of 'pt' will be relative to allocation. + // We use this as a handy way to get offsets, + // but not all will be used. allocs.visitAllPoints ([&](const IdxTuple& pt, size_t idx) { // Adjust alloc indices to overall indices. IdxTuple opt(pt); bool ok = true; - for (int i = 0; i < pt.getNumDims(); i++) { + for (int i = 0; ok && i < pt.getNumDims(); i++) { auto val = pt.getVal(i); - opt[i] = _offsets[i] - _left_pads[i] + val; - // Don't compare points in the extra padding area. + // Convert to global index. + opt[i] = _offsets[i] + val; + + // Don't compare points outside the domain. + // TODO: check points in halo. auto& dname = pt.getDimName(i); if (_dims->_domain_dims.lookup(dname)) { - auto halo_sz = get_halo_size(dname); - auto first_ok = get_first_rank_domain_index(dname) - halo_sz; - auto last_ok = get_last_rank_domain_index(dname) + halo_sz; + auto first_ok = get_first_rank_domain_index(dname); + auto last_ok = get_last_rank_domain_index(dname); if (opt[i] < first_ok || opt[i] > last_ok) ok = false; } From db4b0ddc51c676879a0b8998b631408bfe92ac84 Mon Sep 17 00:00:00 2001 From: Chuck Yount Date: Fri, 27 Apr 2018 18:31:39 -0700 Subject: [PATCH 15/21] Fix scalar peel/remainder loop. --- src/kernel/lib/stencil_calc.cpp | 72 +++++++++++++++++---------------- 1 file changed, 37 insertions(+), 35 deletions(-) diff --git a/src/kernel/lib/stencil_calc.cpp b/src/kernel/lib/stencil_calc.cpp index 1f26ee92..14166a09 100644 --- a/src/kernel/lib/stencil_calc.cpp +++ b/src/kernel/lib/stencil_calc.cpp @@ -273,11 +273,12 @@ namespace yask { auto vend = round_up_flr(eend, vpts); if (i == _inner_posn) { - // Don't do any vectors in plane of inner dim. 
- // We'll do these with scalars. - // This is unusual because vector folding is - // normally done in a plane perpendicular to - // the inner dim for >= 2D domains. + // Don't do any full and/or partial vectors in + // plane of inner dim. We'll do these with + // scalars. This is unusual because vector + // folding is normally done in a plane + // perpendicular to the inner dim for >= 2D + // domains. fvbgn = vbgn = fcbgn; fvend = vend = fcend; } @@ -397,10 +398,9 @@ namespace yask { // Step sizes are based on cluster lengths (in vector units). // The step in the inner loop is hard-coded in the generated code. for (int i = 0, j = 0; i < nsdims; i++) { - if (i != step_posn) { - norm_sub_block_idxs.step[i] = dims->_cluster_mults[j]; - j++; - } + if (i == step_posn) continue; + norm_sub_block_idxs.step[i] = dims->_cluster_mults[j]; // N vecs. + j++; } // Define the function called from the generated loops @@ -444,12 +444,7 @@ namespace yask { // Step sizes are one vector. // The step in the inner loop is hard-coded in the generated code. - for (int i = 0, j = 0; i < nsdims; i++) { - if (i != step_posn) { - norm_sub_block_idxs.step[i] = 1; - j++; - } - } + norm_sub_block_idxs.step.setFromConst(1); // Also normalize the *full* vector indices to determine if // we need a mask at each vector index. @@ -494,43 +489,50 @@ namespace yask { // Use scalar code for anything not done above. if (do_scalars) { + // Use the 'misc' loops. Indices for these loops will be scalar and + // global rather than normalized as in the cluster and vector loops. + ScanIndices misc_idxs(sub_block_idxs); + + // Step sizes and alignment are one element. + misc_idxs.step.setFromConst(1); + misc_idxs.align.setFromConst(1); + #ifdef TRACE string msg = "calc_sub_block: using scalar code for "; msg += scalar_for_peel_rem ? "peel/remainder of" : "entire"; msg += " sub-block "; msg += bb_is_full ? 
"without" : "with"; - msg += " sub-domain checking"; - TRACE_MSG3(msg); + msg += " sub-domain checking for "; + TRACE_MSG3(msg << + misc_idxs.begin.makeValStr(nsdims) << + " ... (end before) " << + misc_idxs.end.makeValStr(nsdims)); #endif - // Use the 'misc' loops. Indices for these loops will be scalar and - // global rather than normalized as in the cluster and vector loops. - ScanIndices misc_idxs(sub_block_idxs); - // Define misc-loop function. // If point is in sub-domain for this // bundle, then evaluate the reference scalar code. // If no holes, don't need to check each point in domain. // Since step is always 1, we ignore misc_idxs.stop. -#define misc_fn(misc_idxs) do { \ - bool ok = true; \ - if (scalar_for_peel_rem) { \ - ok = false; \ - for (int i = 0, j = 0; i < nsdims; i++) { \ - if (i != step_posn) { \ +#define misc_fn(pt_idxs) do { \ + TRACE_MSG3("calc_sub_block: at pt " << pt_idxs.start.makeValStr(nsdims)); \ + bool ok = true; \ + if (scalar_for_peel_rem) { \ + ok = false; \ + for (int i = 0, j = 0; i < nsdims; i++) { \ + if (i == step_posn) continue; \ auto rofs = cp->rank_domain_offsets[j]; \ - if (misc_idxs.start[i] < rofs + sub_block_vidxs.begin[i] || \ - misc_idxs.start[i] >= rofs + sub_block_vidxs.end[i]) { \ + if (pt_idxs.start[i] < rofs + sub_block_vidxs.begin[i] || \ + pt_idxs.start[i] >= rofs + sub_block_vidxs.end[i]) { \ ok = true; break; } \ j++; \ } \ } \ - } \ - if (ok && (bb_is_full || is_in_valid_domain(misc_idxs.start))) { \ - calc_scalar(thread_idx, misc_idxs.start); \ - } \ - } while(0) - + if (ok && (bb_is_full || is_in_valid_domain(pt_idxs.start))) { \ + calc_scalar(thread_idx, pt_idxs.start); \ + } \ + } while(0) + // Scan through n-D space. // The OMP in the misc loops will be ignored if we're already in // the max allowed nested OMP region. From 533a236b20cc96969b6e8bc62735aed607063471 Mon Sep 17 00:00:00 2001 From: Chuck Yount Date: Sat, 28 Apr 2018 06:53:49 -0700 Subject: [PATCH 16/21] Fix pad-adjustment code. 
--- src/kernel/lib/realv_grids.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/kernel/lib/realv_grids.cpp b/src/kernel/lib/realv_grids.cpp index d94b659f..3eaae4c3 100644 --- a/src/kernel/lib/realv_grids.cpp +++ b/src/kernel/lib/realv_grids.cpp @@ -117,7 +117,6 @@ namespace yask { // Start with halos plus WF exts. Indices mp = halos.addElements(wf_exts); - // For scratch grids, halo area must be written to. Halo is sum // of dependent's write halo and depender's read halo, but these // two components are not stored individually. Write halo will @@ -131,9 +130,11 @@ namespace yask { for (int i = 0; i < get_num_dims(); i++) { if (mp[i] >= 1) { auto& dname = get_dim_name(i); - auto* p = _dims->_domain_dims.lookup(dname); - if (p) + auto* p = _dims->_fold_pts.lookup(dname); + if (p) { + assert (p >= 1); mp[i] += *p - 1; + } } } return mp; From 6956cda4f876f786d3d659b9c7849aea2c41e223 Mon Sep 17 00:00:00 2001 From: Chuck Yount Date: Sat, 28 Apr 2018 06:54:20 -0700 Subject: [PATCH 17/21] Make 2D test asymmetical. --- src/stencils/SimpleTestStencils.hpp | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/src/stencils/SimpleTestStencils.hpp b/src/stencils/SimpleTestStencils.hpp index 6e910aec..1d2c1c9f 100644 --- a/src/stencils/SimpleTestStencils.hpp +++ b/src/stencils/SimpleTestStencils.hpp @@ -82,11 +82,16 @@ class Test2dStencil : public StencilRadiusBase { // Define equation to apply to all points in 'data' grid. virtual void define() { - // define the value at t+1. + // define the value at t+1 using asymmetric stencil. 
GridValue v = data(t, x, y) + 1.0; for (int r = 1; r <= _radius; r++) - v += data(t, x + r, y) + data(t, x - r, y) - + data(t, x, y + r) + data(t, x, y - r); + v += data(t, x + r, y); + for (int r = 1; r <= _radius + 1; r++) + v += data(t, x - r, y); + for (int r = 1; r <= _radius + 2; r++) + v += data(t, x, y + r); + for (int r = 1; r <= _radius + 3; r++) + v += data(t, x, y - r); data(t+1, x, y) EQUALS v; } }; From ce9653f7efb3ff4a1e83dd33787c8b6d55116df4 Mon Sep 17 00:00:00 2001 From: Chuck Yount Date: Sat, 28 Apr 2018 06:54:58 -0700 Subject: [PATCH 18/21] Clean up deprecation warning. Ver 2.06.01. --- src/common/common_utils.cpp | 2 +- src/kernel/lib/grid_apis.cpp | 14 ++++++++------ 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/src/common/common_utils.cpp b/src/common/common_utils.cpp index 1d683006..a5c5504b 100644 --- a/src/common/common_utils.cpp +++ b/src/common/common_utils.cpp @@ -41,7 +41,7 @@ namespace yask { // for numbers above 9 (at least up to 99). // Format: "major.minor.patch". - const string version = "2.06.00"; + const string version = "2.06.01"; string yask_get_version_string() { return version; diff --git a/src/kernel/lib/grid_apis.cpp b/src/kernel/lib/grid_apis.cpp index 3d8e8d90..66544942 100644 --- a/src/kernel/lib/grid_apis.cpp +++ b/src/kernel/lib/grid_apis.cpp @@ -31,7 +31,7 @@ using namespace std; namespace yask { #define DEPRECATED(api_name) cerr << "\n*** WARNING: deprecated YASK API '" \ - #api_name "' will be removed in a future release ***\n" + #api_name "' used that will be removed in a future release ***\n" // APIs to get info from vars. 
#define GET_GRID_API(api_name, expr, step_ok, domain_ok, misc_ok, prep_req) \ @@ -41,10 +41,12 @@ namespace yask { if (prep_req && _offsets[posn] < 0) \ THROW_YASK_EXCEPTION("Error: '" #api_name "()' called on grid '" << \ get_name() << "' before calling 'prepare_solution()'"); \ - return expr; \ + auto rtn = expr; \ + return rtn; \ } \ idx_t YkGridBase::api_name(int posn) const { \ - return expr; \ + auto rtn = expr; \ + return rtn; \ } GET_GRID_API(get_rank_domain_size, _domains[posn], false, true, false, false) GET_GRID_API(get_left_pad_size, _left_pads[posn], false, true, false, false) // _left_pads is actual size. @@ -72,9 +74,9 @@ namespace yask { GET_GRID_API(_get_first_alloc_index, _offsets[posn] - _left_pads[posn], true, true, true, true) GET_GRID_API(_get_last_alloc_index, _offsets[posn] - _left_pads[posn] + _allocs[posn] - 1, true, true, true, true) - GET_GRID_API(get_pad_size, (DEPRECATED(get_pad_size), _left_pads[posn]), false, true, false, false) - GET_GRID_API(get_halo_size, (DEPRECATED(get_halo_size), _left_halos[posn]), false, true, false, false) - GET_GRID_API(get_extra_pad_size, (DEPRECATED(get_extra_pad_size), _left_pads[posn] - _left_halos[posn]), false, true, false, false) + GET_GRID_API(get_pad_size, _left_pads[posn]; DEPRECATED(get_pad_size), false, true, false, false) + GET_GRID_API(get_halo_size, _left_halos[posn]; DEPRECATED(get_halo_size), false, true, false, false) + GET_GRID_API(get_extra_pad_size, _left_pads[posn] - _left_halos[posn]; DEPRECATED(get_extra_pad_size), false, true, false, false) #undef GET_GRID_API // APIs to set vars. From d1c8d0c025e4f76499def9097b3e9c77e1a4f329 Mon Sep 17 00:00:00 2001 From: Chuck Yount Date: Sat, 28 Apr 2018 06:59:48 -0700 Subject: [PATCH 19/21] Fix assertion. 
--- src/kernel/lib/realv_grids.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/kernel/lib/realv_grids.cpp b/src/kernel/lib/realv_grids.cpp index 3eaae4c3..83fc1d4e 100644 --- a/src/kernel/lib/realv_grids.cpp +++ b/src/kernel/lib/realv_grids.cpp @@ -132,7 +132,7 @@ namespace yask { auto& dname = get_dim_name(i); auto* p = _dims->_fold_pts.lookup(dname); if (p) { - assert (p >= 1); + assert (*p >= 1); mp[i] += *p - 1; } } From eb8fcf2d364edaceb3c13f57087e3f106780d182 Mon Sep 17 00:00:00 2001 From: Chuck Yount Date: Sat, 28 Apr 2018 11:09:58 -0700 Subject: [PATCH 20/21] Mark grids dirty in a rank if *any* rank could make it dirty. Closes #106. --- src/kernel/lib/context.cpp | 41 ++++++++---- src/kernel/lib/setup.cpp | 127 +++++++++++++++++++++---------------- 2 files changed, 101 insertions(+), 67 deletions(-) diff --git a/src/kernel/lib/context.cpp b/src/kernel/lib/context.cpp index 668696d1..7c521bf0 100644 --- a/src/kernel/lib/context.cpp +++ b/src/kernel/lib/context.cpp @@ -271,9 +271,13 @@ namespace yask { #include "yask_misc_loops.hpp" #undef misc_fn - // Remember grids that have been written to by this bundle, + // Mark grids that [may] have been written to by this bundle, // updated at next step (+/- 1). - mark_grids_dirty(start_t + step_t, stop_t + step_t, *asg); + // Mark grids as dirty even if not actually written by this + // rank. This is needed because neighbors will not know what + // grids are actually dirty, and all ranks must have the same + // information about which grids are possibly dirty. + mark_grids_dirty(start_t + step_t, stop_t + step_t, *sg); } // needed bundles. } // all bundles. @@ -436,9 +440,9 @@ namespace yask { } // If doing wave-fronts, must loop through all bundles in - // calc_region(). - // TODO: make this the only case, allowing all bundles to be done - // between MPI exchanges, even w/o wave-fronts. + // calc_region(). 
TODO: consider making this the only case, + // allowing all bundles to be done between MPI exchanges, even + // w/o wave-fronts. else { // Exchange all dirty halo(s). @@ -606,13 +610,18 @@ namespace yask { // similar for y and z. This code typically // contains the outer OpenMP loop(s). #include "yask_region_loops.hpp" - - // Remember grids that have been written to by this bundle, - // updated at next step (+/- 1). - mark_grids_dirty(start_t + step_t, stop_t + step_t, *sg); } - // Shift spatial region boundaries for next iteration to + // Mark grids that [may] have been written to by this bundle, + // updated at next step (+/- 1). + // Mark grids as dirty even if not actually written by this + // rank. This is needed because neighbors will not know what + // grids are actually dirty, and all ranks must have the same + // information about which grids are possibly dirty. + // TODO: make this smarter. + mark_grids_dirty(start_t + step_t, stop_t + step_t, *sg); + + // Shift spatial region boundaries for next iteration to // implement temporal wavefront. Between regions, we only shift // backward, so region loops must strictly increment. They may do // so in any order. TODO: shift only what is needed by @@ -1184,7 +1193,7 @@ namespace yask { if (!gp->is_dirty(t)) continue; - // Only need to swap grids that have MPI buffers. + // Only need to swap grids that have any MPI buffers. auto& gname = gp->get_name(); if (mpiData.count(gname) == 0) continue; @@ -1226,7 +1235,7 @@ namespace yask { auto gp = gtsi.second; gi++; MPI_Request* grid_recv_reqs = recv_reqs[gi]; - TRACE_MSG(" for grid '" << gname << "'..."); + TRACE_MSG(" for grid #" << gi << ", '" << gname << "'..."); // Visit all this rank's neighbors. auto& grid_mpi_data = mpiData.at(gname); @@ -1260,6 +1269,8 @@ namespace yask { neighbor_rank, int(gi), _env->comm, &grid_recv_reqs[ni]); num_recv_reqs++; } + else + TRACE_MSG(" 0B to request"); } // Pack data into send buffer, then send to neighbor. 
@@ -1276,7 +1287,7 @@ namespace yask { IdxTuple first = sendBuf.begin_pt; IdxTuple last = sendBuf.last_pt; - // The code in allocData() pre-calculated the first and + // The code in allocMpiData() pre-calculated the first and // last points of each buffer, except in the step dim. // So, we need to set that value now. // TODO: update this if we expand the buffers to hold @@ -1305,6 +1316,8 @@ namespace yask { neighbor_rank, int(gi), _env->comm, &send_reqs[num_send_reqs++]); } + else + TRACE_MSG(" 0B to send"); } // Wait for data from neighbor, then unpack it. @@ -1343,6 +1356,8 @@ namespace yask { n = gp->set_elements_in_slice(buf, first, last); assert(n == recvBuf.get_size()); } + else + TRACE_MSG(" 0B to wait for"); } }); // visit neighbors. diff --git a/src/kernel/lib/setup.cpp b/src/kernel/lib/setup.cpp index 58fb63ef..d4aacc8f 100644 --- a/src/kernel/lib/setup.cpp +++ b/src/kernel/lib/setup.cpp @@ -328,7 +328,7 @@ namespace yask { } // grid passes. }; - // Create MPI and allocate buffers. + // Create MPI buffers and allocate them. void StencilContext::allocMpiData(ostream& os) { // Remove any old MPI data. @@ -336,7 +336,8 @@ namespace yask { #ifdef USE_MPI - int num_exchanges = 0; + map num_exchanges; // send/recv => count. + map num_elems; // send/recv => count. auto me = _env->my_rank; // Need to determine the size and shape of all MPI buffers. @@ -369,10 +370,24 @@ namespace yask { return; // from lambda fn. } - // Determine size of MPI buffers between neigh_rank and my rank - // for each grid and create those that are needed. + // Is vectorized exchange allowed based on domain sizes? + // Both my rank and neighbor rank must have all domain sizes + // of vector multiples. + bool vec_ok = allow_vec_exchange && + _mpiInfo->has_all_vlen_mults[_mpiInfo->my_neighbor_index] && + _mpiInfo->has_all_vlen_mults[neigh_idx]; + + // Determine size of MPI buffers between neigh_rank and my + // rank for each grid and create those that are needed. 
It + // is critical that the number, size, and shape of my + // send/receive buffers match those of the receive/send + // buffers of my neighbors. Important: Current algorithm + // assumes my left neighbor's buffer sizes can be calculated + // by considering my rank's right side data and vice-versa. + // Thus, all ranks must have consistent data that contribute + // to these calculations. for (auto gp : gridPtrs) { - if (!gp) + if (!gp || gp->is_scratch() || gp->is_fixed_size()) continue; auto& gname = gp->get_name(); @@ -384,12 +399,15 @@ namespace yask { IdxTuple first_outer_idx, last_outer_idx; for (auto& dim : _dims->_domain_dims.getDims()) { auto& dname = dim.getName(); + + // Only consider domain dims that are used in this grid. if (gp->is_dim_used(dname)) { - // Get domain indices for this grid. - // If there are no more ranks in the given direction, extend - // the index into the outer halo to make sure all data are sync'd. - // This is critical for WFs. + // Get domain indices for this grid. If there + // are no more ranks in the given direction, + // extend the "outer" index to include the halo + // in that direction to make sure all data are + // sync'd. This is critical for WFs. idx_t fidx = gp->get_first_rank_domain_index(dname); idx_t lidx = gp->get_last_rank_domain_index(dname); first_inner_idx.addDimBack(dname, fidx); @@ -401,55 +419,57 @@ namespace yask { first_outer_idx.addDimBack(dname, fidx); last_outer_idx.addDimBack(dname, lidx); - // Determine size of exchange. This will be the actual halo size - // plus any wave-front extensions. In the current implementation, - // we need the wave-front extensions regardless of whether there - // is a halo on a given grid. This is because each stencil-bundle - // gets shifted by the WF angles at each step in the WF. + // Determine size of exchange in this dim. This + // will be the actual halo size plus any + // wave-front shifts. 
In the current + // implementation, we need the wave-front shifts + // regardless of whether there is a halo on a + // given grid. This is because each + // stencil-bundle gets shifted by the WF angles + // at each step in the WF. - // Neighbor is to the left. + // Neighbor is to the left in this dim. if (neigh_offsets[dname] == MPIInfo::rank_prev) { - auto ext = left_wf_exts[dname]; + auto ext = wf_shifts[dname]; - // my halo. + // my halo on my left. auto halo_size = gp->get_left_halo_size(dname); halo_size += ext; my_halo_sizes.addDimBack(dname, halo_size); - // neighbor halo. - halo_size = gp->get_right_halo_size(dname); // their right is on my left. + // neighbor halo on their right. + halo_size = gp->get_right_halo_size(dname); // assume their right == my right. halo_size += ext; neigh_halo_sizes.addDimBack(dname, halo_size); + + // Flag that this grid has a neighbor to left or right. + found_delta = true; } - // Neighbor is to the right. + // Neighbor is to the right in this dim. else if (neigh_offsets[dname] == MPIInfo::rank_next) { - auto ext = right_wf_exts[dname]; + auto ext = wf_shifts[dname]; - // my halo. + // my halo on my right. auto halo_size = gp->get_right_halo_size(dname); halo_size += ext; my_halo_sizes.addDimBack(dname, halo_size); - // neighbor halo. - halo_size = gp->get_left_halo_size(dname); // their left is on my right. + // neighbor halo on their left. + halo_size = gp->get_left_halo_size(dname); // assume their left == my left. halo_size += ext; neigh_halo_sizes.addDimBack(dname, halo_size); + + // Flag that this grid has a neighbor to left or right. + found_delta = true; } - // Neighbor in-line. + // Neighbor in-line in this dim. else { my_halo_sizes.addDimBack(dname, 0); neigh_halo_sizes.addDimBack(dname, 0); } - // Vectorized exchange allowed based on domain sizes? - // Both my rank and neighbor rank must have all domain sizes - // of vector multiples. 
- bool vec_ok = allow_vec_exchange && - _mpiInfo->has_all_vlen_mults[_mpiInfo->my_neighbor_index] && - _mpiInfo->has_all_vlen_mults[neigh_idx]; - // Round up halo sizes if vectorized exchanges allowed. // TODO: add a heuristic to avoid increasing by a large factor. if (vec_ok) { @@ -457,12 +477,8 @@ namespace yask { my_halo_sizes.setVal(dname, ROUND_UP(my_halo_sizes[dname], vec_size)); neigh_halo_sizes.setVal(dname, ROUND_UP(neigh_halo_sizes[dname], vec_size)); } - - // Is this neighbor before or after me in this domain direction? - if (neigh_offsets[dname] != MPIInfo::rank_self) - found_delta = true; - } - } + } // domain dims in this grid. + } // domain dims. // Is buffer needed? // Example: if this grid is 2D in y-z, but only neighbors are in @@ -589,11 +605,19 @@ namespace yask { } // all dims in this grid. + // Unique name for buffer based on grid name, direction, and ranks. + ostringstream oss; + oss << gname; + if (bd == MPIBufs::bufSend) + oss << "_send_halo_from_" << me << "_to_" << neigh_rank; + else if (bd == MPIBufs::bufRecv) + oss << "_recv_halo_from_" << neigh_rank << "_to_" << me; + string bufname = oss.str(); + // Does buffer have non-zero size? if (buf_sizes.size() == 0 || buf_sizes.product() == 0) { - TRACE_MSG("no halo exchange needed for grid '" << gname << - "' with rank " << neigh_rank << - " because there is no data to exchange"); + TRACE_MSG("MPI buffer '" << bufname << + "' not needed because there is no data to exchange"); continue; } @@ -602,15 +626,6 @@ namespace yask { // Convert end to last. IdxTuple copy_last = copy_end.subElements(1); - // Unique name for buffer based on grid name, direction, and ranks. - ostringstream oss; - oss << gname; - if (bd == MPIBufs::bufSend) - oss << "_send_halo_from_" << me << "_to_" << neigh_rank; - else if (bd == MPIBufs::bufRecv) - oss << "_recv_halo_from_" << neigh_rank << "_to_" << me; - string bufname = oss.str(); - // Make MPI data entry for this grid. 
auto gbp = mpiData.emplace(gname, _mpiInfo); auto& gbi = gbp.first; // iterator from pair returned by emplace(). @@ -625,18 +640,22 @@ namespace yask { buf.name = bufname; buf.has_all_vlen_mults = vlen_mults; - TRACE_MSG("configured MPI buffer object '" << buf.name << - "' for rank at relative offsets " << + TRACE_MSG("MPI buffer '" << buf.name << + "' configured for rank at relative offsets " << neigh_offsets.subElements(1).makeDimValStr() << " with " << buf.num_pts.makeDimValStr(" * ") << " = " << buf.get_size() << " element(s) at " << buf.begin_pt.makeDimValStr() << " ... " << buf.last_pt.makeDimValStr()); - num_exchanges++; + num_exchanges[bd]++; + num_elems[bd] += buf.get_size(); } // send, recv. } // grids. }); // neighbors. - TRACE_MSG("number of halo-exchanges needed on this rank: " << num_exchanges); + TRACE_MSG("number of MPI send buffers on this rank: " << num_exchanges[int(MPIBufs::bufSend)]); + TRACE_MSG("number of elements in send buffers: " << makeNumStr(num_elems[int(MPIBufs::bufSend)])); + TRACE_MSG("number of MPI recv buffers on this rank: " << num_exchanges[int(MPIBufs::bufRecv)]); + TRACE_MSG("number of elements in recv buffers: " << makeNumStr(num_elems[int(MPIBufs::bufRecv)])); // Base ptrs for all alloc'd data. // These pointers will be shared by the ones in the grid From 6d2cd4fc8ffe1dfda3dab08f9d902e943cda7f83 Mon Sep 17 00:00:00 2001 From: Chuck Yount Date: Sat, 28 Apr 2018 13:16:22 -0700 Subject: [PATCH 21/21] Fix bug in rounding up indices & sizes for vectorized halo exchanges. Ver 2.06.02. 
--- src/common/common_utils.cpp | 2 +- src/kernel/lib/context.cpp | 4 +- src/kernel/lib/settings.hpp | 4 +- src/kernel/lib/setup.cpp | 95 +++++++++++++++++++++++++------------ 4 files changed, 70 insertions(+), 35 deletions(-) diff --git a/src/common/common_utils.cpp b/src/common/common_utils.cpp index a5c5504b..fc430a4d 100644 --- a/src/common/common_utils.cpp +++ b/src/common/common_utils.cpp @@ -41,7 +41,7 @@ namespace yask { // for numbers above 9 (at least up to 99). // Format: "major.minor.patch". - const string version = "2.06.01"; + const string version = "2.06.02"; string yask_get_version_string() { return version; diff --git a/src/kernel/lib/context.cpp b/src/kernel/lib/context.cpp index 7c521bf0..b70596e0 100644 --- a/src/kernel/lib/context.cpp +++ b/src/kernel/lib/context.cpp @@ -1281,7 +1281,7 @@ namespace yask { // Vec ok? // Domain sizes must be ok, and buffer size must be ok // as calculated when buffers were created. - bool send_vec_ok = vec_ok && sendBuf.has_all_vlen_mults; + bool send_vec_ok = vec_ok && sendBuf.vec_copy_ok; // Get first and last ranges. IdxTuple first = sendBuf.begin_pt; @@ -1330,7 +1330,7 @@ namespace yask { MPI_Wait(&grid_recv_reqs[ni], MPI_STATUS_IGNORE); // Vec ok? - bool recv_vec_ok = vec_ok && recvBuf.has_all_vlen_mults; + bool recv_vec_ok = vec_ok && recvBuf.vec_copy_ok; // Get first and last ranges. IdxTuple first = recvBuf.begin_pt; diff --git a/src/kernel/lib/settings.hpp b/src/kernel/lib/settings.hpp index 5545de7e..a2532cfd 100644 --- a/src/kernel/lib/settings.hpp +++ b/src/kernel/lib/settings.hpp @@ -633,8 +633,8 @@ namespace yask { IdxTuple num_pts; // Whether the number of points is a multiple of the - // vector length in all dims. - bool has_all_vlen_mults = false; + // vector length in all dims and buffer is aligned. + bool vec_copy_ok = false; // Number of points overall. 
idx_t get_size() const { diff --git a/src/kernel/lib/setup.cpp b/src/kernel/lib/setup.cpp index d4aacc8f..02797647 100644 --- a/src/kernel/lib/setup.cpp +++ b/src/kernel/lib/setup.cpp @@ -371,7 +371,7 @@ namespace yask { } // Is vectorized exchange allowed based on domain sizes? - // Both my rank and neighbor rank must have all domain sizes + // Both my rank and neighbor rank must have *all* domain sizes // of vector multiples. bool vec_ok = allow_vec_exchange && _mpiInfo->has_all_vlen_mults[_mpiInfo->my_neighbor_index] && @@ -390,6 +390,7 @@ namespace yask { if (!gp || gp->is_scratch() || gp->is_fixed_size()) continue; auto& gname = gp->get_name(); + bool grid_vec_ok = vec_ok; // Lookup first & last domain indices and calc exchange sizes // for this grid. @@ -402,6 +403,9 @@ namespace yask { // Only consider domain dims that are used in this grid. if (gp->is_dim_used(dname)) { + auto vlen = _dims->_fold_pts[dname]; + auto lhalo = gp->get_left_halo_size(dname); + auto rhalo = gp->get_right_halo_size(dname); // Get domain indices for this grid. If there // are no more ranks in the given direction, @@ -413,12 +417,25 @@ namespace yask { first_inner_idx.addDimBack(dname, fidx); last_inner_idx.addDimBack(dname, lidx); if (_opts->is_first_rank(dname)) - fidx -= gp->get_left_halo_size(dname); + fidx -= lhalo; if (_opts->is_last_rank(dname)) - lidx += gp->get_right_halo_size(dname); + lidx += rhalo; first_outer_idx.addDimBack(dname, fidx); last_outer_idx.addDimBack(dname, lidx); + // Determine if it is possible to round the + // outer indices to vec-multiples. This will + // be required to allow full vec exchanges for + // this grid. We won't do the actual rounding + // yet, because we need to see if it's safe + // in all dims. 
+ fidx = round_down_flr(fidx, vlen); + lidx = round_up_flr(lidx, vlen); + if (fidx < gp->get_first_rank_alloc_index(dname)) + grid_vec_ok = false; + if (lidx > gp->get_last_rank_alloc_index(dname)) + grid_vec_ok = false; + // Determine size of exchange in this dim. This // will be the actual halo size plus any // wave-front shifts. In the current @@ -432,15 +449,12 @@ namespace yask { if (neigh_offsets[dname] == MPIInfo::rank_prev) { auto ext = wf_shifts[dname]; - // my halo on my left. - auto halo_size = gp->get_left_halo_size(dname); - halo_size += ext; - my_halo_sizes.addDimBack(dname, halo_size); + // My halo on my left. + my_halo_sizes.addDimBack(dname, lhalo + ext); - // neighbor halo on their right. - halo_size = gp->get_right_halo_size(dname); // assume their right == my right. - halo_size += ext; - neigh_halo_sizes.addDimBack(dname, halo_size); + // Neighbor halo on their right. + // Assume my right is same as their right. + neigh_halo_sizes.addDimBack(dname, rhalo + ext); // Flag that this grid has a neighbor to left or right. found_delta = true; @@ -450,15 +464,12 @@ namespace yask { else if (neigh_offsets[dname] == MPIInfo::rank_next) { auto ext = wf_shifts[dname]; - // my halo on my right. - auto halo_size = gp->get_right_halo_size(dname); - halo_size += ext; - my_halo_sizes.addDimBack(dname, halo_size); + // My halo on my right. + my_halo_sizes.addDimBack(dname, rhalo + ext); - // neighbor halo on their left. - halo_size = gp->get_left_halo_size(dname); // assume their left == my left. - halo_size += ext; - neigh_halo_sizes.addDimBack(dname, halo_size); + // Neighbor halo on their left. + // Assume my left is same as their left. + neigh_halo_sizes.addDimBack(dname, lhalo + ext); // Flag that this grid has a neighbor to left or right. found_delta = true; @@ -470,13 +481,6 @@ namespace yask { neigh_halo_sizes.addDimBack(dname, 0); } - // Round up halo sizes if vectorized exchanges allowed. 
- // TODO: add a heuristic to avoid increasing by a large factor. - if (vec_ok) { - auto vec_size = _dims->_fold_pts[dname]; - my_halo_sizes.setVal(dname, ROUND_UP(my_halo_sizes[dname], vec_size)); - neigh_halo_sizes.setVal(dname, ROUND_UP(neigh_halo_sizes[dname], vec_size)); - } } // domain dims in this grid. } // domain dims. @@ -491,6 +495,31 @@ namespace yask { continue; // to next grid. } + // Round halo sizes if vectorized exchanges allowed. + // Both self and neighbor must be vec-multiples + // and outer indices must be vec-mults or extendable + // to be so. + // TODO: add a heuristic to avoid increasing by a large factor. + if (grid_vec_ok) { + for (auto& dim : _dims->_domain_dims.getDims()) { + auto& dname = dim.getName(); + if (gp->is_dim_used(dname)) { + auto vlen = _dims->_fold_pts[dname]; + + // first index rounded down. + first_outer_idx.setVal(dname, round_down_flr(first_outer_idx[dname], vlen)); + + // last index rounded up. + last_outer_idx.setVal(dname, round_up_flr(last_outer_idx[dname], vlen)); + + // sizes rounded up. + my_halo_sizes.setVal(dname, ROUND_UP(my_halo_sizes[dname], vlen)); + neigh_halo_sizes.setVal(dname, ROUND_UP(neigh_halo_sizes[dname], vlen)); + + } // domain dims in this grid. + } // domain dims. + } + // Make a buffer in both directions (send & receive). for (int bd = 0; bd < MPIBufs::nBufDirs; bd++) { @@ -498,7 +527,7 @@ namespace yask { // of main grid to read from or write to based on // the current neighbor being processed. IdxTuple copy_begin = gp->get_allocs(); - IdxTuple copy_end = gp->get_allocs(); + IdxTuple copy_end = gp->get_allocs(); // one past last! // Adjust along domain dims in this grid. for (auto& dim : _dims->_domain_dims.getDims()) { @@ -516,13 +545,15 @@ namespace yask { // Region to read from, i.e., data from inside // this rank's domain to be put into neighbor's - // halo. + // halo. So, use neighbor's halo sizes when + // calculating buffer size. if (bd == MPIBufs::bufSend) { // Neighbor is to the left. 
if (neigh_ofs == idx_t(MPIInfo::rank_prev)) { // Only read slice as wide as halo from beginning. + copy_begin[dname] = first_inner_idx[dname]; copy_end[dname] = first_inner_idx[dname] + neigh_halo_sizes[dname]; } @@ -531,6 +562,7 @@ namespace yask { // Only read slice as wide as halo before end. copy_begin[dname] = last_inner_idx[dname] + 1 - neigh_halo_sizes[dname]; + copy_end[dname] = last_inner_idx[dname] + 1; } // Else, this neighbor is in same posn as I am in this dim, @@ -538,6 +570,7 @@ namespace yask { } // Region to write to, i.e., into this rank's halo. + // So, use my halo sizes when calculating buffer sizes. else if (bd == MPIBufs::bufRecv) { // Neighbor is to the left. @@ -573,10 +606,12 @@ namespace yask { if (_dims->_domain_dims.lookup(dname)) { dsize = copy_end[dname] - copy_begin[dname]; - // Check whether size is multiple of vlen. + // Check whether alignment and size are multiple of vlen. auto vlen = _dims->_fold_pts[dname]; if (dsize % vlen != 0) vlen_mults = false; + if (imod_flr(copy_begin[dname], vlen) != 0) + vlen_mults = false; } // step dim? @@ -638,7 +673,7 @@ namespace yask { buf.last_pt = copy_last; buf.num_pts = buf_sizes; buf.name = bufname; - buf.has_all_vlen_mults = vlen_mults; + buf.vec_copy_ok = vlen_mults; TRACE_MSG("MPI buffer '" << buf.name << "' configured for rank at relative offsets " <<