From 8775765462c227fe1e83316c1d26fffa9c6c886e Mon Sep 17 00:00:00 2001 From: Chuck Yount Date: Tue, 17 Apr 2018 17:07:30 -0700 Subject: [PATCH 01/21] Change equation and stencil groups to bundles. Preliminary bundle API defns. --- include/yask_compiler_api.hpp | 213 +++- include/yask_kernel_api.hpp | 1609 +------------------------ include/yk_grid_api.hpp | 942 +++++++++++++++ include/yk_solution_api.hpp | 836 +++++++++++++ src/common/common_utils.cpp | 2 +- src/compiler/lib/Cpp.hpp | 14 +- src/compiler/lib/CppIntrin.hpp | 18 +- src/compiler/lib/Eqs.cpp | 184 ++- src/compiler/lib/Eqs.hpp | 120 +- src/compiler/lib/Grid.cpp | 2 +- src/compiler/lib/Grid.hpp | 6 +- src/compiler/lib/Print.cpp | 16 +- src/compiler/lib/Print.hpp | 18 +- src/compiler/lib/Soln.cpp | 41 +- src/compiler/lib/Soln.hpp | 4 +- src/compiler/lib/YaskKernel.cpp | 78 +- src/compiler/main.cpp | 30 +- src/compiler/swig/yask_compiler_api.i | 1 + src/kernel/Makefile | 4 +- src/kernel/lib/context.cpp | 120 +- src/kernel/lib/context.hpp | 26 +- src/kernel/lib/stencil_calc.cpp | 40 +- src/kernel/lib/stencil_calc.hpp | 40 +- src/kernel/swig/yask_kernel_api.i | 1 + 24 files changed, 2338 insertions(+), 2027 deletions(-) create mode 100644 include/yk_grid_api.hpp create mode 100644 include/yk_solution_api.hpp diff --git a/include/yask_compiler_api.hpp b/include/yask_compiler_api.hpp index cd1eccbe..a6c3bd81 100644 --- a/include/yask_compiler_api.hpp +++ b/include/yask_compiler_api.hpp @@ -47,6 +47,10 @@ namespace yask { /// Shared pointer to \ref yc_grid typedef yc_grid* yc_grid_ptr; + class yc_equation_group; + /// Shared pointer to \ref yc_equation_group; + typedef std::shared_ptr yc_equation_group_ptr; + // Forward declarations of expression nodes and their pointers. class yc_expr_node; @@ -166,7 +170,7 @@ namespace yask { At least one grid must be defined with at least one domain-index node. - @returns Pointer to the new grid. + @returns Pointer to the new \ref yc_grid object. 
*/ virtual yc_grid_ptr new_grid(const std::string& name @@ -181,7 +185,7 @@ namespace yask { /** C++ initializer-list version with same semantics as vector version. @note This version is not available (or needed) in SWIG-based APIs, e.g., Python. - @returns Pointer to the new grid. + @returns Pointer to the new \ref yc_grid object. */ virtual yc_grid_ptr new_grid(const std::string& name /**< [in] Unique name of the grid; must be @@ -206,7 +210,7 @@ namespace yask { See `TestScratchStencil*` classes in `src/stencils/SimpleTestStencils.hpp` for usage examples. - @returns Pointer to the new grid. + @returns Pointer to the new \ref yc_grid object. */ virtual yc_grid_ptr new_scratch_grid(const std::string& name @@ -221,7 +225,7 @@ namespace yask { /** C++ initializer-list version with same semantics as vector version. @note This version is not available (or needed) in SWIG-based APIs, e.g., Python. - @returns Pointer to the new grid. + @returns Pointer to the new \ref yc_grid object. */ virtual yc_grid_ptr new_scratch_grid(const std::string& name @@ -249,17 +253,48 @@ namespace yask { get_grid(const std::string& name /**< [in] Name of the grid. */ ) =0; /// Get the number of equations in the solution. - /** Equations are added when equation_nodes are created via new_equation_node(). + /** Equations are added when yc_node_factory::new_equation_node() is called. @returns Number of equations that have been created. */ virtual int get_num_equations() const =0; /// Get the specified equation. - /** @returns Pointer to equation_node of nth equation. */ + /** @returns Pointer to \ref yc_equation_node of nth equation. */ virtual yc_equation_node_ptr get_equation(int n /**< [in] Index of equation between zero (0) and get_num_equations()-1. */ ) =0; + /// Create a new equation group. + /** + In normal usage, equation groups are created automatically when + format() is called. 
Under automatic grouping, the YASK compiler + discovers dependencies between equations and places equations + together in a group if they do not depend upon one another. + Then, the YASK compiler schedules the resulting groups for + execution in the kernel based on the dependencies between groups. + + A \ref yc_equation_group object allows manual grouping of equations. + Under manual grouping, the YASK compiler does _not_ check + for illegal dependencies within the group. + In addition, if `do_schedule` is `false`, the YASK compiler + will not check for dependencies with other groups and + will not schedule the group for execution in the kernel. + Then, it will be the programmer's responsibility to run the + stencil group via yk_solution::run_stencil_group(). + + This capability is useful for processing equations that + the YASK compiler cannot currently handle, like equations + with dependencies between different points of a grid + at the same step index. + + @returns Pointer to the new \ref yc_equation_group object. + */ + virtual yc_equation_group_ptr + new_equation_group(const std::string& name + /**< [in] Name of the group. */, + bool do_schedule = true + /**< [in] Schedule the group for execution in the kernel. */ ) =0; + /// Set the vectorization length in given dimension. /** For YASK-code generation, the product of the fold lengths should be equal to the number of elements in a HW SIMD register. @@ -347,8 +382,8 @@ namespace yask { is a scalar, a 1-dim grid is an array, etc. A compile-time grid is a variable used for constructing equations. It does not contain any data. - Data is only stored during run-time, using a yk_grid. - Create new grids via yc_solution::new_grid(). */ + Data is only stored during run-time, using a \ref yk_grid. + Created via yc_solution::new_grid(). */ class yc_grid { public: virtual ~yc_grid() {} @@ -380,9 +415,9 @@ namespace yask { /** The indices are specified relative to the stencil-evaluation index. 
Each offset refers to the dimensions defined when the grid was created via stencil_solution::new_grid(). - Example: if g = new_grid("heat", {"t", "x", "y"}), then - g->new_relative_grid_point(1, -1, 0) refers to heat(t+1, x-1, y) - for some point t, x, y during stencil evaluation. + Example: if `g = new_grid("heat", {"t", "x", "y"})`, then + `g->new_relative_grid_point(1, -1, 0)` refers to `heat(t+1, x-1, y)` + for some point `t, x, y` dynamically defined during stencil evaluation. @warning This convenience function can only be used when every dimension of the grid is either the step dimension or a domain dimension. @note Offsets beyond the dimensions in the grid will be ignored. @@ -403,8 +438,8 @@ namespace yask { }; /// Factory to create AST nodes. - /** @note Grid-point reference nodes are created from a `yc_grid` object - instead of from this factory. */ + /** @note Grid-point reference nodes are created from a \ref yc_grid object + instead of from a \ref yc_node_factory. */ class yc_node_factory { public: virtual ~yc_node_factory() {} @@ -414,6 +449,7 @@ namespace yask { Create a variable to be used to index grids in the solution-step dimension. The name usually describes time, e.g. "t". + @returns Pointer to new \ref yc_index_node object. */ virtual yc_index_node_ptr new_step_index(const std::string& name @@ -426,6 +462,7 @@ namespace yask { The name usually describes spatial dimensions, e.g. "x" or "y". This should *not* include the step dimension, which is specified via new_step_index(). + @returns Pointer to new \ref yc_index_node object. */ virtual yc_index_node_ptr new_domain_index(const std::string& name @@ -436,6 +473,7 @@ namespace yask { Create an variable to be used to index grids in the some dimension that is not the step dimension or a domain dimension. Example: index into an array. + @returns Pointer to new \ref yc_index_node object. 
*/ virtual yc_index_node_ptr new_misc_index(const std::string& name @@ -447,49 +485,62 @@ namespace yask { created, it is automatically added to the list of equations for the yc_solution that contains the grid that is on the LHS. - @returns Pointer to new node. */ + @returns Pointer to new \ref yc_equation_node object. + */ virtual yc_equation_node_ptr new_equation_node(yc_grid_point_node_ptr lhs /**< [in] Grid-point before EQUALS operator. */, yc_number_node_ptr rhs /**< [in] Expression after EQUALS operator. */ ); /// Create a constant numerical value node. - /** This is unary negation. - Use new_subtraction_node() for binary '-'. - @returns Pointer to new node. */ + /** + This is unary negation. + Use new_subtraction_node() for binary '-'. + @returns Pointer to new \ref yc_const_number_node object. + */ virtual yc_const_number_node_ptr new_const_number_node(double val /**< [in] Value to store in node. */ ); /// Create a numerical negation operator node. - /** @returns Pointer to new node. */ + /** + @returns Pointer to new \ref yc_negate_node object. + */ virtual yc_negate_node_ptr new_negate_node(yc_number_node_ptr rhs /**< [in] Expression after '-' sign. */ ); /// Create an addition node. - /** Nodes must be created with at least two operands, and more can - be added by calling add_operand() on the returned node. - @returns Pointer to new node. */ + /** + Nodes must be created with at least two operands, and more can + be added by calling add_operand() on the returned node. + @returns Pointer to new \ref yc_add_node object. + */ virtual yc_add_node_ptr new_add_node(yc_number_node_ptr lhs /**< [in] Expression before '+' sign. */, yc_number_node_ptr rhs /**< [in] Expression after '+' sign. */ ); /// Create a multiplication node. - /** Nodes must be created with at least two operands, and more can - be added by calling add_operand() on the returned node. - @returns Pointer to new node. 
*/ + /** + Nodes must be created with at least two operands, and more can + be added by calling add_operand() on the returned node. + @returns Pointer to new \ref yc_multiply_node object. + */ virtual yc_multiply_node_ptr new_multiply_node(yc_number_node_ptr lhs /**< [in] Expression before '*' sign. */, yc_number_node_ptr rhs /**< [in] Expression after '*' sign. */ ); /// Create a subtraction node. - /** This is binary subtraction. - Use new_negation_node() for unary '-'. - @returns Pointer to new node. */ + /** + This is binary subtraction. + Use new_negation_node() for unary '-'. + @returns Pointer to new \ref yc_subtract_node object. + */ virtual yc_subtract_node_ptr new_subtract_node(yc_number_node_ptr lhs /**< [in] Expression before '-' sign. */, yc_number_node_ptr rhs /**< [in] Expression after '-' sign. */ ); /// Create a division node. - /** @returns Pointer to new node. */ + /** + @returns Pointer to new \ref yc_divide_node object. + */ virtual yc_divide_node_ptr new_divide_node(yc_number_node_ptr lhs /**< [in] Expression before '/' sign. */, yc_number_node_ptr rhs /**< [in] Expression after '/' sign. */ ); @@ -502,20 +553,25 @@ namespace yask { virtual ~yc_expr_node() {} /// Create a simple human-readable string. - /** Formats the expression starting at this node. - @returns String containing a single-line human-readable version of the expression. + /** + Formats the expression starting at this node. + @returns String containing a single-line human-readable version of the expression. */ virtual std::string format_simple() const =0; /// Count the size of the AST. - /** @returns Number of nodes in this tree, - including this node and all its descendants. */ + /** + @returns Number of nodes in this tree, + including this node and all its descendants. + */ virtual int get_num_nodes() const =0; }; /// Equation node. /** Indicates grid point on LHS is equivalent to expression - on RHS. This is NOT a test for equality. */ + on RHS. 
This is NOT a test for equality. + Created via yc_node_factory::new_equation_node(). + */ class yc_equation_node : public virtual yc_expr_node { public: @@ -537,45 +593,58 @@ namespace yask { class yc_bool_node : public virtual yc_expr_node { }; /// A dimension or an index in that dimension. - /** This is a leaf node in an AST. - Use a yask_solution object to create an object of this type. */ + /** + This is a leaf node in an AST. + Created via yc_node_factory::new_step_index(), + yc_node_factory::new_domain_index(), and + yc_node_factory::new_misc_index(). + */ class yc_index_node : public virtual yc_number_node { public: /// Get the dimension's name. /** @returns Name given at creation. */ - virtual const std::string& get_name() const =0; + virtual const std::string& + get_name() const =0; }; /// A reference to a point in a grid. + /** + Created via yc_grid::new_relative_grid_point(). + */ class yc_grid_point_node : public virtual yc_number_node { public: /// Get the grid this point is in. - /** @returns Pointer to grid. */ - virtual yc_grid_ptr get_grid() =0; + /** @returns Pointer to a \ref yc_grid object. */ + virtual yc_grid_ptr + get_grid() =0; }; /// A constant numerical value. /** All values are stored as doubles. This is a leaf node in an AST. - Use a yask_compiler_factory object to create an object of this type. */ + Created via yc_node_factory::new_const_number_node(). + */ class yc_const_number_node : public virtual yc_number_node { public: /// Set the value. /** The value is considered "constant" only when the compiler output is created. It can be changed in the AST. */ - virtual void set_value(double val /**< [in] Value to store in node. */ ) =0; + virtual void + set_value(double val /**< [in] Value to store in node. */ ) =0; /// Get the stored value. /** @returns Copy of stored value. */ - virtual double get_value() const =0; + virtual double + get_value() const =0; }; /// A numerical negation operator. /** Example: used to implement -(a*b). 
- Use a yask_compiler_factory object to create an object of this type. */ + Created via yc_node_factory::new_negate_node(). + */ class yc_negate_node : public virtual yc_number_node { public: @@ -583,7 +652,8 @@ namespace yask { /** This node implements unary negation only, not subtraction, so there is never a left-hand-side. @returns Expression node on right-hand-side of '-' sign. */ - virtual yc_number_node_ptr get_rhs() =0; + virtual yc_number_node_ptr + get_rhs() =0; }; /// Base class for commutative numerical operators. @@ -598,7 +668,8 @@ namespace yask { them. Example: for an add operator, if the operands are 'a', 'b', and 'c', then the expression is 'a + b + c'. @returns Number of operands. */ - virtual int get_num_operands() =0; + virtual int + get_num_operands() =0; /// Get the specified operand. /** @returns Pointer to node at given position or null pointer if out of bounds. */ @@ -612,35 +683,81 @@ namespace yask { }; /// An addition node. + /** Created via yc_node_factory::new_negate_node(). */ class yc_add_node : public virtual yc_commutative_number_node { }; /// A multiplication node. + /** Created via yc_node_factory::new_multiply_node(). */ class yc_multiply_node : public virtual yc_commutative_number_node { }; /// A subtraction node. + /** Created via yc_node_factory::new_subtract_node(). */ class yc_subtract_node : public virtual yc_number_node { public: /// Get the left-hand-side operand. /** @returns Pointer to expression node appearing before the '-' sign. */ - virtual yc_number_node_ptr get_lhs() =0; + virtual yc_number_node_ptr + get_lhs() =0; /// Get the right-hand-side operand. /** @returns Pointer to expression node appearing after the '-' sign. */ - virtual yc_number_node_ptr get_rhs() =0; + virtual yc_number_node_ptr + get_rhs() =0; }; /// A division node. + /** Created via yc_node_factory::new_divide_node(). */ class yc_divide_node : public virtual yc_number_node { public: /// Get the left-hand-side operand. 
/** @returns Pointer to expression node appearing before the '/' sign. */ - virtual yc_number_node_ptr get_lhs() =0; + virtual yc_number_node_ptr + get_lhs() =0; /// Get the right-hand-side operand. /** @returns Pointer to expression node appearing after the '/' sign. */ - virtual yc_number_node_ptr get_rhs() =0; + virtual yc_number_node_ptr + get_rhs() =0; + }; + + /// A manual grouping of stencil equations. + /** + Created via yc_solution::new_equation_group(). + See yc_solution::new_equation_group() for a description of + automatic versus manual grouping. + + After a \ref yc_equation_group is processed by the YASK + compiler and the resulting kernel is compiled, + it will be visible as a \ref yk_stencil_group + in the corresponding YASK kernel. + */ + class yc_equation_group { + public: + + /// Get the name of this group. + /** + @returns Name created via yc_solution::new_equation_group(). + */ + virtual const std::string& + get_name() const =0; + + /// Determine whether this group will be automatically scheduled. + /** + @returns `true` if this group will be run via yk_solution::run_solution() + or `false` if this group must be run via yk_solution::run_stencil_group(). + This is the `do_schedule` setting passed via yc_solution::new_equation_group(). + */ + virtual bool + get_do_schedule() const =0; + + /// Add an equation to this group. + virtual void + add_equation(yc_equation_node_ptr equation + /**< [in] Pointer to equation to be added. */ ) =0; + + public: }; } // namespace yask. diff --git a/include/yask_kernel_api.hpp b/include/yask_kernel_api.hpp index 481e50ce..e579cac1 100644 --- a/include/yask_kernel_api.hpp +++ b/include/yask_kernel_api.hpp @@ -46,48 +46,35 @@ namespace yask { typedef std::int64_t idx_t; #endif - /// Allocate grids on local NUMA node. - /** - This is used in yk_solution::set_default_numa_preferred - and yk_grid::set_numa_preferred. - In Python, specify as `yask_kernel.cvar.yask_numa_local`. 
- */ - const int yask_numa_local = -1; - - /// Allocate grids across all available NUMA nodes. - /** - This is used in yk_solution::set_default_numa_preferred - and yk_grid::set_numa_preferred. - In Python, specify as `yask_kernel.cvar.yask_numa_interleave`. - */ - const int yask_numa_interleave = -2; - - /// Do not specify any NUMA binding. - /** - This is used in yk_solution::set_default_numa_preferred - and yk_grid::set_numa_preferred. - In Python, specify as `yask_kernel.cvar.yask_numa_none`. - */ - const int yask_numa_none = -9; - // Forward declarations of classes and pointers. class yk_env; - /// Shared pointer to \ref yk_env + /// Shared pointer to \ref yk_env. typedef std::shared_ptr yk_env_ptr; class yk_solution; - /// Shared pointer to \ref yk_solution + /// Shared pointer to \ref yk_solution. typedef std::shared_ptr yk_solution_ptr; class yk_grid; - /// Shared pointer to \ref yk_grid + /// Shared pointer to \ref yk_grid. typedef std::shared_ptr yk_grid_ptr; + class yk_stencil_group; + /// Shared pointer to \ref yk_stencil_group. + typedef std::shared_ptr yk_stencil_group; + class yk_stats; - /// Shared pointer to \ref yk_stats + /// Shared pointer to \ref yk_stats. typedef std::shared_ptr yk_stats_ptr; +} // namespace yask. + +#include "yk_solution_api.hpp" +#include "yk_grid_api.hpp" + +namespace yask { + /// Factory to create a stencil solution. class yk_factory { public: @@ -161,1572 +148,6 @@ namespace yask { global_barrier() const =0; }; - /// Stencil solution as defined by the generated code from the YASK stencil compiler. - /** - Objects of this type contain all the grids and equations - that comprise a solution. - */ - class yk_solution { - public: - virtual ~yk_solution() {} - - /// Set object to receive debug output. - virtual void - set_debug_output(yask_output_ptr debug - /**< [out] Pointer to object to receive debug output. - See \ref yask_output_factory. */ ) =0; - - /// Get the name of the solution. 
- /** - @returns String containing the solution name provided during stencil compilation. - */ - virtual const std::string& - get_name() const =0; - - /// Get the floating-point precision size. - /** - @returns Number of bytes in each FP element: 4 or 8. - */ - virtual int - get_element_bytes() const =0; - - /// Get the solution step dimension. - /** - @returns String containing the step-dimension name. - */ - virtual std::string - get_step_dim_name() const =0; - - /// Get the number of domain dimensions used in this solution. - /** - The domain dimensions are those over which the stencil is - applied in each step. - Does *not* include the step dimension or any miscellaneous dimensions. - @returns Number of dimensions that define the problem domain. - */ - virtual int - get_num_domain_dims() const =0; - - /// Get all the domain dimension names. - /** - @returns List of all domain-dimension names. - */ - virtual std::vector - get_domain_dim_names() const =0; - - /// Get all the miscellaneous dimension names. - /** - @returns List of all dimension names used in the solution - that are not step or domain dimensions. - */ - virtual std::vector - get_misc_dim_names() const =0; - - /// Set the size of the solution domain for this rank. - /** - The domain defines the number of elements that will be evaluated with the stencil(s). - If MPI is not enabled, this is the entire problem domain. - If MPI is enabled, this is the domain for the current rank only, - and the problem domain consists of the sum of all rank domains - in each dimension (weak-scaling). - The domain size in each rank does not have to be the same, but - all domains in the same column must have the same width, - all domains in the same row must have the same height, - and so forth, for each domain dimension. - The domain size does *not* include the halo region or any padding. 
- For best performance, set the rank domain - size to a multiple of the number of elements in a vector-cluster in - each dimension whenever possible. - See the "Detailed Description" for \ref yk_grid for more information on grid sizes. - There is no domain-size setting allowed in the - solution-step dimension (usually "t"). - */ - virtual void - set_rank_domain_size(const std::string& dim - /**< [in] Name of dimension to set. Must be one of - the names from get_domain_dim_names(). */, - idx_t size /**< [in] Elements in the domain in this `dim`. */ ) =0; - - /// Get the domain size for this rank. - /** - @returns Current setting of rank domain size in specified dimension. - */ - virtual idx_t - get_rank_domain_size(const std::string& dim - /**< [in] Name of dimension to get. Must be one of - the names from get_domain_dim_names(). */) const =0; - - /// Set the minimum amount of grid padding for all grids. - /** - This sets the minimum number of elements in each grid that is - reserved outside of the rank domain in the given dimension. - This padding area can be used for required halo regions. At - least the specified number of elements will be added to both - sides, i.e., both "before" and "after" the domain. - - The *actual* padding size will be the largest of the following values, - additionally rounded up based on the vector-folding dimensions - and/or cache-line alignment: - - Halo size. - - Value provided by any of the pad-size setting functions. - - The padding size cannot be changed after data storage - has been allocated for a given grid; attempted changes to the pad size for such - grids will be ignored. - In addition, once a grid's padding is set, it cannot be reduced, only increased. - Call yk_grid::get_pad_size() to determine the actual padding size for a given grid. - See the "Detailed Description" for \ref yk_grid for more information on grid sizes. - There is no padding allowed in the solution-step dimension (usually "t"). 
- */ - virtual void - set_min_pad_size(const std::string& dim - /**< [in] Name of dimension to set. Must - be one of the names from get_domain_dim_names(). */, - idx_t size - /**< [in] Elements in this `dim` applied - to both sides of the domain. */ ) =0; - - /// Get the minimum amount of grid padding for all grids. - /** - @returns Current setting of minimum amount of grid padding for all grids. - */ - virtual idx_t - get_min_pad_size(const std::string& dim - /**< [in] Name of dimension to get. Must be one of - the names from get_domain_dim_names(). */) const =0; - - /// Set the block size in the given dimension. - /** - This sets the approximate number of elements that are evaluated in - each "block". - This is a performance setting and should not affect the functional - correctness or total number of elements evaluated. - A block is typically the unit of work done by a - top-level OpenMP thread. The actual number of elements evaluated - in a block may be greater than the specified size due to rounding - up to fold-cluster sizes. The number of elements in a block may - also be smaller than the specified size when the block is at the - edge of the domain. The block size cannot be set in the - solution-step dimension (because temporal blocking is not yet enabled). - - Unless auto-tuning is disabled, the block size will be used as - a starting point for an automated search for a higher-performing - block size. - */ - virtual void - set_block_size(const std::string& dim - /**< [in] Name of dimension to set. Must be one of - the names from get_domain_dim_names(). */, - idx_t size - /**< [in] Elements in a block in this `dim`. */ ) =0; - - /// Get the block size. - /** - Returned value may be slightly larger than the value provided - via set_block_size() due to rounding. - @returns Current settings of block size. - */ - virtual idx_t - get_block_size(const std::string& dim - /**< [in] Name of dimension to get. Must be one of - the names from get_domain_dim_names(). 
*/) const =0; - - /// Set the number of MPI ranks in the given dimension. - /** - The *product* of the number of ranks across all dimensions must - equal yk_env::get_num_ranks(). - The curent MPI rank will be assigned a unique location - within the overall problem domain based on its MPI rank index. - The same number of MPI ranks must be set via this API on each - constituent MPI rank to ensure a consistent overall configuration. - The number of ranks in each dimension must be properly set - before calling yk_solution::prepare_solution(). - There is no rank setting allowed in the - solution-step dimension (usually "t"). - */ - virtual void - set_num_ranks(const std::string& dim - /**< [in] Name of dimension to set. Must be one of - the names from get_domain_dim_names(). */, - idx_t num /**< [in] Number of ranks in `dim`. */ ) =0; - - /// Get the number of MPI ranks in the given dimension. - /** - @returns Current setting of rank size. - */ - virtual idx_t - get_num_ranks(const std::string& dim - /**< [in] Name of dimension to get. Must be one of - the names from get_domain_dim_names(). */) const =0; - - /// Get the rank index in the specified dimension. - /** - The overall rank indices in the specified dimension will range from - zero (0) to get_num_ranks() - 1, inclusive. - @returns Zero-based index of this rank. - */ - virtual idx_t - get_rank_index(const std::string& dim - /**< [in] Name of dimension to get. Must be one of - the names from get_domain_dim_names(). */ ) const =0; - - /// Get the number of grids in the solution. - /** - Grids may be pre-defined by the stencil compiler - (e.g., via yc_solution::new_grid()) - or created explicitly via yk_solution::new_grid(). - @returns Number of grids that have been created. - */ - virtual int - get_num_grids() const =0; - - /// Get the specified grid. - /** - This cannot be used to access scratch grids. - @returns Pointer to the specified grid or null pointer if it does not exist. 
- */ - virtual yk_grid_ptr - get_grid(const std::string& name /**< [in] Name of the grid. */ ) =0; - - /// Get all the grids. - /** - @returns List of all non-scratch grids in the solution. - */ - virtual std::vector - get_grids() =0; - - /// Prepare the solution for stencil application. - /** - Allocates data in grids that do not already have storage allocated. - Calculates the position of each rank in the overall problem domain. - Sets many other data structures needed for proper stencil application. - Since this function initiates MPI communication, it must be called - on all MPI ranks, and it will block until all ranks have completed. - Must be called before applying any stencils. - */ - virtual void - prepare_solution() =0; - - /// Get the first index of the sub-domain in this rank in the specified dimension. - /** - This returns the first *overall* index at the beginning of the domain. - Elements within the domain in this rank lie between the values returned by - get_first_rank_domain_index() and get_last_rank_domain_index(), inclusive. - If there is only one MPI rank, this is typically zero (0). - If there is more than one MPI rank, the value depends - on the the rank's position within the overall problem domain. - - @note This function should be called only *after* calling prepare_solution() - because prepare_solution() assigns this rank's position in the problem domain. - @returns First domain index in this rank. - */ - virtual idx_t - get_first_rank_domain_index(const std::string& dim - /**< [in] Name of dimension to get. Must be one of - the names from get_domain_dim_names(). */ ) const =0; - - /// Get the last index of the sub-domain in this rank the specified dimension. - /** - This returns the last *overall* index within the domain in this rank - (*not* one past the end). - If there is only one MPI rank, this is typically one less than the value - provided by set_rank_domain_size(). 
- If there is more than one MPI rank, the value depends - on the the rank's position within the overall problem domain. - See get_first_rank_domain_index() for more information. - - @note This function should be called only *after* calling prepare_solution() - because prepare_solution() assigns this rank's position in the problem domain. - @returns Last index in this rank. - */ - virtual idx_t - get_last_rank_domain_index(const std::string& dim - /**< [in] Name of dimension to get. Must be one of - the names from get_domain_dim_names(). */ ) const =0; - - /// Get the overall problem size in the specified dimension. - /** - The overall domain indices in the specified dimension will range from - zero (0) to get_overall_domain_size() - 1, inclusive. - Call get_first_rank_domain_index() and get_last_rank_domain_index() - to find the subset of this domain in each rank. - - @note This function should be called only *after* calling prepare_solution() - because prepare_solution() obtains the sub-domain sizes from other ranks. - @returns Sum of all ranks' domain sizes in the given dimension. - */ - virtual idx_t - get_overall_domain_size(const std::string& dim - /**< [in] Name of dimension to get. Must be one of - the names from get_domain_dim_names(). */ ) const =0; - - /// Run the stencil solution for the specified steps. - /** - The stencil(s) in the solution are applied to the grid data, setting the - index variables as follows: - 1. If temporal wave-fronts are *not* used (the default): - - The step index (e.g., `t` for "time") will be sequentially set to values - from `first_step_index` to `last_step_index`, inclusive. - + If the stencil equations were defined with dependencies on lower-valued steps, - e.g., `t+1` depends on `t`, then `last_step_index` should be greater than or equal to - `first_step_index` (forward solution). 
- + If the stencil equations were defined with dependencies on higher-valued steps, - e.g., `t-1` depends on `t`, then `last_step_index` should be less than or equal to - `first_step_index` (reverse solution). - - For each step index, the domain indices will be set - to values across the entire domain as returned by yk_solution::get_overall_domain_size() - (not necessarily sequentially). - - MPI halo exchanges will occur as necessary before, after, or during a step. - - Since this function initiates MPI communication, it must be called - on all MPI ranks, and it will block until all ranks have completed. - 2. **[Advanced]** If temporal wave-fronts *are* enabled (currently only possible via apply_command_line_options()): - - The step index (e.g., `t` for "time") will be sequentially set to values - from `first_step_index` to `last_step_index`, inclusive, within each wave-front tile. - + The number of steps in a wave-front tile may also be restricted by the size - of the tile in the step dimension. In that case, tiles will be done in slices of that size. - + Reverse solutions are not allowed with wave-front tiling. - - For each step index within each wave-front tile, the domain indices will be set - to values across the entire tile (not necessarily sequentially). - - Ultimately, the stencil(s) will be applied to same the elements in both the step - and domain dimensions as when wave-front tiling is not used. - - MPI is not supported with wave-front tiling. - - This function should be called only *after* calling prepare_solution(). - */ - virtual void - run_solution(idx_t first_step_index /**< [in] First index in the step dimension */, - idx_t last_step_index /**< [in] Last index in the step dimension */ ) =0; - - /// Run the stencil solution for the specified step. - /** - This function is simply an alias for `run_solution(step_index, step_index)`, i.e., - the solution will be applied for exactly one step across the domain. 
- - Typical C++ usage: - - ~~~{.cpp} - for (idx_t t = 1; t <= num_steps; t++) - run_solution(t); - ~~~ - - As written, the above loop is identical to - - ~~~{.cpp} - run_solution(1, num_steps); - ~~~ - - @note The parameter is *not* the number of steps to run. - @note Since only one step is taken per call, using this function effectively disables - wave-front tiling. - */ - virtual void - run_solution(idx_t step_index /**< [in] Index in the step dimension */ ) =0; - - /// Finish using a solution. - /** - Releases shared ownership of memory used by the grids. This will - result in deallocating each memory block whose ownership is not - shared by another shared pointer. - */ - virtual void - end_solution() =0; - - - /// Get performance statistics associated with preceding calls to run_solution(). - /** - Side effect: resets all statistics, so a subsequent call will - measure performance after the current call. - @returns Pointer to statistics object. - */ - virtual yk_stats_ptr - get_stats() =0; - - /// Determine whether the auto-tuner is enabled on this rank. - /** - The auto-tuner is enabled by default. - It will become disabled after it has converged or after reset_auto_tuner(false) has been called. - @returns Whether the auto-tuner is still searching. - */ - virtual bool - is_auto_tuner_enabled() =0; - - /* Advanced APIs for yk_solution found below are not needed for most applications. */ - - /// **[Advanced]** Restart or disable the auto-tuner on this rank. - /** - Under normal operation, an auto-tuner is invoked automatically during calls to - run_solution(). - Currently, only the block size is set by the auto-tuner, and the search begins from the - sizes set via set_block_size() or the default size if set_block_size() has - not been called. - This function is used to apply the current best-known settings if the tuner has - been running, reset the state of the auto-tuner, and either - restart its search or disable it from running. 
- This call must be made on each rank where the change is desired. - */ - virtual void - reset_auto_tuner(bool enable - /**< [in] If _true_, start or restart the auto-tuner search. - If _false_, disable the auto-tuner from running. */, - bool verbose = false - /**< [in] If _true_, print progress information to the debug object - set via set_debug_output(). */ ) =0; - - /// **[Advanced]** Automatically tune selected settings immediately. - /** - Executes a search algorithm to find [locally] optimum values for some of the - settings. - Under normal operation, an auto-tuner is invoked during calls to - run_solution(). - See reset_auto_tuner() for more information. - This function causes the stencil solution to be run immediately - until the auto-tuner converges on all ranks. - It is useful for benchmarking, where performance is to be timed - for a given number of steps after the best settings are found. - This function should be called only *after* calling prepare_solution(). - This call must be made on each rank. - @warning Modifies the contents of the grids by calling run_solution() - an arbitrary number of times, but without halo exchange. - (See run_solution() for other restrictions and warnings.) - Thus, grid data should be set *after* calling this function when - used in a production or test setting where correct results are expected. - */ - virtual void - run_auto_tuner_now(bool verbose = true - /**< [in] If _true_, print progress information to the debug object - set via set_debug_output(). */ ) =0; - - /// **[Advanced]** Add a new grid to the solution. - /** - This is typically not needed because grids used by the stencils are pre-defined - by the solution itself via the stencil compiler. - However, a grid may be created explicitly via this function - in order to use it for purposes other than by the - pre-defined stencils within the current solution. - - Grids created by this function will be treated like a pre-defined grid. 
- For example, - - For each domain dimension of the grid, - the new grid's domain size will be the same as that returned by - get_rank_domain_size(). - - Calls to set_rank_domain_size() will resize the corresponding domain - size in this grid. - - This grid's first domain index in this rank will be determined - by the position of this rank. - - This grid's initial padding size will be the same as that returned by - get_min_pad_size(). - - After creating a new grid, you can increase its padding - sizes in the domain dimensions via yk_grid::set_min_pad_size(), etc. - - For step and misc dimensions, you can change the allocation via - yk_grid::set_alloc_size(). - - If you want a grid that is not automatically resized based on the - solution settings, use new_fixed_size_grid() instead. - - @note A new grid contains only the meta-data for the grid; data storage - is not yet allocated. - Storage may be allocated in any of the methods listed - in the "Detailed Description" for \ref yk_grid. - @returns Pointer to the new grid. - */ - virtual yk_grid_ptr - new_grid(const std::string& name - /**< [in] Name of the grid; must be unique - within the solution. */, - const std::vector& dims - /**< [in] List of names of all dimensions. - Names must be valid C++ identifiers and - not repeated within this grid. */ ) =0; - -#ifndef SWIG - /// **[Advanced]** Add a new grid to the solution. - /** - See documentation for the version of new_grid() with a vector of dimension names - as a parameter. - @note This version is not available (or needed) in SWIG-based APIs, e.g., Python. - @returns Pointer to the new grid. - */ - virtual yk_grid_ptr - new_grid(const std::string& name - /**< [in] Name of the grid; must be unique - within the solution. */, - const std::initializer_list& dims - /**< [in] List of names of all dimensions. - Names must be valid C++ identifiers and - not repeated within this grid. 
*/ ) =0; -#endif - - /// **[Advanced]** Add a new grid to the solution with a specified size. - /** - This is typically not needed because grids used by the stencils are pre-defined - by the solution itself via the stencil compiler. - However, a grid may be created explicitly via this function - in order to use it for purposes other than by the - pre-defined stencils within the current solution. - - Unlike new_grid(), - grids created by this function will *not* be treated like a pre-defined grid. - For example, - - For each domain dimension of the grid, - the new grid's domain size is provided during creation and cannot be changed. - - Calls to set_rank_domain_size() will *not* resize the corresponding domain - size in this grid. - - This grid's first domain index in this rank will be fixed at zero (0) - regardless of this rank's position. - - This grid's padding size will be affected only by calls to - yk_grid::set_min_pad_size(), etc. - - For step and misc dimensions, you can still change the allocation via - yk_grid::set_alloc_size(). - - @note A new grid contains only the meta-data for the grid; data storage - is not yet allocated. - Storage may be allocated in any of the methods listed - in the "Detailed Description" for \ref yk_grid. - @returns Pointer to the new grid. - */ - virtual yk_grid_ptr - new_fixed_size_grid(const std::string& name - /**< [in] Name of the grid; must be unique - within the solution. */, - const std::vector& dims - /**< [in] List of names of all dimensions. - Names must be valid C++ identifiers and - not repeated within this grid. */, - const std::vector& dim_sizes - /**< [in] Initial allocation in each dimension. - Must be exatly one size for each dimension. */ ) =0; - -#ifndef SWIG - /// **[Advanced]** Add a new grid to the solution with a specified size. - /** - See documentation for the version of new_fixed_size_grid() with a vector of dimension names - as a parameter. 
- @note This version is not available (or needed) in SWIG-based APIs, e.g., Python. - @returns Pointer to the new grid. - */ - virtual yk_grid_ptr - new_fixed_size_grid(const std::string& name - /**< [in] Name of the grid; must be unique - within the solution. */, - const std::initializer_list& dims - /**< [in] List of names of all dimensions. - Names must be valid C++ identifiers and - not repeated within this grid. */, - const std::initializer_list& dim_sizes - /**< [in] Initial allocation in each dimension. - Must be exatly one size for each dimension. */ ) =0; -#endif - - /// **[Advanced]** Set the default preferred NUMA node on which to allocate data. - /** - This value is used when allocating grids and MPI buffers. - The NUMA "preferred node allocation" policy is used, meaning that - memory will be allocated in an alternative node if the preferred one - doesn't have enough space available or is otherwise restricted. - Instead of specifying a NUMA node, a special value may be used - to specify another policy as listed. - This setting may be overridden for any specific grid. - */ - virtual void - set_default_numa_preferred(int numa_node - /**< [in] Preferred NUMA node for data - allocation. Alternatively, use - `yask_numa_local` for explicit - local-node allocation, - `yask_numa_interleave` for - interleaving pages across all nodes, - or `yask_numa_none` for no explicit NUMA - policy. These constants are defined in - the _Variable Documentation_ section of - \ref yask_kernel_api.hpp. */) =0; - - /// **[Advanced]** Get the default preferred NUMA node on which to allocate data. - /** - @returns Current setting of preferred NUMA node. - */ - virtual int - get_default_numa_preferred() const =0; - - /// **[Advanced]** Set performance parameters from an option string. - /** - Parses the string for options as if from a command-line. 
- Example: "-bx 64 -block_threads 4" sets the block-size in the *x* - dimension to 64 and the number of threads used to process each - block to 4. - See the help message from the YASK kernel binary for documentation - on the command-line options. - - @returns Any strings that were not recognized by the parser as options. - */ - virtual std::string - apply_command_line_options(const std::string& args - /**< [in] String of arguments to parse. */ ) =0; - - /// **[Advanced]** Use data-storage from existing grids in specified solution. - /** - Calls yk_grid::share_storage() for each pair of grids that have the same name - in this solution and the source solution. - All conditions listed in yk_grid::share_storage() must hold for each pair. - */ - virtual void - share_grid_storage(yk_solution_ptr source - /**< [in] Solution from which grid storage will be shared. */) =0; - }; - - /// Statistics from calls to run_solution(). - /** - A throughput rate may be calculated by multiplying an - amount-of-work-per-step quantity by the number of steps done and - dividing by the number of seconds elapsed. - */ - class yk_stats { - public: - virtual ~yk_stats() {} - - /// Get the number of elements in the overall domain. - /** - @returns Product of all the overal domain sizes across all domain dimensions. - */ - virtual idx_t - get_num_elements() =0; - - /// Get the number of elements written in each step. - /** - @returns Number of elements written to each output grid. - This is the same value as get_num_elements() if there is only one output grid. - */ - virtual idx_t - get_num_writes() =0; - - /// Get the estimated number of floating-point operations required for each step. - /** - @returns Number of FP ops created by the stencil compiler. - It may be slightly more or less than the actual number of FP ops executed - by the CPU due to C++ compiler transformations. - */ - virtual idx_t - get_est_fp_ops() =0; - - /// Get the number of steps calculated via run_solution(). 
- /** - @returns A positive number, regardless of whether run_solution() steps were executed - forward or backward. - */ - virtual idx_t - get_num_steps_done() =0; - - /// Get the number of seconds elapsed during calls to run_solution(). - /** - @returns Only the time spent in run_solution(), not in any other code in your - application between calls. - */ - virtual double - get_elapsed_run_secs() =0; - }; - - /// A run-time grid. - /** - "Grid" is a generic term for any n-dimensional array. A 0-dim grid - is a scalar, a 1-dim grid is an array, etc. A run-time grid contains - data, unlike yc_grid, a compile-time grid variable. - - Typically, access to each grid is obtained via yk_solution::get_grid(). - You may also use yk_solution::new_grid() or yk_solution::new_fixed_size_grid() - if you need a grid that is not part of the pre-defined solution. - - Each dimension of a grid is one of the following: - - The *step* dimension, typically time ("t"), as identified via yk_solution::get_step_dim_name(). - - A *domain* dimension, typically a spatial dimension such as "x" or "y", - as identified via yk_solution:get_domain_dim_names(). - - A *miscellaneous* dimension, which is any dimension that is not a domain or step dimension, - as identified via yk_solution:get_misc_dim_names(). - - In the step dimension, there is no fixed domain size, and no - specified first or last index. - However, there is an allocation size, which is the number of values in the step - dimension that are stored in memory. - Step-dimension indices "wrap-around" within this allocation to reuse memory. - For example, if the step dimension is "t", and the t-dimension allocation size is 3, - then t=-2, t=0, t=3, t=6, ..., t=303, etc. would all alias to the same spatial values in memory. - - In each domain dimension, - grid sizes include the following components: - - The *domain* is the elements to which the stencils are applied. 
- - The *left padding* is all the elements before the domain and includes the left halo. - - The *right padding* is all the elements before the domain and includes the right halo. - - The *left halo* is the elements just before the domain which must be - copied between preceding ranks during halo exchanges. The left halo is contained within the left padding. - - The *right halo* is the elements just after the domain which must be - copied between following ranks during halo exchanges. The right halo is contained within the right padding. - - The *extra left padding* is the elements before the domain and left halo - and thus does not include the left halo. - - The *extra right padding* is the elements after the domain and right halo - and thus does not include the right halo. - - The *allocation* includes the left padding, domain, and right padding. - - Domain sizes specified via yk_solution::set_rank_domain_size() apply to each MPI rank. - Visually, in each of the domain dimensions, these sizes are related as follows - in each rank: - -
extra left padding left halo domain right halo extra right padding -
left padding
right padding
-
allocation
-
- - If MPI is not enabled, a rank's domain is equivalent to the entire problem size. - If MPI is enabled, the domains of the ranks are logically abutted to create the - overall problem domain in each dimension: - -
extra left padding of rank A halo of rank A domain of rank A domain of rank B - ... domain of rank Z halo of rank Z extra right padding of rank Z -
left padding of rank A
-
overall problem domain
-
right padding of rank Z
-
- The intermediate halos and paddings also exist, but are not shown in the above diagram. - The halos overlap the domains of adjacent ranks. - For example, the left halo of rank B in the diagram would overlap the domain of rank A. - Data in these overlapped regions is exchanged as needed during stencil application - to maintain a consistent values as if there was only one rank. - - In each miscellaneous dimension, there is only an allocation size, - and there is no wrap-around as in the step dimension. - Each index must be between its first and last allowed value. - - All sizes are expressed in numbers of elements. - Each element may be a 4-byte (single precision) - or 8-byte (double precision) floating-point value as returned by - yk_solution::get_element_bytes(). - - Initially, a grid is not assigned any allocated storage. - This is done to allow modification of domain, padding, and other allocation sizes - before allocation. - Once the allocation sizes have been set in all dimensions, the data storage itself may - be allocated. - This can be done in any of the following ways: - - Storage for all grids without data storage will be automatically allocated when - prepare_solution() is called. - - Storage for a specific grid may be allocated before calling prepare_solution() - via yk_grid::alloc_storage(). - - **[Advanced]** Storage for a specific grid may be shared with another grid with - existing storage via yk_grid::share_storage(). - - @note The domain index arguments to the \ref yk_grid functions that require indices - are *always* relative to the overall problem; they are *not* relative to the current rank. - The first and last overall-problem index that lies within a rank can be - retrieved via yk_solution::get_first_rank_domain_index() and - yk_solution::get_last_rank_domain_index(), respectively. 
- The first and last accessible index that lies within a rank for a given grid can be - retrieved via yk_grid::get_first_rank_alloc_index() and - yk_grid::get_last_rank_alloc_index(), respectively. - Also, index arguments are always inclusive. - Specifically, for functions that return or require a "last" index, that - index indicates the last one in the relevant range, i.e., *not* one past the last value - (this is more like Fortran and Perl than Python and Lisp). - */ - class yk_grid { - public: - virtual ~yk_grid() {} - - /// Get the name of the grid. - /** - @returns String containing name provided via yc_solution::new_grid(). - */ - virtual const std::string& get_name() const =0; - - /// Determine whether this grid is automatically resized based on the solution. - /** - @returns `true` if this grid was created via yk_solution::new_fixed_size_grid() - or `false` otherwise. - */ - virtual bool is_fixed_size() const =0; - - /// Get the number of dimensions used in this grid. - /** - This may include domain, step, and/or miscellaneous dimensions. - @returns Number of dimensions created via yc_solution::new_grid(), - yk_solution::new_grid(), or yk_solution::new_fixed_size_grid(). - */ - virtual int get_num_dims() const =0; - - /// Get all the dimensions in this grid. - /** - This may include domain, step, and/or miscellaneous dimensions. - @returns List of names of all the dimensions. - */ - virtual std::vector - get_dim_names() const =0; - - /// Determine whether specified dimension exists in this grid. - /** - @returns `true` if dimension exists (including step-dimension), - `false` otherwise. - */ - virtual bool - is_dim_used(const std::string& dim) const =0; - - /// Get the domain size for this rank. - /** - @returns The same value as yk_solution::get_rank_domain_size() if - is_fixed_size() returns `false` or the fixed sized provided via - yk_solution::new_fixed_size_grid() otherwise. 
- */ - virtual idx_t - get_rank_domain_size(const std::string& dim - /**< [in] Name of dimension to get. Must be one of - the names from yk_solution::get_domain_dim_names(). */) const =0; - - /// Get the first index of the sub-domain in this rank in the specified dimension. - /** - @note This function should be called only *after* calling prepare_solution() - because prepare_solution() assigns this rank's position in the problem domain. - @returns The same value as yk_solution::get_first_rank_domain_index() if - is_fixed_size() returns `false` or zero (0) otherwise. - */ - virtual idx_t - get_first_rank_domain_index(const std::string& dim - /**< [in] Name of dimension to get. Must be one of - the names from yk_solution::get_domain_dim_names(). */ ) const =0; - - /// Get the last index of the sub-domain in this rank in the specified dimension. - /** - @note This function should be called only *after* calling prepare_solution() - because prepare_solution() assigns this rank's position in the problem domain. - @returns The same value as yk_solution::get_last_rank_domain_index() if - is_fixed_size() returns `false` or one less than the fixed sized provided via - yk_solution::new_fixed_size_grid() otherwise. - */ - virtual idx_t - get_last_rank_domain_index(const std::string& dim - /**< [in] Name of dimension to get. Must be one of - the names from yk_solution::get_domain_dim_names(). */ ) const =0; - - /// Get the left halo size in the specified dimension. - /** - This value is typically set by the stencil compiler. - @returns Elements in halo in given dimension before the domain. - */ - virtual idx_t - get_left_halo_size(const std::string& dim - /**< [in] Name of dimension to get. - Must be one of - the names from yk_solution::get_domain_dim_names(). */ ) const =0; - - /// Get the right halo size in the specified dimension. - /** - This value is typically set by the stencil compiler. - @returns Elements in halo in given dimension after the domain. 
- */ - virtual idx_t - get_right_halo_size(const std::string& dim - /**< [in] Name of dimension to get. - Must be one of - the names from yk_solution::get_domain_dim_names(). */ ) const =0; - - /// Get the first index of the left halo in this rank in the specified dimension. - /** - @note This function should be called only *after* calling prepare_solution() - because prepare_solution() assigns this rank's position in the problem domain. - @returns The first index of left halo in this rank or - the same value as yk_grid::get_first_rank_domain_index() - if the left halo has zero size. - */ - virtual idx_t - get_first_rank_halo_index(const std::string& dim - /**< [in] Name of dimension to get. Must be one of - the names from yk_solution::get_domain_dim_names(). */ ) const =0; - - /// Get the last index of the right halo in this rank in the specified dimension. - /** - @note This function should be called only *after* calling prepare_solution() - because prepare_solution() assigns this rank's position in the problem domain. - @returns The last index of right halo in this rank or - the same value as yk_grid::get_last_rank_domain_index() - if the right halo has zero size. - */ - virtual idx_t - get_last_rank_halo_index(const std::string& dim - /**< [in] Name of dimension to get. Must be one of - the names from yk_solution::get_domain_dim_names(). */ ) const =0; - - /// Get the left padding in the specified dimension. - /** - The left padding is the memory allocated before - the domain in a given dimension. - The left padding size includes the left halo size. - The value may be slightly - larger than that provided via set_min_pad_size(), etc. due to rounding. - @returns Elements in left padding in given dimension. - */ - virtual idx_t - get_left_pad_size(const std::string& dim - /**< [in] Name of dimension to get. - Must be one of - the names from yk_solution::get_domain_dim_names(). */ ) const =0; - - /// Get the right padding in the specified dimension. 
- /** - The right padding is the memory allocated after - the domain in a given dimension. - The right padding size includes the right halo size. - The value may be slightly - larger than that provided via set_min_pad_size(), etc. due to rounding. - @returns Elements in right padding in given dimension. - */ - virtual idx_t - get_right_pad_size(const std::string& dim - /**< [in] Name of dimension to get. - Must be one of - the names from yk_solution::get_domain_dim_names(). */ ) const =0; - - /// Get the extra left padding in the specified dimension. - /** - The *extra* padding size is the left padding size minus the left halo size. - @returns Elements in padding in given dimension before the - left halo region. - */ - virtual idx_t - get_left_extra_pad_size(const std::string& dim - /**< [in] Name of dimension to get. - Must be one of - the names from yk_solution::get_domain_dim_names(). */ ) const =0; - - /// Get the extra right padding in the specified dimension. - /** - The *extra* padding size is the right padding size minus the right halo size. - @returns Elements in padding in given dimension after the - right halo region. - */ - virtual idx_t - get_right_extra_pad_size(const std::string& dim - /**< [in] Name of dimension to get. - Must be one of - the names from yk_solution::get_domain_dim_names(). */ ) const =0; - - /// Set the padding in the specified dimension. - /** - This sets the minimum number of elements in this grid - in both left and right pads. - This padding area can be used for required halo regions. - - The *actual* padding size will be the largest of the following values, - additionally rounded up based on the vector-folding dimensions - and/or cache-line alignment: - - Halo size. - - Value provided by any of the pad-size setting functions. - - The padding size cannot be changed after data storage - has been allocated for this grid; attempted changes to the pad size - will be ignored. 
- In addition, once a grid's padding is set, it cannot be reduced, only increased. - Call get_pad_size() to determine the actual padding size for the grid. - See the "Detailed Description" for \ref yk_grid for information on grid sizes. - */ - virtual void - set_min_pad_size(const std::string& dim - /**< [in] Name of dimension to set. - Must be one of - the names from yk_solution::get_domain_dim_names(). */, - idx_t size - /**< [in] Minimum number of elements to allocate beyond the domain size. */ ) =0; - - /// Get the storage allocation in the specified dimension. - /** - For the step dimension, this is the specified allocation and - does not typically depend on the number of steps evaluated. - For the non-step dimensions, this includes the domain and padding sizes. - See the "Detailed Description" for \ref yk_grid for information on grid sizes. - @returns allocation in number of elements (not bytes). - */ - virtual idx_t - get_alloc_size(const std::string& dim - /**< [in] Name of dimension to get. */ ) const =0; - - /// Get the first index of a specified miscellaneous dimension. - /** - @returns the first allowed index in a non-step and non-domain dimension. - */ - virtual idx_t - get_first_misc_index(const std::string& dim - /**< [in] Name of dimension to get. Must be one of - the names from yk_solution::get_misc_dim_names(). */ ) const =0; - - /// Get the last index of a specified miscellaneous dimension. - /** - @returns the last allowed index in a non-step and non-domain dimension. - */ - virtual idx_t - get_last_misc_index(const std::string& dim - /**< [in] Name of dimension to get. Must be one of - the names from yk_solution::get_misc_dim_names(). */ ) const =0; - - /// Determine whether the given indices are allocated in this rank. - /** - Provide indices in a list in the same order returned by get_dim_names(). - Indices are relative to the *overall* problem domain. 
- @returns `true` if index values fall within the allocated space as returned by - get_first_rank_alloc_index() and get_last_rank_alloc_index() for - each dimension; `false` otherwise. - */ - virtual bool - is_element_allocated(const std::vector& indices - /**< [in] List of indices, one for each grid dimension. */ ) const =0; - -#ifndef SWIG - /// Determine whether the given indices are allocated in this rank. - /** - Provide indices in a list in the same order returned by get_dim_names(). - Indices are relative to the *overall* problem domain. - @note This version is not available (or needed) in SWIG-based APIs, e.g., Python. - @returns `true` if index values fall within the allocated space as returned by - get_first_rank_alloc_index() and get_last_rank_alloc_index() for - each dimension; `false` otherwise. - */ - virtual bool - is_element_allocated(const std::initializer_list& indices - /**< [in] List of indices, one for each grid dimension. */ ) const =0; -#endif - - /// Get the value of one grid element. - /** - Provide indices in a list in the same order returned by get_dim_names(). - Indices are relative to the *overall* problem domain. - Index values must fall within the allocated space as returned by - get_first_rank_alloc_index() and get_last_rank_alloc_index() for - each dimension. - @returns value in grid at given multi-dimensional location. - */ - virtual double - get_element(const std::vector& indices - /**< [in] List of indices, one for each grid dimension. */ ) const =0; - -#ifndef SWIG - /// Get the value of one grid element. - /** - Provide indices in a list in the same order returned by get_dim_names(). - Indices are relative to the *overall* problem domain. - Index values must fall within the allocated space as returned by - get_first_rank_alloc_index() and get_last_rank_alloc_index() for - each dimension. 
- @note The return value is a double-precision floating-point value, but - it will be converted from a single-precision if - yk_solution::get_element_bytes() returns 4. - @note This version is not available (or needed) in SWIG-based APIs, e.g., Python. - @returns value in grid at given multi-dimensional location. - */ - virtual double - get_element(const std::initializer_list& indices - /**< [in] List of indices, one for each grid dimension. */ ) const =0; -#endif - - /// Get grid elements within specified subset of the grid. - /** - Reads all elements from `first_indices` to `last_indices` in each dimension - and writes them to consecutive memory locations in the buffer. - Indices in the buffer progress in row-major order. - The buffer pointed to must contain the number of bytes equal to - yk_solution::get_element_bytes() multiplied by the number of - elements in the specified slice. - Since the reads proceed in row-major order, the last index is "unit-stride" - in the buffer. - Provide indices in two lists in the same order returned by get_dim_names(). - Indices are relative to the *overall* problem domain. - Index values must fall within the allocated space as returned by - get_first_rank_alloc_index() and get_last_rank_alloc_index() for - each dimension. - @returns Number of elements read. - */ - virtual idx_t - get_elements_in_slice(void* buffer_ptr - /**< [out] Pointer to buffer where values will be written. */, - const std::vector& first_indices - /**< [in] List of initial indices, one for each grid dimension. */, - const std::vector& last_indices - /**< [in] List of final indices, one for each grid dimension. */ ) const =0; - - /// Set the value of one grid element. - /** - Provide indices in a list in the same order returned by get_dim_names(). - Indices are relative to the *overall* problem domain. - Index values must fall within the allocated space as returned by - get_first_rank_alloc_index() and get_last_rank_alloc_index() for - each dimension. 
- @note The parameter value is a double-precision floating-point value, but - it will be converted to single-precision if - yk_solution::get_element_bytes() returns 4. - If storage has not been allocated for this grid, this will have no effect. - @returns Number of elements set. - */ - virtual idx_t - set_element(double val /**< [in] Element in grid will be set to this. */, - const std::vector& indices - /**< [in] List of indices, one for each grid dimension. */, - bool strict_indices = false - /**< [in] If true, indices must be within domain or padding. - If false, indices outside of domain and padding result - in no change to grid. */ ) =0; - -#ifndef SWIG - /// Set the value of one grid element. - /** - Provide the number of indices equal to the number of dimensions in the grid. - Indices beyond that will be ignored. - Indices are relative to the *overall* problem domain. - If any index values fall outside of the allocated space as returned by - get_first_rank_alloc_index() and get_last_rank_alloc_index() for - each dimension, this will have no effect. - @note The parameter value is a double-precision floating-point value, but - it will be converted to single-precision if - yk_solution::get_element_bytes() returns 4. - If storage has not been allocated for this grid, this will have no effect. - @note This version is not available (or needed) in SWIG-based APIs, e.g., Python. - @returns Number of elements set. - */ - virtual idx_t - set_element(double val /**< [in] Element in grid will be set to this. */, - const std::initializer_list& indices - /**< [in] List of indices, one for each grid dimension. */, - bool strict_indices = false - /**< [in] If true, indices must be within domain or padding. - If false, indices outside of domain and padding result - in no change to grid. */ ) =0; -#endif - /// Atomically add to the value of one grid element. - /** - Provide indices in a list in the same order returned by get_dim_names(). 
- Indices are relative to the *overall* problem domain. - Index values must fall within the allocated space as returned by - get_first_rank_alloc_index() and get_last_rank_alloc_index() for - each dimension if `strict_indices` is set to true. - Updates are OpenMP atomic, meaning that this function can be called by - several OpenMP threads without causing a race condition. - @note The parameter value is a double-precision floating-point value, but - it will be converted to single-precision if - yk_solution::get_element_bytes() returns 4. - If storage has not been allocated for this grid, this will have no effect. - @returns Number of elements updated. - */ - virtual idx_t - add_to_element(double val /**< [in] This value will be added to element in grid. */, - const std::vector& indices - /**< [in] List of indices, one for each grid dimension. */, - bool strict_indices = false - /**< [in] If true, indices must be within domain or padding. - If false, indices outside of domain and padding result - in no change to grid. */ ) =0; - -#ifndef SWIG - /// Atomically add to the value of one grid element. - /** - Provide the number of indices equal to the number of dimensions in the grid. - Indices beyond that will be ignored. - Indices are relative to the *overall* problem domain. - Index values must fall within the allocated space as returned by - get_first_rank_alloc_index() and get_last_rank_alloc_index() for - each dimension if `strict_indices` is set to true. - Updates are OpenMP atomic, meaning that this function can be called by - several OpenMP threads without causing a race condition. - @note The parameter value is a double-precision floating-point value, but - it will be converted to single-precision if - yk_solution::get_element_bytes() returns 4. - If storage has not been allocated for this grid, this will have no effect. - @note This version is not available (or needed) in SWIG-based APIs, e.g., Python. - @returns Number of elements set. 
- */ - virtual idx_t - add_to_element(double val /**< [in] This value will be added to element in grid. */, - const std::initializer_list& indices - /**< [in] List of indices, one for each grid dimension. */, - bool strict_indices = false - /**< [in] If true, indices must be within domain or padding. - If false, indices outside of domain and padding result - in no change to grid. */ ) =0; -#endif - - /// Initialize all grid elements to the same value. - /** - Sets all allocated elements, including those in the domain and padding - area to the same specified value. - @note The parameter is a double-precision floating-point value, but - it will be converted to single-precision if - yk_solution::get_element_bytes() returns 4. - @note If storage has not been allocated via yk_solution::prepare_solution(), - this will have no effect. - */ - virtual void - set_all_elements_same(double val /**< [in] All elements will be set to this. */ ) =0; - - /// Initialize grid elements within specified subset of the grid to the same value. - /** - Sets all elements from `first_indices` to `last_indices` in each dimension to the - specified value. - Provide indices in two lists in the same order returned by get_dim_names(). - Indices are relative to the *overall* problem domain. - Index values must fall within the allocated space as returned by - get_first_rank_alloc_index() and get_last_rank_alloc_index() for - each dimension. - Indices are relative to the *overall* problem domain. - If storage has not been allocated for this grid, this will have no effect. - @returns Number of elements set. - */ - virtual idx_t - set_elements_in_slice_same(double val /**< [in] All elements in the slice will be set to this. */, - const std::vector& first_indices - /**< [in] List of initial indices, one for each grid dimension. */, - const std::vector& last_indices - /**< [in] List of final indices, one for each grid dimension. 
*/, - bool strict_indices = false - /**< [in] If true, indices must be within domain or padding. - If false, only elements within the allocation of this grid - will be set, and elements outside will be ignored. */ ) =0; - - /// Set grid elements within specified subset of the grid. - /** - Reads elements from consecutive memory locations, - starting at `buffer_ptr` - and writes them from `first_indices` to `last_indices` in each dimension. - Indices in the buffer progress in row-major order. - The buffer pointed to must contain either 4 or 8 byte FP values per element in the - subset, depending on the FP precision of the solution. - The buffer pointed to must contain the number of FP values in the specified slice, - where each FP value is the size of yk_solution::get_element_bytes(). - Since the writes proceed in row-major order, the last index is "unit-stride" - in the buffer. - Provide indices in two lists in the same order returned by get_dim_names(). - Indices are relative to the *overall* problem domain. - Index values must fall within the allocated space as returned by - get_first_rank_alloc_index() and get_last_rank_alloc_index() for - each dimension. - Indices are relative to the *overall* problem domain. - If storage has not been allocated for this grid, this will have no effect. - @returns Number of elements written. - */ - virtual idx_t - set_elements_in_slice(const void* buffer_ptr - /**< [out] Pointer to buffer where values will be read. */, - const std::vector& first_indices - /**< [in] List of initial indices, one for each grid dimension. */, - const std::vector& last_indices - /**< [in] List of final indices, one for each grid dimension. */ ) =0; - - /// Format the indices for pretty-printing. - /** - Provide indices in a list in the same order returned by get_dim_names(). - @returns A string containing the grid name and the index values. 
- */ - virtual std::string - format_indices(const std::vector& indices - /**< [in] List of indices, one for each grid dimension. */ ) const =0; - -#ifndef SWIG - /// Format the indices for pretty-printing. - /** - Provide indices in a list in the same order returned by get_dim_names(). - @note This version is not available (or needed) in SWIG-based APIs, e.g., Python. - @returns A string containing the grid name and the index values. - */ - virtual std::string - format_indices(const std::initializer_list& indices - /**< [in] List of indices, one for each grid dimension. */ ) const =0; -#endif - - /// Determine whether storage has been allocated. - /** - @returns `true` if storage has been allocated, - `false` otherwise. - */ - virtual bool - is_storage_allocated() const =0; - - /// Determine size of raw storage in bytes. - /** - @returns Minimum number of bytes required for - storage given the current domain size and padding settings. - */ - virtual idx_t - get_num_storage_bytes() const =0; - - /// Determine size of raw storage in elements. - /** - @returns get_num_storage_bytes() / yk_solution.get_element_bytes(). - */ - virtual idx_t - get_num_storage_elements() const =0; - - /* Advanced APIs for yk_grid found below are not needed for most applications. */ - - /// **[Advanced]** Set the default preferred NUMA node on which to allocate data. - /** - This value is used when allocating data for this grid. - Thus, the desired NUMA policy must be set before calling alloc_data() - or yk_solution::prepare_solution(). - */ - virtual void - set_numa_preferred(int numa_node - /**< [in] Preferred NUMA node. - See yk_solution::set_default_numa_preferred() for other options. */) =0; - - /// **[Advanced]** Get the default preferred NUMA node on which to allocate data. - /** - @returns Current setting of preferred NUMA node for this grid. - */ - virtual int - get_numa_preferred() const =0; - - /// **[Advanced]** Set the left halo size in the specified dimension. 
- /** - This value is typically set by the stencil compiler, but - this function allows you to override that value. - If the left halo is set to a value larger than the left padding size, the - left padding size will be automatically increase to accomodate it. - @note After data storage has been allocated, the left halo size - can only be set to a value less than or equal to the left padding size - in the given dimension. - */ - virtual void - set_left_halo_size(const std::string& dim - /**< [in] Name of dimension to get. - Must be one of - the names from yk_solution::get_domain_dim_names(). */, - idx_t size - /**< [in] Number of elements in the left halo. */ ) =0; - - /// **[Advanced]** Set the right halo size in the specified dimension. - /** - This value is typically set by the stencil compiler, but - this function allows you to override that value. - If the right halo is set to a value larger than the right padding size, the - right padding size will be automatically increase to accomodate it. - @note After data storage has been allocated, the right halo size - can only be set to a value less than or equal to the right padding size - in the given dimension. - */ - virtual void - set_right_halo_size(const std::string& dim - /**< [in] Name of dimension to get. - Must be one of - the names from yk_solution::get_domain_dim_names(). */, - idx_t size - /**< [in] Number of elements in the right halo. */ ) =0; - - /// **[Advanced]** Set the left and right halo sizes in the specified dimension. - /** - Alias for set_left_halo_size(dim, size); set_right_halo_size(dim, size). - */ - virtual void - set_halo_size(const std::string& dim - /**< [in] Name of dimension to get. - Must be one of - the names from yk_solution::get_domain_dim_names(). */, - idx_t size - /**< [in] Number of elements in the halo. */ ) =0; - - - /// **[Advanced]** Set the number of elements to allocate in the specified dimension. - /** - This setting is only allowed in the step dimension. 
- Typically, the allocation in the step dimension is determined by the - stencil compiler, but - this function allows you to override that value. - Allocations in other dimensions should be set indirectly - via the domain and padding sizes. - The allocation size cannot be changed after data storage - has been allocated for this grid. - */ - virtual void - set_alloc_size(const std::string& dim - /**< [in] Name of dimension to set. - Must *not* be one of - the names from yk_solution::get_domain_dim_names(). */, - idx_t size /**< [in] Number of elements to allocate. */ ) =0; - - /// **[Advanced]** Set the first index of a specified miscellaneous dimension. - /** - Sets the first allowed index in a non-step and non-domain dimension. - After calling this function, the last allowed index will be the first index - as set by this function plus the allocation size set by set_alloc_size() - minus one. - */ - virtual void - set_first_misc_index(const std::string& dim - /**< [in] Name of dimension to get. Must be one of - the names from yk_solution::get_misc_dim_names(). */, - idx_t idx /**< [in] New value for first index. - May be negative. */ ) =0; - - /// **[Advanced]** Get the first accessible index in this grid in this rank in the specified dimension. - /** - This returns the first *overall* index allowed in this grid. - This element may be in the domain, left halo, or extra left padding area. - This function is only for checking the legality of an index. - @returns First allowed index in this grid. - */ - virtual idx_t - get_first_rank_alloc_index(const std::string& dim - /**< [in] Name of dimension to get. - Must be one of - the names from yk_solution::get_domain_dim_names(). */ ) const =0; - - /// **[Advanced]** Get the last accessible index in this grid in this rank in the specified dimension. - /** - This returns the last *overall* index allowed in this grid. - This element may be in the domain, right halo, or extra right padding area. 
- This function is only for checking the legality of an index. - @returns Last allowed index in this grid. - */ - virtual idx_t - get_last_rank_alloc_index(const std::string& dim - /**< [in] Name of dimension to get. - Must be one of - the names from yk_solution::get_domain_dim_names(). */ ) const =0; - - /// **[Advanced]** Explicitly allocate data-storage memory for this grid. - /** - Amount of allocation is calculated based on domain, padding, and - step-dimension allocation sizes. - Any pre-existing storage will be released before allocation as via release_storage(). - See allocation options in the "Detailed Description" for \ref yk_grid. - */ - virtual void - alloc_storage() =0; - - /// **[Advanced]** Explicitly release any allocated data-storage for this grid. - /** - This will release storage allocated via any of the options - described in the "Detailed Description" for \ref yk_grid. - If the data was shared between two or more grids, the data will - be retained by the remaining grids. - */ - virtual void - release_storage() =0; - - /// **[Advanced]** Determines whether storage layout is the same as another grid. - /** - In order for the storage layout to be identical, the following - must be the same: - - Number of dimensions. - - Name of each dimension, in the same order. - - Allocation size in each dimension. - - Rank domain size in each domain dimension. - - Padding size in each domain dimension. - - The following do not have to be identical: - - Halo size. - - @returns `true` if storage for this grid has the same layout as - `other` or `false` otherwise. - */ - virtual bool - is_storage_layout_identical(const yk_grid_ptr other) const =0; - - /// **[Advanced]** Use existing data-storage from specified grid. - /** - This is an alternative to allocating data storage via - yk_solution::prepare_solution() or alloc_storage(). - In this case, data from a grid in this or another solution will be shared with - this grid. 
- In order to successfully share storage, the following conditions must hold: - - The source grid must already have storage allocated. - - The two grids must have the same dimensions in the same order. - - The two grids must have the same domain sizes in all domain dimensions. - - The two grids must have the same allocation sizes in non-domain dimensions. - - The required padding size of this grid must be less than or - equal to the actual padding size of the source grid in all domain - dimensions. The required padding size of this grid will be equal to - or greater than its halo size. It is not strictly necessary that the - two grids have the same halo sizes, but that is a sufficient condition. - - Any pre-existing storage will be released before allocation as via release_storage(). - The padding size(s) of this grid will be set to that of the source grid. - After calling share_storage(), changes in one grid via set_all_elements() - or set_element() will be visible in the other grid. - - See allocation options and more information about grid sizes - in the "Detailed Description" for \ref yk_grid. - */ - virtual void - share_storage(yk_grid_ptr source - /**< [in] Grid from which storage will be shared. */) =0; - - /// **[Advanced]** Get pointer to raw data storage buffer. - /** - The following assumptions about the contents of data are safe: - - Each FP element starts at a number of bytes from the beginning - of the buffer which is a multiple of yk_solution::get_element_bytes(). - - All the FP elements will be located within get_num_storage_bytes() - bytes from the beginning of the buffer. - - A call to set_all_elements_same() will initialize all elements - within get_num_storage_bytes() bytes from the beginning of the buffer. - - If is_storage_layout_identical() returns `true` between this - and some other grid, any given element index applied to both grids - will refer to an element at the same offset into their respective - data buffers. 
- - Thus, - - You can perform element-wise unary mathematical operations on - all elements of a grid via its raw buffer, e.g., add some constant - value to all elements. - - If the layouts of two grids are identical, you can use their - raw buffers to copy or compare the grid contents for equality or - perform element-wise binary mathematical operations on them, - e.g., add all elements from one grid to another. - - The following assumptions are not safe: - - Any expectations regarding the relationship between an element - index and that element's offset from the beginning of the buffer - such as row-major or column-major layout. - - All elements in the buffer are part of the rank domain or halo. - - Thus, - - You should not perform any operations dependent on - the logical indices of any element via raw buffer, e.g., matrix - multiply. - - @returns Pointer to raw data storage if is_storage_allocated() - returns `true` or NULL otherwise. - */ - virtual void* get_raw_storage_buffer() =0; - - /* Deprecated APIs for yk_grid found below should be avoided. - Use the more explicit form found in the documentation. */ - - /// **[Deprecated]** Get the left halo size in the specified dimension. - /** - Alias for get_left_halo_size(dim, size). - @returns Elements in halo in given dimension before the domain. - */ - virtual idx_t - get_halo_size(const std::string& dim - /**< [in] Name of dimension to get. - Must be one of - the names from yk_solution::get_domain_dim_names(). */ ) const =0; - - /// **[Deprecated]** Get the left padding in the specified dimension. - /** - Alias for get_left_pad_size(dim). - @returns Elements in left padding in given dimension. - */ - virtual idx_t - get_pad_size(const std::string& dim - /**< [in] Name of dimension to get. - Must be one of - the names from yk_solution::get_domain_dim_names(). */ ) const =0; - - /// **[Deprecated]** Get the extra left padding in the specified dimension. - /** - Alias for get_extra_left_pad_size(dim). 
- @returns Elements in padding in given dimension before the - left halo region. - */ - virtual idx_t - get_extra_pad_size(const std::string& dim - /**< [in] Name of dimension to get. - Must be one of - the names from yk_solution::get_domain_dim_names(). */ ) const =0; - - }; - - } // namespace yask. #endif diff --git a/include/yk_grid_api.hpp b/include/yk_grid_api.hpp new file mode 100644 index 00000000..d6245e9e --- /dev/null +++ b/include/yk_grid_api.hpp @@ -0,0 +1,942 @@ +/***************************************************************************** + +YASK: Yet Another Stencil Kernel +Copyright (c) 2014-2018, Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to +deal in the Software without restriction, including without limitation the +rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +sell copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +* The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +IN THE SOFTWARE. + +*****************************************************************************/ + +///////// API for the YASK stencil kernel grid. //////////// + +// This file uses Doxygen 1.8 markup for API documentation-generation. +// See http://www.stack.nl/~dimitri/doxygen. 
+/** @file yk_grid_api.hpp */ + +#ifndef YK_GRID_API +#define YK_GRID_API + +#include "yask_kernel_api.hpp" + +namespace yask { + + /// A run-time grid. + /** + "Grid" is a generic term for any n-dimensional array. A 0-dim grid + is a scalar, a 1-dim grid is an array, etc. A run-time grid contains + data, unlike yc_grid, a compile-time grid variable. + + Typically, access to each grid is obtained via yk_solution::get_grid(). + You may also use yk_solution::new_grid() or yk_solution::new_fixed_size_grid() + if you need a grid that is not part of the pre-defined solution. + + Each dimension of a grid is one of the following: + - The *step* dimension, typically time ("t"), as identified via yk_solution::get_step_dim_name(). + - A *domain* dimension, typically a spatial dimension such as "x" or "y", + as identified via yk_solution:get_domain_dim_names(). + - A *miscellaneous* dimension, which is any dimension that is not a domain or step dimension, + as identified via yk_solution:get_misc_dim_names(). + + In the step dimension, there is no fixed domain size, and no + specified first or last index. + However, there is an allocation size, which is the number of values in the step + dimension that are stored in memory. + Step-dimension indices "wrap-around" within this allocation to reuse memory. + For example, if the step dimension is "t", and the t-dimension allocation size is 3, + then t=-2, t=0, t=3, t=6, ..., t=303, etc. would all alias to the same spatial values in memory. + + In each domain dimension, + grid sizes include the following components: + - The *domain* is the elements to which the stencils are applied. + - The *left padding* is all the elements before the domain and includes the left halo. + - The *right padding* is all the elements before the domain and includes the right halo. + - The *left halo* is the elements just before the domain which must be + copied between preceding ranks during halo exchanges. 
The left halo is contained within the left padding. + - The *right halo* is the elements just after the domain which must be + copied between following ranks during halo exchanges. The right halo is contained within the right padding. + - The *extra left padding* is the elements before the domain and left halo + and thus does not include the left halo. + - The *extra right padding* is the elements after the domain and right halo + and thus does not include the right halo. + - The *allocation* includes the left padding, domain, and right padding. + + Domain sizes specified via yk_solution::set_rank_domain_size() apply to each MPI rank. + Visually, in each of the domain dimensions, these sizes are related as follows + in each rank: + +
extra left padding left halo domain right halo extra right padding +
left padding
right padding
+
allocation
+
+ + If MPI is not enabled, a rank's domain is equivalent to the entire problem size. + If MPI is enabled, the domains of the ranks are logically abutted to create the + overall problem domain in each dimension: + +
extra left padding of rank A halo of rank A domain of rank A domain of rank B + ... domain of rank Z halo of rank Z extra right padding of rank Z +
left padding of rank A
+
overall problem domain
+
right padding of rank Z
+
+ The intermediate halos and paddings also exist, but are not shown in the above diagram. + The halos overlap the domains of adjacent ranks. + For example, the left halo of rank B in the diagram would overlap the domain of rank A. + Data in these overlapped regions is exchanged as needed during stencil application + to maintain a consistent values as if there was only one rank. + + In each miscellaneous dimension, there is only an allocation size, + and there is no wrap-around as in the step dimension. + Each index must be between its first and last allowed value. + + All sizes are expressed in numbers of elements. + Each element may be a 4-byte (single precision) + or 8-byte (double precision) floating-point value as returned by + yk_solution::get_element_bytes(). + + Initially, a grid is not assigned any allocated storage. + This is done to allow modification of domain, padding, and other allocation sizes + before allocation. + Once the allocation sizes have been set in all dimensions, the data storage itself may + be allocated. + This can be done in any of the following ways: + - Storage for all grids without data storage will be automatically allocated when + prepare_solution() is called. + - Storage for a specific grid may be allocated before calling prepare_solution() + via yk_grid::alloc_storage(). + - **[Advanced]** Storage for a specific grid may be shared with another grid with + existing storage via yk_grid::share_storage(). + + @note The domain index arguments to the \ref yk_grid functions that require indices + are *always* relative to the overall problem; they are *not* relative to the current rank. + The first and last overall-problem index that lies within a rank can be + retrieved via yk_solution::get_first_rank_domain_index() and + yk_solution::get_last_rank_domain_index(), respectively. 
+ The first and last accessible index that lies within a rank for a given grid can be + retrieved via yk_grid::get_first_rank_alloc_index() and + yk_grid::get_last_rank_alloc_index(), respectively. + Also, index arguments are always inclusive. + Specifically, for functions that return or require a "last" index, that + index indicates the last one in the relevant range, i.e., *not* one past the last value + (this is more like Fortran and Perl than Python and Lisp). + */ + class yk_grid { + public: + virtual ~yk_grid() {} + + /// Get the name of the grid. + /** + @returns String containing name provided via yc_solution::new_grid(). + */ + virtual const std::string& get_name() const =0; + + /// Determine whether this grid is automatically resized based on the solution. + /** + @returns `true` if this grid was created via yk_solution::new_fixed_size_grid() + or `false` otherwise. + */ + virtual bool is_fixed_size() const =0; + + /// Get the number of dimensions used in this grid. + /** + This may include domain, step, and/or miscellaneous dimensions. + @returns Number of dimensions created via yc_solution::new_grid(), + yk_solution::new_grid(), or yk_solution::new_fixed_size_grid(). + */ + virtual int get_num_dims() const =0; + + /// Get all the dimensions in this grid. + /** + This may include domain, step, and/or miscellaneous dimensions. + @returns List of names of all the dimensions. + */ + virtual std::vector + get_dim_names() const =0; + + /// Determine whether specified dimension exists in this grid. + /** + @returns `true` if dimension exists (including step-dimension), + `false` otherwise. + */ + virtual bool + is_dim_used(const std::string& dim) const =0; + + /// Get the domain size for this rank. + /** + @returns The same value as yk_solution::get_rank_domain_size() if + is_fixed_size() returns `false` or the fixed sized provided via + yk_solution::new_fixed_size_grid() otherwise. 
+ */ + virtual idx_t + get_rank_domain_size(const std::string& dim + /**< [in] Name of dimension to get. Must be one of + the names from yk_solution::get_domain_dim_names(). */) const =0; + + /// Get the first index of the sub-domain in this rank in the specified dimension. + /** + @note This function should be called only *after* calling prepare_solution() + because prepare_solution() assigns this rank's position in the problem domain. + @returns The same value as yk_solution::get_first_rank_domain_index() if + is_fixed_size() returns `false` or zero (0) otherwise. + */ + virtual idx_t + get_first_rank_domain_index(const std::string& dim + /**< [in] Name of dimension to get. Must be one of + the names from yk_solution::get_domain_dim_names(). */ ) const =0; + + /// Get the last index of the sub-domain in this rank in the specified dimension. + /** + @note This function should be called only *after* calling prepare_solution() + because prepare_solution() assigns this rank's position in the problem domain. + @returns The same value as yk_solution::get_last_rank_domain_index() if + is_fixed_size() returns `false` or one less than the fixed sized provided via + yk_solution::new_fixed_size_grid() otherwise. + */ + virtual idx_t + get_last_rank_domain_index(const std::string& dim + /**< [in] Name of dimension to get. Must be one of + the names from yk_solution::get_domain_dim_names(). */ ) const =0; + + /// Get the left halo size in the specified dimension. + /** + This value is typically set by the stencil compiler. + @returns Elements in halo in given dimension before the domain. + */ + virtual idx_t + get_left_halo_size(const std::string& dim + /**< [in] Name of dimension to get. + Must be one of + the names from yk_solution::get_domain_dim_names(). */ ) const =0; + + /// Get the right halo size in the specified dimension. + /** + This value is typically set by the stencil compiler. + @returns Elements in halo in given dimension after the domain. 
+ */ + virtual idx_t + get_right_halo_size(const std::string& dim + /**< [in] Name of dimension to get. + Must be one of + the names from yk_solution::get_domain_dim_names(). */ ) const =0; + + /// Get the first index of the left halo in this rank in the specified dimension. + /** + @note This function should be called only *after* calling prepare_solution() + because prepare_solution() assigns this rank's position in the problem domain. + @returns The first index of left halo in this rank or + the same value as yk_grid::get_first_rank_domain_index() + if the left halo has zero size. + */ + virtual idx_t + get_first_rank_halo_index(const std::string& dim + /**< [in] Name of dimension to get. Must be one of + the names from yk_solution::get_domain_dim_names(). */ ) const =0; + + /// Get the last index of the right halo in this rank in the specified dimension. + /** + @note This function should be called only *after* calling prepare_solution() + because prepare_solution() assigns this rank's position in the problem domain. + @returns The last index of right halo in this rank or + the same value as yk_grid::get_last_rank_domain_index() + if the right halo has zero size. + */ + virtual idx_t + get_last_rank_halo_index(const std::string& dim + /**< [in] Name of dimension to get. Must be one of + the names from yk_solution::get_domain_dim_names(). */ ) const =0; + + /// Get the left padding in the specified dimension. + /** + The left padding is the memory allocated before + the domain in a given dimension. + The left padding size includes the left halo size. + The value may be slightly + larger than that provided via set_min_pad_size(), etc. due to rounding. + @returns Elements in left padding in given dimension. + */ + virtual idx_t + get_left_pad_size(const std::string& dim + /**< [in] Name of dimension to get. + Must be one of + the names from yk_solution::get_domain_dim_names(). */ ) const =0; + + /// Get the right padding in the specified dimension. 
+ /** + The right padding is the memory allocated after + the domain in a given dimension. + The right padding size includes the right halo size. + The value may be slightly + larger than that provided via set_min_pad_size(), etc. due to rounding. + @returns Elements in right padding in given dimension. + */ + virtual idx_t + get_right_pad_size(const std::string& dim + /**< [in] Name of dimension to get. + Must be one of + the names from yk_solution::get_domain_dim_names(). */ ) const =0; + + /// Get the extra left padding in the specified dimension. + /** + The *extra* padding size is the left padding size minus the left halo size. + @returns Elements in padding in given dimension before the + left halo region. + */ + virtual idx_t + get_left_extra_pad_size(const std::string& dim + /**< [in] Name of dimension to get. + Must be one of + the names from yk_solution::get_domain_dim_names(). */ ) const =0; + + /// Get the extra right padding in the specified dimension. + /** + The *extra* padding size is the right padding size minus the right halo size. + @returns Elements in padding in given dimension after the + right halo region. + */ + virtual idx_t + get_right_extra_pad_size(const std::string& dim + /**< [in] Name of dimension to get. + Must be one of + the names from yk_solution::get_domain_dim_names(). */ ) const =0; + + /// Set the padding in the specified dimension. + /** + This sets the minimum number of elements in this grid + in both left and right pads. + This padding area can be used for required halo regions. + + The *actual* padding size will be the largest of the following values, + additionally rounded up based on the vector-folding dimensions + and/or cache-line alignment: + - Halo size. + - Value provided by any of the pad-size setting functions. + + The padding size cannot be changed after data storage + has been allocated for this grid; attempted changes to the pad size + will be ignored. 
+ In addition, once a grid's padding is set, it cannot be reduced, only increased. + Call get_pad_size() to determine the actual padding size for the grid. + See the "Detailed Description" for \ref yk_grid for information on grid sizes. + */ + virtual void + set_min_pad_size(const std::string& dim + /**< [in] Name of dimension to set. + Must be one of + the names from yk_solution::get_domain_dim_names(). */, + idx_t size + /**< [in] Minimum number of elements to allocate beyond the domain size. */ ) =0; + + /// Get the storage allocation in the specified dimension. + /** + For the step dimension, this is the specified allocation and + does not typically depend on the number of steps evaluated. + For the non-step dimensions, this includes the domain and padding sizes. + See the "Detailed Description" for \ref yk_grid for information on grid sizes. + @returns allocation in number of elements (not bytes). + */ + virtual idx_t + get_alloc_size(const std::string& dim + /**< [in] Name of dimension to get. */ ) const =0; + + /// Get the first index of a specified miscellaneous dimension. + /** + @returns the first allowed index in a non-step and non-domain dimension. + */ + virtual idx_t + get_first_misc_index(const std::string& dim + /**< [in] Name of dimension to get. Must be one of + the names from yk_solution::get_misc_dim_names(). */ ) const =0; + + /// Get the last index of a specified miscellaneous dimension. + /** + @returns the last allowed index in a non-step and non-domain dimension. + */ + virtual idx_t + get_last_misc_index(const std::string& dim + /**< [in] Name of dimension to get. Must be one of + the names from yk_solution::get_misc_dim_names(). */ ) const =0; + + /// Determine whether the given indices are allocated in this rank. + /** + Provide indices in a list in the same order returned by get_dim_names(). + Indices are relative to the *overall* problem domain. 
+ @returns `true` if index values fall within the allocated space as returned by + get_first_rank_alloc_index() and get_last_rank_alloc_index() for + each dimension; `false` otherwise. + */ + virtual bool + is_element_allocated(const std::vector& indices + /**< [in] List of indices, one for each grid dimension. */ ) const =0; + +#ifndef SWIG + /// Determine whether the given indices are allocated in this rank. + /** + Provide indices in a list in the same order returned by get_dim_names(). + Indices are relative to the *overall* problem domain. + @note This version is not available (or needed) in SWIG-based APIs, e.g., Python. + @returns `true` if index values fall within the allocated space as returned by + get_first_rank_alloc_index() and get_last_rank_alloc_index() for + each dimension; `false` otherwise. + */ + virtual bool + is_element_allocated(const std::initializer_list& indices + /**< [in] List of indices, one for each grid dimension. */ ) const =0; +#endif + + /// Get the value of one grid element. + /** + Provide indices in a list in the same order returned by get_dim_names(). + Indices are relative to the *overall* problem domain. + Index values must fall within the allocated space as returned by + get_first_rank_alloc_index() and get_last_rank_alloc_index() for + each dimension. + @returns value in grid at given multi-dimensional location. + */ + virtual double + get_element(const std::vector& indices + /**< [in] List of indices, one for each grid dimension. */ ) const =0; + +#ifndef SWIG + /// Get the value of one grid element. + /** + Provide indices in a list in the same order returned by get_dim_names(). + Indices are relative to the *overall* problem domain. + Index values must fall within the allocated space as returned by + get_first_rank_alloc_index() and get_last_rank_alloc_index() for + each dimension. 
+ @note The return value is a double-precision floating-point value, but + it will be converted from a single-precision if + yk_solution::get_element_bytes() returns 4. + @note This version is not available (or needed) in SWIG-based APIs, e.g., Python. + @returns value in grid at given multi-dimensional location. + */ + virtual double + get_element(const std::initializer_list& indices + /**< [in] List of indices, one for each grid dimension. */ ) const =0; +#endif + + /// Get grid elements within specified subset of the grid. + /** + Reads all elements from `first_indices` to `last_indices` in each dimension + and writes them to consecutive memory locations in the buffer. + Indices in the buffer progress in row-major order. + The buffer pointed to must contain the number of bytes equal to + yk_solution::get_element_bytes() multiplied by the number of + elements in the specified slice. + Since the reads proceed in row-major order, the last index is "unit-stride" + in the buffer. + Provide indices in two lists in the same order returned by get_dim_names(). + Indices are relative to the *overall* problem domain. + Index values must fall within the allocated space as returned by + get_first_rank_alloc_index() and get_last_rank_alloc_index() for + each dimension. + @returns Number of elements read. + */ + virtual idx_t + get_elements_in_slice(void* buffer_ptr + /**< [out] Pointer to buffer where values will be written. */, + const std::vector& first_indices + /**< [in] List of initial indices, one for each grid dimension. */, + const std::vector& last_indices + /**< [in] List of final indices, one for each grid dimension. */ ) const =0; + + /// Set the value of one grid element. + /** + Provide indices in a list in the same order returned by get_dim_names(). + Indices are relative to the *overall* problem domain. + Index values must fall within the allocated space as returned by + get_first_rank_alloc_index() and get_last_rank_alloc_index() for + each dimension. 
+ @note The parameter value is a double-precision floating-point value, but + it will be converted to single-precision if + yk_solution::get_element_bytes() returns 4. + If storage has not been allocated for this grid, this will have no effect. + @returns Number of elements set. + */ + virtual idx_t + set_element(double val /**< [in] Element in grid will be set to this. */, + const std::vector& indices + /**< [in] List of indices, one for each grid dimension. */, + bool strict_indices = false + /**< [in] If true, indices must be within domain or padding. + If false, indices outside of domain and padding result + in no change to grid. */ ) =0; + +#ifndef SWIG + /// Set the value of one grid element. + /** + Provide the number of indices equal to the number of dimensions in the grid. + Indices beyond that will be ignored. + Indices are relative to the *overall* problem domain. + If any index values fall outside of the allocated space as returned by + get_first_rank_alloc_index() and get_last_rank_alloc_index() for + each dimension, this will have no effect. + @note The parameter value is a double-precision floating-point value, but + it will be converted to single-precision if + yk_solution::get_element_bytes() returns 4. + If storage has not been allocated for this grid, this will have no effect. + @note This version is not available (or needed) in SWIG-based APIs, e.g., Python. + @returns Number of elements set. + */ + virtual idx_t + set_element(double val /**< [in] Element in grid will be set to this. */, + const std::initializer_list& indices + /**< [in] List of indices, one for each grid dimension. */, + bool strict_indices = false + /**< [in] If true, indices must be within domain or padding. + If false, indices outside of domain and padding result + in no change to grid. */ ) =0; +#endif + /// Atomically add to the value of one grid element. + /** + Provide indices in a list in the same order returned by get_dim_names(). 
+ Indices are relative to the *overall* problem domain. + Index values must fall within the allocated space as returned by + get_first_rank_alloc_index() and get_last_rank_alloc_index() for + each dimension if `strict_indices` is set to true. + Updates are OpenMP atomic, meaning that this function can be called by + several OpenMP threads without causing a race condition. + @note The parameter value is a double-precision floating-point value, but + it will be converted to single-precision if + yk_solution::get_element_bytes() returns 4. + If storage has not been allocated for this grid, this will have no effect. + @returns Number of elements updated. + */ + virtual idx_t + add_to_element(double val /**< [in] This value will be added to element in grid. */, + const std::vector& indices + /**< [in] List of indices, one for each grid dimension. */, + bool strict_indices = false + /**< [in] If true, indices must be within domain or padding. + If false, indices outside of domain and padding result + in no change to grid. */ ) =0; + +#ifndef SWIG + /// Atomically add to the value of one grid element. + /** + Provide the number of indices equal to the number of dimensions in the grid. + Indices beyond that will be ignored. + Indices are relative to the *overall* problem domain. + Index values must fall within the allocated space as returned by + get_first_rank_alloc_index() and get_last_rank_alloc_index() for + each dimension if `strict_indices` is set to true. + Updates are OpenMP atomic, meaning that this function can be called by + several OpenMP threads without causing a race condition. + @note The parameter value is a double-precision floating-point value, but + it will be converted to single-precision if + yk_solution::get_element_bytes() returns 4. + If storage has not been allocated for this grid, this will have no effect. + @note This version is not available (or needed) in SWIG-based APIs, e.g., Python. + @returns Number of elements set. 
+ */ + virtual idx_t + add_to_element(double val /**< [in] This value will be added to element in grid. */, + const std::initializer_list& indices + /**< [in] List of indices, one for each grid dimension. */, + bool strict_indices = false + /**< [in] If true, indices must be within domain or padding. + If false, indices outside of domain and padding result + in no change to grid. */ ) =0; +#endif + + /// Initialize all grid elements to the same value. + /** + Sets all allocated elements, including those in the domain and padding + area to the same specified value. + @note The parameter is a double-precision floating-point value, but + it will be converted to single-precision if + yk_solution::get_element_bytes() returns 4. + @note If storage has not been allocated via yk_solution::prepare_solution(), + this will have no effect. + */ + virtual void + set_all_elements_same(double val /**< [in] All elements will be set to this. */ ) =0; + + /// Initialize grid elements within specified subset of the grid to the same value. + /** + Sets all elements from `first_indices` to `last_indices` in each dimension to the + specified value. + Provide indices in two lists in the same order returned by get_dim_names(). + Indices are relative to the *overall* problem domain. + Index values must fall within the allocated space as returned by + get_first_rank_alloc_index() and get_last_rank_alloc_index() for + each dimension. + If storage has not been allocated for this grid, this will have no effect. + @returns Number of elements set. + */ + virtual idx_t + set_elements_in_slice_same(double val /**< [in] All elements in the slice will be set to this. */, + const std::vector& first_indices + /**< [in] List of initial indices, one for each grid dimension. */, + const std::vector& last_indices + /**< [in] List of final indices, one for each grid dimension. */, + bool strict_indices = false + /**< [in] If true, indices must be within domain or padding. 
+ If false, only elements within the allocation of this grid
+ will be set, and elements outside will be ignored. */ ) =0;
+
+ /// Set grid elements within specified subset of the grid.
+ /**
+ Reads elements from consecutive memory locations,
+ starting at `buffer_ptr`
+ and writes them from `first_indices` to `last_indices` in each dimension.
+ Indices in the buffer progress in row-major order.
+ The buffer pointed to must contain either 4 or 8 byte FP values per element in the
+ subset, depending on the FP precision of the solution.
+ The buffer pointed to must contain the number of FP values in the specified slice,
+ where each FP value is the size of yk_solution::get_element_bytes().
+ Since the writes proceed in row-major order, the last index is "unit-stride"
+ in the buffer.
+ Provide indices in two lists in the same order returned by get_dim_names().
+ Indices are relative to the *overall* problem domain.
+ Index values must fall within the allocated space as returned by
+ get_first_rank_alloc_index() and get_last_rank_alloc_index() for
+ each dimension.
+ If storage has not been allocated for this grid, this will have no effect.
+ @returns Number of elements written.
+ */
+ virtual idx_t
+ set_elements_in_slice(const void* buffer_ptr
+ /**< [in] Pointer to buffer from which values will be read. */,
+ const std::vector<idx_t>& first_indices
+ /**< [in] List of initial indices, one for each grid dimension. */,
+ const std::vector<idx_t>& last_indices
+ /**< [in] List of final indices, one for each grid dimension. */ ) =0;
+
+ /// Format the indices for pretty-printing.
+ /**
+ Provide indices in a list in the same order returned by get_dim_names().
+ @returns A string containing the grid name and the index values.
+ */
+ virtual std::string
+ format_indices(const std::vector<idx_t>& indices
+ /**< [in] List of indices, one for each grid dimension. */ ) const =0;
+
+#ifndef SWIG
+ /// Format the indices for pretty-printing.
+ /** + Provide indices in a list in the same order returned by get_dim_names(). + @note This version is not available (or needed) in SWIG-based APIs, e.g., Python. + @returns A string containing the grid name and the index values. + */ + virtual std::string + format_indices(const std::initializer_list& indices + /**< [in] List of indices, one for each grid dimension. */ ) const =0; +#endif + + /// Determine whether storage has been allocated. + /** + @returns `true` if storage has been allocated, + `false` otherwise. + */ + virtual bool + is_storage_allocated() const =0; + + /// Determine size of raw storage in bytes. + /** + @returns Minimum number of bytes required for + storage given the current domain size and padding settings. + */ + virtual idx_t + get_num_storage_bytes() const =0; + + /// Determine size of raw storage in elements. + /** + @returns get_num_storage_bytes() / yk_solution.get_element_bytes(). + */ + virtual idx_t + get_num_storage_elements() const =0; + + /* Advanced APIs for yk_grid found below are not needed for most applications. */ + + /// **[Advanced]** Set the default preferred NUMA node on which to allocate data. + /** + This value is used when allocating data for this grid. + Thus, the desired NUMA policy must be set before calling alloc_data() + or yk_solution::prepare_solution(). + */ + virtual void + set_numa_preferred(int numa_node + /**< [in] Preferred NUMA node. + See yk_solution::set_default_numa_preferred() for other options. */) =0; + + /// **[Advanced]** Get the default preferred NUMA node on which to allocate data. + /** + @returns Current setting of preferred NUMA node for this grid. + */ + virtual int + get_numa_preferred() const =0; + + /// **[Advanced]** Set the left halo size in the specified dimension. + /** + This value is typically set by the stencil compiler, but + this function allows you to override that value. 
+ If the left halo is set to a value larger than the left padding size, the
+ left padding size will be automatically increased to accommodate it.
+ @note After data storage has been allocated, the left halo size
+ can only be set to a value less than or equal to the left padding size
+ in the given dimension.
+ */
+ virtual void
+ set_left_halo_size(const std::string& dim
+ /**< [in] Name of dimension to set.
+ Must be one of
+ the names from yk_solution::get_domain_dim_names(). */,
+ idx_t size
+ /**< [in] Number of elements in the left halo. */ ) =0;
+
+ /// **[Advanced]** Set the right halo size in the specified dimension.
+ /**
+ This value is typically set by the stencil compiler, but
+ this function allows you to override that value.
+ If the right halo is set to a value larger than the right padding size, the
+ right padding size will be automatically increased to accommodate it.
+ @note After data storage has been allocated, the right halo size
+ can only be set to a value less than or equal to the right padding size
+ in the given dimension.
+ */
+ virtual void
+ set_right_halo_size(const std::string& dim
+ /**< [in] Name of dimension to set.
+ Must be one of
+ the names from yk_solution::get_domain_dim_names(). */,
+ idx_t size
+ /**< [in] Number of elements in the right halo. */ ) =0;
+
+ /// **[Advanced]** Set the left and right halo sizes in the specified dimension.
+ /**
+ Alias for set_left_halo_size(dim, size); set_right_halo_size(dim, size).
+ */
+ virtual void
+ set_halo_size(const std::string& dim
+ /**< [in] Name of dimension to set.
+ Must be one of
+ the names from yk_solution::get_domain_dim_names(). */,
+ idx_t size
+ /**< [in] Number of elements in the halo. */ ) =0;
+
+
+ /// **[Advanced]** Set the number of elements to allocate in the specified dimension.
+ /**
+ This setting is only allowed in the step dimension.
+ Typically, the allocation in the step dimension is determined by the + stencil compiler, but + this function allows you to override that value. + Allocations in other dimensions should be set indirectly + via the domain and padding sizes. + The allocation size cannot be changed after data storage + has been allocated for this grid. + */ + virtual void + set_alloc_size(const std::string& dim + /**< [in] Name of dimension to set. + Must *not* be one of + the names from yk_solution::get_domain_dim_names(). */, + idx_t size /**< [in] Number of elements to allocate. */ ) =0; + + /// **[Advanced]** Set the first index of a specified miscellaneous dimension. + /** + Sets the first allowed index in a non-step and non-domain dimension. + After calling this function, the last allowed index will be the first index + as set by this function plus the allocation size set by set_alloc_size() + minus one. + */ + virtual void + set_first_misc_index(const std::string& dim + /**< [in] Name of dimension to get. Must be one of + the names from yk_solution::get_misc_dim_names(). */, + idx_t idx /**< [in] New value for first index. + May be negative. */ ) =0; + + /// **[Advanced]** Get the first accessible index in this grid in this rank in the specified dimension. + /** + This returns the first *overall* index allowed in this grid. + This element may be in the domain, left halo, or extra left padding area. + This function is only for checking the legality of an index. + @returns First allowed index in this grid. + */ + virtual idx_t + get_first_rank_alloc_index(const std::string& dim + /**< [in] Name of dimension to get. + Must be one of + the names from yk_solution::get_domain_dim_names(). */ ) const =0; + + /// **[Advanced]** Get the last accessible index in this grid in this rank in the specified dimension. + /** + This returns the last *overall* index allowed in this grid. + This element may be in the domain, right halo, or extra right padding area. 
+ This function is only for checking the legality of an index. + @returns Last allowed index in this grid. + */ + virtual idx_t + get_last_rank_alloc_index(const std::string& dim + /**< [in] Name of dimension to get. + Must be one of + the names from yk_solution::get_domain_dim_names(). */ ) const =0; + + /// **[Advanced]** Explicitly allocate data-storage memory for this grid. + /** + Amount of allocation is calculated based on domain, padding, and + step-dimension allocation sizes. + Any pre-existing storage will be released before allocation as via release_storage(). + See allocation options in the "Detailed Description" for \ref yk_grid. + */ + virtual void + alloc_storage() =0; + + /// **[Advanced]** Explicitly release any allocated data-storage for this grid. + /** + This will release storage allocated via any of the options + described in the "Detailed Description" for \ref yk_grid. + If the data was shared between two or more grids, the data will + be retained by the remaining grids. + */ + virtual void + release_storage() =0; + + /// **[Advanced]** Determines whether storage layout is the same as another grid. + /** + In order for the storage layout to be identical, the following + must be the same: + - Number of dimensions. + - Name of each dimension, in the same order. + - Allocation size in each dimension. + - Rank domain size in each domain dimension. + - Padding size in each domain dimension. + + The following do not have to be identical: + - Halo size. + + @returns `true` if storage for this grid has the same layout as + `other` or `false` otherwise. + */ + virtual bool + is_storage_layout_identical(const yk_grid_ptr other) const =0; + + /// **[Advanced]** Use existing data-storage from specified grid. + /** + This is an alternative to allocating data storage via + yk_solution::prepare_solution() or alloc_storage(). + In this case, data from a grid in this or another solution will be shared with + this grid. 
+ In order to successfully share storage, the following conditions must hold: + - The source grid must already have storage allocated. + - The two grids must have the same dimensions in the same order. + - The two grids must have the same domain sizes in all domain dimensions. + - The two grids must have the same allocation sizes in non-domain dimensions. + - The required padding size of this grid must be less than or + equal to the actual padding size of the source grid in all domain + dimensions. The required padding size of this grid will be equal to + or greater than its halo size. It is not strictly necessary that the + two grids have the same halo sizes, but that is a sufficient condition. + + Any pre-existing storage will be released before allocation as via release_storage(). + The padding size(s) of this grid will be set to that of the source grid. + After calling share_storage(), changes in one grid via set_all_elements() + or set_element() will be visible in the other grid. + + See allocation options and more information about grid sizes + in the "Detailed Description" for \ref yk_grid. + */ + virtual void + share_storage(yk_grid_ptr source + /**< [in] Grid from which storage will be shared. */) =0; + + /// **[Advanced]** Get pointer to raw data storage buffer. + /** + The following assumptions about the contents of data are safe: + - Each FP element starts at a number of bytes from the beginning + of the buffer which is a multiple of yk_solution::get_element_bytes(). + - All the FP elements will be located within get_num_storage_bytes() + bytes from the beginning of the buffer. + - A call to set_all_elements_same() will initialize all elements + within get_num_storage_bytes() bytes from the beginning of the buffer. + - If is_storage_layout_identical() returns `true` between this + and some other grid, any given element index applied to both grids + will refer to an element at the same offset into their respective + data buffers. 
+
+ Thus,
+ - You can perform element-wise unary mathematical operations on
+ all elements of a grid via its raw buffer, e.g., add some constant
+ value to all elements.
+ - If the layouts of two grids are identical, you can use their
+ raw buffers to copy or compare the grid contents for equality or
+ perform element-wise binary mathematical operations on them,
+ e.g., add all elements from one grid to another.
+
+ The following assumptions are not safe:
+ - Any expectations regarding the relationship between an element
+ index and that element's offset from the beginning of the buffer
+ such as row-major or column-major layout.
+ - All elements in the buffer are part of the rank domain or halo.
+
+ Thus,
+ - You should not perform any operations dependent on
+ the logical indices of any element via raw buffer, e.g., matrix
+ multiply.
+
+ @returns Pointer to raw data storage if is_storage_allocated()
+ returns `true` or NULL otherwise.
+ */
+ virtual void* get_raw_storage_buffer() =0;
+
+ /* Deprecated APIs for yk_grid found below should be avoided.
+ Use the more explicit form found in the documentation. */
+
+ /// **[Deprecated]** Get the left halo size in the specified dimension.
+ /**
+ Alias for get_left_halo_size(dim).
+ @returns Elements in halo in given dimension before the domain.
+ */
+ virtual idx_t
+ get_halo_size(const std::string& dim
+ /**< [in] Name of dimension to get.
+ Must be one of
+ the names from yk_solution::get_domain_dim_names(). */ ) const =0;
+
+ /// **[Deprecated]** Get the left padding in the specified dimension.
+ /**
+ Alias for get_left_pad_size(dim).
+ @returns Elements in left padding in given dimension.
+ */
+ virtual idx_t
+ get_pad_size(const std::string& dim
+ /**< [in] Name of dimension to get.
+ Must be one of
+ the names from yk_solution::get_domain_dim_names(). */ ) const =0;
+
+ /// **[Deprecated]** Get the extra left padding in the specified dimension.
+ /**
+ Alias for get_left_extra_pad_size(dim).
+ @returns Elements in padding in given dimension before the + left halo region. + */ + virtual idx_t + get_extra_pad_size(const std::string& dim + /**< [in] Name of dimension to get. + Must be one of + the names from yk_solution::get_domain_dim_names(). */ ) const =0; + + }; + + +} // namespace yask. + +#endif diff --git a/include/yk_solution_api.hpp b/include/yk_solution_api.hpp new file mode 100644 index 00000000..130b2858 --- /dev/null +++ b/include/yk_solution_api.hpp @@ -0,0 +1,836 @@ +/***************************************************************************** + +YASK: Yet Another Stencil Kernel +Copyright (c) 2014-2018, Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to +deal in the Software without restriction, including without limitation the +rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +sell copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +* The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +IN THE SOFTWARE. + +*****************************************************************************/ + +///////// API for the YASK stencil kernel solution. //////////// + +// This file uses Doxygen 1.8 markup for API documentation-generation. +// See http://www.stack.nl/~dimitri/doxygen. 
+/** @file yk_solution_api.hpp */ + +#ifndef YK_SOLN_API +#define YK_SOLN_API + +#include "yask_kernel_api.hpp" + +namespace yask { + + /// Allocate grids on local NUMA node. + /** + This is used in yk_solution::set_default_numa_preferred + and yk_grid::set_numa_preferred. + In Python, specify as `yask_kernel.cvar.yask_numa_local`. + */ + const int yask_numa_local = -1; + + /// Allocate grids across all available NUMA nodes. + /** + This is used in yk_solution::set_default_numa_preferred + and yk_grid::set_numa_preferred. + In Python, specify as `yask_kernel.cvar.yask_numa_interleave`. + */ + const int yask_numa_interleave = -2; + + /// Do not specify any NUMA binding. + /** + This is used in yk_solution::set_default_numa_preferred + and yk_grid::set_numa_preferred. + In Python, specify as `yask_kernel.cvar.yask_numa_none`. + */ + const int yask_numa_none = -9; + + /// Stencil solution as defined by the generated code from the YASK stencil compiler. + /** + Objects of this type contain all the grids and equations + that comprise a solution. + */ + class yk_solution { + public: + virtual ~yk_solution() {} + + /// Set object to receive debug output. + virtual void + set_debug_output(yask_output_ptr debug + /**< [out] Pointer to object to receive debug output. + See \ref yask_output_factory. */ ) =0; + + /// Get the name of the solution. + /** + @returns String containing the solution name provided during stencil compilation. + */ + virtual const std::string& + get_name() const =0; + + /// Get the floating-point precision size. + /** + @returns Number of bytes in each FP element: 4 or 8. + */ + virtual int + get_element_bytes() const =0; + + /// Get the solution step dimension. + /** + @returns String containing the step-dimension name. + */ + virtual std::string + get_step_dim_name() const =0; + + /// Get the number of domain dimensions used in this solution. + /** + The domain dimensions are those over which the stencil is + applied in each step. 
+ Does *not* include the step dimension or any miscellaneous dimensions. + @returns Number of dimensions that define the problem domain. + */ + virtual int + get_num_domain_dims() const =0; + + /// Get all the domain dimension names. + /** + @returns List of all domain-dimension names. + */ + virtual std::vector + get_domain_dim_names() const =0; + + /// Get all the miscellaneous dimension names. + /** + @returns List of all dimension names used in the solution + that are not step or domain dimensions. + */ + virtual std::vector + get_misc_dim_names() const =0; + + /// Set the size of the solution domain for this rank. + /** + The domain defines the number of elements that will be evaluated with the stencil(s). + If MPI is not enabled, this is the entire problem domain. + If MPI is enabled, this is the domain for the current rank only, + and the problem domain consists of the sum of all rank domains + in each dimension (weak-scaling). + The domain size in each rank does not have to be the same, but + all domains in the same column must have the same width, + all domains in the same row must have the same height, + and so forth, for each domain dimension. + The domain size does *not* include the halo region or any padding. + For best performance, set the rank domain + size to a multiple of the number of elements in a vector-cluster in + each dimension whenever possible. + See the "Detailed Description" for \ref yk_grid for more information on grid sizes. + There is no domain-size setting allowed in the + solution-step dimension (usually "t"). + */ + virtual void + set_rank_domain_size(const std::string& dim + /**< [in] Name of dimension to set. Must be one of + the names from get_domain_dim_names(). */, + idx_t size /**< [in] Elements in the domain in this `dim`. */ ) =0; + + /// Get the domain size for this rank. + /** + @returns Current setting of rank domain size in specified dimension. 
+ */
+ virtual idx_t
+ get_rank_domain_size(const std::string& dim
+ /**< [in] Name of dimension to get. Must be one of
+ the names from get_domain_dim_names(). */) const =0;
+
+ /// Set the block size in the given dimension.
+ /**
+ This sets the approximate number of elements that are evaluated in
+ each "block".
+ This is a performance setting and should not affect the functional
+ correctness or total number of elements evaluated.
+ A block is typically the unit of work done by a
+ top-level OpenMP thread. The actual number of elements evaluated
+ in a block may be greater than the specified size due to rounding
+ up to fold-cluster sizes. The number of elements in a block may
+ also be smaller than the specified size when the block is at the
+ edge of the domain. The block size cannot be set in the
+ solution-step dimension (because temporal blocking is not yet enabled).
+
+ Unless auto-tuning is disabled, the block size will be used as
+ a starting point for an automated search for a higher-performing
+ block size.
+ */
+ virtual void
+ set_block_size(const std::string& dim
+ /**< [in] Name of dimension to set. Must be one of
+ the names from get_domain_dim_names(). */,
+ idx_t size
+ /**< [in] Elements in a block in this `dim`. */ ) =0;
+
+ /// Get the block size.
+ /**
+ Returned value may be slightly larger than the value provided
+ via set_block_size() due to rounding.
+ @returns Current setting of block size.
+ */
+ virtual idx_t
+ get_block_size(const std::string& dim
+ /**< [in] Name of dimension to get. Must be one of
+ the names from get_domain_dim_names(). */) const =0;
+
+ /// Set the number of MPI ranks in the given dimension.
+ /**
+ The *product* of the number of ranks across all dimensions must
+ equal yk_env::get_num_ranks().
+ The current MPI rank will be assigned a unique location
+ within the overall problem domain based on its MPI rank index.
+ The same number of MPI ranks must be set via this API on each + constituent MPI rank to ensure a consistent overall configuration. + The number of ranks in each dimension must be properly set + before calling yk_solution::prepare_solution(). + There is no rank setting allowed in the + solution-step dimension (usually "t"). + */ + virtual void + set_num_ranks(const std::string& dim + /**< [in] Name of dimension to set. Must be one of + the names from get_domain_dim_names(). */, + idx_t num /**< [in] Number of ranks in `dim`. */ ) =0; + + /// Get the number of MPI ranks in the given dimension. + /** + @returns Current setting of rank size. + */ + virtual idx_t + get_num_ranks(const std::string& dim + /**< [in] Name of dimension to get. Must be one of + the names from get_domain_dim_names(). */) const =0; + + /// Get the rank index in the specified dimension. + /** + The overall rank indices in the specified dimension will range from + zero (0) to get_num_ranks() - 1, inclusive. + @returns Zero-based index of this rank. + */ + virtual idx_t + get_rank_index(const std::string& dim + /**< [in] Name of dimension to get. Must be one of + the names from get_domain_dim_names(). */ ) const =0; + + /// Get the number of grids in the solution. + /** + Grids may be pre-defined by the stencil compiler + (e.g., via yc_solution::new_grid()) + or created explicitly via yk_solution::new_grid(). + @returns Number of grids that have been created. + */ + virtual int + get_num_grids() const =0; + + /// Get the specified grid. + /** + This cannot be used to access scratch grids. + @returns Pointer to the specified grid or null pointer if it does not exist. + */ + virtual yk_grid_ptr + get_grid(const std::string& name /**< [in] Name of the grid. */ ) =0; + + /// Get all the grids. + /** + @returns List of all non-scratch grids in the solution. + */ + virtual std::vector + get_grids() =0; + + /// Prepare the solution for stencil application. 
+ /**
+ Allocates data in grids that do not already have storage allocated.
+ Calculates the position of each rank in the overall problem domain.
+ Sets many other data structures needed for proper stencil application.
+ Since this function initiates MPI communication, it must be called
+ on all MPI ranks, and it will block until all ranks have completed.
+ Must be called before applying any stencils.
+ */
+ virtual void
+ prepare_solution() =0;
+
+ /// Get the first index of the sub-domain in this rank in the specified dimension.
+ /**
+ This returns the first *overall* index at the beginning of the domain.
+ Elements within the domain in this rank lie between the values returned by
+ get_first_rank_domain_index() and get_last_rank_domain_index(), inclusive.
+ If there is only one MPI rank, this is typically zero (0).
+ If there is more than one MPI rank, the value depends
+ on the rank's position within the overall problem domain.
+
+ @note This function should be called only *after* calling prepare_solution()
+ because prepare_solution() assigns this rank's position in the problem domain.
+ @returns First domain index in this rank.
+ */
+ virtual idx_t
+ get_first_rank_domain_index(const std::string& dim
+ /**< [in] Name of dimension to get. Must be one of
+ the names from get_domain_dim_names(). */ ) const =0;
+
+ /// Get the last index of the sub-domain in this rank in the specified dimension.
+ /**
+ This returns the last *overall* index within the domain in this rank
+ (*not* one past the end).
+ If there is only one MPI rank, this is typically one less than the value
+ provided by set_rank_domain_size().
+ If there is more than one MPI rank, the value depends
+ on the rank's position within the overall problem domain.
+ See get_first_rank_domain_index() for more information.
+
+ @note This function should be called only *after* calling prepare_solution()
+ because prepare_solution() assigns this rank's position in the problem domain. 
+ @returns Last index in this rank. + */ + virtual idx_t + get_last_rank_domain_index(const std::string& dim + /**< [in] Name of dimension to get. Must be one of + the names from get_domain_dim_names(). */ ) const =0; + + /// Get the overall problem size in the specified dimension. + /** + The overall domain indices in the specified dimension will range from + zero (0) to get_overall_domain_size() - 1, inclusive. + Call get_first_rank_domain_index() and get_last_rank_domain_index() + to find the subset of this domain in each rank. + + @note This function should be called only *after* calling prepare_solution() + because prepare_solution() obtains the sub-domain sizes from other ranks. + @returns Sum of all ranks' domain sizes in the given dimension. + */ + virtual idx_t + get_overall_domain_size(const std::string& dim + /**< [in] Name of dimension to get. Must be one of + the names from get_domain_dim_names(). */ ) const =0; + + /// Run the stencil solution for the specified steps. + /** + The stencil(s) in the solution are applied to the grid data, setting the + index variables as follows: + 1. If temporal wave-fronts are *not* used (the default): + - The step index (e.g., `t` for "time") will be sequentially set to values + from `first_step_index` to `last_step_index`, inclusive. + + If the stencil equations were defined with dependencies on lower-valued steps, + e.g., `t+1` depends on `t`, then `last_step_index` should be greater than or equal to + `first_step_index` (forward solution). + + If the stencil equations were defined with dependencies on higher-valued steps, + e.g., `t-1` depends on `t`, then `last_step_index` should be less than or equal to + `first_step_index` (reverse solution). + - For each step index, the domain indices will be set + to values across the entire domain as returned by yk_solution::get_overall_domain_size() + (not necessarily sequentially). + - MPI halo exchanges will occur as necessary before, after, or during a step. 
+ - Since this function initiates MPI communication, it must be called
+ on all MPI ranks, and it will block until all ranks have completed.
+ 2. **[Advanced]** If temporal wave-fronts *are* enabled (currently only possible via apply_command_line_options()):
+ - The step index (e.g., `t` for "time") will be sequentially set to values
+ from `first_step_index` to `last_step_index`, inclusive, within each wave-front tile.
+ + The number of steps in a wave-front tile may also be restricted by the size
+ of the tile in the step dimension. In that case, tiles will be done in slices of that size.
+ + Reverse solutions are not allowed with wave-front tiling.
+ - For each step index within each wave-front tile, the domain indices will be set
+ to values across the entire tile (not necessarily sequentially).
+ - Ultimately, the stencil(s) will be applied to the same elements in both the step
+ and domain dimensions as when wave-front tiling is not used.
+ - MPI is not supported with wave-front tiling.
+
+ This function should be called only *after* calling prepare_solution().
+ */
+ virtual void
+ run_solution(idx_t first_step_index /**< [in] First index in the step dimension */,
+ idx_t last_step_index /**< [in] Last index in the step dimension */ ) =0;
+
+ /// Run the stencil solution for the specified step.
+ /**
+ This function is simply an alias for `run_solution(step_index, step_index)`, i.e.,
+ the solution will be applied for exactly one step across the domain.
+
+ Typical C++ usage:
+
+ \code{.cpp}
+ soln->prepare_solution();
+ for (idx_t t = 1; t <= num_steps; t++)
+ soln->run_solution(t);
+ soln->end_solution();
+ \endcode
+
+ As written, the above loop is identical to
+
+ \code{.cpp}
+ soln->prepare_solution();
+ soln->run_solution(1, num_steps);
+ soln->end_solution();
+ \endcode
+
+ @note The parameter is *not* the number of steps to run.
+ @note Since only one step is taken per call, using this function effectively disables
+ wave-front tiling. 
+ */ + virtual void + run_solution(idx_t step_index /**< [in] Index in the step dimension */ ) =0; + + /// Finish using a solution. + /** + Performs a final MPI halo exchange. + Releases shared ownership of memory used by the grids. This will + result in deallocating each memory block that is not + referenced by another shared pointer. + */ + virtual void + end_solution() =0; + + + /// Get performance statistics associated with preceding calls to run_solution(). + /** + Side effect: resets all statistics, so a subsequent call will + measure performance after the current call. + @returns Pointer to statistics object. + */ + virtual yk_stats_ptr + get_stats() =0; + + /// Determine whether the auto-tuner is enabled on this rank. + /** + The auto-tuner is enabled by default. + It will become disabled after it has converged or after reset_auto_tuner(false) has been called. + @returns Whether the auto-tuner is still searching. + */ + virtual bool + is_auto_tuner_enabled() =0; + + /* Advanced APIs for yk_solution found below are not needed for most applications. */ + + /// **[Advanced]** Set the minimum amount of grid padding for all grids. + /** + This sets the minimum number of elements in each grid that is + reserved outside of the rank domain in the given dimension. + This padding area can be used for required halo regions. At + least the specified number of elements will be added to both + sides, i.e., both "before" and "after" the domain. + + The *actual* padding size will be the largest of the following values, + additionally rounded up based on the vector-folding dimensions + and/or cache-line alignment: + - Halo size. + - Value provided by any of the pad-size setting functions. + + The padding size cannot be changed after data storage + has been allocated for a given grid; attempted changes to the pad size for such + grids will be ignored. + In addition, once a grid's padding is set, it cannot be reduced, only increased. 
+ Call yk_grid::get_pad_size() to determine the actual padding size for a given grid. + See the "Detailed Description" for \ref yk_grid for more information on grid sizes. + There is no padding allowed in the solution-step dimension (usually "t"). + */ + virtual void + set_min_pad_size(const std::string& dim + /**< [in] Name of dimension to set. Must + be one of the names from get_domain_dim_names(). */, + idx_t size + /**< [in] Elements in this `dim` applied + to both sides of the domain. */ ) =0; + + /// **[Advanced]** Get the minimum amount of grid padding for all grids. + /** + @returns Current setting of minimum amount of grid padding for all grids. + */ + virtual idx_t + get_min_pad_size(const std::string& dim + /**< [in] Name of dimension to get. Must be one of + the names from get_domain_dim_names(). */) const =0; + + /// **[Advanced]** Restart or disable the auto-tuner on this rank. + /** + Under normal operation, an auto-tuner is invoked automatically during calls to + run_solution(). + Currently, only the block size is set by the auto-tuner, and the search begins from the + sizes set via set_block_size() or the default size if set_block_size() has + not been called. + This function is used to apply the current best-known settings if the tuner has + been running, reset the state of the auto-tuner, and either + restart its search or disable it from running. + This call must be made on each rank where the change is desired. + */ + virtual void + reset_auto_tuner(bool enable + /**< [in] If _true_, start or restart the auto-tuner search. + If _false_, disable the auto-tuner from running. */, + bool verbose = false + /**< [in] If _true_, print progress information to the debug object + set via set_debug_output(). */ ) =0; + + /// **[Advanced]** Automatically tune selected settings immediately. + /** + Executes a search algorithm to find [locally] optimum values for some of the + settings. 
+ Under normal operation, an auto-tuner is invoked during calls to + run_solution(). + See reset_auto_tuner() for more information. + This function causes the stencil solution to be run immediately + until the auto-tuner converges on all ranks. + It is useful for benchmarking, where performance is to be timed + for a given number of steps after the best settings are found. + This function should be called only *after* calling prepare_solution(). + This call must be made on each rank. + @warning Modifies the contents of the grids by calling run_solution() + an arbitrary number of times, but without halo exchange. + (See run_solution() for other restrictions and warnings.) + Thus, grid data should be set *after* calling this function when + used in a production or test setting where correct results are expected. + */ + virtual void + run_auto_tuner_now(bool verbose = true + /**< [in] If _true_, print progress information to the debug object + set via set_debug_output(). */ ) =0; + + /// **[Advanced]** Add a new grid to the solution. + /** + This is typically not needed because grids used by the stencils are pre-defined + by the solution itself via the stencil compiler. + However, a grid may be created explicitly via this function + in order to use it for purposes other than by the + pre-defined stencils within the current solution. + + Grids created by this function will be treated like a pre-defined grid. + For example, + - For each domain dimension of the grid, + the new grid's domain size will be the same as that returned by + get_rank_domain_size(). + - Calls to set_rank_domain_size() will resize the corresponding domain + size in this grid. + - This grid's first domain index in this rank will be determined + by the position of this rank. + - This grid's initial padding size will be the same as that returned by + get_min_pad_size(). + - After creating a new grid, you can increase its padding + sizes in the domain dimensions via yk_grid::set_min_pad_size(), etc. 
+ - For step and misc dimensions, you can change the allocation via + yk_grid::set_alloc_size(). + + If you want a grid that is not automatically resized based on the + solution settings, use new_fixed_size_grid() instead. + + @note A new grid contains only the meta-data for the grid; data storage + is not yet allocated. + Storage may be allocated in any of the methods listed + in the "Detailed Description" for \ref yk_grid. + @returns Pointer to the new grid. + */ + virtual yk_grid_ptr + new_grid(const std::string& name + /**< [in] Name of the grid; must be unique + within the solution. */, + const std::vector& dims + /**< [in] List of names of all dimensions. + Names must be valid C++ identifiers and + not repeated within this grid. */ ) =0; + +#ifndef SWIG + /// **[Advanced]** Add a new grid to the solution. + /** + See documentation for the version of new_grid() with a vector of dimension names + as a parameter. + @note This version is not available (or needed) in SWIG-based APIs, e.g., Python. + @returns Pointer to the new grid. + */ + virtual yk_grid_ptr + new_grid(const std::string& name + /**< [in] Name of the grid; must be unique + within the solution. */, + const std::initializer_list& dims + /**< [in] List of names of all dimensions. + Names must be valid C++ identifiers and + not repeated within this grid. */ ) =0; +#endif + + /// **[Advanced]** Add a new grid to the solution with a specified size. + /** + This is typically not needed because grids used by the stencils are pre-defined + by the solution itself via the stencil compiler. + However, a grid may be created explicitly via this function + in order to use it for purposes other than by the + pre-defined stencils within the current solution. + + Unlike new_grid(), + grids created by this function will *not* be treated like a pre-defined grid. + For example, + - For each domain dimension of the grid, + the new grid's domain size is provided during creation and cannot be changed. 
+ - Calls to set_rank_domain_size() will *not* resize the corresponding domain
+ size in this grid.
+ - This grid's first domain index in this rank will be fixed at zero (0)
+ regardless of this rank's position.
+ - This grid's padding size will be affected only by calls to
+ yk_grid::set_min_pad_size(), etc.
+ - For step and misc dimensions, you can still change the allocation via
+ yk_grid::set_alloc_size().
+
+ @note A new grid contains only the meta-data for the grid; data storage
+ is not yet allocated.
+ Storage may be allocated in any of the methods listed
+ in the "Detailed Description" for \ref yk_grid.
+ @returns Pointer to the new grid.
+ */
+ virtual yk_grid_ptr
+ new_fixed_size_grid(const std::string& name
+ /**< [in] Name of the grid; must be unique
+ within the solution. */,
+ const std::vector<std::string>& dims
+ /**< [in] List of names of all dimensions.
+ Names must be valid C++ identifiers and
+ not repeated within this grid. */,
+ const std::vector<idx_t>& dim_sizes
+ /**< [in] Initial allocation in each dimension.
+ Must be exactly one size for each dimension. */ ) =0;
+
+#ifndef SWIG
+ /// **[Advanced]** Add a new grid to the solution with a specified size.
+ /**
+ See documentation for the version of new_fixed_size_grid() with a vector of dimension names
+ as a parameter.
+ @note This version is not available (or needed) in SWIG-based APIs, e.g., Python.
+ @returns Pointer to the new grid.
+ */
+ virtual yk_grid_ptr
+ new_fixed_size_grid(const std::string& name
+ /**< [in] Name of the grid; must be unique
+ within the solution. */,
+ const std::initializer_list<std::string>& dims
+ /**< [in] List of names of all dimensions.
+ Names must be valid C++ identifiers and
+ not repeated within this grid. */,
+ const std::initializer_list<idx_t>& dim_sizes
+ /**< [in] Initial allocation in each dimension.
+ Must be exactly one size for each dimension. */ ) =0;
+#endif
+
+ /// **[Advanced]** Set the default preferred NUMA node on which to allocate data. 
+ /** + This value is used when allocating grids and MPI buffers. + The NUMA "preferred node allocation" policy is used, meaning that + memory will be allocated in an alternative node if the preferred one + doesn't have enough space available or is otherwise restricted. + Instead of specifying a NUMA node, a special value may be used + to specify another policy as listed. + This setting may be overridden for any specific grid. + */ + virtual void + set_default_numa_preferred(int numa_node + /**< [in] Preferred NUMA node for data + allocation. Alternatively, use + `yask_numa_local` for explicit + local-node allocation, + `yask_numa_interleave` for + interleaving pages across all nodes, + or `yask_numa_none` for no explicit NUMA + policy. These constants are defined in + the _Variable Documentation_ section of + \ref yk_solution_api.hpp. */) =0; + + /// **[Advanced]** Get the default preferred NUMA node on which to allocate data. + /** + @returns Current setting of preferred NUMA node. + */ + virtual int + get_default_numa_preferred() const =0; + + /// **[Advanced]** Set performance parameters from an option string. + /** + Parses the string for options as if from a command-line. + Example: "-bx 64 -block_threads 4" sets the block-size in the *x* + dimension to 64 and the number of threads used to process each + block to 4. + See the help message from the YASK kernel binary for documentation + on the command-line options. + + @returns Any strings that were not recognized by the parser as options. + */ + virtual std::string + apply_command_line_options(const std::string& args + /**< [in] String of arguments to parse. */ ) =0; + + /// **[Advanced]** Get the specified stencil group. + /** + @returns Pointer to the specified \ref yk_stencil_group + or null pointer if it does not exist. + */ + virtual yk_stencil_group_ptr + get_stencil_group(const std::string& name + /**< [in] Name of the group. */ ) =0; + + /// **[Advanced]** Get all the stencil groups. 
+ /**
+ @returns List of all stencil groups in the solution.
+ */
+ virtual std::vector<yk_stencil_group_ptr>
+ get_stencil_groups() =0;
+
+ /// **[Advanced]** Run the specified stencil group over the given sub-domain.
+ /**
+ Applies all the stencil kernels in the given group
+ from `first_domain_indices` at `first_step_index`
+ to `last_domain_indices` at `last_step_index` (inclusive) in each dimension.
+ Each list of domain indices should contain the indices for the
+ dimensions returned by get_domain_dim_names() in the same order.
+
+ Indices are relative to the *overall* problem domain and
+ need not be limited to fall within the domain of the current MPI rank.
+ The actual points to which the group is applied on each rank will be
+ limited internally as needed.
+
+ Example C++ usage:
+
+ \code{.cpp}
+ // Find my custom stencil group created in the YASK compiler.
+ auto my_group = soln->get_stencil_group("my_group");
+ ...
+ soln->prepare_solution();
+ ...
+ // Set first_indices and last_indices to apply my_group
+ // to only the first slice in the "z" dimension.
+ std::vector<idx_t> first_indices, last_indices;
+ for (auto dim : soln->get_domain_dim_names()) {
+ auto overall_size = soln->get_overall_domain_size(dim);
+ first_indices.push_back(0);
+ if (dim == "z")
+ last_indices.push_back(0);
+ else
+ last_indices.push_back(overall_size - 1);
+ }
+ ...
+ // Execute the time-steps.
+ for (idx_t t = 0; t < num_steps; t++) {
+
+ // Apply the automatically-scheduled stencils.
+ soln->run_solution(t);
+
+ // Apply my custom stencil group.
+ soln->run_stencil_group(my_group,
+ t, first_indices,
+ t, last_indices);
+ }
+ soln->end_solution();
+ \endcode
+
+ @returns Number of points to which the group was applied.
+ */
+ virtual idx_t
+ run_stencil_group(yk_stencil_group_ptr stencil_group
+ /**< [in] Pointer to the stencil group obtained from
+ get_stencil_groups() or get_stencil_group(). */,
+ const std::vector<idx_t>& first_domain_indices
+ /**< [in] List of initial domain indices. 
*/,
+ const std::vector<idx_t>& last_domain_indices
+ /**< [in] List of final domain indices. */ ) =0;
+
+ /// **[Advanced]** Use data-storage from existing grids in specified solution.
+ /**
+ Calls yk_grid::share_storage() for each pair of grids that have the same name
+ in this solution and the source solution.
+ All conditions listed in yk_grid::share_storage() must hold for each pair.
+ */
+ virtual void
+ share_grid_storage(yk_solution_ptr source
+ /**< [in] Solution from which grid storage will be shared. */) =0;
+ };
+
+ /// Statistics from calls to run_solution().
+ /**
+ A throughput rate may be calculated by multiplying an
+ amount-of-work-per-step quantity by the number of steps done and
+ dividing by the number of seconds elapsed.
+ */
+ class yk_stats {
+ public:
+ virtual ~yk_stats() {}
+
+ /// Get the number of elements in the overall domain.
+ /**
+ @returns Product of all the overall domain sizes across all domain dimensions.
+ */
+ virtual idx_t
+ get_num_elements() =0;
+
+ /// Get the number of elements written in each step.
+ /**
+ @returns Number of elements written to each output grid.
+ This is the same value as get_num_elements() if there is only one output grid.
+ */
+ virtual idx_t
+ get_num_writes() =0;
+
+ /// Get the estimated number of floating-point operations required for each step.
+ /**
+ @returns Number of FP ops created by the stencil compiler.
+ It may be slightly more or less than the actual number of FP ops executed
+ by the CPU due to C++ compiler transformations.
+ */
+ virtual idx_t
+ get_est_fp_ops() =0;
+
+ /// Get the number of steps calculated via run_solution().
+ /**
+ @returns A positive number, regardless of whether run_solution() steps were executed
+ forward or backward.
+ */
+ virtual idx_t
+ get_num_steps_done() =0;
+
+ /// Get the number of seconds elapsed during calls to run_solution().
+ /**
+ @returns Only the time spent in run_solution(), not in any other code in your
+ application between calls. 
+ */ + virtual double + get_elapsed_run_secs() =0; + }; + + /// A group of stencil kernels. + /** + Groups of stencils are created automatically by the YASK stencil compiler + or manually via yc_solution::new_equation_group(). See the latter for + more information. + */ + class yk_stencil_group { + public: + virtual ~yk_stencil_group() {} + + /// Get the name of this group. + /** + @returns Default name given by the YASK stencil compiler + or the name provided via yc_solution::new_equation_group(). + */ + virtual const std::string& + get_name() const =0; + + /// Determine whether this group will be automatically scheduled. + /** + @returns `true` if this group will be run via yk_solution::run_solution() + or `false` if this group must be run via yk_solution::run_stencil_group(). + This is the `do_schedule` setting passed via yc_solution::new_equation_group(). + */ + virtual bool + is_scheduled() const =0; + + }; + +} // namespace yask. + +#endif diff --git a/src/common/common_utils.cpp b/src/common/common_utils.cpp index 28550e4b..1d683006 100644 --- a/src/common/common_utils.cpp +++ b/src/common/common_utils.cpp @@ -41,7 +41,7 @@ namespace yask { // for numbers above 9 (at least up to 99). // Format: "major.minor.patch". - const string version = "2.05.07"; + const string version = "2.06.00"; string yask_get_version_string() { return version; diff --git a/src/compiler/lib/Cpp.hpp b/src/compiler/lib/Cpp.hpp index 89bc299a..46e7f597 100644 --- a/src/compiler/lib/Cpp.hpp +++ b/src/compiler/lib/Cpp.hpp @@ -238,12 +238,12 @@ namespace yask { // Print out a stencil in C++ form for YASK. class YASKCppPrinter : public PrinterBase { protected: - EqGroups& _clusterEqGroups; + EqBundles& _clusterEqBundles; const Dimensions* _dims; string _context, _context_base; // Print an expression as a one-line C++ comment. - void addComment(ostream& os, EqGroup& eq); + void addComment(ostream& os, EqBundle& eq); // A factory method to create a new PrintHelper. 
// This can be overridden in derived classes to provide @@ -260,17 +260,17 @@ namespace yask { // Print pieces of YASK output. virtual void printMacros(ostream& os); virtual void printData(ostream& os); - virtual void printEqGroups(ostream& os); + virtual void printEqBundles(ostream& os); virtual void printContext(ostream& os); public: YASKCppPrinter(StencilSolution& stencil, - EqGroups& eqGroups, - EqGroups& clusterEqGroups, + EqBundles& eqBundles, + EqBundles& clusterEqBundles, const Dimensions* dims) : - PrinterBase(stencil, eqGroups), - _clusterEqGroups(clusterEqGroups), + PrinterBase(stencil, eqBundles), + _clusterEqBundles(clusterEqBundles), _dims(dims) { // name of C++ struct. diff --git a/src/compiler/lib/CppIntrin.hpp b/src/compiler/lib/CppIntrin.hpp index e9b48f76..86693c93 100644 --- a/src/compiler/lib/CppIntrin.hpp +++ b/src/compiler/lib/CppIntrin.hpp @@ -202,10 +202,10 @@ namespace yask { public: YASKKncPrinter(StencilSolution& stencil, - EqGroups& eqGroups, - EqGroups& clusterEqGroups, + EqBundles& eqBundles, + EqBundles& clusterEqBundles, const Dimensions* dims) : - YASKCppPrinter(stencil, eqGroups, clusterEqGroups, + YASKCppPrinter(stencil, eqBundles, clusterEqBundles, dims) { } virtual int num_vec_elems() const { return 64 / _settings._elem_bytes; } @@ -225,10 +225,10 @@ namespace yask { public: YASKAvx256Printer(StencilSolution& stencil, - EqGroups& eqGroups, - EqGroups& clusterEqGroups, + EqBundles& eqBundles, + EqBundles& clusterEqBundles, const Dimensions* dims) : - YASKCppPrinter(stencil, eqGroups, clusterEqGroups, dims) { } + YASKCppPrinter(stencil, eqBundles, clusterEqBundles, dims) { } virtual int num_vec_elems() const { return 32 / _settings._elem_bytes; } }; @@ -244,10 +244,10 @@ namespace yask { public: YASKAvx512Printer(StencilSolution& stencil, - EqGroups& eqGroups, - EqGroups& clusterEqGroups, + EqBundles& eqBundles, + EqBundles& clusterEqBundles, const Dimensions* dims) : - YASKCppPrinter(stencil, eqGroups, clusterEqGroups, + 
YASKCppPrinter(stencil, eqBundles, clusterEqBundles, dims) { } virtual int num_vec_elems() const { return 64 / _settings._elem_bytes; } diff --git a/src/compiler/lib/Eqs.cpp b/src/compiler/lib/Eqs.cpp index 0a8acac3..94e14f0a 100644 --- a/src/compiler/lib/Eqs.cpp +++ b/src/compiler/lib/Eqs.cpp @@ -23,7 +23,7 @@ IN THE SOFTWARE. *****************************************************************************/ -///////// Methods for equations and equation groups //////////// +///////// Methods for equations and equation bundles //////////// #include "Print.hpp" #include "ExprUtils.hpp" @@ -182,14 +182,12 @@ namespace yask { } // Find dependencies based on all eqs. - // If 'eq_deps' is set, save dependencies between eqs. // Side effect: sets _stepDir in dims. // Throws exceptions on illegal dependencies. // TODO: split this into smaller functions. // BIG-TODO: replace dependency algorithms with integration of a polyhedral // library. void Eqs::findDeps(Dimensions& dims, - EqDepMap* eq_deps, ostream& os) { auto& stepDim = dims._stepDim; @@ -380,21 +378,15 @@ namespace yask { } // Save dependency. - if (eq_deps) { #ifdef DEBUG_DEP - cout << " Exact match found to " << op1->makeQuotedStr() << ".\n"; + cout << " Exact match found to " << op1->makeQuotedStr() << ".\n"; #endif - (*eq_deps)[cur_step_dep].set_imm_dep_on(eq2, eq1); - } + _eq_deps[cur_step_dep].set_imm_dep_on(eq2, eq1); // Move along to next eq2. continue; } - // Check more only if saving dependencies. - if (!eq_deps) - continue; - // Next dep check: inexact matches on LHS of eq1 to RHS of eq2. // Does eq1 define *any* point in a grid that eq2 inputs // at the same step index? If so, they *might* have a @@ -443,12 +435,10 @@ namespace yask { } // Save dependency. 
- if (eq_deps) { #ifdef DEBUG_DEP - cout << " Likely match found to " << op1->makeQuotedStr() << ".\n"; + cout << " Likely match found to " << op1->makeQuotedStr() << ".\n"; #endif - (*eq_deps)[cur_step_dep].set_imm_dep_on(eq2, eq1); - } + _eq_deps[cur_step_dep].set_imm_dep_on(eq2, eq1); // Move along to next equation. break; @@ -463,11 +453,9 @@ namespace yask { } // for all eqs (eq1). // Resolve indirect dependencies. - if (eq_deps) { - os << " Resolving indirect dependencies...\n"; - for (DepType dt = DepType(0); dt < num_deps; dt = DepType(dt+1)) - (*eq_deps)[dt].find_all_deps(); - } + os << " Resolving indirect dependencies...\n"; + for (DepType dt = DepType(0); dt < num_deps; dt = DepType(dt+1)) + _eq_deps[dt].find_all_deps(); os << " Done with dependency analysis.\n"; } @@ -607,7 +595,7 @@ namespace yask { // Update access stats for the grids. // Also finds scratch-grid eqs needed for each non-scratch eq. - void Eqs::updateGridStats(EqDepMap& eq_deps) { + void Eqs::updateGridStats() { // Find all LHS and RHS points and grids for all eqs. PointVisitor pv; @@ -638,7 +626,7 @@ namespace yask { // 'eq1'. It's important to visit the eqs in dep order to // properly propagate halos sizes thru chains of scratch grids. if (!og1->isScratch()) { - eq_deps[cur_step_dep].visitDeps + _eq_deps[cur_step_dep].visitDeps // 'eq1' is 'b' or depends on 'b', immediately or indirectly. (eq1, [&](EqualsExprPtr b, EqDeps::EqVecSet& path) { @@ -719,9 +707,9 @@ namespace yask { } - // Get the full name of an eq-group. + // Get the full name of an eq-bundle. // Must be unique. - string EqGroup::getName() const { + string EqBundle::getName() const { // Add index to base name. ostringstream oss; @@ -729,11 +717,11 @@ namespace yask { return oss.str(); } - // Make a human-readable description of this eq group. - string EqGroup::getDescription(bool show_cond, + // Make a human-readable description of this eq bundle. 
+ string EqBundle::getDescription(bool show_cond, string quote) const { - string des = "equation-group " + quote + getName() + quote; + string des = "equation-bundle " + quote + getName() + quote; if (show_cond) { if (cond.get()) des += " w/condition " + cond->makeQuotedStr(quote); @@ -743,11 +731,11 @@ namespace yask { return des; } - // Add an equation to an EqGroup. - void EqGroup::addEq(EqualsExprPtr ee) + // Add an equation to an EqBundle. + void EqBundle::addEq(EqualsExprPtr ee) { -#ifdef DEBUG_EQ_GROUP - cout << "EqGroup: adding " << ee->makeQuotedStr() << endl; +#ifdef DEBUG_EQ_BUNDLE + cout << "EqBundle: adding " << ee->makeQuotedStr() << endl; #endif _eqs.insert(ee); @@ -755,7 +743,7 @@ namespace yask { PointVisitor pv; ee->accept(&pv); - // update list of input and output grids for this group. + // update list of input and output grids for this bundle. auto* outGrid = pv.getOutputGrids().at(ee.get()); _outGrids.insert(outGrid); auto& inGrids = pv.getInputGrids().at(ee.get()); @@ -764,8 +752,10 @@ namespace yask { } // Check for and set dependencies on eg2. - void EqGroup::checkDeps(Eqs& allEqs, EqDepMap& eq_deps, const EqGroup& eg2) + void EqBundle::checkDeps(Eqs& allEqs, const EqBundle& eg2) { + auto& eq_deps = allEqs.getDeps(); + // Eqs in this. for (auto& eq1 : getEqs()) { auto& sdeps1 = allEqs.getScratchDeps(eq1); @@ -776,13 +766,13 @@ namespace yask { for (DepType dt = DepType(0); dt < num_deps; dt = DepType(dt+1)) { // Immediate dep. - if (eq_deps[dt].is_imm_dep_on(eq1, eq2)) { + if (eq_deps.at(dt).is_imm_dep_on(eq1, eq2)) { _imm_dep_on[dt].insert(eg2.getName()); _dep_on[dt].insert(eg2.getName()); } // Indirect dep. - else if (eq_deps[dt].is_dep_on(eq1, eq2)) { + else if (eq_deps.at(dt).is_dep_on(eq1, eq2)) { _dep_on[dt].insert(eg2.getName()); } } @@ -795,8 +785,8 @@ namespace yask { } - // Print stats from eqGroup. - void EqGroup::printStats(ostream& os, const string& msg) + // Print stats from eqBundle. 
+ void EqBundle::printStats(ostream& os, const string& msg) { CounterVisitor cv; visitEqs(&cv); @@ -823,10 +813,10 @@ namespace yask { // Replicate each equation at the non-zero offsets for // each vector in a cluster. - void EqGroup::replicateEqsInCluster(Dimensions& dims) + void EqBundle::replicateEqsInCluster(Dimensions& dims) { // Make a copy of the original equations so we can iterate through - // them while adding to the group. + // them while adding to the bundle. EqList eqs(_eqs); // Loop thru points in cluster. @@ -853,7 +843,7 @@ namespace yask { OffsetVisitor ov(clusterOffset); eq2->accept(&ov); - // Put new equation into group. + // Put new equation into bundle. addEq(eq2); } } @@ -864,13 +854,13 @@ namespace yask { assert(_eqs.size() == eqs.size() * dims._clusterMults.product()); } - // Reorder groups based on dependencies. - void EqGroups::sort() + // Reorder bundles based on dependencies. + void EqBundles::sort() { if (size() < 2) return; - cout << " Sorting " << size() << " eq-group(s)...\n"; + cout << " Sorting " << size() << " eq-bundle(s)...\n"; // Want to keep original order as much as possible. // Only reorder if dependencies are in conflict. @@ -881,7 +871,7 @@ namespace yask { bool done = false; while (!done) { - // Does eq-group[i] depend on any eq-group after it? + // Does eq-bundle[i] depend on any eq-bundle after it? auto& egi = at(i); for (size_t j = i+1; j < size(); j++) { @@ -892,13 +882,13 @@ namespace yask { // Error if also back-dep. if (egj.isDepOn(cur_step_dep, egi)) { - THROW_YASK_EXCEPTION("Error: circular dependency between eq-groups " << + THROW_YASK_EXCEPTION("Error: circular dependency between eq-bundles " << egi.getDescription() << " and " << egj.getDescription()); } // Swap them. 
- EqGroup temp(egi); + EqBundle temp(egi); egi = egj; egj = temp; @@ -912,24 +902,29 @@ namespace yask { } } - // Add expression 'eq' with condition 'cond' to eq-group with 'baseName' + // Add expression 'eq' from 'eqs' to eq-bundle with 'baseName' // unless alread added. The corresponding index in '_indices' will be - // incremented if a new group is created. + // incremented if a new bundle is created. // 'eq_deps': pre-computed dependencies between equations. - // Returns whether a new group was created. - bool EqGroups::addExprToGroup(EqualsExprPtr eq, - BoolExprPtr cond, - const string& baseName, - bool is_scratch, - EqDepMap& eq_deps) + // Returns whether a new bundle was created. + bool EqBundles::addExprToBundle(Eqs& eqs, + EqualsExprPtr eq, + const string& baseName, + bool is_scratch) { // Equation already added? - if (_eqs_in_groups.count(eq)) + if (_eqs_in_bundles.count(eq)) return false; - // Loop through existing groups, looking for one that + // Get condition, if any. + auto cond = eqs.getCond(eq); + + // Get deps. + auto& eq_deps = eqs.getDeps(); + + // Loop through existing bundles, looking for one that // 'eq' can be added to. - EqGroup* target = 0; + EqBundle* target = 0; for (auto& eg : *this) { // Must match name and condition. @@ -942,7 +937,7 @@ namespace yask { for (auto& eq2 : eg.getEqs()) { for (DepType dt = DepType(0); dt < num_deps; dt = DepType(dt+1)) { - if (eq_deps[dt].is_dep(eq, eq2)) { + if (eq_deps.at(dt).is_dep(eq, eq2)) { #if DEBUG_ADD_EXPRS cout << "addExprsFromGrid: not adding equation " << eq->makeQuotedStr() << " to " << eg.getDescription() << @@ -957,7 +952,7 @@ namespace yask { break; } - // Remember target group if found and no deps. + // Remember target bundle if found and no deps. if (!is_dep) { target = ⪚ break; @@ -965,23 +960,23 @@ namespace yask { } } - // Make new group if no target group found. - bool newGroup = false; + // Make new bundle if no target bundle found. 
+ bool newBundle = false; if (!target) { - EqGroup ne(*_dims, is_scratch); + EqBundle ne(*_dims, is_scratch); push_back(ne); target = &back(); target->baseName = baseName; target->index = _indices[baseName]++; target->cond = cond; - newGroup = true; + newBundle = true; #if DEBUG_ADD_EXPRS cout << "Creating new " << target->getDescription() << endl; #endif } - // Add eq to target eq-group. + // Add eq to target eq-bundle. assert(target); #if DEBUG_ADD_EXPRS cout << "Adding " << eq->makeQuotedStr() << @@ -990,27 +985,26 @@ namespace yask { target->addEq(eq); // Remember eq and updated grid. - _eqs_in_groups.insert(eq); + _eqs_in_bundles.insert(eq); _outGrids.insert(eq->getGrid()); - return newGroup; + return newBundle; } - // Divide all equations into eqGroups. + // Divide all equations into eqBundles. // Only process updates to grids in 'gridRegex'. - // 'targets': string provided by user to specify grouping. + // 'targets': string provided by user to specify bundleing. // 'eq_deps': pre-computed dependencies between equations. - void EqGroups::makeEqGroups(Eqs& allEqs, - const string& gridRegex, - const string& targets, - EqDepMap& eq_deps, - ostream& os) + void EqBundles::makeEqBundles(Eqs& allEqs, + const string& gridRegex, + const string& targets, + ostream& os) { - os << "Partitioning " << allEqs.getNumEqs() << " equation(s) into groups...\n"; + os << "Partitioning " << allEqs.getNumEqs() << " equation(s) into bundles...\n"; //auto& stepDim = _dims->_stepDim; - // Add each scratch equation to a separate group. - // TODO: Allow multiple scratch eqs in a group with same conds & halos. + // Add each scratch equation to a separate bundle. + // TODO: Allow multiple scratch eqs in a bundle with same conds & halos. // TODO: Only add scratch eqs that are needed by grids in 'gridRegex'. for (auto eq : allEqs.getEqs()) { @@ -1021,7 +1015,7 @@ namespace yask { string gname = gp->getName(); // Add equation. 
- addExprToGroup(eq, allEqs.getCond(eq), gname, true, eq_deps); + addExprToBundle(allEqs, eq, gname, true); } } @@ -1029,7 +1023,7 @@ namespace yask { regex gridx(gridRegex); // Handle each key-value pair in 'targets' string. - // Key is eq-group name (with possible format strings); value is regex pattern. + // Key is eq-bundle name (with possible format strings); value is regex pattern. ArgParser ap; ap.parseKeyValuePairs (targets, [&](const string& egfmt, const string& pattern) { @@ -1058,7 +1052,7 @@ namespace yask { string egname = mr.format(egfmt); // Add equation. - addExprToGroup(eq, allEqs.getCond(eq), egname, false, eq_deps); + addExprToBundle(allEqs, eq, egname, false); } }); @@ -1075,11 +1069,11 @@ namespace yask { continue; // Add equation. - addExprToGroup(eq, allEqs.getCond(eq), _basename_default, false, eq_deps); + addExprToBundle(allEqs, eq, _basename_default, false); } - os << "Created " << size() << " equation group(s):\n"; + os << "Created " << size() << " equation bundle(s):\n"; - // Find dependencies between eq-groups based on deps between their eqs. + // Find dependencies between eq-bundles based on deps between their eqs. for (auto& eg1 : *this) { os << " " << eg1.getDescription() << ":\n" " Contains " << eg1.getNumEqs() << " equation(s).\n" @@ -1092,25 +1086,25 @@ namespace yask { } os << ".\n"; - // Check to see if eg1 depends on other eq-groups. + // Check to see if eg1 depends on other eq-bundles. for (auto& eg2 : *this) { // Don't check against self. 
if (eg1.getName() == eg2.getName()) continue; - eg1.checkDeps(allEqs, eq_deps, eg2); + eg1.checkDeps(allEqs, eg2); DepType dt = cur_step_dep; if (eg1.isImmDepOn(dt, eg2)) - os << " Immediately dependent on group " << + os << " Immediately dependent on bundle " << eg2.getName() << ".\n"; else if (eg1.isDepOn(dt, eg2)) - os << " Indirectly dependent on group " << + os << " Indirectly dependent on bundle " << eg2.getName() << ".\n"; } auto& sdeps = eg1.getScratchDeps(); if (sdeps.size()) { - os << " Requires evaluation of the following scratch-grid group(s):"; + os << " Requires evaluation of the following scratch-grid bundle(s):"; for (auto& sname : sdeps) os << " " << sname; os << ".\n"; @@ -1121,8 +1115,8 @@ namespace yask { sort(); } - // Print stats from eqGroups. - void EqGroups::printStats(ostream& os, const string& msg) { + // Print stats from eqBundles. + void EqBundles::printStats(ostream& os, const string& msg) { CounterVisitor cv; for (auto& eq : *this) { CounterVisitor ecv; @@ -1133,15 +1127,15 @@ namespace yask { } // Apply optimizations according to the 'settings'. - void EqGroups::optimizeEqGroups(CompilerSettings& settings, + void EqBundles::optimizeEqBundles(CompilerSettings& settings, const string& descr, bool printSets, ostream& os) { // print stats. - string edescr = "for " + descr + " equation-group(s)"; + string edescr = "for " + descr + " equation-bundle(s)"; printStats(os, edescr); - // Make a list of optimizations to apply to eqGroups. + // Make a list of optimizations to apply to eqBundles. vector opts; // CSE. @@ -1164,7 +1158,7 @@ namespace yask { visitEqs(optimizer); int numChanges = optimizer->getNumChanges(); string odescr = "after applying " + optimizer->getName() + " to " + - descr + " equation-group(s)"; + descr + " equation-bundle(s)"; // Get new stats. if (numChanges) @@ -1173,9 +1167,9 @@ namespace yask { os << "No changes " << odescr << '.' << endl; } - // Final stats per equation group. + // Final stats per equation bundle. 
if (printSets && size() > 1) { - os << "Stats per equation-group:\n"; + os << "Stats per equation-bundle:\n"; for (auto eg : *this) eg.printStats(os, "for " + eg.getDescription()); } diff --git a/src/compiler/lib/Eqs.hpp b/src/compiler/lib/Eqs.hpp index 07f507ea..e005b07e 100644 --- a/src/compiler/lib/Eqs.hpp +++ b/src/compiler/lib/Eqs.hpp @@ -23,7 +23,7 @@ IN THE SOFTWARE. *****************************************************************************/ -///////// Classes for equations and equation groups //////////// +///////// Classes for equations and equation bundles //////////// #ifndef EQS_HPP #define EQS_HPP @@ -123,6 +123,7 @@ namespace yask { EqList _eqs; // just equations w/o conditions. CondMap _conds; // map from equations to their conditions, if any. + EqDepMap _eq_deps; // dependencies between all eqs. EqDeps::DepMap _scratch_deps; // dependencies through scratch grids. public: @@ -158,7 +159,12 @@ namespace yask { return nullptr; } - // Get the scratch-grid eqs that contribute to this eq. + // Get all the deps. + virtual const EqDepMap& getDeps() const { + return _eq_deps; + } + + // Get the scratch-grid eqs that contribute to 'eq'. virtual const EqDeps::EqSet& getScratchDeps(EqualsExprPtr ep) const { return _scratch_deps.at(ep); } @@ -174,7 +180,6 @@ namespace yask { // Find dependencies based on all eqs. If 'eq_deps' is // set, save dependencies between eqs in referent. virtual void findDeps(Dimensions& dims, - EqDepMap* eq_deps, std::ostream& os); // Determine which grid points can be vectorized. @@ -184,37 +189,37 @@ namespace yask { virtual void analyzeLoop(const Dimensions& dims); // Update grid access stats. - virtual void updateGridStats(EqDepMap& eq_deps); + virtual void updateGridStats(); }; - // A named equation group, which contains one or more grid-update equations. - // All equations in a group must have the same condition. + // A named equation bundle, which contains one or more grid-update equations. 
+ // All equations in a bundle must have the same condition. // Equations should not have inter-dependencies because they will be // combined into a single expression. - class EqGroup { + class EqBundle { protected: - EqList _eqs; // expressions in this eqGroup (not including conditions). - Grids _outGrids; // grids updated by this eqGroup. - Grids _inGrids; // grids read from by this eqGroup. + EqList _eqs; // expressions in this eqBundle (not including conditions). + Grids _outGrids; // grids updated by this eqBundle. + Grids _inGrids; // grids read from by this eqBundle. const Dimensions* _dims = 0; bool _isScratch = false; // true if updating temp grid(s). - // Other eq-groups that this group depends on. This means that an - // equation in this group has a grid value on the RHS that appears in + // Other eq-bundles that this bundle depends on. This means that an + // equation in this bundle has a grid value on the RHS that appears in // the LHS of the dependency. map> _imm_dep_on; // immediate deps. map> _dep_on; // immediate and indirect deps. - set _scratch_deps; // scratch groups needed for this group. + set _scratch_deps; // scratch bundles needed for this bundle. public: // TODO: move these into protected section and make accessors. - string baseName; // base name of this eqGroup. + string baseName; // base name of this eqBundle. int index; // index to distinguish repeated names. BoolExprPtr cond; // condition (default is null). // Ctor. - EqGroup(const Dimensions& dims, bool is_scratch) : + EqBundle(const Dimensions& dims, bool is_scratch) : _dims(&dims), _isScratch(is_scratch) { // Create empty map entries. @@ -223,16 +228,16 @@ namespace yask { _dep_on[dt]; } } - virtual ~EqGroup() {} + virtual ~EqBundle() {} - // Add an equation to this group. + // Add an equation to this bundle. virtual void addEq(EqualsExprPtr ee); // Visit all the equations. 
virtual void visitEqs(ExprVisitor* ev) { for (auto& ep : _eqs) { -#ifdef DEBUG_EQ_GROUP - cout << "EqGroup: visiting " << ep->makeQuotedStr() << endl; +#ifdef DEBUG_EQ_BUNDLE + cout << "EqBundle: visiting " << ep->makeQuotedStr() << endl; #endif ep->accept(ev); } @@ -277,16 +282,16 @@ namespace yask { return _inGrids; } - // Get whether this eq-group depends on eg2. + // Get whether this eq-bundle depends on eg2. // Must have already been set via checkDeps(). - virtual bool isImmDepOn(DepType dt, const EqGroup& eq2) const { + virtual bool isImmDepOn(DepType dt, const EqBundle& eq2) const { return _imm_dep_on.at(dt).count(eq2.getName()) > 0; } - virtual bool isDepOn(DepType dt, const EqGroup& eq2) const { + virtual bool isDepOn(DepType dt, const EqBundle& eq2) const { return _dep_on.at(dt).count(eq2.getName()) > 0; } - // Get dependencies on this eq-group. + // Get dependencies on this eq-bundle. virtual const set& getImmDeps(DepType dt) const { return _imm_dep_on.at(dt); } @@ -294,24 +299,24 @@ namespace yask { return _dep_on.at(dt); } - // Get scratch-group dependencies. + // Get scratch-bundle dependencies. virtual const set& getScratchDeps() const { return _scratch_deps; } // Check for and set dependencies on eg2. - virtual void checkDeps(Eqs& allEqs, EqDepMap& eq_deps, const EqGroup& eg2); + virtual void checkDeps(Eqs& allEqs, const EqBundle& eg2); // Replicate each equation at the non-zero offsets for // each vector in a cluster. virtual void replicateEqsInCluster(Dimensions& dims); - // Print stats for the equation(s) in this group. + // Print stats for the equation(s) in this bundle. virtual void printStats(ostream& os, const string& msg); }; - // Container for multiple equation groups. - class EqGroups : public vector { + // Container for multiple equation bundles. + class EqBundles : public vector { protected: // Copy of some global data. @@ -321,29 +326,27 @@ namespace yask { // Track grids that are udpated. 
Grids _outGrids; - // Map to track indices per eq-group name. + // Map to track indices per eq-bundle name. map _indices; // Track equations that have been added already. - set _eqs_in_groups; + set _eqs_in_bundles; - // Add expression 'eq' with condition 'cond' to eq-group with 'baseName' + // Add expression 'eq' from 'eqs' to eq-bundle with 'baseName' // unless alread added. The corresponding index in '_indices' will be - // incremented if a new group is created. - // 'eq_deps': pre-computed dependencies between equations. - // Returns whether a new group was created. - virtual bool addExprToGroup(EqualsExprPtr eq, - BoolExprPtr cond, // may be nullptr. - const string& baseName, - bool is_scratch, - EqDepMap& eq_deps); + // incremented if a new bundle is created. + // Returns whether a new bundle was created. + virtual bool addExprToBundle(Eqs& eqs, + EqualsExprPtr eq, + const string& baseName, + bool is_scratch); public: - EqGroups() {} - EqGroups(const string& basename_default, Dimensions& dims) : + EqBundles() {} + EqBundles(const string& basename_default, Dimensions& dims) : _basename_default(basename_default), _dims(&dims) {} - virtual ~EqGroups() {} + virtual ~EqBundles() {} virtual void set_basename_default(const string& basename_default) { _basename_default = basename_default; @@ -352,24 +355,23 @@ namespace yask { _dims = &dims; } - // Separate a set of equations into eqGroups based + // Separate a set of equations into eqBundles based // on the target string. // Target string is a comma-separated list of key-value pairs, e.g., - // "eqGroup1=foo,eqGroup2=bar". - // In this example, all eqs updating grid names containing 'foo' go in eqGroup1, - // all eqs updating grid names containing 'bar' go in eqGroup2, and - // each remaining eq goes into a separate eqGroup. - void makeEqGroups(Eqs& eqs, - const string& gridRegex, - const string& targets, - EqDepMap& eq_deps, - std::ostream& os); + // "eqBundle1=foo,eqBundle2=bar". 
+ // In this example, all eqs updating grid names containing 'foo' go in eqBundle1, + // all eqs updating grid names containing 'bar' go in eqBundle2, and + // each remaining eq goes into a separate eqBundle. + void makeEqBundles(Eqs& eqs, + const string& gridRegex, + const string& targets, + std::ostream& os); virtual const Grids& getOutputGrids() const { return _outGrids; } - // Visit all the equations in all eqGroups. + // Visit all the equations in all eqBundles. // This will not visit the conditions. virtual void visitEqs(ExprVisitor* ev) { for (auto& eg : *this) @@ -383,26 +385,26 @@ namespace yask { eg.replicateEqsInCluster(dims); } - // Reorder groups based on dependencies. + // Reorder bundles based on dependencies. virtual void sort(); - // Print a list of eqGroups. + // Print a list of eqBundles. virtual void printInfo(ostream& os) const { - os << "Identified stencil equation-groups:" << endl; + os << "Identified stencil equation-bundles:" << endl; for (auto& eq : *this) { for (auto gp : eq.getOutputGrids()) { string eqName = eq.getName(); - os << " Equation group '" << eqName << "' updates grid '" << + os << " Equation bundle '" << eqName << "' updates grid '" << gp->getName() << "'." << endl; } } } - // Print stats for the equation(s) in all groups. + // Print stats for the equation(s) in all bundles. virtual void printStats(ostream& os, const string& msg); // Apply optimizations requested in settings. 
- void optimizeEqGroups(CompilerSettings& settings, + void optimizeEqBundles(CompilerSettings& settings, const string& descr, bool printSets, ostream& os); diff --git a/src/compiler/lib/Grid.cpp b/src/compiler/lib/Grid.cpp index 00522efc..a4bd9e42 100644 --- a/src/compiler/lib/Grid.cpp +++ b/src/compiler/lib/Grid.cpp @@ -304,7 +304,7 @@ namespace yask { if (sz > 1 && first_max_halo == 0 && last_max_halo == 0) sz--; - // TODO: recognize that reading in one eq-group and then writing in + // TODO: recognize that reading in one eq-bundle and then writing in // another can also reuse storage. return sz; diff --git a/src/compiler/lib/Grid.hpp b/src/compiler/lib/Grid.hpp index b253df40..332c2f05 100644 --- a/src/compiler/lib/Grid.hpp +++ b/src/compiler/lib/Grid.hpp @@ -66,7 +66,7 @@ namespace yask { // various step-index values. // bool key: true=left, false=right. // int key: step-dim offset or 0 if no step-dim. - // TODO: keep separate halos for each equation group. + // TODO: keep separate halos for each equation bundle. map> _halos; public: @@ -293,7 +293,7 @@ namespace yask { IntTuple _foldOptions; // vector fold. IntTuple _clusterOptions; // cluster multipliers. bool _firstInner = true; // first dimension of fold is unit step. - string _eq_group_basename_default = "stencil"; + string _eq_bundle_basename_default = "stencil_bundle"; bool _allowUnalignedLoads = false; int _haloSize = 0; // 0 => calculate each halo separately and automatically. int _stepAlloc = 0; // 0 => calculate step allocation automatically. @@ -302,7 +302,7 @@ namespace yask { bool _doCse = true; // do common-subexpr elim. bool _doComb = true; // combine commutative operations. bool _doOptCluster = true; // apply optimizations also to cluster. - string _eqGroupTargets; // how to group equations. + string _eqBundleTargets; // how to bundle equations. string _gridRegex; // grids to update. 
}; diff --git a/src/compiler/lib/Print.cpp b/src/compiler/lib/Print.cpp index 4ab2e5ae..e1164195 100644 --- a/src/compiler/lib/Print.cpp +++ b/src/compiler/lib/Print.cpp @@ -538,11 +538,11 @@ namespace yask { os << "Stencil '" << _stencil.getName() << "' pseudo-code:" << endl; - // Loop through all eqGroups. - for (auto& eq : _eqGroups) { + // Loop through all eqBundles. + for (auto& eq : _eqBundles) { string egName = eq.getName(); - os << endl << " ////// Equation group '" << egName << + os << endl << " ////// Equation bundle '" << egName << "' //////" << endl; CounterVisitor cv; @@ -579,9 +579,9 @@ namespace yask { os << "digraph \"Stencil " << _stencil.getName() << "\" {\n" "rankdir=LR; ranksep=1.5;\n"; - // Loop through all eqGroups. - for (auto& eq : _eqGroups) { - os << "subgraph \"Equation-group " << eq.getName() << "\" {" << endl; + // Loop through all eqBundles. + for (auto& eq : _eqBundles) { + os << "subgraph \"Equation-bundle " << eq.getName() << "\" {" << endl; eq.visitEqs(pv); os << "}" << endl; } @@ -602,8 +602,8 @@ namespace yask { " look_at <0, 0, 0>" << endl << "}" << endl; - // Loop through all eqGroups. - for (auto& eq : _eqGroups) { + // Loop through all eqBundles. + for (auto& eq : _eqBundles) { // TODO: separate mutiple grids. 
POVRayPrintVisitor pv(os); diff --git a/src/compiler/lib/Print.hpp b/src/compiler/lib/Print.hpp index efafd83d..606f3b21 100644 --- a/src/compiler/lib/Print.hpp +++ b/src/compiler/lib/Print.hpp @@ -434,15 +434,15 @@ namespace yask { protected: StencilSolution& _stencil; Grids& _grids; - EqGroups& _eqGroups; + EqBundles& _eqBundles; CompilerSettings& _settings; public: PrinterBase(StencilSolution& stencil, - EqGroups& eqGroups) : + EqBundles& eqBundles) : _stencil(stencil), _grids(stencil.getGrids()), - _eqGroups(eqGroups), + _eqBundles(eqBundles), _settings(stencil.getSettings()) { } virtual ~PrinterBase() { } @@ -476,8 +476,8 @@ namespace yask { public: PseudoPrinter(StencilSolution& stencil, - EqGroups& eqGroups) : - PrinterBase(stencil, eqGroups) { } + EqBundles& eqBundles) : + PrinterBase(stencil, eqBundles) { } virtual ~PseudoPrinter() { } virtual void print(ostream& os); @@ -489,9 +489,9 @@ namespace yask { bool _isSimple; public: - DOTPrinter(StencilSolution& stencil, EqGroups& eqGroups, + DOTPrinter(StencilSolution& stencil, EqBundles& eqBundles, bool isSimple) : - PrinterBase(stencil, eqGroups), + PrinterBase(stencil, eqBundles), _isSimple(isSimple) { } virtual ~DOTPrinter() { } @@ -502,8 +502,8 @@ namespace yask { class POVRayPrinter : public PrinterBase { public: - POVRayPrinter(StencilSolution& stencil, EqGroups& eqGroups) : - PrinterBase(stencil, eqGroups) { } + POVRayPrinter(StencilSolution& stencil, EqBundles& eqBundles) : + PrinterBase(stencil, eqBundles) { } virtual ~POVRayPrinter() { } virtual void print(ostream& os); diff --git a/src/compiler/lib/Soln.cpp b/src/compiler/lib/Soln.cpp index e544a984..a296dcb4 100644 --- a/src/compiler/lib/Soln.cpp +++ b/src/compiler/lib/Soln.cpp @@ -85,27 +85,26 @@ namespace yask { _eqs.analyzeLoop(_dims); // Find dependencies between equations. - EqDepMap eq_deps; - _eqs.findDeps(_dims, &eq_deps, *_dos); + _eqs.findDeps(_dims, *_dos); // Update access stats for the grids. 
- _eqs.updateGridStats(eq_deps); + _eqs.updateGridStats(); - // Create equation groups based on dependencies and/or target strings. - _eqGroups.set_basename_default(_settings._eq_group_basename_default); - _eqGroups.set_dims(_dims); - _eqGroups.makeEqGroups(_eqs, _settings._gridRegex, - _settings._eqGroupTargets, eq_deps, *_dos); - _eqGroups.optimizeEqGroups(_settings, "scalar & vector", false, *_dos); + // Create equation bundles based on dependencies and/or target strings. + _eqBundles.set_basename_default(_settings._eq_bundle_basename_default); + _eqBundles.set_dims(_dims); + _eqBundles.makeEqBundles(_eqs, _settings._gridRegex, + _settings._eqBundleTargets, *_dos); + _eqBundles.optimizeEqBundles(_settings, "scalar & vector", false, *_dos); // Make a copy of each equation at each cluster offset. // We will use these for inter-cluster optimizations and code generation. *_dos << "Constructing cluster of equations containing " << _dims._clusterMults.product() << " vector(s)...\n"; - _clusterEqGroups = _eqGroups; - _clusterEqGroups.replicateEqsInCluster(_dims); + _clusterEqBundles = _eqBundles; + _clusterEqBundles.replicateEqsInCluster(_dims); if (_settings._doOptCluster) - _clusterEqGroups.optimizeEqGroups(_settings, "cluster", true, *_dos); + _clusterEqBundles.optimizeEqBundles(_settings, "cluster", true, *_dos); } // Format in given format-type. @@ -117,21 +116,21 @@ namespace yask { // Data itself will be created in analyze_solution(). 
PrinterBase* printer = 0; if (format_type == "cpp") - printer = new YASKCppPrinter(*this, _eqGroups, _clusterEqGroups, &_dims); + printer = new YASKCppPrinter(*this, _eqBundles, _clusterEqBundles, &_dims); else if (format_type == "knc") - printer = new YASKKncPrinter(*this, _eqGroups, _clusterEqGroups, &_dims); + printer = new YASKKncPrinter(*this, _eqBundles, _clusterEqBundles, &_dims); else if (format_type == "avx" || format_type == "avx2") - printer = new YASKAvx256Printer(*this, _eqGroups, _clusterEqGroups, &_dims); + printer = new YASKAvx256Printer(*this, _eqBundles, _clusterEqBundles, &_dims); else if (format_type == "avx512" || format_type == "avx512f") - printer = new YASKAvx512Printer(*this, _eqGroups, _clusterEqGroups, &_dims); + printer = new YASKAvx512Printer(*this, _eqBundles, _clusterEqBundles, &_dims); else if (format_type == "dot") - printer = new DOTPrinter(*this, _clusterEqGroups, false); + printer = new DOTPrinter(*this, _clusterEqBundles, false); else if (format_type == "dot-lite") - printer = new DOTPrinter(*this, _clusterEqGroups, true); + printer = new DOTPrinter(*this, _clusterEqBundles, true); else if (format_type == "pseudo") - printer = new PseudoPrinter(*this, _clusterEqGroups); + printer = new PseudoPrinter(*this, _clusterEqBundles); else if (format_type == "pov-ray") // undocumented. - printer = new POVRayPrinter(*this, _clusterEqGroups); + printer = new POVRayPrinter(*this, _clusterEqBundles); else { THROW_YASK_EXCEPTION("Error: format-type '" << format_type << "' is not recognized"); @@ -140,7 +139,7 @@ namespace yask { int vlen = printer->num_vec_elems(); bool is_folding_efficient = printer->is_folding_efficient(); - // Set data for equation groups, dims, etc. + // Set data for equation bundles, dims, etc. analyze_solution(vlen, is_folding_efficient); // Create the output. 
diff --git a/src/compiler/lib/Soln.hpp b/src/compiler/lib/Soln.hpp index 0139fafe..20142e69 100644 --- a/src/compiler/lib/Soln.hpp +++ b/src/compiler/lib/Soln.hpp @@ -72,8 +72,8 @@ namespace yask { // Intermediate data needed to format output. Dimensions _dims; // various dimensions. - EqGroups _eqGroups; // eq-groups for scalar and vector. - EqGroups _clusterEqGroups; // eq-groups for scalar and vector. + EqBundles _eqBundles; // eq-bundles for scalar and vector. + EqBundles _clusterEqBundles; // eq-bundles for scalar and vector. // Create the intermediate data. void analyze_solution(int vlen, diff --git a/src/compiler/lib/YaskKernel.cpp b/src/compiler/lib/YaskKernel.cpp index ba1a204b..4088fd9b 100644 --- a/src/compiler/lib/YaskKernel.cpp +++ b/src/compiler/lib/YaskKernel.cpp @@ -41,7 +41,7 @@ namespace yask { } // Print an expression as a one-line C++ comment. - void YASKCppPrinter::addComment(ostream& os, EqGroup& eq) { + void YASKCppPrinter::addComment(ostream& os, EqBundle& eq) { // Use a simple human-readable visitor to create a comment. PrintHelper ph(_dims, NULL, "temp", "", " // ", ".\n"); @@ -68,8 +68,8 @@ namespace yask { // First, create a class to hold the data (grids). printData(os); - // A struct for each equation group. - printEqGroups(os); + // A struct for each equation bundle. + printEqBundles(os); // Finish the context. printContext(os); @@ -153,7 +153,7 @@ namespace yask { // get stats. 
CounterVisitor cve; - _eqGroups.visitEqs(&cve); + _eqBundles.visitEqs(&cve); os << endl << " ////// Stencil-specific data //////" << endl << "class " << _context_base << " : public StencilContext {\n" @@ -185,7 +185,7 @@ namespace yask { os << " '" << grid << "', which is "; if (gp->isScratch()) os << " a scratch variable.\n"; - else if (_eqGroups.getOutputGrids().count(gp)) + else if (_eqBundles.getOutputGrids().count(gp)) os << "updated by one or more equations.\n"; else os << "not updated by any equation (read-only).\n"; @@ -361,7 +361,7 @@ namespace yask { ctorCode += initCode; ctorCode += " " + grid + " = " + grid + "_ptr.get();\n"; ctorCode += " addGrid(" + grid + "_ptr, "; - if (_eqGroups.getOutputGrids().count(gp)) + if (_eqBundles.getOutputGrids().count(gp)) ctorCode += "true /* is an output grid */"; else ctorCode += "false /* is not an output grid */"; @@ -431,25 +431,25 @@ namespace yask { os << "}; // " << _context_base << endl; } - // Print YASK equation groups. - void YASKCppPrinter::printEqGroups(ostream& os) { + // Print YASK equation bundles. + void YASKCppPrinter::printEqBundles(ostream& os) { - for (size_t ei = 0; ei < _eqGroups.size(); ei++) { + for (size_t ei = 0; ei < _eqBundles.size(); ei++) { - // Scalar eqGroup. - auto& eq = _eqGroups.at(ei); + // Scalar eqBundle. + auto& eq = _eqBundles.at(ei); string egName = eq.getName(); string egDesc = eq.getDescription(); - string egsName = "StencilGroup_" + egName; + string egsName = "StencilBundle_" + egName; os << endl << " ////// Stencil " << egDesc << " //////\n" << - "\n class " << egsName << " : public StencilGroupBase {\n" + "\n class " << egsName << " : public StencilBundleBase {\n" " protected:\n" " typedef " << _context_base << " _context_type;\n" " _context_type* _context = 0;\n" " public:\n"; - // Stats for this eqGroup. + // Stats for this eqBundle. 
CounterVisitor stats; eq.visitEqs(&stats); @@ -457,10 +457,10 @@ namespace yask { os << endl << " // " << stats.getNumOps() << " FP operation(s) per point:" << endl; addComment(os, eq); - // Stencil-group ctor. + // Stencil-bundle ctor. { os << " " << egsName << "(" << _context_base << "* context) :\n" - " StencilGroupBase(context),\n" + " StencilBundleBase(context),\n" " _context(context) {\n" " _name = \"" << egName << "\";\n" " _scalar_fp_ops = " << stats.getNumOps() << ";\n" @@ -529,13 +529,13 @@ namespace yask { // Vector/Cluster code. for (int do_cluster = 0; do_cluster <= 1; do_cluster++) { - // Cluster eqGroup at same 'ei' index. - // This should be the same eq-group because it was copied from the + // Cluster eqBundle at same 'ei' index. + // This should be the same eq-bundle because it was copied from the // scalar one. - auto& vceq = do_cluster ? _clusterEqGroups.at(ei) : _eqGroups.at(ei); + auto& vceq = do_cluster ? _clusterEqBundles.at(ei) : _eqBundles.at(ei); assert(egDesc == vceq.getDescription()); - // Create vector info for this eqGroup. + // Create vector info for this eqBundle. // The visitor is accepted at all nodes in the cluster AST; // for each grid access node in the AST, the vectors // needed are determined and saved in the visitor. @@ -638,7 +638,7 @@ namespace yask { os << "}; // " << egsName << ".\n"; // end of class. - } // stencil eqGroups. + } // stencil eqBundles. } // Print final YASK context. @@ -647,36 +647,36 @@ namespace yask { os << endl << " ////// Overall stencil-specific context //////" << endl << "struct " << _context << " : public " << _context_base << " {" << endl; - // Stencil eqGroup objects. - os << endl << " // Stencil equation-groups." << endl; - for (auto& eg : _eqGroups) { + // Stencil eqBundle objects. + os << endl << " // Stencil equation-bundles." 
<< endl; + for (auto& eg : _eqBundles) { string egName = eg.getName(); - string sgName = "stencilGroup_" + egName; - os << " StencilGroup_" << egName << " " << sgName << ";" << endl; + string sgName = "stencilBundle_" + egName; + os << " StencilBundle_" << egName << " " << sgName << ";" << endl; } // Ctor. os << "\n // Constructor.\n" << " " << _context << "(KernelEnvPtr env, KernelSettingsPtr settings) : " << _context_base << "(env, settings)"; - for (auto& eg : _eqGroups) { + for (auto& eg : _eqBundles) { string egName = eg.getName(); - string sgName = "stencilGroup_" + egName; + string sgName = "stencilBundle_" + egName; os << ",\n " << sgName << "(this)"; } os << " {\n"; - // Push eq-group pointers to list. - os << "\n // Stencil groups.\n"; - for (auto& eg : _eqGroups) { + // Push eq-bundle pointers to list. + os << "\n // Stencil bundles.\n"; + for (auto& eg : _eqBundles) { string egName = eg.getName(); - string sgName = "stencilGroup_" + egName; - os << " stGroups.push_back(&" << sgName << ");\n"; + string sgName = "stencilBundle_" + egName; + os << " stBundles.push_back(&" << sgName << ");\n"; - // Add other-group deps. + // Add other-bundle deps. for (DepType dt = DepType(0); dt < num_deps; dt = DepType(dt+1)) { for (auto& dep : eg.getDeps(dt)) { - string depName = "stencilGroup_" + dep; + string depName = "stencilBundle_" + dep; string dtName = (dt == cur_step_dep) ? "cur_step_dep" : (dt == prev_step_dep) ? "prev_step_dep" : "internal_error"; @@ -686,17 +686,17 @@ namespace yask { } } - // Add scratch-group deps in proper order. + // Add scratch-bundle deps in proper order. auto& sdeps = eg.getScratchDeps(); - for (auto& eg2 : _eqGroups) { + for (auto& eg2 : _eqBundles) { string eg2Name = eg2.getName(); - string sg2Name = "stencilGroup_" + eg2Name; + string sg2Name = "stencilBundle_" + eg2Name; if (sdeps.count(eg2Name)) os << " " << sgName << ".add_scratch_dep(&" << sg2Name << ");\n"; } - } // eq-groups. + } // eq-bundles. 
os << " } // Ctor.\n"; // Dims creator. diff --git a/src/compiler/main.cpp b/src/compiler/main.cpp index 06905a96..a1c43ea4 100644 --- a/src/compiler/main.cpp +++ b/src/compiler/main.cpp @@ -81,21 +81,21 @@ void usage(const string& cmd) { " -grids \n" " Only process updates to grids whose names match .\n" " This can be used to generate code for a subset of the stencil equations.\n" - " -eq-groups =,...\n" - " Put updates to grids matching in equation-group with base-name .\n" - " By default, eq-groups are created as needed based on dependencies between equations:\n" - " equations that do not depend on each other are grouped together into groups with the\n" - " base-name '" << settings._eq_group_basename_default << "'.\n" - " Each eq-group base-name is appended with a unique index number, so the default group\n" - " names are '" << settings._eq_group_basename_default << "_0', " << - settings._eq_group_basename_default << "_1', etc.\n" - " This option allows more control over this grouping.\n" - " Example: \"-eq-groups a=foo,b=b[aeiou]r\" creates one or more eq-groups named 'a_0', 'a_1', etc.\n" - " containing updates to each grid whose name contains 'foo' and one or more eq-groups\n" + " -eq-bundles =,...\n" + " Put updates to grids matching in equation-bundle with base-name .\n" + " By default, eq-bundles are created as needed based on dependencies between equations:\n" + " equations that do not depend on each other are bundled together into bundles with the\n" + " base-name '" << settings._eq_bundle_basename_default << "'.\n" + " Each eq-bundle base-name is appended with a unique index number, so the default bundle\n" + " names are '" << settings._eq_bundle_basename_default << "_0', " << + settings._eq_bundle_basename_default << "_1', etc.\n" + " This option allows more control over this bundling.\n" + " Example: \"-eq-bundles a=foo,b=b[aeiou]r\" creates one or more eq-bundles named 'a_0', 'a_1', etc.\n" + " containing updates to each grid whose name contains 
'foo' and one or more eq-bundles\n" " named 'b_0', 'b_1', etc. containing updates to each grid whose name matches 'b[aeiou]r'.\n" " Standard regex-format tokens in will be replaced based on matches to .\n" - " Example: \"-eq-groups 'g_$&=b[aeiou]r'\" with grids 'bar_x', 'bar_y', 'ber_x', and 'ber_y'\n" - " would create eq-group 'g_bar_0' for grids 'bar_x' and 'bar_y' and eq-group 'g_ber_0' for\n" + " Example: \"-eq-bundles 'g_$&=b[aeiou]r'\" with grids 'bar_x', 'bar_y', 'ber_x', and 'ber_y'\n" + " would create eq-bundle 'g_bar_0' for grids 'bar_x' and 'bar_y' and eq-bundle 'g_ber_0' for\n" " grids 'ber_x' and 'ber_y' because '$&' is substituted by the string that matches the regex.\n" " -step-alloc \n" " Specify the size of the step-dimension memory allocation.\n" @@ -202,8 +202,8 @@ void parseOpts(int argc, const char* argv[]) solutionName = argop; else if (opt == "-grids") settings._gridRegex = argop; - else if (opt == "-eq-groups") - settings._eqGroupTargets = argop; + else if (opt == "-eq-bundles") + settings._eqBundleTargets = argop; else if (opt == "-fold" || opt == "-cluster") { // example: x=4,y=2 diff --git a/src/compiler/swig/yask_compiler_api.i b/src/compiler/swig/yask_compiler_api.i index a5d1f0e5..e9c3c5e1 100644 --- a/src/compiler/swig/yask_compiler_api.i +++ b/src/compiler/swig/yask_compiler_api.i @@ -40,6 +40,7 @@ IN THE SOFTWARE. // Must declare shared_ptrs for the entire expr_node hierarchy! %shared_ptr(yask::yc_solution) //%shared_ptr(yask::yc_grid) +%shared_ptr(yask::yc_equation_group) %shared_ptr(yask::yc_expr_node) %shared_ptr(yask::yc_index_node) %shared_ptr(yask::yc_equation_node) diff --git a/src/kernel/Makefile b/src/kernel/Makefile index 4e37b3ef..fb130302 100644 --- a/src/kernel/Makefile +++ b/src/kernel/Makefile @@ -269,7 +269,7 @@ NDIMS_OPT := `cat $(YK_DIMS_FILE)` RANK_LOOP_OPTS ?= $(NDIMS_OPT) -inVar rank_idxs RANK_LOOP_ORDER ?= 1 .. 
N-1 RANK_LOOP_CODE ?= $(RANK_LOOP_OUTER_MODS) loop($(RANK_LOOP_ORDER)) \ - { $(RANK_LOOP_INNER_MODS) call(calc_region(stGroup_ptr)); } + { $(RANK_LOOP_INNER_MODS) call(calc_region(stBundle_ptr)); } # Region loops break up a region using OpenMP threading into blocks. The # 'omp' modifier creates an outer OpenMP loop so that each block is assigned @@ -326,7 +326,7 @@ MISC_LOOP_CODE ?= $(MISC_LOOP_OUTER_MODS) loop($(MISC_LOOP_ORDER)) \ # Flags passed to stencil compiler. YC_FLAGS += -stencil $(stencil) -elem-bytes $(real_bytes) -cluster $(cluster) -fold $(fold) ifneq ($(eqs),) - YC_FLAGS += -eq-groups $(eqs) + YC_FLAGS += -eq-bundles $(eqs) endif ifneq ($(radius),) YC_FLAGS += -radius $(radius) diff --git a/src/kernel/lib/context.cpp b/src/kernel/lib/context.cpp index 66c2c957..7524b41f 100644 --- a/src/kernel/lib/context.cpp +++ b/src/kernel/lib/context.cpp @@ -146,7 +146,7 @@ namespace yask { ///// Top-level methods for evaluating reference and optimized stencils. - // Eval stencil group(s) over grid(s) using reference scalar code. + // Eval stencil bundle(s) over grid(s) using reference scalar code. void StencilContext::calc_rank_ref() { run_time.start(); @@ -223,11 +223,11 @@ namespace yask { rank_idxs.stop[step_posn] = stop_t; rank_idxs.step[step_posn] = step_t; - // Loop thru groups. + // Loop thru bundles. // For this reference-code implementation, we - // will do all stencil groups at this level, + // will do all stencil bundles at this level, // even scratch-grid ones. - for (auto* sg : stGroups) { + for (auto* sg : stBundles) { // Exchange all dirty halos. exchange_halos_all(); @@ -239,7 +239,7 @@ namespace yask { // Define misc-loop function. Since step is always 1, we // ignore misc_stop. If point is in sub-domain for this - // group, then evaluate the reference scalar code. + // bundle, then evaluate the reference scalar code. 
#define misc_fn(misc_idxs) do { \ if (sg->is_in_valid_domain(misc_idxs.start)) \ sg->calc_scalar(scratch_grid_idx, misc_idxs.start); \ @@ -247,17 +247,17 @@ namespace yask { // Scan through n-D space. TRACE_MSG("calc_rank_ref: step " << start_t << - " in group '" << sg->get_name() << "': " << + " in bundle '" << sg->get_name() << "': " << misc_idxs.begin.makeValStr(ndims) << " ... (end before) " << misc_idxs.end.makeValStr(ndims)); #include "yask_misc_loops.hpp" #undef misc_fn - // Remember grids that have been written to by this group, + // Remember grids that have been written to by this bundle, // updated at next step (+/- 1). mark_grids_dirty(start_t + step_t, stop_t + step_t, *sg); - } // groups. + } // bundles. } // iterations. // Final halo exchange. @@ -266,7 +266,7 @@ namespace yask { run_time.stop(); } - // Eval stencil group(s) over grid(s) using optimized code. + // Eval stencil bundle(s) over grid(s) using optimized code. void StencilContext::run_solution(idx_t first_step_index, idx_t last_step_index) { @@ -320,8 +320,8 @@ namespace yask { // Extend end points for overlapping regions due to wavefront angle. // For each subsequent time step in a region, the spatial location // of each block evaluation is shifted by the angle for each - // stencil-group. So, the total shift in a region is the angle * num - // stencils * num timesteps. This assumes all groups + // stencil-bundle. So, the total shift in a region is the angle * num + // stencils * num timesteps. This assumes all bundles // are inter-dependent to find maximum extension. Actual required // extension may be less, but this will just result in some calls to // calc_region() that do nothing. @@ -388,44 +388,44 @@ namespace yask { rank_idxs.stop[step_posn] = stop_t; rank_idxs.step[step_posn] = step_t; - // If no wave-fronts (default), loop through groups here, and do - // only one group at a time in calc_region(). 
This is similar to + // If no wave-fronts (default), loop through bundles here, and do + // only one bundle at a time in calc_region(). This is similar to // loop in calc_rank_ref(). if (step_t == 1) { - for (auto* sg : stGroups) { + for (auto* sg : stBundles) { // Don't do scratch updates here. if (sg->is_scratch()) continue; - // Exchange halo(s) needed for this group. + // Exchange halo(s) needed for this bundle. exchange_halos(start_t, stop_t, *sg); - // Eval this group in calc_region(). - StencilGroupSet stGroup_set; - stGroup_set.insert(sg); - StencilGroupSet* stGroup_ptr = &stGroup_set; + // Eval this bundle in calc_region(). + StencilBundleSet stBundle_set; + stBundle_set.insert(sg); + StencilBundleSet* stBundle_ptr = &stBundle_set; // Include automatically-generated loop code that calls // calc_region() for each region. TRACE_MSG("run_solution: step " << start_t << - " in group '" << sg->get_name() << "'"); + " in bundle '" << sg->get_name() << "'"); #include "yask_rank_loops.hpp" } } - // If doing wave-fronts, must loop through all groups in + // If doing wave-fronts, must loop through all bundles in // calc_region(). - // TODO: make this the only case, allowing all groups to be done + // TODO: make this the only case, allowing all bundles to be done // between MPI exchanges, even w/o wave-fronts. else { // Exchange all dirty halo(s). exchange_halos_all(); - // Eval all stencil groups. - StencilGroupSet* stGroup_ptr = NULL; + // Eval all stencil bundles. + StencilBundleSet* stBundle_ptr = NULL; // Include automatically-generated loop code that calls calc_region() for each region. TRACE_MSG("run_solution: steps " << start_t << " ... (end before) " << stop_t); @@ -442,9 +442,6 @@ namespace yask { } // step loop. - // Final halo exchange. - exchange_halos_all(); - #ifdef MODEL_CACHE // Print cache stats, then disable. // Thus, cache is only modeled for first call. 
@@ -477,7 +474,7 @@ namespace yask { // Each region is typically computed in a separate OpenMP 'for' region. // In it, we loop over the time steps and the stencils // and evaluate the blocks in the region. - void StencilContext::calc_region(StencilGroupSet* stGroup_set, + void StencilContext::calc_region(StencilBundleSet* stBundle_set, const ScanIndices& rank_idxs) { int ndims = _dims->_stencil_dims.size(); @@ -523,18 +520,18 @@ namespace yask { region_idxs.start[step_posn] = start_t; region_idxs.stop[step_posn] = stop_t; - // Stencil groups to evaluate at this time step. - for (auto* sg : stGroups) { + // Stencil bundles to evaluate at this time step. + for (auto* sg : stBundles) { // Don't do scratch updates here. if (sg->is_scratch()) continue; - // Group not selected. - if (stGroup_set && !stGroup_set->count(sg)) + // Bundle not selected. + if (stBundle_set && !stBundle_set->count(sg)) continue; - TRACE_MSG("calc_region: stencil-group '" << sg->get_name() << "' w/BB " << + TRACE_MSG("calc_region: stencil-bundle '" << sg->get_name() << "' w/BB " << sg->bb_begin.makeDimValStr() << " ... (end before) " << sg->bb_end.makeDimValStr()); @@ -544,7 +541,7 @@ namespace yask { // each time-step, the parallelogram may be trimmed // based on the BB and WF extensions outside of the rank-BB. - // Actual region boundaries must stay within [extended] BB for this group. + // Actual region boundaries must stay within [extended] BB for this bundle. bool ok = true; for (int i = 0; i < ndims; i++) { if (i == step_posn) continue; @@ -587,7 +584,7 @@ namespace yask { // contains the outer OpenMP loop(s). #include "yask_region_loops.hpp" - // Remember grids that have been written to by this group, + // Remember grids that have been written to by this bundle, // updated at next step (+/- 1). mark_grids_dirty(start_t + step_t, stop_t + step_t, *sg); } @@ -596,7 +593,7 @@ namespace yask { // implement temporal wavefront. 
Between regions, we only shift // backward, so region loops must strictly increment. They may do // so in any order. TODO: shift only what is needed by - // this group, not the global max. + // this bundle, not the global max. for (int i = 0; i < ndims; i++) { if (i == step_posn) continue; auto& dname = _dims->_stencil_dims.getDimName(i); @@ -606,7 +603,7 @@ namespace yask { stop[i] -= angle; } - } // stencil groups. + } // stencil bundles. } // time. } // calc_region. @@ -1145,7 +1142,7 @@ namespace yask { // based on the grids' halos. update_grids(); - // Determine bounding-boxes for all groups. + // Determine bounding-boxes for all bundles. // This must be done after finding WF extensions. find_bounding_boxes(); @@ -1324,7 +1321,7 @@ namespace yask { // Determine size of exchange. This will be the actual halo size // plus any wave-front extensions. In the current implementation, // we need the wave-front extensions regardless of whether there - // is a halo on a given grid. This is because each stencil-group + // is a halo on a given grid. This is because each stencil-bundle // gets shifted by the WF angles at each step in the WF. // Neighbor is to the left. @@ -1810,7 +1807,7 @@ namespace yask { if (wf_steps > 1) // TODO: don't shift for scratch grids. - num_wf_shifts = max((idx_t(stGroups.size()) * wf_steps) - 1, idx_t(0)); + num_wf_shifts = max((idx_t(stBundles.size()) * wf_steps) - 1, idx_t(0)); for (auto& dim : _dims->_domain_dims.getDims()) { auto& dname = dim.getName(); auto rksize = _opts->_rank_sizes[dname]; @@ -1941,13 +1938,11 @@ namespace yask { } } #endif - // Some grid stats. os << endl; os << "Num grids: " << gridPtrs.size() << endl; os << "Num grids to be updated: " << outputGridPtrs.size() << endl; - os << "Num stencil groups: " << stGroups.size() << endl; // Set up data based on MPI rank, including grid positions. // Update all the grid sizes. @@ -2013,12 +2008,12 @@ namespace yask { } os << endl; - // sums across groups for this rank. 
+ // sums across bundles for this rank. rank_numWrites_1t = 0; rank_reads_1t = 0; rank_numFpOps_1t = 0; - os << "Num equation-groups: " << stGroups.size() << endl; - for (auto* sg : stGroups) { + os << "Num stencil bundles: " << stBundles.size() << endl; + for (auto* sg : stBundles) { idx_t updates1 = sg->get_scalar_points_written(); idx_t updates_domain = updates1 * sg->bb_num_points; rank_numWrites_1t += updates_domain; @@ -2028,7 +2023,7 @@ namespace yask { idx_t fpops1 = sg->get_scalar_fp_ops(); idx_t fpops_domain = fpops1 * sg->bb_num_points; rank_numFpOps_1t += fpops_domain; - os << "Stats for equation-group '" << sg->get_name() << "':\n" << + os << "Stats for bundle '" << sg->get_name() << "':\n" << " sub-domain: " << sg->bb_begin.makeDimValStr() << " ... " << sg->bb_end.subElements(1).makeDimValStr() << endl << " sub-domain size: " << sg->bb_len.makeDimValStr(" * ") << endl << @@ -2110,9 +2105,9 @@ namespace yask { "Notes:\n" " Domain-sizes and overall-problem-sizes are based on rank-domain sizes\n" " and number of ranks regardless of number of grids or sub-domains.\n" - " Num-writes-required is based on sum of grid-updates in sub-domain across stencil-group(s).\n" - " Num-reads-required is based on sum of grid-reads in sub-domain across stencil-group(s).\n" - " Est-FP-ops are based on sum of est-FP-ops in sub-domain across stencil-group(s).\n" + " Num-writes-required is based on sum of grid-updates in sub-domain across stencil-bundle(s).\n" + " Num-reads-required is based on sum of grid-reads in sub-domain across stencil-bundle(s).\n" + " Est-FP-ops are based on sum of est-FP-ops in sub-domain across stencil-bundle(s).\n" "\n"; } @@ -2167,6 +2162,9 @@ namespace yask { // Dealloc grids, etc. void StencilContext::end_solution() { + // Final halo exchange. + exchange_halos_all(); + // Release any MPI data. mpiData.clear(); @@ -2268,7 +2266,7 @@ namespace yask { bb_valid = true; } - // Set the bounding-box for each stencil-group and whole domain. 
+ // Set the bounding-box for each stencil-bundle and whole domain. void StencilContext::find_bounding_boxes() { ostream& os = get_ostr(); @@ -2283,13 +2281,13 @@ namespace yask { ext_bb.bb_end = rank_bb.bb_end.addElements(right_wf_exts); ext_bb.update_bb(os, "extended-rank", *this, true); - // Find BB for each group. - for (auto sg : stGroups) + // Find BB for each bundle. + for (auto sg : stBundles) sg->find_bounding_box(); } // Exchange dirty halo data for all grids and all steps, regardless - // of their stencil-group. + // of their stencil-bundle. void StencilContext::exchange_halos_all() { #ifdef USE_MPI @@ -2305,8 +2303,8 @@ namespace yask { } } - // Initial halo exchange for each group. - for (auto* sg : stGroups) { + // Initial halo exchange for each bundle. + for (auto* sg : stBundles) { // Do exchange over max steps. exchange_halos(start, stop, *sg); @@ -2314,17 +2312,17 @@ namespace yask { #endif } - // Exchange halo data needed by stencil-group 'sg' at the given time. + // Exchange halo data needed by stencil-bundle 'sg' at the given time. // Data is needed for input grids that have not already been updated. // [BIG] TODO: overlap halo exchange with computation. - void StencilContext::exchange_halos(idx_t start, idx_t stop, StencilGroupBase& sg) + void StencilContext::exchange_halos(idx_t start, idx_t stop, StencilBundleBase& sg) { #ifdef USE_MPI if (!enable_halo_exchange || _env->num_ranks < 2) return; mpi_time.start(); TRACE_MSG("exchange_halos: " << start << " ... (end before) " << stop << - " for eq-group '" << sg.get_name() << "'"); + " for stencil-bundle '" << sg.get_name() << "'"); auto opts = get_settings(); auto& sd = _dims->_step_dim; @@ -2358,7 +2356,7 @@ namespace yask { else if (halo_step == halo_unpack) TRACE_MSG("exchange_halos: unpacking data for step " << t << "..."); - // Loop thru all input grids in this group. + // Loop thru all input grids in this bundle. 
for (size_t gi = 0; gi < sg.inputGridPtrs.size(); gi++) { auto gp = sg.inputGridPtrs[gi]; MPI_Request* grid_recv_reqs = recv_reqs[gi]; @@ -2519,10 +2517,10 @@ namespace yask { #endif } - // Mark grids that have been written to by stencil-group 'sg'. + // Mark grids that have been written to by stencil-bundle 'sg'. // TODO: only mark grids that are written to in their halo-read area. // TODO: add index for misc dim(s). - void StencilContext::mark_grids_dirty(idx_t start, idx_t stop, StencilGroupBase& sg) { + void StencilContext::mark_grids_dirty(idx_t start, idx_t stop, StencilBundleBase& sg) { idx_t step = (start < stop) ? 1 : -1; for (auto gp : sg.outputGridPtrs) { for (idx_t t = start; t != stop; t += step) { diff --git a/src/kernel/lib/context.hpp b/src/kernel/lib/context.hpp index a329dfe7..7e436776 100644 --- a/src/kernel/lib/context.hpp +++ b/src/kernel/lib/context.hpp @@ -94,9 +94,9 @@ namespace yask { }; // Collections of things in a context. - class StencilGroupBase; - typedef std::vector StencilGroupList; - typedef std::set StencilGroupSet; + class StencilBundleBase; + typedef std::vector StencilBundleList; + typedef std::set StencilBundleSet; typedef std::map GridPtrMap; // Data and hierarchical sizes. @@ -155,10 +155,10 @@ namespace yask { // If WFs are not used, this is the same as rank_bb; BoundingBox ext_bb; - // List of all stencil groups in the order in which + // List of all stencil bundles in the order in which // they should be evaluated within a step. // TODO: use dependency info, allowing more parallelism. - StencilGroupList stGroups; + StencilBundleList stBundles; // All grids. GridPtrs gridPtrs; @@ -189,7 +189,7 @@ namespace yask { // 'tot_' prefix indicates over all ranks. // 'domain' indicates points in domain-size specified on cmd-line. // 'numpts' indicates points actually calculated in sub-domains. - // 'reads' indicates points actually read by stencil-groups. + // 'reads' indicates points actually read by stencil-bundles. 
// 'numFpOps' indicates est. number of FP ops. // 'nbytes' indicates number of bytes allocated. // '_1t' suffix indicates work for one time-step. @@ -518,20 +518,20 @@ namespace yask { // rank-domain loops; the actual begin_r* and end_r* values for the // region are derived from these. TODO: create a public interface // w/a more logical index ordering. - virtual void calc_region(StencilGroupSet* stGroup_set, + virtual void calc_region(StencilBundleSet* stBundle_set, const ScanIndices& rank_idxs); - // Exchange all dirty halo data for all stencil groups + // Exchange all dirty halo data for all stencil bundles // and max number of steps for each grid. virtual void exchange_halos_all(); - // Exchange halo data needed by stencil-group 'sg' at the given step(s). - virtual void exchange_halos(idx_t start, idx_t stop, StencilGroupBase& sg); + // Exchange halo data needed by stencil-bundle 'sg' at the given step(s). + virtual void exchange_halos(idx_t start, idx_t stop, StencilBundleBase& sg); - // Mark grids that have been written to by group 'sg'. - virtual void mark_grids_dirty(idx_t start, idx_t stop, StencilGroupBase& sg); + // Mark grids that have been written to by bundle 'sg'. + virtual void mark_grids_dirty(idx_t start, idx_t stop, StencilBundleBase& sg); - // Set the bounding-box around all eq groups. + // Set the bounding-box around all stencil bundles. virtual void find_bounding_boxes(); // Make new scratch grids. diff --git a/src/kernel/lib/stencil_calc.cpp b/src/kernel/lib/stencil_calc.cpp index fddf9e88..c3c6b287 100644 --- a/src/kernel/lib/stencil_calc.cpp +++ b/src/kernel/lib/stencil_calc.cpp @@ -31,8 +31,8 @@ namespace yask { // Calculate results within a block. // Typically called by a top-level OMP thread. // It is here that any required scratch-grid stencils are evaluated - // first and then the non-scratch stencils in the stencil group. 
- void StencilGroupBase::calc_block(const ScanIndices& region_idxs) { + // first and then the non-scratch stencils in the stencil bundle. + void StencilBundleBase::calc_block(const ScanIndices& region_idxs) { auto opts = _generic_context->get_settings(); auto dims = _generic_context->get_dims(); @@ -40,7 +40,7 @@ namespace yask { auto& step_dim = dims->_step_dim; int thread_idx = omp_get_thread_num(); // used to index the scratch grids. TRACE_MSG3("calc_block:" << - " in non-scratch group '" << get_name() << "': " << + " in non-scratch bundle '" << get_name() << "': " << region_idxs.start.makeValStr(ndims) << " ... (end before) " << region_idxs.stop.makeValStr(ndims) << " by thread " << thread_idx); @@ -56,12 +56,12 @@ namespace yask { // Groups in block loops are based on sub-block-group sizes. def_block_idxs.group_size = opts->_sub_block_group_sizes; - // Update offsets of scratch grids based on this group's location. + // Update offsets of scratch grids based on this bundle's location. _generic_context->update_scratch_grids(thread_idx, def_block_idxs); - // Define the groups that need to be processed in + // Define the bundles that need to be processed in // this block. This will be the prerequisite scratch-grid - // groups plus this non-scratch group. + // bundles plus this non-scratch bundle. auto sg_list = get_scratch_deps(); sg_list.push_back(this); @@ -70,7 +70,7 @@ namespace yask { // This should be nested within a top-level OpenMP task. _generic_context->set_block_threads(); - // Loop through all the needed groups. + // Loop through all the needed bundles. for (auto* sg : sg_list) { // Indices needed for the generated loops. Will normally be a @@ -78,7 +78,7 @@ namespace yask { ScanIndices block_idxs = sg->adjust_scan(thread_idx, def_block_idxs); TRACE_MSG3("calc_block: " << - " in group '" << sg->get_name() << "': " << + " in bundle '" << sg->get_name() << "': " << block_idxs.begin.makeValStr(ndims) << " ... 
(end before) " << block_idxs.end.makeValStr(ndims) << " by thread " << thread_idx); @@ -93,7 +93,7 @@ namespace yask { // Normalize the indices, i.e., divide by vector len in each dim. // Ranks offsets must already be subtracted. // Each dim in 'orig' must be a multiple of corresponding vec len. - void StencilGroupBase::normalize_indices(const Indices& orig, Indices& norm) const { + void StencilBundleBase::normalize_indices(const Indices& orig, Indices& norm) const { auto* cp = _generic_context; auto dims = cp->get_dims(); int nsdims = dims->_stencil_dims.size(); @@ -124,7 +124,7 @@ namespace yask { // The index ranges in 'block_idxs' are sub-divided // into full vector-clusters, full vectors, and sub-vectors // and finally evaluated by the YASK-compiler-generated loops. - void StencilGroupBase::calc_sub_block(int thread_idx, + void StencilBundleBase::calc_sub_block(int thread_idx, const ScanIndices& block_idxs) { auto* cp = _generic_context; auto opts = cp->get_settings(); @@ -134,7 +134,7 @@ namespace yask { auto& step_dim = dims->_step_dim; auto step_posn = Indices::step_posn; TRACE_MSG3("calc_sub_block:" << - " in group '" << get_name() << "': " << + " in bundle '" << get_name() << "': " << block_idxs.start.makeValStr(nsdims) << " ... (end before) " << block_idxs.stop.makeValStr(nsdims)); @@ -467,7 +467,7 @@ namespace yask { // Define misc-loop function. // If point is in sub-domain for this - // group, then evaluate the reference scalar code. + // bundle, then evaluate the reference scalar code. // If no holes, don't need to check each point in domain. // Since step is always 1, we ignore misc_idxs.stop. #define misc_fn(misc_idxs) do { \ @@ -505,7 +505,7 @@ namespace yask { // The 'loop_idxs' must specify a range only in the inner dim. // Indices must be rank-relative. // Indices must be normalized, i.e., already divided by VLEN_*. 
- void StencilGroupBase::calc_loop_of_clusters(int thread_idx, + void StencilBundleBase::calc_loop_of_clusters(int thread_idx, const ScanIndices& loop_idxs) { auto* cp = _generic_context; auto dims = cp->get_dims(); @@ -541,7 +541,7 @@ namespace yask { // The 'loop_idxs' must specify a range only in the inner dim. // Indices must be rank-relative. // Indices must be normalized, i.e., already divided by VLEN_*. - void StencilGroupBase::calc_loop_of_vectors(int thread_idx, + void StencilBundleBase::calc_loop_of_vectors(int thread_idx, const ScanIndices& loop_idxs, idx_t write_mask) { auto* cp = _generic_context; @@ -571,11 +571,11 @@ namespace yask { calc_loop_of_vectors(thread_idx, start_idxs, stop_inner, write_mask); } - // If this group is updating scratch grid(s), + // If this bundle is updating scratch grid(s), // expand indices to calculate values in halo. // This will often change vec-len aligned indices to non-aligned. // Return adjusted indices. - ScanIndices StencilGroupBase::adjust_scan(int thread_idx, const ScanIndices& idxs) const { + ScanIndices StencilBundleBase::adjust_scan(int thread_idx, const ScanIndices& idxs) const { ScanIndices adj_idxs(idxs); auto* cp = _generic_context; @@ -583,7 +583,7 @@ namespace yask { int nsdims = dims->_stencil_dims.size(); auto step_posn = Indices::step_posn; - // Loop thru vecs of scratch grids for this group. + // Loop thru vecs of scratch grids for this bundle. for (auto* sv : outputScratchVecs) { assert(sv); @@ -626,8 +626,8 @@ namespace yask { return adj_idxs; } - // Set the bounding-box vars for this group in this rank. - void StencilGroupBase::find_bounding_box() { + // Set the bounding-box vars for this bundle in this rank. + void StencilBundleBase::find_bounding_box() { StencilContext& context = *_generic_context; ostream& os = context.get_ostr(); auto settings = context.get_settings(); @@ -658,7 +658,7 @@ namespace yask { misc_idxs.end = end; // Define misc-loop function. 
Since step is always 1, we ignore - // misc_stop. Update only if point is in domain for this group. + // misc_stop. Update only if point is in domain for this bundle. #define misc_fn(misc_idxs) do { \ if (is_in_valid_domain(misc_idxs.start)) { \ min_pts = min_pts.minElements(misc_idxs.start); \ diff --git a/src/kernel/lib/stencil_calc.hpp b/src/kernel/lib/stencil_calc.hpp index 374ebd8b..830d3746 100644 --- a/src/kernel/lib/stencil_calc.hpp +++ b/src/kernel/lib/stencil_calc.hpp @@ -27,11 +27,11 @@ IN THE SOFTWARE. namespace yask { - /// Classes that support evaluation of one stencil group. - /// A stencil context contains one or more groups. + /// Classes that support evaluation of one stencil bundle. + /// A stencil context contains one or more bundles. - // A pure-virtual class base for a stencil group. - class StencilGroupBase : public BoundingBox { + // A pure-virtual class base for a stencil bundle. + class StencilBundleBase : public BoundingBox { protected: StencilContext* _generic_context = 0; std::string _name; @@ -42,12 +42,12 @@ namespace yask { // Position of inner dim in stencil-dims tuple. int _inner_posn = 0; - // Other groups that this one depends on. - std::map _depends_on; + // Other bundles that this one depends on. + std::map _depends_on; - // List of scratch-grid groups that need to be evaluated - // before this group. Listed in eval order first-to-last. - StencilGroupList _scratch_deps; + // List of scratch-grid bundles that need to be evaluated + // before this bundle. Listed in eval order first-to-last. + StencilBundleList _scratch_deps; // Whether this updates scratch grid(s); bool _is_scratch = false; @@ -71,7 +71,7 @@ namespace yask { ScratchVecs inputScratchVecs; // ctor, dtor. - StencilGroupBase(StencilContext* context) : + StencilBundleBase(StencilContext* context) : _generic_context(context) { // Make sure map entries exist. 
@@ -91,7 +91,7 @@ namespace yask { } } - virtual ~StencilGroupBase() { } + virtual ~StencilBundleBase() { } // Access to dims and MPI info. virtual DimsPtr get_dims() const { @@ -101,7 +101,7 @@ namespace yask { return _generic_context->get_mpi_info(); } - // Get name of this group. + // Get name of this bundle. virtual const std::string& get_name() const { return _name; } // Get estimated number of FP ops done for one scalar eval. @@ -116,32 +116,32 @@ namespace yask { virtual void set_scratch(bool is_scratch) { _is_scratch = is_scratch; } // Add dependency. - virtual void add_dep(DepType dt, StencilGroupBase* eg) { + virtual void add_dep(DepType dt, StencilBundleBase* eg) { _depends_on.at(dt).insert(eg); } // Get dependencies. - virtual const StencilGroupSet& get_deps(DepType dt) const { + virtual const StencilBundleSet& get_deps(DepType dt) const { return _depends_on.at(dt); } - // Add needed scratch-group. - virtual void add_scratch_dep(StencilGroupBase* eg) { + // Add needed scratch-bundle. + virtual void add_scratch_dep(StencilBundleBase* eg) { _scratch_deps.push_back(eg); } - // Get needed scratch-group(s). - virtual const StencilGroupList& get_scratch_deps() const { + // Get needed scratch-bundle(s). + virtual const StencilBundleList& get_scratch_deps() const { return _scratch_deps; } - // If this group is updating scratch grid(s), + // If this bundle is updating scratch grid(s), // expand indices to calculate values in halo. // Adjust offsets in grids based on original idxs. // Return adjusted indices. virtual ScanIndices adjust_scan(int thread_idx, const ScanIndices& idxs) const; - // Set the bounding-box vars for this group in this rank. + // Set the bounding-box vars for this bundle in this rank. virtual void find_bounding_box(); // Determine whether indices are in [sub-]domain. 
diff --git a/src/kernel/swig/yask_kernel_api.i b/src/kernel/swig/yask_kernel_api.i index a4a200aa..2da88f25 100644 --- a/src/kernel/swig/yask_kernel_api.i +++ b/src/kernel/swig/yask_kernel_api.i @@ -43,6 +43,7 @@ IN THE SOFTWARE. %shared_ptr(yask::yk_settings) %shared_ptr(yask::yk_solution) %shared_ptr(yask::yk_grid) +%shared_ptr(yask::yk_stencil_group) %shared_ptr(yask::yk_stats) // Mutable buffer to access raw data. From f73854289eb2c214089603184c0384f0b5a9e292 Mon Sep 17 00:00:00 2001 From: Chuck Yount Date: Tue, 17 Apr 2018 17:45:29 -0700 Subject: [PATCH 02/21] Fix grid test to work with recent numa change. --- src/kernel/tests/grid_test.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/kernel/tests/grid_test.cpp b/src/kernel/tests/grid_test.cpp index adf2c488..57158efa 100644 --- a/src/kernel/tests/grid_test.cpp +++ b/src/kernel/tests/grid_test.cpp @@ -66,10 +66,10 @@ int main(int argc, char** argv) { os << "0-D test...\n"; GridDimNames gdims; string name = "test grid"; - YkGridPtr g0 = make_shared>(dims, name, gdims, settings, &osp); + YkGridPtr g0 = make_shared>(dims, name, gdims, &settings, &osp); g0->alloc_storage(); os << g0->make_info_string() << endl; - YkGridPtr g1 = make_shared>(dims, name, gdims, settings, &osp); + YkGridPtr g1 = make_shared>(dims, name, gdims, &settings, &osp); g1->alloc_storage(); os << g1->make_info_string() << endl; @@ -87,8 +87,8 @@ int main(int argc, char** argv) { os << "3-D test...\n"; GridDimNames gdims = {"x", "y", "z"}; string name = "test grid"; - YkGridPtr g3 = make_shared>(dims, name, gdims, settings, &osp); - YkGridPtr g3f = make_shared>(dims, name, gdims, settings, &osp); + YkGridPtr g3 = make_shared>(dims, name, gdims, &settings, &osp); + YkGridPtr g3f = make_shared>(dims, name, gdims, &settings, &osp); int i = 0; int min_pad = 3; for (auto dname : gdims) { From 3a628e074fff3b3a68aedf72bf97fbd940c0bcde Mon Sep 17 00:00:00 2001 From: Chuck Yount Date: Fri, 20 Apr 2018 08:43:40 -0700 
Subject: [PATCH 03/21] Re-enable surface conditions in AWE Elastic stencil. v2.05.09. Fix some issues with combos of sub-domains, scratch grids, and MPI. --- src/common/common_utils.cpp | 2 +- src/compiler/lib/Eqs.cpp | 19 +- src/kernel/lib/context.cpp | 250 ++++++++++++------- src/kernel/lib/context.hpp | 1 - src/kernel/lib/settings.hpp | 1 + src/kernel/yask_main.cpp | 4 +- src/stencils/AwpElasticStencil.hpp | 379 ++++++++++++++++------------- 7 files changed, 379 insertions(+), 277 deletions(-) diff --git a/src/common/common_utils.cpp b/src/common/common_utils.cpp index ef7d44bd..0e64ad0f 100644 --- a/src/common/common_utils.cpp +++ b/src/common/common_utils.cpp @@ -41,7 +41,7 @@ namespace yask { // for numbers above 9 (at least up to 99). // Format: "major.minor.patch". - const string version = "2.05.08"; + const string version = "2.05.09"; string yask_get_version_string() { return version; diff --git a/src/compiler/lib/Eqs.cpp b/src/compiler/lib/Eqs.cpp index 0a8acac3..f50285e7 100644 --- a/src/compiler/lib/Eqs.cpp +++ b/src/compiler/lib/Eqs.cpp @@ -634,6 +634,8 @@ namespace yask { g->updateConstIndices(ap->getArgConsts()); } + // We want to start with non-scratch eqs and walk the dep + // tree to find all dependent scratch eqs. // If 'eq1' has a non-scratch output, visit all dependencies of // 'eq1'. It's important to visit the eqs in dep order to // properly propagate halos sizes thru chains of scratch grids. @@ -643,18 +645,15 @@ namespace yask { // 'eq1' is 'b' or depends on 'b', immediately or indirectly. (eq1, [&](EqualsExprPtr b, EqDeps::EqVecSet& path) { - // Only check if conditions are same. - auto cond1 = getCond(eq1); - auto cond2 = getCond(b); - bool same_cond = areExprsSame(cond1, cond2); - // Does 'b' have a scratch-grid output? + // NB: scratch eqs don't have conditions, so + // we don't need to check them. 
auto* og2 = pv.getOutputGrids().at(b.get()); - if (same_cond && og2->isScratch()) { + if (og2->isScratch()) { // Get halos from the output scratch grid. // These are the points that are read from - // in dependent eq(s). + // the dependent eq(s). // For scratch grids, the halo areas must also be written to. auto _left_ohalo = og2->getHaloSizes(true); auto _right_ohalo = og2->getHaloSizes(false); @@ -679,12 +678,6 @@ namespace yask { EqualsExprPtr prev; for (auto eq2 : path) { - // Only continue if conditions are same. - auto cond1 = getCond(eq1); - auto cond2 = getCond(eq2); - if (!areExprsSame(cond1, cond2)) - break; - // Look for scratch-grid dep from 'prev' to 'eq2'. if (prev) { diff --git a/src/kernel/lib/context.cpp b/src/kernel/lib/context.cpp index 3d724692..ee358044 100644 --- a/src/kernel/lib/context.cpp +++ b/src/kernel/lib/context.cpp @@ -224,40 +224,56 @@ namespace yask { rank_idxs.step[step_posn] = step_t; // Loop thru groups. - // For this reference-code implementation, we - // will do all stencil groups at this level, - // even scratch-grid ones. - for (auto* sg : stGroups) { + for (auto* asg : stGroups) { + // Don't do scratch updates here. + if (asg->is_scratch()) + continue; + + // Scan through n-D space. + TRACE_MSG("calc_rank_ref: step " << start_t << + " in non-scratch group '" << asg->get_name()); + // Exchange all dirty halos. exchange_halos_all(); - // Indices needed for the generated misc loops. Will normally be a - // copy of rank_idxs except when updating scratch-grids. - ScanIndices misc_idxs = sg->adjust_scan(scratch_grid_idx, rank_idxs); - misc_idxs.step.setFromConst(1); // ensure unit step. + // Find the groups that need to be processed. + // This will be the prerequisite scratch-grid + // groups plus this non-scratch group. + auto sg_list = asg->get_scratch_deps(); + sg_list.push_back(asg); + + // Loop through all the needed groups. + for (auto* sg : sg_list) { + + // Indices needed for the generated misc loops. 
Will normally be a + // copy of rank_idxs except when updating scratch-grids. + ScanIndices misc_idxs = sg->adjust_scan(scratch_grid_idx, rank_idxs); + misc_idxs.step.setFromConst(1); // ensure unit step. - // Define misc-loop function. Since step is always 1, we - // ignore misc_stop. If point is in sub-domain for this - // group, then evaluate the reference scalar code. + // Define misc-loop function. Since step is always 1, we + // ignore misc_stop. If point is in sub-domain for this + // group, then evaluate the reference scalar code. + // TODO: fix domain of scratch grids. #define misc_fn(misc_idxs) do { \ - if (sg->is_in_valid_domain(misc_idxs.start)) \ - sg->calc_scalar(scratch_grid_idx, misc_idxs.start); \ - } while(0) + if (sg->is_in_valid_domain(misc_idxs.start)) \ + sg->calc_scalar(scratch_grid_idx, misc_idxs.start); \ + } while(0) - // Scan through n-D space. - TRACE_MSG("calc_rank_ref: step " << start_t << - " in group '" << sg->get_name() << "': " << - misc_idxs.begin.makeValStr(ndims) << - " ... (end before) " << misc_idxs.end.makeValStr(ndims)); + // Scan through n-D space. + TRACE_MSG("calc_rank_ref: step " << start_t << + " in group '" << sg->get_name() << "': " << + misc_idxs.begin.makeValStr(ndims) << + " ... (end before) " << misc_idxs.end.makeValStr(ndims)); #include "yask_misc_loops.hpp" #undef misc_fn - + } // groups in chain. + // Remember grids that have been written to by this group, // updated at next step (+/- 1). - mark_grids_dirty(start_t + step_t, stop_t + step_t, *sg); + mark_grids_dirty(start_t + step_t, stop_t + step_t, *asg); - } // groups. + } // all groups. } // iterations. // Final halo exchange. @@ -471,6 +487,9 @@ namespace yask { } run_solution(first_t, last_t); + + // Final halo exchange. + exchange_halos_all(); } // Calculate results within a region. @@ -1403,62 +1422,64 @@ namespace yask { // Adjust along domain dims in this grid. 
for (auto& dim : _dims->_domain_dims.getDims()) { auto& dname = dim.getName(); + if (gp->is_dim_used(dname)) { - // Init range to whole rank domain (including - // outer halos). These may be changed below - // depending on the neighbor's direction. - copy_begin[dname] = first_outer_idx[dname]; - copy_end[dname] = last_outer_idx[dname] + 1; // end = last + 1. + // Init range to whole rank domain (including + // outer halos). These may be changed below + // depending on the neighbor's direction. + copy_begin[dname] = first_outer_idx[dname]; + copy_end[dname] = last_outer_idx[dname] + 1; // end = last + 1. - // Neighbor direction in this dim. - auto neigh_ofs = neigh_offsets[dname]; + // Neighbor direction in this dim. + auto neigh_ofs = neigh_offsets[dname]; - // Region to read from, i.e., data from inside - // this rank's domain to be put into neighbor's - // halo. - if (bd == MPIBufs::bufSend) { + // Region to read from, i.e., data from inside + // this rank's domain to be put into neighbor's + // halo. + if (bd == MPIBufs::bufSend) { - // Neighbor is to the left. - if (neigh_ofs == idx_t(MPIInfo::rank_prev)) { + // Neighbor is to the left. + if (neigh_ofs == idx_t(MPIInfo::rank_prev)) { - // Only read slice as wide as halo from beginning. - copy_end[dname] = first_inner_idx[dname] + neigh_halo_sizes[dname]; - } + // Only read slice as wide as halo from beginning. + copy_end[dname] = first_inner_idx[dname] + neigh_halo_sizes[dname]; + } - // Neighbor is to the right. - else if (neigh_ofs == idx_t(MPIInfo::rank_next)) { + // Neighbor is to the right. + else if (neigh_ofs == idx_t(MPIInfo::rank_next)) { - // Only read slice as wide as halo before end. - copy_begin[dname] = last_inner_idx[dname] + 1 - neigh_halo_sizes[dname]; - } + // Only read slice as wide as halo before end. + copy_begin[dname] = last_inner_idx[dname] + 1 - neigh_halo_sizes[dname]; + } - // Else, this neighbor is in same posn as I am in this dim, - // so we leave the default begin/end settings. 
- } + // Else, this neighbor is in same posn as I am in this dim, + // so we leave the default begin/end settings. + } - // Region to write to, i.e., into this rank's halo. - else if (bd == MPIBufs::bufRecv) { + // Region to write to, i.e., into this rank's halo. + else if (bd == MPIBufs::bufRecv) { - // Neighbor is to the left. - if (neigh_ofs == idx_t(MPIInfo::rank_prev)) { + // Neighbor is to the left. + if (neigh_ofs == idx_t(MPIInfo::rank_prev)) { - // Only read slice as wide as halo before beginning. - copy_begin[dname] = first_inner_idx[dname] - my_halo_sizes[dname]; - copy_end[dname] = first_inner_idx[dname]; - } + // Only read slice as wide as halo before beginning. + copy_begin[dname] = first_inner_idx[dname] - my_halo_sizes[dname]; + copy_end[dname] = first_inner_idx[dname]; + } - // Neighbor is to the right. - else if (neigh_ofs == idx_t(MPIInfo::rank_next)) { + // Neighbor is to the right. + else if (neigh_ofs == idx_t(MPIInfo::rank_next)) { - // Only read slice as wide as halo after end. - copy_begin[dname] = last_inner_idx[dname] + 1; - copy_end[dname] = last_inner_idx[dname] + 1 + my_halo_sizes[dname]; - } + // Only read slice as wide as halo after end. + copy_begin[dname] = last_inner_idx[dname] + 1; + copy_end[dname] = last_inner_idx[dname] + 1 + my_halo_sizes[dname]; + } - // Else, this neighbor is in same posn as I am in this dim, - // so we leave the default begin/end settings. - } - } // domain dims in this grid. + // Else, this neighbor is in same posn as I am in this dim, + // so we leave the default begin/end settings. + } + } // domain dims in this grid. + } // domain dims. // Sizes of buffer in all dims of this grid. // Also, set begin/end value for non-domain dims. @@ -2286,6 +2307,7 @@ namespace yask { // Exchange dirty halo data for all grids and all steps, regardless // of their stencil-group. + // TODO: loop through all grids in exchange_halos() instead. 
void StencilContext::exchange_halos_all() { #ifdef USE_MPI @@ -2305,6 +2327,8 @@ namespace yask { for (auto* sg : stGroups) { // Do exchange over max steps. + // Steps that don't exist in a particular grid or + // steps that are clean will be skipped. exchange_halos(start, stop, *sg); } #endif @@ -2318,20 +2342,17 @@ namespace yask { #ifdef USE_MPI if (!enable_halo_exchange || _env->num_ranks < 2) return; + + // Don't exchange for scratch groups. + if (sg.is_scratch()) + return; + mpi_time.start(); TRACE_MSG("exchange_halos: " << start << " ... (end before) " << stop << " for eq-group '" << sg.get_name() << "'"); auto opts = get_settings(); auto& sd = _dims->_step_dim; - // 1D array to store send request handles. - // We use a 1D array so we can call MPI_Waitall(). - MPI_Request send_reqs[sg.inputGridPtrs.size() * _mpiInfo->neighborhood_size]; - - // 2D array for receive request handles. - // We use a 2D array to simplify individual indexing. - MPI_Request recv_reqs[sg.inputGridPtrs.size()][_mpiInfo->neighborhood_size]; - // Loop through steps. This loop has to be outside halo-step loop // because we only have one buffer per step. Normally, we only // exchange one step; in that case, it doesn't matter. It would be more @@ -2340,10 +2361,57 @@ namespace yask { assert(start != stop); idx_t step = (start < stop) ? 1 : -1; for (idx_t t = start; t != stop; t += step) { - int num_send_reqs = 0; + + // Get list of grids that need to be swapped. + // Use an ordered map to make sure grids are in + // same order on all ranks. + GridPtrMap gridsToSwap; + + // Find the groups that need to be processed. + // This will be the prerequisite scratch-grid + // groups plus this non-scratch group. + auto sg_list = sg.get_scratch_deps(); + sg_list.push_back(&sg); + + // Loop through all the needed groups. + for (auto* csg : sg_list) { + + // Loop thru all *input* grids in this group. + for (auto gp : csg->inputGridPtrs) { + + // Don't swap scratch grids. 
+ if (gp->is_scratch()) + continue; + + // Only need to swap grids whose halos are not up-to-date + // for this step. + if (!gp->is_dirty(t)) + continue; + + // Only need to swap grids that have MPI buffers. + auto& gname = gp->get_name(); + if (mpiData.count(gname) == 0) + continue; + + // Swap this grid. + gridsToSwap[gname] = gp; + } + } + TRACE_MSG("exchange_halos: need to exchange halos for " << + gridsToSwap.size() << " grid(s)"); + + // 1D array to store send request handles. + // We use a 1D array so we can call MPI_Waitall(). + MPI_Request send_reqs[gridsToSwap.size() * _mpiInfo->neighborhood_size]; + + // 2D array for receive request handles. + // We use a 2D array to simplify individual indexing. + MPI_Request recv_reqs[gridsToSwap.size()][_mpiInfo->neighborhood_size]; // Sequence of things to do for each grid's neighbors // (isend includes packing). + int num_send_reqs = 0; + int num_recv_reqs = 0; enum halo_steps { halo_irecv, halo_pack_isend, halo_unpack, halo_nsteps }; for (int halo_step = 0; halo_step < halo_nsteps; halo_step++) { @@ -2353,21 +2421,15 @@ namespace yask { TRACE_MSG("exchange_halos: packing and sending data for step " << t << "..."); else if (halo_step == halo_unpack) TRACE_MSG("exchange_halos: unpacking data for step " << t << "..."); - - // Loop thru all input grids in this group. - for (size_t gi = 0; gi < sg.inputGridPtrs.size(); gi++) { - auto gp = sg.inputGridPtrs[gi]; - MPI_Request* grid_recv_reqs = recv_reqs[gi]; - // Only need to swap grids whose halos are not up-to-date - // for this step. - if (!gp->is_dirty(t)) - continue; - - // Only need to swap grids that have MPI buffers. - auto& gname = gp->get_name(); - if (mpiData.count(gname) == 0) - continue; + // Loop thru all grids to swap. + // Use 'gi' as a unique MPI index. 
+ int gi = -1; + for (auto gtsi : gridsToSwap) { + auto& gname = gtsi.first; + auto gp = gtsi.second; + gi++; + MPI_Request* grid_recv_reqs = recv_reqs[gi]; TRACE_MSG(" for grid '" << gname << "'..."); // Visit all this rank's neighbors. @@ -2375,7 +2437,7 @@ namespace yask { grid_mpi_data.visitNeighbors ([&](const IdxTuple& offsets, // NeighborOffset. int neighbor_rank, - int ni, // 1D index. + int ni, // unique neighbor index. MPIBufs& bufs) { auto& sendBuf = bufs.bufs[MPIBufs::bufSend]; auto& recvBuf = bufs.bufs[MPIBufs::bufRecv]; @@ -2400,6 +2462,7 @@ namespace yask { TRACE_MSG(" requesting " << makeByteStr(nbytes) << "..."); MPI_Irecv(buf, nbytes, MPI_BYTE, neighbor_rank, int(gi), _env->comm, &grid_recv_reqs[ni]); + num_recv_reqs++; } } @@ -2454,7 +2517,7 @@ namespace yask { if (nbytes) { // Wait for data from neighbor before unpacking it. - TRACE_MSG(" waiting for MPI data..."); + TRACE_MSG(" waiting for " << makeByteStr(nbytes) << "..."); MPI_Wait(&grid_recv_reqs[ni], MPI_STATUS_IGNORE); // Vec ok? @@ -2491,16 +2554,19 @@ namespace yask { } // exchange sequence. // Mark grids as up-to-date. - for (size_t gi = 0; gi < sg.inputGridPtrs.size(); gi++) { - auto gp = sg.inputGridPtrs[gi]; + for (auto gtsi : gridsToSwap) { + auto& gname = gtsi.first; + auto gp = gtsi.second; if (gp->is_dirty(t)) { gp->set_dirty(false, t); - TRACE_MSG("grid '" << gp->get_name() << + TRACE_MSG("grid '" << gname << "' marked as clean at step " << t); } } // Wait for all send requests to complete. 
+ TRACE_MSG("exchange_halos: " << num_recv_reqs << + " MPI receive request(s) completed"); if (num_send_reqs) { TRACE_MSG("exchange_halos: waiting for " << num_send_reqs << " MPI send request(s) to complete..."); diff --git a/src/kernel/lib/context.hpp b/src/kernel/lib/context.hpp index 16591148..900fc8a3 100644 --- a/src/kernel/lib/context.hpp +++ b/src/kernel/lib/context.hpp @@ -97,7 +97,6 @@ namespace yask { class StencilGroupBase; typedef std::vector StencilGroupList; typedef std::set StencilGroupSet; - typedef std::map GridPtrMap; // Data and hierarchical sizes. // This is a pure-virtual class that must be implemented diff --git a/src/kernel/lib/settings.hpp b/src/kernel/lib/settings.hpp index a197e8f6..f9d305e6 100644 --- a/src/kernel/lib/settings.hpp +++ b/src/kernel/lib/settings.hpp @@ -340,6 +340,7 @@ namespace yask { typedef std::shared_ptr YkGridPtr; typedef std::set GridPtrSet; typedef std::vector GridPtrs; + typedef std::map GridPtrMap; typedef std::vector ScratchVecs; // Environmental settings. diff --git a/src/kernel/yask_main.cpp b/src/kernel/yask_main.cpp index d5b96658..04063de2 100644 --- a/src/kernel/yask_main.cpp +++ b/src/kernel/yask_main.cpp @@ -343,7 +343,7 @@ int main(int argc, char** argv) best_elapsed_time = stats->get_elapsed_run_secs(); } } - + os << divLine << "best-elapsed-time (sec): " << makeNumStr(best_elapsed_time) << endl << "best-throughput (num-writes/sec): " << makeNumStr(best_apps) << endl << @@ -407,9 +407,11 @@ int main(int argc, char** argv) cerr << "This is not uncommon for low-precision FP; try with 8-byte reals." << endl; ok = false; } + ref_soln->end_solution(); } else os << "\nRESULTS NOT VERIFIED.\n"; + ksoln->end_solution(); kenv->global_barrier(); if (!ok) diff --git a/src/stencils/AwpElasticStencil.hpp b/src/stencils/AwpElasticStencil.hpp index 8ecb9b6e..888a03b5 100644 --- a/src/stencils/AwpElasticStencil.hpp +++ b/src/stencils/AwpElasticStencil.hpp @@ -35,8 +35,10 @@ IN THE SOFTWARE. 
//#define FULL_SPONGE_GRID // Set the following macro to calculate free-surface boundary values. -// This feature is currently under development. -//#define DO_SURFACE +#define DO_ABOVE_SURFACE + +// Set the following macro to use intermediate scratch grids. +#define USE_SCRATCH_GRIDS #include "Soln.hpp" @@ -86,6 +88,25 @@ class AwpElasticStencil : public StencilBase { MAKE_SCALAR(delta_t); MAKE_SCALAR(h); + // For the surface stress conditions, we need to write into 2 points + // above the surface. Since we can only write into the "domain", we + // will define the surface index to be 2 points before the last domain + // index. Thus, there will be two layers in the domain above the surface. +#define SURFACE_IDX (last_index(z) - 2) + + // Define some sub-domains related to the surface. +#define IF_BELOW_SURFACE IF (z < SURFACE_IDX) +#define IF_AT_SURFACE IF (z == SURFACE_IDX) +#define IF_AT_OR_BELOW_SURFACE IF (z <= SURFACE_IDX) +#define IF_ONE_ABOVE_SURFACE IF (z == SURFACE_IDX + 1) +#define IF_TWO_ABOVE_SURFACE IF (z == SURFACE_IDX + 2) + +#ifdef USE_SCRATCH_GRIDS + MAKE_SCRATCH_GRID(tmp_vel_x, x, y, z); + MAKE_SCRATCH_GRID(tmp_vel_y, x, y, z); + MAKE_SCRATCH_GRID(tmp_vel_z, x, y, z); +#endif + public: AwpElasticStencil(StencilList& stencils) : @@ -107,7 +128,7 @@ class AwpElasticStencil : public StencilBase { // time or space, so half-steps due to staggered grids are adjusted // appropriately. - void define_vel_x(Condition at_last_z) { + GridValue get_next_vel_x(GridIndex x, GridIndex y, GridIndex z) { GridValue rho_val = (rho(x, y, z ) + rho(x, y-1, z ) + rho(x, y, z-1) + @@ -122,10 +143,10 @@ class AwpElasticStencil : public StencilBase { GridValue next_vel_x = vel_x(t, x, y, z) + (delta_t / (h * rho_val)) * d_val; adjust_for_sponge(next_vel_x); - // define the value at t+1. - vel_x(t+1, x, y, z) EQUALS next_vel_x; + // Return the value at t+1. 
+ return next_vel_x; } - void define_vel_y(Condition at_last_z) { + GridValue get_next_vel_y(GridIndex x, GridIndex y, GridIndex z) { GridValue rho_val = (rho(x, y, z ) + rho(x+1, y, z ) + rho(x, y, z-1) + @@ -140,10 +161,10 @@ class AwpElasticStencil : public StencilBase { GridValue next_vel_y = vel_y(t, x, y, z) + (delta_t / (h * rho_val)) * d_val; adjust_for_sponge(next_vel_y); - // define the value at t+1. - vel_y(t+1, x, y, z) EQUALS next_vel_y; + // Return the value at t+1. + return next_vel_y; } - void define_vel_z(Condition at_last_z) { + GridValue get_next_vel_z(GridIndex x, GridIndex y, GridIndex z) { GridValue rho_val = (rho(x, y, z) + rho(x+1, y, z) + rho(x, y-1, z) + @@ -158,42 +179,100 @@ class AwpElasticStencil : public StencilBase { GridValue next_vel_z = vel_z(t, x, y, z) + (delta_t / (h * rho_val)) * d_val; adjust_for_sponge(next_vel_z); - // define the value at t+1. - vel_z(t+1, x, y, z) EQUALS next_vel_z; + // Return the value at t+1. + return next_vel_z; } // Free-surface boundary equations for velocity. - void define_free_surface_vel(Condition at_last_z) { + void define_free_surface_vel() { + + // Since we're defining points when z == surface + 1, + // the surface itself will be at z - 1; + GridIndex surf = z - 1; + +#ifdef USE_SCRATCH_GRIDS + + // The values for velocity at t+1 will be needed + // in multiple free-surface calculations. + // Thus, it will reduce the number of FP ops + // required if we pre-compute them and store them + // in scratch grids. +#define VEL_X tmp_vel_x +#define VEL_Y tmp_vel_y +#define VEL_Z tmp_vel_z + VEL_X(x, y, z) EQUALS get_next_vel_x(x, y, z); + VEL_Y(x, y, z) EQUALS get_next_vel_y(x, y, z); + VEL_Z(x, y, z) EQUALS get_next_vel_z(x, y, z); + +#else + + // If not using scratch grids, just call the + // functions to calculate each value of velocity + // at t+1 every time it's needed. 
+#define VEL_X get_next_vel_x +#define VEL_Y get_next_vel_y +#define VEL_Z get_next_vel_z +#endif - // Following expressions are valid only when z == last value in domain. - // Note that values beyond the last index are updated, i.e., in the halo. - // A couple of intermediate values. - GridValue d_x_val = vel_x(t+1, x+1, y, z) - - (vel_z(t+1, x+1, y, z) - vel_z(t+1, x, y, z)); - GridValue d_y_val = vel_y(t+1, x, y-1, z) - - (vel_z(t+1, x, y, z) - vel_z(t+1, x, y-1, z)); + GridValue d_x_val = VEL_X(x+1, y, surf) - + (VEL_Z(x+1, y, surf) - VEL_Z(x, y, surf)); + GridValue d_y_val = VEL_Y(x, y-1, surf) - + (VEL_Z(x, y, surf) - VEL_Z(x, y-1, surf)); - // Following values are valid at the free surface. - GridValue plus1_vel_x = vel_x(t+1, x, y, z) - - (vel_z(t+1, x, y, z) - vel_z(t+1, x-1, y, z)); - GridValue plus1_vel_y = vel_y(t+1, x, y, z) - - (vel_z(t+1, x, y+1, z) - vel_z(t+1, x, y, z)); - GridValue plus1_vel_z = vel_z(t+1, x, y, z) - + // Following values are valid one layer above the free surface. + GridValue plus1_vel_x = VEL_X(x, y, surf) - + (VEL_Z(x, y, surf) - VEL_Z(x-1, y, surf)); + GridValue plus1_vel_y = VEL_Y(x, y, surf) - + (VEL_Z(x, y+1, surf) - VEL_Z(x, y, surf)); + GridValue plus1_vel_z = VEL_Z(x, y, surf) - ((d_x_val - plus1_vel_x) + - (vel_x(t+1, x+1, y, z) - vel_x(t+1, x, y, z)) + + (VEL_X(x+1, y, surf) - VEL_X(x, y, surf)) + (plus1_vel_y - d_y_val) + - (vel_y(t+1, x, y, z) - vel_y(t+1, x, y-1, z))) / - ((mu(x, y, z) * - (2.0 / mu(x, y, z) + 1.0 / lambda(x, y, z)))); - - // Define equivalencies to be valid only when z == last value in domain. - vel_x(t+1, x, y, z+1) EQUALS plus1_vel_x - IF at_last_z; - vel_y(t+1, x, y, z+1) EQUALS plus1_vel_y - IF at_last_z; - vel_z(t+1, x, y, z+1) EQUALS plus1_vel_z - IF at_last_z; + (VEL_Y(x, y, surf) - VEL_Y(x, y-1, surf))) / + ((mu(x, y, surf) * + (2.0 / mu(x, y, surf) + 1.0 / lambda(x, y, surf)))); +#undef VEL_X +#undef VEL_Y +#undef VEL_Z + + // Define layer at one point above surface. 
+ vel_x(t+1, x, y, z) EQUALS plus1_vel_x IF_ONE_ABOVE_SURFACE; + vel_y(t+1, x, y, z) EQUALS plus1_vel_y IF_ONE_ABOVE_SURFACE; + vel_z(t+1, x, y, z) EQUALS plus1_vel_z IF_ONE_ABOVE_SURFACE; + + // Define layer two points above surface for completeness, even + // though these aren't input to any stencils. + vel_x(t+1, x, y, z) EQUALS 0.0 IF_TWO_ABOVE_SURFACE; + vel_y(t+1, x, y, z) EQUALS 0.0 IF_TWO_ABOVE_SURFACE; + vel_z(t+1, x, y, z) EQUALS 0.0 IF_TWO_ABOVE_SURFACE; + } + + // Compute average of 8 neighbors. + GridValue ave8(Grid& g, GridIndex x, GridIndex y, GridIndex z) { + + return 8.0 / + (g(x, y, z ) + g(x+1, y, z ) + + g(x, y-1, z ) + g(x+1, y-1, z ) + + g(x, y, z-1) + g(x+1, y, z-1) + + g(x, y-1, z-1) + g(x+1, y-1, z-1)); + } + + // Some common velocity calculations. + GridValue d_x_val(GridIndex x, GridIndex y, GridIndex z) { + return + c1 * (vel_x(t+1, x+1, y, z ) - vel_x(t+1, x, y, z )) + + c2 * (vel_x(t+1, x+2, y, z ) - vel_x(t+1, x-1, y, z )); + } + GridValue d_y_val(GridIndex x, GridIndex y, GridIndex z) { + return + c1 * (vel_y(t+1, x, y, z ) - vel_y(t+1, x, y-1, z )) + + c2 * (vel_y(t+1, x, y+1, z ) - vel_y(t+1, x, y-2, z )); + } + GridValue d_z_val(GridIndex x, GridIndex y, GridIndex z) { + return + c1 * (vel_z(t+1, x, y, z ) - vel_z(t+1, x, y, z-1)) + + c2 * (vel_z(t+1, x, y, z+1) - vel_z(t+1, x, y, z-2)); } // Stress-grid define functions. For each D in xx, yy, zz, xy, xz, yz, @@ -204,33 +283,43 @@ class AwpElasticStencil : public StencilBase { // space, so half-steps due to staggered grids are adjusted // appropriately. 
- void define_stress_xx(Condition at_last_z, - GridValue lambda_val, GridValue mu_val, - GridValue d_x_val, GridValue d_y_val, GridValue d_z_val) { + GridValue get_next_stress_xx(GridIndex x, GridIndex y, GridIndex z) { GridValue next_stress_xx = stress_xx(t, x, y, z) + - ((delta_t / h) * ((2 * mu_val * d_x_val) + - (lambda_val * (d_x_val + d_y_val + d_z_val)))); + ((delta_t / h) * ((2 * ave8(mu, x, y, z) * d_x_val(x, y, z)) + + (ave8(lambda, x, y, z) * + (d_x_val(x, y, z) + d_y_val(x, y, z) + d_z_val(x, y, z))))); adjust_for_sponge(next_stress_xx); - // define the value at t+1. - stress_xx(t+1, x, y, z) EQUALS next_stress_xx; + // Return the value at t+1. + return next_stress_xx; } - void define_stress_yy(Condition at_last_z, - GridValue lambda_val, GridValue mu_val, - GridValue d_x_val, GridValue d_y_val, GridValue d_z_val) { + GridValue get_next_stress_yy(GridIndex x, GridIndex y, GridIndex z) { GridValue next_stress_yy = stress_yy(t, x, y, z) + - ((delta_t / h) * ((2 * mu_val * d_y_val) + - (lambda_val * (d_x_val + d_y_val + d_z_val)))); + ((delta_t / h) * ((2 * ave8(mu, x, y, z) * d_y_val(x, y, z)) + + (ave8(lambda, x, y, z) * + (d_x_val(x, y, z) + d_y_val(x, y, z) + d_z_val(x, y, z))))); adjust_for_sponge(next_stress_yy); - // define the value at t+1. - stress_yy(t+1, x, y, z) EQUALS next_stress_yy; + // Return the value at t+1. + return next_stress_yy; } - void define_stress_xy(Condition at_last_z) { + GridValue get_next_stress_zz(GridIndex x, GridIndex y, GridIndex z) { + + GridValue next_stress_zz = stress_zz(t, x, y, z) + + ((delta_t / h) * ((2 * ave8(mu, x, y, z) * d_z_val(x, y, z)) + + (ave8(lambda, x, y, z) * + (d_x_val(x, y, z) + d_y_val(x, y, z) + d_z_val(x, y, z))))); + adjust_for_sponge(next_stress_zz); - GridValue mu_val = 2.0 / + // return the value at t+1. + return next_stress_zz; + } + GridValue get_next_stress_xy(GridIndex x, GridIndex y, GridIndex z) { + + // Compute average of 2 neighbors. 
+ GridValue mu2 = 2.0 / (mu(x, y, z ) + mu(x, y, z-1)); // Note that we are using the velocity values at t+1. @@ -242,15 +331,16 @@ class AwpElasticStencil : public StencilBase { c2 * (vel_y(t+1, x+1, y, z ) - vel_y(t+1, x-2, y, z )); GridValue next_stress_xy = stress_xy(t, x, y, z) + - ((mu_val * delta_t / h) * (d_xy_val + d_yx_val)); + ((mu2 * delta_t / h) * (d_xy_val + d_yx_val)); adjust_for_sponge(next_stress_xy); - // define the value at t+1. - stress_xy(t+1, x, y, z) EQUALS next_stress_xy; + // return the value at t+1. + return next_stress_xy; } - void define_stress_xz(Condition at_last_z) { + GridValue get_next_stress_xz(GridIndex x, GridIndex y, GridIndex z) { - GridValue mu_val = 2.0 / + // Compute average of 2 neighbors. + GridValue mu2 = 2.0 / (mu(x, y, z ) + mu(x, y-1, z )); // Note that we are using the velocity values at t+1. @@ -262,22 +352,16 @@ class AwpElasticStencil : public StencilBase { c2 * (vel_z(t+1, x+1, y, z ) - vel_z(t+1, x-2, y, z )); GridValue next_stress_xz = stress_xz(t, x, y, z) + - ((mu_val * delta_t / h) * (d_xz_val + d_zx_val)); + ((mu2 * delta_t / h) * (d_xz_val + d_zx_val)); adjust_for_sponge(next_stress_xz); - // define the value at t+1 (special case: zero at surface). -#ifdef DO_SURFACE - stress_xz(t+1, x, y, z) EQUALS next_stress_xz - IF !at_last_z; - stress_xz(t+1, x, y, z) EQUALS 0.0 - IF at_last_z; -#else - stress_xz(t+1, x, y, z) EQUALS next_stress_xz; -#endif + // return the value at t+1. + return next_stress_xz; } - void define_stress_yz(Condition at_last_z) { + GridValue get_next_stress_yz(GridIndex x, GridIndex y, GridIndex z) { - GridValue mu_val = 2.0 / + // Compute average of 2 neighbors. + GridValue mu2 = 2.0 / (mu(x, y, z ) + mu(x+1, y, z )); // Note that we are using the velocity values at t+1. 
@@ -289,122 +373,79 @@ class AwpElasticStencil : public StencilBase { c2 * (vel_z(t+1, x, y+2, z ) - vel_z(t+1, x, y-1, z )); GridValue next_stress_yz = stress_yz(t, x, y, z) + - ((mu_val * delta_t / h) * (d_yz_val + d_zy_val)); + ((mu2 * delta_t / h) * (d_yz_val + d_zy_val)); adjust_for_sponge(next_stress_yz); - // define the value at t+1 (special case: zero at surface). -#ifdef DO_SURFACE - stress_yz(t+1, x, y, z) EQUALS next_stress_yz - IF !at_last_z; - stress_yz(t+1, x, y, z) EQUALS 0.0 - IF at_last_z; -#else - stress_yz(t+1, x, y, z) EQUALS next_stress_yz; -#endif - } - void define_stress_zz(Condition at_last_z, - GridValue lambda_val, GridValue mu_val, - GridValue d_x_val, GridValue d_y_val, GridValue d_z_val) { - - GridValue next_stress_zz = stress_zz(t, x, y, z) + - ((delta_t / h) * ((2 * mu_val * d_z_val) + - (lambda_val * (d_x_val + d_y_val + d_z_val)))); - adjust_for_sponge(next_stress_zz); - - // define the value at t+1 (special case: zero at surface). -#ifdef DO_SURFACE - stress_zz(t+1, x, y, z) EQUALS next_stress_zz - IF !at_last_z; - stress_zz(t+1, x, y, z) EQUALS 0.0 - IF at_last_z; -#else - stress_zz(t+1, x, y, z) EQUALS next_stress_zz; -#endif + // return the value at t+1. + return next_stress_yz; } // Free-surface boundary equations for stress. - void define_free_surface_stress(Condition at_last_z) { + void define_free_surface_stress() { - // Define equivalencies to be valid only when z == last value in domain. - // Note that values beyond the last index are updated, i.e., in the halo. 
+ // When z == surface + 1, the surface will be at z - 1; + GridIndex surf = z - 1; - stress_zz(t+1, x, y, z+1) EQUALS -stress_zz(t+1, x, y, z) - IF at_last_z; - stress_zz(t+1, x, y, z+2) EQUALS -stress_zz(t+1, x, y, z-1) - IF at_last_z; + stress_zz(t+1, x, y, z) EQUALS -get_next_stress_zz(x, y, surf) IF_ONE_ABOVE_SURFACE; + stress_xz(t+1, x, y, z) EQUALS -get_next_stress_xz(x, y, surf-1) IF_ONE_ABOVE_SURFACE; + stress_yz(t+1, x, y, z) EQUALS -get_next_stress_yz(x, y, surf-1) IF_ONE_ABOVE_SURFACE; - stress_xz(t+1, x, y, z+1) EQUALS -stress_xz(t+1, x, y, z-1) - IF at_last_z; - stress_xz(t+1, x, y, z+2) EQUALS -stress_xz(t+1, x, y, z-2) - IF at_last_z; - - stress_yz(t+1, x, y, z+1) EQUALS -stress_yz(t+1, x, y, z-1) - IF at_last_z; - stress_yz(t+1, x, y, z+2) EQUALS -stress_yz(t+1, x, y, z-2) - IF at_last_z; + // Define other 3 stress values for completeness, even + // though these aren't input to any stencils. + stress_xx(t+1, x, y, z) EQUALS 0.0 IF_ONE_ABOVE_SURFACE; + stress_yy(t+1, x, y, z) EQUALS 0.0 IF_ONE_ABOVE_SURFACE; + stress_xy(t+1, x, y, z) EQUALS 0.0 IF_ONE_ABOVE_SURFACE; + + // When z == surface + 2, the surface will be at z - 2; + surf = z - 2; + + stress_zz(t+1, x, y, z) EQUALS -get_next_stress_zz(x, y, surf-1) IF_TWO_ABOVE_SURFACE; + stress_xz(t+1, x, y, z) EQUALS -get_next_stress_xz(x, y, surf-2) IF_TWO_ABOVE_SURFACE; + stress_yz(t+1, x, y, z) EQUALS -get_next_stress_yz(x, y, surf-2) IF_TWO_ABOVE_SURFACE; + + // Define other 3 stress values for completeness, even + // though these aren't input to any stencils. + stress_xx(t+1, x, y, z) EQUALS 0.0 IF_TWO_ABOVE_SURFACE; + stress_yy(t+1, x, y, z) EQUALS 0.0 IF_TWO_ABOVE_SURFACE; + stress_xy(t+1, x, y, z) EQUALS 0.0 IF_TWO_ABOVE_SURFACE; } - // Call all the define_* functions. + // Define the t+1 values for all velocity and stress grids. virtual void define() { - // A condition that is true when index 'z' is at the free-surface boundary. 
- Condition at_last_z = (z == last_index(z)); - // Define velocity components. - define_vel_x(at_last_z); - define_vel_y(at_last_z); - define_vel_z(at_last_z); + vel_x(t+1, x, y, z) EQUALS get_next_vel_x(x, y, z) IF_AT_OR_BELOW_SURFACE; + vel_y(t+1, x, y, z) EQUALS get_next_vel_y(x, y, z) IF_AT_OR_BELOW_SURFACE; + vel_z(t+1, x, y, z) EQUALS get_next_vel_z(x, y, z) IF_AT_OR_BELOW_SURFACE; + + // Define stress components. Use non-overlapping sub-domains only, + // i.e. AT and BELOW but not AT_OR_BELOW, even though there are some + // repeated stencils. This allows the YASK compiler to bundle all + // the stress equations together. + stress_xx(t+1, x, y, z) EQUALS get_next_stress_xx(x, y, z) IF_BELOW_SURFACE; + stress_yy(t+1, x, y, z) EQUALS get_next_stress_yy(x, y, z) IF_BELOW_SURFACE; + stress_xy(t+1, x, y, z) EQUALS get_next_stress_xy(x, y, z) IF_BELOW_SURFACE; + stress_xz(t+1, x, y, z) EQUALS get_next_stress_xz(x, y, z) IF_BELOW_SURFACE; + stress_yz(t+1, x, y, z) EQUALS get_next_stress_yz(x, y, z) IF_BELOW_SURFACE; + stress_zz(t+1, x, y, z) EQUALS get_next_stress_zz(x, y, z) IF_BELOW_SURFACE; + + stress_xx(t+1, x, y, z) EQUALS get_next_stress_xx(x, y, z) IF_AT_SURFACE; + stress_yy(t+1, x, y, z) EQUALS get_next_stress_yy(x, y, z) IF_AT_SURFACE; + stress_xy(t+1, x, y, z) EQUALS get_next_stress_xy(x, y, z) IF_AT_SURFACE; + stress_xz(t+1, x, y, z) EQUALS 0.0 IF_AT_SURFACE; + stress_yz(t+1, x, y, z) EQUALS 0.0 IF_AT_SURFACE; + stress_zz(t+1, x, y, z) EQUALS get_next_stress_zz(x, y, z) IF_AT_SURFACE; // Boundary conditions. -#ifdef DO_SURFACE - define_free_surface_vel(at_last_z); -#endif - - // Define some values common to the diagonal stress equations. -#ifdef PRECOMPUTED_LAMBDA - // Use this the lambda values are pre-computed once before - // all time-steps. 
- GridValue lambda_val = lambda(x, y, z); -#else - GridValue lambda_val = 8.0 / - (lambda(x, y, z ) + lambda(x+1, y, z ) + - lambda(x, y-1, z ) + lambda(x+1, y-1, z ) + - lambda(x, y, z-1) + lambda(x+1, y, z-1) + - lambda(x, y-1, z-1) + lambda(x+1, y-1, z-1)); -#endif - GridValue mu_val = 8.0 / - (mu(x, y, z ) + mu(x+1, y, z ) + - mu(x, y-1, z ) + mu(x+1, y-1, z ) + - mu(x, y, z-1) + mu(x+1, y, z-1) + - mu(x, y-1, z-1) + mu(x+1, y-1, z-1)); - - // Note that we are using the velocity values at t+1. - GridValue d_x_val = - c1 * (vel_x(t+1, x+1, y, z ) - vel_x(t+1, x, y, z )) + - c2 * (vel_x(t+1, x+2, y, z ) - vel_x(t+1, x-1, y, z )); - GridValue d_y_val = - c1 * (vel_y(t+1, x, y, z ) - vel_y(t+1, x, y-1, z )) + - c2 * (vel_y(t+1, x, y+1, z ) - vel_y(t+1, x, y-2, z )); - GridValue d_z_val = - c1 * (vel_z(t+1, x, y, z ) - vel_z(t+1, x, y, z-1)) + - c2 * (vel_z(t+1, x, y, z+1) - vel_z(t+1, x, y, z-2)); - - // Define stress components. - define_stress_xx(at_last_z, - lambda_val, mu_val, d_x_val, d_y_val, d_z_val); - define_stress_yy(at_last_z, - lambda_val, mu_val, d_x_val, d_y_val, d_z_val); - define_stress_zz(at_last_z, - lambda_val, mu_val, d_x_val, d_y_val, d_z_val); - define_stress_xy(at_last_z); - define_stress_xz(at_last_z); - define_stress_yz(at_last_z); - - // Boundary conditions. -#ifdef DO_SURFACE - define_free_surface_stress(at_last_z); +#ifdef DO_ABOVE_SURFACE + define_free_surface_vel(); + define_free_surface_stress(); #endif } }; REGISTER_STENCIL(AwpElasticStencil); + +#undef DO_SURFACE +#undef FULL_SPONGE_GRID +#undef USE_SCRATCH_GRIDS From 2f1ec50ea39ab7a0bfd643821af2ae972826c571 Mon Sep 17 00:00:00 2001 From: Chuck Yount Date: Fri, 20 Apr 2018 08:56:18 -0700 Subject: [PATCH 04/21] Remove some deprecated compiler APIs. 
--- include/yask_compiler_api.hpp | 148 ++++++-------------------- include/yask_kernel_api.hpp | 6 +- include/yk_solution_api.hpp | 103 ------------------ src/compiler/lib/Expr.hpp | 11 +- src/compiler/lib/Soln.hpp | 8 +- src/compiler/swig/yask_compiler_api.i | 5 +- src/kernel/swig/yask_kernel_api.i | 3 +- src/stencils/AwpElasticStencil.hpp | 38 +++---- 8 files changed, 65 insertions(+), 257 deletions(-) diff --git a/include/yask_compiler_api.hpp b/include/yask_compiler_api.hpp index a6c3bd81..f865e167 100644 --- a/include/yask_compiler_api.hpp +++ b/include/yask_compiler_api.hpp @@ -44,13 +44,9 @@ namespace yask { typedef std::shared_ptr yc_solution_ptr; class yc_grid; - /// Shared pointer to \ref yc_grid + /// Pointer to \ref yc_grid typedef yc_grid* yc_grid_ptr; - class yc_equation_group; - /// Shared pointer to \ref yc_equation_group; - typedef std::shared_ptr yc_equation_group_ptr; - // Forward declarations of expression nodes and their pointers. class yc_expr_node; @@ -105,7 +101,7 @@ namespace yask { /// Shared pointer to \ref yc_bool_node typedef std::shared_ptr yc_bool_node_ptr; - /// Factory to create objects needed to define a stencil solution. + /// Bootstrap factory to create objects needed to define a stencil solution. class yc_factory { public: virtual ~yc_factory() {} @@ -157,6 +153,16 @@ namespace yask { set_name(std::string name /**< [in] Name; must be a valid C++ identifier. */ ) =0; + /// Get current floating-point precision setting. + /** @returns Number of bytes in a FP number. */ + virtual int + get_element_bytes() const =0; + + /// Set floating-point precision. + virtual void + set_element_bytes(int nbytes /**< [in] Number of bytes in a FP number. + Should be 4 or 8. */ ) =0; + /// Create an n-dimensional grid variable in the solution. /** "Grid" is a generic term for any n-dimensional variable. A 0-dim @@ -237,64 +243,21 @@ namespace yask { Each dimension is identified by an associated index. 
*/ ) =0; #endif - /// Get all the grids in the solution. - /** @returns Vector containing pointer to all grids. */ - virtual std::vector - get_grids() =0; - /// Get the number of grids in the solution. /** @returns Number of grids that have been created via new_grid(). */ virtual int get_num_grids() const =0; + /// Get all the grids in the solution. + /** @returns Vector containing pointer to all grids. */ + virtual std::vector + get_grids() =0; + /// Get the specified grid. /** @returns Pointer to the specified grid or null pointer if it does not exist. */ virtual yc_grid_ptr get_grid(const std::string& name /**< [in] Name of the grid. */ ) =0; - /// Get the number of equations in the solution. - /** Equations are added when yc_node_factory::new_equation_node() is called. - @returns Number of equations that have been created. */ - virtual int - get_num_equations() const =0; - - /// Get the specified equation. - /** @returns Pointer to \ref yc_equation_node of nth equation. */ - virtual yc_equation_node_ptr - get_equation(int n /**< [in] Index of equation between zero (0) - and get_num_equations()-1. */ ) =0; - - /// Create a new equation group. - /** - In normal usage, equation groups are created automatically when - format() is called. Under automatic grouping, the YASK compiler - discovers dependencies between equations and places equations - together in a group if they do not depend upon one another. - Then, the YASK compiler schedules the resulting groups for - execution in the kernel based on the dependencies between groups. - - A \ref yc_equation_group object allows manual grouping of equations. - Under manual grouping, the YASK compiler does _not_ check - for illegal dependencies within the group. - In addition, if `do_schedule` is `false`, the YASK compiler - will not check for dependencies with other groups and - will not schedule the group for execution in the kernel. 
- Then, it will be the programmer's responsibility to run the - stencil group via yk_solution::run_stencil_group(). - - This capability is useful for processing equations that - the YASK compiler cannot currently handle, like equations - with dependencies between different points of a grid - at the same step index. - - @returns Pointer to the new \ref yc_equation_group object. - */ - virtual yc_equation_group_ptr - new_equation_group(const std::string& name - /**< [in] Name of the group. */, - bool do_schedule = true - /**< [in] Schedule the group for execution in the kernel. */ ) =0; - /// Set the vectorization length in given dimension. /** For YASK-code generation, the product of the fold lengths should be equal to the number of elements in a HW SIMD register. @@ -321,16 +284,6 @@ namespace yask { virtual void clear_folding() =0; - /// Get current floating-point precision setting. - /** @returns Number of bytes in a FP number. */ - virtual int - get_element_bytes() const =0; - - /// Set floating-point precision. - virtual void - set_element_bytes(int nbytes /**< [in] Number of bytes in a FP number. - Should be 4 or 8. */ ) =0; - /// Set the cluster multiplier (unroll factor) in given dimension. /** For YASK-code generation, this will have the effect of creating N vectors of output for each equation, where N is the product of @@ -349,6 +302,18 @@ namespace yask { virtual void clear_clustering() =0; + /// Get the number of equations in the solution. + /** Equations are added when yc_node_factory::new_equation_node() is called. + @returns Number of equations that have been created. */ + virtual int + get_num_equations() const =0; + + /// Get a list of all the defined equations. + /** @returns Vector of containing pointers to all + equations that have been created. */ + virtual std::vector + get_equations() =0; + /// Format the current equation(s) and write to given output object. 
/** Currently supported format types: Type | Output @@ -396,12 +361,6 @@ namespace yask { /** @returns Number of dimensions created via new_grid(). */ virtual int get_num_dims() const =0; - /// Get the name of the specified dimension. - /** @returns String containing name of dimension created via new_grid(). */ - virtual const std::string& - get_dim_name(int n /**< [in] Index of dimension between zero (0) - and get_num_dims()-1. */ ) const =0; - /// Get all the dimensions in this grid. /** Includes step dimension if it is a dimension of this grid. @@ -584,7 +543,7 @@ namespace yask { virtual yc_number_node_ptr get_rhs() =0; }; - /// Base class for all real or integer AST nodes. + /// Base class for all numerical AST nodes. /** An object of this abstract type cannot be created. */ class yc_number_node : public virtual yc_expr_node { }; @@ -671,11 +630,10 @@ namespace yask { virtual int get_num_operands() =0; - /// Get the specified operand. - /** @returns Pointer to node at given position or null pointer if out of bounds. */ - virtual yc_number_node_ptr - get_operand(int i /**< [in] Index between zero (0) - and get_num_operands()-1. */ ) =0; + /// Get a list of the operands. + /** @returns Vector of pointers to all operand nodes. */ + virtual std::vector + get_operands() =0; /// Add an operand. virtual void @@ -722,44 +680,6 @@ namespace yask { get_rhs() =0; }; - /// A manual grouping of stencil equations. - /** - Created via yc_solution::new_equation_group(). - See yc_solution::new_equation_group() for a description of - automatic versus manual grouping. - - After a \ref yc_equation_group is processed by the YASK - compiler and the resulting kernel is compiled, - it will be visible as a \ref yk_stencil_group - in the corresponding YASK kernel. - */ - class yc_equation_group { - public: - - /// Get the name of this group. - /** - @returns Name created via yc_solution::new_equation_group(). 
- */ - virtual const std::string& - get_name() const =0; - - /// Determine whether this group will be automatically scheduled. - /** - @returns `true` if this group will be run via yk_solution::run_solution() - or `false` if this group must be run via yk_solution::run_stencil_group(). - This is the `do_schedule` setting passed via yc_solution::new_equation_group(). - */ - virtual bool - get_do_schedule() const =0; - - /// Add an equation to this group. - virtual void - add_equation(yc_equation_node_ptr equation - /**< [in] Pointer to equation to be added. */ ) =0; - - public: - }; - } // namespace yask. #endif diff --git a/include/yask_kernel_api.hpp b/include/yask_kernel_api.hpp index e579cac1..8d407962 100644 --- a/include/yask_kernel_api.hpp +++ b/include/yask_kernel_api.hpp @@ -60,10 +60,6 @@ namespace yask { /// Shared pointer to \ref yk_grid. typedef std::shared_ptr yk_grid_ptr; - class yk_stencil_group; - /// Shared pointer to \ref yk_stencil_group. - typedef std::shared_ptr yk_stencil_group; - class yk_stats; /// Shared pointer to \ref yk_stats. typedef std::shared_ptr yk_stats_ptr; @@ -75,7 +71,7 @@ namespace yask { namespace yask { - /// Factory to create a stencil solution. + /// Bootstrap factory to create a stencil solution. class yk_factory { public: virtual ~yk_factory() {} diff --git a/include/yk_solution_api.hpp b/include/yk_solution_api.hpp index d23f335c..d7c554a8 100644 --- a/include/yk_solution_api.hpp +++ b/include/yk_solution_api.hpp @@ -668,80 +668,6 @@ namespace yask { apply_command_line_options(const std::string& args /**< [in] String of arguments to parse. */ ) =0; - /// **[Advanced]** Get the specified stencil group. - /** - @returns Pointer to the specified \ref yk_stencil_group - or null pointer if it does not exist. - */ - virtual yk_stencil_group_ptr - get_stencil_group(const std::string& name - /**< [in] Name of the group. */ ) =0; - - /// **[Advanced]** Get all the stencil groups. 
- /** - @returns List of all stencil groups in the solution. - */ - virtual std::vector - get_stencil_groups() =0; - - /// **[Advanced]** Run the specified stencil group over the given sub-domain. - /** - Applies all the stencil kernels in the given group - from `first_domain_indices` at `first_step_index` - to `last_domain_indices` at `last_domain_index` (inclusive) in each dimension. - Each list of domain indices should contain the indices for the - dimensions returned by get_domain_dim_names() in the same order. - - Indices are relative to the *overall* problem domain and - need not be limited to fall within the domain of the current MPI rank. - The actual points to which the group is applied on each rank will be - limited internally as needed. - - Example C++ usage: - - \code{.cpp} - // Find my custom stencil group created in the YASK compiler. - auto my_group = soln->get_stencil_group("my_group"); - ... - soln->prepare_solution(); - ... - // Set first_indices and last_indices to apply my_group - // to only the first slice in the "z" dimension. - std::vector first_indices, last_indices; - for (auto dim : soln->get_domain_dim_names()) { - auto overall_size = soln->get_overall_domain_size(dim); - first_indices.push_back(0); - if (dim == "z") - last_indices.push_back(0); - else - last_indices.push_back(overall_size - 1); - } - ... - // Execute the time-steps. - for (idx_t t = 0; t < num_steps; t++) { - - // Apply the automatically-scheduled stencils. - soln->run_solution(t); - - // Apply my custom stencil group. - soln->run_stencil_group(my_group, - t, first_indices, - t, last_indices); - } - soln->end_solution(); - \endcode - - @returns Number of points to which the group was applied. - */ - virtual idx_t - run_stencil_group(yk_stencil_group_ptr stencil_group - /**< [in] Pointer to the stencil group obtained from - get_stencil_groups() or get_stencil_group(). */, - const std::vector& first_domain_indices - /**< [in] List of initial domain indices. 
*/, - const std::vector& last_domain_indices - /**< [in] List of final domain indices. */ ) =0; - /// **[Advanced]** Use data-storage from existing grids in specified solution. /** Calls yk_grid::share_storage() for each pair of grids that have the same name @@ -804,35 +730,6 @@ namespace yask { get_elapsed_run_secs() =0; }; - /// A group of stencil kernels. - /** - Groups of stencils are created automatically by the YASK stencil compiler - or manually via yc_solution::new_equation_group(). See the latter for - more information. - */ - class yk_stencil_group { - public: - virtual ~yk_stencil_group() {} - - /// Get the name of this group. - /** - @returns Default name given by the YASK stencil compiler - or the name provided via yc_solution::new_equation_group(). - */ - virtual const std::string& - get_name() const =0; - - /// Determine whether this group will be automatically scheduled. - /** - @returns `true` if this group will be run via yk_solution::run_solution() - or `false` if this group must be run via yk_solution::run_stencil_group(). - This is the `do_schedule` setting passed via yc_solution::new_equation_group(). - */ - virtual bool - is_scheduled() const =0; - - }; - } // namespace yask. 
#endif diff --git a/src/compiler/lib/Expr.hpp b/src/compiler/lib/Expr.hpp index de6d9810..b83db317 100644 --- a/src/compiler/lib/Expr.hpp +++ b/src/compiler/lib/Expr.hpp @@ -662,12 +662,11 @@ namespace yask { virtual int get_num_operands() { return _ops.size(); } - virtual yc_number_node_ptr get_operand(int i) { - if (i >= 0 && - size_t(i) < _ops.size()) - return _ops.at(size_t(i)); - else - return nullptr; + virtual std::vector get_operands() { + std::vector nv; + for (int i = 0; i < get_num_operands(); i++) + nv.push_back(_ops.at(i)); + return nv; } virtual void add_operand(yc_number_node_ptr node) { auto p = dynamic_pointer_cast(node); diff --git a/src/compiler/lib/Soln.hpp b/src/compiler/lib/Soln.hpp index 20142e69..b91a50da 100644 --- a/src/compiler/lib/Soln.hpp +++ b/src/compiler/lib/Soln.hpp @@ -164,9 +164,11 @@ namespace yask { virtual int get_num_equations() const { return _eqs.getNumEqs(); } - virtual yc_equation_node_ptr get_equation(int n) { - assert(n >= 0 && n < get_num_equations()); - return _eqs.getEqs().at(n); + virtual std::vector get_equations() { + std::vector ev; + for (int i = 0; i < get_num_equations(); i++) + ev.push_back(_eqs.getEqs().at(i)); + return ev; } virtual void set_fold(const std::string& dim, int len) { auto& fold = _settings._foldOptions; diff --git a/src/compiler/swig/yask_compiler_api.i b/src/compiler/swig/yask_compiler_api.i index e9c3c5e1..9b36ae0e 100644 --- a/src/compiler/swig/yask_compiler_api.i +++ b/src/compiler/swig/yask_compiler_api.i @@ -40,7 +40,6 @@ IN THE SOFTWARE. // Must declare shared_ptrs for the entire expr_node hierarchy! %shared_ptr(yask::yc_solution) //%shared_ptr(yask::yc_grid) -%shared_ptr(yask::yc_equation_group) %shared_ptr(yask::yc_expr_node) %shared_ptr(yask::yc_index_node) %shared_ptr(yask::yc_equation_node) @@ -63,7 +62,9 @@ IN THE SOFTWARE. // All vector types used in API. 
%template(vector_int) std::vector; %template(vector_str) std::vector; -%template(vector_index_ptr) std::vector>; +%template(vector_index) std::vector>; +%template(vector_num) std::vector>; +%template(vector_eq) std::vector>; %template(vector_grid) std::vector; %exception { diff --git a/src/kernel/swig/yask_kernel_api.i b/src/kernel/swig/yask_kernel_api.i index 2da88f25..421f79ad 100644 --- a/src/kernel/swig/yask_kernel_api.i +++ b/src/kernel/swig/yask_kernel_api.i @@ -43,7 +43,6 @@ IN THE SOFTWARE. %shared_ptr(yask::yk_settings) %shared_ptr(yask::yk_solution) %shared_ptr(yask::yk_grid) -%shared_ptr(yask::yk_stencil_group) %shared_ptr(yask::yk_stats) // Mutable buffer to access raw data. @@ -70,3 +69,5 @@ IN THE SOFTWARE. %include "yask_common_api.hpp" %include "yask_kernel_api.hpp" +%include "yk_solution_api.hpp" +%include "yk_grid_api.hpp" diff --git a/src/stencils/AwpElasticStencil.hpp b/src/stencils/AwpElasticStencil.hpp index 8ecb9b6e..5dabd289 100644 --- a/src/stencils/AwpElasticStencil.hpp +++ b/src/stencils/AwpElasticStencil.hpp @@ -188,6 +188,7 @@ class AwpElasticStencil : public StencilBase { (2.0 / mu(x, y, z) + 1.0 / lambda(x, y, z)))); // Define equivalencies to be valid only when z == last value in domain. + // This writes into the halo region. vel_x(t+1, x, y, z+1) EQUALS plus1_vel_x IF at_last_z; vel_y(t+1, x, y, z+1) EQUALS plus1_vel_y @@ -265,15 +266,12 @@ class AwpElasticStencil : public StencilBase { ((mu_val * delta_t / h) * (d_xz_val + d_zx_val)); adjust_for_sponge(next_stress_xz); - // define the value at t+1 (special case: zero at surface). -#ifdef DO_SURFACE + // define the value at t+1. 
stress_xz(t+1, x, y, z) EQUALS next_stress_xz - IF !at_last_z; - stress_xz(t+1, x, y, z) EQUALS 0.0 - IF at_last_z; -#else - stress_xz(t+1, x, y, z) EQUALS next_stress_xz; +#ifdef DO_SURFACE + IF !at_last_z #endif + ; } void define_stress_yz(Condition at_last_z) { @@ -292,15 +290,12 @@ class AwpElasticStencil : public StencilBase { ((mu_val * delta_t / h) * (d_yz_val + d_zy_val)); adjust_for_sponge(next_stress_yz); - // define the value at t+1 (special case: zero at surface). -#ifdef DO_SURFACE + // define the value at t+1. stress_yz(t+1, x, y, z) EQUALS next_stress_yz - IF !at_last_z; - stress_yz(t+1, x, y, z) EQUALS 0.0 - IF at_last_z; -#else - stress_yz(t+1, x, y, z) EQUALS next_stress_yz; +#ifdef DO_SURFACE + IF !at_last_z #endif + ; } void define_stress_zz(Condition at_last_z, GridValue lambda_val, GridValue mu_val, @@ -311,33 +306,30 @@ class AwpElasticStencil : public StencilBase { (lambda_val * (d_x_val + d_y_val + d_z_val)))); adjust_for_sponge(next_stress_zz); - // define the value at t+1 (special case: zero at surface). -#ifdef DO_SURFACE - stress_zz(t+1, x, y, z) EQUALS next_stress_zz - IF !at_last_z; - stress_zz(t+1, x, y, z) EQUALS 0.0 - IF at_last_z; -#else + // define the value at t+1 (no special case at surface). stress_zz(t+1, x, y, z) EQUALS next_stress_zz; -#endif } // Free-surface boundary equations for stress. void define_free_surface_stress(Condition at_last_z) { // Define equivalencies to be valid only when z == last value in domain. - // Note that values beyond the last index are updated, i.e., in the halo. + // This writes into the halo region. 
stress_zz(t+1, x, y, z+1) EQUALS -stress_zz(t+1, x, y, z) IF at_last_z; stress_zz(t+1, x, y, z+2) EQUALS -stress_zz(t+1, x, y, z-1) IF at_last_z; + stress_xz(t+1, x, y, z) EQUALS 0.0 + IF at_last_z; stress_xz(t+1, x, y, z+1) EQUALS -stress_xz(t+1, x, y, z-1) IF at_last_z; stress_xz(t+1, x, y, z+2) EQUALS -stress_xz(t+1, x, y, z-2) IF at_last_z; + stress_yz(t+1, x, y, z) EQUALS 0.0 + IF at_last_z; stress_yz(t+1, x, y, z+1) EQUALS -stress_yz(t+1, x, y, z-1) IF at_last_z; stress_yz(t+1, x, y, z+2) EQUALS -stress_yz(t+1, x, y, z-2) From 4224e47954efd4b73296d83b1946ae9eb5057713 Mon Sep 17 00:00:00 2001 From: Chuck Yount Date: Mon, 23 Apr 2018 17:42:01 -0700 Subject: [PATCH 05/21] Add API to turn off dependency checker. Improved docs by adding separate modules. Add operator overloading for numerical ops. Still need to do bool ones. Partial work on issues #93 and #96. --- bin/gen_nodes.pl | 159 ++++++++++--- bin/yask_compiler_api_test.py | 35 +-- docs/api/mainpage.txt | 79 ++++--- include/yask_common_api.hpp | 8 + include/yask_compiler_api.hpp | 38 +++- include/yask_kernel_api.hpp | 13 ++ include/{yc_nodes.hpp => yc_node_api.hpp} | 209 +++++++++++------- include/yk_grid_api.hpp | 8 +- include/yk_solution_api.hpp | 6 + src/compiler/lib/Eqs.cpp | 29 ++- src/compiler/lib/Eqs.hpp | 5 +- src/compiler/lib/Expr.cpp | 59 ++++- src/compiler/lib/Expr.hpp | 8 +- src/compiler/lib/Grid.hpp | 1 + src/compiler/lib/Soln.cpp | 2 +- src/compiler/lib/Soln.hpp | 2 + src/compiler/main.cpp | 17 +- src/compiler/swig/yask_compiler_api.i | 63 +++++- src/compiler/tests/yask_compiler_api_test.cpp | 13 +- 19 files changed, 564 insertions(+), 190 deletions(-) rename include/{yc_nodes.hpp => yc_node_api.hpp} (81%) diff --git a/bin/gen_nodes.pl b/bin/gen_nodes.pl index fc6c3abd..897fa461 100755 --- a/bin/gen_nodes.pl +++ b/bin/gen_nodes.pl @@ -1,6 +1,10 @@ #! /usr/bin/env perl #-*-Perl-*- This line forces emacs to use Perl mode. +# This utility is used to generate some API code for the compiler. 
+# The code generated may require additional editing, so it is only +# used for one-time generation. + use strict; use File::Basename; use File::Path; @@ -14,12 +18,30 @@ $| = 1; # autoflush. -my @nbops = qw(equals not_equals less_than greater_than not_less_than not_greater_than); -my @bbops = qw(and or); -my @ubops = qw(not); +# num to num. +my %nops = ("add" => "+", + "subtract" => "-", + "multiply" => "*", + "divide" => "/"); +my %bnops = ("negate" => "-"); + +# num to bool. +my %nbops = ("equals" => "==", + "not_equals" => "!=", + "less_than" => "<", + "greater_than" => ">", + "not_less_than" => ">=", + "not_greater_than" => "<="); + +# bool to bool. +my %bbops = ("and" => "&&", + "or" => "||"); +my %ubops = ("not" => "!"); # decls. -for my $node (@nbops, @bbops, @ubops) { +for my $node (sort keys %nbops, + sort keys %bbops, + sort keys %ubops) { my $n2 = "yc_${node}_node"; print " class $n2;\n". @@ -28,14 +50,17 @@ } # swig decls. -for my $node (@nbops, @bbops, @ubops) { +for my $node (sort keys %nbops, + sort keys %bbops, + sort keys %ubops) { my $n2 = "yc_${node}_node"; print "\%shared_ptr(yask::$n2)\n"; } # binary ops. -for my $node (@bbops) { - my $n2 = "yc_${node}_node"; +for my $node (sort keys %bbops) { + my $n2 = "yc_${node}_node"; + my $oper = $bbops{$node}; print <<"END"; /// Create a boolean $node node. @@ -43,14 +68,15 @@ \@returns Pointer to new \\ref $n2 object. */ virtual ${n2}_ptr - new_${node}_node(yc_bool_node_ptr lhs /**< [in] Expression before '?' sign. */, - yc_bool_node_ptr rhs /**< [in] Expression after '?' sign. */ ); + new_${node}_node(yc_bool_node_ptr lhs /**< [in] Expression before '$oper' sign. */, + yc_bool_node_ptr rhs /**< [in] Expression after '$oper' sign. */ ); END } # comparison ops. -for my $node (@nbops) { - my $n2 = "yc_${node}_node"; +for my $node (sort keys %nbops) { + my $n2 = "yc_${node}_node"; + my $oper = $nbops{$node}; print <<"END"; @@ -59,31 +85,32 @@ END \@returns Pointer to new \\ref $n2 object. 
*/ virtual ${n2}_ptr - new_${node}_node(yc_number_node_ptr lhs /**< [in] Expression before '?' sign. */, - yc_number_node_ptr rhs /**< [in] Expression after '?' sign. */ ); + new_${node}_node(yc_number_node_ptr lhs /**< [in] Expression before '$oper' sign. */, + yc_number_node_ptr rhs /**< [in] Expression after '$oper' sign. */ ); END } # binary ops. -for my $node (@bbops) { - my $n2 = "yc_${node}_node"; +for my $node (sort keys %bbops) { + my $n2 = "yc_${node}_node"; + my $oper = $bbops{$node}; print <<"END"; /// A boolean '$node' operator. - /** Example: used to implement `a ?? b`. + /** Example: used to implement `a $oper b`. Created via yc_node_factory::new_${node}_node(). */ class $n2 : public virtual yc_bool_node { public: /// Get the left-hand-side operand. - /** \@returns Expression node on left-hand-side of '?' sign. */ + /** \@returns Expression node on left-hand-side of '$oper' sign. */ virtual yc_bool_node_ptr get_lhs() =0; /// Get the right-hand-size operand. - /** \@returns Expression node on right-hand-side of '?' sign. */ + /** \@returns Expression node on right-hand-side of '$oper' sign. */ virtual yc_bool_node_ptr get_rhs() =0; }; @@ -91,25 +118,26 @@ END } # comparison ops. -for my $node (@nbops) { - my $n2 = "yc_${node}_node"; +for my $node (sort keys %nbops) { + my $n2 = "yc_${node}_node"; + my $oper = $nbops{$node}; print <<"END"; /// A numerical-comparison '$node' operator. - /** Example: used to implement `a ?? b`. + /** Example: used to implement `a $oper b`. Created via yc_node_factory::new_${node}_node(). */ class $n2 : public virtual yc_bool_node { public: /// Get the left-hand-side operand. - /** \@returns Expression node on left-hand-side of '?' sign. */ + /** \@returns Expression node on left-hand-side of '$oper' sign. */ virtual yc_bool_node_ptr get_lhs() =0; /// Get the right-hand-size operand. - /** \@returns Expression node on right-hand-side of '?' sign. */ + /** \@returns Expression node on right-hand-side of '$oper' sign. 
*/ virtual yc_bool_node_ptr get_rhs() =0; }; @@ -117,11 +145,13 @@ END } # binary ops. -for my $node (@bbops) { - my $n2 = "yc_${node}_node"; - my $n3 = $node; - $n3 =~ s/(\w+)/\u\L$1/g; - $n3 .= 'Expr'; +for my $node (sort keys %bbops) { + my $n2 = "yc_${node}_node"; + my $oper = $bbops{$node}; + my $n3 = $node; + $n3 =~ s/([a-z]+)/\u\L$1/g; + $n3 =~ s/_//g; + $n3 .= 'Expr'; print <<"END"; ${n2}_ptr @@ -137,13 +167,14 @@ END END } -# comp. ops. -for my $node (@nbops) { - my $n2 = "yc_${node}_node"; - my $n3 = $node; - $n3 =~ s/(\w+)/\u\L$1/g; - $n3 =~ s/_//g; - $n3 .= 'Expr'; +# comparison ops. +for my $node (sort keys %nbops) { + my $n2 = "yc_${node}_node"; + my $oper = $nbops{$node}; + my $n3 = $node; + $n3 =~ s/([a-z]+)/\u\L$1/g; + $n3 =~ s/_//g; + $n3 .= 'Expr'; print <<"END"; ${n2}_ptr @@ -157,3 +188,61 @@ END } END } + +# binary num ops. +for my $node (sort keys %nops) { + my $n2 = "yc_${node}_node"; + my $n2p = $n2.'_ptr'; + my $oper = $nops{$node}; + print "$n2p operator$oper(yc_number_node_ptr lhs, yc_number_node_ptr rhs);\n"; + print "$n2p operator$oper(double lhs, yc_number_node_ptr rhs);\n"; + print "$n2p operator$oper(yc_number_node_ptr lhs, double);\n"; +} + +# binary num ops. +for my $node (sort keys %nops) { + my $n2 = "yc_${node}_node"; + my $n2p = $n2.'_ptr'; + my $oper = $nops{$node}; + my $n3 = $node; + $n3 =~ s/([a-z]+)/\u\L$1/g; + $n3 =~ s/_//g; + $n3 .= 'Expr'; + + print <<"END"; + $n2p operator$oper(yc_number_node_ptr lhs, yc_number_node_ptr rhs) { + auto lp = dynamic_pointer_cast(lhs); + assert(lp); + auto rp = dynamic_pointer_cast(rhs); + assert(rp); + return make_shared<$n3>(lp, rp); + } + $n2p operator$oper(double lhs, yc_number_node_ptr rhs) { + return operator$oper(constNum(lhs), rhs); + } + $n2p operator$oper(yc_number_node_ptr lhs, double rhs) { + return operator$oper(lhs, constNum(rhs)); + } +END +} + +# binary num ops. 
+for my $node (sort keys %nops) { + my $n2 = "yc_${node}_node"; + my $n2p = $n2.'_ptr'; + my $oper = $nops{$node}; + my $n3 = $node; + $n3 =~ s/([a-z]+)/\u\L$1/g; + $n3 =~ s/_//g; + $n3 .= 'Expr'; + + print <<"END"; +%extend yask::yc_number_node { + yask::yc_number_node_ptr __${node}__(yask::yc_number_node* rhs) { + auto lp = \$self->clone_ast(); + auto rp = rhs->clone_ast(); + return yask::operator$oper(lp, rp); + } + }; +END +} diff --git a/bin/yask_compiler_api_test.py b/bin/yask_compiler_api_test.py index 1e9979b0..1932ce21 100755 --- a/bin/yask_compiler_api_test.py +++ b/bin/yask_compiler_api_test.py @@ -54,28 +54,31 @@ # Create an expression for the new value. # This will average some of the neighboring points around the # current stencil application point in the current timestep. - n0 = g1.new_relative_grid_point([0, 0, 0, 0]) # center-point at this timestep. - n1 = nfac.new_add_node(n0, g1.new_relative_grid_point([0, -1, 0, 0])) # left. - n1 = nfac.new_add_node(n1, g1.new_relative_grid_point([0, 1, 0, 0])) # right. - n1 = nfac.new_add_node(n1, g1.new_relative_grid_point([0, 0, -1, 0])) # above. - n1 = nfac.new_add_node(n1, g1.new_relative_grid_point([0, 0, 1, 0])) # below. - n1 = nfac.new_add_node(n1, g1.new_relative_grid_point([0, 0, 0, -1])) # in front. - n1 = nfac.new_add_node(n1, g1.new_relative_grid_point([0, 0, 0, 1])) # behind. - n2 = nfac.new_divide_node(n1, nfac.new_const_number_node(7)) # div by 7. - - # Define value in scratch grid. - sn0 = sg1.new_relative_grid_point([0, 0, 0]) # center-point. + n1 = (g1.new_relative_grid_point([0, 0, 0, 0]) + # center-point at this timestep. + g1.new_relative_grid_point([0, -1, 0, 0]) + # left. + g1.new_relative_grid_point([0, 1, 0, 0]) + # right. + g1.new_relative_grid_point([0, 0, -1, 0]) + # above. + g1.new_relative_grid_point([0, 0, 1, 0]) + # below. + g1.new_relative_grid_point([0, 0, 0, -1]) + # in front. + g1.new_relative_grid_point([0, 0, 0, 1])) # behind. + n2 = n1 / 7 # ave of the 7 points. 
+ + # Define value in scratch grid to be the above equation, i.e., + # this is a temporary 3-D variable that holds the average + # values of each point. + sn0 = sg1.new_relative_grid_point([0, 0, 0]) # LHS of eq is just a point on scratch-grid sn1 = nfac.new_equation_node(sn0, n2) # equate to expr n2. print("Scratch-grid equation before formatting: " + sn1.format_simple()) - # Use values in scratch grid. - sn2 = sg1.new_relative_grid_point([1, 0, 0]) - sn3 = nfac.new_add_node(sn2, sg1.new_relative_grid_point([0, 1, 0])) - sn4 = nfac.new_add_node(sn3, sg1.new_relative_grid_point([0, 0, 1])) + # Use values in scratch grid to make a new eq. + sn2 = (sg1.new_relative_grid_point([1, 0, 0]) + + sg1.new_relative_grid_point([0, 1, 0]) + + sg1.new_relative_grid_point([0, 0, 1])) + sn5 = -sn2 * 2.5 - 9.1 # Create an equation to define the value at the next timestep. n3 = g1.new_relative_grid_point([1, 0, 0, 0]) # center-point at next timestep. - n4 = nfac.new_equation_node(n3, sn4) # equate to expr from scratch grid. + n4 = nfac.new_equation_node(n3, sn5) # equate to expr from scratch grid. print("Main-grid equation before formatting: " + n4.format_simple()) print("Solution '" + soln.get_name() + "' contains " + str(soln.get_num_grids()) + " grid(s), and " + diff --git a/docs/api/mainpage.txt b/docs/api/mainpage.txt index c4afffd1..d6d6ddf1 100644 --- a/docs/api/mainpage.txt +++ b/docs/api/mainpage.txt @@ -7,17 +7,22 @@ namespace yask { /** @mainpage +\tableofcontents + @section intro Introduction The typical high-level YASK workflow is as follows: --# Define a stencil solution. - Use the YASK stencil compiler to generate C++ kernel code from the stencil solution. --# Compile the generated kernel code to create a YASK kernel library. - Create and use a stencil-based application using the kernel library. - -There are two sets of APIs provided by YASK corresponding to these tasks: --# The YASK Stencil Compiler API (available in C++ and Python). 
--# The YASK Stencil Kernel API (available in C++ and Python). +-# \ref yc_intro + - Define a stencil solution using the YASK domain-specific language (DSL). + - Use the YASK stencil compiler to generate C++ kernel code from the stencil solution. +-# \ref yk_intro + - Compile the generated C++ kernel code to create a YASK kernel library. + - Create and use a stencil-based application using the kernel library. +-# Test and deploy your new YASK-enabled application. + +There are two sets of APIs provided by YASK corresponding to the first two tasks: +-# The \ref sec_yc (available in C++ and Python). +-# The \ref sec_yk (available in C++ and Python). For each of the tasks, you can either use the YASK-provided application or create your own application built with the corresponding API. @@ -48,15 +53,16 @@ A new stencil solution may be defined in one of the following ways: - See example stencils in `src/stencils`. These stencils are written as classes inherited from the `StencilBase` class. -# Use the YASK compiler API to create another application that defines stencils. - - This approach is typically taken when a front-end tool will be creating stencils - from a higher-level description, e.g., applying finite-difference methods to differential equations. + - This approach is typically taken when a 3rd-party front-end tool will be creating stencils + from another, possibly higher-level, description, + e.g., applying finite-difference methods to differential equations. - In this case, the equations are built up programmatically into an abstract syntax tree (AST). - The mechanism to generate the kernel code in this approach depends on the application created from the APIs. The application might be run from a command-prompt, or the user might control it interactively. - See `src/compiler/tests/yask_compiler_api_test.cpp` for an example stencil definition in C++. - See `bin/yask_compiler_api_test.py` for an example stencil definition in Python. 
- - See \ref yc for documentation on the compiler API. + - See \ref sec_yc for documentation on the compiler API. In either case, the resulting generated code should written to the C++ stencil-code file, `src/kernel/gen/yask_stencil_code.hpp`. @@ -88,31 +94,13 @@ This may be done in one of the following ways: final-state data for analysis or further processing. - See `src/kernel/tests/yask_kernel_api_test.cpp` for an example kernel usage in C++. - See `bin/yask_kernel_api_test.py` for an example kernel usage in Python. - - See \ref yk for documentation on the kernel API. + - See \ref sec_yk for documentation on the kernel API. -@note Anytime you want to change the name or compile-time properties of the kernel, be sure to run +@note Anytime you want to change the name or any compile-time properties of the kernel, be sure to run `make clean` to force the removal of all kernel-specific intermediate code. Otherwise, you will likely see some unexpected errors when building the new kernel. -@subsection examples Example Tests - -The following examples illustrate possible combinations of compilers and kernels. -- You can substitute `snb` for one of the other architectures listed in the `Makefile` if desired. -- Run `make clean` before all of the example commands to ensure consistent builds. 
- -Stencil Compiler | Stencil Application | Test Command ---------------------|---------------------|------------- -YASK-provided | YASK-provided | `make -j stencil=iso3dfd arch=snb yc-and-yk-test` -YASK-provided | C++ test example | `make -j stencil=iso3dfd arch=snb yc-and-cxx-yk-api-test` -YASK-provided | Python test example | `make -j stencil=iso3dfd arch=snb yc-and-py-yk-api-test` -C++ test example | YASK-provided | `make -j stencil=test arch=snb cxx-yc-api-and-yk-test` -C++ test example | C++ test example | `make -j stencil=test arch=snb cxx-yc-api-and-cxx-yk-api-test` -C++ test example | Python test example | `make -j stencil=test arch=snb cxx-yc-api-and-py-yk-api-test` -Python test example | YASK-provided | `make -j stencil=test arch=snb py-yc-api-and-yk-test` -Python test example | C++ test example | `make -j stencil=test arch=snb py-yc-api-and-cxx-yk-api-test` -Python test example | Python test example | `make -j stencil=test arch=snb py-yc-api-and-py-yk-api-test` - -@section yc YASK Stencil Compiler API +@section sec_yc YASK Stencil Compiler API @subsection yc_oview Compiler Overview This section provides usage information for the YASK stencil compiler @@ -121,6 +109,8 @@ The API is available for C++ and for Python via SWIG. Type names are prefixed with 'yc_' to indicate "YASK compiler"; this distinguishes them from the 'yk_'-prefixed types used in the "YASK kernel" API. +The types, classes, and functions are listed in \ref yc. + @subsection yc_usage Typical Program Flow using the Compiler API - Create a yc_factory. This is the "bootstrap" object that will be used to create others. - Create a yc_solution object via yc_factory::new_solution(). @@ -149,7 +139,7 @@ this distinguishes them from the 'yk_'-prefixed types used in the "YASK kernel" yc_solution::set_fold_len() and/or yc_solution::set_cluster_mult(). - Format the equations for additional processing via yc_solution::format(). 
-@section yk YASK Stencil Kernel API +@section sec_yk YASK Stencil Kernel API @subsection yk_oview Kernel Overview This section provides usage information for the YASK stencil kernel @@ -158,6 +148,8 @@ The API is available for C++ and for Python via SWIG. Type names are prefixed with 'yk_' to indicate "YASK kernel"; this distinguishes them from the 'yc_'-prefixed types used in the "YASK compiler" API. +The types, classes, and functions are listed in \ref yk. + @subsection yk_usage Typical Program Flow using the Kernel API - Create a \ref yk_factory. This is the "bootstrap" object that will be used to create others. - Create a \ref yk_env object via yk_factory::new_env(). This initializes MPI if you have enabled it. @@ -177,6 +169,25 @@ this distinguishes them from the 'yc_'-prefixed types used in the "YASK compiler - Apply the stencil(s) to the grids via yk_solution::run_solution(). There are versions for advancing one or more steps. - Retrieve the final results via yk_grid::get_element(). -*/ +@section sec_tests Example Tests + +The following examples illustrate possible combinations of compilers and kernels. +- You can add `stencil=`_stencil-name_ to use a specific stencil for testing. +- You can add `arch=`_arch-name_ to target one of the architectures listed in the `Makefile` if desired. +- Run `make clean` before all of the example commands to ensure consistent builds. 
+ +Stencil Compiler | Stencil Application | Test Command +--------------------|---------------------|------------- +YASK-provided | YASK-provided | `make -j yc-and-yk-test` +YASK-provided | C++ test example | `make -j yc-and-cxx-yk-api-test` +YASK-provided | Python test example | `make -j yc-and-py-yk-api-test` +C++ test example | YASK-provided | `make -j cxx-yc-api-and-yk-test` +C++ test example | C++ test example | `make -j cxx-yc-api-and-cxx-yk-api-test` +C++ test example | Python test example | `make -j cxx-yc-api-and-py-yk-api-test` +Python test example | YASK-provided | `make -j py-yc-api-and-yk-test` +Python test example | C++ test example | `make -j py-yc-api-and-cxx-yk-api-test` +Python test example | Python test example | `make -j py-yc-api-and-py-yk-api-test` + +*/ } diff --git a/include/yask_common_api.hpp b/include/yask_common_api.hpp index cce4c80a..7443c897 100644 --- a/include/yask_common_api.hpp +++ b/include/yask_common_api.hpp @@ -39,6 +39,12 @@ IN THE SOFTWARE. namespace yask { + /** + * \defgroup yask YASK Commmon Utilities + * Types, clases, and functions used in both the \ref sec_yc and \ref sec_yk. + * @{ + */ + /// Version information. /** @returns String describing the current version. @@ -181,6 +187,8 @@ namespace yask { virtual ~yask_null_output() {} }; + /** @}*/ + } // namespace yask. #endif diff --git a/include/yask_compiler_api.hpp b/include/yask_compiler_api.hpp index 41614756..e3ab4951 100644 --- a/include/yask_compiler_api.hpp +++ b/include/yask_compiler_api.hpp @@ -37,6 +37,12 @@ IN THE SOFTWARE. namespace yask { + /** + * \defgroup yc YASK Compiler + * Types, clases, and functions used in the \ref sec_yc. + * @{ + */ + // Forward declarations of classes and their pointers. // See yask_compiler_api.hpp for more. 
@@ -65,12 +71,19 @@ namespace yask { class yc_grid_point_node; /// Shared pointer to \ref yc_grid_point_node typedef std::shared_ptr yc_grid_point_node_ptr; + + /** @}*/ } -#include "yc_nodes.hpp" +#include "yc_node_api.hpp" namespace yask { + /** + * \addtogroup yc + * @{ + */ + /// Bootstrap factory to create objects needed to define a stencil solution. class yc_factory { public: @@ -310,6 +323,27 @@ namespace yask { yask_output_ptr output /**< [out] Pointer to object to receive formatted output. See \ref yask_output_factory. */) =0; + + /// **[Advanced]** Enable or disable automatic dependency checker. + /** + This should be used whenever the built-in dependency checker is + insufficient. Currently, the provided checker does not allow + stencils in which points in one sub-domain depend on points + in another sub-domain within the same value of the step index. + + @warning If dependency checker is disabled, *all* dependencies + must be set via the APIs. + */ + virtual void + set_dependency_checker_enabled(bool enable + /**< [in] `true` to enable or `false` to disable. */) =0; + + /// **[Advanced]** Determine whether automatic dependency checker is enabled. + /** + @returns Current setting. + */ + virtual bool + is_dependency_checker_enabled() const =0; }; /// A compile-time grid. @@ -366,6 +400,8 @@ namespace yask { #endif }; + /** @}*/ + } // namespace yask. #endif diff --git a/include/yask_kernel_api.hpp b/include/yask_kernel_api.hpp index 8d407962..c1579f50 100644 --- a/include/yask_kernel_api.hpp +++ b/include/yask_kernel_api.hpp @@ -38,6 +38,12 @@ IN THE SOFTWARE. namespace yask { + /** + * \defgroup yk YASK Kernel + * Types, clases, and functions used in both the \ref sec_yk. + * @{ + */ + /// Type to use for indexing grids. /** Index types are signed to allow negative indices in padding/halos. */ #ifdef SWIG @@ -64,6 +70,7 @@ namespace yask { /// Shared pointer to \ref yk_stats. typedef std::shared_ptr yk_stats_ptr; + /** @}*/ } // namespace yask. 
#include "yk_solution_api.hpp" @@ -71,6 +78,11 @@ namespace yask { namespace yask { + /** + * \addtogroup yk + * @{ + */ + /// Bootstrap factory to create a stencil solution. class yk_factory { public: @@ -144,6 +156,7 @@ namespace yask { global_barrier() const =0; }; + /** @}*/ } // namespace yask. #endif diff --git a/include/yc_nodes.hpp b/include/yc_node_api.hpp similarity index 81% rename from include/yc_nodes.hpp rename to include/yc_node_api.hpp index b6c3079e..86bf2eb4 100644 --- a/include/yc_nodes.hpp +++ b/include/yc_node_api.hpp @@ -27,13 +27,18 @@ IN THE SOFTWARE. // This file uses Doxygen 1.8 markup for API documentation-generation. // See http://www.stack.nl/~dimitri/doxygen. -/** @file yask_compiler_api.hpp */ +/** @file yc_node_api.hpp */ #ifndef YC_NODES #define YC_NODES namespace yask { + /** + * \addtogroup yc + * @{ + */ + // Forward declarations of expression nodes and their pointers. // See yask_compiler_api.hpp for more. @@ -125,7 +130,7 @@ namespace yask { */ virtual yc_index_node_ptr new_step_index(const std::string& name - /**< [in] Step dimension name. */ ); + /**< [in] Step dimension name. */ ); /// Create a domain-index node. /** @@ -135,10 +140,10 @@ namespace yask { This should *not* include the step dimension, which is specified via new_step_index(). @returns Pointer to new \ref yc_index_node object. - */ + */ virtual yc_index_node_ptr new_domain_index(const std::string& name - /**< [in] Domain index name. */ ); + /**< [in] Domain index name. */ ); /// Create a new miscellaneous index. /** @@ -146,7 +151,7 @@ namespace yask { some dimension that is not the step dimension or a domain dimension. Example: index into an array. @returns Pointer to new \ref yc_index_node object. - */ + */ virtual yc_index_node_ptr new_misc_index(const std::string& name /**< [in] Index name. */ ); @@ -179,7 +184,7 @@ namespace yask { /// Create a constant numerical value node. /** This is unary negation. - Use new_subtraction_node() for binary '-'. 
+ Use new_subtraction_node() for binary `-`. @returns Pointer to new \ref yc_const_number_node object. */ virtual yc_const_number_node_ptr @@ -187,48 +192,56 @@ namespace yask { /// Create a numerical negation operator node. /** - @returns Pointer to new \ref yc_negate_node object. + New negation nodes can also be created via the overloaded unary `-` operator. + @returns Pointer to new \ref yc_negate_node object. */ virtual yc_negate_node_ptr - new_negate_node(yc_number_node_ptr rhs /**< [in] Expression after '-' sign. */ ); + new_negate_node(yc_number_node_ptr rhs /**< [in] Expression after `-` sign. */ ); /// Create an addition node. /** Nodes must be created with at least two operands, and more can be added by calling add_operand() on the returned node. + + New addition nodes can also be created via the overloaded `+` operator. @returns Pointer to new \ref yc_add_node object. */ virtual yc_add_node_ptr - new_add_node(yc_number_node_ptr lhs /**< [in] Expression before '+' sign. */, - yc_number_node_ptr rhs /**< [in] Expression after '+' sign. */ ); + new_add_node(yc_number_node_ptr lhs /**< [in] Expression before `+` sign. */, + yc_number_node_ptr rhs /**< [in] Expression after `+` sign. */ ); /// Create a multiplication node. /** Nodes must be created with at least two operands, and more can be added by calling add_operand() on the returned node. + + New multiplication nodes can also be created via the overloaded `*` operator. @returns Pointer to new \ref yc_multiply_node object. */ virtual yc_multiply_node_ptr - new_multiply_node(yc_number_node_ptr lhs /**< [in] Expression before '*' sign. */, - yc_number_node_ptr rhs /**< [in] Expression after '*' sign. */ ); + new_multiply_node(yc_number_node_ptr lhs /**< [in] Expression before `*` sign. */, + yc_number_node_ptr rhs /**< [in] Expression after `*` sign. */ ); /// Create a subtraction node. /** This is binary subtraction. - Use new_negation_node() for unary '-'. + Use new_negation_node() for unary `-`. 
+ + New subtraction nodes can also be created via the overloaded `-` operator. @returns Pointer to new \ref yc_subtract_node object. */ virtual yc_subtract_node_ptr - new_subtract_node(yc_number_node_ptr lhs /**< [in] Expression before '-' sign. */, - yc_number_node_ptr rhs /**< [in] Expression after '-' sign. */ ); + new_subtract_node(yc_number_node_ptr lhs /**< [in] Expression before `-` sign. */, + yc_number_node_ptr rhs /**< [in] Expression after `-` sign. */ ); /// Create a division node. /** + New division nodes can also be created via the overloaded `/` operator. @returns Pointer to new \ref yc_divide_node object. */ virtual yc_divide_node_ptr - new_divide_node(yc_number_node_ptr lhs /**< [in] Expression before '/' sign. */, - yc_number_node_ptr rhs /**< [in] Expression after '/' sign. */ ); + new_divide_node(yc_number_node_ptr lhs /**< [in] Expression before `/` sign. */, + yc_number_node_ptr rhs /**< [in] Expression after `/` sign. */ ); /// Create a symbol for the first index value in a given dimension. /** @@ -244,14 +257,14 @@ namespace yask { // Create expression for "first_x + 10". auto left10 = node_fac.new_add_node(first_x, - node_fac.new_const_number_node(10)); + node_fac.new_const_number_node(10)); // Create boolean expression for "x > first_x + 10". auto expr = node_fac.new_greater_than_node(x, left10); \endcode @returns Pointer to new \ref yc_index_node object. - */ + */ virtual yc_number_node_ptr new_first_domain_index(yc_index_node_ptr idx /**< [in] Domain index. */ ); @@ -270,14 +283,14 @@ namespace yask { // Create expression for "last_x - 10". auto right10 = node_fac.new_subtract_node(last_x, - node_fac.new_const_number_node(10)); + node_fac.new_const_number_node(10)); // Create boolean expression for "x < first_x - 10". auto expr = node_fac.new_less_than_node(x, right10); \endcode @returns Pointer to new \ref yc_index_node object. - */ + */ virtual yc_number_node_ptr new_last_domain_index(yc_index_node_ptr idx /**< [in] Domain index. 
*/ ); @@ -287,71 +300,71 @@ namespace yask { @returns Pointer to new \ref yc_not_node object. */ virtual yc_not_node_ptr - new_not_node(yc_bool_node_ptr rhs /**< [in] Expression after '!' sign. */ ); + new_not_node(yc_bool_node_ptr rhs /**< [in] Expression after `!` sign. */ ); /// Create a boolean 'and' node. /** @returns Pointer to new \ref yc_and_node object. */ virtual yc_and_node_ptr - new_and_node(yc_bool_node_ptr lhs /**< [in] Expression before '&&' sign. */, - yc_bool_node_ptr rhs /**< [in] Expression after '&&' sign. */ ); + new_and_node(yc_bool_node_ptr lhs /**< [in] Expression before `&&` sign. */, + yc_bool_node_ptr rhs /**< [in] Expression after `&&` sign. */ ); /// Create a boolean 'or' node. /** @returns Pointer to new \ref yc_or_node object. */ virtual yc_or_node_ptr - new_or_node(yc_bool_node_ptr lhs /**< [in] Expression before '||' sign. */, - yc_bool_node_ptr rhs /**< [in] Expression after '||' sign. */ ); + new_or_node(yc_bool_node_ptr lhs /**< [in] Expression before `||` sign. */, + yc_bool_node_ptr rhs /**< [in] Expression after `||` sign. */ ); /// Create a numerical-comparison 'equals' node. /** @returns Pointer to new \ref yc_equals_node object. */ virtual yc_equals_node_ptr - new_equals_node(yc_number_node_ptr lhs /**< [in] Expression before '==' sign. */, - yc_number_node_ptr rhs /**< [in] Expression after '==' sign. */ ); + new_equals_node(yc_number_node_ptr lhs /**< [in] Expression before `==` sign. */, + yc_number_node_ptr rhs /**< [in] Expression after `==` sign. */ ); /// Create a numerical-comparison 'not-equals' node. /** @returns Pointer to new \ref yc_not_equals_node object. */ virtual yc_not_equals_node_ptr - new_not_equals_node(yc_number_node_ptr lhs /**< [in] Expression before '!=' sign. */, - yc_number_node_ptr rhs /**< [in] Expression after '!=' sign. */ ); + new_not_equals_node(yc_number_node_ptr lhs /**< [in] Expression before `!=` sign. */, + yc_number_node_ptr rhs /**< [in] Expression after `!=` sign. 
*/ ); /// Create a numerical-comparison 'less-than' node. /** @returns Pointer to new \ref yc_less_than_node object. */ virtual yc_less_than_node_ptr - new_less_than_node(yc_number_node_ptr lhs /**< [in] Expression before '<' sign. */, - yc_number_node_ptr rhs /**< [in] Expression after '<' sign. */ ); + new_less_than_node(yc_number_node_ptr lhs /**< [in] Expression before `<` sign. */, + yc_number_node_ptr rhs /**< [in] Expression after `<` sign. */ ); /// Create a numerical-comparison 'greater-than' node. /** @returns Pointer to new \ref yc_greater_than_node object. */ virtual yc_greater_than_node_ptr - new_greater_than_node(yc_number_node_ptr lhs /**< [in] Expression before '>' sign. */, - yc_number_node_ptr rhs /**< [in] Expression after '>' sign. */ ); + new_greater_than_node(yc_number_node_ptr lhs /**< [in] Expression before `>` sign. */, + yc_number_node_ptr rhs /**< [in] Expression after `>` sign. */ ); /// Create a numerical-comparison 'greater-than or equals' node. /** @returns Pointer to new \ref yc_not_less_than_node object. */ virtual yc_not_less_than_node_ptr - new_not_less_than_node(yc_number_node_ptr lhs /**< [in] Expression before '>=' sign. */, - yc_number_node_ptr rhs /**< [in] Expression after '>=' sign. */ ); + new_not_less_than_node(yc_number_node_ptr lhs /**< [in] Expression before `>=` sign. */, + yc_number_node_ptr rhs /**< [in] Expression after `>=` sign. */ ); /// Create a numerical-comparison 'less-than or equals' node. /** @returns Pointer to new \ref yc_not_greater_than_node object. */ virtual yc_not_greater_than_node_ptr - new_not_greater_than_node(yc_number_node_ptr lhs /**< [in] Expression before '<=' sign. */, - yc_number_node_ptr rhs /**< [in] Expression after '<=' sign. */ ); + new_not_greater_than_node(yc_number_node_ptr lhs /**< [in] Expression before `<=` sign. */, + yc_number_node_ptr rhs /**< [in] Expression after `<=` sign. */ ); }; @@ -365,7 +378,7 @@ namespace yask { /** Formats the expression starting at this node. 
@returns String containing a single-line human-readable version of the expression. - */ + */ virtual std::string format_simple() const =0; /// Count the size of the AST. @@ -406,7 +419,12 @@ namespace yask { /// Base class for all numerical AST nodes. /** An object of this abstract type cannot be created. */ - class yc_number_node : public virtual yc_expr_node { }; + class yc_number_node : public virtual yc_expr_node { + public: + + /// Create a deep copy of AST starting with this node. + virtual yc_number_node_ptr clone_ast() const =0; + }; /// Base class for all boolean AST nodes. /** An object of this abstract type cannot be created. */ @@ -432,7 +450,7 @@ namespace yask { /** Created via yc_grid::new_relative_grid_point(). */ - class yc_grid_point_node : public virtual yc_number_node { + class yc_grid_point_node : public virtual yc_number_node { public: /// Get the grid this point is in. @@ -464,14 +482,14 @@ namespace yask { /// A numerical negation operator. /** Example: used to implement -(a*b). Created via yc_node_factory::new_negate_node(). - */ + */ class yc_negate_node : public virtual yc_number_node { public: /// Get the [only] operand. /** This node implements unary negation only, not subtraction, so there is - never a left-hand-side. - @returns Expression node on right-hand-side of '-' sign. */ + never a left-hand-side. + @returns Expression node on right-hand-side of `-` sign. */ virtual yc_number_node_ptr get_rhs() =0; }; @@ -485,8 +503,8 @@ namespace yask { /// Get the number of operands. /** If there is just one operand, the operation itself is moot. If there are more than one operand, the operation applies between - them. Example: for an add operator, if the operands are 'a', - 'b', and 'c', then the expression is 'a + b + c'. + them. Example: for an add operator, if the operands are `a`, + `b`, and `c`, then the expression is `a + b + c`. @returns Number of operands. 
*/ virtual int get_num_operands() =0; @@ -515,12 +533,12 @@ namespace yask { public: /// Get the left-hand-side operand. - /** @returns Pointer to expression node appearing before the '-' sign. */ + /** @returns Pointer to expression node appearing before the `-` sign. */ virtual yc_number_node_ptr get_lhs() =0; /// Get the right-hand-side operand. - /** @returns Pointer to expression node appearing after the '-' sign. */ + /** @returns Pointer to expression node appearing after the `-` sign. */ virtual yc_number_node_ptr get_rhs() =0; }; @@ -531,12 +549,12 @@ namespace yask { public: /// Get the left-hand-side operand. - /** @returns Pointer to expression node appearing before the '/' sign. */ + /** @returns Pointer to expression node appearing before the `/` sign. */ virtual yc_number_node_ptr get_lhs() =0; /// Get the right-hand-side operand. - /** @returns Pointer to expression node appearing after the '/' sign. */ + /** @returns Pointer to expression node appearing after the `/` sign. */ virtual yc_number_node_ptr get_rhs() =0; }; @@ -544,12 +562,12 @@ namespace yask { /// A boolean inversion operator. /** Example: used to implement `!(a || b)`. Created via yc_node_factory::new_not_node(). - */ + */ class yc_not_node : public virtual yc_bool_node { public: /// Get the [only] operand. - /** @returns Expression node on right-hand-side of '!' sign. */ + /** @returns Expression node on right-hand-side of `!` sign. */ virtual yc_bool_node_ptr get_rhs() =0; }; @@ -557,17 +575,17 @@ namespace yask { /// A boolean 'and' operator. /** Example: used to implement `a && b`. Created via yc_node_factory::new_and_node(). - */ + */ class yc_and_node : public virtual yc_bool_node { public: /// Get the left-hand-side operand. - /** @returns Expression node on left-hand-side of '&&' sign. */ + /** @returns Expression node on left-hand-side of `&&` sign. */ virtual yc_bool_node_ptr get_lhs() =0; /// Get the right-hand-size operand. 
- /** @returns Expression node on right-hand-side of '&&' sign. */ + /** @returns Expression node on right-hand-side of `&&` sign. */ virtual yc_bool_node_ptr get_rhs() =0; }; @@ -575,17 +593,17 @@ namespace yask { /// A boolean 'or' operator. /** Example: used to implement `a || b`. Created via yc_node_factory::new_or_node(). - */ + */ class yc_or_node : public virtual yc_bool_node { public: /// Get the left-hand-side operand. - /** @returns Expression node on left-hand-side of '||' sign. */ + /** @returns Expression node on left-hand-side of `||` sign. */ virtual yc_bool_node_ptr get_lhs() =0; /// Get the right-hand-size operand. - /** @returns Expression node on right-hand-side of '||' sign. */ + /** @returns Expression node on right-hand-side of `||` sign. */ virtual yc_bool_node_ptr get_rhs() =0; }; @@ -593,17 +611,17 @@ namespace yask { /// A numerical-comparison 'equals' operator. /** Example: used to implement `a == b`. Created via yc_node_factory::new_equals_node(). - */ + */ class yc_equals_node : public virtual yc_bool_node { public: /// Get the left-hand-side operand. - /** @returns Expression node on left-hand-side of '==' sign. */ + /** @returns Expression node on left-hand-side of `==` sign. */ virtual yc_number_node_ptr get_lhs() =0; /// Get the right-hand-size operand. - /** @returns Expression node on right-hand-side of '==' sign. */ + /** @returns Expression node on right-hand-side of `==` sign. */ virtual yc_number_node_ptr get_rhs() =0; }; @@ -611,17 +629,17 @@ namespace yask { /// A numerical-comparison 'not_equals' operator. /** Example: used to implement `a != b`. Created via yc_node_factory::new_not_equals_node(). - */ + */ class yc_not_equals_node : public virtual yc_bool_node { public: /// Get the left-hand-side operand. - /** @returns Expression node on left-hand-side of '!=' sign. */ + /** @returns Expression node on left-hand-side of `!=` sign. */ virtual yc_number_node_ptr get_lhs() =0; /// Get the right-hand-size operand. 
- /** @returns Expression node on right-hand-side of '!=' sign. */ + /** @returns Expression node on right-hand-side of `!=` sign. */ virtual yc_number_node_ptr get_rhs() =0; }; @@ -629,17 +647,17 @@ namespace yask { /// A numerical-comparison 'less_than' operator. /** Example: used to implement `a < b`. Created via yc_node_factory::new_less_than_node(). - */ + */ class yc_less_than_node : public virtual yc_bool_node { public: /// Get the left-hand-side operand. - /** @returns Expression node on left-hand-side of '<' sign. */ + /** @returns Expression node on left-hand-side of `<` sign. */ virtual yc_number_node_ptr get_lhs() =0; /// Get the right-hand-size operand. - /** @returns Expression node on right-hand-side of '<' sign. */ + /** @returns Expression node on right-hand-side of `<` sign. */ virtual yc_number_node_ptr get_rhs() =0; }; @@ -647,17 +665,17 @@ namespace yask { /// A numerical-comparison 'greater_than' operator. /** Example: used to implement `a > b`. Created via yc_node_factory::new_greater_than_node(). - */ + */ class yc_greater_than_node : public virtual yc_bool_node { public: /// Get the left-hand-side operand. - /** @returns Expression node on left-hand-side of '>' sign. */ + /** @returns Expression node on left-hand-side of `>` sign. */ virtual yc_number_node_ptr get_lhs() =0; /// Get the right-hand-size operand. - /** @returns Expression node on right-hand-side of '>' sign. */ + /** @returns Expression node on right-hand-side of `>` sign. */ virtual yc_number_node_ptr get_rhs() =0; }; @@ -666,17 +684,17 @@ namespace yask { /// A numerical-comparison 'not_less_than' operator. /** Example: used to implement `a >= b`. Created via yc_node_factory::new_not_less_than_node(). - */ + */ class yc_not_less_than_node : public virtual yc_bool_node { public: /// Get the left-hand-side operand. - /** @returns Expression node on left-hand-side of '>=' sign. */ + /** @returns Expression node on left-hand-side of `>=` sign. 
*/ virtual yc_number_node_ptr get_lhs() =0; /// Get the right-hand-size operand. - /** @returns Expression node on right-hand-side of '>=' sign. */ + /** @returns Expression node on right-hand-side of `>=` sign. */ virtual yc_number_node_ptr get_rhs() =0; }; @@ -684,23 +702,62 @@ namespace yask { /// A numerical-comparison 'not_greater_than' operator. /** Example: used to implement `a <= b`. Created via yc_node_factory::new_not_greater_than_node(). - */ + */ class yc_not_greater_than_node : public virtual yc_bool_node { public: /// Get the left-hand-side operand. - /** @returns Expression node on left-hand-side of '<=' sign. */ + /** @returns Expression node on left-hand-side of `<=` sign. */ virtual yc_number_node_ptr get_lhs() =0; /// Get the right-hand-size operand. - /** @returns Expression node on right-hand-side of '<=' sign. */ + /** @returns Expression node on right-hand-side of `<=` sign. */ virtual yc_number_node_ptr get_rhs() =0; }; + // Non-class operators. + // These are only defined if the older "internal DSL" is not used. + // The internal version will eventually be deprecated and + // perhaps removed in favor of this API. + +#ifndef USE_INTERNAL_DSL + /// Operator version of yc_node_factory::new_negation_node(). + yc_negate_node_ptr operator-(yc_number_node_ptr rhs); + + /// Operator version of yc_node_factory::new_addition_node(). + yc_add_node_ptr operator+(yc_number_node_ptr lhs, yc_number_node_ptr rhs); + /// Operator version of yc_node_factory::new_addition_node(). + yc_add_node_ptr operator+(double lhs, yc_number_node_ptr rhs); + /// Operator version of yc_node_factory::new_addition_node(). + yc_add_node_ptr operator+(yc_number_node_ptr lhs, double rhs); + + /// Operator version of yc_node_factory::new_division_node(). + yc_divide_node_ptr operator/(yc_number_node_ptr lhs, yc_number_node_ptr rhs); + /// Operator version of yc_node_factory::new_division_node(). 
+ yc_divide_node_ptr operator/(double lhs, yc_number_node_ptr rhs); + /// Operator version of yc_node_factory::new_division_node(). + yc_divide_node_ptr operator/(yc_number_node_ptr lhs, double rhs); + + /// Operator version of yc_node_factory::new_multiplication_node(). + yc_multiply_node_ptr operator*(yc_number_node_ptr lhs, yc_number_node_ptr rhs); + /// Operator version of yc_node_factory::new_multiplication_node(). + yc_multiply_node_ptr operator*(double lhs, yc_number_node_ptr rhs); + /// Operator version of yc_node_factory::new_multiplication_node(). + yc_multiply_node_ptr operator*(yc_number_node_ptr lhs, double rhs); + + /// Operator version of yc_node_factory::new_subtraction_node(). + yc_subtract_node_ptr operator-(yc_number_node_ptr lhs, yc_number_node_ptr rhs); + /// Operator version of yc_node_factory::new_subtraction_node(). + yc_subtract_node_ptr operator-(double lhs, yc_number_node_ptr rhs); + /// Operator version of yc_node_factory::new_subtraction_node(). + yc_subtract_node_ptr operator-(yc_number_node_ptr lhs, double rhs); +#endif + /** @}*/ + } // namespace yask. #endif diff --git a/include/yk_grid_api.hpp b/include/yk_grid_api.hpp index c0219109..d06dd6b4 100644 --- a/include/yk_grid_api.hpp +++ b/include/yk_grid_api.hpp @@ -36,6 +36,11 @@ IN THE SOFTWARE. namespace yask { + /** + * \addtogroup yk + * @{ + */ + /// A run-time grid. /** "Grid" is a generic term for any n-dimensional array. A 0-dim grid @@ -935,10 +940,9 @@ namespace yask { /**< [in] Name of dimension to get. Must be one of the names from yk_solution::get_domain_dim_names(). */ ) const =0; - }; - + /** @}*/ } // namespace yask. #endif diff --git a/include/yk_solution_api.hpp b/include/yk_solution_api.hpp index d7c554a8..b66feb02 100644 --- a/include/yk_solution_api.hpp +++ b/include/yk_solution_api.hpp @@ -36,6 +36,11 @@ IN THE SOFTWARE. namespace yask { + /** + * \addtogroup yk + * @{ + */ + /// Allocate grids on local NUMA node. 
/** This is used in yk_solution::set_default_numa_preferred @@ -730,6 +735,7 @@ namespace yask { get_elapsed_run_secs() =0; }; + /** @}*/ } // namespace yask. #endif diff --git a/src/compiler/lib/Eqs.cpp b/src/compiler/lib/Eqs.cpp index 27af5e44..94955543 100644 --- a/src/compiler/lib/Eqs.cpp +++ b/src/compiler/lib/Eqs.cpp @@ -180,22 +180,24 @@ namespace yask { }); _done = true; } - - // Find dependencies based on all eqs. - // Side effect: sets _stepDir in dims. + + // Analyze group of equations. + // Sets _stepDir in dims. + // Finds dependencies based on all eqs if 'settings._findDeps'. // Throws exceptions on illegal dependencies. // TODO: split this into smaller functions. // BIG-TODO: replace dependency algorithms with integration of a polyhedral // library. - void Eqs::findDeps(Dimensions& dims, - ostream& os) { + void Eqs::analyzeEqs(CompilerSettings& settings, + Dimensions& dims, + ostream& os) { auto& stepDim = dims._stepDim; // Gather points from all eqs in all grids. PointVisitor pt_vis; // Gather initial stats from all eqs. - os << "Scanning " << getEqs().size() << " equation(s) for dependencies...\n"; + os << "Scanning " << getEqs().size() << " stencil equation(s) for dependencies...\n"; for (auto eq1 : getEqs()) eq1->accept(&pt_vis); auto& outGrids = pt_vis.getOutputGrids(); @@ -315,6 +317,9 @@ namespace yask { "' on LHS"); } } + + // TODO: check that domain indices are simple offsets and + // misc indices are consts. } // TODO: check to make sure cond1 depends only on indices. @@ -363,6 +368,7 @@ namespace yask { // dependencies by looking for exact matches. // We do this check first because it's quicker than the // detailed scan done later if this one doesn't find a dep. + // Also, this is always illegal, even if not finding deps. // // Example: // eq1: a(t+1, x, ...) EQUALS ... @@ -380,13 +386,18 @@ namespace yask { // Save dependency. 
#ifdef DEBUG_DEP cout << " Exact match found to " << op1->makeQuotedStr() << ".\n"; -#endif - _eq_deps[cur_step_dep].set_imm_dep_on(eq2, eq1); +#endif + if (settings._findDeps) + _eq_deps[cur_step_dep].set_imm_dep_on(eq2, eq1); // Move along to next eq2. continue; } + // Don't do more conservative checks if not looking for deps. + if (!settings._findDeps) + continue; + // Next dep check: inexact matches on LHS of eq1 to RHS of eq2. // Does eq1 define *any* point in a grid that eq2 inputs // at the same step index? If so, they *might* have a @@ -453,6 +464,8 @@ namespace yask { } // for all eqs (eq1). // Resolve indirect dependencies. + // Do this even if not finding deps because we want to + // resolve deps provided by the user. os << " Resolving indirect dependencies...\n"; for (DepType dt = DepType(0); dt < num_deps; dt = DepType(dt+1)) _eq_deps[dt].find_all_deps(); diff --git a/src/compiler/lib/Eqs.hpp b/src/compiler/lib/Eqs.hpp index ae942798..d403ea0a 100644 --- a/src/compiler/lib/Eqs.hpp +++ b/src/compiler/lib/Eqs.hpp @@ -162,8 +162,9 @@ namespace yask { // Find dependencies based on all eqs. If 'eq_deps' is // set, save dependencies between eqs in referent. - virtual void findDeps(Dimensions& dims, - std::ostream& os); + virtual void analyzeEqs(CompilerSettings& settings, + Dimensions& dims, + std::ostream& os); // Determine which grid points can be vectorized. 
virtual void analyzeVec(const Dimensions& dims); diff --git a/src/compiler/lib/Expr.cpp b/src/compiler/lib/Expr.cpp index 198113b0..361ccf43 100644 --- a/src/compiler/lib/Expr.cpp +++ b/src/compiler/lib/Expr.cpp @@ -203,7 +203,64 @@ namespace yask { if (!p) THROW_YASK_EXCEPTION("Error: new_last_domain_index() called without index-node argument"); return last_index(p); - } + } + yc_negate_node_ptr operator-(yc_number_node_ptr rhs) { + auto p = dynamic_pointer_cast(rhs); + assert(p); + return make_shared(p); + } + yc_add_node_ptr operator+(yc_number_node_ptr lhs, yc_number_node_ptr rhs) { + auto lp = dynamic_pointer_cast(lhs); + assert(lp); + auto rp = dynamic_pointer_cast(rhs); + assert(rp); + return make_shared(lp, rp); + } + yc_add_node_ptr operator+(double lhs, yc_number_node_ptr rhs) { + return operator+(constNum(lhs), rhs); + } + yc_add_node_ptr operator+(yc_number_node_ptr lhs, double rhs) { + return operator+(lhs, constNum(rhs)); + } + yc_divide_node_ptr operator/(yc_number_node_ptr lhs, yc_number_node_ptr rhs) { + auto lp = dynamic_pointer_cast(lhs); + assert(lp); + auto rp = dynamic_pointer_cast(rhs); + assert(rp); + return make_shared(lp, rp); + } + yc_divide_node_ptr operator/(double lhs, yc_number_node_ptr rhs) { + return operator/(constNum(lhs), rhs); + } + yc_divide_node_ptr operator/(yc_number_node_ptr lhs, double rhs) { + return operator/(lhs, constNum(rhs)); + } + yc_multiply_node_ptr operator*(yc_number_node_ptr lhs, yc_number_node_ptr rhs) { + auto lp = dynamic_pointer_cast(lhs); + assert(lp); + auto rp = dynamic_pointer_cast(rhs); + assert(rp); + return make_shared(lp, rp); + } + yc_multiply_node_ptr operator*(double lhs, yc_number_node_ptr rhs) { + return operator*(constNum(lhs), rhs); + } + yc_multiply_node_ptr operator*(yc_number_node_ptr lhs, double rhs) { + return operator*(lhs, constNum(rhs)); + } + yc_subtract_node_ptr operator-(yc_number_node_ptr lhs, yc_number_node_ptr rhs) { + auto lp = dynamic_pointer_cast(lhs); + assert(lp); + auto rp 
= dynamic_pointer_cast(rhs); + assert(rp); + return make_shared(lp, rp); + } + yc_subtract_node_ptr operator-(double lhs, yc_number_node_ptr rhs) { + return operator-(constNum(lhs), rhs); + } + yc_subtract_node_ptr operator-(yc_number_node_ptr lhs, double rhs) { + return operator-(lhs, constNum(rhs)); + } // Compare 2 expr pointers and return whether the expressions are // equivalent. diff --git a/src/compiler/lib/Expr.hpp b/src/compiler/lib/Expr.hpp index 3b45743c..f10c1ebd 100644 --- a/src/compiler/lib/Expr.hpp +++ b/src/compiler/lib/Expr.hpp @@ -207,7 +207,8 @@ namespace yask { } // Real or int value. - class NumExpr : public Expr, public virtual yc_number_node { + class NumExpr : public Expr, + public virtual yc_number_node { public: // Return 'true' if this is a compile-time constant. @@ -245,6 +246,9 @@ namespace yask { // For this to work properly, each derived type // should also implement a deep-copy copy ctor. virtual NumExprPtr clone() const =0; + virtual yc_number_node_ptr clone_ast() const { + return clone(); + } }; // Grid index types. @@ -306,7 +310,7 @@ namespace yask { // A free function to create a constant expression. // Usually not needed due to operator overloading. - NumExprPtr constNum(double rhs); + NumExprPtr constNum(double val); // Free functions to create boundary indices, e.g., 'first_index(x)'. NumExprPtr first_index(IndexExprPtr dim); diff --git a/src/compiler/lib/Grid.hpp b/src/compiler/lib/Grid.hpp index 332c2f05..399f856d 100644 --- a/src/compiler/lib/Grid.hpp +++ b/src/compiler/lib/Grid.hpp @@ -304,6 +304,7 @@ namespace yask { bool _doOptCluster = true; // apply optimizations also to cluster. string _eqBundleTargets; // how to bundle equations. string _gridRegex; // grids to update. + bool _findDeps = true; }; // Stencil dimensions. 
diff --git a/src/compiler/lib/Soln.cpp b/src/compiler/lib/Soln.cpp index a296dcb4..53f95bfe 100644 --- a/src/compiler/lib/Soln.cpp +++ b/src/compiler/lib/Soln.cpp @@ -85,7 +85,7 @@ namespace yask { _eqs.analyzeLoop(_dims); // Find dependencies between equations. - _eqs.findDeps(_dims, *_dos); + _eqs.analyzeEqs(_settings, _dims, *_dos); // Update access stats for the grids. _eqs.updateGridStats(); diff --git a/src/compiler/lib/Soln.hpp b/src/compiler/lib/Soln.hpp index b91a50da..4cefe9f5 100644 --- a/src/compiler/lib/Soln.hpp +++ b/src/compiler/lib/Soln.hpp @@ -184,6 +184,8 @@ namespace yask { virtual void clear_clustering() { _settings._clusterOptions.clear(); } virtual void set_element_bytes(int nbytes) { _settings._elem_bytes = nbytes; } virtual int get_element_bytes() const { return _settings._elem_bytes; } + virtual bool is_dependency_checker_enabled() const { return _settings._findDeps; } + virtual void set_dependency_checker_enabled(bool enable) { _settings._findDeps = enable; } virtual void format(const std::string& format_type, yask_output_ptr output); }; diff --git a/src/compiler/main.cpp b/src/compiler/main.cpp index a1c43ea4..1fd99f35 100644 --- a/src/compiler/main.cpp +++ b/src/compiler/main.cpp @@ -25,6 +25,11 @@ IN THE SOFTWARE. /////////////// Main vector-folding code-generation code. ///////////// +// This macro blocks the operator overloads in the API. +// This is temporary until the "internal DSL" gets completely +// replaced by the APIs. +#define USE_INTERNAL_DSL + // Generation code. 
#include "ExprUtils.hpp" #include "Grid.hpp" @@ -121,11 +126,11 @@ void usage(const string& cmd) { " [-no]-opt-cluster\n" " Do [not] apply optimizations across the cluster (default=" << settings._doOptCluster << ").\n" " -max-es \n" - " Set heuristic for max single expression-size (default=" << - settings._maxExprSize << ").\n" + " Set heuristic for max single expression-size (default=" << settings._maxExprSize << ").\n" " -min-es \n" - " Set heuristic for min expression-size for reuse (default=" << - settings._minExprSize << ").\n" + " Set heuristic for min expression-size for reuse (default=" << settings._minExprSize << ").\n" + " [-no]-find-deps\n" + " Find dependencies between stencil equations (default=" << settings._findDeps << ").\n" "\n" " -p \n" " Format output per and write to .\n" @@ -184,6 +189,10 @@ void parseOpts(int argc, const char* argv[]) settings._doOptCluster = true; else if (opt == "-no-opt-cluster") settings._doOptCluster = false; + else if (opt == "-find-deps") + settings._findDeps = true; + else if (opt == "-no-find-deps") + settings._findDeps = false; // add any more options w/o values above. diff --git a/src/compiler/swig/yask_compiler_api.i b/src/compiler/swig/yask_compiler_api.i index a9b154d8..715632cc 100644 --- a/src/compiler/swig/yask_compiler_api.i +++ b/src/compiler/swig/yask_compiler_api.i @@ -76,6 +76,7 @@ IN THE SOFTWARE. %template(vector_eq) std::vector>; %template(vector_grid) std::vector; +// Tell SWIG how to catch a YASK exception and rethrow it in Python. %exception { try { $action @@ -85,6 +86,66 @@ IN THE SOFTWARE. } } +// Tell SWIG how to handle non-class overloaded operators in Python. 
+%extend yask::yc_number_node { + yask::yc_number_node_ptr __neg__() { + auto p = $self->clone_ast(); + return yask::operator-(p); + } + }; +%extend yask::yc_number_node { + yask::yc_number_node_ptr __add__(yask::yc_number_node* rhs) { + auto lp = $self->clone_ast(); + auto rp = rhs->clone_ast(); + return yask::operator+(lp, rp); + } + }; +%extend yask::yc_number_node { + yask::yc_number_node_ptr __add__(double rhs) { + auto lp = $self->clone_ast(); + return yask::operator+(lp, rhs); + } + }; +%extend yask::yc_number_node { + yask::yc_number_node_ptr __truediv__(yask::yc_number_node* rhs) { + auto lp = $self->clone_ast(); + auto rp = rhs->clone_ast(); + return yask::operator/(lp, rp); + } + }; +%extend yask::yc_number_node { + yask::yc_number_node_ptr __truediv__(double rhs) { + auto lp = $self->clone_ast(); + return yask::operator/(lp, rhs); + } + }; +%extend yask::yc_number_node { + yask::yc_number_node_ptr __mul__(yask::yc_number_node* rhs) { + auto lp = $self->clone_ast(); + auto rp = rhs->clone_ast(); + return yask::operator*(lp, rp); + } + }; +%extend yask::yc_number_node { + yask::yc_number_node_ptr __mul__(double rhs) { + auto lp = $self->clone_ast(); + return yask::operator*(lp, rhs); + } + }; +%extend yask::yc_number_node { + yask::yc_number_node_ptr __sub__(yask::yc_number_node* rhs) { + auto lp = $self->clone_ast(); + auto rp = rhs->clone_ast(); + return yask::operator-(lp, rp); + } + }; +%extend yask::yc_number_node { + yask::yc_number_node_ptr __sub__(double rhs) { + auto lp = $self->clone_ast(); + return yask::operator-(lp, rhs); + } + }; + %include "yask_common_api.hpp" %include "yask_compiler_api.hpp" -%include "yc_nodes.hpp" +%include "yc_node_api.hpp" diff --git a/src/compiler/tests/yask_compiler_api_test.cpp b/src/compiler/tests/yask_compiler_api_test.cpp index 7bf77a66..910e105b 100644 --- a/src/compiler/tests/yask_compiler_api_test.cpp +++ b/src/compiler/tests/yask_compiler_api_test.cpp @@ -61,20 +61,19 @@ int main() { auto n1 = 
fac.new_const_number_node(3.14); cout << n1->format_simple() << endl; - auto n2 = fac.new_negate_node(n1); + auto n2 = g1->new_relative_grid_point({0, +1, 0, -2}); cout << n2->format_simple() << endl; - auto n3 = g1->new_relative_grid_point({0, +1, 0, -2}); + auto n3 = n1 + n2; cout << n3->format_simple() << endl; - auto n4a = fac.new_add_node(n2, n3); - auto n4b = fac.new_add_node(n4a, n1); - cout << n4b->format_simple() << endl; + auto n4 = n2 * -n3 * 0.9; + cout << n4->format_simple() << endl; auto n5 = g1->new_relative_grid_point({0, +1, -1, 0}); cout << n5->format_simple() << endl; - auto n6 = fac.new_multiply_node(n4b, n5); + auto n6 = n4 / n5; cout << n6->format_simple() << endl; // Define scratch grid value. @@ -87,7 +86,7 @@ int main() { // Use scratch grid value. auto n7a = sg1->new_relative_grid_point({-1, 0, +2}); auto n7b = sg1->new_relative_grid_point({+1, -1, -2}); - auto n8 = fac.new_add_node(n7a, n7b); + auto n8 = n7a + n7b; cout << n8->format_simple() << endl; // Define main grid value at t+1. From d9b5e4032eb87475860e84ea113c997b742efd72 Mon Sep 17 00:00:00 2001 From: Chuck Yount Date: Tue, 24 Apr 2018 09:30:39 -0700 Subject: [PATCH 06/21] Add explicit API for integer consts. --- bin/yask_compiler_api_test.py | 2 +- include/yask_common_api.hpp | 8 ++++++++ include/yask_kernel_api.hpp | 8 -------- include/yc_node_api.hpp | 8 ++++++++ src/compiler/lib/Expr.cpp | 4 ++++ src/compiler/lib/Expr.hpp | 5 +++++ 6 files changed, 26 insertions(+), 9 deletions(-) diff --git a/bin/yask_compiler_api_test.py b/bin/yask_compiler_api_test.py index 1932ce21..adcde90e 100755 --- a/bin/yask_compiler_api_test.py +++ b/bin/yask_compiler_api_test.py @@ -74,7 +74,7 @@ sn2 = (sg1.new_relative_grid_point([1, 0, 0]) + sg1.new_relative_grid_point([0, 1, 0]) + sg1.new_relative_grid_point([0, 0, 1])) - sn5 = -sn2 * 2.5 - 9.1 + sn5 = -sn2 * 2.5 - 9 # Create an equation to define the value at the next timestep. 
n3 = g1.new_relative_grid_point([1, 0, 0, 0]) # center-point at next timestep. diff --git a/include/yask_common_api.hpp b/include/yask_common_api.hpp index 7443c897..6d8050b4 100644 --- a/include/yask_common_api.hpp +++ b/include/yask_common_api.hpp @@ -51,6 +51,14 @@ namespace yask { */ std::string yask_get_version_string(); + /// Type to use for indexing grids. + /** Index types are signed to allow negative indices in padding/halos. */ +#ifdef SWIG + typedef long int idx_t; // SWIG doesn't seem to understand int64_t. +#else + typedef std::int64_t idx_t; +#endif + // Forward declarations of class-pointers. class yask_output; diff --git a/include/yask_kernel_api.hpp b/include/yask_kernel_api.hpp index c1579f50..93b734c8 100644 --- a/include/yask_kernel_api.hpp +++ b/include/yask_kernel_api.hpp @@ -44,14 +44,6 @@ namespace yask { * @{ */ - /// Type to use for indexing grids. - /** Index types are signed to allow negative indices in padding/halos. */ -#ifdef SWIG - typedef long int idx_t; // SWIG doesn't seem to understand int64_t. -#else - typedef std::int64_t idx_t; -#endif - // Forward declarations of classes and pointers. class yk_env; diff --git a/include/yc_node_api.hpp b/include/yc_node_api.hpp index 86bf2eb4..eeb02311 100644 --- a/include/yc_node_api.hpp +++ b/include/yc_node_api.hpp @@ -190,6 +190,14 @@ namespace yask { virtual yc_const_number_node_ptr new_const_number_node(double val /**< [in] Value to store in node. */ ); + /// + /** + Integer version of new_const_number_node(double). + @returns Pointer to new \ref yc_const_number_node object. + */ + virtual yc_const_number_node_ptr + new_const_number_node(idx_t val /**< [in] Value to store in node. */ ); + /// Create a numerical negation operator node. /** New negation nodes can also be created via the overloaded unary `-` operator. 
diff --git a/src/compiler/lib/Expr.cpp b/src/compiler/lib/Expr.cpp index 361ccf43..e6191f62 100644 --- a/src/compiler/lib/Expr.cpp +++ b/src/compiler/lib/Expr.cpp @@ -70,6 +70,10 @@ namespace yask { yc_node_factory::new_const_number_node(double val) { return make_shared(val); } + yc_const_number_node_ptr + yc_node_factory::new_const_number_node(idx_t val) { + return make_shared(val); + } yc_negate_node_ptr yc_node_factory::new_negate_node(yc_number_node_ptr rhs) { auto p = dynamic_pointer_cast(rhs); diff --git a/src/compiler/lib/Expr.hpp b/src/compiler/lib/Expr.hpp index f10c1ebd..89023f95 100644 --- a/src/compiler/lib/Expr.hpp +++ b/src/compiler/lib/Expr.hpp @@ -358,6 +358,11 @@ namespace yask { public: ConstExpr(double f) : _f(f) { } + ConstExpr(idx_t i) : _f(i) { + if (idx_t(_f) != i) + THROW_YASK_EXCEPTION("Error: integer value " << i << + " cannot be stored accurately as a double"); + } ConstExpr(const ConstExpr& src) : _f(src._f) { } virtual ~ConstExpr() { } From c24a2987df428026820da9fa439e8056f197d99b Mon Sep 17 00:00:00 2001 From: Chuck Yount Date: Tue, 24 Apr 2018 19:13:18 -0700 Subject: [PATCH 07/21] Add API to specify flow dependencies. Closes #96. Closes #85. Improve some other API docs. --- include/yask_compiler_api.hpp | 75 +++++++++++++++++++++++++++++--- include/yc_node_api.hpp | 80 +++++++++++++++++++---------------- src/compiler/lib/Eqs.cpp | 7 +-- src/compiler/lib/Eqs.hpp | 34 +++++++++++---- src/compiler/lib/Soln.cpp | 2 +- src/compiler/lib/Soln.hpp | 26 ++++++++---- 6 files changed, 162 insertions(+), 62 deletions(-) diff --git a/include/yask_compiler_api.hpp b/include/yask_compiler_api.hpp index e3ab4951..251049a8 100644 --- a/include/yask_compiler_api.hpp +++ b/include/yask_compiler_api.hpp @@ -268,9 +268,10 @@ namespace yask { clear_folding() =0; /// Set the cluster multiplier (unroll factor) in given dimension. 
- /** For YASK-code generation, this will have the effect of creating + /** For YASK kernel-code generation, this will have the effect of creating N vectors of output for each equation, where N is the product of the cluster multipliers. + @note A multiplier >1 cannot be applied to the step dimension. @note Default is one (1) in each dimension. */ @@ -326,13 +327,13 @@ namespace yask { /// **[Advanced]** Enable or disable automatic dependency checker. /** - This should be used whenever the built-in dependency checker is - insufficient. Currently, the provided checker does not allow - stencils in which points in one sub-domain depend on points + Disabling the built-in dependency checker may be done when it is + overly conservative. Currently, the provided checker does not + allow stencils in which points in one sub-domain depend on points in another sub-domain within the same value of the step index. @warning If dependency checker is disabled, *all* dependencies - must be set via the APIs. + must be set via add_flow_dependency(). */ virtual void set_dependency_checker_enabled(bool enable @@ -344,6 +345,70 @@ namespace yask { */ virtual bool is_dependency_checker_enabled() const =0; + + /// **[Advanced]** Add a dependency between two equations. + /** + This function adds an arc in the data dependency graph `from` one + equation `to` another one, + indicating that the `from` equation depends on the `to` equation. + In other words, the `to` expression must be evaluated _before_ + the `from` equation. + In compiler-theory terms, this is a _flow_ dependency, also + known as a _true_ or _read-after-write_ (RAW) dependency. + (Strictly speaking, however, equations in the YASK compiler + are declarative instead of imperative, so they describe + equalities rather than assignments with reads and writes.) + + Additional considerations: + - Only _immediate_ dependencies should be added. 
+ For example, if **A** depends on **B** and **B** depends on **C**, + it is not necessary to add a derived dependence from **A** to **C**. + + - Only dependencies at a given step-index value should + be added. + For example, given + equation **A**: `A(t+1, x) EQUALS B(t+1, x) + 5` and + equation **B**: `B(t+1, x) EQUALS A(t, x) / 2`, + **A** depends on **B** at some value of the step-index `t`. + It is true that `B(t+2)` depends on `A(t+1)`, but that + inter-step dependency should not be added with this function. + + - If a cycle of dependencies is created, the YASK compiler + will throw an exception containing an error message + about a circular dependency. This exception may not be + thrown until format() is called. + + - If using scratch grids, dependencies among scratch grids + and between scratch-grid equations and non-scratch-grid + equations should also be added. Each scratch grid equation + should ultimately depend on non-scratch-grid values. + + - This function can be used in cooperation with or instead of + the built-in automatic dependency checker. + When used in cooperation with the built-in checker, + both dependencies from this function and the built-in checker + will be considered. + When the built-in checker is disabled via + `set_dependency_checker_enabled(false)`, only dependencies + from this function will be considered. + In this case, it is imperative that all immediate + dependencies are added. + If the dependency graph is incomplete, the resulting generated + stencil code will contain illegal race conditions, + and it will most likely produce incorrect results. + */ + virtual void + add_flow_dependency(yc_equation_node_ptr from + /**< [in] Equation that must be evaluated _after_ `to`. */, + yc_equation_node_ptr to + /**< [in] Equation that must be evaluated _before_ `from`. */) =0; + + /// **[Advanced]** Remove all existing dependencies. + /** + Removes dependencies added via add_flow_dependency(). 
+ */ + virtual void + clear_dependencies() =0; }; /// A compile-time grid. diff --git a/include/yc_node_api.hpp b/include/yc_node_api.hpp index eeb02311..57ab4e32 100644 --- a/include/yc_node_api.hpp +++ b/include/yc_node_api.hpp @@ -136,8 +136,12 @@ namespace yask { /** Create a variable to be used to index grids in the solution-domain dimension. - The name usually describes spatial dimensions, e.g. "x" or "y". - This should *not* include the step dimension, which is specified via + The name usually describes spatial dimensions, e.g. "x" or "y", + but it can be any dimension that is specified at run-time, + such as an index into a number of parallel problems + being solved simultaneously. + + @note This should *not* include the step dimension, which is specified via new_step_index(). @returns Pointer to new \ref yc_index_node object. */ @@ -149,7 +153,9 @@ namespace yask { /** Create an variable to be used to index grids in the some dimension that is not the step dimension - or a domain dimension. Example: index into an array. + or a domain dimension. + The value of these indices are normally compile-time + constants, e.g., a fixed index into an array. @returns Pointer to new \ref yc_index_node object. */ virtual yc_index_node_ptr @@ -164,12 +170,15 @@ namespace yask { LHS. An optional condition may be provided to define the sub-domain - to which this equation applies. Example: `x > 10`. + to which this equation applies. See new_first_domain_index() + for more information and an example. Conditions are always evaluated with respect to the overall - problem domain independent of any MPI domain decomposition - that might occur at run-time. + problem domain, i.e., independent of any specific + MPI domain decomposition that might occur at run-time. If a condition is not provided, the equation applies to the entire problem domain. + A condition can be added to an equation after its creation + via yc_equation_node.set_cond(). 
@returns Pointer to new \ref yc_equation_node object. */ @@ -179,7 +188,8 @@ namespace yask { yc_number_node_ptr rhs /**< [in] Expression after EQUALS operator. */, yc_bool_node_ptr cond = nullptr - /**< [in] Expression defining sub-domain. */ ); + /**< [in] Optional expression defining sub-domain + where `lhs EQUALS rhs` is valid. */ ); /// Create a constant numerical value node. /** @@ -261,16 +271,28 @@ namespace yask { \code{.cpp} auto x = node_fac.new_domain_index("x"); - auto first_x = node_fac.new_first_domain_index(x); - // Create expression for "first_x + 10". - auto left10 = node_fac.new_add_node(first_x, - node_fac.new_const_number_node(10)); + // Create boolean expression for the + // boundary sub-domain "x < first_x + 10". + auto first_x = node_fac.new_first_domain_index(x); + auto left_bc_cond = node_fac.new_less_than_node(x, first_x + 10); - // Create boolean expression for "x > first_x + 10". - auto expr = node_fac.new_greater_than_node(x, left10); + // Create a new equation that is valid in this range. + auto left_bc_eq = + node_fac.new_equation_node(grid_pt_expr, left_bc_expr, left_bc_cond); \endcode + Specification of the "interior" part of a 2-D domain could be + represented by an expression similar to + `x >= new_first_domain_index(x) + 20 && + x <= new_last_domain_index(x) - 20 && + y >= new_first_domain_index(y) + 20 && + y <= new_last_domain_index(y) - 20`. + + @note The entire domain in dimension "x" would be represented by + `x >= new_first_domain_index(x) && x <= new_last_domain_index(x)`, but + that is the default condition so does not need to be specified. + @returns Pointer to new \ref yc_index_node object. */ virtual yc_number_node_ptr @@ -283,20 +305,6 @@ namespace yask { domain in `dim` dimension. The `dim` argument is created via new_domain_index(). - Typical C++ usage: - - \code{.cpp} - auto x = node_fac.new_domain_index("x"); - auto last_x = node_fac.new_last_domain_index(x); - - // Create expression for "last_x - 10". 
- auto right10 = node_fac.new_subtract_node(last_x, - node_fac.new_const_number_node(10)); - - // Create boolean expression for "x < first_x - 10". - auto expr = node_fac.new_less_than_node(x, right10); - \endcode - @returns Pointer to new \ref yc_index_node object. */ virtual yc_number_node_ptr @@ -734,34 +742,34 @@ namespace yask { /// Operator version of yc_node_factory::new_negation_node(). yc_negate_node_ptr operator-(yc_number_node_ptr rhs); - + + //@{ /// Operator version of yc_node_factory::new_addition_node(). yc_add_node_ptr operator+(yc_number_node_ptr lhs, yc_number_node_ptr rhs); - /// Operator version of yc_node_factory::new_addition_node(). yc_add_node_ptr operator+(double lhs, yc_number_node_ptr rhs); - /// Operator version of yc_node_factory::new_addition_node(). yc_add_node_ptr operator+(yc_number_node_ptr lhs, double rhs); + //@} + //@{ /// Operator version of yc_node_factory::new_division_node(). yc_divide_node_ptr operator/(yc_number_node_ptr lhs, yc_number_node_ptr rhs); - /// Operator version of yc_node_factory::new_division_node(). yc_divide_node_ptr operator/(double lhs, yc_number_node_ptr rhs); - /// Operator version of yc_node_factory::new_division_node(). yc_divide_node_ptr operator/(yc_number_node_ptr lhs, double rhs); + //@} + //@{ /// Operator version of yc_node_factory::new_multiplication_node(). yc_multiply_node_ptr operator*(yc_number_node_ptr lhs, yc_number_node_ptr rhs); - /// Operator version of yc_node_factory::new_multiplication_node(). yc_multiply_node_ptr operator*(double lhs, yc_number_node_ptr rhs); - /// Operator version of yc_node_factory::new_multiplication_node(). yc_multiply_node_ptr operator*(yc_number_node_ptr lhs, double rhs); + //@} + //@{ /// Operator version of yc_node_factory::new_subtraction_node(). yc_subtract_node_ptr operator-(yc_number_node_ptr lhs, yc_number_node_ptr rhs); - /// Operator version of yc_node_factory::new_subtraction_node(). 
yc_subtract_node_ptr operator-(double lhs, yc_number_node_ptr rhs); - /// Operator version of yc_node_factory::new_subtraction_node(). yc_subtract_node_ptr operator-(yc_number_node_ptr lhs, double rhs); + //@} #endif /** @}*/ diff --git a/src/compiler/lib/Eqs.cpp b/src/compiler/lib/Eqs.cpp index 94955543..d554dcc5 100644 --- a/src/compiler/lib/Eqs.cpp +++ b/src/compiler/lib/Eqs.cpp @@ -183,7 +183,8 @@ namespace yask { // Analyze group of equations. // Sets _stepDir in dims. - // Finds dependencies based on all eqs if 'settings._findDeps'. + // Finds dependencies based on all eqs if 'settings._findDeps', setting + // _imm_dep_on and _dep_on. // Throws exceptions on illegal dependencies. // TODO: split this into smaller functions. // BIG-TODO: replace dependency algorithms with integration of a polyhedral @@ -606,7 +607,7 @@ namespace yask { visitEqs(&slv); } - // Update access stats for the grids. + // Update access stats for the grids, i.e., halos and const indices. // Also finds scratch-grid eqs needed for each non-scratch eq. void Eqs::updateGridStats() { @@ -647,7 +648,7 @@ namespace yask { (eq1, [&](EqualsExprPtr b, EqDeps::EqVecSet& path) { // Does 'b' have a scratch-grid output? - // NB: scratch eqs don't have conditions, so + // NB: scratch eqs don't have their own conditions, so // we don't need to check them. auto* og2 = pv.getOutputGrids().at(b.get()); if (og2->isScratch()) { diff --git a/src/compiler/lib/Eqs.hpp b/src/compiler/lib/Eqs.hpp index d403ea0a..5399f432 100644 --- a/src/compiler/lib/Eqs.hpp +++ b/src/compiler/lib/Eqs.hpp @@ -67,6 +67,14 @@ namespace yask { _all.insert(b); _done = false; } + + // Clear all deps. + virtual void clear_deps() { + _imm_deps.clear(); + _full_deps.clear(); + _all.clear(); + _done = false; + } // Check whether eq a directly depends on b. virtual bool is_imm_dep_on(EqualsExprPtr a, EqualsExprPtr b) const { @@ -120,14 +128,21 @@ namespace yask { protected: // Equations(s) describing how values in this grid are computed. 
- EqList _eqs; // just equations w/o conditions. + EqList _eqs; + + // Dependencies between all eqs. + EqDepMap _eq_deps; - EqDepMap _eq_deps; // dependencies between all eqs. - EqDeps::DepMap _scratch_deps; // dependencies through scratch grids. + // Dependencies through scratch grids. + EqDeps::DepMap _scratch_deps; public: - Eqs() {} + Eqs() { + // Make sure map keys exist. + for (DepType dt = DepType(0); dt < num_deps; dt = DepType(dt+1)) + _eq_deps[dt]; + } virtual ~Eqs() {} // Equation accessors. @@ -146,6 +161,9 @@ namespace yask { virtual const EqDepMap& getDeps() const { return _eq_deps; } + virtual EqDepMap& getDeps() { + return _eq_deps; + } // Get the scratch-grid eqs that contribute to 'eq'. virtual const EqDeps::EqSet& getScratchDeps(EqualsExprPtr ep) const { @@ -176,10 +194,10 @@ namespace yask { virtual void updateGridStats(); }; - // A named equation bundle, which contains one or more grid-update equations. - // All equations in a bundle must have the same condition. - // Equations should not have inter-dependencies because they will be - // combined into a single expression. + // A named equation bundle, which contains one or more grid-update + // equations. All equations in a bundle must have the same condition. + // Equations in a bundle should not have inter-dependencies because they + // will be combined into a single expression. class EqBundle { protected: EqList _eqs; // expressions in this eqBundle (not including conditions). diff --git a/src/compiler/lib/Soln.cpp b/src/compiler/lib/Soln.cpp index 53f95bfe..5c21a9c6 100644 --- a/src/compiler/lib/Soln.cpp +++ b/src/compiler/lib/Soln.cpp @@ -70,7 +70,7 @@ namespace yask { bool is_folding_efficient) { // Call the stencil 'define' method to create ASTs. - // ASTs can also be created via the APIs. + // ASTs and grids can also be created via the APIs. define(); // Find all the stencil dimensions from the grids. 
diff --git a/src/compiler/lib/Soln.hpp b/src/compiler/lib/Soln.hpp index 4cefe9f5..46e05939 100644 --- a/src/compiler/lib/Soln.hpp +++ b/src/compiler/lib/Soln.hpp @@ -53,10 +53,10 @@ namespace yask { // Debug output. yask_output_ptr _debug_output; - ostream* _dos = &std::cout; + ostream* _dos = &std::cout; // just a handy pointer to an ostream. // All vars accessible by the kernel. - Grids _grids; // keep track of all registered grids. + Grids _grids; // All equations defined in this solution. Eqs _eqs; @@ -170,22 +170,30 @@ namespace yask { ev.push_back(_eqs.getEqs().at(i)); return ev; } - virtual void set_fold(const std::string& dim, int len) { - auto& fold = _settings._foldOptions; - auto* p = fold.lookup(dim); - if (p) - *p = len; - else - fold.addDimBack(dim, len); + virtual void add_flow_dependency(yc_equation_node_ptr from, + yc_equation_node_ptr to) { + auto fp = dynamic_pointer_cast(from); + assert(fp); + auto tp = dynamic_pointer_cast(to); + assert(tp); + _eqs.getDeps().at(cur_step_dep).set_imm_dep_on(fp, tp); } + virtual void clear_dependencies() { + for (DepType dt = DepType(0); dt < num_deps; dt = DepType(dt+1)) + _eqs.getDeps().at(dt).clear_deps(); + } + virtual void set_fold_len(const yc_index_node_ptr, int len); virtual void clear_folding() { _settings._foldOptions.clear(); } virtual void set_cluster_mult(const yc_index_node_ptr, int mult); virtual void clear_clustering() { _settings._clusterOptions.clear(); } + virtual void set_element_bytes(int nbytes) { _settings._elem_bytes = nbytes; } virtual int get_element_bytes() const { return _settings._elem_bytes; } + virtual bool is_dependency_checker_enabled() const { return _settings._findDeps; } virtual void set_dependency_checker_enabled(bool enable) { _settings._findDeps = enable; } + virtual void format(const std::string& format_type, yask_output_ptr output); }; From b2e16f330dfbbc0f35c58762e587dec98216fb56 Mon Sep 17 00:00:00 2001 From: Chuck Yount Date: Wed, 25 Apr 2018 13:02:54 -0700 Subject: 
[PATCH 08/21] Restructure code to reduce compile time. Turns off prefetching at O0 and O1. Only builds grids to max size in stencil. --- src/kernel/Makefile | 166 +++-- src/kernel/lib/context.cpp | 1230 +------------------------------ src/kernel/lib/grid_apis.cpp | 387 ++++++++++ src/kernel/lib/realv_grids.cpp | 354 --------- src/kernel/lib/setup.cpp | 1260 ++++++++++++++++++++++++++++++++ src/kernel/lib/yask.hpp | 5 + 6 files changed, 1740 insertions(+), 1662 deletions(-) create mode 100644 src/kernel/lib/grid_apis.cpp create mode 100644 src/kernel/lib/setup.cpp diff --git a/src/kernel/Makefile b/src/kernel/Makefile index 3ab5bf9c..3d5dccdf 100644 --- a/src/kernel/Makefile +++ b/src/kernel/Makefile @@ -219,7 +219,6 @@ def_pad_args ?= -ep 1 cluster ?= x=1 pfd_l1 ?= 0 pfd_l2 ?= 2 -max_dims ?= 5 # max grid dims >= max stencil dims. # default folding depends on HW vector size. ifneq ($(findstring INTRIN512,$(MACROS)),) # 512 bits. @@ -248,76 +247,6 @@ endif # not 512 bits. # Select fold based on size of reals. fold = $(fold_$(real_bytes)byte) # e.g., fold_4byte -######## Loop-compiler configuration: -# The loop indices range from 0..N-1. -# Dim 0 is the step dim, usually time. -# The step loop is handled outside of the generated loops, -# so the following loop codes do not scan over dim 0. -# Dims 1..N-1 are the domain dims, usually spatial. -# Thus, N-1 is the inner-most dim. -# For best perf, this should be the unit-stride dim in the grids. - -# File with number of dims extracted from YASK compiler output. -YK_DIMS_FILE := num_dims.$(stencil).txt -NDIMS_OPT := `cat $(YK_DIMS_FILE)` - -# Rank loops break up the whole rank into smaller regions. In order for -# temporal wavefronts to operate properly, the order of spatial dimensions -# may be changed, but the scanning paths must have strictly incrementing -# indices. Those that do not (e.g., grouped, serpentine, square-wave) may -# *not* be used here when using temporal wavefronts. 
The time loop may be -# found in StencilEquations::run_solution(). -RANK_LOOP_OPTS ?= $(NDIMS_OPT) -inVar rank_idxs -RANK_LOOP_ORDER ?= 1 .. N-1 -RANK_LOOP_CODE ?= $(RANK_LOOP_OUTER_MODS) loop($(RANK_LOOP_ORDER)) \ - { $(RANK_LOOP_INNER_MODS) call(calc_region(stBundle_ptr)); } - -# Region loops break up a region using OpenMP threading into blocks. The -# 'omp' modifier creates an outer OpenMP loop so that each block is assigned -# to a top-level OpenMP thread. The region time loops are not coded here to -# allow for proper spatial skewing for temporal wavefronts. The time loop -# may be found in StencilEquations::calc_region(). -REGION_LOOP_OPTS ?= $(NDIMS_OPT) -inVar region_idxs \ - -ompConstruct '$(omp_par_for) schedule($(omp_region_schedule)) proc_bind(spread)' \ - -callPrefix 'sg->' -REGION_LOOP_OUTER_MODS ?= grouped omp -REGION_LOOP_ORDER ?= 1 .. N-1 -REGION_LOOP_CODE ?= $(REGION_LOOP_OUTER_MODS) loop($(REGION_LOOP_ORDER)) { \ - $(REGION_LOOP_INNER_MODS) call(calc_block); } - -# Block loops break up a block into sub-blocks. The 'omp' modifier creates -# a *nested* OpenMP loop so that each sub-block is assigned to a nested OpenMP -# thread. There is no time loop because threaded temporal blocking is -# not yet supported. -BLOCK_LOOP_OPTS ?= $(NDIMS_OPT) -inVar block_idxs \ - -ompConstruct '$(omp_par_for) schedule($(omp_block_schedule)) proc_bind(close)' \ - -callPrefix 'sg->' -BLOCK_LOOP_OUTER_MODS ?= grouped omp -BLOCK_LOOP_ORDER ?= 1 .. N-1 -BLOCK_LOOP_CODE ?= $(BLOCK_LOOP_OUTER_MODS) loop($(BLOCK_LOOP_ORDER)) { \ - $(BLOCK_LOOP_INNER_MODS) call(calc_sub_block(thread_idx)); } - -# Sub-block loops break up a sub-block into clusters or vectors. These loops -# are run by a single OMP thread. The N-1 (inner) loop is generated by the -# stencil compiler. There is no time loop because threaded temporal -# blocking is not yet supported. The indexes in this loop are 'normalized', -# i.e., vector units and rank-relative. 
-SUB_BLOCK_LOOP_OPTS ?= $(NDIMS_OPT) -inVar norm_sub_block_idxs -SUB_BLOCK_LOOP_OUTER_MODS ?= -SUB_BLOCK_LOOP_ORDER ?= 1 .. N-2 -SUB_BLOCK_LOOP_CODE ?= $(SUB_BLOCK_LOOP_OUTER_MODS) loop($(SUB_BLOCK_LOOP_ORDER)) { \ - $(SUB_BLOCK_LOOP_INNER_MODS) call(calc_inner_loop(thread_idx)); } - -# General-purpose parallel loop. -# Nested OpenMP is not used here because there is no sharing between threads. -# TODO: Consider using nested OpenMP to hide more latency. -MISC_LOOP_OPTS ?= $(NDIMS_OPT) -inVar misc_idxs \ - -ompConstruct '$(omp_par_for) schedule($(omp_misc_schedule)) proc_bind(spread)' -MISC_LOOP_OUTER_MODS ?= omp -MISC_LOOP_ORDER ?= 1 .. N-1 -MISC_LOOP_CODE ?= $(MISC_LOOP_OUTER_MODS) loop($(MISC_LOOP_ORDER)) \ - $(MISC_LOOP_INNER_MODS) { call(misc_fn); } - ######## End of vars that control the function and performance of the kernel. # The remainder of this file specifies how to build and test the kernel. @@ -370,6 +299,7 @@ YK_PY_MOD := $(YASK_DIR)/$(YK_MODULE).py YK_API_TEST_EXEC := $(BIN_DIR)/$(YK_BASE)_api_test.exe YK_GRID_TEST_EXEC := $(BIN_DIR)/$(YK_BASE)_grid_test.exe YK_API_TEST_EXEC_WITH_EXCEPTION := $(BIN_DIR)/$(YK_BASE)_api_exception_test.exe +YK_DIMS_FILE := num_dims.$(stencil).txt MAKE_REPORT_FILE:= make-report.$(YK_TAG).txt @@ -382,7 +312,7 @@ COMM_SRC_BASES := $(addprefix $(COMM_DIR)/,$(COMM_SRC_NAMES)) YK_SWIG_DIR := ./swig YK_GEN_DIR := ./gen YK_LIB_DIR := ./lib -YK_SRC_NAMES := factory new_grid generic_grids realv_grids utils settings context stencil_calc +YK_SRC_NAMES := factory context setup realv_grids grid_apis new_grid generic_grids utils settings stencil_calc YK_SRC_BASES := $(addprefix $(YK_LIB_DIR)/,$(YK_SRC_NAMES)) YK_OBJS := $(addsuffix .$(YK_TAG).o,$(YK_SRC_BASES) $(COMM_SRC_BASES)) YK_MACRO_FILE := $(YK_GEN_DIR)/yask_macros.hpp @@ -462,10 +392,15 @@ PYINC := $(addprefix -I,$(shell $(PYTHON) -c 'import distutils.sysconfig; prin RUN_PYTHON := $(RUN_PREFIX) env PYTHONPATH=$(LIB_DIR):$(YASK_DIR):$(PYTHONPATH) $(PYTHON) +# Turn off 
prefetching at O0 or O1. +ifneq ($(filter -O0 -O1,$(YK_CXXOPT)),) + pfd_l1 = 0 + pfd_l2 = 0 +endif + # Set MACROS based on individual makefile vars. # MACROS and EXTRA_MACROS will be written to a header file. MACROS += PFD_L1=$(pfd_l1) PFD_L2=$(pfd_l2) -MACROS += MAX_DIMS=$(max_dims) ifeq ($(streaming_stores),1) MACROS += USE_STREAMING_STORE endif @@ -539,6 +474,75 @@ endif # Add in final flags and user-added flags. YK_CXXFLAGS += $(YK_CXXOPT) $(OMPFLAGS) $(EXTRA_YK_CXXFLAGS) +# Number of dims extracted from YASK compiler output. +NDIMS := `cat $(YK_DIMS_FILE)` + +######## Loop-compiler configuration: +# The loop indices range from 0..N-1. +# Dim 0 is the step dim, usually time. +# The step loop is handled outside of the generated loops, +# so the following loop codes do not scan over dim 0. +# Dims 1..N-1 are the domain dims, usually spatial. +# Thus, N-1 is the inner-most dim. +# For best perf, this should be the unit-stride dim in the grids. + +# Rank loops break up the whole rank into smaller regions. In order for +# temporal wavefronts to operate properly, the order of spatial dimensions +# may be changed, but the scanning paths must have strictly incrementing +# indices. Those that do not (e.g., grouped, serpentine, square-wave) may +# *not* be used here when using temporal wavefronts. The time loop may be +# found in StencilEquations::run_solution(). +RANK_LOOP_OPTS ?= -ndims $(NDIMS) -inVar rank_idxs +RANK_LOOP_ORDER ?= 1 .. N-1 +RANK_LOOP_CODE ?= $(RANK_LOOP_OUTER_MODS) loop($(RANK_LOOP_ORDER)) \ + { $(RANK_LOOP_INNER_MODS) call(calc_region(stBundle_ptr)); } + +# Region loops break up a region using OpenMP threading into blocks. The +# 'omp' modifier creates an outer OpenMP loop so that each block is assigned +# to a top-level OpenMP thread. The region time loops are not coded here to +# allow for proper spatial skewing for temporal wavefronts. The time loop +# may be found in StencilEquations::calc_region(). 
+REGION_LOOP_OPTS ?= -ndims $(NDIMS) -inVar region_idxs \ + -ompConstruct '$(omp_par_for) schedule($(omp_region_schedule)) proc_bind(spread)' \ + -callPrefix 'sg->' +REGION_LOOP_OUTER_MODS ?= grouped omp +REGION_LOOP_ORDER ?= 1 .. N-1 +REGION_LOOP_CODE ?= $(REGION_LOOP_OUTER_MODS) loop($(REGION_LOOP_ORDER)) { \ + $(REGION_LOOP_INNER_MODS) call(calc_block); } + +# Block loops break up a block into sub-blocks. The 'omp' modifier creates +# a *nested* OpenMP loop so that each sub-block is assigned to a nested OpenMP +# thread. There is no time loop because threaded temporal blocking is +# not yet supported. +BLOCK_LOOP_OPTS ?= -ndims $(NDIMS) -inVar block_idxs \ + -ompConstruct '$(omp_par_for) schedule($(omp_block_schedule)) proc_bind(close)' \ + -callPrefix 'sg->' +BLOCK_LOOP_OUTER_MODS ?= grouped omp +BLOCK_LOOP_ORDER ?= 1 .. N-1 +BLOCK_LOOP_CODE ?= $(BLOCK_LOOP_OUTER_MODS) loop($(BLOCK_LOOP_ORDER)) { \ + $(BLOCK_LOOP_INNER_MODS) call(calc_sub_block(thread_idx)); } + +# Sub-block loops break up a sub-block into clusters or vectors. These loops +# are run by a single OMP thread. The N-1 (inner) loop is generated by the +# stencil compiler. There is no time loop because threaded temporal +# blocking is not yet supported. The indexes in this loop are 'normalized', +# i.e., vector units and rank-relative. +SUB_BLOCK_LOOP_OPTS ?= -ndims $(NDIMS) -inVar norm_sub_block_idxs +SUB_BLOCK_LOOP_OUTER_MODS ?= +SUB_BLOCK_LOOP_ORDER ?= 1 .. N-2 +SUB_BLOCK_LOOP_CODE ?= $(SUB_BLOCK_LOOP_OUTER_MODS) loop($(SUB_BLOCK_LOOP_ORDER)) { \ + $(SUB_BLOCK_LOOP_INNER_MODS) call(calc_inner_loop(thread_idx)); } + +# General-purpose parallel loop. +# Nested OpenMP is not used here because there is no sharing between threads. +# TODO: Consider using nested OpenMP to hide more latency. +MISC_LOOP_OPTS ?= -ndims $(NDIMS) -inVar misc_idxs \ + -ompConstruct '$(omp_par_for) schedule($(omp_misc_schedule)) proc_bind(spread)' +MISC_LOOP_OUTER_MODS ?= omp +MISC_LOOP_ORDER ?= 1 .. 
N-1 +MISC_LOOP_CODE ?= $(MISC_LOOP_OUTER_MODS) loop($(MISC_LOOP_ORDER)) \ + $(MISC_LOOP_INNER_MODS) { call(misc_fn); } + ######## Primary targets & rules # NB: must set stencil and arch make vars to generate the desired YASK kernel. @@ -588,28 +592,28 @@ $(YK_GEN_DIR)/yask_misc_loops.hpp: $(GEN_LOOPS) $(YK_DIMS_FILE) $(YK_MK_GEN_DIR) $< -output $@ $(MISC_LOOP_OPTS) $(EXTRA_LOOP_OPTS) $(EXTRA_MISC_LOOP_OPTS) "$(MISC_LOOP_CODE)" -$(YK_GEN_DIR)/yask_layout_macros.hpp: $(GEN_LAYOUTS) +$(YK_GEN_DIR)/yask_layout_macros.hpp: $(GEN_LAYOUTS) $(YK_DIMS_FILE) $(YK_MK_GEN_DIR) - $(PERL) $< -m $(max_dims) > $@ + $(PERL) $< -m $(NDIMS) > $@ @- gindent -fca $@ || \ indent -fca $@ || \ echo "note:" $@ "is not properly indented because indent program failed or was not found." -$(YK_GEN_DIR)/yask_layouts.hpp: $(GEN_LAYOUTS) +$(YK_GEN_DIR)/yask_layouts.hpp: $(GEN_LAYOUTS) $(YK_DIMS_FILE) $(YK_MK_GEN_DIR) - $(PERL) $< -d $(max_dims) > $@ + $(PERL) $< -d $(NDIMS) > $@ @- gindent -fca $@ || \ indent -fca $@ || \ echo "note:" $@ "is not properly indented because indent program failed or was not found." -$(YK_GEN_DIR)/yask_grid_code.hpp: $(GEN_LAYOUTS) +$(YK_GEN_DIR)/yask_grid_code.hpp: $(GEN_LAYOUTS) $(YK_DIMS_FILE) $(YK_MK_GEN_DIR) - $(PERL) $< -g $(max_dims) > $@ + $(PERL) $< -g $(NDIMS) > $@ # Extract the number of stencil dims from the compiler output. # Use this to create an option to pass to the loop generator script. $(YK_DIMS_FILE): $(YK_CODE_FILE) - awk '/NUM_STENCIL_DIMS/ {print "-ndims",$$NF}' $< > $@ + awk '/NUM_STENCIL_DIMS/ {print $$NF}' $< > $@ $(YK_CODE_FILE): $(YC_EXEC) $(YK_MK_GEN_DIR) @@ -633,7 +637,7 @@ headers: $(YK_GEN_HEADERS) # NB: must set stencil and arch to generate the desired kernel API. # Build C++ and Python kernel API libs. -api: $(YK_LIB) $(YK_PY_LIB) $(MAKE_REPORT_FILE) +api: $(YK_LIB) $(YK_PY_LIB) # Build python kernel API lib. # TODO: consider adding $(YK_TAG) to [some of] these targets. 
diff --git a/src/kernel/lib/context.cpp b/src/kernel/lib/context.cpp index 5188ed1a..2a47ca33 100644 --- a/src/kernel/lib/context.cpp +++ b/src/kernel/lib/context.cpp @@ -23,6 +23,9 @@ IN THE SOFTWARE. *****************************************************************************/ +// This file contains implementations of StencilContext methods. +// Also see context_setup.cpp. + #include "yask.hpp" using namespace std; @@ -963,769 +966,6 @@ namespace yask { outputGridMap[gname] = gp; } } - - // Init MPI-related vars and other vars related to my rank's place in - // the global problem: rank index, offset, etc. Need to call this even - // if not using MPI to properly init these vars. Called from - // prepare_solution(), so it doesn't normally need to be called from user code. - void StencilContext::setupRank() { - ostream& os = get_ostr(); - auto& step_dim = _dims->_step_dim; - auto me = _env->my_rank; - - // Check ranks. - idx_t req_ranks = _opts->_num_ranks.product(); - if (req_ranks != _env->num_ranks) { - THROW_YASK_EXCEPTION("error: " << req_ranks << " rank(s) requested (" << - _opts->_num_ranks.makeDimValStr(" * ") << "), but " << - _env->num_ranks << " rank(s) are active"); - } - assertEqualityOverRanks(_opts->_rank_sizes[step_dim], _env->comm, "num steps"); - - // Determine my coordinates if not provided already. - // TODO: do this more intelligently based on proximity. - if (_opts->find_loc) - _opts->_rank_indices = _opts->_num_ranks.unlayout(me); - - // A table of rank-coordinates for everyone. - auto num_ddims = _opts->_rank_indices.size(); // domain-dims only! - idx_t coords[_env->num_ranks][num_ddims]; - - // Init coords for this rank. - for (int i = 0; i < num_ddims; i++) - coords[me][i] = _opts->_rank_indices[i]; - - // A table of rank-domain sizes for everyone. - idx_t rsizes[_env->num_ranks][num_ddims]; - - // Init sizes for this rank. 
- for (int di = 0; di < num_ddims; di++) { - auto& dname = _opts->_rank_indices.getDimName(di); - rsizes[me][di] = _opts->_rank_sizes[dname]; - } - -#ifdef USE_MPI - // Exchange coord and size info between all ranks. - for (int rn = 0; rn < _env->num_ranks; rn++) { - MPI_Bcast(&coords[rn][0], num_ddims, MPI_INTEGER8, - rn, _env->comm); - MPI_Bcast(&rsizes[rn][0], num_ddims, MPI_INTEGER8, - rn, _env->comm); - } - // Now, the tables are filled in for all ranks. -#endif - - // Init offsets and total sizes. - rank_domain_offsets.setValsSame(0); - overall_domain_sizes.setValsSame(0); - - // Loop over all ranks, including myself. - int num_neighbors = 0; - for (int rn = 0; rn < _env->num_ranks; rn++) { - - // Coord offset of rn from me: prev => negative, self => 0, next => positive. - IdxTuple rcoords(_dims->_domain_dims); - IdxTuple rdeltas(_dims->_domain_dims); - for (int di = 0; di < num_ddims; di++) { - rcoords[di] = coords[rn][di]; - rdeltas[di] = coords[rn][di] - _opts->_rank_indices[di]; - } - - // Manhattan distance from rn (sum of abs deltas in all dims). - // Max distance in any dim. - int mandist = 0; - int maxdist = 0; - for (int di = 0; di < num_ddims; di++) { - mandist += abs(rdeltas[di]); - maxdist = max(maxdist, abs(int(rdeltas[di]))); - } - - // Myself. - if (rn == me) { - if (mandist != 0) - THROW_YASK_EXCEPTION("Internal error: distance to own rank == " << mandist); - } - - // Someone else. - else { - if (mandist == 0) - THROW_YASK_EXCEPTION("Error: ranks " << me << - " and " << rn << " at same coordinates"); - } - - // Loop through domain dims. - for (int di = 0; di < num_ddims; di++) { - auto& dname = _opts->_rank_indices.getDimName(di); - - // Is rank 'rn' in-line with my rank in 'dname' dim? - // True when deltas in other dims are zero. - bool is_inline = true; - for (int dj = 0; dj < num_ddims; dj++) { - if (di != dj && rdeltas[dj] != 0) { - is_inline = false; - break; - } - } - - // Process ranks that are in-line in 'dname', including self. 
- if (is_inline) { - - // Accumulate total problem size in each dim for ranks that - // intersect with this rank, including myself. - overall_domain_sizes[dname] += rsizes[rn][di]; - - // Adjust my offset in the global problem by adding all domain - // sizes from prev ranks only. - if (rdeltas[di] < 0) - rank_domain_offsets[dname] += rsizes[rn][di]; - - // Make sure all the other dims are the same size. - // This ensures that all the ranks' domains line up - // properly along their edges and at their corners. - for (int dj = 0; dj < num_ddims; dj++) { - if (di != dj) { - auto mysz = rsizes[me][dj]; - auto rnsz = rsizes[rn][dj]; - if (mysz != rnsz) { - auto& dnamej = _opts->_rank_indices.getDimName(dj); - THROW_YASK_EXCEPTION("Error: rank " << rn << " and " << me << - " are both at rank-index " << coords[me][di] << - " in the '" << dname << - "' dimension , but their rank-domain sizes are " << - rnsz << " and " << mysz << - " (resp.) in the '" << dj << - "' dimension, making them unaligned"); - } - } - } - } - } - - // Rank rn is myself or my immediate neighbor if its distance <= 1 in - // every dim. Assume we do not need to exchange halos except - // with immediate neighbor. We validate this assumption below by - // making sure that the rank domain size is at least as big as the - // largest halo. - if (maxdist <= 1) { - - // At this point, rdeltas contains only -1..+1 for each domain dim. - // Add one to -1..+1 to get 0..2 range for my_neighbors offsets. - IdxTuple roffsets = rdeltas.addElements(1); - assert(rdeltas.min() >= -1); - assert(rdeltas.max() <= 1); - assert(roffsets.min() >= 0); - assert(roffsets.max() <= 2); - - // Convert the offsets into a 1D index. 
- auto rn_ofs = _mpiInfo->getNeighborIndex(roffsets); - TRACE_MSG("neighborhood size = " << _mpiInfo->neighborhood_sizes.makeDimValStr() << - " & roffsets of rank " << rn << " = " << roffsets.makeDimValStr() << - " => " << rn_ofs); - assert(idx_t(rn_ofs) < _mpiInfo->neighborhood_size); - - // Save rank of this neighbor into the MPI info object. - _mpiInfo->my_neighbors.at(rn_ofs) = rn; - if (rn != me) { - num_neighbors++; - os << "Neighbor #" << num_neighbors << " is rank " << rn << - " at absolute rank indices " << rcoords.makeDimValStr() << - " (" << rdeltas.makeDimValOffsetStr() << " relative to rank " << - me << ")\n"; - } - - // Save manhattan dist. - _mpiInfo->man_dists.at(rn_ofs) = mandist; - - // Loop through domain dims. - bool vlen_mults = true; - for (int di = 0; di < num_ddims; di++) { - auto& dname = _opts->_rank_indices.getDimName(di); - - // Does rn have all VLEN-multiple sizes? - auto rnsz = rsizes[rn][di]; - auto vlen = _dims->_fold_pts[di]; - if (rnsz % vlen != 0) { - TRACE_MSG("cannot use vector halo exchange with rank " << rn << - " because its size in '" << dname << "' is " << rnsz); - vlen_mults = false; - } - } - - // Save vec-mult flag. - _mpiInfo->has_all_vlen_mults.at(rn_ofs) = vlen_mults; - - } // self or immediate neighbor in any direction. - - } // ranks. - - // Set offsets in grids and find WF extensions - // based on the grids' halos. - update_grids(); - - // Determine bounding-boxes for all bundles. - // This must be done after finding WF extensions. - find_bounding_boxes(); - - } // setupRank. - - // Alloc 'nbytes' on each requested NUMA node. - // Map keys are preferred NUMA nodes or -1 for local. - // Pointers are returned in '_data_buf'. - // 'ngrids' and 'type' are only used for debug msg. 
- void StencilContext::_alloc_data(const map & nbytes, - const map & ngrids, - map >& data_buf, - const std::string& type) { - ostream& os = get_ostr(); - - for (const auto& i : nbytes) { - int numa_pref = i.first; - size_t nb = i.second; - size_t ng = ngrids.at(numa_pref); - - // Don't need pad after last one. - if (nb >= _data_buf_pad) - nb -= _data_buf_pad; - - // Allocate data. - os << "Allocating " << makeByteStr(nb) << - " for " << ng << " " << type << "(s)"; -#ifdef USE_NUMA - if (numa_pref >= 0) - os << " preferring NUMA node " << numa_pref; - else - os << " using NUMA policy " << numa_pref; -#endif - os << "...\n" << flush; - auto p = shared_numa_alloc(nb, numa_pref); - TRACE_MSG("Got memory at " << static_cast(p.get())); - - // Save using original key. - data_buf[numa_pref] = p; - } - } - - // Allocate memory for grids that do not already have storage. - void StencilContext::allocGridData(ostream& os) { - - // Base ptrs for all default-alloc'd data. - // These pointers will be shared by the ones in the grid - // objects, which will take over ownership when these go - // out of scope. - // Key is preferred numa node or -1 for local. - map > _grid_data_buf; - - // Pass 0: count required size for each NUMA node, allocate chunk of memory at end. - // Pass 1: distribute parts of already-allocated memory chunk. - for (int pass = 0; pass < 2; pass++) { - TRACE_MSG("allocGridData pass " << pass << " for " << - gridPtrs.size() << " grid(s)"); - - // Count bytes needed and number of grids for each NUMA node. - map npbytes, ngrids; - - // Grids. - for (auto gp : gridPtrs) { - if (!gp) - continue; - auto& gname = gp->get_name(); - - // Grid data. - // Don't alloc if already done. - if (!gp->is_storage_allocated()) { - int numa_pref = gp->get_numa_preferred(); - - // Set storage if buffer has been allocated in pass 0. 
- if (pass == 1) { - auto p = _grid_data_buf[numa_pref]; - assert(p); - gp->set_storage(p, npbytes[numa_pref]); - os << gp->make_info_string() << endl; - } - - // Determine padded size (also offset to next location). - size_t nbytes = gp->get_num_storage_bytes(); - npbytes[numa_pref] += ROUND_UP(nbytes + _data_buf_pad, - CACHELINE_BYTES); - ngrids[numa_pref]++; - if (pass == 0) - TRACE_MSG(" grid '" << gname << "' needs " << makeByteStr(nbytes) << - " on NUMA node " << numa_pref); - } - } - - // Alloc for each node. - if (pass == 0) - _alloc_data(npbytes, ngrids, _grid_data_buf, "grid"); - - } // grid passes. - }; - - // Create MPI and allocate buffers. - void StencilContext::allocMpiData(ostream& os) { - - // Remove any old MPI data. - freeMpiData(os); - -#ifdef USE_MPI - - int num_exchanges = 0; - auto me = _env->my_rank; - - // Need to determine the size and shape of all MPI buffers. - // Visit all neighbors of this rank. - _mpiInfo->visitNeighbors - ([&](const IdxTuple& neigh_offsets, int neigh_rank, int neigh_idx) { - if (neigh_rank == MPI_PROC_NULL) - return; // from lambda fn. - - // Determine max dist needed. TODO: determine max dist - // automatically from stencils; may not be same for all - // grids. -#ifndef MAX_EXCH_DIST -#define MAX_EXCH_DIST (NUM_STENCIL_DIMS - 1) -#endif - // Always use max dist with WF. - // TODO: determine if this is overkill. - int maxdist = MAX_EXCH_DIST; - if (num_wf_shifts > 0) - maxdist = NUM_STENCIL_DIMS - 1; - - // Manhattan dist. - int mandist = _mpiInfo->man_dists.at(neigh_idx); - - // Check distance. - // TODO: calculate and use exch dist for each grid. - if (mandist > maxdist) { - TRACE_MSG("no halo exchange needed with rank " << neigh_rank << - " because L1-norm = " << mandist); - return; // from lambda fn. - } - - // Determine size of MPI buffers between neigh_rank and my rank - // for each grid and create those that are needed. 
- for (auto gp : gridPtrs) { - if (!gp) - continue; - auto& gname = gp->get_name(); - - // Lookup first & last domain indices and calc exchange sizes - // for this grid. - bool found_delta = false; - IdxTuple my_halo_sizes, neigh_halo_sizes; - IdxTuple first_inner_idx, last_inner_idx; - IdxTuple first_outer_idx, last_outer_idx; - for (auto& dim : _dims->_domain_dims.getDims()) { - auto& dname = dim.getName(); - if (gp->is_dim_used(dname)) { - - // Get domain indices for this grid. - // If there are no more ranks in the given direction, extend - // the index into the outer halo to make sure all data are sync'd. - // This is critical for WFs. - idx_t fidx = gp->get_first_rank_domain_index(dname); - idx_t lidx = gp->get_last_rank_domain_index(dname); - first_inner_idx.addDimBack(dname, fidx); - last_inner_idx.addDimBack(dname, lidx); - if (_opts->is_first_rank(dname)) - fidx -= gp->get_left_halo_size(dname); - if (_opts->is_last_rank(dname)) - lidx += gp->get_right_halo_size(dname); - first_outer_idx.addDimBack(dname, fidx); - last_outer_idx.addDimBack(dname, lidx); - - // Determine size of exchange. This will be the actual halo size - // plus any wave-front extensions. In the current implementation, - // we need the wave-front extensions regardless of whether there - // is a halo on a given grid. This is because each stencil-bundle - // gets shifted by the WF angles at each step in the WF. - - // Neighbor is to the left. - if (neigh_offsets[dname] == MPIInfo::rank_prev) { - auto ext = left_wf_exts[dname]; - - // my halo. - auto halo_size = gp->get_left_halo_size(dname); - halo_size += ext; - my_halo_sizes.addDimBack(dname, halo_size); - - // neighbor halo. - halo_size = gp->get_right_halo_size(dname); // their right is on my left. - halo_size += ext; - neigh_halo_sizes.addDimBack(dname, halo_size); - } - - // Neighbor is to the right. - else if (neigh_offsets[dname] == MPIInfo::rank_next) { - auto ext = right_wf_exts[dname]; - - // my halo. 
- auto halo_size = gp->get_right_halo_size(dname); - halo_size += ext; - my_halo_sizes.addDimBack(dname, halo_size); - - // neighbor halo. - halo_size = gp->get_left_halo_size(dname); // their left is on my right. - halo_size += ext; - neigh_halo_sizes.addDimBack(dname, halo_size); - } - - // Neighbor in-line. - else { - my_halo_sizes.addDimBack(dname, 0); - neigh_halo_sizes.addDimBack(dname, 0); - } - - // Vectorized exchange allowed based on domain sizes? - // Both my rank and neighbor rank must have all domain sizes - // of vector multiples. - bool vec_ok = allow_vec_exchange && - _mpiInfo->has_all_vlen_mults[_mpiInfo->my_neighbor_index] && - _mpiInfo->has_all_vlen_mults[neigh_idx]; - - // Round up halo sizes if vectorized exchanges allowed. - // TODO: add a heuristic to avoid increasing by a large factor. - if (vec_ok) { - auto vec_size = _dims->_fold_pts[dname]; - my_halo_sizes.setVal(dname, ROUND_UP(my_halo_sizes[dname], vec_size)); - neigh_halo_sizes.setVal(dname, ROUND_UP(neigh_halo_sizes[dname], vec_size)); - } - - // Is this neighbor before or after me in this domain direction? - if (neigh_offsets[dname] != MPIInfo::rank_self) - found_delta = true; - } - } - - // Is buffer needed? - // Example: if this grid is 2D in y-z, but only neighbors are in - // x-dim, we don't need any exchange. - if (!found_delta) { - TRACE_MSG("no halo exchange needed for grid '" << gname << - "' with rank " << neigh_rank << - " because the neighbor is not in a direction" - " corresponding to a grid dim"); - continue; // to next grid. - } - - // Make a buffer in both directions (send & receive). - for (int bd = 0; bd < MPIBufs::nBufDirs; bd++) { - - // Begin/end vars to indicate what part - // of main grid to read from or write to based on - // the current neighbor being processed. - IdxTuple copy_begin = gp->get_allocs(); - IdxTuple copy_end = gp->get_allocs(); - - // Adjust along domain dims in this grid. 
- for (auto& dim : _dims->_domain_dims.getDims()) { - auto& dname = dim.getName(); - if (gp->is_dim_used(dname)) { - - // Init range to whole rank domain (including - // outer halos). These may be changed below - // depending on the neighbor's direction. - copy_begin[dname] = first_outer_idx[dname]; - copy_end[dname] = last_outer_idx[dname] + 1; // end = last + 1. - - // Neighbor direction in this dim. - auto neigh_ofs = neigh_offsets[dname]; - - // Region to read from, i.e., data from inside - // this rank's domain to be put into neighbor's - // halo. - if (bd == MPIBufs::bufSend) { - - // Neighbor is to the left. - if (neigh_ofs == idx_t(MPIInfo::rank_prev)) { - - // Only read slice as wide as halo from beginning. - copy_end[dname] = first_inner_idx[dname] + neigh_halo_sizes[dname]; - } - - // Neighbor is to the right. - else if (neigh_ofs == idx_t(MPIInfo::rank_next)) { - - // Only read slice as wide as halo before end. - copy_begin[dname] = last_inner_idx[dname] + 1 - neigh_halo_sizes[dname]; - } - - // Else, this neighbor is in same posn as I am in this dim, - // so we leave the default begin/end settings. - } - - // Region to write to, i.e., into this rank's halo. - else if (bd == MPIBufs::bufRecv) { - - // Neighbor is to the left. - if (neigh_ofs == idx_t(MPIInfo::rank_prev)) { - - // Only read slice as wide as halo before beginning. - copy_begin[dname] = first_inner_idx[dname] - my_halo_sizes[dname]; - copy_end[dname] = first_inner_idx[dname]; - } - - // Neighbor is to the right. - else if (neigh_ofs == idx_t(MPIInfo::rank_next)) { - - // Only read slice as wide as halo after end. - copy_begin[dname] = last_inner_idx[dname] + 1; - copy_end[dname] = last_inner_idx[dname] + 1 + my_halo_sizes[dname]; - } - - // Else, this neighbor is in same posn as I am in this dim, - // so we leave the default begin/end settings. - } - } // domain dims in this grid. - } // domain dims. - - // Sizes of buffer in all dims of this grid. 
- // Also, set begin/end value for non-domain dims. - IdxTuple buf_sizes = gp->get_allocs(); - bool vlen_mults = true; - for (auto& dname : gp->get_dim_names()) { - idx_t dsize = 1; - - // domain dim? - if (_dims->_domain_dims.lookup(dname)) { - dsize = copy_end[dname] - copy_begin[dname]; - - // Check whether size is multiple of vlen. - auto vlen = _dims->_fold_pts[dname]; - if (dsize % vlen != 0) - vlen_mults = false; - } - - // step dim? - // Allowing only one step to be exchanged. - // TODO: consider exchanging mutiple steps at once for WFs. - else if (dname == _dims->_step_dim) { - - // Use 0..1 as a place-holder range. - // The actual values will be supplied during - // halo exchange. - copy_begin[dname] = 0; - copy_end[dname] = 1; - } - - // misc? - // Copy over entire range. - // TODO: make dirty flags for misc dims in grids. - else { - dsize = gp->get_alloc_size(dname); - copy_begin[dname] = gp->get_first_misc_index(dname); - copy_end[dname] = gp->get_last_misc_index(dname) + 1; - } - - // Save computed size. - buf_sizes[dname] = dsize; - - } // all dims in this grid. - - // Does buffer have non-zero size? - if (buf_sizes.size() == 0 || buf_sizes.product() == 0) { - TRACE_MSG("no halo exchange needed for grid '" << gname << - "' with rank " << neigh_rank << - " because there is no data to exchange"); - continue; - } - - // At this point, buf_sizes, copy_begin, and copy_end - // should be set for each dim in this grid. - // Convert end to last. - IdxTuple copy_last = copy_end.subElements(1); - - // Unique name for buffer based on grid name, direction, and ranks. - ostringstream oss; - oss << gname; - if (bd == MPIBufs::bufSend) - oss << "_send_halo_from_" << me << "_to_" << neigh_rank; - else if (bd == MPIBufs::bufRecv) - oss << "_recv_halo_from_" << neigh_rank << "_to_" << me; - string bufname = oss.str(); - - // Make MPI data entry for this grid. 
- auto gbp = mpiData.emplace(gname, _mpiInfo); - auto& gbi = gbp.first; // iterator from pair returned by emplace(). - auto& gbv = gbi->second; // value from iterator. - auto& buf = gbv.getBuf(MPIBufs::BufDir(bd), neigh_offsets); - - // Config buffer for this grid. - // (But don't allocate storage yet.) - buf.begin_pt = copy_begin; - buf.last_pt = copy_last; - buf.num_pts = buf_sizes; - buf.name = bufname; - buf.has_all_vlen_mults = vlen_mults; - - TRACE_MSG("configured MPI buffer object '" << buf.name << - "' for rank at relative offsets " << - neigh_offsets.subElements(1).makeDimValStr() << " with " << - buf.num_pts.makeDimValStr(" * ") << " = " << buf.get_size() << - " element(s) at " << buf.begin_pt.makeDimValStr() << - " ... " << buf.last_pt.makeDimValStr()); - num_exchanges++; - - } // send, recv. - } // grids. - }); // neighbors. - TRACE_MSG("number of halo-exchanges needed on this rank: " << num_exchanges); - - // Base ptrs for all alloc'd data. - // These pointers will be shared by the ones in the grid - // objects, which will take over ownership when these go - // out of scope. - map > _mpi_data_buf; - - // Allocate MPI buffers. - // Pass 0: count required size, allocate chunk of memory at end. - // Pass 1: distribute parts of already-allocated memory chunk. - for (int pass = 0; pass < 2; pass++) { - TRACE_MSG("allocMpiData pass " << pass << " for " << - mpiData.size() << " MPI buffer set(s)"); - - // Count bytes needed and number of buffers for each NUMA node. - map npbytes, nbufs; - - // Grids. - for (auto gp : gridPtrs) { - if (!gp) - continue; - auto& gname = gp->get_name(); - int numa_pref = gp->get_numa_preferred(); - - // MPI bufs for this grid. - if (mpiData.count(gname)) { - auto& grid_mpi_data = mpiData.at(gname); - - // Visit buffers for each neighbor for this grid. - grid_mpi_data.visitNeighbors - ([&](const IdxTuple& roffsets, - int rank, - int idx, - MPIBufs& bufs) { - - // Send and recv. 
- for (int bd = 0; bd < MPIBufs::nBufDirs; bd++) { - auto& buf = grid_mpi_data.getBuf(MPIBufs::BufDir(bd), roffsets); - if (buf.get_size() == 0) - continue; - - // Set storage if buffer has been allocated in pass 0. - if (pass == 1) { - auto p = _mpi_data_buf[numa_pref]; - assert(p); - buf.set_storage(p, npbytes[numa_pref]); - } - - // Determine padded size (also offset to next location). - auto sbytes = buf.get_bytes(); - npbytes[numa_pref] += ROUND_UP(sbytes + _data_buf_pad, - CACHELINE_BYTES); - nbufs[numa_pref]++; - if (pass == 0) - TRACE_MSG(" MPI buf '" << buf.name << "' needs " << - makeByteStr(sbytes) << - " on NUMA node " << numa_pref); - } - } ); - } - } - - // Alloc for each node. - if (pass == 0) - _alloc_data(npbytes, nbufs, _mpi_data_buf, "MPI buffer"); - - } // MPI passes. -#endif - } - - // Allocate memory for scratch grids based on number of threads and - // block sizes. - void StencilContext::allocScratchData(ostream& os) { - - // Remove any old scratch data. - freeScratchData(os); - - // Base ptrs for all alloc'd data. - // This pointer will be shared by the ones in the grid - // objects, which will take over ownership when it goes - // out of scope. - map > _scratch_data_buf; - - // Make sure the right number of threads are set so we - // have the right number of scratch grids. - int rthreads = set_region_threads(); - - // Delete any existing scratch grids. - // Create new scratch grids. - makeScratchGrids(rthreads); - - // Pass 0: count required size, allocate chunk of memory at end. - // Pass 1: distribute parts of already-allocated memory chunk. - for (int pass = 0; pass < 2; pass++) { - TRACE_MSG("allocScratchData pass " << pass << " for " << - scratchVecs.size() << " set(s) of scratch grids"); - - // Count bytes needed and number of grids for each NUMA node. - map npbytes, ngrids; - - // Loop through each scratch grid vector. - for (auto* sgv : scratchVecs) { - assert(sgv); - - // Loop through each scratch grid in this vector. 
- // There will be one for each region thread. - assert(int(sgv->size()) == rthreads); - int thr_num = 0; - for (auto gp : *sgv) { - assert(gp); - auto& gname = gp->get_name(); - int numa_pref = gp->get_numa_preferred(); - - // Loop through each domain dim. - for (auto& dim : _dims->_domain_dims.getDims()) { - auto& dname = dim.getName(); - - if (gp->is_dim_used(dname)) { - - // Set domain size of grid to block size. - gp->_set_domain_size(dname, _opts->_block_sizes[dname]); - - // Pads. - // Set via both 'extra' and 'min'; larger result will be used. - gp->set_extra_pad_size(dname, _opts->_extra_pad_sizes[dname]); - gp->set_min_pad_size(dname, _opts->_min_pad_sizes[dname]); - } - } // dims. - - // Set storage if buffer has been allocated. - if (pass == 1) { - auto p = _scratch_data_buf[numa_pref]; - assert(p); - gp->set_storage(p, npbytes[numa_pref]); - TRACE_MSG(gp->make_info_string()); - } - - // Determine size used (also offset to next location). - size_t nbytes = gp->get_num_storage_bytes(); - npbytes[numa_pref] += ROUND_UP(nbytes + _data_buf_pad, - CACHELINE_BYTES); - ngrids[numa_pref]++; - if (pass == 0) - TRACE_MSG(" scratch grid '" << gname << "' for thread " << - thr_num << " needs " << makeByteStr(nbytes) << - " on NUMA node " << numa_pref); - thr_num++; - } // scratch grids. - } // scratch-grid vecs. - - // Alloc for each node. - if (pass == 0) - _alloc_data(npbytes, ngrids, _scratch_data_buf, "scratch grid"); - - } // scratch-grid passes. - } // Adjust offsets of scratch grids based // on thread and scan indices. @@ -1773,362 +1013,6 @@ namespace yask { } } - - // Set non-scratch grid sizes and offsets based on settings. - // Set wave-front settings. - // This should be called anytime a setting or rank offset is changed. - void StencilContext::update_grids() - { - assert(_opts); - - // Reset halos to zero. - max_halos = _dims->_domain_dims; - - // Loop through each non-scratch grid. 
- for (auto gp : gridPtrs) { - assert(gp); - - // Ignore manually-sized grid. - if (gp->is_fixed_size()) - continue; - - // Loop through each domain dim. - for (auto& dim : _dims->_domain_dims.getDims()) { - auto& dname = dim.getName(); - - if (gp->is_dim_used(dname)) { - - // Rank domains. - gp->_set_domain_size(dname, _opts->_rank_sizes[dname]); - - // Pads. - // Set via both 'extra' and 'min'; larger result will be used. - gp->set_extra_pad_size(dname, _opts->_extra_pad_sizes[dname]); - gp->set_min_pad_size(dname, _opts->_min_pad_sizes[dname]); - - // Offsets. - gp->_set_offset(dname, rank_domain_offsets[dname]); - - // Update max halo across grids, used for wavefront angles. - max_halos[dname] = max(max_halos[dname], gp->get_left_halo_size(dname)); - max_halos[dname] = max(max_halos[dname], gp->get_right_halo_size(dname)); - } - } - } // grids. - - // Calculate wave-front settings based on max halos. - // See the wavefront diagram in run_solution() for description - // of angles and extensions. - auto& step_dim = _dims->_step_dim; - auto wf_steps = _opts->_region_sizes[step_dim]; - num_wf_shifts = 0; - if (wf_steps > 1) - - // TODO: don't shift for scratch grids. - num_wf_shifts = max((idx_t(stBundles.size()) * wf_steps) - 1, idx_t(0)); - for (auto& dim : _dims->_domain_dims.getDims()) { - auto& dname = dim.getName(); - auto rksize = _opts->_rank_sizes[dname]; - auto nranks = _opts->_num_ranks[dname]; - - // Determine the max spatial skewing angles for temporal - // wave-fronts based on the max halos. We only need non-zero - // angles if the region size is less than the rank size and - // there are no other ranks in this dim, i.e., if the region - // covers the global domain in a given dim, no wave-front is - // needed in that dim. TODO: make rounding-up an option. 
- idx_t angle = 0; - if (_opts->_region_sizes[dname] < rksize || nranks > 0) - angle = ROUND_UP(max_halos[dname], _dims->_cluster_pts[dname]); - wf_angles[dname] = angle; - - // Determine the total WF shift to be added in each dim. - idx_t shifts = angle * num_wf_shifts; - wf_shifts[dname] = shifts; - - // Is domain size at least as large as halo + wf_ext in direction - // when there are multiple ranks? - auto min_size = max_halos[dname] + shifts; - if (_opts->_num_ranks[dname] > 1 && rksize < min_size) { - THROW_YASK_EXCEPTION("Error: rank-domain size of " << rksize << " in '" << - dname << "' dim is less than minimum size of " << min_size << - ", which is based on stencil halos and temporal wave-front sizes"); - } - - // If there is another rank to the left, set wave-front - // extension on the left. - left_wf_exts[dname] = _opts->is_first_rank(dname) ? 0 : shifts; - - // If there is another rank to the right, set wave-front - // extension on the right. - right_wf_exts[dname] = _opts->is_last_rank(dname) ? 0 : shifts; - } - - // Now that wave-front settings are known, we can push this info - // back to the grids. It's useful to store this redundant info - // in the grids, because there it's indexed by grid dims instead - // of domain dims. This makes it faster to do grid indexing. - for (auto gp : gridPtrs) { - assert(gp); - - // Ignore manually-sized grid. - if (gp->is_fixed_size()) - continue; - - // Loop through each domain dim. - for (auto& dim : _dims->_domain_dims.getDims()) { - auto& dname = dim.getName(); - if (gp->is_dim_used(dname)) { - - // Set extensions to be the same as the global ones. - gp->_set_left_wf_ext(dname, left_wf_exts[dname]); - gp->_set_right_wf_ext(dname, right_wf_exts[dname]); - } - } - } - } - - // Allocate grids and MPI bufs. - // Initialize some data structures. - void StencilContext::prepare_solution() { - auto& step_dim = _dims->_step_dim; - - // Don't continue until all ranks are this far. 
- _env->global_barrier(); - - ostream& os = get_ostr(); -#ifdef DEBUG - os << "*** WARNING: YASK compiled with DEBUG; ignore performance results.\n"; -#endif -#if defined(NO_INTRINSICS) && (VLEN > 1) - os << "*** WARNING: YASK compiled with NO_INTRINSICS; ignore performance results.\n"; -#endif -#ifdef MODEL_CACHE - os << "*** WARNING: YASK compiled with MODEL_CACHE; ignore performance results.\n"; -#endif -#ifdef TRACE_MEM - os << "*** WARNING: YASK compiled with TRACE_MEM; ignore performance results.\n"; -#endif -#ifdef TRACE_INTRINSICS - os << "*** WARNING: YASK compiled with TRACE_INTRINSICS; ignore performance results.\n"; -#endif - - // reset time keepers. - clear_timers(); - - // Init auto-tuner to run silently during normal operation. - _at.clear(false, false); - - // Adjust all settings before setting MPI buffers or sizing grids. - // Prints final settings. - // TODO: print settings again after auto-tuning. - _opts->adjustSettings(os, _env); - - // Report ranks. - os << endl; - os << "Num ranks: " << _env->get_num_ranks() << endl; - os << "This rank index: " << _env->get_rank_index() << endl; - - // report threads. - os << "Num OpenMP procs: " << omp_get_num_procs() << endl; - set_all_threads(); - os << "Num OpenMP threads: " << omp_get_max_threads() << endl; - set_region_threads(); // Temporary; just for reporting. - os << " Num threads per region: " << omp_get_max_threads() << endl; - set_block_threads(); // Temporary; just for reporting. - os << " Num threads per block: " << omp_get_max_threads() << endl; - - // Set the number of threads for a region. It should stay this - // way for top-level OpenMP parallel sections. - int rthreads = set_region_threads(); - - // Run a dummy nested OMP loop to make sure nested threading is - // initialized. 
-#ifdef _OPENMP -#pragma omp parallel for - for (int i = 0; i < rthreads * 100; i++) { - - idx_t dummy = 0; - set_block_threads(); -#pragma omp parallel for reduction(+:dummy) - for (int j = 0; j < i * 100; j++) { - dummy += j; - } - } -#endif - - // Some grid stats. - os << endl; - os << "Num grids: " << gridPtrs.size() << endl; - os << "Num grids to be updated: " << outputGridPtrs.size() << endl; - - // Set up data based on MPI rank, including grid positions. - // Update all the grid sizes. - setupRank(); - - // Alloc grids, scratch grids, MPI bufs. - // This is the order in which preferred NUMA nodes (e.g., HBW mem) - // will be used. - // We free the scratch and MPI data first to give grids preference. - freeScratchData(os); - freeMpiData(os); - allocGridData(os); - allocScratchData(os); - allocMpiData(os); - - // Report total allocation. - rank_nbytes = get_num_bytes(); - os << "Total allocation in this rank: " << - makeByteStr(rank_nbytes) << "\n"; - tot_nbytes = sumOverRanks(rank_nbytes, _env->comm); - os << "Total overall allocation in " << _env->num_ranks << " rank(s): " << - makeByteStr(tot_nbytes) << "\n"; - - // Report some stats. 
- idx_t dt = _opts->_rank_sizes[step_dim]; - os << "\nProblem sizes in points (from smallest to largest):\n" - " vector-size: " << _dims->_fold_pts.makeDimValStr(" * ") << endl << - " cluster-size: " << _dims->_cluster_pts.makeDimValStr(" * ") << endl << - " sub-block-size: " << _opts->_sub_block_sizes.makeDimValStr(" * ") << endl << - " sub-block-group-size: " << _opts->_sub_block_group_sizes.makeDimValStr(" * ") << endl << - " block-size: " << _opts->_block_sizes.makeDimValStr(" * ") << endl << - " block-group-size: " << _opts->_block_group_sizes.makeDimValStr(" * ") << endl << - " region-size: " << _opts->_region_sizes.makeDimValStr(" * ") << endl << - " rank-domain-size: " << _opts->_rank_sizes.makeDimValStr(" * ") << endl << - " overall-problem-size: " << overall_domain_sizes.makeDimValStr(" * ") << endl << - endl << - "Other settings:\n" - " yask-version: " << yask_get_version_string() << endl << - " stencil-name: " << get_name() << endl << - " element-size: " << makeByteStr(get_element_bytes()) << endl << -#ifdef USE_MPI - " num-ranks: " << _opts->_num_ranks.makeDimValStr(" * ") << endl << - " rank-indices: " << _opts->_rank_indices.makeDimValStr() << endl << - " rank-domain-offsets: " << rank_domain_offsets.makeDimValOffsetStr() << endl << -#endif - " rank-domain: " << rank_bb.bb_begin.makeDimValStr() << - " ... 
" << rank_bb.bb_end.subElements(1).makeDimValStr() << endl << - " vector-len: " << VLEN << endl << - " extra-padding: " << _opts->_extra_pad_sizes.makeDimValStr() << endl << - " minimum-padding: " << _opts->_min_pad_sizes.makeDimValStr() << endl << - " L1-prefetch-distance: " << PFD_L1 << endl << - " L2-prefetch-distance: " << PFD_L2 << endl << - " max-halos: " << max_halos.makeDimValStr() << endl; - if (num_wf_shifts > 0) { - os << - " wave-front-angles: " << wf_angles.makeDimValStr() << endl << - " num-wave-front-shifts: " << num_wf_shifts << endl << - " wave-front-shift-lens: " << wf_shifts.makeDimValStr() << endl << - " left-wave-front-exts: " << left_wf_exts.makeDimValStr() << endl << - " right-wave-front-exts: " << right_wf_exts.makeDimValStr() << endl << - " ext-rank-domain: " << ext_bb.bb_begin.makeDimValStr() << - " ... " << ext_bb.bb_end.subElements(1).makeDimValStr() << endl; - } - os << endl; - - // sums across bundles for this rank. - rank_numWrites_1t = 0; - rank_reads_1t = 0; - rank_numFpOps_1t = 0; - os << "Num stencil bundles: " << stBundles.size() << endl; - for (auto* sg : stBundles) { - idx_t updates1 = sg->get_scalar_points_written(); - idx_t updates_domain = updates1 * sg->bb_num_points; - rank_numWrites_1t += updates_domain; - idx_t reads1 = sg->get_scalar_points_read(); - idx_t reads_domain = reads1 * sg->bb_num_points; - rank_reads_1t += reads_domain; - idx_t fpops1 = sg->get_scalar_fp_ops(); - idx_t fpops_domain = fpops1 * sg->bb_num_points; - rank_numFpOps_1t += fpops_domain; - os << "Stats for bundle '" << sg->get_name() << "':\n" << - " sub-domain: " << sg->bb_begin.makeDimValStr() << - " ... 
" << sg->bb_end.subElements(1).makeDimValStr() << endl << - " sub-domain size: " << sg->bb_len.makeDimValStr(" * ") << endl << - " valid points in sub domain: " << makeNumStr(sg->bb_num_points) << endl << - " grid-updates per point: " << updates1 << endl << - " grid-updates in sub-domain: " << makeNumStr(updates_domain) << endl << - " grid-reads per point: " << reads1 << endl << - " grid-reads in sub-domain: " << makeNumStr(reads_domain) << endl << - " est FP-ops per point: " << fpops1 << endl << - " est FP-ops in sub-domain: " << makeNumStr(fpops_domain) << endl; - } - - // Various metrics for amount of work. - rank_numWrites_dt = rank_numWrites_1t * dt; - tot_numWrites_1t = sumOverRanks(rank_numWrites_1t, _env->comm); - tot_numWrites_dt = tot_numWrites_1t * dt; - - rank_reads_dt = rank_reads_1t * dt; - tot_reads_1t = sumOverRanks(rank_reads_1t, _env->comm); - tot_reads_dt = tot_reads_1t * dt; - - rank_numFpOps_dt = rank_numFpOps_1t * dt; - tot_numFpOps_1t = sumOverRanks(rank_numFpOps_1t, _env->comm); - tot_numFpOps_dt = tot_numFpOps_1t * dt; - - rank_domain_1t = rank_bb.bb_num_points; - rank_domain_dt = rank_domain_1t * dt; // same as _opts->_rank_sizes.product(); - tot_domain_1t = sumOverRanks(rank_domain_1t, _env->comm); - tot_domain_dt = tot_domain_1t * dt; - - // Print some more stats. 
- os << endl << - "Amount-of-work stats:\n" << - " domain-size in this rank for one time-step: " << - makeNumStr(rank_domain_1t) << endl << - " overall-problem-size in all ranks for one time-step: " << - makeNumStr(tot_domain_1t) << endl << - endl << - " num-writes-required in this rank for one time-step: " << - makeNumStr(rank_numWrites_1t) << endl << - " num-writes-required in all ranks for one time-step: " << - makeNumStr(tot_numWrites_1t) << endl << - endl << - " num-reads-required in this rank for one time-step: " << - makeNumStr(rank_reads_1t) << endl << - " num-reads-required in all ranks for one time-step: " << - makeNumStr(tot_reads_1t) << endl << - endl << - " est-FP-ops in this rank for one time-step: " << - makeNumStr(rank_numFpOps_1t) << endl << - " est-FP-ops in all ranks for one time-step: " << - makeNumStr(tot_numFpOps_1t) << endl << - endl; - - if (dt > 1) { - os << - " domain-size in this rank for all time-steps: " << - makeNumStr(rank_domain_dt) << endl << - " overall-problem-size in all ranks for all time-steps: " << - makeNumStr(tot_domain_dt) << endl << - endl << - " num-writes-required in this rank for all time-steps: " << - makeNumStr(rank_numWrites_dt) << endl << - " num-writes-required in all ranks for all time-steps: " << - makeNumStr(tot_numWrites_dt) << endl << - endl << - " num-reads-required in this rank for all time-steps: " << - makeNumStr(rank_reads_dt) << endl << - " num-reads-required in all ranks for all time-steps: " << - makeNumStr(tot_reads_dt) << endl << - endl << - " est-FP-ops in this rank for all time-steps: " << - makeNumStr(rank_numFpOps_dt) << endl << - " est-FP-ops in all ranks for all time-steps: " << - makeNumStr(tot_numFpOps_dt) << endl << - endl; - } - os << - "Notes:\n" - " Domain-sizes and overall-problem-sizes are based on rank-domain sizes\n" - " and number of ranks regardless of number of grids or sub-domains.\n" - " Num-writes-required is based on sum of grid-updates in sub-domain across 
stencil-bundle(s).\n" - " Num-reads-required is based on sum of grid-reads in sub-domain across stencil-bundle(s).\n" - " Est-FP-ops are based on sum of est-FP-ops in sub-domain across stencil-bundle(s).\n" - "\n"; - } - /// Get statistics associated with preceding calls to run_solution(). yk_stats_ptr StencilContext::get_stats() { ostream& os = get_ostr(); @@ -2177,38 +1061,6 @@ namespace yask { return p; } - // Dealloc grids, etc. - void StencilContext::end_solution() { - - // Final halo exchange. - exchange_halos_all(); - - // Release any MPI data. - mpiData.clear(); - - // Release grid data. - for (auto gp : gridPtrs) { - if (!gp) - continue; - gp->release_storage(); - } - - // Reset threads to original value. - set_max_threads(); - } - - // Init all grids & params by calling initFn. - void StencilContext::initValues(function realInitFn) { - ostream& os = get_ostr(); - real_t v = 0.1; - os << "Initializing grids..." << endl; - for (auto gp : gridPtrs) { - realInitFn(gp, v); - v += 0.01; - } - } - // Compare grids in contexts. // Return number of mis-compares. idx_t StencilContext::compareData(const StencilContext& ref) const { @@ -2228,82 +1080,6 @@ namespace yask { return errs; } - // Compute convenience values for a bounding-box. - void BoundingBox::update_bb(ostream& os, - const string& name, - StencilContext& context, - bool force_full) { - - auto dims = context.get_dims(); - auto& domain_dims = dims->_domain_dims; - bb_len = bb_end.subElements(bb_begin); - bb_size = bb_len.product(); - if (force_full) - bb_num_points = bb_size; - - // Solid rectangle? - bb_is_full = true; - if (bb_num_points != bb_size) { - os << "Warning: '" << name << "' domain has only " << - makeNumStr(bb_num_points) << - " valid point(s) inside its bounding-box of " << - makeNumStr(bb_size) << - " point(s); slower scalar calculations will be used.\n"; - bb_is_full = false; - } - - // Does everything start on a vector-length boundary? 
- bb_is_aligned = true; - for (auto& dim : domain_dims.getDims()) { - auto& dname = dim.getName(); - if ((bb_begin[dname] - context.rank_domain_offsets[dname]) % - dims->_fold_pts[dname] != 0) { - os << "Note: '" << name << "' domain" - " has one or more starting edges not on vector boundaries;" - " masked calculations will be used in peel and remainder sub-blocks.\n"; - bb_is_aligned = false; - break; - } - } - - // Lengths are cluster-length multiples? - bb_is_cluster_mult = true; - for (auto& dim : domain_dims.getDims()) { - auto& dname = dim.getName(); - if (bb_len[dname] % dims->_cluster_pts[dname] != 0) { - if (bb_is_full && bb_is_aligned) - os << "Note: '" << name << "' domain" - " has one or more sizes that are not vector-cluster multiples;" - " masked calculations will be used in peel and remainder sub-blocks.\n"; - bb_is_cluster_mult = false; - break; - } - } - - // All done. - bb_valid = true; - } - - // Set the bounding-box for each stencil-bundle and whole domain. - void StencilContext::find_bounding_boxes() - { - ostream& os = get_ostr(); - - // Rank BB is based only on rank offsets and rank domain sizes. - rank_bb.bb_begin = rank_domain_offsets; - rank_bb.bb_end = rank_domain_offsets.addElements(_opts->_rank_sizes, false); - rank_bb.update_bb(os, "rank", *this, true); - - // Overall BB may be extended for wave-fronts. - ext_bb.bb_begin = rank_bb.bb_begin.subElements(left_wf_exts); - ext_bb.bb_end = rank_bb.bb_end.addElements(right_wf_exts); - ext_bb.update_bb(os, "extended-rank", *this, true); - - // Find BB for each bundle. - for (auto sg : stBundles) - sg->find_bounding_box(); - } - // Exchange dirty halo data for all grids and all steps, regardless // of their stencil-bundle. // TODO: loop through all grids in exchange_halos() instead. 
diff --git a/src/kernel/lib/grid_apis.cpp b/src/kernel/lib/grid_apis.cpp new file mode 100644 index 00000000..d0e163ba --- /dev/null +++ b/src/kernel/lib/grid_apis.cpp @@ -0,0 +1,387 @@ +/***************************************************************************** + +YASK: Yet Another Stencil Kernel +Copyright (c) 2014-2018, Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to +deal in the Software without restriction, including without limitation the +rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +sell copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +* The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +IN THE SOFTWARE. + +*****************************************************************************/ + +// Implement methods for yk_grid APIs. + +#include "yask.hpp" +using namespace std; + +namespace yask { + + // APIs to get info from vars. 
+#define GET_GRID_API(api_name, expr, step_ok, domain_ok, misc_ok, prep_req) \ + idx_t YkGridBase::api_name(const string& dim) const { \ + checkDimType(dim, #api_name, step_ok, domain_ok, misc_ok); \ + int posn = get_dim_posn(dim, true, #api_name); \ + if (prep_req && _offsets[posn] < 0) \ + THROW_YASK_EXCEPTION("Error: '" #api_name "()' called on grid '" << \ + get_name() << "' before calling 'prepare_solution()'"); \ + return expr; \ + } \ + idx_t YkGridBase::api_name(int posn) const { \ + return expr; \ + } + GET_GRID_API(get_rank_domain_size, _domains[posn], false, true, false, false) + GET_GRID_API(get_left_pad_size, _left_pads[posn], false, true, false, false) // _left_pads is actual size. + GET_GRID_API(get_right_pad_size, _allocs[posn] - _left_pads[posn], false, true, false, false) // _right_pads is request only. + GET_GRID_API(get_pad_size, _left_pads[posn], false, true, false, false) + GET_GRID_API(get_left_halo_size, _left_halos[posn], false, true, false, false) + GET_GRID_API(get_right_halo_size, _right_halos[posn], false, true, false, false) + GET_GRID_API(get_halo_size, _left_halos[posn], false, true, false, false) + GET_GRID_API(get_first_misc_index, _offsets[posn], false, false, true, false) + GET_GRID_API(get_last_misc_index, _offsets[posn] + _domains[posn] - 1, false, false, true, false) + GET_GRID_API(get_left_extra_pad_size, _left_pads[posn] - _left_halos[posn], false, true, false, false) + GET_GRID_API(get_right_extra_pad_size, (_allocs[posn] - _left_pads[posn] - _domains[posn]) - + _right_halos[posn], false, true, false, false) + GET_GRID_API(get_extra_pad_size, _left_pads[posn] - _left_halos[posn], false, true, false, false) + GET_GRID_API(get_alloc_size, _allocs[posn], true, true, true, false) + GET_GRID_API(get_first_rank_domain_index, _offsets[posn] - _local_offsets[posn], false, true, false, true) + GET_GRID_API(get_last_rank_domain_index, _offsets[posn] - _local_offsets[posn] + _domains[posn] - 1; + assert(!_is_scratch), false, true, 
false, true) + GET_GRID_API(get_first_rank_halo_index, _offsets[posn] - _left_halos[posn], false, false, true, true) + GET_GRID_API(get_last_rank_halo_index, _offsets[posn] + _domains[posn] + _right_halos[posn] - 1, false, false, true, true) + GET_GRID_API(get_first_rank_alloc_index, _offsets[posn] - _left_pads[posn], false, true, false, true) + GET_GRID_API(get_last_rank_alloc_index, _offsets[posn] - _left_pads[posn] + _allocs[posn] - 1, false, true, false, true) + GET_GRID_API(_get_left_wf_ext, _left_wf_exts[posn], true, true, true, false) + GET_GRID_API(_get_right_wf_ext, _right_wf_exts[posn], true, true, true, false) + GET_GRID_API(_get_offset, _offsets[posn], true, true, true, true) + GET_GRID_API(_get_local_offset, _local_offsets[posn], true, true, true, false) + GET_GRID_API(_get_first_alloc_index, _offsets[posn] - _left_pads[posn], true, true, true, true) + GET_GRID_API(_get_last_alloc_index, _offsets[posn] - _left_pads[posn] + _allocs[posn] - 1, true, true, true, true) +#undef GET_GRID_API + + // APIs to set vars. +#define COMMA , +#define SET_GRID_API(api_name, expr, step_ok, domain_ok, misc_ok) \ + void YkGridBase::api_name(const string& dim, idx_t n) { \ + TRACE_MSG0(get_ostr(), "grid '" << get_name() << "'." 
\ + #api_name "('" << dim << "', " << n << ")"); \ + checkDimType(dim, #api_name, step_ok, domain_ok, misc_ok); \ + int posn = get_dim_posn(dim, true, #api_name); \ + expr; \ + } \ + void YkGridBase::api_name(int posn, idx_t n) { \ + int dim = posn; \ + expr; \ + } + SET_GRID_API(_set_offset, _offsets[posn] = n, true, true, true) + SET_GRID_API(_set_local_offset, _local_offsets[posn] = n; + _vec_local_offsets[posn] = n / _vec_lens[posn], true, true, true) + SET_GRID_API(_set_domain_size, _domains[posn] = n; resize(), true, true, true) + SET_GRID_API(_set_left_pad_size, _left_pads[posn] = n; resize(), true, true, true) + SET_GRID_API(_set_right_pad_size, _right_pads[posn] = n; resize(), true, true, true) + SET_GRID_API(_set_left_wf_ext, _left_wf_exts[posn] = n; resize(), true, true, true) + SET_GRID_API(_set_right_wf_ext, _right_wf_exts[posn] = n; resize(), true, true, true) + SET_GRID_API(set_left_halo_size, _left_halos[posn] = n; resize(), false, true, false) + SET_GRID_API(set_right_halo_size, _right_halos[posn] = n; resize(), false, true, false) + SET_GRID_API(set_halo_size, _left_halos[posn] = _right_halos[posn] = n; resize(), false, true, false) + + SET_GRID_API(set_alloc_size, _set_domain_size(posn, n), true, false, true) + SET_GRID_API(set_left_min_pad_size, + if (!get_raw_storage_buffer() && n > _left_pads[posn]) + _set_left_pad_size(posn, n), + false, true, false) + SET_GRID_API(set_right_min_pad_size, + if (!get_raw_storage_buffer() && n > _right_pads[posn]) + _set_right_pad_size(posn, n), + false, true, false) + SET_GRID_API(set_min_pad_size, + if (!get_raw_storage_buffer() && n > _left_pads[posn]) + _set_left_pad_size(posn, n); + if (!get_raw_storage_buffer() && n > _right_pads[posn]) + _set_right_pad_size(posn, n), + false, true, false) + SET_GRID_API(set_left_extra_pad_size, + set_left_min_pad_size(posn, _left_halos[posn] + _left_wf_exts[posn] + n), false, true, false) + SET_GRID_API(set_right_extra_pad_size, + set_right_min_pad_size(posn, 
_right_halos[posn] + _right_wf_exts[posn] + n), false, true, false) + SET_GRID_API(set_extra_pad_size, set_left_extra_pad_size(posn, n); + set_right_extra_pad_size(posn, n), false, true, false) + SET_GRID_API(set_first_misc_index, _offsets[posn] = n, false, false, true) +#undef COMMA +#undef SET_GRID_API + + bool YkGridBase::is_storage_layout_identical(const yk_grid_ptr other) const { + auto op = dynamic_pointer_cast(other); + assert(op); + + // Same size? + if (get_num_storage_bytes() != op->get_num_storage_bytes()) + return false; + + // Same dims? + if (get_num_dims() != op->get_num_dims()) + return false; + for (int i = 0; i < get_num_dims(); i++) { + auto dname = get_dim_name(i); + + // Same dims? + if (dname != op->get_dim_name(i)) + return false; + + // Same sizes? + // NB: not checking right pads because actual values + // are determined as function of other 3. + if (_allocs[i] != op->_allocs[i]) + return false; + if (_domains[i] != op->_domains[i]) + return false; + if (_left_pads[i] != op->_left_pads[i]) + return false; + } + return true; + } + + void YkGridBase::share_storage(yk_grid_ptr source) { + auto sp = dynamic_pointer_cast(source); + assert(sp); + + if (!sp->get_raw_storage_buffer()) { + THROW_YASK_EXCEPTION("Error: share_storage() called without source storage allocated"); + } + + // Determine required padding from halos. + Indices left_pads2 = getReqdPad(_left_halos, _left_wf_exts); + Indices right_pads2 = getReqdPad(_right_halos, _left_wf_exts); + + // NB: requirements to successful share_storage() is not as strict as + // is_storage_layout_identical(). See note on pad & halo below and API docs. + for (int i = 0; i < get_num_dims(); i++) { + auto dname = get_dim_name(i); + + // Same dims? + if (sp->get_num_dims() != get_num_dims() || + sp->get_dim_name(i) != dname) + THROW_YASK_EXCEPTION("Error: share_storage() called with incompatible grids: " << + make_info_string() << " and " << sp->make_info_string()); + + + // Check folding. 
+ if (_vec_lens[i] != sp->_vec_lens[i]) { + THROW_YASK_EXCEPTION("Error: attempt to share storage from grid '" << sp->get_name() << + "' of fold-length " << sp->_vec_lens[i] << " with grid '" << get_name() << + "' of fold-length " << _vec_lens[i] << " in '" << dname << "' dim"); + } + + // Not a domain dim? + bool is_domain = _dims->_domain_dims.lookup(dname) != 0; + if (!is_domain) { + auto tas = get_alloc_size(dname); + auto sas = sp->get_alloc_size(dname); + if (tas != sas) { + THROW_YASK_EXCEPTION("Error: attempt to share storage from grid '" << sp->get_name() << + "' of alloc-size " << sas << " with grid '" << get_name() << + "' of alloc-size " << tas << " in '" << dname << "' dim"); + } + } + + // Domain dim. + else { + auto tdom = get_rank_domain_size(i); + auto sdom = sp->get_rank_domain_size(i); + if (tdom != sdom) { + THROW_YASK_EXCEPTION("Error: attempt to share storage from grid '" << sp->get_name() << + "' of domain-size " << sdom << " with grid '" << get_name() << + "' of domain-size " << tdom << " in '" << dname << "' dim"); + } + + // Halo and pad sizes don't have to be the same. + // Requirement is that halo (reqd pad) of target fits inside of pad of source. + auto spad = sp->get_left_pad_size(i); + if (left_pads2[i] > spad) { + THROW_YASK_EXCEPTION("Error: attempt to share storage from grid '" << sp->get_name() << + "' of left padding-size " << spad << + ", which is insufficient for grid '" << get_name() << + "' requiring " << left_pads2[i] << " in '" << dname << "' dim"); + } + spad = sp->get_right_pad_size(i); + if (right_pads2[i] > spad) { + THROW_YASK_EXCEPTION("Error: attempt to share storage from grid '" << sp->get_name() << + "' of right padding-size " << spad << + ", which is insufficient for grid '" << get_name() << + "' requiring " << right_pads2[i] << " in '" << dname << "' dim"); + } + } + } + + // Copy pad sizes. 
+ for (int i = 0; i < get_num_dims(); i++) { + auto dname = get_dim_name(i); + bool is_domain = _dims->_domain_dims.lookup(dname) != 0; + if (is_domain) { + _left_pads[i] = sp->_left_pads[i]; + _right_pads[i] = sp->_right_pads[i]; + } + } + + // Copy data. + release_storage(); + resize(); + if (!share_data(sp.get(), true)) { + THROW_YASK_EXCEPTION("Error: unexpected failure in data sharing"); + } + } + + // API get, set, setc. + bool YkGridBase::is_element_allocated(const Indices& indices) const { + if (!is_storage_allocated()) + return false; + return checkIndices(indices, "is_element_allocated", false, false); + } + double YkGridBase::get_element(const Indices& indices) const { + if (!is_storage_allocated()) { + THROW_YASK_EXCEPTION("Error: call to 'get_element' with no data allocated for grid '" << + get_name() << "'"); + } + checkIndices(indices, "get_element", true, false); + idx_t asi = get_alloc_step_index(indices); + real_t val = readElem(indices, asi, __LINE__); + return double(val); + } + idx_t YkGridBase::set_element(double val, + const Indices& indices, + bool strict_indices) { + idx_t nup = 0; + if (get_raw_storage_buffer() && + checkIndices(indices, "set_element", strict_indices, false)) { + idx_t asi = get_alloc_step_index(indices); + writeElem(real_t(val), indices, asi, __LINE__); + nup++; + + // Set appropriate dirty flag. + set_dirty_using_alloc_index(true, asi); + } + return nup; + } + idx_t YkGridBase::add_to_element(double val, + const Indices& indices, + bool strict_indices) { + idx_t nup = 0; + if (get_raw_storage_buffer() && + checkIndices(indices, "add_to_element", strict_indices, false)) { + idx_t asi = get_alloc_step_index(indices); + addToElem(real_t(val), indices, asi, __LINE__); + nup++; + + // Set appropriate dirty flag. 
+ set_dirty_using_alloc_index(true, asi); + } + return nup; + } + + idx_t YkGridBase::get_elements_in_slice(void* buffer_ptr, + const Indices& first_indices, + const Indices& last_indices) const { + if (!is_storage_allocated()) { + THROW_YASK_EXCEPTION("Error: call to 'get_elements_in_slice' with no data allocated for grid '" << + get_name() << "'"); + } + checkIndices(first_indices, "get_elements_in_slice", true, false); + checkIndices(last_indices, "get_elements_in_slice", true, false); + + // Find range. + IdxTuple numElemsTuple = get_slice_range(first_indices, last_indices); + + // Visit points in slice. + numElemsTuple.visitAllPointsInParallel + ([&](const IdxTuple& ofs, size_t idx) { + Indices pt = first_indices.addElements(ofs); + + // TODO: move this outside of loop for const step index. + idx_t asi = get_alloc_step_index(pt); + + real_t val = readElem(pt, asi, __LINE__); + ((real_t*)buffer_ptr)[idx] = val; + return true; // keep going. + }); + return numElemsTuple.product(); + } + idx_t YkGridBase::set_elements_in_slice_same(double val, + const Indices& first_indices, + const Indices& last_indices, + bool strict_indices) { + if (!is_storage_allocated()) + return 0; + + // 'Fixed' copy of indices. + Indices first, last; + checkIndices(first_indices, "set_elements_in_slice_same", + strict_indices, false, &first); + checkIndices(last_indices, "set_elements_in_slice_same", + strict_indices, false, &last); + + // Find range. + IdxTuple numElemsTuple = get_slice_range(first, last); + + // Visit points in slice. + numElemsTuple.visitAllPointsInParallel([&](const IdxTuple& ofs, + size_t idx) { + Indices pt = first.addElements(ofs); + + // TODO: move this outside of loop for const step index. + idx_t asi = get_alloc_step_index(pt); + + writeElem(real_t(val), pt, asi, __LINE__); + return true; // keep going. + }); + + // Set appropriate dirty flag(s). 
+ set_dirty_in_slice(first, last); + + return numElemsTuple.product(); + } + idx_t YkGridBase::set_elements_in_slice(const void* buffer_ptr, + const Indices& first_indices, + const Indices& last_indices) { + if (!is_storage_allocated()) + return 0; + checkIndices(first_indices, "set_elements_in_slice", true, false); + checkIndices(last_indices, "set_elements_in_slice", true, false); + + // Find range. + IdxTuple numElemsTuple = get_slice_range(first_indices, last_indices); + + // Visit points in slice. + numElemsTuple.visitAllPointsInParallel + ([&](const IdxTuple& ofs, + size_t idx) { + Indices pt = first_indices.addElements(ofs); + + // TODO: move this outside of loop for const step index. + idx_t asi = get_alloc_step_index(pt); + + real_t val = ((real_t*)buffer_ptr)[idx]; + writeElem(val, pt, asi, __LINE__); + return true; // keep going. + }); + + // Set appropriate dirty flag(s). + set_dirty_in_slice(first_indices, last_indices); + + return numElemsTuple.product(); + } + +} // namespace. + diff --git a/src/kernel/lib/realv_grids.cpp b/src/kernel/lib/realv_grids.cpp index 1a4faa87..51ba3086 100644 --- a/src/kernel/lib/realv_grids.cpp +++ b/src/kernel/lib/realv_grids.cpp @@ -30,99 +30,6 @@ using namespace std; namespace yask { - // APIs to get info from vars. -#define GET_GRID_API(api_name, expr, step_ok, domain_ok, misc_ok, prep_req) \ - idx_t YkGridBase::api_name(const string& dim) const { \ - checkDimType(dim, #api_name, step_ok, domain_ok, misc_ok); \ - int posn = get_dim_posn(dim, true, #api_name); \ - if (prep_req && _offsets[posn] < 0) \ - THROW_YASK_EXCEPTION("Error: '" #api_name "()' called on grid '" << \ - get_name() << "' before calling 'prepare_solution()'"); \ - return expr; \ - } \ - idx_t YkGridBase::api_name(int posn) const { \ - return expr; \ - } - GET_GRID_API(get_rank_domain_size, _domains[posn], false, true, false, false) - GET_GRID_API(get_left_pad_size, _left_pads[posn], false, true, false, false) // _left_pads is actual size. 
- GET_GRID_API(get_right_pad_size, _allocs[posn] - _left_pads[posn], false, true, false, false) // _right_pads is request only. - GET_GRID_API(get_pad_size, _left_pads[posn], false, true, false, false) - GET_GRID_API(get_left_halo_size, _left_halos[posn], false, true, false, false) - GET_GRID_API(get_right_halo_size, _right_halos[posn], false, true, false, false) - GET_GRID_API(get_halo_size, _left_halos[posn], false, true, false, false) - GET_GRID_API(get_first_misc_index, _offsets[posn], false, false, true, false) - GET_GRID_API(get_last_misc_index, _offsets[posn] + _domains[posn] - 1, false, false, true, false) - GET_GRID_API(get_left_extra_pad_size, _left_pads[posn] - _left_halos[posn], false, true, false, false) - GET_GRID_API(get_right_extra_pad_size, (_allocs[posn] - _left_pads[posn] - _domains[posn]) - - _right_halos[posn], false, true, false, false) - GET_GRID_API(get_extra_pad_size, _left_pads[posn] - _left_halos[posn], false, true, false, false) - GET_GRID_API(get_alloc_size, _allocs[posn], true, true, true, false) - GET_GRID_API(get_first_rank_domain_index, _offsets[posn] - _local_offsets[posn], false, true, false, true) - GET_GRID_API(get_last_rank_domain_index, _offsets[posn] - _local_offsets[posn] + _domains[posn] - 1; - assert(!_is_scratch), false, true, false, true) - GET_GRID_API(get_first_rank_halo_index, _offsets[posn] - _left_halos[posn], false, false, true, true) - GET_GRID_API(get_last_rank_halo_index, _offsets[posn] + _domains[posn] + _right_halos[posn] - 1, false, false, true, true) - GET_GRID_API(get_first_rank_alloc_index, _offsets[posn] - _left_pads[posn], false, true, false, true) - GET_GRID_API(get_last_rank_alloc_index, _offsets[posn] - _left_pads[posn] + _allocs[posn] - 1, false, true, false, true) - GET_GRID_API(_get_left_wf_ext, _left_wf_exts[posn], true, true, true, false) - GET_GRID_API(_get_right_wf_ext, _right_wf_exts[posn], true, true, true, false) - GET_GRID_API(_get_offset, _offsets[posn], true, true, true, true) - 
GET_GRID_API(_get_local_offset, _local_offsets[posn], true, true, true, false) - GET_GRID_API(_get_first_alloc_index, _offsets[posn] - _left_pads[posn], true, true, true, true) - GET_GRID_API(_get_last_alloc_index, _offsets[posn] - _left_pads[posn] + _allocs[posn] - 1, true, true, true, true) -#undef GET_GRID_API - - // APIs to set vars. -#define COMMA , -#define SET_GRID_API(api_name, expr, step_ok, domain_ok, misc_ok) \ - void YkGridBase::api_name(const string& dim, idx_t n) { \ - TRACE_MSG0(get_ostr(), "grid '" << get_name() << "'." \ - #api_name "('" << dim << "', " << n << ")"); \ - checkDimType(dim, #api_name, step_ok, domain_ok, misc_ok); \ - int posn = get_dim_posn(dim, true, #api_name); \ - expr; \ - } \ - void YkGridBase::api_name(int posn, idx_t n) { \ - int dim = posn; \ - expr; \ - } - SET_GRID_API(_set_offset, _offsets[posn] = n, true, true, true) - SET_GRID_API(_set_local_offset, _local_offsets[posn] = n; - _vec_local_offsets[posn] = n / _vec_lens[posn], true, true, true) - SET_GRID_API(_set_domain_size, _domains[posn] = n; resize(), true, true, true) - SET_GRID_API(_set_left_pad_size, _left_pads[posn] = n; resize(), true, true, true) - SET_GRID_API(_set_right_pad_size, _right_pads[posn] = n; resize(), true, true, true) - SET_GRID_API(_set_left_wf_ext, _left_wf_exts[posn] = n; resize(), true, true, true) - SET_GRID_API(_set_right_wf_ext, _right_wf_exts[posn] = n; resize(), true, true, true) - SET_GRID_API(set_left_halo_size, _left_halos[posn] = n; resize(), false, true, false) - SET_GRID_API(set_right_halo_size, _right_halos[posn] = n; resize(), false, true, false) - SET_GRID_API(set_halo_size, _left_halos[posn] = _right_halos[posn] = n; resize(), false, true, false) - - SET_GRID_API(set_alloc_size, _set_domain_size(posn, n), true, false, true) - SET_GRID_API(set_left_min_pad_size, - if (!get_raw_storage_buffer() && n > _left_pads[posn]) - _set_left_pad_size(posn, n), - false, true, false) - SET_GRID_API(set_right_min_pad_size, - if 
(!get_raw_storage_buffer() && n > _right_pads[posn]) - _set_right_pad_size(posn, n), - false, true, false) - SET_GRID_API(set_min_pad_size, - if (!get_raw_storage_buffer() && n > _left_pads[posn]) - _set_left_pad_size(posn, n); - if (!get_raw_storage_buffer() && n > _right_pads[posn]) - _set_right_pad_size(posn, n), - false, true, false) - SET_GRID_API(set_left_extra_pad_size, - set_left_min_pad_size(posn, _left_halos[posn] + _left_wf_exts[posn] + n), false, true, false) - SET_GRID_API(set_right_extra_pad_size, - set_right_min_pad_size(posn, _right_halos[posn] + _right_wf_exts[posn] + n), false, true, false) - SET_GRID_API(set_extra_pad_size, set_left_extra_pad_size(posn, n); - set_right_extra_pad_size(posn, n), false, true, false) - SET_GRID_API(set_first_misc_index, _offsets[posn] = n, false, false, true) -#undef COMMA -#undef SET_GRID_API - // Ctor. YkGridBase::YkGridBase(GenericGridBase* ggb, size_t ndims, @@ -150,7 +57,6 @@ namespace yask { _vec_local_offsets.setFromConst(0, n); } - // Convenience function to format indices like // "x=5, y=3". std::string YkGridBase::makeIndexString(const Indices& idxs, @@ -304,127 +210,6 @@ namespace yask { _dims->checkDimType(dim, fn_name, step_ok, domain_ok, misc_ok); } - bool YkGridBase::is_storage_layout_identical(const yk_grid_ptr other) const { - auto op = dynamic_pointer_cast(other); - assert(op); - - // Same size? - if (get_num_storage_bytes() != op->get_num_storage_bytes()) - return false; - - // Same dims? - if (get_num_dims() != op->get_num_dims()) - return false; - for (int i = 0; i < get_num_dims(); i++) { - auto dname = get_dim_name(i); - - // Same dims? - if (dname != op->get_dim_name(i)) - return false; - - // Same sizes? - // NB: not checking right pads because actual values - // are determined as function of other 3. 
- if (_allocs[i] != op->_allocs[i]) - return false; - if (_domains[i] != op->_domains[i]) - return false; - if (_left_pads[i] != op->_left_pads[i]) - return false; - } - return true; - } - - void YkGridBase::share_storage(yk_grid_ptr source) { - auto sp = dynamic_pointer_cast(source); - assert(sp); - - if (!sp->get_raw_storage_buffer()) { - THROW_YASK_EXCEPTION("Error: share_storage() called without source storage allocated"); - } - - // Determine required padding from halos. - Indices left_pads2 = getReqdPad(_left_halos, _left_wf_exts); - Indices right_pads2 = getReqdPad(_right_halos, _left_wf_exts); - - // NB: requirements to successful share_storage() is not as strict as - // is_storage_layout_identical(). See note on pad & halo below and API docs. - for (int i = 0; i < get_num_dims(); i++) { - auto dname = get_dim_name(i); - - // Same dims? - if (sp->get_num_dims() != get_num_dims() || - sp->get_dim_name(i) != dname) - THROW_YASK_EXCEPTION("Error: share_storage() called with incompatible grids: " << - make_info_string() << " and " << sp->make_info_string()); - - - // Check folding. - if (_vec_lens[i] != sp->_vec_lens[i]) { - THROW_YASK_EXCEPTION("Error: attempt to share storage from grid '" << sp->get_name() << - "' of fold-length " << sp->_vec_lens[i] << " with grid '" << get_name() << - "' of fold-length " << _vec_lens[i] << " in '" << dname << "' dim"); - } - - // Not a domain dim? - bool is_domain = _dims->_domain_dims.lookup(dname) != 0; - if (!is_domain) { - auto tas = get_alloc_size(dname); - auto sas = sp->get_alloc_size(dname); - if (tas != sas) { - THROW_YASK_EXCEPTION("Error: attempt to share storage from grid '" << sp->get_name() << - "' of alloc-size " << sas << " with grid '" << get_name() << - "' of alloc-size " << tas << " in '" << dname << "' dim"); - } - } - - // Domain dim. 
- else { - auto tdom = get_rank_domain_size(i); - auto sdom = sp->get_rank_domain_size(i); - if (tdom != sdom) { - THROW_YASK_EXCEPTION("Error: attempt to share storage from grid '" << sp->get_name() << - "' of domain-size " << sdom << " with grid '" << get_name() << - "' of domain-size " << tdom << " in '" << dname << "' dim"); - } - - // Halo and pad sizes don't have to be the same. - // Requirement is that halo (reqd pad) of target fits inside of pad of source. - auto spad = sp->get_left_pad_size(i); - if (left_pads2[i] > spad) { - THROW_YASK_EXCEPTION("Error: attempt to share storage from grid '" << sp->get_name() << - "' of left padding-size " << spad << - ", which is insufficient for grid '" << get_name() << - "' requiring " << left_pads2[i] << " in '" << dname << "' dim"); - } - spad = sp->get_right_pad_size(i); - if (right_pads2[i] > spad) { - THROW_YASK_EXCEPTION("Error: attempt to share storage from grid '" << sp->get_name() << - "' of right padding-size " << spad << - ", which is insufficient for grid '" << get_name() << - "' requiring " << right_pads2[i] << " in '" << dname << "' dim"); - } - } - } - - // Copy pad sizes. - for (int i = 0; i < get_num_dims(); i++) { - auto dname = get_dim_name(i); - bool is_domain = _dims->_domain_dims.lookup(dname) != 0; - if (is_domain) { - _left_pads[i] = sp->_left_pads[i]; - _right_pads[i] = sp->_right_pads[i]; - } - } - - // Copy data. - release_storage(); - resize(); - if (!share_data(sp.get(), true)) { - THROW_YASK_EXCEPTION("Error: unexpected failure in data sharing"); - } - } - // Check for equality. // Return number of mismatches greater than epsilon. idx_t YkGridBase::compare(const YkGridBase* ref, @@ -587,145 +372,6 @@ namespace yask { return numElemsTuple; } - // API get, set, setc. 
- bool YkGridBase::is_element_allocated(const Indices& indices) const { - if (!is_storage_allocated()) - return false; - return checkIndices(indices, "is_element_allocated", false, false); - } - double YkGridBase::get_element(const Indices& indices) const { - if (!is_storage_allocated()) { - THROW_YASK_EXCEPTION("Error: call to 'get_element' with no data allocated for grid '" << - get_name() << "'"); - } - checkIndices(indices, "get_element", true, false); - idx_t asi = get_alloc_step_index(indices); - real_t val = readElem(indices, asi, __LINE__); - return double(val); - } - idx_t YkGridBase::set_element(double val, - const Indices& indices, - bool strict_indices) { - idx_t nup = 0; - if (get_raw_storage_buffer() && - checkIndices(indices, "set_element", strict_indices, false)) { - idx_t asi = get_alloc_step_index(indices); - writeElem(real_t(val), indices, asi, __LINE__); - nup++; - - // Set appropriate dirty flag. - set_dirty_using_alloc_index(true, asi); - } - return nup; - } - idx_t YkGridBase::add_to_element(double val, - const Indices& indices, - bool strict_indices) { - idx_t nup = 0; - if (get_raw_storage_buffer() && - checkIndices(indices, "add_to_element", strict_indices, false)) { - idx_t asi = get_alloc_step_index(indices); - addToElem(real_t(val), indices, asi, __LINE__); - nup++; - - // Set appropriate dirty flag. - set_dirty_using_alloc_index(true, asi); - } - return nup; - } - - idx_t YkGridBase::get_elements_in_slice(void* buffer_ptr, - const Indices& first_indices, - const Indices& last_indices) const { - if (!is_storage_allocated()) { - THROW_YASK_EXCEPTION("Error: call to 'get_elements_in_slice' with no data allocated for grid '" << - get_name() << "'"); - } - checkIndices(first_indices, "get_elements_in_slice", true, false); - checkIndices(last_indices, "get_elements_in_slice", true, false); - - // Find range. - IdxTuple numElemsTuple = get_slice_range(first_indices, last_indices); - - // Visit points in slice. 
- numElemsTuple.visitAllPointsInParallel - ([&](const IdxTuple& ofs, size_t idx) { - Indices pt = first_indices.addElements(ofs); - - // TODO: move this outside of loop for const step index. - idx_t asi = get_alloc_step_index(pt); - - real_t val = readElem(pt, asi, __LINE__); - ((real_t*)buffer_ptr)[idx] = val; - return true; // keep going. - }); - return numElemsTuple.product(); - } - idx_t YkGridBase::set_elements_in_slice_same(double val, - const Indices& first_indices, - const Indices& last_indices, - bool strict_indices) { - if (!is_storage_allocated()) - return 0; - - // 'Fixed' copy of indices. - Indices first, last; - checkIndices(first_indices, "set_elements_in_slice_same", - strict_indices, false, &first); - checkIndices(last_indices, "set_elements_in_slice_same", - strict_indices, false, &last); - - // Find range. - IdxTuple numElemsTuple = get_slice_range(first, last); - - // Visit points in slice. - numElemsTuple.visitAllPointsInParallel([&](const IdxTuple& ofs, - size_t idx) { - Indices pt = first.addElements(ofs); - - // TODO: move this outside of loop for const step index. - idx_t asi = get_alloc_step_index(pt); - - writeElem(real_t(val), pt, asi, __LINE__); - return true; // keep going. - }); - - // Set appropriate dirty flag(s). - set_dirty_in_slice(first, last); - - return numElemsTuple.product(); - } - idx_t YkGridBase::set_elements_in_slice(const void* buffer_ptr, - const Indices& first_indices, - const Indices& last_indices) { - if (!is_storage_allocated()) - return 0; - checkIndices(first_indices, "set_elements_in_slice", true, false); - checkIndices(last_indices, "set_elements_in_slice", true, false); - - // Find range. - IdxTuple numElemsTuple = get_slice_range(first_indices, last_indices); - - // Visit points in slice. - numElemsTuple.visitAllPointsInParallel - ([&](const IdxTuple& ofs, - size_t idx) { - Indices pt = first_indices.addElements(ofs); - - // TODO: move this outside of loop for const step index. 
- idx_t asi = get_alloc_step_index(pt); - - real_t val = ((real_t*)buffer_ptr)[idx]; - writeElem(val, pt, asi, __LINE__); - return true; // keep going. - }); - - // Set appropriate dirty flag(s). - set_dirty_in_slice(first_indices, last_indices); - - return numElemsTuple.product(); - } - // Print one element like // "message: mygrid[x=4, y=7] = 3.14 at line 35". void YkGridBase::printElem(const std::string& msg, diff --git a/src/kernel/lib/setup.cpp b/src/kernel/lib/setup.cpp new file mode 100644 index 00000000..0cc0b503 --- /dev/null +++ b/src/kernel/lib/setup.cpp @@ -0,0 +1,1260 @@ +/***************************************************************************** + +YASK: Yet Another Stencil Kernel +Copyright (c) 2014-2018, Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to +deal in the Software without restriction, including without limitation the +rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +sell copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +* The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +IN THE SOFTWARE. + +*****************************************************************************/ + +// This file contains implementations of StencilContext methods +// specific to the preparation steps. 
+ +#include "yask.hpp" +using namespace std; + +namespace yask { + + // Init MPI-related vars and other vars related to my rank's place in + // the global problem: rank index, offset, etc. Need to call this even + // if not using MPI to properly init these vars. Called from + // prepare_solution(), so it doesn't normally need to be called from user code. + void StencilContext::setupRank() { + ostream& os = get_ostr(); + auto& step_dim = _dims->_step_dim; + auto me = _env->my_rank; + + // Check ranks. + idx_t req_ranks = _opts->_num_ranks.product(); + if (req_ranks != _env->num_ranks) { + THROW_YASK_EXCEPTION("error: " << req_ranks << " rank(s) requested (" << + _opts->_num_ranks.makeDimValStr(" * ") << "), but " << + _env->num_ranks << " rank(s) are active"); + } + assertEqualityOverRanks(_opts->_rank_sizes[step_dim], _env->comm, "num steps"); + + // Determine my coordinates if not provided already. + // TODO: do this more intelligently based on proximity. + if (_opts->find_loc) + _opts->_rank_indices = _opts->_num_ranks.unlayout(me); + + // A table of rank-coordinates for everyone. + auto num_ddims = _opts->_rank_indices.size(); // domain-dims only! + idx_t coords[_env->num_ranks][num_ddims]; + + // Init coords for this rank. + for (int i = 0; i < num_ddims; i++) + coords[me][i] = _opts->_rank_indices[i]; + + // A table of rank-domain sizes for everyone. + idx_t rsizes[_env->num_ranks][num_ddims]; + + // Init sizes for this rank. + for (int di = 0; di < num_ddims; di++) { + auto& dname = _opts->_rank_indices.getDimName(di); + rsizes[me][di] = _opts->_rank_sizes[dname]; + } + +#ifdef USE_MPI + // Exchange coord and size info between all ranks. + for (int rn = 0; rn < _env->num_ranks; rn++) { + MPI_Bcast(&coords[rn][0], num_ddims, MPI_INTEGER8, + rn, _env->comm); + MPI_Bcast(&rsizes[rn][0], num_ddims, MPI_INTEGER8, + rn, _env->comm); + } + // Now, the tables are filled in for all ranks. +#endif + + // Init offsets and total sizes. 
+ rank_domain_offsets.setValsSame(0); + overall_domain_sizes.setValsSame(0); + + // Loop over all ranks, including myself. + int num_neighbors = 0; + for (int rn = 0; rn < _env->num_ranks; rn++) { + + // Coord offset of rn from me: prev => negative, self => 0, next => positive. + IdxTuple rcoords(_dims->_domain_dims); + IdxTuple rdeltas(_dims->_domain_dims); + for (int di = 0; di < num_ddims; di++) { + rcoords[di] = coords[rn][di]; + rdeltas[di] = coords[rn][di] - _opts->_rank_indices[di]; + } + + // Manhattan distance from rn (sum of abs deltas in all dims). + // Max distance in any dim. + int mandist = 0; + int maxdist = 0; + for (int di = 0; di < num_ddims; di++) { + mandist += abs(rdeltas[di]); + maxdist = max(maxdist, abs(int(rdeltas[di]))); + } + + // Myself. + if (rn == me) { + if (mandist != 0) + THROW_YASK_EXCEPTION("Internal error: distance to own rank == " << mandist); + } + + // Someone else. + else { + if (mandist == 0) + THROW_YASK_EXCEPTION("Error: ranks " << me << + " and " << rn << " at same coordinates"); + } + + // Loop through domain dims. + for (int di = 0; di < num_ddims; di++) { + auto& dname = _opts->_rank_indices.getDimName(di); + + // Is rank 'rn' in-line with my rank in 'dname' dim? + // True when deltas in other dims are zero. + bool is_inline = true; + for (int dj = 0; dj < num_ddims; dj++) { + if (di != dj && rdeltas[dj] != 0) { + is_inline = false; + break; + } + } + + // Process ranks that are in-line in 'dname', including self. + if (is_inline) { + + // Accumulate total problem size in each dim for ranks that + // intersect with this rank, including myself. + overall_domain_sizes[dname] += rsizes[rn][di]; + + // Adjust my offset in the global problem by adding all domain + // sizes from prev ranks only. + if (rdeltas[di] < 0) + rank_domain_offsets[dname] += rsizes[rn][di]; + + // Make sure all the other dims are the same size. + // This ensures that all the ranks' domains line up + // properly along their edges and at their corners. 
+                        for (int dj = 0; dj < num_ddims; dj++) {
+                            if (di != dj) {
+                                auto mysz = rsizes[me][dj];
+                                auto rnsz = rsizes[rn][dj];
+                                if (mysz != rnsz) {
+                                    auto& dnamej = _opts->_rank_indices.getDimName(dj);
+                                    THROW_YASK_EXCEPTION("Error: rank " << rn << " and " << me <<
+                                                         " are both at rank-index " << coords[me][di] <<
+                                                         " in the '" << dname <<
+                                                         "' dimension, but their rank-domain sizes are " <<
+                                                         rnsz << " and " << mysz <<
+                                                         " (resp.) in the '" << dnamej <<
+                                                         "' dimension, making them unaligned");
+                                }
+                            }
+                        }
+                    }
+                }
+
+                // Rank rn is myself or my immediate neighbor if its distance <= 1 in
+                // every dim. Assume we do not need to exchange halos except
+                // with immediate neighbor. We validate this assumption below by
+                // making sure that the rank domain size is at least as big as the
+                // largest halo.
+                if (maxdist <= 1) {
+
+                    // At this point, rdeltas contains only -1..+1 for each domain dim.
+                    // Add one to -1..+1 to get 0..2 range for my_neighbors offsets.
+                    IdxTuple roffsets = rdeltas.addElements(1);
+                    assert(rdeltas.min() >= -1);
+                    assert(rdeltas.max() <= 1);
+                    assert(roffsets.min() >= 0);
+                    assert(roffsets.max() <= 2);
+
+                    // Convert the offsets into a 1D index.
+                    auto rn_ofs = _mpiInfo->getNeighborIndex(roffsets);
+                    TRACE_MSG("neighborhood size = " << _mpiInfo->neighborhood_sizes.makeDimValStr() <<
+                              " & roffsets of rank " << rn << " = " << roffsets.makeDimValStr() <<
+                              " => " << rn_ofs);
+                    assert(idx_t(rn_ofs) < _mpiInfo->neighborhood_size);
+
+                    // Save rank of this neighbor into the MPI info object.
+                    _mpiInfo->my_neighbors.at(rn_ofs) = rn;
+                    if (rn != me) {
+                        num_neighbors++;
+                        os << "Neighbor #" << num_neighbors << " is rank " << rn <<
+                            " at absolute rank indices " << rcoords.makeDimValStr() <<
+                            " (" << rdeltas.makeDimValOffsetStr() << " relative to rank " <<
+                            me << ")\n";
+                    }
+
+                    // Save manhattan dist.
+                    _mpiInfo->man_dists.at(rn_ofs) = mandist;
+
+                    // Loop through domain dims.
+ bool vlen_mults = true; + for (int di = 0; di < num_ddims; di++) { + auto& dname = _opts->_rank_indices.getDimName(di); + + // Does rn have all VLEN-multiple sizes? + auto rnsz = rsizes[rn][di]; + auto vlen = _dims->_fold_pts[di]; + if (rnsz % vlen != 0) { + TRACE_MSG("cannot use vector halo exchange with rank " << rn << + " because its size in '" << dname << "' is " << rnsz); + vlen_mults = false; + } + } + + // Save vec-mult flag. + _mpiInfo->has_all_vlen_mults.at(rn_ofs) = vlen_mults; + + } // self or immediate neighbor in any direction. + + } // ranks. + + // Set offsets in grids and find WF extensions + // based on the grids' halos. + update_grids(); + + // Determine bounding-boxes for all bundles. + // This must be done after finding WF extensions. + find_bounding_boxes(); + + } // setupRank. + + // Alloc 'nbytes' on each requested NUMA node. + // Map keys are preferred NUMA nodes or -1 for local. + // Pointers are returned in '_data_buf'. + // 'ngrids' and 'type' are only used for debug msg. + void StencilContext::_alloc_data(const map & nbytes, + const map & ngrids, + map >& data_buf, + const std::string& type) { + ostream& os = get_ostr(); + + for (const auto& i : nbytes) { + int numa_pref = i.first; + size_t nb = i.second; + size_t ng = ngrids.at(numa_pref); + + // Don't need pad after last one. + if (nb >= _data_buf_pad) + nb -= _data_buf_pad; + + // Allocate data. + os << "Allocating " << makeByteStr(nb) << + " for " << ng << " " << type << "(s)"; +#ifdef USE_NUMA + if (numa_pref >= 0) + os << " preferring NUMA node " << numa_pref; + else + os << " using NUMA policy " << numa_pref; +#endif + os << "...\n" << flush; + auto p = shared_numa_alloc(nb, numa_pref); + TRACE_MSG("Got memory at " << static_cast(p.get())); + + // Save using original key. + data_buf[numa_pref] = p; + } + } + + // Allocate memory for grids that do not already have storage. + void StencilContext::allocGridData(ostream& os) { + + // Base ptrs for all default-alloc'd data. 
+ // These pointers will be shared by the ones in the grid + // objects, which will take over ownership when these go + // out of scope. + // Key is preferred numa node or -1 for local. + map > _grid_data_buf; + + // Pass 0: count required size for each NUMA node, allocate chunk of memory at end. + // Pass 1: distribute parts of already-allocated memory chunk. + for (int pass = 0; pass < 2; pass++) { + TRACE_MSG("allocGridData pass " << pass << " for " << + gridPtrs.size() << " grid(s)"); + + // Count bytes needed and number of grids for each NUMA node. + map npbytes, ngrids; + + // Grids. + for (auto gp : gridPtrs) { + if (!gp) + continue; + auto& gname = gp->get_name(); + + // Grid data. + // Don't alloc if already done. + if (!gp->is_storage_allocated()) { + int numa_pref = gp->get_numa_preferred(); + + // Set storage if buffer has been allocated in pass 0. + if (pass == 1) { + auto p = _grid_data_buf[numa_pref]; + assert(p); + gp->set_storage(p, npbytes[numa_pref]); + os << gp->make_info_string() << endl; + } + + // Determine padded size (also offset to next location). + size_t nbytes = gp->get_num_storage_bytes(); + npbytes[numa_pref] += ROUND_UP(nbytes + _data_buf_pad, + CACHELINE_BYTES); + ngrids[numa_pref]++; + if (pass == 0) + TRACE_MSG(" grid '" << gname << "' needs " << makeByteStr(nbytes) << + " on NUMA node " << numa_pref); + } + } + + // Alloc for each node. + if (pass == 0) + _alloc_data(npbytes, ngrids, _grid_data_buf, "grid"); + + } // grid passes. + }; + + // Create MPI and allocate buffers. + void StencilContext::allocMpiData(ostream& os) { + + // Remove any old MPI data. + freeMpiData(os); + +#ifdef USE_MPI + + int num_exchanges = 0; + auto me = _env->my_rank; + + // Need to determine the size and shape of all MPI buffers. + // Visit all neighbors of this rank. + _mpiInfo->visitNeighbors + ([&](const IdxTuple& neigh_offsets, int neigh_rank, int neigh_idx) { + if (neigh_rank == MPI_PROC_NULL) + return; // from lambda fn. 
+ + // Determine max dist needed. TODO: determine max dist + // automatically from stencils; may not be same for all + // grids. +#ifndef MAX_EXCH_DIST +#define MAX_EXCH_DIST (NUM_STENCIL_DIMS - 1) +#endif + // Always use max dist with WF. + // TODO: determine if this is overkill. + int maxdist = MAX_EXCH_DIST; + if (num_wf_shifts > 0) + maxdist = NUM_STENCIL_DIMS - 1; + + // Manhattan dist. + int mandist = _mpiInfo->man_dists.at(neigh_idx); + + // Check distance. + // TODO: calculate and use exch dist for each grid. + if (mandist > maxdist) { + TRACE_MSG("no halo exchange needed with rank " << neigh_rank << + " because L1-norm = " << mandist); + return; // from lambda fn. + } + + // Determine size of MPI buffers between neigh_rank and my rank + // for each grid and create those that are needed. + for (auto gp : gridPtrs) { + if (!gp) + continue; + auto& gname = gp->get_name(); + + // Lookup first & last domain indices and calc exchange sizes + // for this grid. + bool found_delta = false; + IdxTuple my_halo_sizes, neigh_halo_sizes; + IdxTuple first_inner_idx, last_inner_idx; + IdxTuple first_outer_idx, last_outer_idx; + for (auto& dim : _dims->_domain_dims.getDims()) { + auto& dname = dim.getName(); + if (gp->is_dim_used(dname)) { + + // Get domain indices for this grid. + // If there are no more ranks in the given direction, extend + // the index into the outer halo to make sure all data are sync'd. + // This is critical for WFs. + idx_t fidx = gp->get_first_rank_domain_index(dname); + idx_t lidx = gp->get_last_rank_domain_index(dname); + first_inner_idx.addDimBack(dname, fidx); + last_inner_idx.addDimBack(dname, lidx); + if (_opts->is_first_rank(dname)) + fidx -= gp->get_left_halo_size(dname); + if (_opts->is_last_rank(dname)) + lidx += gp->get_right_halo_size(dname); + first_outer_idx.addDimBack(dname, fidx); + last_outer_idx.addDimBack(dname, lidx); + + // Determine size of exchange. This will be the actual halo size + // plus any wave-front extensions. 
In the current implementation, + // we need the wave-front extensions regardless of whether there + // is a halo on a given grid. This is because each stencil-bundle + // gets shifted by the WF angles at each step in the WF. + + // Neighbor is to the left. + if (neigh_offsets[dname] == MPIInfo::rank_prev) { + auto ext = left_wf_exts[dname]; + + // my halo. + auto halo_size = gp->get_left_halo_size(dname); + halo_size += ext; + my_halo_sizes.addDimBack(dname, halo_size); + + // neighbor halo. + halo_size = gp->get_right_halo_size(dname); // their right is on my left. + halo_size += ext; + neigh_halo_sizes.addDimBack(dname, halo_size); + } + + // Neighbor is to the right. + else if (neigh_offsets[dname] == MPIInfo::rank_next) { + auto ext = right_wf_exts[dname]; + + // my halo. + auto halo_size = gp->get_right_halo_size(dname); + halo_size += ext; + my_halo_sizes.addDimBack(dname, halo_size); + + // neighbor halo. + halo_size = gp->get_left_halo_size(dname); // their left is on my right. + halo_size += ext; + neigh_halo_sizes.addDimBack(dname, halo_size); + } + + // Neighbor in-line. + else { + my_halo_sizes.addDimBack(dname, 0); + neigh_halo_sizes.addDimBack(dname, 0); + } + + // Vectorized exchange allowed based on domain sizes? + // Both my rank and neighbor rank must have all domain sizes + // of vector multiples. + bool vec_ok = allow_vec_exchange && + _mpiInfo->has_all_vlen_mults[_mpiInfo->my_neighbor_index] && + _mpiInfo->has_all_vlen_mults[neigh_idx]; + + // Round up halo sizes if vectorized exchanges allowed. + // TODO: add a heuristic to avoid increasing by a large factor. + if (vec_ok) { + auto vec_size = _dims->_fold_pts[dname]; + my_halo_sizes.setVal(dname, ROUND_UP(my_halo_sizes[dname], vec_size)); + neigh_halo_sizes.setVal(dname, ROUND_UP(neigh_halo_sizes[dname], vec_size)); + } + + // Is this neighbor before or after me in this domain direction? + if (neigh_offsets[dname] != MPIInfo::rank_self) + found_delta = true; + } + } + + // Is buffer needed? 
+ // Example: if this grid is 2D in y-z, but only neighbors are in + // x-dim, we don't need any exchange. + if (!found_delta) { + TRACE_MSG("no halo exchange needed for grid '" << gname << + "' with rank " << neigh_rank << + " because the neighbor is not in a direction" + " corresponding to a grid dim"); + continue; // to next grid. + } + + // Make a buffer in both directions (send & receive). + for (int bd = 0; bd < MPIBufs::nBufDirs; bd++) { + + // Begin/end vars to indicate what part + // of main grid to read from or write to based on + // the current neighbor being processed. + IdxTuple copy_begin = gp->get_allocs(); + IdxTuple copy_end = gp->get_allocs(); + + // Adjust along domain dims in this grid. + for (auto& dim : _dims->_domain_dims.getDims()) { + auto& dname = dim.getName(); + if (gp->is_dim_used(dname)) { + + // Init range to whole rank domain (including + // outer halos). These may be changed below + // depending on the neighbor's direction. + copy_begin[dname] = first_outer_idx[dname]; + copy_end[dname] = last_outer_idx[dname] + 1; // end = last + 1. + + // Neighbor direction in this dim. + auto neigh_ofs = neigh_offsets[dname]; + + // Region to read from, i.e., data from inside + // this rank's domain to be put into neighbor's + // halo. + if (bd == MPIBufs::bufSend) { + + // Neighbor is to the left. + if (neigh_ofs == idx_t(MPIInfo::rank_prev)) { + + // Only read slice as wide as halo from beginning. + copy_end[dname] = first_inner_idx[dname] + neigh_halo_sizes[dname]; + } + + // Neighbor is to the right. + else if (neigh_ofs == idx_t(MPIInfo::rank_next)) { + + // Only read slice as wide as halo before end. + copy_begin[dname] = last_inner_idx[dname] + 1 - neigh_halo_sizes[dname]; + } + + // Else, this neighbor is in same posn as I am in this dim, + // so we leave the default begin/end settings. + } + + // Region to write to, i.e., into this rank's halo. + else if (bd == MPIBufs::bufRecv) { + + // Neighbor is to the left. 
+                            if (neigh_ofs == idx_t(MPIInfo::rank_prev)) {
+
+                                // Only read slice as wide as halo before beginning.
+                                copy_begin[dname] = first_inner_idx[dname] - my_halo_sizes[dname];
+                                copy_end[dname] = first_inner_idx[dname];
+                            }
+
+                            // Neighbor is to the right.
+                            else if (neigh_ofs == idx_t(MPIInfo::rank_next)) {
+
+                                // Only read slice as wide as halo after end.
+                                copy_begin[dname] = last_inner_idx[dname] + 1;
+                                copy_end[dname] = last_inner_idx[dname] + 1 + my_halo_sizes[dname];
+                            }
+
+                            // Else, this neighbor is in same posn as I am in this dim,
+                            // so we leave the default begin/end settings.
+                        }
+                    } // domain dims in this grid.
+                } // domain dims.
+
+                // Sizes of buffer in all dims of this grid.
+                // Also, set begin/end value for non-domain dims.
+                IdxTuple buf_sizes = gp->get_allocs();
+                bool vlen_mults = true;
+                for (auto& dname : gp->get_dim_names()) {
+                    idx_t dsize = 1;
+
+                    // domain dim?
+                    if (_dims->_domain_dims.lookup(dname)) {
+                        dsize = copy_end[dname] - copy_begin[dname];
+
+                        // Check whether size is multiple of vlen.
+                        auto vlen = _dims->_fold_pts[dname];
+                        if (dsize % vlen != 0)
+                            vlen_mults = false;
+                    }
+
+                    // step dim?
+                    // Allowing only one step to be exchanged.
+                    // TODO: consider exchanging multiple steps at once for WFs.
+                    else if (dname == _dims->_step_dim) {
+
+                        // Use 0..1 as a place-holder range.
+                        // The actual values will be supplied during
+                        // halo exchange.
+                        copy_begin[dname] = 0;
+                        copy_end[dname] = 1;
+                    }
+
+                    // misc?
+                    // Copy over entire range.
+                    // TODO: make dirty flags for misc dims in grids.
+                    else {
+                        dsize = gp->get_alloc_size(dname);
+                        copy_begin[dname] = gp->get_first_misc_index(dname);
+                        copy_end[dname] = gp->get_last_misc_index(dname) + 1;
+                    }
+
+                    // Save computed size.
+                    buf_sizes[dname] = dsize;
+
+                } // all dims in this grid.
+
+                // Does buffer have non-zero size?
+ if (buf_sizes.size() == 0 || buf_sizes.product() == 0) { + TRACE_MSG("no halo exchange needed for grid '" << gname << + "' with rank " << neigh_rank << + " because there is no data to exchange"); + continue; + } + + // At this point, buf_sizes, copy_begin, and copy_end + // should be set for each dim in this grid. + // Convert end to last. + IdxTuple copy_last = copy_end.subElements(1); + + // Unique name for buffer based on grid name, direction, and ranks. + ostringstream oss; + oss << gname; + if (bd == MPIBufs::bufSend) + oss << "_send_halo_from_" << me << "_to_" << neigh_rank; + else if (bd == MPIBufs::bufRecv) + oss << "_recv_halo_from_" << neigh_rank << "_to_" << me; + string bufname = oss.str(); + + // Make MPI data entry for this grid. + auto gbp = mpiData.emplace(gname, _mpiInfo); + auto& gbi = gbp.first; // iterator from pair returned by emplace(). + auto& gbv = gbi->second; // value from iterator. + auto& buf = gbv.getBuf(MPIBufs::BufDir(bd), neigh_offsets); + + // Config buffer for this grid. + // (But don't allocate storage yet.) + buf.begin_pt = copy_begin; + buf.last_pt = copy_last; + buf.num_pts = buf_sizes; + buf.name = bufname; + buf.has_all_vlen_mults = vlen_mults; + + TRACE_MSG("configured MPI buffer object '" << buf.name << + "' for rank at relative offsets " << + neigh_offsets.subElements(1).makeDimValStr() << " with " << + buf.num_pts.makeDimValStr(" * ") << " = " << buf.get_size() << + " element(s) at " << buf.begin_pt.makeDimValStr() << + " ... " << buf.last_pt.makeDimValStr()); + num_exchanges++; + + } // send, recv. + } // grids. + }); // neighbors. + TRACE_MSG("number of halo-exchanges needed on this rank: " << num_exchanges); + + // Base ptrs for all alloc'd data. + // These pointers will be shared by the ones in the grid + // objects, which will take over ownership when these go + // out of scope. + map > _mpi_data_buf; + + // Allocate MPI buffers. + // Pass 0: count required size, allocate chunk of memory at end. 
+ // Pass 1: distribute parts of already-allocated memory chunk. + for (int pass = 0; pass < 2; pass++) { + TRACE_MSG("allocMpiData pass " << pass << " for " << + mpiData.size() << " MPI buffer set(s)"); + + // Count bytes needed and number of buffers for each NUMA node. + map npbytes, nbufs; + + // Grids. + for (auto gp : gridPtrs) { + if (!gp) + continue; + auto& gname = gp->get_name(); + int numa_pref = gp->get_numa_preferred(); + + // MPI bufs for this grid. + if (mpiData.count(gname)) { + auto& grid_mpi_data = mpiData.at(gname); + + // Visit buffers for each neighbor for this grid. + grid_mpi_data.visitNeighbors + ([&](const IdxTuple& roffsets, + int rank, + int idx, + MPIBufs& bufs) { + + // Send and recv. + for (int bd = 0; bd < MPIBufs::nBufDirs; bd++) { + auto& buf = grid_mpi_data.getBuf(MPIBufs::BufDir(bd), roffsets); + if (buf.get_size() == 0) + continue; + + // Set storage if buffer has been allocated in pass 0. + if (pass == 1) { + auto p = _mpi_data_buf[numa_pref]; + assert(p); + buf.set_storage(p, npbytes[numa_pref]); + } + + // Determine padded size (also offset to next location). + auto sbytes = buf.get_bytes(); + npbytes[numa_pref] += ROUND_UP(sbytes + _data_buf_pad, + CACHELINE_BYTES); + nbufs[numa_pref]++; + if (pass == 0) + TRACE_MSG(" MPI buf '" << buf.name << "' needs " << + makeByteStr(sbytes) << + " on NUMA node " << numa_pref); + } + } ); + } + } + + // Alloc for each node. + if (pass == 0) + _alloc_data(npbytes, nbufs, _mpi_data_buf, "MPI buffer"); + + } // MPI passes. +#endif + } + + // Allocate memory for scratch grids based on number of threads and + // block sizes. + void StencilContext::allocScratchData(ostream& os) { + + // Remove any old scratch data. + freeScratchData(os); + + // Base ptrs for all alloc'd data. + // This pointer will be shared by the ones in the grid + // objects, which will take over ownership when it goes + // out of scope. 
+ map > _scratch_data_buf; + + // Make sure the right number of threads are set so we + // have the right number of scratch grids. + int rthreads = set_region_threads(); + + // Delete any existing scratch grids. + // Create new scratch grids. + makeScratchGrids(rthreads); + + // Pass 0: count required size, allocate chunk of memory at end. + // Pass 1: distribute parts of already-allocated memory chunk. + for (int pass = 0; pass < 2; pass++) { + TRACE_MSG("allocScratchData pass " << pass << " for " << + scratchVecs.size() << " set(s) of scratch grids"); + + // Count bytes needed and number of grids for each NUMA node. + map npbytes, ngrids; + + // Loop through each scratch grid vector. + for (auto* sgv : scratchVecs) { + assert(sgv); + + // Loop through each scratch grid in this vector. + // There will be one for each region thread. + assert(int(sgv->size()) == rthreads); + int thr_num = 0; + for (auto gp : *sgv) { + assert(gp); + auto& gname = gp->get_name(); + int numa_pref = gp->get_numa_preferred(); + + // Loop through each domain dim. + for (auto& dim : _dims->_domain_dims.getDims()) { + auto& dname = dim.getName(); + + if (gp->is_dim_used(dname)) { + + // Set domain size of grid to block size. + gp->_set_domain_size(dname, _opts->_block_sizes[dname]); + + // Pads. + // Set via both 'extra' and 'min'; larger result will be used. + gp->set_extra_pad_size(dname, _opts->_extra_pad_sizes[dname]); + gp->set_min_pad_size(dname, _opts->_min_pad_sizes[dname]); + } + } // dims. + + // Set storage if buffer has been allocated. + if (pass == 1) { + auto p = _scratch_data_buf[numa_pref]; + assert(p); + gp->set_storage(p, npbytes[numa_pref]); + TRACE_MSG(gp->make_info_string()); + } + + // Determine size used (also offset to next location). 
+ size_t nbytes = gp->get_num_storage_bytes(); + npbytes[numa_pref] += ROUND_UP(nbytes + _data_buf_pad, + CACHELINE_BYTES); + ngrids[numa_pref]++; + if (pass == 0) + TRACE_MSG(" scratch grid '" << gname << "' for thread " << + thr_num << " needs " << makeByteStr(nbytes) << + " on NUMA node " << numa_pref); + thr_num++; + } // scratch grids. + } // scratch-grid vecs. + + // Alloc for each node. + if (pass == 0) + _alloc_data(npbytes, ngrids, _scratch_data_buf, "scratch grid"); + + } // scratch-grid passes. + } + + // Set non-scratch grid sizes and offsets based on settings. + // Set wave-front settings. + // This should be called anytime a setting or rank offset is changed. + void StencilContext::update_grids() + { + assert(_opts); + + // Reset halos to zero. + max_halos = _dims->_domain_dims; + + // Loop through each non-scratch grid. + for (auto gp : gridPtrs) { + assert(gp); + + // Ignore manually-sized grid. + if (gp->is_fixed_size()) + continue; + + // Loop through each domain dim. + for (auto& dim : _dims->_domain_dims.getDims()) { + auto& dname = dim.getName(); + + if (gp->is_dim_used(dname)) { + + // Rank domains. + gp->_set_domain_size(dname, _opts->_rank_sizes[dname]); + + // Pads. + // Set via both 'extra' and 'min'; larger result will be used. + gp->set_extra_pad_size(dname, _opts->_extra_pad_sizes[dname]); + gp->set_min_pad_size(dname, _opts->_min_pad_sizes[dname]); + + // Offsets. + gp->_set_offset(dname, rank_domain_offsets[dname]); + + // Update max halo across grids, used for wavefront angles. + max_halos[dname] = max(max_halos[dname], gp->get_left_halo_size(dname)); + max_halos[dname] = max(max_halos[dname], gp->get_right_halo_size(dname)); + } + } + } // grids. + + // Calculate wave-front settings based on max halos. + // See the wavefront diagram in run_solution() for description + // of angles and extensions. 
+ auto& step_dim = _dims->_step_dim; + auto wf_steps = _opts->_region_sizes[step_dim]; + num_wf_shifts = 0; + if (wf_steps > 1) + + // TODO: don't shift for scratch grids. + num_wf_shifts = max((idx_t(stBundles.size()) * wf_steps) - 1, idx_t(0)); + for (auto& dim : _dims->_domain_dims.getDims()) { + auto& dname = dim.getName(); + auto rksize = _opts->_rank_sizes[dname]; + auto nranks = _opts->_num_ranks[dname]; + + // Determine the max spatial skewing angles for temporal + // wave-fronts based on the max halos. We only need non-zero + // angles if the region size is less than the rank size and + // there are no other ranks in this dim, i.e., if the region + // covers the global domain in a given dim, no wave-front is + // needed in that dim. TODO: make rounding-up an option. + idx_t angle = 0; + if (_opts->_region_sizes[dname] < rksize || nranks > 0) + angle = ROUND_UP(max_halos[dname], _dims->_cluster_pts[dname]); + wf_angles[dname] = angle; + + // Determine the total WF shift to be added in each dim. + idx_t shifts = angle * num_wf_shifts; + wf_shifts[dname] = shifts; + + // Is domain size at least as large as halo + wf_ext in direction + // when there are multiple ranks? + auto min_size = max_halos[dname] + shifts; + if (_opts->_num_ranks[dname] > 1 && rksize < min_size) { + THROW_YASK_EXCEPTION("Error: rank-domain size of " << rksize << " in '" << + dname << "' dim is less than minimum size of " << min_size << + ", which is based on stencil halos and temporal wave-front sizes"); + } + + // If there is another rank to the left, set wave-front + // extension on the left. + left_wf_exts[dname] = _opts->is_first_rank(dname) ? 0 : shifts; + + // If there is another rank to the right, set wave-front + // extension on the right. + right_wf_exts[dname] = _opts->is_last_rank(dname) ? 0 : shifts; + } + + // Now that wave-front settings are known, we can push this info + // back to the grids. 
It's useful to store this redundant info + // in the grids, because there it's indexed by grid dims instead + // of domain dims. This makes it faster to do grid indexing. + for (auto gp : gridPtrs) { + assert(gp); + + // Ignore manually-sized grid. + if (gp->is_fixed_size()) + continue; + + // Loop through each domain dim. + for (auto& dim : _dims->_domain_dims.getDims()) { + auto& dname = dim.getName(); + if (gp->is_dim_used(dname)) { + + // Set extensions to be the same as the global ones. + gp->_set_left_wf_ext(dname, left_wf_exts[dname]); + gp->_set_right_wf_ext(dname, right_wf_exts[dname]); + } + } + } + } + + // Allocate grids and MPI bufs. + // Initialize some data structures. + void StencilContext::prepare_solution() { + auto& step_dim = _dims->_step_dim; + + // Don't continue until all ranks are this far. + _env->global_barrier(); + + ostream& os = get_ostr(); +#ifdef DEBUG + os << "*** WARNING: YASK compiled with DEBUG; ignore performance results.\n"; +#endif +#if defined(NO_INTRINSICS) && (VLEN > 1) + os << "*** WARNING: YASK compiled with NO_INTRINSICS; ignore performance results.\n"; +#endif +#ifdef MODEL_CACHE + os << "*** WARNING: YASK compiled with MODEL_CACHE; ignore performance results.\n"; +#endif +#ifdef TRACE_MEM + os << "*** WARNING: YASK compiled with TRACE_MEM; ignore performance results.\n"; +#endif +#ifdef TRACE_INTRINSICS + os << "*** WARNING: YASK compiled with TRACE_INTRINSICS; ignore performance results.\n"; +#endif + + // reset time keepers. + clear_timers(); + + // Init auto-tuner to run silently during normal operation. + _at.clear(false, false); + + // Adjust all settings before setting MPI buffers or sizing grids. + // Prints final settings. + // TODO: print settings again after auto-tuning. + _opts->adjustSettings(os, _env); + + // Report ranks. + os << endl; + os << "Num ranks: " << _env->get_num_ranks() << endl; + os << "This rank index: " << _env->get_rank_index() << endl; + + // report threads. 
+ os << "Num OpenMP procs: " << omp_get_num_procs() << endl; + set_all_threads(); + os << "Num OpenMP threads: " << omp_get_max_threads() << endl; + set_region_threads(); // Temporary; just for reporting. + os << " Num threads per region: " << omp_get_max_threads() << endl; + set_block_threads(); // Temporary; just for reporting. + os << " Num threads per block: " << omp_get_max_threads() << endl; + + // Set the number of threads for a region. It should stay this + // way for top-level OpenMP parallel sections. + int rthreads = set_region_threads(); + + // Run a dummy nested OMP loop to make sure nested threading is + // initialized. +#ifdef _OPENMP +#pragma omp parallel for + for (int i = 0; i < rthreads * 100; i++) { + + idx_t dummy = 0; + set_block_threads(); +#pragma omp parallel for reduction(+:dummy) + for (int j = 0; j < i * 100; j++) { + dummy += j; + } + } +#endif + + // Some grid stats. + os << endl; + os << "Num grids: " << gridPtrs.size() << endl; + os << "Num grids to be updated: " << outputGridPtrs.size() << endl; + + // Set up data based on MPI rank, including grid positions. + // Update all the grid sizes. + setupRank(); + + // Alloc grids, scratch grids, MPI bufs. + // This is the order in which preferred NUMA nodes (e.g., HBW mem) + // will be used. + // We free the scratch and MPI data first to give grids preference. + freeScratchData(os); + freeMpiData(os); + allocGridData(os); + allocScratchData(os); + allocMpiData(os); + + // Report total allocation. + rank_nbytes = get_num_bytes(); + os << "Total allocation in this rank: " << + makeByteStr(rank_nbytes) << "\n"; + tot_nbytes = sumOverRanks(rank_nbytes, _env->comm); + os << "Total overall allocation in " << _env->num_ranks << " rank(s): " << + makeByteStr(tot_nbytes) << "\n"; + + // Report some stats. 
+ idx_t dt = _opts->_rank_sizes[step_dim]; + os << "\nProblem sizes in points (from smallest to largest):\n" + " vector-size: " << _dims->_fold_pts.makeDimValStr(" * ") << endl << + " cluster-size: " << _dims->_cluster_pts.makeDimValStr(" * ") << endl << + " sub-block-size: " << _opts->_sub_block_sizes.makeDimValStr(" * ") << endl << + " sub-block-group-size: " << _opts->_sub_block_group_sizes.makeDimValStr(" * ") << endl << + " block-size: " << _opts->_block_sizes.makeDimValStr(" * ") << endl << + " block-group-size: " << _opts->_block_group_sizes.makeDimValStr(" * ") << endl << + " region-size: " << _opts->_region_sizes.makeDimValStr(" * ") << endl << + " rank-domain-size: " << _opts->_rank_sizes.makeDimValStr(" * ") << endl << + " overall-problem-size: " << overall_domain_sizes.makeDimValStr(" * ") << endl << + endl << + "Other settings:\n" + " yask-version: " << yask_get_version_string() << endl << + " stencil-name: " << get_name() << endl << + " element-size: " << makeByteStr(get_element_bytes()) << endl << +#ifdef USE_MPI + " num-ranks: " << _opts->_num_ranks.makeDimValStr(" * ") << endl << + " rank-indices: " << _opts->_rank_indices.makeDimValStr() << endl << + " rank-domain-offsets: " << rank_domain_offsets.makeDimValOffsetStr() << endl << +#endif + " rank-domain: " << rank_bb.bb_begin.makeDimValStr() << + " ... 
" << rank_bb.bb_end.subElements(1).makeDimValStr() << endl << + " vector-len: " << VLEN << endl << + " extra-padding: " << _opts->_extra_pad_sizes.makeDimValStr() << endl << + " minimum-padding: " << _opts->_min_pad_sizes.makeDimValStr() << endl << + " L1-prefetch-distance: " << PFD_L1 << endl << + " L2-prefetch-distance: " << PFD_L2 << endl << + " max-halos: " << max_halos.makeDimValStr() << endl; + if (num_wf_shifts > 0) { + os << + " wave-front-angles: " << wf_angles.makeDimValStr() << endl << + " num-wave-front-shifts: " << num_wf_shifts << endl << + " wave-front-shift-lens: " << wf_shifts.makeDimValStr() << endl << + " left-wave-front-exts: " << left_wf_exts.makeDimValStr() << endl << + " right-wave-front-exts: " << right_wf_exts.makeDimValStr() << endl << + " ext-rank-domain: " << ext_bb.bb_begin.makeDimValStr() << + " ... " << ext_bb.bb_end.subElements(1).makeDimValStr() << endl; + } + os << endl; + + // sums across bundles for this rank. + rank_numWrites_1t = 0; + rank_reads_1t = 0; + rank_numFpOps_1t = 0; + os << "Num stencil bundles: " << stBundles.size() << endl; + for (auto* sg : stBundles) { + idx_t updates1 = sg->get_scalar_points_written(); + idx_t updates_domain = updates1 * sg->bb_num_points; + rank_numWrites_1t += updates_domain; + idx_t reads1 = sg->get_scalar_points_read(); + idx_t reads_domain = reads1 * sg->bb_num_points; + rank_reads_1t += reads_domain; + idx_t fpops1 = sg->get_scalar_fp_ops(); + idx_t fpops_domain = fpops1 * sg->bb_num_points; + rank_numFpOps_1t += fpops_domain; + os << "Stats for bundle '" << sg->get_name() << "':\n" << + " sub-domain: " << sg->bb_begin.makeDimValStr() << + " ... 
" << sg->bb_end.subElements(1).makeDimValStr() << endl << + " sub-domain size: " << sg->bb_len.makeDimValStr(" * ") << endl << + " valid points in sub domain: " << makeNumStr(sg->bb_num_points) << endl << + " grid-updates per point: " << updates1 << endl << + " grid-updates in sub-domain: " << makeNumStr(updates_domain) << endl << + " grid-reads per point: " << reads1 << endl << + " grid-reads in sub-domain: " << makeNumStr(reads_domain) << endl << + " est FP-ops per point: " << fpops1 << endl << + " est FP-ops in sub-domain: " << makeNumStr(fpops_domain) << endl; + } + + // Various metrics for amount of work. + rank_numWrites_dt = rank_numWrites_1t * dt; + tot_numWrites_1t = sumOverRanks(rank_numWrites_1t, _env->comm); + tot_numWrites_dt = tot_numWrites_1t * dt; + + rank_reads_dt = rank_reads_1t * dt; + tot_reads_1t = sumOverRanks(rank_reads_1t, _env->comm); + tot_reads_dt = tot_reads_1t * dt; + + rank_numFpOps_dt = rank_numFpOps_1t * dt; + tot_numFpOps_1t = sumOverRanks(rank_numFpOps_1t, _env->comm); + tot_numFpOps_dt = tot_numFpOps_1t * dt; + + rank_domain_1t = rank_bb.bb_num_points; + rank_domain_dt = rank_domain_1t * dt; // same as _opts->_rank_sizes.product(); + tot_domain_1t = sumOverRanks(rank_domain_1t, _env->comm); + tot_domain_dt = tot_domain_1t * dt; + + // Print some more stats. 
+ os << endl << + "Amount-of-work stats:\n" << + " domain-size in this rank for one time-step: " << + makeNumStr(rank_domain_1t) << endl << + " overall-problem-size in all ranks for one time-step: " << + makeNumStr(tot_domain_1t) << endl << + endl << + " num-writes-required in this rank for one time-step: " << + makeNumStr(rank_numWrites_1t) << endl << + " num-writes-required in all ranks for one time-step: " << + makeNumStr(tot_numWrites_1t) << endl << + endl << + " num-reads-required in this rank for one time-step: " << + makeNumStr(rank_reads_1t) << endl << + " num-reads-required in all ranks for one time-step: " << + makeNumStr(tot_reads_1t) << endl << + endl << + " est-FP-ops in this rank for one time-step: " << + makeNumStr(rank_numFpOps_1t) << endl << + " est-FP-ops in all ranks for one time-step: " << + makeNumStr(tot_numFpOps_1t) << endl << + endl; + + if (dt > 1) { + os << + " domain-size in this rank for all time-steps: " << + makeNumStr(rank_domain_dt) << endl << + " overall-problem-size in all ranks for all time-steps: " << + makeNumStr(tot_domain_dt) << endl << + endl << + " num-writes-required in this rank for all time-steps: " << + makeNumStr(rank_numWrites_dt) << endl << + " num-writes-required in all ranks for all time-steps: " << + makeNumStr(tot_numWrites_dt) << endl << + endl << + " num-reads-required in this rank for all time-steps: " << + makeNumStr(rank_reads_dt) << endl << + " num-reads-required in all ranks for all time-steps: " << + makeNumStr(tot_reads_dt) << endl << + endl << + " est-FP-ops in this rank for all time-steps: " << + makeNumStr(rank_numFpOps_dt) << endl << + " est-FP-ops in all ranks for all time-steps: " << + makeNumStr(tot_numFpOps_dt) << endl << + endl; + } + os << + "Notes:\n" + " Domain-sizes and overall-problem-sizes are based on rank-domain sizes\n" + " and number of ranks regardless of number of grids or sub-domains.\n" + " Num-writes-required is based on sum of grid-updates in sub-domain across 
stencil-bundle(s).\n" + " Num-reads-required is based on sum of grid-reads in sub-domain across stencil-bundle(s).\n" + " Est-FP-ops are based on sum of est-FP-ops in sub-domain across stencil-bundle(s).\n" + "\n"; + } + + // Dealloc grids, etc. + void StencilContext::end_solution() { + + // Final halo exchange. + exchange_halos_all(); + + // Release any MPI data. + mpiData.clear(); + + // Release grid data. + for (auto gp : gridPtrs) { + if (!gp) + continue; + gp->release_storage(); + } + + // Reset threads to original value. + set_max_threads(); + } + + // Init all grids & params by calling initFn. + void StencilContext::initValues(function realInitFn) { + ostream& os = get_ostr(); + real_t v = 0.1; + os << "Initializing grids..." << endl; + for (auto gp : gridPtrs) { + realInitFn(gp, v); + v += 0.01; + } + } + + // Compute convenience values for a bounding-box. + void BoundingBox::update_bb(ostream& os, + const string& name, + StencilContext& context, + bool force_full) { + + auto dims = context.get_dims(); + auto& domain_dims = dims->_domain_dims; + bb_len = bb_end.subElements(bb_begin); + bb_size = bb_len.product(); + if (force_full) + bb_num_points = bb_size; + + // Solid rectangle? + bb_is_full = true; + if (bb_num_points != bb_size) { + os << "Warning: '" << name << "' domain has only " << + makeNumStr(bb_num_points) << + " valid point(s) inside its bounding-box of " << + makeNumStr(bb_size) << + " point(s); slower scalar calculations will be used.\n"; + bb_is_full = false; + } + + // Does everything start on a vector-length boundary? 
+ bb_is_aligned = true; + for (auto& dim : domain_dims.getDims()) { + auto& dname = dim.getName(); + if ((bb_begin[dname] - context.rank_domain_offsets[dname]) % + dims->_fold_pts[dname] != 0) { + os << "Note: '" << name << "' domain" + " has one or more starting edges not on vector boundaries;" + " masked calculations will be used in peel and remainder sub-blocks.\n"; + bb_is_aligned = false; + break; + } + } + + // Lengths are cluster-length multiples? + bb_is_cluster_mult = true; + for (auto& dim : domain_dims.getDims()) { + auto& dname = dim.getName(); + if (bb_len[dname] % dims->_cluster_pts[dname] != 0) { + if (bb_is_full && bb_is_aligned) + os << "Note: '" << name << "' domain" + " has one or more sizes that are not vector-cluster multiples;" + " masked calculations will be used in peel and remainder sub-blocks.\n"; + bb_is_cluster_mult = false; + break; + } + } + + // All done. + bb_valid = true; + } + + // Set the bounding-box for each stencil-bundle and whole domain. + void StencilContext::find_bounding_boxes() + { + ostream& os = get_ostr(); + + // Rank BB is based only on rank offsets and rank domain sizes. + rank_bb.bb_begin = rank_domain_offsets; + rank_bb.bb_end = rank_domain_offsets.addElements(_opts->_rank_sizes, false); + rank_bb.update_bb(os, "rank", *this, true); + + // Overall BB may be extended for wave-fronts. + ext_bb.bb_begin = rank_bb.bb_begin.subElements(left_wf_exts); + ext_bb.bb_end = rank_bb.bb_end.addElements(right_wf_exts); + ext_bb.update_bb(os, "extended-rank", *this, true); + + // Find BB for each bundle. + for (auto sg : stBundles) + sg->find_bounding_box(); + } + +} // namespace yask. diff --git a/src/kernel/lib/yask.hpp b/src/kernel/lib/yask.hpp index ec2ef99b..06c08636 100644 --- a/src/kernel/lib/yask.hpp +++ b/src/kernel/lib/yask.hpp @@ -140,6 +140,11 @@ inline void omp_set_nested(int n) { } #include "yask_stencil_code.hpp" #undef DEFINE_MACROS +// Max number of dims allowed in grids. 
+#ifndef MAX_DIMS +#define MAX_DIMS NUM_STENCIL_DIMS +#endif + // Default cmd-line arguments. #ifndef DEF_ARGS #define DEF_ARGS "" From 102a27393f93d55e941872b691c7953ed31dca9e Mon Sep 17 00:00:00 2001 From: Chuck Yount Date: Wed, 25 Apr 2018 13:43:23 -0700 Subject: [PATCH 09/21] Change DEBUG macro to CHECK. Turn on by default at -O0. This turns on lots of assertions in the kernel code. Add py-kernel-api target. --- Makefile | 3 +++ src/kernel/Makefile | 15 +++++++++++---- src/kernel/lib/generic_grids.hpp | 4 ++-- src/kernel/lib/realv.hpp | 4 ++-- src/kernel/lib/setup.cpp | 4 ++-- src/kernel/lib/stencil_calc.cpp | 4 ++-- src/kernel/lib/yask.hpp | 4 ++-- 7 files changed, 24 insertions(+), 14 deletions(-) diff --git a/Makefile b/Makefile index 5b12591e..f334a7bc 100644 --- a/Makefile +++ b/Makefile @@ -138,6 +138,9 @@ compiler-api: kernel-api: $(YK_MAKE) api +py-kernel-api: + $(YK_MAKE) py-api + api: $(YC_MAKE) $@ $(YK_MAKE) $@ diff --git a/src/kernel/Makefile b/src/kernel/Makefile index 3d5dccdf..d1e6c502 100644 --- a/src/kernel/Makefile +++ b/src/kernel/Makefile @@ -398,6 +398,11 @@ ifneq ($(filter -O0 -O1,$(YK_CXXOPT)),) pfd_l2 = 0 endif +# Turn on checking at O0. +ifneq ($(filter -O0,$(YK_CXXOPT)),) + MACROS += CHECK +endif + # Set MACROS based on individual makefile vars. # MACROS and EXTRA_MACROS will be written to a header file. MACROS += PFD_L1=$(pfd_l1) PFD_L2=$(pfd_l2) @@ -639,6 +644,8 @@ headers: $(YK_GEN_HEADERS) # Build C++ and Python kernel API libs. api: $(YK_LIB) $(YK_PY_LIB) +py-api: $(YK_PY_LIB) + # Build python kernel API lib. # TODO: consider adding $(YK_TAG) to [some of] these targets. 
$(YK_SWIG_DIR)/yask_kernel_api_wrap.cpp: $(YK_SWIG_DIR)/yask*.i $(INC_DIR)/*.hpp @@ -873,12 +880,12 @@ help: @echo " $(MAKE) clean; $(MAKE) -j arch=skl stencil=awp yk-api" @echo " " @echo "Example debug builds of kernel cmd-line tool:" - @echo " $(MAKE) clean; $(MAKE) -j stencil=iso3dfd mpi=0 OMPFLAGS='-qopenmp-stubs' YK_CXXOPT='-O0' EXTRA_MACROS='DEBUG'" - @echo " $(MAKE) clean; $(MAKE) -j arch=intel64 stencil=3axis mpi=0 OMPFLAGS='-qopenmp-stubs' YK_CXXOPT='-O0' EXTRA_MACROS='DEBUG TRACE' # TRACE is a useful debug setting!" - @echo " $(MAKE) clean; $(MAKE) -j arch=intel64 stencil=3axis radius=0 fold='x=1,y=1,z=1' mpi=0 YK_CXX=g++ OMPFLAGS='' YK_CXXOPT='-O0' EXTRA_MACROS='DEBUG TRACE TRACE_MEM TRACE_INTRINSICS'" + @echo " $(MAKE) clean; $(MAKE) -j stencil=iso3dfd mpi=0 OMPFLAGS='-qopenmp-stubs' YK_CXXOPT='-O0' EXTRA_MACROS='CHECK'" + @echo " $(MAKE) clean; $(MAKE) -j arch=intel64 stencil=3axis mpi=0 OMPFLAGS='-qopenmp-stubs' YK_CXXOPT='-O0' EXTRA_MACROS='CHECK TRACE' # TRACE is a useful debug setting!" + @echo " $(MAKE) clean; $(MAKE) -j arch=intel64 stencil=3axis radius=0 fold='x=1,y=1,z=1' mpi=0 YK_CXX=g++ OMPFLAGS='' YK_CXXOPT='-O0' EXTRA_MACROS='CHECK TRACE TRACE_MEM TRACE_INTRINSICS'" @echo " " @echo "Example builds with test runs:" @echo " $(MAKE) -j all" @echo " $(MAKE) -j all ranks=2" @echo " $(MAKE) -j all YK_CXX=g++ YK_CXXOPT=-O2 mpi=0" - @echo " $(MAKE) -j all YK_CXX=mpigxx YK_CXXOPT=-O2 ranks=3 EXTRA_MACROS='DEBUG'" + @echo " $(MAKE) -j all YK_CXX=mpigxx YK_CXXOPT=-O2 ranks=3 EXTRA_MACROS='CHECK'" diff --git a/src/kernel/lib/generic_grids.hpp b/src/kernel/lib/generic_grids.hpp index 7833a1d2..fd57fc5d 100644 --- a/src/kernel/lib/generic_grids.hpp +++ b/src/kernel/lib/generic_grids.hpp @@ -341,7 +341,7 @@ namespace yask { // Get 1D index using layout. 
virtual idx_t get_index(const Indices& idxs, bool check=true) const final { -#ifdef DEBUG +#ifdef CHECK if (check) { for (int i = 0; i < this->_dims.size(); i++) { idx_t j = idxs[i]; @@ -351,7 +351,7 @@ namespace yask { } #endif idx_t ai = _layout.layout(idxs); -#ifdef DEBUG +#ifdef CHECK if (check) assert(ai < this->get_num_elems()); #endif diff --git a/src/kernel/lib/realv.hpp b/src/kernel/lib/realv.hpp index a1b81bad..427660ab 100644 --- a/src/kernel/lib/realv.hpp +++ b/src/kernel/lib/realv.hpp @@ -99,7 +99,7 @@ namespace yask { #undef VEC_ELEMS // Macro for looping through an aligned real_vec_t. -#if defined(DEBUG) || (VLEN==1) || !defined(__INTEL_COMPILER) +#if defined(CHECK) || (VLEN==1) || !defined(__INTEL_COMPILER) #define REAL_VEC_LOOP(i) \ for (int i=0; iglobal_barrier(); ostream& os = get_ostr(); -#ifdef DEBUG - os << "*** WARNING: YASK compiled with DEBUG; ignore performance results.\n"; +#ifdef CHECK + os << "*** WARNING: YASK compiled with CHECK; ignore performance results.\n"; #endif #if defined(NO_INTRINSICS) && (VLEN > 1) os << "*** WARNING: YASK compiled with NO_INTRINSICS; ignore performance results.\n"; diff --git a/src/kernel/lib/stencil_calc.cpp b/src/kernel/lib/stencil_calc.cpp index c3c6b287..f6d2bf5d 100644 --- a/src/kernel/lib/stencil_calc.cpp +++ b/src/kernel/lib/stencil_calc.cpp @@ -515,7 +515,7 @@ namespace yask { loop_idxs.start.makeValStr(nsdims) << " ... (end before) " << loop_idxs.stop.makeValStr(nsdims)); -#ifdef DEBUG +#ifdef CHECK // Check that only the inner dim has a range greater than one cluster. for (int i = 0, j = 0; i < nsdims; i++) { if (i != step_posn) { @@ -553,7 +553,7 @@ namespace yask { " ... (end before) " << loop_idxs.stop.makeValStr(nsdims) << " w/write-mask = 0x" << hex << write_mask << dec); -#ifdef DEBUG +#ifdef CHECK // Check that only the inner dim has a range greater than one vector. 
for (int i = 0; i < nsdims; i++) { if (i != step_posn && i != _inner_posn) diff --git a/src/kernel/lib/yask.hpp b/src/kernel/lib/yask.hpp index 06c08636..abaa3dfa 100644 --- a/src/kernel/lib/yask.hpp +++ b/src/kernel/lib/yask.hpp @@ -43,9 +43,9 @@ typedef std::uint64_t uidx_t; // Settings from makefile. #include "yask_macros.hpp" -// Control assert() by turning on with DEBUG instead of turning off with +// Control assert() by turning on with CHECK instead of turning off with // NDEBUG. This makes it off by default. -#ifndef DEBUG +#ifndef CHECK #define NDEBUG #endif From fd20e788f900fb1112393d37bb54c06f3b90b1f2 Mon Sep 17 00:00:00 2001 From: Chuck Yount Date: Thu, 26 Apr 2018 11:53:29 -0700 Subject: [PATCH 10/21] Calculate max grid dims correctly. Needed when grid dims > stencil dims. --- src/compiler/lib/YaskKernel.cpp | 10 ++++++- src/kernel/Makefile | 49 +++++++++++++++++++-------------- 2 files changed, 37 insertions(+), 22 deletions(-) diff --git a/src/compiler/lib/YaskKernel.cpp b/src/compiler/lib/YaskKernel.cpp index f26151a0..b0aa9b83 100644 --- a/src/compiler/lib/YaskKernel.cpp +++ b/src/compiler/lib/YaskKernel.cpp @@ -92,7 +92,15 @@ namespace yask { os << "\n// Number of stencil dimensions (step and domain):\n" "#define NUM_STENCIL_DIMS " << _dims->_stencilDims.size() << endl; - + + int gdims = 0; + for (auto gp : _grids) { + int ndims = gp->get_num_dims(); + gdims = max(gdims, ndims); + } + os << "\n// Max number of grid dimensions:\n" + "#define NUM_GRID_DIMS " << gdims << endl; + // Vec/cluster lengths. 
auto nvec = _dims->_foldGT1.getNumDims(); os << "\n// One vector fold: " << _dims->_fold.makeDimValStr(" * ") << endl; diff --git a/src/kernel/Makefile b/src/kernel/Makefile index d1e6c502..6caeb23d 100644 --- a/src/kernel/Makefile +++ b/src/kernel/Makefile @@ -299,7 +299,8 @@ YK_PY_MOD := $(YASK_DIR)/$(YK_MODULE).py YK_API_TEST_EXEC := $(BIN_DIR)/$(YK_BASE)_api_test.exe YK_GRID_TEST_EXEC := $(BIN_DIR)/$(YK_BASE)_grid_test.exe YK_API_TEST_EXEC_WITH_EXCEPTION := $(BIN_DIR)/$(YK_BASE)_api_exception_test.exe -YK_DIMS_FILE := num_dims.$(stencil).txt +YK_STENCIL_DIMS_FILE := num_stencil_dims.$(stencil).txt +YK_GRID_DIMS_FILE := num_grid_dims.$(stencil).txt MAKE_REPORT_FILE:= make-report.$(YK_TAG).txt @@ -479,8 +480,9 @@ endif # Add in final flags and user-added flags. YK_CXXFLAGS += $(YK_CXXOPT) $(OMPFLAGS) $(EXTRA_YK_CXXFLAGS) -# Number of dims extracted from YASK compiler output. -NDIMS := `cat $(YK_DIMS_FILE)` +# Number of stencil/grid dims extracted from YASK compiler output. +NSDIMS := `cat $(YK_STENCIL_DIMS_FILE)` +NGDIMS := `cat $(YK_GRID_DIMS_FILE)` ######## Loop-compiler configuration: # The loop indices range from 0..N-1. @@ -497,7 +499,7 @@ NDIMS := `cat $(YK_DIMS_FILE)` # indices. Those that do not (e.g., grouped, serpentine, square-wave) may # *not* be used here when using temporal wavefronts. The time loop may be # found in StencilEquations::run_solution(). -RANK_LOOP_OPTS ?= -ndims $(NDIMS) -inVar rank_idxs +RANK_LOOP_OPTS ?= -ndims $(NSDIMS) -inVar rank_idxs RANK_LOOP_ORDER ?= 1 .. N-1 RANK_LOOP_CODE ?= $(RANK_LOOP_OUTER_MODS) loop($(RANK_LOOP_ORDER)) \ { $(RANK_LOOP_INNER_MODS) call(calc_region(stBundle_ptr)); } @@ -507,7 +509,7 @@ RANK_LOOP_CODE ?= $(RANK_LOOP_OUTER_MODS) loop($(RANK_LOOP_ORDER)) \ # to a top-level OpenMP thread. The region time loops are not coded here to # allow for proper spatial skewing for temporal wavefronts. The time loop # may be found in StencilEquations::calc_region(). 
-REGION_LOOP_OPTS ?= -ndims $(NDIMS) -inVar region_idxs \ +REGION_LOOP_OPTS ?= -ndims $(NSDIMS) -inVar region_idxs \ -ompConstruct '$(omp_par_for) schedule($(omp_region_schedule)) proc_bind(spread)' \ -callPrefix 'sg->' REGION_LOOP_OUTER_MODS ?= grouped omp @@ -519,7 +521,7 @@ REGION_LOOP_CODE ?= $(REGION_LOOP_OUTER_MODS) loop($(REGION_LOOP_ORDER)) { \ # a *nested* OpenMP loop so that each sub-block is assigned to a nested OpenMP # thread. There is no time loop because threaded temporal blocking is # not yet supported. -BLOCK_LOOP_OPTS ?= -ndims $(NDIMS) -inVar block_idxs \ +BLOCK_LOOP_OPTS ?= -ndims $(NSDIMS) -inVar block_idxs \ -ompConstruct '$(omp_par_for) schedule($(omp_block_schedule)) proc_bind(close)' \ -callPrefix 'sg->' BLOCK_LOOP_OUTER_MODS ?= grouped omp @@ -532,7 +534,7 @@ BLOCK_LOOP_CODE ?= $(BLOCK_LOOP_OUTER_MODS) loop($(BLOCK_LOOP_ORDER)) { \ # stencil compiler. There is no time loop because threaded temporal # blocking is not yet supported. The indexes in this loop are 'normalized', # i.e., vector units and rank-relative. -SUB_BLOCK_LOOP_OPTS ?= -ndims $(NDIMS) -inVar norm_sub_block_idxs +SUB_BLOCK_LOOP_OPTS ?= -ndims $(NSDIMS) -inVar norm_sub_block_idxs SUB_BLOCK_LOOP_OUTER_MODS ?= SUB_BLOCK_LOOP_ORDER ?= 1 .. N-2 SUB_BLOCK_LOOP_CODE ?= $(SUB_BLOCK_LOOP_OUTER_MODS) loop($(SUB_BLOCK_LOOP_ORDER)) { \ @@ -541,7 +543,7 @@ SUB_BLOCK_LOOP_CODE ?= $(SUB_BLOCK_LOOP_OUTER_MODS) loop($(SUB_BLOCK_LOOP_ORDER # General-purpose parallel loop. # Nested OpenMP is not used here because there is no sharing between threads. # TODO: Consider using nested OpenMP to hide more latency. -MISC_LOOP_OPTS ?= -ndims $(NDIMS) -inVar misc_idxs \ +MISC_LOOP_OPTS ?= -ndims $(NSDIMS) -inVar misc_idxs \ -ompConstruct '$(omp_par_for) schedule($(omp_misc_schedule)) proc_bind(spread)' MISC_LOOP_OUTER_MODS ?= omp MISC_LOOP_ORDER ?= 1 .. N-1 @@ -577,49 +579,54 @@ $(MAKE_REPORT_FILE): $(YK_LIB) #$(MAKE) code-stats | tee -a $@ # Generated source files. 
-$(YK_GEN_DIR)/yask_rank_loops.hpp: $(GEN_LOOPS) $(YK_DIMS_FILE) +$(YK_GEN_DIR)/yask_rank_loops.hpp: $(GEN_LOOPS) $(YK_STENCIL_DIMS_FILE) $(YK_MK_GEN_DIR) $(PERL) $< -output $@ $(RANK_LOOP_OPTS) $(EXTRA_LOOP_OPTS) $(EXTRA_RANK_LOOP_OPTS) "$(RANK_LOOP_CODE)" -$(YK_GEN_DIR)/yask_region_loops.hpp: $(GEN_LOOPS) $(YK_DIMS_FILE) +$(YK_GEN_DIR)/yask_region_loops.hpp: $(GEN_LOOPS) $(YK_STENCIL_DIMS_FILE) $(YK_MK_GEN_DIR) $(PERL) $< -output $@ $(REGION_LOOP_OPTS) $(EXTRA_LOOP_OPTS) $(EXTRA_REGION_LOOP_OPTS) "$(REGION_LOOP_CODE)" -$(YK_GEN_DIR)/yask_block_loops.hpp: $(GEN_LOOPS) $(YK_DIMS_FILE) +$(YK_GEN_DIR)/yask_block_loops.hpp: $(GEN_LOOPS) $(YK_STENCIL_DIMS_FILE) $(YK_MK_GEN_DIR) $(PERL) $< -output $@ $(BLOCK_LOOP_OPTS) $(EXTRA_LOOP_OPTS) $(EXTRA_BLOCK_LOOP_OPTS) "$(BLOCK_LOOP_CODE)" -$(YK_GEN_DIR)/yask_sub_block_loops.hpp: $(GEN_LOOPS) $(YK_DIMS_FILE) +$(YK_GEN_DIR)/yask_sub_block_loops.hpp: $(GEN_LOOPS) $(YK_STENCIL_DIMS_FILE) $(YK_MK_GEN_DIR) $(PERL) $< -output $@ $(SUB_BLOCK_LOOP_OPTS) $(EXTRA_LOOP_OPTS) $(EXTRA_SUB_BLOCK_LOOP_OPTS) "$(SUB_BLOCK_LOOP_CODE)" -$(YK_GEN_DIR)/yask_misc_loops.hpp: $(GEN_LOOPS) $(YK_DIMS_FILE) +$(YK_GEN_DIR)/yask_misc_loops.hpp: $(GEN_LOOPS) $(YK_STENCIL_DIMS_FILE) $(YK_MK_GEN_DIR) $< -output $@ $(MISC_LOOP_OPTS) $(EXTRA_LOOP_OPTS) $(EXTRA_MISC_LOOP_OPTS) "$(MISC_LOOP_CODE)" -$(YK_GEN_DIR)/yask_layout_macros.hpp: $(GEN_LAYOUTS) $(YK_DIMS_FILE) +$(YK_GEN_DIR)/yask_layout_macros.hpp: $(GEN_LAYOUTS) $(YK_GRID_DIMS_FILE) $(YK_MK_GEN_DIR) - $(PERL) $< -m $(NDIMS) > $@ + $(PERL) $< -m $(NGDIMS) > $@ @- gindent -fca $@ || \ indent -fca $@ || \ echo "note:" $@ "is not properly indented because indent program failed or was not found." 
-$(YK_GEN_DIR)/yask_layouts.hpp: $(GEN_LAYOUTS) $(YK_DIMS_FILE) +$(YK_GEN_DIR)/yask_layouts.hpp: $(GEN_LAYOUTS) $(YK_GRID_DIMS_FILE) $(YK_MK_GEN_DIR) - $(PERL) $< -d $(NDIMS) > $@ + $(PERL) $< -d $(NGDIMS) > $@ @- gindent -fca $@ || \ indent -fca $@ || \ echo "note:" $@ "is not properly indented because indent program failed or was not found." -$(YK_GEN_DIR)/yask_grid_code.hpp: $(GEN_LAYOUTS) $(YK_DIMS_FILE) +$(YK_GEN_DIR)/yask_grid_code.hpp: $(GEN_LAYOUTS) $(YK_GRID_DIMS_FILE) $(YK_MK_GEN_DIR) - $(PERL) $< -g $(NDIMS) > $@ + $(PERL) $< -g $(NGDIMS) > $@ # Extract the number of stencil dims from the compiler output. # Use this to create an option to pass to the loop generator script. -$(YK_DIMS_FILE): $(YK_CODE_FILE) +$(YK_STENCIL_DIMS_FILE): $(YK_CODE_FILE) awk '/NUM_STENCIL_DIMS/ {print $$NF}' $< > $@ +# Extract the number of grid dims from the compiler output. +# Use this to create an option to pass to the layout generator script. +$(YK_GRID_DIMS_FILE): $(YK_CODE_FILE) + awk '/NUM_GRID_DIMS/ {print $$NF}' $< > $@ + $(YK_CODE_FILE): $(YC_EXEC) $(YK_MK_GEN_DIR) $(RUN_PREFIX) $< $(YC_FLAGS) $(EXTRA_YC_FLAGS) -p $(YC_TARGET) $@ @@ -781,7 +788,7 @@ all: # Make this target before rebuilding YASK with any new parameters. clean: rm -fv *.s - rm -fv num_dims.*.txt + rm -fv num_*dims.*.txt rm -fr $(YK_SWIG_DIR)/build $(YK_GEN_DIR) rm -fv $(YK_SWIG_DIR)/*_api_wrap.* rm -fv $(YK_OBJS) From 0be535ff8414e0fdfaf1d3b573297c40ae4bee40 Mon Sep 17 00:00:00 2001 From: Chuck Yount Date: Thu, 26 Apr 2018 14:46:26 -0700 Subject: [PATCH 11/21] Reorganize tests. 
--- Makefile | 44 +++++++++++++++----------------------------- src/kernel/Makefile | 32 +++++++++++++++----------------- 2 files changed, 30 insertions(+), 46 deletions(-) diff --git a/Makefile b/Makefile index f334a7bc..6c357999 100644 --- a/Makefile +++ b/Makefile @@ -196,39 +196,21 @@ py-yc-api-and-cxx-yk-api-test: $(YK_MAKE) py-yc-api-test $(YK_MAKE) cxx-yk-api-test -# Run C++ compiler API test with exception, then run C++ kernel API test with exception. -cxx-yc-api-and-cxx-yk-api-test-with-exception: - $(YK_MAKE) cxx-yc-api-test-with-exception - $(YK_MAKE) cxx-yk-api-test-with-exception - -# Run python compiler API test with exception, then run python kernel API test with exception. -py-yc-api-and-py-yk-api-test-with-exception: - $(YK_MAKE) py-yc-api-test-with-exception - $(YK_MAKE) py-yk-api-test-with-exception - -# Run C++ compiler API test with exception, then run python kernel API test with exception. -cxx-yc-api-and-py-yk-api-test-with-exception: - $(YK_MAKE) cxx-yc-api-test-with-exception - $(YK_MAKE) py-yk-api-test-with-exception - -# Run python compiler API test with exception, then run C++ kernel API test with exception. -py-yc-api-and-cxx-yk-api-test-with-exception: - $(YK_MAKE) py-yc-api-test-with-exception - $(YK_MAKE) cxx-yk-api-test-with-exception - -api-tests: - $(MAKE) yc-and-cxx-yk-api-test - $(MAKE) yc-and-py-yk-api-test +# Run 8 out of 9 combos of (built-in, C++, Python)^2 +# API tests. The 9th one is built-in with built-in, +# which is tested more extensively in the kernel tests. +# When the built-in stencil examples aren't being used, +# "stencil=test" in the commands below is simply used to +# create file names. 
+combo-api-tests: + $(MAKE) stencil=iso3dfd yc-and-cxx-yk-api-test + $(MAKE) stencil=iso3dfd yc-and-py-yk-api-test $(MAKE) stencil=test cxx-yc-api-and-yk-test $(MAKE) stencil=test py-yc-api-and-yk-test $(MAKE) stencil=test cxx-yc-api-and-cxx-yk-api-test $(MAKE) stencil=test py-yc-api-and-py-yk-api-test $(MAKE) stencil=test cxx-yc-api-and-py-yk-api-test $(MAKE) stencil=test py-yc-api-and-cxx-yk-api-test - $(MAKE) stencil=test cxx-yc-api-and-cxx-yk-api-test-with-exception - $(MAKE) stencil=test py-yc-api-and-py-yk-api-test-with-exception - $(MAKE) stencil=test cxx-yc-api-and-py-yk-api-test-with-exception - $(MAKE) stencil=test py-yc-api-and-cxx-yk-api-test-with-exception ######## Misc targets @@ -248,10 +230,14 @@ tuple-test: $(TUPLE_TEST_EXEC) @echo '*** Running the C++ YASK tuple test...' $(RUN_PREFIX) $< -all-tests: compiler +api-tests: compiler-api + $(MAKE) combo-api-tests + $(YK_MAKE) $@ + +all-tests: compiler-api $(MAKE) tuple-test + $(MAKE) combo-api-tests $(YK_MAKE) $@ - $(MAKE) api-tests docs: api-docs diff --git a/src/kernel/Makefile b/src/kernel/Makefile index 6caeb23d..06301110 100644 --- a/src/kernel/Makefile +++ b/src/kernel/Makefile @@ -723,18 +723,6 @@ cxx-yc-api-test: $(YK_MK_GEN_DIR) mv $(YC_SRC_DIR)/yc-api-test-cxx.hpp $(YK_CODE_FILE) -# Run Python compiler API test with exceptions to create stencil-code file. -py-yc-api-test-with-exception: - $(MAKE) -C $(YC_SRC_DIR) $@ - $(YK_MK_GEN_DIR) - mv $(YC_SRC_DIR)/yc-api-test-with-exception-py.hpp $(YK_CODE_FILE) - -# Run C++ compiler API test with exceptions to create stencil-code file. -cxx-yc-api-test-with-exception: - $(MAKE) -C $(YC_SRC_DIR) $@ - $(YK_MK_GEN_DIR) - mv $(YC_SRC_DIR)/yc-api-test-with-exception-cxx.hpp $(YK_CODE_FILE) - ######## Misc targets # Run the default YASK compiler and kernel. 
@@ -755,15 +743,20 @@ kernel-only: yk-test-no-yc: kernel-only $(BIN_DIR)/yask.sh -stencil $(stencil) -arch $(arch) -ranks $(ranks) -v $(v_args) +# Run the kernel API tests for C++ and Python with and w/o expected exceptions. +api-tests: + $(MAKE) clean; $(MAKE) cxx-yk-api-test real_bytes=8 stencil=iso3dfd + $(MAKE) clean; $(MAKE) py-yk-api-test stencil=iso3dfd + $(MAKE) clean; $(MAKE) cxx-yk-api-test-with-exception real_bytes=8 stencil=iso3dfd + $(MAKE) clean; $(MAKE) py-yk-api-test-with-exception stencil=iso3dfd + +# Run several stencils using built-in validation. # NB: set arch var if applicable. # NB: save some time by using YK_CXXOPT=-O2. # These tests are focused on the kernel and not the compiler. # For testing both the kernel and compiler in various combinations, # run the tests from the top-level Makefile. -all-tests: - $(MAKE) clean; $(MAKE) cxx-yk-grid-test stencil=test_3d fold=x=4,y=2 - $(MAKE) clean; $(MAKE) cxx-yk-api-test real_bytes=8 stencil=iso3dfd - $(MAKE) clean; $(MAKE) py-yk-api-test stencil=iso3dfd +stencil-tests: $(MAKE) clean; $(MAKE) yc-and-yk-test real_bytes=8 stencil=test_1d $(MAKE) clean; $(MAKE) yc-and-yk-test real_bytes=8 stencil=3axis fold=x=2,y=2 $(MAKE) clean; $(MAKE) yc-and-yk-test real_bytes=8 stencil=9axis fold=x=2,z=2 @@ -778,6 +771,11 @@ all-tests: $(MAKE) clean; $(MAKE) yc-and-yk-test real_bytes=8 stencil=fsg_abc $(MAKE) clean; $(MAKE) yc-and-yk-test real_bytes=8 stencil=fsg2 +all-tests: + $(MAKE) clean; $(MAKE) cxx-yk-grid-test stencil=test_3d fold=x=4,y=2 + $(MAKE) api-tests + $(MAKE) stencil-tests + all: $(MAKE) kernel $(MAKE) api @@ -895,4 +893,4 @@ help: @echo " $(MAKE) -j all" @echo " $(MAKE) -j all ranks=2" @echo " $(MAKE) -j all YK_CXX=g++ YK_CXXOPT=-O2 mpi=0" - @echo " $(MAKE) -j all YK_CXX=mpigxx YK_CXXOPT=-O2 ranks=3 EXTRA_MACROS='CHECK'" + @echo " $(MAKE) -j all YK_CXX=mpigxx YK_CXXOPT=-O1 ranks=3 EXTRA_MACROS='CHECK'" From 777259da0e9dedb7137eac6dfbf56347711e489a Mon Sep 17 00:00:00 2001 From: Chuck Yount Date: 
Thu, 26 Apr 2018 14:48:23 -0700 Subject: [PATCH 12/21] Add check for gcc version. --- src/compiler/lib/Expr.hpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/compiler/lib/Expr.hpp b/src/compiler/lib/Expr.hpp index 89023f95..6fd375ec 100644 --- a/src/compiler/lib/Expr.hpp +++ b/src/compiler/lib/Expr.hpp @@ -39,6 +39,14 @@ IN THE SOFTWARE. #include #include #include + +// Need g++ >= 4.9 for regex. +#define GCC_VERSION (__GNUC__ * 10000 \ + + __GNUC_MINOR__ * 100 \ + + __GNUC_PATCHLEVEL__) +#if GCC_VERSION < 40900 +#error G++ 4.9.0 or later is required +#endif #include // Common utilities. From a2078372ceb5abfab6ab3f4b444af19e93bec940 Mon Sep 17 00:00:00 2001 From: Chuck Yount Date: Fri, 27 Apr 2018 14:42:53 -0700 Subject: [PATCH 13/21] Fix alignment of blocks in ranks with non-aligned starting offsets. Fix padding and offsets of non-vectorized grids. --- bin/gen_loops.pl | 17 ++++++- src/kernel/lib/context.cpp | 55 ++++++++++++++++------- src/kernel/lib/context.hpp | 5 +-- src/kernel/lib/grid_apis.cpp | 1 + src/kernel/lib/realv_grids.cpp | 33 ++++++++++++++ src/kernel/lib/realv_grids.hpp | 19 +------- src/kernel/lib/settings.hpp | 34 +++++++++----- src/kernel/lib/stencil_calc.cpp | 80 +++++++++++++++++++++++++-------- 8 files changed, 174 insertions(+), 70 deletions(-) diff --git a/bin/gen_loops.pl b/bin/gen_loops.pl index ff62a008..50b75f40 100755 --- a/bin/gen_loops.pl +++ b/bin/gen_loops.pl @@ -92,6 +92,9 @@ sub stepVar { sub alignVar { return inVar("align", @_); } +sub alignOfsVar { + return inVar("align_ofs", @_); +} sub groupSizeVar { return inVar("group_size", @_); } @@ -201,6 +204,7 @@ ($$$) my $evar = endVar($dim); my $svar = stepVar($dim); my $avar = alignVar($dim); + my $aovar = alignOfsVar($dim); my $aavar = adjAlignVar($dim); my $abvar = alignBeginVar($dim); my $nvar = numItersVar($dim); @@ -208,11 +212,20 @@ ($$$) my $tsvar = groupSizeVar($dim); my $ntivar = numFullGroupItersVar($dim); + # Example alignment: + # bvar = 20. + # svar = 8. 
+ # avar = 4. + # aovar = 15. + # Then, + # aavar = min(4, 8) = 4. + # abvar = round_down_flr(20 - 15, 4) + 15 = 4 + 15 = 19. + push @$code, " // Alignment must be less than or equal to step size.", " const $itype $aavar = std::min($avar, $svar);", - " // Aligned beginning point. May be at or before $bvar.", - " const $itype $abvar = yask::round_down_flr($bvar, $aavar);", + " // Aligned beginning point such that ($bvar - $svar) < $abvar <= $bvar.", + " const $itype $abvar = yask::round_down_flr($bvar - $aovar, $aavar) + $aovar;", " // Number of iterations to get from $abvar to (but not including) $evar, stepping by $svar.". " This value is rounded up because the last iteration may cover fewer than $svar steps.", " const $itype $nvar = yask::ceil_idiv_flr($evar - $abvar, $svar);"; diff --git a/src/kernel/lib/context.cpp b/src/kernel/lib/context.cpp index 2a47ca33..668696d1 100644 --- a/src/kernel/lib/context.cpp +++ b/src/kernel/lib/context.cpp @@ -193,14 +193,14 @@ namespace yask { // Indices to loop through. // Init from begin & end tuples. - ScanIndices rank_idxs(*_dims, false); + ScanIndices rank_idxs(*_dims, false, &rank_domain_offsets); rank_idxs.begin = begin; rank_idxs.end = end; // Set offsets in scratch grids. // Requires scratch grids to be allocated for whole // rank instead of smaller grid size. - update_scratch_grids(scratch_grid_idx, rank_idxs); + update_scratch_grids(scratch_grid_idx, rank_idxs.begin); // Initial halo exchange. // (Needed in case there are 0 time-steps). @@ -376,7 +376,7 @@ namespace yask { } // Indices needed for the 'rank' loops. - ScanIndices rank_idxs(*_dims, true); + ScanIndices rank_idxs(*_dims, true, &rank_domain_offsets); rank_idxs.begin = begin; rank_idxs.end = end; rank_idxs.step = step; @@ -507,7 +507,7 @@ namespace yask { " ... (end before) " << rank_idxs.stop.makeValStr(ndims)); // Init region begin & end from rank start & stop indices. 
- ScanIndices region_idxs(*_dims, true); + ScanIndices region_idxs(*_dims, true, &rank_domain_offsets); region_idxs.initFromOuter(rank_idxs); // Make a copy of the original start and stop indices because @@ -967,12 +967,13 @@ namespace yask { } } - // Adjust offsets of scratch grids based - // on thread and scan indices. - // Each scratch-grid is assigned to a thread, so it must - // "move around" as the thread is assigned to each block. + // Adjust offsets of scratch grids based on thread number 'thread_idx' + // and beginning point of block 'idxs'. Each scratch-grid is assigned + // to a thread, so it must "move around" as the thread is assigned to + // each block. This move is accomplished by changing the grids' global + // and local offsets. void StencilContext::update_scratch_grids(int thread_idx, - const ScanIndices& idxs) { + const Indices& idxs) { auto dims = get_dims(); int nsdims = dims->_stencil_dims.size(); auto step_posn = Indices::step_posn; @@ -981,7 +982,7 @@ namespace yask { for (auto* sv : scratchVecs) { assert(sv); - // Get the one for this thread. + // Get ptr to the scratch grid for this thread. auto gp = sv->at(thread_idx); assert(gp); assert(gp->is_scratch()); @@ -996,16 +997,32 @@ namespace yask { int posn = gp->get_dim_posn(dname); if (posn >= 0) { + // | +------+ | + // | loc | | | + // | ofs | | | + // |<------>| | | + // | +------+ | + // ^ ^ + // | | + // | start of grid/0-idx of block + // first rank-domain index + // Set offset of grid based on starting point of block. - // This is global, so it will include the rank offset. - gp->_set_offset(posn, idxs.begin[i]); + // This is a global index, so it will include the rank offset. + gp->_set_offset(posn, idxs[i]); + // Local offset is the offset of this grid + // relative to the current rank. // Set local offset to diff between global offset - // and rank offset. Must be vec-multiple. + // and rank offset. 
auto rofs = rank_domain_offsets[j]; - auto lofs = idxs.begin[i] - rofs; + auto lofs = idxs[i] - rofs; gp->_set_local_offset(posn, lofs); - assert(imod_flr(lofs, dims->_fold_pts[j]) == 0); + + // For a vectorized grid, the local offset must + // be a vector multiple. This is necessary for + // vector and cluster operations to work properly. + assert(imod_flr(lofs, gp->_get_vec_lens(posn)) == 0); } j++; } @@ -1148,10 +1165,14 @@ namespace yask { auto sg_list = sg.get_scratch_deps(); sg_list.push_back(&sg); - // Loop through all the needed groups. + // Loop through all the needed bundles. for (auto* csg : sg_list) { - // Loop thru all *input* grids in this group. + TRACE_MSG("exchange_halos: checking " << csg->inputGridPtrs.size() << + " input grid(s) to bundle '" << csg->get_name() << + "' that is needed for bundle '" << sg.get_name() << "'"); + + // Loop thru all *input* grids in this bundle. for (auto gp : csg->inputGridPtrs) { // Don't swap scratch grids. diff --git a/src/kernel/lib/context.hpp b/src/kernel/lib/context.hpp index d156bf3d..418959fb 100644 --- a/src/kernel/lib/context.hpp +++ b/src/kernel/lib/context.hpp @@ -295,9 +295,6 @@ namespace yask { // Dump stats if get_stats() hasn't been called yet. if (steps_done) get_stats(); - - // Free mem, reset threads, etc. - end_solution(); } // Set debug output to cout if my_rank == msg_rank @@ -386,7 +383,7 @@ namespace yask { // Adjust offsets of scratch grids based // on thread and scan indices. virtual void update_scratch_grids(int thread_idx, - const ScanIndices& idxs); + const Indices& idxs); // Get total memory allocation required by grids. // Does not include MPI buffers. 
diff --git a/src/kernel/lib/grid_apis.cpp b/src/kernel/lib/grid_apis.cpp index d0e163ba..3b2a148d 100644 --- a/src/kernel/lib/grid_apis.cpp +++ b/src/kernel/lib/grid_apis.cpp @@ -66,6 +66,7 @@ namespace yask { GET_GRID_API(get_last_rank_alloc_index, _offsets[posn] - _left_pads[posn] + _allocs[posn] - 1, false, true, false, true) GET_GRID_API(_get_left_wf_ext, _left_wf_exts[posn], true, true, true, false) GET_GRID_API(_get_right_wf_ext, _right_wf_exts[posn], true, true, true, false) + GET_GRID_API(_get_vec_lens, _vec_lens[posn], true, true, true, true) GET_GRID_API(_get_offset, _offsets[posn], true, true, true, true) GET_GRID_API(_get_local_offset, _local_offsets[posn], true, true, true, false) GET_GRID_API(_get_first_alloc_index, _offsets[posn] - _left_pads[posn], true, true, true, true) diff --git a/src/kernel/lib/realv_grids.cpp b/src/kernel/lib/realv_grids.cpp index 51ba3086..43726f97 100644 --- a/src/kernel/lib/realv_grids.cpp +++ b/src/kernel/lib/realv_grids.cpp @@ -109,6 +109,36 @@ namespace yask { return posn; } + // Determine required padding from halos. + // Does not include user-specified min padding or + // final rounding for left pad. + Indices YkGridBase::getReqdPad(const Indices& halos, const Indices& wf_exts) const { + + // Start with halos plus WF exts. + Indices mp = halos.addElements(wf_exts); + + + // For scratch grids, halo area must be written to. Halo is sum + // of dependent's write halo and depender's read halo, but these + // two components are not stored individually. Write halo will + // be expanded to full vec len during computation, requiring + // load from read halo beyond full vec len. Worst case is when + // write halo is one and rest is read halo. So if there is a + // halo and/or wf-ext, padding should be that plus all but one + // element of a vector. In addition, this vec-len should be the + // global one, not the one for this grid to handle the case where + // this grid is not vectorized. 
+ for (int i = 0; i < get_num_dims(); i++) { + if (mp[i] >= 1) { + auto& dname = get_dim_name(i); + auto* p = _dims->_domain_dims.lookup(dname); + if (p) + mp[i] += *p - 1; + } + } + return mp; + } + // Resizes the underlying generic grid. // Modifies _pads and _allocs. // Fails if mem different and already alloc'd. @@ -151,6 +181,9 @@ namespace yask { left_pads2[i] = ROUND_UP(left_pads2[i], _vec_lens[i]); _left_pads[i] = left_pads2[i]; _vec_left_pads[i] = left_pads2[i] / _vec_lens[i]; + + // For the right pad, we will round it up below when + // we calculate alloc. } // New allocation in each dim. diff --git a/src/kernel/lib/realv_grids.hpp b/src/kernel/lib/realv_grids.hpp index 8c185a8b..929376a6 100644 --- a/src/kernel/lib/realv_grids.hpp +++ b/src/kernel/lib/realv_grids.hpp @@ -92,23 +92,7 @@ namespace yask { // Determine required padding from halos. // Does not include user-specified min padding or // final rounding for left pad. - virtual Indices getReqdPad(const Indices& halos, const Indices& wf_exts) const { - Indices mp = halos.addElements(wf_exts); - for (int i = 0; i < get_num_dims(); i++) { - - // For scratch grids, halo area must be written to. Halo is sum - // of dependent's write halo and dependency's read halo, but - // these two components are not stored individually. Write halo - // will be expanded to full vec len during computation, - // requiring load from read halo beyond full vec len. Worst - // case is when write halo is one and rest is read halo. So if - // there is a halo and/or wf-ext, padding should be that plus - // all but one element of a vector. - if (mp[i] >= 1) - mp[i] += _vec_lens[i] - 1; - } - return mp; - } + virtual Indices getReqdPad(const Indices& halos, const Indices& wf_exts) const; // Check whether dim exists and is of allowed type. 
virtual void checkDimType(const std::string& dim, @@ -382,6 +366,7 @@ namespace yask { GET_GRID_API(_get_last_alloc_index) GET_GRID_API(_get_left_wf_ext) GET_GRID_API(_get_right_wf_ext) + GET_GRID_API(_get_vec_lens) SET_GRID_API(_set_domain_size) SET_GRID_API(_set_left_pad_size) SET_GRID_API(_set_right_pad_size) diff --git a/src/kernel/lib/settings.hpp b/src/kernel/lib/settings.hpp index f9d305e6..5545de7e 100644 --- a/src/kernel/lib/settings.hpp +++ b/src/kernel/lib/settings.hpp @@ -460,8 +460,12 @@ namespace yask { Indices begin, end; // first and end (beyond last) range of each index. Indices step; // step value within range. Indices align; // alignment of steps after first one. + Indices align_ofs; // adjustment for alignment (see below). Indices group_size; // proximity grouping within range. + // Alignment: when possible, each step will be aligned + // such that ((start - align_ofs) % align) == 0. + // Values that differ for each sub-range. Indices start, stop; // first and last+1 for this sub-range. Indices index; // 0-based unique index for each sub-range. @@ -475,26 +479,30 @@ namespace yask { // start stop (index = 2) // Default init. - ScanIndices(const Dims& dims, bool use_vec_align) : + ScanIndices(const Dims& dims, bool use_vec_align, IdxTuple* ofs) : ndims(dims._stencil_dims.size()), begin(idx_t(0), ndims), end(idx_t(0), ndims), step(idx_t(1), ndims), align(idx_t(1), ndims), + align_ofs(idx_t(0), ndims), group_size(idx_t(1), ndims), start(idx_t(0), ndims), stop(idx_t(0), ndims), index(idx_t(0), ndims) { - // Set alignment to vector lengths. - if (use_vec_align) { - - // i: index for stencil dims, j: index for domain dims. - for (int i = 0, j = 0; i < ndims; i++) { - if (i != Indices::step_posn) { - align[i] = dims._fold_pts[j]; - j++; - } + // i: index for stencil dims, j: index for domain dims. + for (int i = 0, j = 0; i < ndims; i++) { + if (i == Indices::step_posn) continue; + + // Set alignment to vector lengths. 
+ if (use_vec_align) + align[i] = dims._fold_pts[j]; + + // Set alignment offset. + if (ofs) { + assert(ofs->getNumDims() == ndims - 1); + align_ofs[i] = ofs->getVal(j); } } } @@ -508,7 +516,11 @@ namespace yask { begin = outer.start; end = outer.stop; - // Pass output values through by default. + // Pass other values through by default. + step = outer.step; + align = outer.align; + align_ofs = outer.align_ofs; + group_size = outer.group_size; start = outer.start; stop = outer.stop; index = outer.index; diff --git a/src/kernel/lib/stencil_calc.cpp b/src/kernel/lib/stencil_calc.cpp index f6d2bf5d..1f26ee92 100644 --- a/src/kernel/lib/stencil_calc.cpp +++ b/src/kernel/lib/stencil_calc.cpp @@ -36,18 +36,19 @@ namespace yask { auto opts = _generic_context->get_settings(); auto dims = _generic_context->get_dims(); - int ndims = dims->_stencil_dims.size(); + int nsdims = dims->_stencil_dims.size(); auto& step_dim = dims->_step_dim; + auto step_posn = Indices::step_posn; int thread_idx = omp_get_thread_num(); // used to index the scratch grids. TRACE_MSG3("calc_block:" << " in non-scratch bundle '" << get_name() << "': " << - region_idxs.start.makeValStr(ndims) << - " ... (end before) " << region_idxs.stop.makeValStr(ndims) << + region_idxs.start.makeValStr(nsdims) << + " ... (end before) " << region_idxs.stop.makeValStr(nsdims) << " by thread " << thread_idx); assert(!is_scratch()); // Init default block begin & end from region start & stop indices. - ScanIndices def_block_idxs(*dims, true); + ScanIndices def_block_idxs(*dims, true, 0); def_block_idxs.initFromOuter(region_idxs); // Steps within a block are based on sub-block sizes. @@ -57,7 +58,7 @@ namespace yask { def_block_idxs.group_size = opts->_sub_block_group_sizes; // Update offsets of scratch grids based on this bundle's location. 
- _generic_context->update_scratch_grids(thread_idx, def_block_idxs); + _generic_context->update_scratch_grids(thread_idx, def_block_idxs.begin); // Define the bundles that need to be processed in // this block. This will be the prerequisite scratch-grid @@ -79,8 +80,8 @@ namespace yask { TRACE_MSG3("calc_block: " << " in bundle '" << sg->get_name() << "': " << - block_idxs.begin.makeValStr(ndims) << - " ... (end before) " << block_idxs.end.makeValStr(ndims) << + block_idxs.begin.makeValStr(nsdims) << + " ... (end before) " << block_idxs.end.makeValStr(nsdims) << " by thread " << thread_idx); // Include automatically-generated loop code that calls @@ -91,7 +92,8 @@ namespace yask { } // Normalize the indices, i.e., divide by vector len in each dim. - // Ranks offsets must already be subtracted. + // Ranks offsets must already be subtracted because rank offsets + // are not necessarily vec-multiples. // Each dim in 'orig' must be a multiple of corresponding vec len. void StencilBundleBase::normalize_indices(const Indices& orig, Indices& norm) const { auto* cp = _generic_context; @@ -155,7 +157,7 @@ namespace yask { // Init sub-block begin & end from block start & stop indices. // These indices are in element units and global (NOT rank-relative). - ScanIndices sub_block_idxs(*dims, true); + ScanIndices sub_block_idxs(*dims, true, 0); sub_block_idxs.initFromOuter(block_idxs); // Sub block indices in element units and rank-relative. @@ -169,10 +171,16 @@ namespace yask { // These indices are in element units and rank-relative. ScanIndices sub_block_fvidxs(sub_block_idxs); - // Superset of sub-block that is full or partial vectors. + // Superset of sub-block that is full or partial (masked) vectors. // These indices are in element units and rank-relative. ScanIndices sub_block_vidxs(sub_block_idxs); + // These will be set to rank-relative, so set ofs to zero. 
+ sub_block_eidxs.align_ofs.setFromConst(0); + sub_block_fcidxs.align_ofs.setFromConst(0); + sub_block_fvidxs.align_ofs.setFromConst(0); + sub_block_vidxs.align_ofs.setFromConst(0); + // Masks for computing partial vectors in each dim. // Init to all-ones (no masking). Indices peel_masks(nsdims), rem_masks(nsdims); @@ -209,9 +217,9 @@ namespace yask { sub_block_vidxs.end.setFromConst(0); } + // Adjust indices to be rank-relative. // Determine the subset of this sub-block that is - // clusters, vectors, and partial vectors. TODO: pre-calc this info - // for each block. + // clusters, vectors, and partial vectors. else { do_clusters = true; do_vectors = false; @@ -233,6 +241,7 @@ namespace yask { // Find range of full clusters. // Note that fcend <= eend because we round // down to get whole clusters only. + // Similarly, fcbgn >= ebgn. auto cpts = dims->_cluster_pts[j]; auto fcbgn = round_up_flr(ebgn, cpts); auto fcend = round_down_flr(eend, cpts); @@ -255,6 +264,8 @@ namespace yask { // Similar but opposite for begin vars. // We make a vector mask to pick the // right elements. + // TODO: use compile-time consts instead + // of _fold_pts for more efficiency. auto vpts = dims->_fold_pts[j]; auto fvbgn = round_up_flr(ebgn, vpts); auto fvend = round_down_flr(eend, vpts); @@ -282,6 +293,26 @@ namespace yask { // Calculate masks in this dim for partial vectors. // All such masks will be ANDed together to form the // final masks over all domain dims. + // Example: assume folding is x=4*y=4. + // Possible 'x' peel mask to exclude 1st 2 cols: + // 0 0 1 1 + // 0 0 1 1 + // 0 0 1 1 + // 0 0 1 1 + // Possible 'y' peel mask to exclude 1st row: + // 0 0 0 0 + // 1 1 1 1 + // 1 1 1 1 + // 1 1 1 1 + // Along 'x' face, the 'x' peel mask is used. + // Along 'y' face, the 'y' peel mask is used. + // Along an 'x-y' edge, they are ANDed to make this mask: + // 0 0 0 0 + // 0 0 1 1 + // 0 0 1 1 + // 0 0 1 1 + // so that the 6 corner elements are updated. 
+ if (vbgn < fvbgn || vend > fvend) { idx_t pmask = 0, rmask = 0; @@ -327,6 +358,15 @@ namespace yask { scalar_for_peel_rem = true; } } + + // If no peel or rem, just set vec indices to same as + // full cluster. + else { + sub_block_fvidxs.begin[i] = fcbgn; + sub_block_fvidxs.end[i] = fcend; + sub_block_vidxs.begin[i] = fcbgn; + sub_block_vidxs.end[i] = fcend; + } // Next domain index. j++; @@ -348,7 +388,7 @@ namespace yask { norm_sub_block_idxs.stop = norm_sub_block_idxs.end; norm_sub_block_idxs.align.setFromConst(1); // one vector. - // Full rectangular polytope of aligned clusters: use optimized code. + // Full rectilinear polytope of aligned clusters: use optimized code. if (do_clusters) { TRACE_MSG3("calc_sub_block: using cluster code for " << sub_block_fcidxs.begin.makeValStr(nsdims) << @@ -380,7 +420,7 @@ namespace yask { TRACE_MSG3("calc_sub_block: using vector code for " << sub_block_vidxs.begin.makeValStr(nsdims) << " ... (end before) " << sub_block_vidxs.end.makeValStr(nsdims) << - " before and/or after full vector-clusters in " << + " *not* within full vector-clusters at " << sub_block_fcidxs.begin.makeValStr(nsdims) << " ... (end before) " << sub_block_fcidxs.end.makeValStr(nsdims)); @@ -413,7 +453,7 @@ namespace yask { // Also normalize the *full* vector indices to determine if // we need a mask at each vector index. - // We don't need start, stop, or step for this. + // We just need begin and end indices for this. ScanIndices norm_sub_block_fvidxs(sub_block_eidxs); normalize_indices(sub_block_fvidxs.begin, norm_sub_block_fvidxs.begin); normalize_indices(sub_block_fvidxs.end, norm_sub_block_fvidxs.end); @@ -424,6 +464,8 @@ namespace yask { // range (before the cluster) and/or remainder // range (after the clusters). If so, call the // loop-of-vectors function w/appropriate mask. + // See the mask diagrams above that show how the + // masks are ANDed together. // Since step is always 1, we ignore loop_idxs.stop. 
#define calc_inner_loop(thread_idx, loop_idxs) \ bool ok = false; \ @@ -635,11 +677,11 @@ namespace yask { auto& domain_dims = dims->_domain_dims; auto& step_dim = dims->_step_dim; auto& stencil_dims = dims->_stencil_dims; - auto ndims = stencil_dims.size(); + auto nsdims = stencil_dims.size(); // Init min vars w/max val and vice-versa. - Indices min_pts(idx_max, ndims); - Indices max_pts(idx_min, ndims); + Indices min_pts(idx_max, nsdims); + Indices max_pts(idx_min, nsdims); idx_t npts = 0; // Begin, end tuples. @@ -653,7 +695,7 @@ namespace yask { end[step_dim] = 1; // one time-step only. // Indices needed for the generated 'misc' loops. - ScanIndices misc_idxs(*dims, false); + ScanIndices misc_idxs(*dims, false, 0); misc_idxs.begin = begin; misc_idxs.end = end; From d4f01fc0a7fea1d8ff9c2e38b4134b43e94b1d42 Mon Sep 17 00:00:00 2001 From: Chuck Yount Date: Fri, 27 Apr 2018 17:40:00 -0700 Subject: [PATCH 14/21] Fix bug in grid compare() due to use of deprecated API. Add deprecation warning. --- src/kernel/lib/grid_apis.cpp | 10 +++++++--- src/kernel/lib/realv_grids.cpp | 17 ++++++++++------- 2 files changed, 17 insertions(+), 10 deletions(-) diff --git a/src/kernel/lib/grid_apis.cpp b/src/kernel/lib/grid_apis.cpp index 3b2a148d..3d8e8d90 100644 --- a/src/kernel/lib/grid_apis.cpp +++ b/src/kernel/lib/grid_apis.cpp @@ -30,6 +30,9 @@ using namespace std; namespace yask { +#define DEPRECATED(api_name) cerr << "\n*** WARNING: deprecated YASK API '" \ + #api_name "' will be removed in a future release ***\n" + // APIs to get info from vars. #define GET_GRID_API(api_name, expr, step_ok, domain_ok, misc_ok, prep_req) \ idx_t YkGridBase::api_name(const string& dim) const { \ @@ -46,16 +49,13 @@ namespace yask { GET_GRID_API(get_rank_domain_size, _domains[posn], false, true, false, false) GET_GRID_API(get_left_pad_size, _left_pads[posn], false, true, false, false) // _left_pads is actual size. 
GET_GRID_API(get_right_pad_size, _allocs[posn] - _left_pads[posn], false, true, false, false) // _right_pads is request only. - GET_GRID_API(get_pad_size, _left_pads[posn], false, true, false, false) GET_GRID_API(get_left_halo_size, _left_halos[posn], false, true, false, false) GET_GRID_API(get_right_halo_size, _right_halos[posn], false, true, false, false) - GET_GRID_API(get_halo_size, _left_halos[posn], false, true, false, false) GET_GRID_API(get_first_misc_index, _offsets[posn], false, false, true, false) GET_GRID_API(get_last_misc_index, _offsets[posn] + _domains[posn] - 1, false, false, true, false) GET_GRID_API(get_left_extra_pad_size, _left_pads[posn] - _left_halos[posn], false, true, false, false) GET_GRID_API(get_right_extra_pad_size, (_allocs[posn] - _left_pads[posn] - _domains[posn]) - _right_halos[posn], false, true, false, false) - GET_GRID_API(get_extra_pad_size, _left_pads[posn] - _left_halos[posn], false, true, false, false) GET_GRID_API(get_alloc_size, _allocs[posn], true, true, true, false) GET_GRID_API(get_first_rank_domain_index, _offsets[posn] - _local_offsets[posn], false, true, false, true) GET_GRID_API(get_last_rank_domain_index, _offsets[posn] - _local_offsets[posn] + _domains[posn] - 1; @@ -71,6 +71,10 @@ namespace yask { GET_GRID_API(_get_local_offset, _local_offsets[posn], true, true, true, false) GET_GRID_API(_get_first_alloc_index, _offsets[posn] - _left_pads[posn], true, true, true, true) GET_GRID_API(_get_last_alloc_index, _offsets[posn] - _left_pads[posn] + _allocs[posn] - 1, true, true, true, true) + + GET_GRID_API(get_pad_size, (DEPRECATED(get_pad_size), _left_pads[posn]), false, true, false, false) + GET_GRID_API(get_halo_size, (DEPRECATED(get_halo_size), _left_halos[posn]), false, true, false, false) + GET_GRID_API(get_extra_pad_size, (DEPRECATED(get_extra_pad_size), _left_pads[posn] - _left_halos[posn]), false, true, false, false) #undef GET_GRID_API // APIs to set vars. 
diff --git a/src/kernel/lib/realv_grids.cpp b/src/kernel/lib/realv_grids.cpp index 43726f97..d94b659f 100644 --- a/src/kernel/lib/realv_grids.cpp +++ b/src/kernel/lib/realv_grids.cpp @@ -273,23 +273,26 @@ namespace yask { auto allocs = get_allocs(); // This will loop over the entire allocation. - // Indices of 'pt' will be relative to allocation. + // We use this as a handy way to get offsets, + // but not all will be used. allocs.visitAllPoints ([&](const IdxTuple& pt, size_t idx) { // Adjust alloc indices to overall indices. IdxTuple opt(pt); bool ok = true; - for (int i = 0; i < pt.getNumDims(); i++) { + for (int i = 0; ok && i < pt.getNumDims(); i++) { auto val = pt.getVal(i); - opt[i] = _offsets[i] - _left_pads[i] + val; - // Don't compare points in the extra padding area. + // Convert to global index. + opt[i] = _offsets[i] + val; + + // Don't compare points outside the domain. + // TODO: check points in halo. auto& dname = pt.getDimName(i); if (_dims->_domain_dims.lookup(dname)) { - auto halo_sz = get_halo_size(dname); - auto first_ok = get_first_rank_domain_index(dname) - halo_sz; - auto last_ok = get_last_rank_domain_index(dname) + halo_sz; + auto first_ok = get_first_rank_domain_index(dname); + auto last_ok = get_last_rank_domain_index(dname); if (opt[i] < first_ok || opt[i] > last_ok) ok = false; } From db4b0ddc51c676879a0b8998b631408bfe92ac84 Mon Sep 17 00:00:00 2001 From: Chuck Yount Date: Fri, 27 Apr 2018 18:31:39 -0700 Subject: [PATCH 15/21] Fix scalar peel/remainder loop. --- src/kernel/lib/stencil_calc.cpp | 72 +++++++++++++++++---------------- 1 file changed, 37 insertions(+), 35 deletions(-) diff --git a/src/kernel/lib/stencil_calc.cpp b/src/kernel/lib/stencil_calc.cpp index 1f26ee92..14166a09 100644 --- a/src/kernel/lib/stencil_calc.cpp +++ b/src/kernel/lib/stencil_calc.cpp @@ -273,11 +273,12 @@ namespace yask { auto vend = round_up_flr(eend, vpts); if (i == _inner_posn) { - // Don't do any vectors in plane of inner dim. 
- // We'll do these with scalars. - // This is unusual because vector folding is - // normally done in a plane perpendicular to - // the inner dim for >= 2D domains. + // Don't do any full and/or partial vectors in + // plane of inner dim. We'll do these with + // scalars. This is unusual because vector + // folding is normally done in a plane + // perpendicular to the inner dim for >= 2D + // domains. fvbgn = vbgn = fcbgn; fvend = vend = fcend; } @@ -397,10 +398,9 @@ namespace yask { // Step sizes are based on cluster lengths (in vector units). // The step in the inner loop is hard-coded in the generated code. for (int i = 0, j = 0; i < nsdims; i++) { - if (i != step_posn) { - norm_sub_block_idxs.step[i] = dims->_cluster_mults[j]; - j++; - } + if (i == step_posn) continue; + norm_sub_block_idxs.step[i] = dims->_cluster_mults[j]; // N vecs. + j++; } // Define the function called from the generated loops @@ -444,12 +444,7 @@ namespace yask { // Step sizes are one vector. // The step in the inner loop is hard-coded in the generated code. - for (int i = 0, j = 0; i < nsdims; i++) { - if (i != step_posn) { - norm_sub_block_idxs.step[i] = 1; - j++; - } - } + norm_sub_block_idxs.step.setFromConst(1); // Also normalize the *full* vector indices to determine if // we need a mask at each vector index. @@ -494,43 +489,50 @@ namespace yask { // Use scalar code for anything not done above. if (do_scalars) { + // Use the 'misc' loops. Indices for these loops will be scalar and + // global rather than normalized as in the cluster and vector loops. + ScanIndices misc_idxs(sub_block_idxs); + + // Step sizes and alignment are one element. + misc_idxs.step.setFromConst(1); + misc_idxs.align.setFromConst(1); + #ifdef TRACE string msg = "calc_sub_block: using scalar code for "; msg += scalar_for_peel_rem ? "peel/remainder of" : "entire"; msg += " sub-block "; msg += bb_is_full ? 
"without" : "with"; - msg += " sub-domain checking"; - TRACE_MSG3(msg); + msg += " sub-domain checking for "; + TRACE_MSG3(msg << + misc_idxs.begin.makeValStr(nsdims) << + " ... (end before) " << + misc_idxs.end.makeValStr(nsdims)); #endif - // Use the 'misc' loops. Indices for these loops will be scalar and - // global rather than normalized as in the cluster and vector loops. - ScanIndices misc_idxs(sub_block_idxs); - // Define misc-loop function. // If point is in sub-domain for this // bundle, then evaluate the reference scalar code. // If no holes, don't need to check each point in domain. // Since step is always 1, we ignore misc_idxs.stop. -#define misc_fn(misc_idxs) do { \ - bool ok = true; \ - if (scalar_for_peel_rem) { \ - ok = false; \ - for (int i = 0, j = 0; i < nsdims; i++) { \ - if (i != step_posn) { \ +#define misc_fn(pt_idxs) do { \ + TRACE_MSG3("calc_sub_block: at pt " << pt_idxs.start.makeValStr(nsdims)); \ + bool ok = true; \ + if (scalar_for_peel_rem) { \ + ok = false; \ + for (int i = 0, j = 0; i < nsdims; i++) { \ + if (i == step_posn) continue; \ auto rofs = cp->rank_domain_offsets[j]; \ - if (misc_idxs.start[i] < rofs + sub_block_vidxs.begin[i] || \ - misc_idxs.start[i] >= rofs + sub_block_vidxs.end[i]) { \ + if (pt_idxs.start[i] < rofs + sub_block_vidxs.begin[i] || \ + pt_idxs.start[i] >= rofs + sub_block_vidxs.end[i]) { \ ok = true; break; } \ j++; \ } \ } \ - } \ - if (ok && (bb_is_full || is_in_valid_domain(misc_idxs.start))) { \ - calc_scalar(thread_idx, misc_idxs.start); \ - } \ - } while(0) - + if (ok && (bb_is_full || is_in_valid_domain(pt_idxs.start))) { \ + calc_scalar(thread_idx, pt_idxs.start); \ + } \ + } while(0) + // Scan through n-D space. // The OMP in the misc loops will be ignored if we're already in // the max allowed nested OMP region. From 533a236b20cc96969b6e8bc62735aed607063471 Mon Sep 17 00:00:00 2001 From: Chuck Yount Date: Sat, 28 Apr 2018 06:53:49 -0700 Subject: [PATCH 16/21] Fix pad-adjustment code. 
--- src/kernel/lib/realv_grids.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/kernel/lib/realv_grids.cpp b/src/kernel/lib/realv_grids.cpp index d94b659f..3eaae4c3 100644 --- a/src/kernel/lib/realv_grids.cpp +++ b/src/kernel/lib/realv_grids.cpp @@ -117,7 +117,6 @@ namespace yask { // Start with halos plus WF exts. Indices mp = halos.addElements(wf_exts); - // For scratch grids, halo area must be written to. Halo is sum // of dependent's write halo and depender's read halo, but these // two components are not stored individually. Write halo will @@ -131,9 +130,11 @@ namespace yask { for (int i = 0; i < get_num_dims(); i++) { if (mp[i] >= 1) { auto& dname = get_dim_name(i); - auto* p = _dims->_domain_dims.lookup(dname); - if (p) + auto* p = _dims->_fold_pts.lookup(dname); + if (p) { + assert (p >= 1); mp[i] += *p - 1; + } } } return mp; From 6956cda4f876f786d3d659b9c7849aea2c41e223 Mon Sep 17 00:00:00 2001 From: Chuck Yount Date: Sat, 28 Apr 2018 06:54:20 -0700 Subject: [PATCH 17/21] Make 2D test asymmetical. --- src/stencils/SimpleTestStencils.hpp | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/src/stencils/SimpleTestStencils.hpp b/src/stencils/SimpleTestStencils.hpp index 6e910aec..1d2c1c9f 100644 --- a/src/stencils/SimpleTestStencils.hpp +++ b/src/stencils/SimpleTestStencils.hpp @@ -82,11 +82,16 @@ class Test2dStencil : public StencilRadiusBase { // Define equation to apply to all points in 'data' grid. virtual void define() { - // define the value at t+1. + // define the value at t+1 using asymmetric stencil. 
GridValue v = data(t, x, y) + 1.0; for (int r = 1; r <= _radius; r++) - v += data(t, x + r, y) + data(t, x - r, y) - + data(t, x, y + r) + data(t, x, y - r); + v += data(t, x + r, y); + for (int r = 1; r <= _radius + 1; r++) + v += data(t, x - r, y); + for (int r = 1; r <= _radius + 2; r++) + v += data(t, x, y + r); + for (int r = 1; r <= _radius + 3; r++) + v += data(t, x, y - r); data(t+1, x, y) EQUALS v; } }; From ce9653f7efb3ff4a1e83dd33787c8b6d55116df4 Mon Sep 17 00:00:00 2001 From: Chuck Yount Date: Sat, 28 Apr 2018 06:54:58 -0700 Subject: [PATCH 18/21] Clean up deprecation warning. Ver 2.06.01. --- src/common/common_utils.cpp | 2 +- src/kernel/lib/grid_apis.cpp | 14 ++++++++------ 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/src/common/common_utils.cpp b/src/common/common_utils.cpp index 1d683006..a5c5504b 100644 --- a/src/common/common_utils.cpp +++ b/src/common/common_utils.cpp @@ -41,7 +41,7 @@ namespace yask { // for numbers above 9 (at least up to 99). // Format: "major.minor.patch". - const string version = "2.06.00"; + const string version = "2.06.01"; string yask_get_version_string() { return version; diff --git a/src/kernel/lib/grid_apis.cpp b/src/kernel/lib/grid_apis.cpp index 3d8e8d90..66544942 100644 --- a/src/kernel/lib/grid_apis.cpp +++ b/src/kernel/lib/grid_apis.cpp @@ -31,7 +31,7 @@ using namespace std; namespace yask { #define DEPRECATED(api_name) cerr << "\n*** WARNING: deprecated YASK API '" \ - #api_name "' will be removed in a future release ***\n" + #api_name "' used that will be removed in a future release ***\n" // APIs to get info from vars. 
#define GET_GRID_API(api_name, expr, step_ok, domain_ok, misc_ok, prep_req) \ @@ -41,10 +41,12 @@ namespace yask { if (prep_req && _offsets[posn] < 0) \ THROW_YASK_EXCEPTION("Error: '" #api_name "()' called on grid '" << \ get_name() << "' before calling 'prepare_solution()'"); \ - return expr; \ + auto rtn = expr; \ + return rtn; \ } \ idx_t YkGridBase::api_name(int posn) const { \ - return expr; \ + auto rtn = expr; \ + return rtn; \ } GET_GRID_API(get_rank_domain_size, _domains[posn], false, true, false, false) GET_GRID_API(get_left_pad_size, _left_pads[posn], false, true, false, false) // _left_pads is actual size. @@ -72,9 +74,9 @@ namespace yask { GET_GRID_API(_get_first_alloc_index, _offsets[posn] - _left_pads[posn], true, true, true, true) GET_GRID_API(_get_last_alloc_index, _offsets[posn] - _left_pads[posn] + _allocs[posn] - 1, true, true, true, true) - GET_GRID_API(get_pad_size, (DEPRECATED(get_pad_size), _left_pads[posn]), false, true, false, false) - GET_GRID_API(get_halo_size, (DEPRECATED(get_halo_size), _left_halos[posn]), false, true, false, false) - GET_GRID_API(get_extra_pad_size, (DEPRECATED(get_extra_pad_size), _left_pads[posn] - _left_halos[posn]), false, true, false, false) + GET_GRID_API(get_pad_size, _left_pads[posn]; DEPRECATED(get_pad_size), false, true, false, false) + GET_GRID_API(get_halo_size, _left_halos[posn]; DEPRECATED(get_halo_size), false, true, false, false) + GET_GRID_API(get_extra_pad_size, _left_pads[posn] - _left_halos[posn]; DEPRECATED(get_extra_pad_size), false, true, false, false) #undef GET_GRID_API // APIs to set vars. From d1c8d0c025e4f76499def9097b3e9c77e1a4f329 Mon Sep 17 00:00:00 2001 From: Chuck Yount Date: Sat, 28 Apr 2018 06:59:48 -0700 Subject: [PATCH 19/21] Fix assertion. 
--- src/kernel/lib/realv_grids.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/kernel/lib/realv_grids.cpp b/src/kernel/lib/realv_grids.cpp index 3eaae4c3..83fc1d4e 100644 --- a/src/kernel/lib/realv_grids.cpp +++ b/src/kernel/lib/realv_grids.cpp @@ -132,7 +132,7 @@ namespace yask { auto& dname = get_dim_name(i); auto* p = _dims->_fold_pts.lookup(dname); if (p) { - assert (p >= 1); + assert (*p >= 1); mp[i] += *p - 1; } } From eb8fcf2d364edaceb3c13f57087e3f106780d182 Mon Sep 17 00:00:00 2001 From: Chuck Yount Date: Sat, 28 Apr 2018 11:09:58 -0700 Subject: [PATCH 20/21] Mark grids dirty in a rank if *any* rank could make it dirty. Closes #106. --- src/kernel/lib/context.cpp | 41 ++++++++---- src/kernel/lib/setup.cpp | 127 +++++++++++++++++++++---------------- 2 files changed, 101 insertions(+), 67 deletions(-) diff --git a/src/kernel/lib/context.cpp b/src/kernel/lib/context.cpp index 668696d1..7c521bf0 100644 --- a/src/kernel/lib/context.cpp +++ b/src/kernel/lib/context.cpp @@ -271,9 +271,13 @@ namespace yask { #include "yask_misc_loops.hpp" #undef misc_fn - // Remember grids that have been written to by this bundle, + // Mark grids that [may] have been written to by this bundle, // updated at next step (+/- 1). - mark_grids_dirty(start_t + step_t, stop_t + step_t, *asg); + // Mark grids as dirty even if not actually written by this + // rank. This is needed because neighbors will not know what + // grids are actually dirty, and all ranks must have the same + // information about which grids are possibly dirty. + mark_grids_dirty(start_t + step_t, stop_t + step_t, *sg); } // needed bundles. } // all bundles. @@ -436,9 +440,9 @@ namespace yask { } // If doing wave-fronts, must loop through all bundles in - // calc_region(). - // TODO: make this the only case, allowing all bundles to be done - // between MPI exchanges, even w/o wave-fronts. + // calc_region(). 
TODO: consider making this the only case, + // allowing all bundles to be done between MPI exchanges, even + // w/o wave-fronts. else { // Exchange all dirty halo(s). @@ -606,13 +610,18 @@ namespace yask { // similar for y and z. This code typically // contains the outer OpenMP loop(s). #include "yask_region_loops.hpp" - - // Remember grids that have been written to by this bundle, - // updated at next step (+/- 1). - mark_grids_dirty(start_t + step_t, stop_t + step_t, *sg); } - // Shift spatial region boundaries for next iteration to + // Mark grids that [may] have been written to by this bundle, + // updated at next step (+/- 1). + // Mark grids as dirty even if not actually written by this + // rank. This is needed because neighbors will not know what + // grids are actually dirty, and all ranks must have the same + // information about which grids are possibly dirty. + // TODO: make this smarter. + mark_grids_dirty(start_t + step_t, stop_t + step_t, *sg); + + // Shift spatial region boundaries for next iteration to // implement temporal wavefront. Between regions, we only shift // backward, so region loops must strictly increment. They may do // so in any order. TODO: shift only what is needed by @@ -1184,7 +1193,7 @@ namespace yask { if (!gp->is_dirty(t)) continue; - // Only need to swap grids that have MPI buffers. + // Only need to swap grids that have any MPI buffers. auto& gname = gp->get_name(); if (mpiData.count(gname) == 0) continue; @@ -1226,7 +1235,7 @@ namespace yask { auto gp = gtsi.second; gi++; MPI_Request* grid_recv_reqs = recv_reqs[gi]; - TRACE_MSG(" for grid '" << gname << "'..."); + TRACE_MSG(" for grid #" << gi << ", '" << gname << "'..."); // Visit all this rank's neighbors. auto& grid_mpi_data = mpiData.at(gname); @@ -1260,6 +1269,8 @@ namespace yask { neighbor_rank, int(gi), _env->comm, &grid_recv_reqs[ni]); num_recv_reqs++; } + else + TRACE_MSG(" 0B to request"); } // Pack data into send buffer, then send to neighbor. 
@@ -1276,7 +1287,7 @@ namespace yask { IdxTuple first = sendBuf.begin_pt; IdxTuple last = sendBuf.last_pt; - // The code in allocData() pre-calculated the first and + // The code in allocMpiData() pre-calculated the first and // last points of each buffer, except in the step dim. // So, we need to set that value now. // TODO: update this if we expand the buffers to hold @@ -1305,6 +1316,8 @@ namespace yask { neighbor_rank, int(gi), _env->comm, &send_reqs[num_send_reqs++]); } + else + TRACE_MSG(" 0B to send"); } // Wait for data from neighbor, then unpack it. @@ -1343,6 +1356,8 @@ namespace yask { n = gp->set_elements_in_slice(buf, first, last); assert(n == recvBuf.get_size()); } + else + TRACE_MSG(" 0B to wait for"); } }); // visit neighbors. diff --git a/src/kernel/lib/setup.cpp b/src/kernel/lib/setup.cpp index 58fb63ef..d4aacc8f 100644 --- a/src/kernel/lib/setup.cpp +++ b/src/kernel/lib/setup.cpp @@ -328,7 +328,7 @@ namespace yask { } // grid passes. }; - // Create MPI and allocate buffers. + // Create MPI buffers and allocate them. void StencilContext::allocMpiData(ostream& os) { // Remove any old MPI data. @@ -336,7 +336,8 @@ namespace yask { #ifdef USE_MPI - int num_exchanges = 0; + map num_exchanges; // send/recv => count. + map num_elems; // send/recv => count. auto me = _env->my_rank; // Need to determine the size and shape of all MPI buffers. @@ -369,10 +370,24 @@ namespace yask { return; // from lambda fn. } - // Determine size of MPI buffers between neigh_rank and my rank - // for each grid and create those that are needed. + // Is vectorized exchange allowed based on domain sizes? + // Both my rank and neighbor rank must have all domain sizes + // of vector multiples. + bool vec_ok = allow_vec_exchange && + _mpiInfo->has_all_vlen_mults[_mpiInfo->my_neighbor_index] && + _mpiInfo->has_all_vlen_mults[neigh_idx]; + + // Determine size of MPI buffers between neigh_rank and my + // rank for each grid and create those that are needed. 
It + // is critical that the number, size, and shape of my + // send/receive buffers match those of the receive/send + // buffers of my neighbors. Important: Current algorithm + // assumes my left neighbor's buffer sizes can be calculated + // by considering my rank's right side data and vice-versa. + // Thus, all ranks must have consistent data that contribute + // to these calculations. for (auto gp : gridPtrs) { - if (!gp) + if (!gp || gp->is_scratch() || gp->is_fixed_size()) continue; auto& gname = gp->get_name(); @@ -384,12 +399,15 @@ namespace yask { IdxTuple first_outer_idx, last_outer_idx; for (auto& dim : _dims->_domain_dims.getDims()) { auto& dname = dim.getName(); + + // Only consider domain dims that are used in this grid. if (gp->is_dim_used(dname)) { - // Get domain indices for this grid. - // If there are no more ranks in the given direction, extend - // the index into the outer halo to make sure all data are sync'd. - // This is critical for WFs. + // Get domain indices for this grid. If there + // are no more ranks in the given direction, + // extend the "outer" index to include the halo + // in that direction to make sure all data are + // sync'd. This is critical for WFs. idx_t fidx = gp->get_first_rank_domain_index(dname); idx_t lidx = gp->get_last_rank_domain_index(dname); first_inner_idx.addDimBack(dname, fidx); @@ -401,55 +419,57 @@ namespace yask { first_outer_idx.addDimBack(dname, fidx); last_outer_idx.addDimBack(dname, lidx); - // Determine size of exchange. This will be the actual halo size - // plus any wave-front extensions. In the current implementation, - // we need the wave-front extensions regardless of whether there - // is a halo on a given grid. This is because each stencil-bundle - // gets shifted by the WF angles at each step in the WF. + // Determine size of exchange in this dim. This + // will be the actual halo size plus any + // wave-front shifts. 
In the current + // implementation, we need the wave-front shifts + // regardless of whether there is a halo on a + // given grid. This is because each + // stencil-bundle gets shifted by the WF angles + // at each step in the WF. - // Neighbor is to the left. + // Neighbor is to the left in this dim. if (neigh_offsets[dname] == MPIInfo::rank_prev) { - auto ext = left_wf_exts[dname]; + auto ext = wf_shifts[dname]; - // my halo. + // my halo on my left. auto halo_size = gp->get_left_halo_size(dname); halo_size += ext; my_halo_sizes.addDimBack(dname, halo_size); - // neighbor halo. - halo_size = gp->get_right_halo_size(dname); // their right is on my left. + // neighbor halo on their right. + halo_size = gp->get_right_halo_size(dname); // assume their right == my right. halo_size += ext; neigh_halo_sizes.addDimBack(dname, halo_size); + + // Flag that this grid has a neighbor to left or right. + found_delta = true; } - // Neighbor is to the right. + // Neighbor is to the right in this dim. else if (neigh_offsets[dname] == MPIInfo::rank_next) { - auto ext = right_wf_exts[dname]; + auto ext = wf_shifts[dname]; - // my halo. + // my halo on my right. auto halo_size = gp->get_right_halo_size(dname); halo_size += ext; my_halo_sizes.addDimBack(dname, halo_size); - // neighbor halo. - halo_size = gp->get_left_halo_size(dname); // their left is on my right. + // neighbor halo on their left. + halo_size = gp->get_left_halo_size(dname); // assume their left == my left. halo_size += ext; neigh_halo_sizes.addDimBack(dname, halo_size); + + // Flag that this grid has a neighbor to left or right. + found_delta = true; } - // Neighbor in-line. + // Neighbor in-line in this dim. else { my_halo_sizes.addDimBack(dname, 0); neigh_halo_sizes.addDimBack(dname, 0); } - // Vectorized exchange allowed based on domain sizes? - // Both my rank and neighbor rank must have all domain sizes - // of vector multiples. 
- bool vec_ok = allow_vec_exchange && - _mpiInfo->has_all_vlen_mults[_mpiInfo->my_neighbor_index] && - _mpiInfo->has_all_vlen_mults[neigh_idx]; - // Round up halo sizes if vectorized exchanges allowed. // TODO: add a heuristic to avoid increasing by a large factor. if (vec_ok) { @@ -457,12 +477,8 @@ namespace yask { my_halo_sizes.setVal(dname, ROUND_UP(my_halo_sizes[dname], vec_size)); neigh_halo_sizes.setVal(dname, ROUND_UP(neigh_halo_sizes[dname], vec_size)); } - - // Is this neighbor before or after me in this domain direction? - if (neigh_offsets[dname] != MPIInfo::rank_self) - found_delta = true; - } - } + } // domain dims in this grid. + } // domain dims. // Is buffer needed? // Example: if this grid is 2D in y-z, but only neighbors are in @@ -589,11 +605,19 @@ namespace yask { } // all dims in this grid. + // Unique name for buffer based on grid name, direction, and ranks. + ostringstream oss; + oss << gname; + if (bd == MPIBufs::bufSend) + oss << "_send_halo_from_" << me << "_to_" << neigh_rank; + else if (bd == MPIBufs::bufRecv) + oss << "_recv_halo_from_" << neigh_rank << "_to_" << me; + string bufname = oss.str(); + // Does buffer have non-zero size? if (buf_sizes.size() == 0 || buf_sizes.product() == 0) { - TRACE_MSG("no halo exchange needed for grid '" << gname << - "' with rank " << neigh_rank << - " because there is no data to exchange"); + TRACE_MSG("MPI buffer '" << bufname << + "' not needed because there is no data to exchange"); continue; } @@ -602,15 +626,6 @@ namespace yask { // Convert end to last. IdxTuple copy_last = copy_end.subElements(1); - // Unique name for buffer based on grid name, direction, and ranks. - ostringstream oss; - oss << gname; - if (bd == MPIBufs::bufSend) - oss << "_send_halo_from_" << me << "_to_" << neigh_rank; - else if (bd == MPIBufs::bufRecv) - oss << "_recv_halo_from_" << neigh_rank << "_to_" << me; - string bufname = oss.str(); - // Make MPI data entry for this grid. 
auto gbp = mpiData.emplace(gname, _mpiInfo); auto& gbi = gbp.first; // iterator from pair returned by emplace(). @@ -625,18 +640,22 @@ namespace yask { buf.name = bufname; buf.has_all_vlen_mults = vlen_mults; - TRACE_MSG("configured MPI buffer object '" << buf.name << - "' for rank at relative offsets " << + TRACE_MSG("MPI buffer '" << buf.name << + "' configured for rank at relative offsets " << neigh_offsets.subElements(1).makeDimValStr() << " with " << buf.num_pts.makeDimValStr(" * ") << " = " << buf.get_size() << " element(s) at " << buf.begin_pt.makeDimValStr() << " ... " << buf.last_pt.makeDimValStr()); - num_exchanges++; + num_exchanges[bd]++; + num_elems[bd] += buf.get_size(); } // send, recv. } // grids. }); // neighbors. - TRACE_MSG("number of halo-exchanges needed on this rank: " << num_exchanges); + TRACE_MSG("number of MPI send buffers on this rank: " << num_exchanges[int(MPIBufs::bufSend)]); + TRACE_MSG("number of elements in send buffers: " << makeNumStr(num_elems[int(MPIBufs::bufSend)])); + TRACE_MSG("number of MPI recv buffers on this rank: " << num_exchanges[int(MPIBufs::bufRecv)]); + TRACE_MSG("number of elements in recv buffers: " << makeNumStr(num_elems[int(MPIBufs::bufRecv)])); // Base ptrs for all alloc'd data. // These pointers will be shared by the ones in the grid From 6d2cd4fc8ffe1dfda3dab08f9d902e943cda7f83 Mon Sep 17 00:00:00 2001 From: Chuck Yount Date: Sat, 28 Apr 2018 13:16:22 -0700 Subject: [PATCH 21/21] Fix bug in rounding up indices & sizes for vectorized halo exchanges. Ver 2.06.02. 
--- src/common/common_utils.cpp | 2 +- src/kernel/lib/context.cpp | 4 +- src/kernel/lib/settings.hpp | 4 +- src/kernel/lib/setup.cpp | 95 +++++++++++++++++++++++++------------ 4 files changed, 70 insertions(+), 35 deletions(-) diff --git a/src/common/common_utils.cpp b/src/common/common_utils.cpp index a5c5504b..fc430a4d 100644 --- a/src/common/common_utils.cpp +++ b/src/common/common_utils.cpp @@ -41,7 +41,7 @@ namespace yask { // for numbers above 9 (at least up to 99). // Format: "major.minor.patch". - const string version = "2.06.01"; + const string version = "2.06.02"; string yask_get_version_string() { return version; diff --git a/src/kernel/lib/context.cpp b/src/kernel/lib/context.cpp index 7c521bf0..b70596e0 100644 --- a/src/kernel/lib/context.cpp +++ b/src/kernel/lib/context.cpp @@ -1281,7 +1281,7 @@ namespace yask { // Vec ok? // Domain sizes must be ok, and buffer size must be ok // as calculated when buffers were created. - bool send_vec_ok = vec_ok && sendBuf.has_all_vlen_mults; + bool send_vec_ok = vec_ok && sendBuf.vec_copy_ok; // Get first and last ranges. IdxTuple first = sendBuf.begin_pt; @@ -1330,7 +1330,7 @@ namespace yask { MPI_Wait(&grid_recv_reqs[ni], MPI_STATUS_IGNORE); // Vec ok? - bool recv_vec_ok = vec_ok && recvBuf.has_all_vlen_mults; + bool recv_vec_ok = vec_ok && recvBuf.vec_copy_ok; // Get first and last ranges. IdxTuple first = recvBuf.begin_pt; diff --git a/src/kernel/lib/settings.hpp b/src/kernel/lib/settings.hpp index 5545de7e..a2532cfd 100644 --- a/src/kernel/lib/settings.hpp +++ b/src/kernel/lib/settings.hpp @@ -633,8 +633,8 @@ namespace yask { IdxTuple num_pts; // Whether the number of points is a multiple of the - // vector length in all dims. - bool has_all_vlen_mults = false; + // vector length in all dims and buffer is aligned. + bool vec_copy_ok = false; // Number of points overall. 
idx_t get_size() const { diff --git a/src/kernel/lib/setup.cpp b/src/kernel/lib/setup.cpp index d4aacc8f..02797647 100644 --- a/src/kernel/lib/setup.cpp +++ b/src/kernel/lib/setup.cpp @@ -371,7 +371,7 @@ namespace yask { } // Is vectorized exchange allowed based on domain sizes? - // Both my rank and neighbor rank must have all domain sizes + // Both my rank and neighbor rank must have *all* domain sizes // of vector multiples. bool vec_ok = allow_vec_exchange && _mpiInfo->has_all_vlen_mults[_mpiInfo->my_neighbor_index] && @@ -390,6 +390,7 @@ namespace yask { if (!gp || gp->is_scratch() || gp->is_fixed_size()) continue; auto& gname = gp->get_name(); + bool grid_vec_ok = vec_ok; // Lookup first & last domain indices and calc exchange sizes // for this grid. @@ -402,6 +403,9 @@ namespace yask { // Only consider domain dims that are used in this grid. if (gp->is_dim_used(dname)) { + auto vlen = _dims->_fold_pts[dname]; + auto lhalo = gp->get_left_halo_size(dname); + auto rhalo = gp->get_right_halo_size(dname); // Get domain indices for this grid. If there // are no more ranks in the given direction, @@ -413,12 +417,25 @@ namespace yask { first_inner_idx.addDimBack(dname, fidx); last_inner_idx.addDimBack(dname, lidx); if (_opts->is_first_rank(dname)) - fidx -= gp->get_left_halo_size(dname); + fidx -= lhalo; if (_opts->is_last_rank(dname)) - lidx += gp->get_right_halo_size(dname); + lidx += rhalo; first_outer_idx.addDimBack(dname, fidx); last_outer_idx.addDimBack(dname, lidx); + // Determine if it is possible to round the + // outer indices to vec-multiples. This will + // be required to allow full vec exchanges for + // this grid. We won't do the actual rounding + // yet, because we need to see if it's safe + // in all dims. 
+ fidx = round_down_flr(fidx, vlen); + lidx = round_up_flr(lidx, vlen); + if (fidx < gp->get_first_rank_alloc_index(dname)) + grid_vec_ok = false; + if (lidx > gp->get_last_rank_alloc_index(dname)) + grid_vec_ok = false; + // Determine size of exchange in this dim. This // will be the actual halo size plus any // wave-front shifts. In the current @@ -432,15 +449,12 @@ namespace yask { if (neigh_offsets[dname] == MPIInfo::rank_prev) { auto ext = wf_shifts[dname]; - // my halo on my left. - auto halo_size = gp->get_left_halo_size(dname); - halo_size += ext; - my_halo_sizes.addDimBack(dname, halo_size); + // My halo on my left. + my_halo_sizes.addDimBack(dname, lhalo + ext); - // neighbor halo on their right. - halo_size = gp->get_right_halo_size(dname); // assume their right == my right. - halo_size += ext; - neigh_halo_sizes.addDimBack(dname, halo_size); + // Neighbor halo on their right. + // Assume my right is same as their right. + neigh_halo_sizes.addDimBack(dname, rhalo + ext); // Flag that this grid has a neighbor to left or right. found_delta = true; @@ -450,15 +464,12 @@ namespace yask { else if (neigh_offsets[dname] == MPIInfo::rank_next) { auto ext = wf_shifts[dname]; - // my halo on my right. - auto halo_size = gp->get_right_halo_size(dname); - halo_size += ext; - my_halo_sizes.addDimBack(dname, halo_size); + // My halo on my right. + my_halo_sizes.addDimBack(dname, rhalo + ext); - // neighbor halo on their left. - halo_size = gp->get_left_halo_size(dname); // assume their left == my left. - halo_size += ext; - neigh_halo_sizes.addDimBack(dname, halo_size); + // Neighbor halo on their left. + // Assume my left is same as their left. + neigh_halo_sizes.addDimBack(dname, lhalo + ext); // Flag that this grid has a neighbor to left or right. found_delta = true; @@ -470,13 +481,6 @@ namespace yask { neigh_halo_sizes.addDimBack(dname, 0); } - // Round up halo sizes if vectorized exchanges allowed. 
- // TODO: add a heuristic to avoid increasing by a large factor. - if (vec_ok) { - auto vec_size = _dims->_fold_pts[dname]; - my_halo_sizes.setVal(dname, ROUND_UP(my_halo_sizes[dname], vec_size)); - neigh_halo_sizes.setVal(dname, ROUND_UP(neigh_halo_sizes[dname], vec_size)); - } } // domain dims in this grid. } // domain dims. @@ -491,6 +495,31 @@ namespace yask { continue; // to next grid. } + // Round halo sizes if vectorized exchanges allowed. + // Both self and neighbor must be vec-multiples + // and outer indices must be vec-mults or extendable + // to be so. + // TODO: add a heuristic to avoid increasing by a large factor. + if (grid_vec_ok) { + for (auto& dim : _dims->_domain_dims.getDims()) { + auto& dname = dim.getName(); + if (gp->is_dim_used(dname)) { + auto vlen = _dims->_fold_pts[dname]; + + // first index rounded down. + first_outer_idx.setVal(dname, round_down_flr(first_outer_idx[dname], vlen)); + + // last index rounded up. + last_outer_idx.setVal(dname, round_up_flr(last_outer_idx[dname], vlen)); + + // sizes rounded up. + my_halo_sizes.setVal(dname, ROUND_UP(my_halo_sizes[dname], vlen)); + neigh_halo_sizes.setVal(dname, ROUND_UP(neigh_halo_sizes[dname], vlen)); + + } // domain dims in this grid. + } // domain dims. + } + // Make a buffer in both directions (send & receive). for (int bd = 0; bd < MPIBufs::nBufDirs; bd++) { @@ -498,7 +527,7 @@ namespace yask { // of main grid to read from or write to based on // the current neighbor being processed. IdxTuple copy_begin = gp->get_allocs(); - IdxTuple copy_end = gp->get_allocs(); + IdxTuple copy_end = gp->get_allocs(); // one past last! // Adjust along domain dims in this grid. for (auto& dim : _dims->_domain_dims.getDims()) { @@ -516,13 +545,15 @@ namespace yask { // Region to read from, i.e., data from inside // this rank's domain to be put into neighbor's - // halo. + // halo. So, use neighbor's halo sizes when + // calculating buffer size. if (bd == MPIBufs::bufSend) { // Neighbor is to the left. 
if (neigh_ofs == idx_t(MPIInfo::rank_prev)) { // Only read slice as wide as halo from beginning. + copy_begin[dname] = first_inner_idx[dname]; copy_end[dname] = first_inner_idx[dname] + neigh_halo_sizes[dname]; } @@ -531,6 +562,7 @@ namespace yask { // Only read slice as wide as halo before end. copy_begin[dname] = last_inner_idx[dname] + 1 - neigh_halo_sizes[dname]; + copy_end[dname] = last_inner_idx[dname] + 1; } // Else, this neighbor is in same posn as I am in this dim, @@ -538,6 +570,7 @@ namespace yask { } // Region to write to, i.e., into this rank's halo. + // So, use my halo sizes when calculating buffer sizes. else if (bd == MPIBufs::bufRecv) { // Neighbor is to the left. @@ -573,10 +606,12 @@ namespace yask { if (_dims->_domain_dims.lookup(dname)) { dsize = copy_end[dname] - copy_begin[dname]; - // Check whether size is multiple of vlen. + // Check whether alignment and size are multiple of vlen. auto vlen = _dims->_fold_pts[dname]; if (dsize % vlen != 0) vlen_mults = false; + if (imod_flr(copy_begin[dname], vlen) != 0) + vlen_mults = false; } // step dim? @@ -638,7 +673,7 @@ namespace yask { buf.last_pt = copy_last; buf.num_pts = buf_sizes; buf.name = bufname; - buf.has_all_vlen_mults = vlen_mults; + buf.vec_copy_ok = vlen_mults; TRACE_MSG("MPI buffer '" << buf.name << "' configured for rank at relative offsets " <<