Skip to content

Commit

Permalink
Merge pull request #278 from intel/develop
Browse files Browse the repository at this point in the history
Develop
  • Loading branch information
chuckyount authored Aug 1, 2023
2 parents 207357d + 36b279d commit 0384156
Show file tree
Hide file tree
Showing 20 changed files with 528 additions and 415 deletions.
10 changes: 7 additions & 3 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -266,18 +266,22 @@ combo-test: $(COMBO_TEST_EXEC)
$(RUN_PREFIX) $<

api-tests: compiler-api
$(MAKE) combo-api-tests
$(YC_MAKE) $@
$(YK_MAKE) $@
$(MAKE) combo-api-tests

unit-tests:
common-unit-tests:
$(MAKE) tuple-test
$(MAKE) combo-test

unit-tests: common-unit-tests
$(YC_MAKE) $@
$(YK_MAKE) $@

example-tests:
$(EX_MAKE) all-tests

all-tests: compiler-api unit-tests
all-tests: compiler-api common-unit-tests
$(YC_MAKE) $@
$(YK_MAKE) $@
$(MAKE) combo-api-tests
Expand Down
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -49,9 +49,9 @@ and Intel(R) graphics processors.
You'll get a warning when running `make` if one of these doesn't exist.
Everything will still work, but the generated code will be difficult to read.
Reading the generated code is only necessary for debug or curiosity.
* SWIG (3.0.12 or later; 4.0.0 or later recommended),
* SWIG (4.0.0 or later):
http://www.swig.org, for creating the Python interface.
* Python 3 (3.6.1 or later, recommended):
* Python 3 (3.6.1 or later):
https://www.python.org/downloads, for creating and using the Python interface.
* Doxygen (1.9.0 or later):
https://www.doxygen.nl, for creating updated API documentation.
Expand Down
14 changes: 9 additions & 5 deletions src/common/common.mk
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,6 @@ YC_EXEC := $(BIN_OUT_DIR)/$(YC_BASE).exe
YC_SRC_DIR := $(SRC_DIR)/compiler

# Tools.
CXX := icpx
SWIG := swig
PERL := perl
MKDIR := mkdir -p -v
Expand Down Expand Up @@ -178,6 +177,7 @@ YK_EXT_BASE := $(YK_BASE).$(YK_TAG)
YK_LIB := $(LIB_OUT_DIR)/lib$(YK_EXT_BASE)$(SO_SUFFIX)

# Compiler for building kernel lib and apps.
CXX := icpx
YK_CXX := $(CXX)
MPI_CXX := mpiicpc
ifeq ($(mpi),1)
Expand All @@ -190,8 +190,12 @@ ifeq ($(offload),1)
endif

# Base compiler flags for building kernel lib and apps.
ifeq ($(offload),1)
YK_CXXDBG := -gline-tables-only
else
YK_CXXDBG := -g
endif
YK_CXXOPT := -O3
YK_CXXDBG := -g
YK_CXXWARN := -Wall
YK_CXXFLAGS := -std=c++17 $(YK_CXXDBG) $(YK_CXXOPT) $(YK_CXXWARN) -I$(INC_DIR) $(EXTRA_YK_CXXFLAGS)
ifeq ($(mpi),1)
Expand All @@ -202,9 +206,9 @@ endif
YK_LIBS := -lrt
YK_LFLAGS := -Wl,-rpath=$(LIB_OUT_DIR) -L$(LIB_OUT_DIR) -l$(YK_EXT_BASE)

# Default number of ranks for running tests.
# 4 is good because it tests in-plane diagonal exchanges for 2D and 3D tests.
# 8 would test all exchanges for 3D tests.
# Default number of ranks for running MPI tests.
# 4 tests in-plane diagonal exchanges for 2D and 3D tests.
# 8 tests all exchanges for 3D tests.
ifneq ($(mpi),1)
ranks := 1
else
Expand Down
2 changes: 1 addition & 1 deletion src/common/common_utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ namespace yask {
// for numbers above 9 (at least up to 99).

// Format: "major.minor.patch[-alpha|-beta]".
const string version = "4.04.01";
const string version = "4.04.02";

string yask_get_version_string() {
return version;
Expand Down
5 changes: 4 additions & 1 deletion src/compiler/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -195,7 +195,10 @@ api-tests:
$(MAKE) cxx-yc-api-test-with-exception
$(MAKE) py-yc-api-test-with-exception

all-tests: api-tests
# no unit tests yet.
unit-tests:

all-tests: api-tests unit-tests

all:
$(MAKE) compiler
Expand Down
5 changes: 2 additions & 3 deletions src/compiler/lib/Cpp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1269,7 +1269,6 @@ namespace yask {
}

// Print loop-invariant data values for each VarPoint.
// TODO: fix warning from loading invariant real_vec_t outside of OMP device region.
string CppPreLoopPrintDataVisitor::visit(VarPoint* gp) {
assert(gp);

Expand All @@ -1281,9 +1280,9 @@ namespace yask {

// Not already loaded?
if (!_cvph.lookup_point_var(*gp)) {
string expr = _ph.read_from_point(_os, *gp);
string expr = string("get_copyable(") + _ph.read_from_point(_os, *gp) + ")";
string res;
make_next_temp_var(res, gp, "expr", "") << expr << _ph.get_line_suffix();
make_next_temp_var(res, gp, "expr", "", true) << expr << _ph.get_line_suffix();

// Save for future use.
_cvph.save_point_var(*gp, res);
Expand Down
6 changes: 4 additions & 2 deletions src/compiler/lib/Print.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,8 @@ namespace yask {
// If 'comment' is set, use it for the comment.
// Return stream to continue w/RHS.
ostream& PrintVisitorBase::make_next_temp_var(string& res, Expr* ex,
string prefix, string comment) {
string prefix, string comment,
bool use_auto_type) {
res = _ph.make_var_name(prefix);
if (ex) {
_temp_vars[ex] = res;
Expand All @@ -48,7 +49,8 @@ namespace yask {
}
if (comment.length())
_os << endl << " // " << res << " = " << comment << "." << endl;
_os << _ph.get_line_prefix() << _ph.get_var_type() << " " << res << " = ";
string vtype = use_auto_type ? "auto" : _ph.get_var_type();
_os << _ph.get_line_prefix() << vtype << " " << res << " = ";
return _os;
}

Expand Down
3 changes: 2 additions & 1 deletion src/compiler/lib/Print.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -184,7 +184,8 @@ namespace yask {
// If 'comment' is set, use it for the comment.
// Return stream to continue w/RHS.
virtual ostream& make_next_temp_var(string& res, Expr* ex,
string prefix, string comment);
string prefix, string comment,
bool use_auto_type = false);

public:
// os is used for printing intermediate results as needed.
Expand Down
2 changes: 1 addition & 1 deletion src/examples/swe_main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -572,7 +572,7 @@ int main(int argc, char** argv) {
auto ydur = ystats->get_elapsed_secs();
os << fixed << setprecision(2) <<
"Duration (s): " << dur << endl <<
"Rate (ms/step): " << (1e3 * dur / (nt+1)) << endl <<
"Rate (ms/step): " << (1e3 * dur / nt) << endl <<
"Time in YASK kernel (s): " << ydur <<
" (" << (100. * ydur / dur) << "%)" << endl <<
flush;
Expand Down
2 changes: 1 addition & 1 deletion src/examples/wave_eq_main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -355,7 +355,7 @@ int main(int argc, char** argv) {
auto ydur = ystats->get_elapsed_secs();
os << fixed << setprecision(2) <<
"Duration (s): " << dur << endl <<
"Rate (ms/step): " << (1e3 * dur / (nt+1)) << endl <<
"Rate (ms/step): " << (1e3 * dur / nt) << endl <<
"Time in YASK kernel (s): " << ydur <<
" (" << (100. * ydur / dur) << "%)" << endl <<
flush;
Expand Down
23 changes: 13 additions & 10 deletions src/kernel/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -551,7 +551,12 @@ $(YK_EXT_OBJ_DIR)/%.o: %.cpp $(YK_GEN_HEADERS) $(YK_INC_GLOB)
# C++-preprocessor rule.
%.i: %.cpp $(YK_GEN_HEADERS) $(YK_INC_GLOB)
g++ -E -CC $(YK_CXX_INCFLAGS) $< > $@
@ls -l $@
@ls -l `realpath $@`

# Asm rule.
%.s: %.cpp $(YK_GEN_HEADERS) $(YK_INC_GLOB)
$(CXX_PREFIX) $(YK_CXXCMD) $(YK_CXXFLAGS2) -x c++ -S -o $@ $<
@ls -l `realpath $@`

######## Primary targets.
# NB: must set stencil and arch make vars to generate the desired YASK kernel.
Expand Down Expand Up @@ -712,19 +717,19 @@ py-api-no-yc:
# Make this target before rebuilding YASK with any new parameters.
clean:
rm -fr $(YK_OUT_DIR)
rm -fv *.s
- find . $(SRC_DIR) -name '*.s' -print -delete
- find . $(COMM_DIR) $(COEFF_DIR) -name '*.*.o' -print -delete
- find . $(COMM_DIR) $(COEFF_DIR) -name '*.optrpt' -print -delete

# Remove executables, libs, etc.
# Also remove logs from kernel dir, which are most likely from testing.
# Remove executables, libs, reports, etc.
realclean: clean
rm -fv $(YK_LIB) $(YK_EXEC) $(MAKE_REPORT_FILE)
rm -fv $(YK_PY_MOD)* $(YK_PY_LIB)
rm -fv $(YK_API_TEST_EXEC) $(YK_API_TEST_EXEC_WITH_EXCEPTION)
rm -fv $(BUILD_OUT_DIR)/*report.txt
- find . -name '*.pyc' -print -delete
- find . -name '*~' -print -delete
rm -fv ./*~
- find $(YASK_DIR) -name '*.pyc' -print -delete
- find $(SRC_DIR) $(INC_DIR) $(YASK_DIR) $(UTILS_DIR) -name '*~' -print -delete

echo-settings:
@echo "Build environment, `date`"; \
Expand Down Expand Up @@ -800,11 +805,9 @@ echo-settings:
uname -a

# Print stats on inner SIMD loops from asm file.
code-stats: $(YK_LIB)
$(call MK_DIR,$(YK_OBJ_DIR))
$(CXX_PREFIX) $(YK_CXXCMD) $(YK_CXXFLAGS2) -x c++ -S -o $(YK_OBJ_DIR)/factory.s $(YK_LIB_SRC_DIR)/factory.cpp
code-stats: $(YK_LIB_SRC_DIR)/factory.s
@echo "Code stats for stencil computation:"
$(PERL) $(VIEW_ASM) -p -l -f='calc_' $(YK_OBJ_DIR)/factory.s
$(PERL) $(VIEW_ASM) -p -l -f='calc_vectors' $(YK_LIB_SRC_DIR)/factory.s

# Print some usage info.
help:
Expand Down
18 changes: 12 additions & 6 deletions src/kernel/lib/generic_var.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -154,10 +154,13 @@ namespace yask {
// Also update the version on the device.
#ifdef USE_OFFLOAD_NO_USM
auto devn = KernelEnv::_omp_devn;


auto cval = get_copyable(val);
_Pragma("omp target teams distribute parallel for device(devn)")
for (idx_t i = 0; i < ne; i++)
elems[i] = val;
for (idx_t i = 0; i < ne; i++) {
T kval = cval;
elems[i] = kval;
}
#endif
}
}
Expand All @@ -184,10 +187,13 @@ namespace yask {
// Also update the version on the device to the same sequence.
#ifdef USE_OFFLOAD_NO_USM
auto devn = KernelEnv::_omp_devn;


auto cseed = get_copyable(seed);
_Pragma("omp target teams distribute parallel for device(devn)")
for (idx_t i = 0; i < ne; i++)
elems[i] = seed * T(imod_flr(i, wrap) + 1);
for (idx_t i = 0; i < ne; i++) {
T kseed = cseed;
elems[i] = kseed * T(imod_flr(i, wrap) + 1);
}
#endif
}
}
Expand Down
6 changes: 3 additions & 3 deletions src/kernel/lib/halo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -288,7 +288,7 @@ namespace yask {
if (send_vec_ok)
nelems = gb.get_vecs_in_slice(bufp, first, last, use_offload);
else
nelems = gb.get_elements_in_slice(bufp, first, last, use_offload);
nelems = gb.get_elements_in_slice_void(bufp, first, last, use_offload);
auto nb = nelems * get_element_bytes();
bufp += nb;
npbytes += nb;
Expand Down Expand Up @@ -418,9 +418,9 @@ namespace yask {
(use_offload ? " on device" : " on host"));
idx_t nelems = 0;
if (recv_vec_ok)
nelems = gp->set_vecs_in_slice(bufp, first, last, use_offload);
nelems = gb.set_vecs_in_slice(bufp, first, last, use_offload);
else
nelems = gp->set_elements_in_slice(bufp, first, last, use_offload);
nelems = gb.set_elements_in_slice_void(bufp, first, last, use_offload);
auto nb = nelems * get_element_bytes();
bufp += nb;
npbytes += nb;
Expand Down
14 changes: 14 additions & 0 deletions src/kernel/lib/realv.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -575,6 +575,20 @@ namespace yask {

}; // real_vec_t.

// Function for getting copyable data for OMP offload.
#if 1

// Generic case: for most types the value itself is what gets copied,
// so hand back the by-value argument unchanged.
template <class T>
inline auto get_copyable(T value) {
    return value;
}

// Specialization for 'real_vec_t': return only its 'u' member
// (the data union) rather than the whole vector object.
template <>
inline auto get_copyable(real_vec_t vec) {
    return vec.u;
}
#else
#define get_copyable(v) v
#endif

// Output using '<<'.
inline std::ostream& operator<<(std::ostream& os, const real_vec_t& rn) {
rn.print_reals(os, false);
Expand Down
7 changes: 3 additions & 4 deletions src/kernel/lib/settings.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -397,14 +397,13 @@ namespace yask {
use_shm));
parser.add_option(make_shared<command_line_parser::bool_option>
("force_scalar_exchange",
"[Debug] Do not allow vectorized halo exchanges.",
"[Debug] Do not allow vectorized halo packing and unpacking.",
force_scalar_exchange));
#endif
#endif
parser.add_option(make_shared<command_line_parser::bool_option>
("force_scalar",
"[Debug] Evaluate every var point with scalar stencil operations "
"and exchange halos using only scalar packing and unpacking.",
"[Debug] Do not allow vector stencil computation.",
force_scalar));
parser.add_option(make_shared<command_line_parser::int_option>
("max_threads",
Expand Down Expand Up @@ -941,7 +940,7 @@ namespace yask {
}
os << " Note: only the nano-block size in the '" <<
_dims->_stencil_dims.get_dim_name(_bind_posn) << "' dimension may be used at run-time\n"
" because block-thread binding is enabled on " << num_inner_threads << " block threads.\n";
" because inner-thread data-binding is enabled on " << num_inner_threads << " inner threads.\n";
}

#ifdef USE_TILING
Expand Down
2 changes: 1 addition & 1 deletion src/kernel/lib/settings.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -720,7 +720,7 @@ namespace yask {
int get_num_comp_threads(int& outer_threads, int& inner_threads) const;

// Set number of threads to use for a mega-block.
// Enable nested OMP if there are >1 block threads,
// Enable nested OMP if there are >1 inner threads,
// disable otherwise.
// Return number of threads.
// Do nothing and return 0 if not properly initialized.
Expand Down
6 changes: 3 additions & 3 deletions src/kernel/lib/stencil_calc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -200,7 +200,7 @@ namespace yask {
TRACE_MSG("reqd part '" << rp->get_name() << "': " <<
mb_idxs3.make_range_str(true) <<
" via outer thread " << outer_thread_idx <<
" with " << nbt << " block thread(s) bound to data...");
" with " << nbt << " inner thread(s) bound to data...");

// Start threads within a block. Each of these threads
// will eventually work on a separate nano-block. This
Expand All @@ -210,7 +210,7 @@ namespace yask {
assert(omp_get_num_threads() == nbt);
int inner_thread_idx = omp_get_thread_num();

// Run the micro-block loops on all block threads and
// Run the micro-block loops on all inner threads and
// call calc_nano_block() only by the designated
// thread for the given slab index in the binding
// dim. This is an explicit replacement for "normal"
Expand Down Expand Up @@ -249,7 +249,7 @@ namespace yask {
TRACE_MSG("reqd part '" << rp->get_name() << "': " <<
mb_idxs3.make_range_str(true) <<
" via outer thread " << outer_thread_idx <<
" with " << nbt << " block thread(s) NOT bound to data...");
" with " << nbt << " inner thread(s) NOT bound to data...");

// Call calc_nano_block() with a different thread for
// each nano-block using standard OpenMP scheduling.
Expand Down
Loading

0 comments on commit 0384156

Please sign in to comment.