From 31aed98a45c010df5226bedefcc910f1b119c1f5 Mon Sep 17 00:00:00 2001 From: Henry Le Berre Date: Tue, 4 Jun 2024 11:27:59 +0200 Subject: [PATCH] GPU-Aware MPI on OLCF Frontier and Combined weak- & strong-scaling case (#448) --- .gitignore | 2 + docs/documentation/case.md | 33 +++---- docs/documentation/running.md | 2 - examples/2D_whale_bubble_annulus/case.py | 3 +- examples/3D_weak_scaling/README.md | 24 ----- examples/3D_weak_scaling/analyze.sh | 5 -- examples/scaling/README.md | 33 +++++++ examples/scaling/build.sh | 4 + examples/{3D_weak_scaling => scaling}/case.py | 62 ++++++++----- examples/scaling/export.py | 90 +++++++++++++++++++ examples/scaling/submit.sh | 73 +++++++++++++++ src/simulation/m_checker.fpp | 6 +- src/simulation/m_global_parameters.fpp | 5 +- src/simulation/m_mpi_proxy.fpp | 64 ++++++------- src/simulation/m_start_up.fpp | 2 +- toolchain/bootstrap/modules.sh | 5 ++ toolchain/mfc/args.py | 24 ++--- toolchain/mfc/bench.py | 5 +- toolchain/mfc/build.py | 2 +- toolchain/mfc/run/case_dicts.py | 2 +- toolchain/mfc/run/input.py | 45 ++++++++-- toolchain/mfc/run/run.py | 2 +- toolchain/mfc/test/case.py | 2 +- toolchain/modules | 4 +- toolchain/templates/bridges2.mako | 5 +- toolchain/templates/default.mako | 22 ++--- toolchain/templates/delta.mako | 2 +- toolchain/templates/frontier.mako | 21 +++-- toolchain/templates/phoenix.mako | 5 +- toolchain/templates/summit.mako | 13 ++- 30 files changed, 398 insertions(+), 169 deletions(-) delete mode 100644 examples/3D_weak_scaling/README.md delete mode 100755 examples/3D_weak_scaling/analyze.sh create mode 100644 examples/scaling/README.md create mode 100644 examples/scaling/build.sh rename examples/{3D_weak_scaling => scaling}/case.py (82%) create mode 100644 examples/scaling/export.py create mode 100644 examples/scaling/submit.sh diff --git a/.gitignore b/.gitignore index 65e502eda..7881901f6 100644 --- a/.gitignore +++ b/.gitignore @@ -53,6 +53,8 @@ examples/*/viz/ examples/*.jpg examples/*.png examples/*/workloads/ +examples/*/run-*/ +examples/*/logs/ workloads/ benchmarks/*batch/*/ diff --git a/docs/documentation/case.md b/docs/documentation/case.md index 6b38595e0..41d57afc2 100644 --- a/docs/documentation/case.md +++ b/docs/documentation/case.md @@ -30,40 +30,39 @@ This is particularly useful when computations are done in Python to generate the ## (Optional) Accepting command line arguments -Input files can accept **positional** command line arguments, forwarded by `mfc.sh run`. -Consider this example from the 3D_weak_scaling case: +Input files can accept command line arguments, forwarded by `mfc.sh run`. +Consider this example from the `scaling` case: ```python import argparse parser = argparse.ArgumentParser( - prog="3D_weak_scaling", - description="This MFC case was created for the purposes of weak scaling.", + prog="scaling", + description="Weak- and strong-scaling benchmark case.", formatter_class=argparse.ArgumentDefaultsHelpFormatter) -parser.add_argument("dict", type=str, metavar="DICT", help=argparse.SUPPRESS) -parser.add_argument("gbpp", type=int, metavar="MEM", default=16, help="Adjusts the problem size per rank to fit into [MEM] GB of GPU memory") +parser.add_argument("dict", type=str, metavar="DICT") +parser.add_argument("-s", "--scaling", type=str, metavar="SCALING", choices=["weak", "strong"], help="Whether weak- or strong-scaling is being exercised.") # Your parsed arguments are here -ARGS = vars(parser.parse_args()) +args = parser.parse_args() ``` The first argument is always a JSON string representing `mfc.sh run`'s internal state. It contains all the runtime information you might want from the build/run system. -We hide it from the help menu with `help=argparse.SUPPRESS` since it is not meant to be passed in by users. -You can add as many additional positional arguments as you may need. +You can add as many additional arguments as you may need. To run such a case, use the following format: ```shell -./mfc.sh run +./mfc.sh run -- ``` -For example, to run the 3D_weak_scaling case with `gbpp=2`: +For example, to run the `scaling` case in "weak-scaling" mode: ```shell -./mfc.sh run examples/3D_weak_scaling/case.py 2 -t pre_process -j 8 +./mfc.sh run examples/scaling/case.py -t pre_process -j 8 -- --scaling weak ``` ## Parameters @@ -87,11 +86,15 @@ Definition of the parameters is described in the following subsections. ### 1. Runtime -| Parameter | Type | Description | -| ---: | :----: | :--- | -| `run_time_info` | Logical | Output run-time information | +| Parameter | Type | Description | +| ---: | :----: | :--- | +| `run_time_info` | Logical | Output run-time information | +| `rdma_mpi` | Logical | (GPUs) Enable RDMA for MPI communication. | - `run_time_info` generates a text file that includes run-time information including the CFL number(s) at each time-step. +- `rdma_mpi` optimizes data transfers between GPUs using Remote Direct Memory Access (RDMA). +The underlying MPI implementation and communication infrastructure must support this +feature, detecting GPU pointers and performing RDMA accordingly. ### 2. Computational Domain diff --git a/docs/documentation/running.md b/docs/documentation/running.md index b0aebf6d2..05eb5015d 100644 --- a/docs/documentation/running.md +++ b/docs/documentation/running.md @@ -24,8 +24,6 @@ several supercomputer clusters, both interactively and through batch submission. > > If `-c ` is left unspecified, it defaults to `-c default`. -Additional flags can be appended to the MPI executable call using the `-f` (i.e `--flags`) option. - Please refer to `./mfc.sh run -h` for a complete list of arguments and options, along with their defaults. ## Interactive Execution diff --git a/examples/2D_whale_bubble_annulus/case.py b/examples/2D_whale_bubble_annulus/case.py index 4f0449afd..6ac44fbfa 100755 --- a/examples/2D_whale_bubble_annulus/case.py +++ b/examples/2D_whale_bubble_annulus/case.py @@ -193,6 +193,5 @@ 'Mono(1)%pulse' : 1, 'Mono(1)%mag' : 1., 'Mono(1)%length' : 0.2, - 'cu_mpi' : 'F', - + 'rdma_mpi' : 'F', })) diff --git a/examples/3D_weak_scaling/README.md b/examples/3D_weak_scaling/README.md deleted file mode 100644 index 137316954..000000000 --- a/examples/3D_weak_scaling/README.md +++ /dev/null @@ -1,24 +0,0 @@ -# 3D Weak Scaling - -The [**3D_weak_scaling**](case.py) case depends on two parameters: - -- **The number of MPI ranks** (_procs_): As _procs_ increases, the problem -size per rank remains constant. _procs_ is determined using information provided -to the case file by `mfc.sh run`. - -- **GPU memory usage per rank** (_gbpp_): As _gbpp_ increases, the problem -size per rank increases and the number of timesteps decreases so that wall times -consistent. _gbpp_ is a user-defined optional argument to the [case.py](case.py) -file. It can be specified right after the case filepath when invoking `mfc.sh run`. - -Weak scaling benchmarks can be produced by keeping _gbpp_ constant and varying _procs_. - -For example, to run a weak scaling test that uses ~4GB of GPU memory per rank -on 8 2-rank nodes with case optimization, one could: - -```shell -./mfc.sh run examples/3D_weak_scaling/case.py 4 -t pre_process simulation \ - -e batch -p mypartition -N 8 -n 2 -w "01:00:00" -# "MFC Weak Scaling" \ - --case-optimization -j 32 -``` - diff --git a/examples/3D_weak_scaling/analyze.sh b/examples/3D_weak_scaling/analyze.sh deleted file mode 100755 index a6cd72428..000000000 --- a/examples/3D_weak_scaling/analyze.sh +++ /dev/null @@ -1,5 +0,0 @@ -# This script is ran from the 3D_weak_scaling case directory after running -# MFC with --omni -n . To analyze, run chmod u+x ./analyze.sh followed -# by ./analyze.sh - -omniperf analyze -p workloads/$1/mi200 --metric 0 7.1.5 7.1.6 7.1.7 7.1.8 7.1.9 16.3.1 16.3.2 16.3.7 17.3.2 17.3.3 17.3.8 diff --git a/examples/scaling/README.md b/examples/scaling/README.md new file mode 100644 index 000000000..88ccbe8d4 --- /dev/null +++ b/examples/scaling/README.md @@ -0,0 +1,33 @@ +# Strong- & Weak-scaling + +The [**Scaling**](case.py) case can exercise both weak- and strong-scaling. It +adjusts itself depending on the number of requested ranks. + +This directory also contains a collection of scripts used to test strong-scaling +on OLCF Frontier. They required modifying MFC to collect some metrics but are +meant to serve as a reference to users wishing to run similar experiments. + +## Weak Scaling + +Pass `--scaling weak`. The `--memory` option controls (approximately) how much +memory each rank should use, in Gigabytes. The number of cells in each dimension +is then adjusted according to the number of requested ranks and an approximation +for the relation between cell count and memory usage. The problem size increases +linearly with the number of ranks. + +## Strong Scaling + +Pass `--scaling strong`. The `--memory` option controls (approximately) how much +memory should be used in total during simulation, across all ranks, in Gigabytes. +The problem size remains constant as the number of ranks increases. + +## Example + +For example, to run a weak-scaling test that uses ~4GB of GPU memory per rank +on 8 2-rank nodes with case optimization, one could: + +```shell +./mfc.sh run examples/scaling/case.py -t pre_process simulation \ + -e batch -p mypartition -N 8 -n 2 -w "01:00:00" -# "MFC Weak Scaling" \ + --case-optimization -j 32 -- --scaling weak --memory 4 +``` diff --git a/examples/scaling/build.sh b/examples/scaling/build.sh new file mode 100644 index 000000000..0d7dde559 --- /dev/null +++ b/examples/scaling/build.sh @@ -0,0 +1,4 @@ +#!/bin/bash + +./mfc.sh build -t pre_process simulation --case-optimization -i examples/scaling/case.py \ + -j 8 --gpu --mpi --no-debug -- -s strong -m 512 \ No newline at end of file diff --git a/examples/3D_weak_scaling/case.py b/examples/scaling/case.py similarity index 82% rename from examples/3D_weak_scaling/case.py rename to examples/scaling/case.py index 71bfbd31a..b9861b7e1 100644 --- a/examples/3D_weak_scaling/case.py +++ b/examples/scaling/case.py @@ -1,28 +1,45 @@ #!/usr/bin/env python3 -# Case file contributed by Anand Radhakrishnan and modified by Henry Le Berre -# for integration as a weak scaling benchmark for MFC. - -import json, math, argparse +import sys, json, math, typing, argparse parser = argparse.ArgumentParser( - prog="3D_weak_scaling", - description="This MFC case was created for the purposes of weak scaling.", + prog="scaling", + description="Weak- and strong-scaling benchmark case.", formatter_class=argparse.ArgumentDefaultsHelpFormatter) -parser.add_argument("dict", type=str, metavar="DICT", help=argparse.SUPPRESS) -parser.add_argument("gbpp", type=int, metavar="MEM", default=16, help="Adjusts the problem size per rank to fit into [MEM] GB of GPU memory per GPU.") +parser.add_argument("dict", type=str, metavar="DICT") +parser.add_argument("-s", "--scaling", type=str, metavar="SCALING", choices=["weak", "strong"], help="Whether weak- or strong-scaling is being exercised.") +parser.add_argument("-m", "--memory", type=int, metavar="MEMORY", help="Weak scaling: memory per rank in GB. Strong scaling: global memory in GB. Used to determine cell count.") +parser.add_argument("-f", "--fidelity", type=str, metavar="FIDELITY", choices=["ideal", "exact"], default="ideal") +parser.add_argument("--rdma_mpi", type=str, metavar="FIDELITY", choices=["T", "F"], default="F") +parser.add_argument("--n-steps", type=int, metavar="N", default=None) + +args = parser.parse_args() + +if args.scaling is None: + parser.print_help() + sys.exit(1) + +DICT = json.loads(args.dict) -ARGS = vars(parser.parse_args()) -DICT = json.loads(ARGS["dict"]) +# \approx The number of cells per GB of memory. The exact value is not important. +cpg = 8000000 / 16.0 +# Number of ranks. +nranks = DICT["nodes"] * DICT["tasks_per_node"] -ppg = 8000000 / 16.0 -procs = DICT["nodes"] * DICT["tasks_per_node"] -ncells = math.floor(ppg * procs * ARGS["gbpp"]) -s = math.floor((ncells / 2.0) ** (1/3)) -Nx, Ny, Nz = 2*s, s, s +def nxyz_from_ncells(ncells: float) -> typing.Tuple[int, int, int]: + s = math.floor((ncells / 2.0) ** (1/3)) + return 2*s, s, s -# athmospheric pressure - Pa (used as reference value) +if args.scaling == "weak": + if args.fidelity == "ideal": + raise RuntimeError("ask ben") + else: + Nx, Ny, Nz = nxyz_from_ncells(cpg * nranks * args.memory) +else: + Nx, Ny, Nz = nxyz_from_ncells(cpg * args.memory) + +# Atmospheric pressure - Pa (used as reference value) patm = 101325 # Initial Droplet Diameter / Reference length - m @@ -162,7 +179,8 @@ AS = int( NtA // SF + 1 ) # Nt = total number of steps. Note that Nt >= NtA (so at least tendA is completely simulated) -Nt = AS * SF +Nt = args.n_steps or (AS * SF) +SF = min( SF, Nt ) # total simulation time - s. Note that tend >= tendA tend = Nt * dt @@ -171,6 +189,7 @@ print(json.dumps({ # Logistics ================================================ 'run_time_info' : 'T', + 'rdma_mpi' : args.rdma_mpi, # ========================================================== # Computational Domain Parameters ========================== @@ -186,8 +205,8 @@ 'cyl_coord' : 'F', 'dt' : dt, 't_step_start' : 0, - 't_step_stop' : int(5000*16.0/ARGS["gbpp"]), - 't_step_save' : int(1000*16.0/ARGS["gbpp"]), + 't_step_stop' : Nt, + 't_step_save' : SF, # ========================================================== # Simulation Algorithm Parameters ========================== @@ -201,7 +220,7 @@ 'time_stepper' : 3, 'weno_order' : 3, 'weno_eps' : 1.0E-16, - 'weno_Re_flux' : 'F', + 'weno_Re_flux' : 'F', 'weno_avg' : 'F', 'mapped_weno' : 'T', 'riemann_solver' : 2, @@ -283,6 +302,3 @@ 'fluid_pp(2)%pi_inf' : gama*pia/(gama-1), # ========================================================== })) - -# ============================================================================== - diff --git a/examples/scaling/export.py b/examples/scaling/export.py new file mode 100644 index 000000000..ba7555887 --- /dev/null +++ b/examples/scaling/export.py @@ -0,0 +1,90 @@ +import re, os, csv, glob, statistics + +from dataclasses import dataclass, fields + +CDIR=os.path.abspath(os.path.join("examples", "scaling")) +LDIR=os.path.join(CDIR, "logs") + +def get_num(s: str) -> float: + try: + return float(re.findall(r"[0-9]+\.[0-9]+(?:E[-+][0-9]+)?", s, re.MULTILINE)[0]) + except: + return None + +def get_nums(arr): + return {get_num(_) for _ in arr if get_num(_)} + +@dataclass(frozen=True, order=True) +class Configuration: + nodes: int + mem: int + rdma_mpi: bool + +@dataclass +class Result: + ts_avg: float + mpi_avg: float + init_t: float + sim_t: float + +runs = {} + +for logpath in glob.glob(os.path.join(LDIR, "run-*-sim*")): + logdata = open(logpath, "r").read() + + tss = get_nums(re.findall(r'^ TS .+', logdata, re.MULTILINE)) + mpis = get_nums(re.findall(r'^ MPI .+', logdata, re.MULTILINE)) + try: + perf = get_num(re.findall(r"^ Performance: .+", logdata, re.MULTILINE)[0]) + except: + perf = 'N/A' + + if len(tss) == 0: tss = [-1.0] + if len(mpis) == 0: mpis = [-1.0] + + pathels = os.path.relpath(logpath, LDIR).split('-') + + runs[Configuration( + nodes=int(pathels[1]), + mem=int(pathels[2]), + rdma_mpi=pathels[3] == 'T' + )] = Result( + ts_avg=statistics.mean(tss), + mpi_avg=statistics.mean(mpis), + init_t=get_num(re.findall(r"Init took .+", logdata, re.MULTILINE)[0]), + sim_t=get_num(re.findall(r"sim_duration .+", logdata, re.MULTILINE)[0]), + ) + +with open(os.path.join(CDIR, "export.csv"), "w") as f: + writer = csv.writer(f, delimiter=',') + writer.writerow([ + _.name for _ in fields(Configuration) + fields(Result) + ]) + + for cfg in sorted(runs.keys()): + writer.writerow( + [ getattr(cfg, _.name) for _ in fields(Configuration) ] + + [ getattr(runs[cfg], _.name) for _ in fields(Result) ] + ) + +for rdma_mpi in (False, True): + with open( + os.path.join(CDIR, f"strong_scaling{'-rdma_mpi' if rdma_mpi else ''}.csv"), + "w" + ) as f: + writer = csv.writer(f, delimiter=',') + + for nodes in sorted({ + _.nodes for _ in runs.keys() if _.rdma_mpi == rdma_mpi + }): + row = (nodes*8,) + for mem in sorted({ + _.mem for _ in runs.keys() if _.nodes == nodes and _.rdma_mpi == rdma_mpi + }, reverse=True): + ref = runs[Configuration(nodes=sorted({ + _.nodes for _ in runs.keys() if _.rdma_mpi == rdma_mpi + })[0], mem=mem, rdma_mpi=rdma_mpi)] + run = runs[Configuration(nodes=nodes, mem=mem, rdma_mpi=rdma_mpi)] + row = (*row,run.sim_t,ref.sim_t/nodes) + + writer.writerow(row) diff --git a/examples/scaling/submit.sh b/examples/scaling/submit.sh new file mode 100644 index 000000000..1d3ed373f --- /dev/null +++ b/examples/scaling/submit.sh @@ -0,0 +1,73 @@ +#!/bin/bash + +mkdir -p examples/scaling/logs + +for N in 1 2 4 8 16 32; do + echo -n "N=$N: " + sbatch < examples/scaling/logs/run-\$slug-pre.out 2>&1 + fi + + for cu_mpi in F T; do + + slug="$N-\$M-\$cu_mpi" + echo "Running \$slug" + + # Note: `time` is not used for performance measurement, only for monitoring + # the job's progress. + time ./mfc.sh run \$case_dir/case.py -c frontier -n 8 -N $N \ + -t simulation --case-optimization --no-build -# run-\$slug-sim \ + -- --scaling strong --memory \$M --cu_mpi \$cu_mpi --n-steps 20 \ + > examples/scaling/logs/run-\$slug-sim.out 2>&1 + + done + + rm -rf \$case_dir + +done + +echo "End @ $(date)" + +EOT +done \ No newline at end of file diff --git a/src/simulation/m_checker.fpp b/src/simulation/m_checker.fpp index 4848a97ad..bb1f9ba41 100644 --- a/src/simulation/m_checker.fpp +++ b/src/simulation/m_checker.fpp @@ -29,9 +29,9 @@ contains bub_fac = 0 if (bubbles .and. (num_fluids == 1)) bub_fac = 1 -#if !(defined(MFC_OpenACC) && defined(__PGI)) - if (cu_mpi) then - call s_mpi_abort('Unsupported value of cu_mpi. Exiting ...') +#if !defined(MFC_OpenACC) && !(defined(__PGI) || defined(_CRAYFTN)) + if (rdma_mpi) then + call s_mpi_abort('Unsupported value of rdma_mpi. Exiting ...') end if #endif diff --git a/src/simulation/m_global_parameters.fpp b/src/simulation/m_global_parameters.fpp index 255c4beee..0f5d3df5a 100644 --- a/src/simulation/m_global_parameters.fpp +++ b/src/simulation/m_global_parameters.fpp @@ -438,7 +438,8 @@ module m_global_parameters real(kind(0d0)) :: mytime !< Current simulation time real(kind(0d0)) :: finaltime !< Final simulation time - logical :: weno_flat, riemann_flat, cu_mpi + logical :: weno_flat, riemann_flat, rdma_mpi + #ifdef CRAY_ACC_WAR @:CRAY_DECLARE_GLOBAL(type(pres_field), dimension(:), pb_ts) @@ -506,7 +507,7 @@ contains hypoelasticity = .false. weno_flat = .true. riemann_flat = .true. - cu_mpi = .false. + rdma_mpi = .false. bc_x%beg = dflt_int; bc_x%end = dflt_int bc_y%beg = dflt_int; bc_y%end = dflt_int diff --git a/src/simulation/m_mpi_proxy.fpp b/src/simulation/m_mpi_proxy.fpp index d38a2ce2d..6c44e2542 100644 --- a/src/simulation/m_mpi_proxy.fpp +++ b/src/simulation/m_mpi_proxy.fpp @@ -194,12 +194,12 @@ contains call MPI_BCAST(${VAR}$, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr) #:endfor - #:for VAR in [ 'run_time_info','cyl_coord', 'adv_alphan', 'mpp_lim', & - & 'mapped_weno', 'mp_weno', 'cu_mpi', 'weno_flat', 'riemann_flat', & - & 'weno_Re_flux', 'alt_soundspeed', 'null_weights', 'mixture_err', & - & 'parallel_io', 'hypoelasticity', 'bubbles', 'polytropic', & - & 'polydisperse', 'qbmm', 'monopole', 'probe_wrt', 'integral_wrt', & - & 'prim_vars_wrt', 'weno_avg', 'file_per_process', 'relax', & + #:for VAR in [ 'run_time_info','cyl_coord', 'adv_alphan', 'mpp_lim', & + & 'mapped_weno', 'mp_weno', 'rdma_mpi', 'weno_flat', 'riemann_flat', & + & 'weno_Re_flux', 'alt_soundspeed', 'null_weights', 'mixture_err', & + & 'parallel_io', 'hypoelasticity', 'bubbles', 'polytropic', & + & 'polydisperse', 'qbmm', 'monopole', 'probe_wrt', 'integral_wrt', & + & 'prim_vars_wrt', 'weno_avg', 'file_per_process', 'relax', & & 'adv_n', 'adap_dt', 'ib', 'bodyForces', 'bf_x', 'bf_y', 'bf_z' ] call MPI_BCAST(${VAR}$, 1, MPI_LOGICAL, 0, MPI_COMM_WORLD, ierr) #:endfor @@ -1045,9 +1045,9 @@ contains #:endfor ! Send/Recv - #:for cu_mpi in [False, True] - if (cu_mpi .eqv. ${'.true.' if cu_mpi else '.false.'}$) then - #:if cu_mpi + #:for rdma_mpi in [False, True] + if (rdma_mpi .eqv. ${'.true.' if rdma_mpi else '.false.'}$) then + #:if rdma_mpi !$acc host_data use_device(q_cons_buff_recv, q_cons_buff_send, ib_buff_recv, ib_buff_send) #:else !$acc update host(q_cons_buff_send, ib_buff_send) @@ -1058,7 +1058,7 @@ contains q_cons_buff_recv(0), buffer_count, MPI_DOUBLE_PRECISION, src_proc, recv_tag, & MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) - #:if cu_mpi + #:if rdma_mpi !$acc end host_data !$acc wait #:else @@ -1294,7 +1294,7 @@ contains !call MPI_Barrier(MPI_COMM_WORLD, ierr) #if defined(MFC_OpenACC) - if (cu_mpi) then + if (rdma_mpi) then !$acc host_data use_device( ib_buff_recv, ib_buff_send, ib_buff_recv, ib_buff_send) ! Send/receive buffer to/from bc_x%end/bc_x%beg @@ -1344,7 +1344,7 @@ contains !call MPI_Barrier(MPI_COMM_WORLD, ierr) #if defined(MFC_OpenACC) - if (cu_mpi) then + if (rdma_mpi) then !$acc host_data use_device( ib_buff_recv, ib_buff_send ) ! Send/receive buffer to/from bc_x%end/bc_x%beg @@ -1380,7 +1380,7 @@ contains end if #if defined(MFC_OpenACC) - if (cu_mpi .eqv. .false.) then + if (rdma_mpi .eqv. .false.) then !$acc update device(ib_buff_recv) end if #endif @@ -1416,7 +1416,7 @@ contains !call MPI_Barrier(MPI_COMM_WORLD, ierr) #if defined(MFC_OpenACC) - if (cu_mpi) then + if (rdma_mpi) then !$acc host_data use_device( ib_buff_recv, ib_buff_send ) ! Send/receive buffer to/from bc_x%end/bc_x%beg @@ -1464,7 +1464,7 @@ contains !call MPI_Barrier(MPI_COMM_WORLD, ierr) #if defined(MFC_OpenACC) - if (cu_mpi) then + if (rdma_mpi) then !$acc host_data use_device( ib_buff_recv, ib_buff_send ) ! Send/receive buffer to/from bc_x%end/bc_x%beg @@ -1499,7 +1499,7 @@ contains end if - if (cu_mpi .eqv. .false.) then + if (rdma_mpi .eqv. .false.) then !$acc update device(ib_buff_recv) end if @@ -1538,7 +1538,7 @@ contains !call MPI_Barrier(MPI_COMM_WORLD, ierr) #if defined(MFC_OpenACC) - if (cu_mpi) then + if (rdma_mpi) then !$acc host_data use_device( ib_buff_recv, ib_buff_send ) ! Send/receive buffer to/from bc_x%end/bc_x%beg @@ -1589,7 +1589,7 @@ contains !call MPI_Barrier(MPI_COMM_WORLD, ierr) #if defined(MFC_OpenACC) - if (cu_mpi) then + if (rdma_mpi) then !$acc host_data use_device( ib_buff_recv, ib_buff_send ) ! Send/receive buffer to/from bc_x%end/bc_x%beg @@ -1626,7 +1626,7 @@ contains end if #if defined(MFC_OpenACC) - if (cu_mpi .eqv. .false.) then + if (rdma_mpi .eqv. .false.) then !$acc update device(ib_buff_recv) end if #endif @@ -1664,7 +1664,7 @@ contains !call MPI_Barrier(MPI_COMM_WORLD, ierr) #if defined(MFC_OpenACC) - if (cu_mpi) then + if (rdma_mpi) then !$acc host_data use_device( ib_buff_recv, ib_buff_send ) ! Send/receive buffer to/from bc_x%end/bc_x%beg @@ -1715,7 +1715,7 @@ contains !call MPI_Barrier(MPI_COMM_WORLD, ierr) #if defined(MFC_OpenACC) - if (cu_mpi) then + if (rdma_mpi) then !$acc host_data use_device( ib_buff_recv, ib_buff_send ) ! Send/receive buffer to/from bc_x%end/bc_x%beg @@ -1752,7 +1752,7 @@ contains end if #if defined(MFC_OpenACC) - if (cu_mpi .eqv. .false.) then + if (rdma_mpi .eqv. .false.) then !$acc update device(ib_buff_recv) end if #endif @@ -1793,7 +1793,7 @@ contains !call MPI_Barrier(MPI_COMM_WORLD, ierr) #if defined(MFC_OpenACC) - if (cu_mpi) then + if (rdma_mpi) then !$acc host_data use_device( ib_buff_recv, ib_buff_send ) ! Send/receive buffer to/from bc_x%end/bc_x%beg @@ -1844,7 +1844,7 @@ contains !call MPI_Barrier(MPI_COMM_WORLD, ierr) #if defined(MFC_OpenACC) - if (cu_mpi) then + if (rdma_mpi) then !$acc host_data use_device( ib_buff_recv, ib_buff_send ) ! Send/receive buffer to/from bc_x%end/bc_x%beg @@ -1881,7 +1881,7 @@ contains end if #if defined(MFC_OpenACC) - if (cu_mpi .eqv. .false.) then + if (rdma_mpi .eqv. .false.) then !$acc update device(ib_buff_recv) end if #endif @@ -1920,7 +1920,7 @@ contains !call MPI_Barrier(MPI_COMM_WORLD, ierr) #if defined(MFC_OpenACC) - if (cu_mpi) then + if (rdma_mpi) then !$acc host_data use_device( ib_buff_recv, ib_buff_send ) ! Send/receive buffer to/from bc_x%end/bc_x%beg @@ -1971,7 +1971,7 @@ contains !call MPI_Barrier(MPI_COMM_WORLD, ierr) #if defined(MFC_OpenACC) - if (cu_mpi) then + if (rdma_mpi) then !$acc host_data use_device( ib_buff_recv, ib_buff_send ) ! Send/receive buffer to/from bc_x%end/bc_x%beg @@ -2007,7 +2007,7 @@ contains end if #if defined(MFC_OpenACC) - if (cu_mpi .eqv. .false.) then + if (rdma_mpi .eqv. .false.) then !$acc update device(ib_buff_recv) end if #endif @@ -2143,9 +2143,9 @@ contains #:endfor ! Send/Recv - #:for cu_mpi in [False, True] - if (cu_mpi .eqv. ${'.true.' if cu_mpi else '.false.'}$) then - #:if cu_mpi + #:for rdma_mpi in [False, True] + if (rdma_mpi .eqv. ${'.true.' if rdma_mpi else '.false.'}$) then + #:if rdma_mpi !$acc host_data use_device(c_divs_buff_recv, c_divs_buff_send) #:else !$acc update host(c_divs_buff_send) @@ -2156,7 +2156,7 @@ contains c_divs_buff_recv(0), buffer_count, MPI_DOUBLE_PRECISION, src_proc, recv_tag, & MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) - #:if cu_mpi + #:if rdma_mpi !$acc end host_data !$acc wait #:else diff --git a/src/simulation/m_start_up.fpp b/src/simulation/m_start_up.fpp index a1a568556..9df38ce3b 100644 --- a/src/simulation/m_start_up.fpp +++ b/src/simulation/m_start_up.fpp @@ -132,7 +132,7 @@ contains t_step_start, t_step_stop, t_step_save, t_step_print, & model_eqns, adv_alphan, & mpp_lim, time_stepper, weno_eps, weno_flat, & - riemann_flat, cu_mpi, cu_tensor, & + riemann_flat, rdma_mpi, cu_tensor, & mapped_weno, mp_weno, weno_avg, & riemann_solver, wave_speeds, avg_state, & bc_x, bc_y, bc_z, & diff --git a/toolchain/bootstrap/modules.sh b/toolchain/bootstrap/modules.sh index f9aa29693..056df70b7 100644 --- a/toolchain/bootstrap/modules.sh +++ b/toolchain/bootstrap/modules.sh @@ -101,6 +101,11 @@ for element in ${ELEMENTS[@]}; do fi done +if [ ! -z ${CRAY_LD_LIBRARY_PATH+x} ]; then + ok "Found $M\$CRAY_LD_LIBRARY_PATH$CR. Prepending to $M\$LD_LIBRARY_PATH$CR." + export LD_LIBRARY_PATH="$CRAY_LD_LIBRARY_PATH:$LD_LIBRARY_PATH" +fi + ok 'All modules and environment variables have been loaded.' return diff --git a/toolchain/mfc/args.py b/toolchain/mfc/args.py index cd64a1819..e2c162dbc 100644 --- a/toolchain/mfc/args.py +++ b/toolchain/mfc/args.py @@ -1,4 +1,4 @@ -import re, os.path, argparse, dataclasses +import re, sys, os.path, argparse, dataclasses from .run.run import get_baked_templates from .build import TARGETS, DEFAULT_TARGETS, DEPENDENCY_TARGETS @@ -96,12 +96,9 @@ def add_common_arguments(p, mask = None): test_meg.add_argument("--add-new-variables", action="store_true", default=False, help="(Test Generation) If new variables are found in D/ when running tests, add them to the golden files.") test_meg.add_argument("--remove-old-tests", action="store_true", default=False, help="(Test Generation) Delete tests directories that are no longer.") - test.add_argument(metavar="FORWARDED", default=[], dest="--", nargs="*", help="Arguments to forward to the ./mfc.sh run invocations.") - # === RUN === add_common_arguments(run) run.add_argument("input", metavar="INPUT", type=str, help="Input file to run.") - run.add_argument("arguments", metavar="ARGUMENTS", nargs="*", type=str, default=[], help="Additional positional arguments to pass to the case file.") run.add_argument("-e", "--engine", choices=["interactive", "batch"], type=str, default="interactive", help="Job execution/submission engine choice.") run.add_argument("-p", "--partition", metavar="PARTITION", type=str, default="", help="(Batch) Partition for job submission.") run.add_argument("-q", "--quality_of_service", metavar="QOS", type=str, default="", help="(Batch) Quality of Service for job submission.") @@ -113,24 +110,22 @@ def add_common_arguments(p, mask = None): run.add_argument("-#", "--name", metavar="NAME", type=str, default="MFC", help="(Batch) Job name.") run.add_argument("-s", "--scratch", action="store_true", default=False, help="Build from scratch.") run.add_argument("-b", "--binary", choices=["mpirun", "jsrun", "srun", "mpiexec"], type=str, default=None, help="(Interactive) Override MPI execution binary") - run.add_argument("--ncu", nargs=argparse.REMAINDER, type=str, help="Profile with NVIDIA Nsight Compute.") - run.add_argument("--nsys", nargs=argparse.REMAINDER, type=str, help="Profile with NVIDIA Nsight Systems.") - run.add_argument("--omni", nargs=argparse.REMAINDER, type=str, help="Profile with ROCM omniperf.") - run.add_argument("--roc", nargs=argparse.REMAINDER, type=str, help="Profile with ROCM rocprof.") run.add_argument( "--dry-run", action="store_true", default=False, help="(Batch) Run without submitting batch file.") run.add_argument("--case-optimization", action="store_true", default=False, help="(GPU Optimization) Compile MFC targets with some case parameters hard-coded.") run.add_argument( "--no-build", action="store_true", default=False, help="(Testing) Do not rebuild MFC.") run.add_argument("--wait", action="store_true", default=False, help="(Batch) Wait for the job to finish.") - run.add_argument("-f", "--flags", metavar="FLAGS", dest="--", nargs=argparse.REMAINDER, type=str, default=[], help="Arguments to forward to the MPI invocation.") run.add_argument("-c", "--computer", metavar="COMPUTER", type=str, default="default", help=f"(Batch) Path to a custom submission file template or one of {format_list_to_string(list(get_baked_templates().keys()))}.") run.add_argument("-o", "--output-summary", metavar="OUTPUT", type=str, default=None, help="Output file (YAML) for summary.") run.add_argument("--clean", action="store_true", default=False, help="Clean the case before running.") + run.add_argument("--ncu", nargs=argparse.REMAINDER, type=str, help="Profile with NVIDIA Nsight Compute.") + run.add_argument("--nsys", nargs=argparse.REMAINDER, type=str, help="Profile with NVIDIA Nsight Systems.") + run.add_argument("--omni", nargs=argparse.REMAINDER, type=str, help="Profile with ROCM omniperf.") + run.add_argument("--roc", nargs=argparse.REMAINDER, type=str, help="Profile with ROCM rocprof.") # === BENCH === add_common_arguments(bench) bench.add_argument("-o", "--output", metavar="OUTPUT", default=None, type=str, required="True", help="Path to the YAML output file to write the results to.") bench.add_argument("-m", "--mem", metavar="MEM", default=1, type=int, help="Memory per task for benchmarking cases") - bench.add_argument(metavar="FORWARDED", default=[], dest='--', nargs="*", help="Arguments to forward to the ./mfc.sh run invocations.") # === BENCH_DIFF === add_common_arguments(bench_diff, "t") @@ -143,8 +138,13 @@ def add_common_arguments(p, mask = None): # === COUNT === add_common_arguments(count_diff, "g") - args: dict = vars(parser.parse_args()) - args["--"] = args.get("--", []) + try: + extra_index = sys.argv.index('--') + except ValueError: + extra_index = len(sys.argv) + + args: dict = vars(parser.parse_args(sys.argv[1:extra_index])) + args["--"] = sys.argv[extra_index + 1:] # Add default arguments of other subparsers for name, parser in [("run", run), ("test", test), ("build", build), diff --git a/toolchain/mfc/bench.py b/toolchain/mfc/bench.py index 148abf13c..52792f269 100644 --- a/toolchain/mfc/bench.py +++ b/toolchain/mfc/bench.py @@ -57,10 +57,11 @@ def bench(targets = None): with open(log_filepath, "w") as log_file: system( - ["./mfc.sh", "run", case.path, ARG('mem'), "--case-optimization"] + + ["./mfc.sh", "run", case.path, "--case-optimization"] + ["--targets"] + [t.name for t in targets] + ["--output-summary", summary_filepath] + - case.args, + case.args + + ["--", ARG('mem')], stdout=log_file, stderr=subprocess.STDOUT) diff --git a/toolchain/mfc/build.py b/toolchain/mfc/build.py index ec5851b1f..700e1c90d 100644 --- a/toolchain/mfc/build.py +++ b/toolchain/mfc/build.py @@ -286,7 +286,7 @@ def build(targets = None, case: input.MFCInputFile = None, history: typing.Set[s targets = ARG("targets") targets = get_targets(list(REQUIRED_TARGETS) + targets) - case = case or input.load(ARG("input"), ARG("arguments"), {}) + case = case or input.load(ARG("input"), ARG("--"), {}) case.validate_params() if len(history) == 0: diff --git a/toolchain/mfc/run/case_dicts.py b/toolchain/mfc/run/case_dicts.py index 8fd9e0555..80c77f3e0 100644 --- a/toolchain/mfc/run/case_dicts.py +++ b/toolchain/mfc/run/case_dicts.py @@ -183,7 +183,7 @@ class ParamType(Enum): 'R0_type': ParamType.INT, 'integral_wrt': ParamType.LOG, 'num_integrals': ParamType.INT, - 'cu_mpi': ParamType.LOG, + 'rdma_mpi': ParamType.LOG, 'palpha_eps': ParamType.REAL, 'ptgalpha_eps': ParamType.REAL, 'pi_fac': ParamType.REAL, diff --git a/toolchain/mfc/run/input.py b/toolchain/mfc/run/input.py index 3db2a1d18..27110e1a2 100644 --- a/toolchain/mfc/run/input.py +++ b/toolchain/mfc/run/input.py @@ -1,4 +1,4 @@ -import os, json, typing, dataclasses +import os, json, glob, typing, dataclasses from ..printer import cons from .. import common, build @@ -49,15 +49,44 @@ def generate(self, target) -> None: self.generate_fpp(target) - def clean(self, targets) -> None: - for relfile in [ + def clean(self, _targets) -> None: + targets = [build.get_target(target) for target in _targets] + + files = set() + dirs = set() + + files = set([ "equations.dat", "run_time.inf", "time_data.dat", - "io_time_data.dat", "fort.1" - ] + [f"{build.get_target(target).name}.inp" for target in targets]: - common.delete_file(os.path.join(self.dirpath, relfile)) + "io_time_data.dat", "fort.1", "pre_time_data.dat" + ] + [f"{target.name}.inp" for target in targets]) + + if build.PRE_PROCESS in targets: + files = files | set(glob.glob(os.path.join(self.dirpath, "D", "*.000000.dat"))) + dirs = dirs | set(glob.glob(os.path.join(self.dirpath, "p_all", "p*", "0"))) + + if build.SIMULATION in targets: + restarts = set(glob.glob(os.path.join(self.dirpath, "restart_data", "*.dat"))) + restarts = restarts - set(glob.glob(os.path.join(self.dirpath, "restart_data", "lustre_0.dat"))) + restarts = restarts - set(glob.glob(os.path.join(self.dirpath, "restart_data", "lustre_*_cb.dat"))) + + Ds = set(glob.glob(os.path.join(self.dirpath, "D", "*.dat"))) + Ds = Ds - set(glob.glob(os.path.join(self.dirpath, "D", "*.000000.dat"))) + + files = files | restarts + files = files | Ds + + if build.POST_PROCESS in targets: + dirs.add("silo_hdf5") + + for relfile in files: + if not os.path.isfile(relfile): + relfile = os.path.join(self.dirpath, relfile) + common.delete_file(relfile) - for reldir in ["D", "p_all", "silo_hdf5", "viz"]: - common.delete_directory(os.path.join(self.dirpath, reldir)) + for reldir in dirs: + if not os.path.isdir(reldir): + reldir = os.path.join(self.dirpath, reldir) + common.delete_directory(reldir) # Load the input file diff --git a/toolchain/mfc/run/run.py b/toolchain/mfc/run/run.py index ac825f7e0..72d1adb99 100644 --- a/toolchain/mfc/run/run.py +++ b/toolchain/mfc/run/run.py @@ -130,7 +130,7 @@ def __execute_job_script(qsystem: queues.QueueSystem): def run(targets = None, case = None): targets = get_targets(list(REQUIRED_TARGETS) + (targets or ARG("targets"))) - case = case or input.load(ARG("input"), ARG("arguments")) + case = case or input.load(ARG("input"), ARG("--")) build(targets) diff --git a/toolchain/mfc/test/case.py b/toolchain/mfc/test/case.py index 06a8fd4ba..e772cf13b 100644 --- a/toolchain/mfc/test/case.py +++ b/toolchain/mfc/test/case.py @@ -90,7 +90,7 @@ 'Mono(1)%dir' : 1.0, 'Mono(1)%npulse' : 1, 'Mono(1)%pulse' : 1, - 'cu_mpi' :'F', + 'rdma_mpi' : 'F', } def trace_to_uuid(trace: str) -> str: diff --git a/toolchain/modules b/toolchain/modules index e1e51bfc9..e852818aa 100644 --- a/toolchain/modules +++ b/toolchain/modules @@ -50,8 +50,8 @@ p-gpu MFC_CUDA_CC=70,80 CC=nvc CXX=nvc++ FC=nvfortran f OLCF Frontier f-gpu rocm/5.5.1 craype-accel-amd-gfx90a f-all cpe/23.09 -f-all cray-fftw cray-hdf5 cray-mpich cce/16.0.1 -f-all rocm/5.5.1 cray-python omniperf +f-all cray-fftw cray-hdf5 cray-mpich/8.1.26 cce/16.0.1 +f-all rocm/5.5.1 cray-python omniperf f-cpu diff --git a/toolchain/templates/bridges2.mako b/toolchain/templates/bridges2.mako index 773bc779f..a6471bd8b 100644 --- a/toolchain/templates/bridges2.mako +++ b/toolchain/templates/bridges2.mako @@ -41,9 +41,8 @@ echo % if not mpi: (set -x; ${profiler} "${target.get_install_binpath(case)}") % else: - (set -x; ${profiler} \ - mpirun -np ${nodes*tasks_per_node} \ - ${' '.join([f"'{x}'" for x in ARG('--') ])} \ + (set -x; ${profiler} \ + mpirun -np ${nodes*tasks_per_node} \ "${target.get_install_binpath(case)}") % endif diff --git a/toolchain/templates/default.mako b/toolchain/templates/default.mako index c456d9ef2..4983f5373 100644 --- a/toolchain/templates/default.mako +++ b/toolchain/templates/default.mako @@ -37,26 +37,22 @@ warn "Consider using a different template via the $MAGENTA--computer$COLOR_RESET % else: if [ "$binary" == "jsrun" ]; then (set -x; ${profiler} \ - jsrun --nrs ${tasks_per_node*nodes} \ - --cpu_per_rs 1 \ - --gpu_per_rs ${1 if gpu else 0} \ - --tasks_per_rs 1 \ - ${' '.join([f"'{x}'" for x in ARG('--') ])} \ + jsrun --nrs ${tasks_per_node*nodes} \ + --cpu_per_rs 1 \ + --gpu_per_rs ${1 if gpu else 0} \ + --tasks_per_rs 1 \ "${target.get_install_binpath(case)}") elif [ "$binary" == "srun" ]; then - (set -x; ${profiler} \ - srun --ntasks-per-node ${tasks_per_node} \ - ${' '.join([f"'{x}'" for x in ARG('--') ])} \ + (set -x; ${profiler} \ + srun --ntasks ${nodes*tasks_per_node} \ "${target.get_install_binpath(case)}") elif [ "$binary" == "mpirun" ]; then (set -x; ${profiler} \ - $binary -np ${nodes*tasks_per_node} \ - ${' '.join([f"'{x}'" for x in ARG('--') ])} \ + $binary -np ${nodes*tasks_per_node} \ "${target.get_install_binpath(case)}") elif [ "$binary" == "mpiexec" ]; then - (set -x; ${profiler} \ - $binary --ntasks ${nodes*tasks_per_node} \ - ${' '.join([f"'{x}'" for x in ARG('--') ])} \ + (set -x; ${profiler} \ + $binary --ntasks ${nodes*tasks_per_node} \ "${target.get_install_binpath(case)}") fi % endif diff --git a/toolchain/templates/delta.mako b/toolchain/templates/delta.mako index 079badca3..9f7cdd19e 100644 --- a/toolchain/templates/delta.mako +++ b/toolchain/templates/delta.mako @@ -45,7 +45,7 @@ export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/sw/spack/deltas11-2023-03/apps/linux-rh % if not mpi: (set -x; ${profiler} "${target.get_install_binpath(case)}") % else: - (set -x; ${profiler} \ + (set -x; ${profiler} \ mpirun -np ${nodes*tasks_per_node} \ ${' '.join([f"'{x}'" for x in ARG('--') ])} \ "${target.get_install_binpath(case)}") diff --git a/toolchain/templates/frontier.mako b/toolchain/templates/frontier.mako index b375eb260..98b54429e 100644 --- a/toolchain/templates/frontier.mako +++ b/toolchain/templates/frontier.mako @@ -8,6 +8,11 @@ #SBATCH --job-name="${name}" #SBATCH --output="${name}.out" #SBATCH --time=${walltime} +#SBATCH --cpus-per-task=7 +% if gpu: +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=closest +% endif % if account: #SBATCH --account=${account} % endif @@ -33,17 +38,23 @@ cd "${MFC_ROOTDIR}" cd - > /dev/null echo +export MPICH_GPU_SUPPORT_ENABLED=1 + % for target in targets: ${helpers.run_prologue(target)} % if not mpi: (set -x; ${profiler} "${target.get_install_binpath(case)}") % else: - (set -x; srun -N ${nodes} \ - -n ${tasks_per_node} \ - ${profiler} \ - ${' '.join([f"'{x}'" for x in ARG('--') ])} \ - "${target.get_install_binpath(case)}") + (set -x; srun \ + % if engine == 'interactive': + --nodes ${nodes} --ntasks-per-node ${tasks_per_node} \ + --cpus-per-task 7 \ + % if gpu: + --gpus-per-task 1 --gpu-bind closest \ + % endif + % endif + ${profiler} "${target.get_install_binpath(case)}") % endif ${helpers.run_epilogue(target)} diff --git a/toolchain/templates/phoenix.mako b/toolchain/templates/phoenix.mako index 874845ab1..b0bc3c055 100644 --- a/toolchain/templates/phoenix.mako +++ b/toolchain/templates/phoenix.mako @@ -42,9 +42,8 @@ echo (set -x; ${profiler} "${target.get_install_binpath(case)}") % else: (set -x; ${profiler} \ - mpirun -np ${nodes*tasks_per_node} \ - --bind-to none \ - ${' '.join([f"'{x}'" for x in ARG('--') ])} \ + mpirun -np ${nodes*tasks_per_node} \ + --bind-to none \ "${target.get_install_binpath(case)}") % endif diff --git a/toolchain/templates/summit.mako b/toolchain/templates/summit.mako index cdc3b3ddc..88e1fcde5 100644 --- a/toolchain/templates/summit.mako +++ b/toolchain/templates/summit.mako @@ -27,13 +27,12 @@ echo (set -x; ${rofiler} "${target.get_install_binpath(case)}") % else: (set -x; ${profiler} \ - jsrun \ - ${'--smpiargs="-gpu"' if gpu else ''} \ - --nrs ${tasks_per_node*nodes} \ - --cpu_per_rs 1 \ - --gpu_per_rs ${1 if gpu else 0} \ - --tasks_per_rs 1 \ - ${' '.join([f"'{x}'" for x in ARG('--') ])} \ + jsrun \ + ${'--smpiargs="-gpu"' if gpu else ''} \ + --nrs ${tasks_per_node*nodes} \ + --cpu_per_rs 1 \ + --gpu_per_rs ${1 if gpu else 0} \ + --tasks_per_rs 1 \ "${target.get_install_binpath(case)}") % endif