From b1b285c34a163e2e936a10deb41afb895abd78fb Mon Sep 17 00:00:00 2001 From: Simon Byrne Date: Thu, 9 Nov 2023 14:15:07 -0800 Subject: [PATCH] switch to MPItrampoline --- .buildkite/JuliaProject.toml | 11 +---------- .buildkite/pipeline.yml | 16 +++++++--------- 2 files changed, 8 insertions(+), 19 deletions(-) diff --git a/.buildkite/JuliaProject.toml b/.buildkite/JuliaProject.toml index d0c28450ca..8ba915e1e3 100644 --- a/.buildkite/JuliaProject.toml +++ b/.buildkite/JuliaProject.toml @@ -1,20 +1,11 @@ [extras] CUDA_Runtime_jll = "76a88914-d11a-5bdc-97e0-2f5a05c973a2" -HDF5_jll = "0234f1f7-429e-5d53-9886-15a909be8d59" MPIPreferences = "3da0fdf6-3ccc-4f1b-acd9-58baa6c99267" [preferences.CUDA_Runtime_jll] version = "12.2" local = "true" -[preferences.HDF5_jll] -libhdf5_path = "libhdf5" -libhdf5_hl_path = "libhdf5_hl" - [preferences.MPIPreferences] _format = "1.0" -abi = "OpenMPI" -binary = "system" -libmpi = "libmpi" -mpiexec = "mpiexec" - +binary = "MPItrampoline_jll" diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index af7e84b4cb..6c07b88299 100755 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -1,16 +1,20 @@ agents: queue: central slurm_mem: 8G - modules: julia/1.9.4 cuda/12.2 ucx/1.14.1_cuda-12.2 openmpi/4.1.5_cuda-12.2 hdf5/1.12.2-ompi415 nsight-systems/2023.2.1 + modules: julia/1.9.4 cuda/12.2 ucx/1.14.1_cuda-12.2 openmpi/4.1.5_cuda-12.2 nsight-systems/2023.2.1 env: JULIA_LOAD_PATH: "${JULIA_LOAD_PATH}:${BUILDKITE_BUILD_CHECKOUT_PATH}/.buildkite" JULIA_DEPOT_PATH: "${BUILDKITE_BUILD_PATH}/${BUILDKITE_PIPELINE_SLUG}/depot/default" JULIA_MAX_NUM_PRECOMPILE_FILES: 100 - OPENBLAS_NUM_THREADS: 1 - JULIA_NVTX_CALLBACKS: gc JULIA_CPU_TARGET: 'broadwell;skylake' + JULIA_NVTX_CALLBACKS: gc + JULIA_CUDA_MEMORY_POOL: none + JULIA_MPI_HAS_CUDA: "true" + OPENBLAS_NUM_THREADS: 1 OMPI_MCA_opal_warn_on_missing_libcuda: 0 + MPITRAMPOLINE_LIB: /groups/esm/software/MPIwrapper/ompi4.1.5_cuda-12.2/lib64/libmpiwrapper.so + MPITRAMPOLINE_MPIEXEC: /groups/esm/software/MPIwrapper/ompi4.1.5_cuda-12.2/bin/mpiwrapperexec steps: - label: "initialize" @@ -282,7 +286,6 @@ steps: timeout_in_minutes: 15 env: CLIMACOMMS_CONTEXT: "MPI" - JULIA_CUDA_MEMORY_POOL: "none" agents: slurm_ntasks: 2 slurm_gpus: 2 @@ -295,7 +298,6 @@ steps: timeout_in_minutes: 15 env: CLIMACOMMS_CONTEXT: "MPI" - JULIA_CUDA_MEMORY_POOL: "none" agents: slurm_ntasks: 3 slurm_gpus: 3 @@ -308,7 +310,6 @@ steps: timeout_in_minutes: 15 env: CLIMACOMMS_CONTEXT: "MPI" - JULIA_CUDA_MEMORY_POOL: "none" agents: slurm_ntasks: 4 slurm_gpus: 4 @@ -321,7 +322,6 @@ steps: timeout_in_minutes: 15 env: CLIMACOMMS_CONTEXT: "MPI" - JULIA_CUDA_MEMORY_POOL: "none" agents: slurm_ntasks: 2 slurm_gpus: 2 @@ -334,7 +334,6 @@ steps: timeout_in_minutes: 15 env: CLIMACOMMS_CONTEXT: "MPI" - JULIA_CUDA_MEMORY_POOL: "none" agents: slurm_ntasks: 3 slurm_gpus: 3 @@ -347,7 +346,6 @@ steps: timeout_in_minutes: 15 env: CLIMACOMMS_CONTEXT: "MPI" - JULIA_CUDA_MEMORY_POOL: "none" agents: slurm_ntasks: 4 slurm_gpus: 4