From 98282b438bac12997708ea084f79a03aa819a409 Mon Sep 17 00:00:00 2001
From: Ryan Hankins
Date: Thu, 6 Jun 2024 15:14:28 -0500
Subject: [PATCH] Add ROCm support based on configure option.

ROCm provides an interface similar to CUDA for working with AMD GPUs.
Provide a compile-time option to build with ROCm instead of CUDA.

1. Add a --with-rocm=PATH flag to ./configure.
2. Make all CUDA calls "gpu" calls, which are independent of the
   underlying framework.
3. Switch between _rocm and _cuda files at compile time to make the
   appropriate calls.
4. When building for RCCL (AMD's NCCL), generate a rccl-net.so-named
   plugin for binary compatibility.

Signed-off-by: Ryan Hankins
---
 configure.ac                 | 26 +++++++++++++--
 include/nccl-headers/error.h |  2 +-
 include/nccl-headers/net.h   |  2 +-
 include/nccl_ofi_rocm.h      | 48 +++++++++++++++++++++++++++
 m4/check_pkg_cuda.m4         |  3 --
 m4/check_pkg_rocm.m4         | 52 +++++++++++++++++++++++++++++
 src/Makefile.am              | 54 +++++++++++++++++--------------
 src/nccl_ofi_api.c           |  2 +-
 src/nccl_ofi_net.c           |  2 ++
 src/nccl_ofi_ofiutils.c      |  5 +--
 src/nccl_ofi_rdma.c          |  4 ++-
 src/nccl_ofi_rocm.c          | 63 ++++++++++++++++++++++++++++++++++++
 src/nccl_ofi_sendrecv.c      |  6 ++--
 src/nccl_ofi_topo.c          |  3 ++
 src/platform-aws.c           |  2 +-
 15 files changed, 235 insertions(+), 39 deletions(-)
 create mode 100644 include/nccl_ofi_rocm.h
 create mode 100644 m4/check_pkg_rocm.m4
 create mode 100644 src/nccl_ofi_rocm.c

diff --git a/configure.ac b/configure.ac
index c2611cf2e..69b86039b 100644
--- a/configure.ac
+++ b/configure.ac
@@ -89,10 +89,30 @@ CHECK_PKG_NEURON([AS_IF([test -n "${want_cuda}"],
                         [AC_MSG_ERROR([Cannot enable both CUDA and neuron.])],
                         [want_cuda=no])
                  have_device_interface=neuron])
-CHECK_PKG_CUDA([have_device_interface=cuda])
-
+# Select CUDA if Neuron wasn't specified and --with-rocm was not used.
+CHECK_PKG_CUDA(AS_IF([test "${have_device_interface}" = "no"],
+                     AS_IF([test -z "$with_rocm"], [have_device_interface=cuda])))
+# If neither CUDA nor Neuron is being used, select ROCm
+CHECK_PKG_ROCM(AS_IF([test "${have_device_interface}" = "no"], [have_device_interface=rocm]))
 AS_IF([test "${have_device_interface}" = "no"],
-      [AC_MSG_ERROR([NCCL OFI Plugin requires either CUDA or Neuron runtime.])])
+      [AC_MSG_ERROR([NCCL OFI Plugin requires either CUDA, ROCm or Neuron runtime.])])
+
+do_cuda=0
+do_rocm=0
+AS_IF([test -n "$with_rocm"],
+      [AS_IF([test "$have_device_interface" = "rocm"],
+             [enable_tests="no"
+              do_rocm=1
+             ])],
+      [AS_IF([test "$have_device_interface" = "cuda"], [do_cuda=1])])
+
+AC_DEFINE_UNQUOTED([HAVE_CUDA], [${do_cuda}], [Defined to 1 if CUDA is available])
+AM_CONDITIONAL([HAVE_CUDA], [test ${do_cuda} = 1])
+
+AC_DEFINE_UNQUOTED([HAVE_ROCM], [${do_rocm}], [Defined to 1 if ROCm is available])
+AM_CONDITIONAL([HAVE_ROCM], [test ${do_rocm} = 1])
+AS_IF([test ${do_rocm} = 1],
+      AC_DEFINE_UNQUOTED([__HIP_PLATFORM_AMD__], [1], [Select AMD/ROCm HIP APIs]))
 
 CHECK_PKG_HWLOC([],
                 [AC_MSG_ERROR([Could not find the hwloc library. Use --with-hwloc to provide the path to non-standard hwloc installation.])])
diff --git a/include/nccl-headers/error.h b/include/nccl-headers/error.h
index 1afc7dce4..1f4f6128a 100644
--- a/include/nccl-headers/error.h
+++ b/include/nccl-headers/error.h
@@ -5,7 +5,7 @@
 #ifndef NCCL_HEADERS_ERROR_H
 #define NCCL_HEADERS_ERROR_H
 
-#if HAVE_CUDA
+#if HAVE_CUDA || HAVE_ROCM
 #include "nccl-headers/nvidia/err.h"
 #elif HAVE_NEURON
 #include "nccl-headers/neuron/error.h"
diff --git a/include/nccl-headers/net.h b/include/nccl-headers/net.h
index d632abe7c..db12933d0 100644
--- a/include/nccl-headers/net.h
+++ b/include/nccl-headers/net.h
@@ -5,7 +5,7 @@
 #ifndef NCCL_HEADERS_NET_H
 #define NCCL_HEADERS_NET_H
 
-#if HAVE_CUDA
+#if HAVE_CUDA || HAVE_ROCM
 #include "nccl-headers/nvidia/net.h"
 #elif HAVE_NEURON
 #include "nccl-headers/neuron/net.h"
diff --git a/include/nccl_ofi_rocm.h b/include/nccl_ofi_rocm.h
new file mode 100644
index 000000000..fdccfb323
--- /dev/null
+++ b/include/nccl_ofi_rocm.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2024 Hewlett Packard Enterprise Development LP
+ * Copyright (c) 2018-2023 Amazon.com, Inc. or its affiliates. All rights reserved.
+ * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ */
+
+#ifndef NCCL_OFI_ROCM_H_
+#define NCCL_OFI_ROCM_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <hip/hip_runtime_api.h>
+
+/*
+ * Error checking is currently just success or failure.
+ */
+enum {
+	GPU_SUCCESS = 0,
+	GPU_ERROR = 999 /* Match hipErrorUnknown */
+};
+
+int nccl_net_ofi_gpu_init(void);
+
+/*
+ * @brief	Gets the GPU device associated with the buffer
+ *
+ * @param	data
+ *		Pointer to GPU buffer.
+ * @param	dev_id
+ *		Set to a valid GPU device ID on success, -1 on error.
+ *
+ * @return	0 on success
+ *		non-zero on error
+ */
+int nccl_net_ofi_get_cuda_device(void *data, int *dev_id);
+int nccl_net_ofi_gpuDriverGetVersion(int *driverVersion);
+int nccl_net_ofi_gpuCtxGetDevice(int *device);
+int nccl_net_ofi_gpuDeviceGetCount(int *count);
+
+extern void *nccl_net_ofi_gpuFlushGPUDirectRDMAWrites;
+
+#ifdef __cplusplus
+}  // End extern "C"
+#endif
+
+#endif  // End NCCL_OFI_ROCM_H_
diff --git a/m4/check_pkg_cuda.m4 b/m4/check_pkg_cuda.m4
index 63bac32c7..97452dc67 100644
--- a/m4/check_pkg_cuda.m4
+++ b/m4/check_pkg_cuda.m4
@@ -49,9 +49,6 @@ AC_DEFUN([CHECK_PKG_CUDA], [
          CPPFLAGS="${check_pkg_CPPFLAGS_save}"
          $2])
 
-  AC_DEFINE_UNQUOTED([HAVE_CUDA], [${check_pkg_define}], [Defined to 1 if CUDA is available])
-  AM_CONDITIONAL([HAVE_CUDA], [test "${check_pkg_found}" = "yes"])
-
   AC_SUBST([CUDA_LDFLAGS])
   AC_SUBST([CUDA_LIBS])
diff --git a/m4/check_pkg_rocm.m4 b/m4/check_pkg_rocm.m4
new file mode 100644
index 000000000..47191af35
--- /dev/null
+++ b/m4/check_pkg_rocm.m4
@@ -0,0 +1,52 @@
+# -*- autoconf -*-
+#
+# Copyright (c) 2024 Hewlett Packard Enterprise Development LP
+# Copyright (c) 2023 Amazon.com, Inc. or its affiliates. All rights reserved.
+#
+# See LICENSE.txt for license information
+#
+
+AC_DEFUN([CHECK_PKG_ROCM], [
+  check_pkg_found="yes"
+  check_pkg_CPPFLAGS_save="${CPPFLAGS}"
+  check_pkg_LDFLAGS_save="${LDFLAGS}"
+  check_pkg_LIBS_save="${LIBS}"
+
+  AC_ARG_WITH([rocm],
+     [AS_HELP_STRING([--with-rocm=PATH], [Path to non-standard ROCm installation])])
+
+  AS_IF([test -z "${with_rocm}" -o "${with_rocm}" = "yes"],
+        [],
+        [test "${with_rocm}" = "no"],
+        [check_pkg_found=no],
+        [AS_IF([test -d ${with_rocm}/lib64], [check_pkg_libdir="lib64"], [check_pkg_libdir="lib"])
+         CPPFLAGS="-I${with_rocm}/include ${CPPFLAGS}"
+         LDFLAGS="-L${with_rocm}/${check_pkg_libdir} ${LDFLAGS}"])
+
+  AS_IF([test "${check_pkg_found}" = "yes"],
+        [AC_CHECK_LIB([amdhip64], [hipMemAllocHost], [], [check_pkg_found=no])])
+
+  AS_IF([test "${check_pkg_found}" = "yes"],
+        [AC_CHECK_HEADERS([hip/hip_runtime_api.h], [], [check_pkg_found=no], [#define __HIP_PLATFORM_AMD__])])
+
+
+  AS_IF([test "${check_pkg_found}" = "yes"],
+        [check_pkg_define="yes"],
+        [check_pkg_define="no"
+         CPPFLAGS="${check_pkg_CPPFLAGS_save}"
+         LDFLAGS="${check_pkg_LDFLAGS_save}"
+         LIBS="${check_pkg_LIBS_save}"
+        ])
+
+  AS_IF([test -n "${with_rocm}"],
+        [AS_IF([test "${check_pkg_define}" = "yes"],
+               [$1], [$2])
+        ], [$2]
+  )
+
+  AS_UNSET([check_pkg_found])
+  AS_UNSET([check_pkg_define])
+  AS_UNSET([check_pkg_CPPFLAGS_save])
+  AS_UNSET([check_pkg_LDFLAGS_save])
+  AS_UNSET([check_pkg_LIBS_save])
+])
diff --git a/src/Makefile.am b/src/Makefile.am
index e9a0de563..fd7254d0e 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -24,52 +24,58 @@ sources = \
 	nccl_ofi_pthread.c \
 	tracepoint.c
 
+tuner_sources = \
+	tuner/nccl_ofi_model.c \
+	tuner/nccl_ofi_tuner.c
+
 if WANT_PLATFORM_AWS
   sources += platform-aws.c
 endif
 
 if ENABLE_NEURON
   sources += nccl_ofi_interface_neuron.c
-else
-  sources += nccl_ofi_cuda.c \
-	nccl_ofi_interface_nvidia.c
-endif
-
-# Build an internal-only library that can be used by unit tests as
-# well as the actual nccl_net.so / nccom_net.so libraries. This saves
-# us writing dlopen() handlers for simple unit tests.
-noinst_LTLIBRARIES = libinternal_net_plugin.la
-libinternal_net_plugin_la_SOURCES = $(sources)
-libinternal_net_plugin_la_LDFLAGS = -avoid-version
 
-if ENABLE_NEURON
   lib_LTLIBRARIES = libnccom-net.la
   libnccom_net_la_SOURCES =
   libnccom_net_la_LIBADD = libinternal_net_plugin.la
   libnccom_net_la_LDFLAGS = -module -avoid-version
+endif
+
+if HAVE_CUDA
+  sources += nccl_ofi_cuda.c nccl_ofi_interface_nvidia.c
+if WANT_PLATFORM_AWS
+  # NCCL tuner plugin
+  lib_LTLIBRARIES = libnccl-net.la libnccl-ofi-tuner.la
+  libnccl_ofi_tuner_la_SOURCES = $(tuner_sources)
+  libnccl_ofi_tuner_la_LDFLAGS = -module -avoid-version
 else
   lib_LTLIBRARIES = libnccl-net.la
+endif
+
   libnccl_net_la_SOURCES =
   libnccl_net_la_LIBADD = libinternal_net_plugin.la
   libnccl_net_la_LDFLAGS = -module -avoid-version
 endif
 
+if HAVE_ROCM
+  sources += nccl_ofi_rocm.c nccl_ofi_interface_nvidia.c
+
+  lib_LTLIBRARIES = librccl-net.la
+  librccl_net_la_SOURCES =
+  librccl_net_la_LIBADD = libinternal_net_plugin.la
+  librccl_net_la_LDFLAGS = -module -avoid-version
+endif
+
+# Build an internal-only library that can be used by unit tests as
+# well as the actual nccl_net.so / nccom_net.so libraries. This saves
+# us writing dlopen() handlers for simple unit tests.
+noinst_LTLIBRARIES = libinternal_net_plugin.la
+libinternal_net_plugin_la_SOURCES = $(sources)
+libinternal_net_plugin_la_LDFLAGS = -avoid-version
 
 #
 # Tuner
 #
 noinst_LTLIBRARIES += libinternal_tuner_plugin.la
-tuner_sources = \
-	tuner/nccl_ofi_model.c \
-	tuner/nccl_ofi_tuner.c
 libinternal_tuner_plugin_la_SOURCES = $(tuner_sources)
 libinternal_tuner_plugin_la_LDFLAGS = -avoid-version
-
-if HAVE_CUDA
-if WANT_PLATFORM_AWS
-  # NCCL tuner plugin
-  lib_LTLIBRARIES += libnccl-ofi-tuner.la
-  libnccl_ofi_tuner_la_SOURCES = $(tuner_sources)
-  libnccl_ofi_tuner_la_LDFLAGS = -module -avoid-version
-endif
-endif
diff --git a/src/nccl_ofi_api.c b/src/nccl_ofi_api.c
index c2b12b40c..496677ded 100644
--- a/src/nccl_ofi_api.c
+++ b/src/nccl_ofi_api.c
@@ -297,7 +297,7 @@ ncclResult_t nccl_net_ofi_regMr(void *comm, void *data, size_t size, int type,
 	/* Validate type of buffer */
 	bool valid_buffer_type = false;
 	if (type == NCCL_PTR_HOST) valid_buffer_type = true;
-#if HAVE_CUDA
+#if HAVE_CUDA || HAVE_ROCM
 	if (type == NCCL_PTR_CUDA) valid_buffer_type = true;
 #endif
 #if HAVE_NEURON
diff --git a/src/nccl_ofi_net.c b/src/nccl_ofi_net.c
index c294bbaed..58dce4514 100644
--- a/src/nccl_ofi_net.c
+++ b/src/nccl_ofi_net.c
@@ -20,6 +20,8 @@
 #include "nccl_ofi_tracepoint.h"
 #if HAVE_CUDA
 #include "nccl_ofi_cuda.h"
+#elif HAVE_ROCM
+#include "nccl_ofi_rocm.h"
 #endif
 #include "nccl_ofi_sendrecv.h"
 #include "nccl_ofi_rdma.h"
diff --git a/src/nccl_ofi_ofiutils.c b/src/nccl_ofi_ofiutils.c
index 87e7abac0..5e3f34e1e 100644
--- a/src/nccl_ofi_ofiutils.c
+++ b/src/nccl_ofi_ofiutils.c
@@ -20,6 +20,8 @@
 #include "nccl_ofi_tracepoint.h"
 #if HAVE_CUDA
 #include "nccl_ofi_cuda.h"
+#elif HAVE_ROCM
+#include "nccl_ofi_rocm.h"
 #endif
 #include "nccl_ofi_math.h"
 #include "nccl_ofi_ofiutils.h"
@@ -299,12 +301,11 @@ int nccl_ofi_ofiutils_init_connection(int api_version, struct fi_info *info, str
 		goto error;
 	}
 
-
 	/* Set Libfabric endpoint option FI_OPT_CUDA_API_PERMITTED to false if
 	 * using the Libfabric 1.18 API with HMEM support. */
 	if (api_version == FI_VERSION(1,18) && support_gdr != GDR_UNSUPPORTED) {
-#if (HAVE_CUDA && HAVE_DECL_FI_OPT_CUDA_API_PERMITTED)
+#if ((HAVE_CUDA || HAVE_ROCM) && HAVE_DECL_FI_OPT_CUDA_API_PERMITTED)
 		bool optval = false;
 		ret = fi_setopt(&(*ep)->fid, FI_OPT_ENDPOINT,
 				FI_OPT_CUDA_API_PERMITTED, &optval,
diff --git a/src/nccl_ofi_rdma.c b/src/nccl_ofi_rdma.c
index 1ca150ea3..cef5411ed 100644
--- a/src/nccl_ofi_rdma.c
+++ b/src/nccl_ofi_rdma.c
@@ -14,6 +14,8 @@
 #include "nccl_ofi.h"
 #if HAVE_CUDA
 #include "nccl_ofi_cuda.h"
+#elif HAVE_ROCM
+#include "nccl_ofi_rocm.h"
 #endif
 #include "nccl_ofi_param.h"
 #include "nccl_ofi_rdma.h"
@@ -444,7 +446,7 @@ static int set_mr_req_attr(nccl_ofi_idpool_t *key_pool, int dev_id,
 		mr_attr->access |= FI_READ;
 		mr_attr->iface = FI_HMEM_SYSTEM;
 		break;
-#if HAVE_CUDA
+#if HAVE_CUDA || HAVE_ROCM
 	case NCCL_PTR_CUDA:
 		mr_attr->access |= FI_REMOTE_READ;
 		mr_attr->iface = FI_HMEM_CUDA;
diff --git a/src/nccl_ofi_rocm.c b/src/nccl_ofi_rocm.c
new file mode 100644
index 000000000..08be545f9
--- /dev/null
+++ b/src/nccl_ofi_rocm.c
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2024 Hewlett Packard Enterprise Development LP
+ * Copyright (c) 2018-2023 Amazon.com, Inc. or its affiliates. All rights reserved.
+ * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ */
+
+#include "config.h"
+
+#include <hip/hip_runtime_api.h>
+
+#include "nccl_ofi.h"
+#include "nccl_ofi_rocm.h"
+
+int nccl_net_ofi_gpuDriverGetVersion(int *driverVersion) {
+	return hipDriverGetVersion(driverVersion) == hipSuccess ?
+		GPU_SUCCESS : GPU_ERROR;
+}
+
+int nccl_net_ofi_gpuCtxGetDevice(int *device) {
+	return hipGetDevice(device) == hipSuccess ? GPU_SUCCESS : GPU_ERROR;
+}
+
+int nccl_net_ofi_gpuDeviceGetCount(int *count) {
+	return hipGetDeviceCount(count) == hipSuccess ? GPU_SUCCESS : GPU_ERROR;
+}
+
+void *nccl_net_ofi_gpuFlushGPUDirectRDMAWrites = NULL;
+
+int
+nccl_net_ofi_gpu_init(void)
+{
+	return 0;
+}
+
+int nccl_net_ofi_get_cuda_device(void *data, int *dev_id)
+{
+	int ret = 0;
+	int cuda_device = -1;
+	unsigned int mem_type;
+	unsigned int device_ordinal;
+	hipError_t cuda_ret_mem = hipPointerGetAttribute(&device_ordinal,
+							 HIP_POINTER_ATTRIBUTE_DEVICE_ORDINAL,
+							 (hipDeviceptr_t) data);
+	hipError_t cuda_ret_dev = hipPointerGetAttribute(&mem_type,
+							 HIP_POINTER_ATTRIBUTE_MEMORY_TYPE,
+							 (hipDeviceptr_t) data);
+
+	if (cuda_ret_mem != hipSuccess || cuda_ret_dev != hipSuccess) {
+		ret = -ENOTSUP;
+		NCCL_OFI_WARN("Invalid buffer pointer provided");
+		goto exit;
+	}
+
+	if (mem_type == hipMemoryTypeDevice) {
+		cuda_device = device_ordinal;
+	} else {
+		ret = -EINVAL;
+		NCCL_OFI_WARN("Invalid type of buffer provided. Only device memory is expected for NCCL_PTR_CUDA type");
+	}
+
+ exit:
+	*dev_id = cuda_device;
+	return ret;
+}
diff --git a/src/nccl_ofi_sendrecv.c b/src/nccl_ofi_sendrecv.c
index 9646f52d3..1071dd2a8 100644
--- a/src/nccl_ofi_sendrecv.c
+++ b/src/nccl_ofi_sendrecv.c
@@ -15,6 +15,8 @@
 #include "nccl_ofi.h"
 #if HAVE_CUDA
 #include "nccl_ofi_cuda.h"
+#elif HAVE_ROCM
+#include "nccl_ofi_rocm.h"
 #endif
 #include "nccl_ofi_param.h"
 #include "nccl_ofi_sendrecv.h"
@@ -530,7 +532,7 @@ static int register_mr_buffers(struct fid_domain *domain, struct fid_ep *ep,
 		mr_attr.access |= FI_READ;
 		mr_attr.iface = FI_HMEM_SYSTEM;
 		break;
-#if HAVE_CUDA
+#if HAVE_CUDA || HAVE_ROCM
 	case NCCL_PTR_CUDA:
 		mr_attr.access |= FI_REMOTE_READ;
 		mr_attr.iface = FI_HMEM_CUDA;
@@ -669,7 +671,7 @@ static int reg_mr_base(struct fid_domain *domain, struct fid_ep *ep,
 	/* Validate type of buffer */
 	bool valid_buffer_type = false;
 	if (type == NCCL_PTR_HOST) valid_buffer_type = true;
-#if HAVE_CUDA
+#if HAVE_CUDA || HAVE_ROCM
 	if (type == NCCL_PTR_CUDA) valid_buffer_type = true;
 #endif
 #if HAVE_NEURON
diff --git a/src/nccl_ofi_topo.c b/src/nccl_ofi_topo.c
index 5311d9a11..c70a6ada8 100644
--- a/src/nccl_ofi_topo.c
+++ b/src/nccl_ofi_topo.c
@@ -20,6 +20,9 @@
 #if HAVE_CUDA
 static const uint8_t target_class_id = 0x03;            /* Display controller class */
 static const unsigned short target_vendor_id = 0x10de;  /* NVIDIA */
+#elif HAVE_ROCM
+static const uint8_t target_class_id = 0x03;            /* Display controller class */
+static const unsigned short target_vendor_id = 0x1002;  /* AMD */
 #else
 static const uint8_t target_class_id = 0x08;            /* System peripheral */
 static const unsigned short target_vendor_id = 0x1d0f;  /* Amazon */
diff --git a/src/platform-aws.c b/src/platform-aws.c
index 018e3b670..cf8e7b2f3 100644
--- a/src/platform-aws.c
+++ b/src/platform-aws.c
@@ -253,7 +253,7 @@ static int validate_rdma_write(struct fid_ep *ep)
 }
 
 
-#if HAVE_CUDA
+#if HAVE_CUDA || HAVE_ROCM
 static int configure_nccl_proto(void)
 {
 	int ret;
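
Build sketch (illustrative only, not part of the patch; the /opt/rocm prefix
is an assumption and is simply the common default install location):

    # Configure against ROCm instead of CUDA. With neither --with-rocm nor
    # Neuron specified, the plugin still defaults to the CUDA build.
    ./configure --with-rocm=/opt/rocm
    make -j

    # Per the Makefile.am hunks above, the ROCm build produces a plugin
    # named librccl-net.so (instead of libnccl-net.so) for RCCL binary
    # compatibility, and configure disables the unit tests
    # (enable_tests="no") when ROCm is selected.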