diff --git a/configure.ac b/configure.ac
index 6e68a27e1..8c2f84055 100644
--- a/configure.ac
+++ b/configure.ac
@@ -93,10 +93,30 @@ CHECK_PKG_NEURON([AS_IF([test -n "${want_cuda}"],
                          [AC_MSG_ERROR([Cannot enable both CUDA and neuron.])],
                          [want_cuda=no])
                   have_device_interface=neuron])
-CHECK_PKG_CUDA([have_device_interface=cuda])
-
+# Select CUDA if Neuron wasn't specified and --with-rocm was not used.
+CHECK_PKG_CUDA(AS_IF([test "${have_device_interface}" = "no"],
+                     AS_IF([test -z "$with_rocm"], [have_device_interface=cuda])))
+# If neither CUDA nor Neuron is being used, select ROCm
+CHECK_PKG_ROCM(AS_IF([test "${have_device_interface}" = "no"], [have_device_interface=rocm]))
 AS_IF([test "${have_device_interface}" = "no"],
-      [AC_MSG_ERROR([NCCL OFI Plugin requires either CUDA or Neuron runtime.])])
+      [AC_MSG_ERROR([NCCL OFI Plugin requires either CUDA, ROCm or Neuron runtime.])])
+
+do_cuda=0
+do_rocm=0
+AS_IF([test -n "$with_rocm"],
+      [AS_IF([test "$have_device_interface" = "rocm"],
+             [enable_tests="no"
+              do_rocm=1
+             ])],
+      [AS_IF([test "$have_device_interface" = "cuda"], [do_cuda=1])])
+
+AC_DEFINE_UNQUOTED([HAVE_CUDA], [${do_cuda}], [Defined to 1 if CUDA is available])
+AM_CONDITIONAL([HAVE_CUDA], [test ${do_cuda} = 1])
+
+AC_DEFINE_UNQUOTED([HAVE_ROCM], [${do_rocm}], [Defined to 1 if ROCm is available])
+AM_CONDITIONAL([HAVE_ROCM], [test ${do_rocm} = 1])
+AS_IF([test ${do_rocm} = 1],
+      AC_DEFINE_UNQUOTED([__HIP_PLATFORM_AMD__], [1], [Select AMD/ROCm HIP APIs]))
 
 CHECK_PKG_HWLOC([],
                 [AC_MSG_ERROR([Could not find the hwloc library.  Use --with-hwloc to provide the path to non-standard hwloc installation.])])
diff --git a/include/nccl-headers/error.h b/include/nccl-headers/error.h
index 1afc7dce4..1f4f6128a 100644
--- a/include/nccl-headers/error.h
+++ b/include/nccl-headers/error.h
@@ -5,7 +5,7 @@
 #ifndef NCCL_HEADERS_ERROR_H
 #define NCCL_HEADERS_ERROR_H
 
-#if HAVE_CUDA
+#if HAVE_CUDA || HAVE_ROCM
 #include "nccl-headers/nvidia/err.h"
 #elif HAVE_NEURON
 #include "nccl-headers/neuron/error.h"
diff --git a/include/nccl-headers/net.h b/include/nccl-headers/net.h
index d632abe7c..db12933d0 100644
--- a/include/nccl-headers/net.h
+++ b/include/nccl-headers/net.h
@@ -5,7 +5,7 @@
 #ifndef NCCL_HEADERS_NET_H
 #define NCCL_HEADERS_NET_H
 
-#if HAVE_CUDA
+#if HAVE_CUDA || HAVE_ROCM
 #include "nccl-headers/nvidia/net.h"
 #elif HAVE_NEURON
 #include "nccl-headers/neuron/net.h"
diff --git a/include/nccl_ofi_cuda.h b/include/nccl_ofi_cuda.h
index 145ae1dc2..46cd425ed 100644
--- a/include/nccl_ofi_cuda.h
+++ b/include/nccl_ofi_cuda.h
@@ -41,8 +41,10 @@ extern int nccl_net_ofi_gpuDeviceGetCount(int* count);
 
 #if CUDA_VERSION >= 11030
 extern int nccl_net_ofi_gpuFlushGPUDirectRDMAWrites();
+#define HAVE_FLUSH_GPU_DIRECT_RDMA_WRITE 1
 #else
 extern void *nccl_net_ofi_gpuFlushGPUDirectRDMAWrites;
+#define HAVE_FLUSH_GPU_DIRECT_RDMA_WRITE 0
 #endif
 
 #ifdef _cplusplus
diff --git a/include/nccl_ofi_rocm.h b/include/nccl_ofi_rocm.h
new file mode 100644
index 000000000..fdccfb323
--- /dev/null
+++ b/include/nccl_ofi_rocm.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2024 Hewlett Packard Enterprise Development LP
+ * Copyright (c) 2018-2023 Amazon.com, Inc. or its affiliates. All rights reserved.
+ * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ */
+
+#ifndef NCCL_OFI_ROCM_H_
+#define NCCL_OFI_ROCM_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <hip/hip_runtime_api.h>
+
+/*
+ * Error checking is currently just success or failure.
+ */
+enum {
+	GPU_SUCCESS = 0,
+	GPU_ERROR = 999 /* Match hipErrorUnknown */
+};
+
+int nccl_net_ofi_gpu_init(void);
+
+/*
+ * @brief	Gets the GPU device associated with the buffer
+ *
+ * @param	data
+ *		Pointer to GPU buffer.
+ * @param	dev_id
+ *		Set to the GPU device ID backing the buffer, or -1 on error.
+ *
+ * @return	0 on success
+ *		non-zero on error
+ */
+int nccl_net_ofi_get_cuda_device(void *data, int *dev_id);
+int nccl_net_ofi_gpuDriverGetVersion(int *driverVersion);
+int nccl_net_ofi_gpuCtxGetDevice(int *device);
+int nccl_net_ofi_gpuDeviceGetCount(int* count);
+
+extern void *nccl_net_ofi_gpuFlushGPUDirectRDMAWrites;
+#define HAVE_FLUSH_GPU_DIRECT_RDMA_WRITE 0
+
+#ifdef __cplusplus
+} // End extern "C"
+#endif
+
+#endif // End NCCL_OFI_ROCM_H_
diff --git a/m4/check_pkg_cuda.m4 b/m4/check_pkg_cuda.m4
index 63bac32c7..97452dc67 100644
--- a/m4/check_pkg_cuda.m4
+++ b/m4/check_pkg_cuda.m4
@@ -49,9 +49,6 @@ AC_DEFUN([CHECK_PKG_CUDA], [
          CPPFLAGS="${check_pkg_CPPFLAGS_save}"
          $2])
 
-  AC_DEFINE_UNQUOTED([HAVE_CUDA], [${check_pkg_define}], [Defined to 1 if CUDA is available])
-  AM_CONDITIONAL([HAVE_CUDA], [test "${check_pkg_found}" = "yes"])
-
   AC_SUBST([CUDA_LDFLAGS])
   AC_SUBST([CUDA_LIBS])
 
diff --git a/m4/check_pkg_rocm.m4 b/m4/check_pkg_rocm.m4
new file mode 100644
index 000000000..47191af35
--- /dev/null
+++ b/m4/check_pkg_rocm.m4
@@ -0,0 +1,52 @@
+# -*- autoconf -*-
+#
+# Copyright (c) 2024 Hewlett Packard Enterprise Development LP
+# Copyright (c) 2023 Amazon.com, Inc. or its affiliates. All rights reserved.
+#
+# See LICENSE.txt for license information
+#
+
+AC_DEFUN([CHECK_PKG_ROCM], [
+  check_pkg_found="yes"
+  check_pkg_CPPFLAGS_save="${CPPFLAGS}"
+  check_pkg_LDFLAGS_save="${LDFLAGS}"
+  check_pkg_LIBS_save="${LIBS}"
+
+  AC_ARG_WITH([rocm],
+     [AS_HELP_STRING([--with-rocm=PATH], [Path to non-standard ROCm installation])])
+
+  AS_IF([test -z "${with_rocm}" -o "${with_rocm}" = "yes"],
+        [],
+        [test "${with_rocm}" = "no"],
+        [check_pkg_found=no],
+        [AS_IF([test -d ${with_rocm}/lib64], [check_pkg_libdir="lib64"], [check_pkg_libdir="lib"])
+         CPPFLAGS="-I${with_rocm}/include ${CPPFLAGS}"
+         LDFLAGS="-L${with_rocm}/${check_pkg_libdir} ${LDFLAGS}"])
+
+  AS_IF([test "${check_pkg_found}" = "yes"],
+        [AC_CHECK_LIB([amdhip64], [hipMemAllocHost], [], [check_pkg_found=no])])
+
+  AS_IF([test "${check_pkg_found}" = "yes"],
+        [AC_CHECK_HEADERS([hip/hip_runtime_api.h], [], [check_pkg_found=no], [#define __HIP_PLATFORM_AMD__])])
+
+
+  AS_IF([test "${check_pkg_found}" = "yes"],
+        [check_pkg_define="yes"],
+        [check_pkg_define="no"
+         CPPFLAGS="${check_pkg_CPPFLAGS_save}"
+         LDFLAGS="${check_pkg_LDFLAGS_save}"
+         LIBS="${check_pkg_LIBS_save}"
+        ])
+
+  AS_IF([test -n "${with_rocm}"],
+        [AS_IF([test "${check_pkg_define}" = "yes"],
+               [$1], [$2])
+        ], [$2]
+  )
+
+  AS_UNSET([check_pkg_found])
+  AS_UNSET([check_pkg_define])
+  AS_UNSET([check_pkg_CPPFLAGS_save])
+  AS_UNSET([check_pkg_LDFLAGS_save])
+  AS_UNSET([check_pkg_LIBS_save])
+])
diff --git a/src/Makefile.am b/src/Makefile.am
index 3ec2f2fe7..616b74dfa 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -26,52 +26,58 @@
 sources = \
 	nccl_ofi_ep_addr_list.c \
 	tracepoint.c
 
+tuner_sources = \
+	tuner/nccl_ofi_regions.c \
+	tuner/nccl_ofi_tuner.c
+
 if WANT_PLATFORM_AWS
   sources += platform-aws.c
 endif
 
 if ENABLE_NEURON
   sources += nccl_ofi_interface_neuron.c
-else
-  sources += nccl_ofi_cuda.c \
-	nccl_ofi_interface_nvidia.c
-endif
-
-# Build an internal-only library that can be used by unit tests as
-# well as the actual nccl_net.so / nccom_net.so libraries.  This saves
-# us writing dlopen() handlers for simple unit tests.
-noinst_LTLIBRARIES = libinternal_net_plugin.la
-libinternal_net_plugin_la_SOURCES = $(sources)
-libinternal_net_plugin_la_LDFLAGS = -avoid-version
 
-if ENABLE_NEURON
   lib_LTLIBRARIES = libnccom-net.la
   libnccom_net_la_SOURCES =
   libnccom_net_la_LIBADD = libinternal_net_plugin.la
   libnccom_net_la_LDFLAGS = -module -avoid-version
+endif
+
+if HAVE_CUDA
+  sources += nccl_ofi_cuda.c nccl_ofi_interface_nvidia.c
+if WANT_PLATFORM_AWS
+  # NCCL tuner plugin
+  lib_LTLIBRARIES = libnccl-net.la libnccl-ofi-tuner.la
+  libnccl_ofi_tuner_la_SOURCES = $(tuner_sources)
+  libnccl_ofi_tuner_la_LDFLAGS = -module -avoid-version
 else
   lib_LTLIBRARIES = libnccl-net.la
+endif
+
   libnccl_net_la_SOURCES =
   libnccl_net_la_LIBADD = libinternal_net_plugin.la
   libnccl_net_la_LDFLAGS = -module -avoid-version
 endif
+if HAVE_ROCM
+  sources += nccl_ofi_rocm.c nccl_ofi_interface_nvidia.c
+
+  lib_LTLIBRARIES = librccl-net.la
+  librccl_net_la_SOURCES =
+  librccl_net_la_LIBADD = libinternal_net_plugin.la
+  librccl_net_la_LDFLAGS = -module -avoid-version
+endif
+
+# Build an internal-only library that can be used by unit tests as
+# well as the actual nccl_net.so / nccom_net.so libraries.  This saves
+# us writing dlopen() handlers for simple unit tests.
+noinst_LTLIBRARIES = libinternal_net_plugin.la
+libinternal_net_plugin_la_SOURCES = $(sources)
+libinternal_net_plugin_la_LDFLAGS = -avoid-version
 
 #
 # Tuner
 #
 noinst_LTLIBRARIES += libinternal_tuner_plugin.la
-tuner_sources = \
-	tuner/nccl_ofi_regions.c \
-	tuner/nccl_ofi_tuner.c
 libinternal_tuner_plugin_la_SOURCES = $(tuner_sources)
 libinternal_tuner_plugin_la_LDFLAGS = -avoid-version
-
-if HAVE_CUDA
-if WANT_PLATFORM_AWS
-  # NCCL tuner plugin
-  lib_LTLIBRARIES += libnccl-ofi-tuner.la
-  libnccl_ofi_tuner_la_SOURCES = $(tuner_sources)
-  libnccl_ofi_tuner_la_LDFLAGS = -module -avoid-version
-endif
-endif
diff --git a/src/nccl_ofi_api.c b/src/nccl_ofi_api.c
index 763327885..1b0617626 100644
--- a/src/nccl_ofi_api.c
+++ b/src/nccl_ofi_api.c
@@ -297,7 +297,7 @@ ncclResult_t nccl_net_ofi_regMr(void *comm, void *data, size_t size, int type,
 	/* Validate type of buffer */
 	bool valid_buffer_type = false;
 	if (type == NCCL_PTR_HOST) valid_buffer_type = true;
-#if HAVE_CUDA
+#if HAVE_CUDA || HAVE_ROCM
 	if (type == NCCL_PTR_CUDA) valid_buffer_type = true;
 #endif
 #if HAVE_NEURON
diff --git a/src/nccl_ofi_net.c b/src/nccl_ofi_net.c
index 0cfb04442..578f102d2 100644
--- a/src/nccl_ofi_net.c
+++ b/src/nccl_ofi_net.c
@@ -19,6 +19,8 @@
 #include "nccl_ofi_tracepoint.h"
 #if HAVE_CUDA
 #include "nccl_ofi_cuda.h"
+#elif HAVE_ROCM
+#include "nccl_ofi_rocm.h"
 #endif
 #include "nccl_ofi_sendrecv.h"
 #include "nccl_ofi_rdma.h"
@@ -151,7 +153,7 @@ int nccl_net_ofi_create_plugin(nccl_net_ofi_plugin_t **plugin_p)
 	assert(NCCL_OFI_IS_POWER_OF_TWO(system_page_size));
 	assert(system_page_size > 0);
 
-#if HAVE_CUDA
+#if HAVE_CUDA || HAVE_ROCM
 	ret = nccl_net_ofi_gpu_init();
 	if (ret != 0) {
 		NCCL_OFI_WARN("CUDA initialization failed.");
@@ -167,7 +169,7 @@ int nccl_net_ofi_create_plugin(nccl_net_ofi_plugin_t **plugin_p)
 	NCCL_OFI_INFO(NCCL_INIT | NCCL_NET, "Using CUDA driver version %d", cuda_version);
 
 	if (ofi_nccl_cuda_flush_enable()) {
-		if (nccl_net_ofi_gpuFlushGPUDirectRDMAWrites == NULL) {
+		if (!HAVE_FLUSH_GPU_DIRECT_RDMA_WRITE) {
 			NCCL_OFI_WARN("CUDA flush requested, but cuFlushGPUDirectRDMAWrites not found.");
 			cuda_flush = false;
 		} else {
diff --git a/src/nccl_ofi_ofiutils.c b/src/nccl_ofi_ofiutils.c
index f3ecfca7c..8cde9d11d 100644
--- a/src/nccl_ofi_ofiutils.c
+++ b/src/nccl_ofi_ofiutils.c
@@ -19,6 +19,8 @@
"nccl_ofi_tracepoint.h" #if HAVE_CUDA #include "nccl_ofi_cuda.h" +#elif HAVE_ROCM +#include "nccl_ofi_rocm.h" #endif #include "nccl_ofi_math.h" #include "nccl_ofi_ofiutils.h" @@ -342,7 +344,7 @@ int nccl_ofi_ofiutils_init_connection(int api_version, struct fi_info *info, str * using the Libfabric 1.18 API with HMEM support. */ if (api_version == FI_VERSION(1,18) && support_gdr != GDR_UNSUPPORTED) { -#if (HAVE_CUDA && HAVE_DECL_FI_OPT_CUDA_API_PERMITTED) +#if ((HAVE_CUDA || HAVE_ROCM) && HAVE_DECL_FI_OPT_CUDA_API_PERMITTED) bool optval = false; ret = fi_setopt(&(*ep)->fid, FI_OPT_ENDPOINT, FI_OPT_CUDA_API_PERMITTED, &optval, diff --git a/src/nccl_ofi_rdma.c b/src/nccl_ofi_rdma.c index 31c0f3030..e12bcf95d 100644 --- a/src/nccl_ofi_rdma.c +++ b/src/nccl_ofi_rdma.c @@ -14,6 +14,8 @@ #include "nccl_ofi.h" #if HAVE_CUDA #include "nccl_ofi_cuda.h" +#elif HAVE_ROCM +#include "nccl_ofi_rocm.h" #endif #include "nccl_ofi_ep_addr_list.h" #include "nccl_ofi_param.h" @@ -385,7 +387,7 @@ static int set_mr_req_attr(nccl_ofi_idpool_t *key_pool, int dev_id, mr_attr->access |= FI_READ; mr_attr->iface = FI_HMEM_SYSTEM; break; -#if HAVE_CUDA +#if HAVE_CUDA || HAVE_ROCM case NCCL_PTR_CUDA: mr_attr->access |= FI_REMOTE_READ; mr_attr->iface = FI_HMEM_CUDA; diff --git a/src/nccl_ofi_rocm.c b/src/nccl_ofi_rocm.c new file mode 100644 index 000000000..6d97d0665 --- /dev/null +++ b/src/nccl_ofi_rocm.c @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2024 Hewlett Packard Enterprise Development LP + * Copyright (c) 2018-2023 Amazon.com, Inc. or its affiliates. All rights reserved. + * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + */ + +#include "config.h" + +#include + +#include "nccl_ofi.h" +#include "nccl_ofi_rocm.h" + +int nccl_net_ofi_gpuDriverGetVersion(int *driverVersion) { + return hipDriverGetVersion(driverVersion) == hipSuccess ? GPU_SUCCESS : GPU_ERROR; +} + +int nccl_net_ofi_gpuCtxGetDevice(int *device) { + return hipGetDevice(device) == hipSuccess ? GPU_SUCCESS : GPU_ERROR; +} + +int nccl_net_ofi_gpuDeviceGetCount(int *count) { + return hipGetDeviceCount(count) == hipSuccess ? GPU_SUCCESS : GPU_ERROR; +} + +void *nccl_net_ofi_gpuFlushGPUDirectRDMAWrites = NULL; +#define HAVE_FLUSH_GPU_DIRECT_RDMA_WRITE 0 + +int +nccl_net_ofi_gpu_init(void) +{ + return 0; +} + +int nccl_net_ofi_get_cuda_device(void *data, int *dev_id) +{ + int ret = 0; + int cuda_device = -1; + unsigned int mem_type; + unsigned int device_ordinal; + hipError_t cuda_ret_mem = hipPointerGetAttribute(&device_ordinal, + HIP_POINTER_ATTRIBUTE_DEVICE_ORDINAL, + (hipDeviceptr_t) data); + hipError_t cuda_ret_dev = hipPointerGetAttribute(&mem_type, + HIP_POINTER_ATTRIBUTE_MEMORY_TYPE, + (hipDeviceptr_t) data); + + if (cuda_ret_mem != hipSuccess || cuda_ret_dev != hipSuccess) { + ret = -ENOTSUP; + NCCL_OFI_WARN("Invalid buffer pointer provided"); + goto exit; + } + + if (mem_type == hipMemoryTypeDevice) { + cuda_device = device_ordinal; + } else { + ret = -EINVAL; + NCCL_OFI_WARN("Invalid type of buffer provided. 
+
+ exit:
+	*dev_id = cuda_device;
+	return ret;
+}
diff --git a/src/nccl_ofi_sendrecv.c b/src/nccl_ofi_sendrecv.c
index 3adf351e9..7bbf455f2 100644
--- a/src/nccl_ofi_sendrecv.c
+++ b/src/nccl_ofi_sendrecv.c
@@ -15,6 +15,8 @@
 #include "nccl_ofi.h"
 #if HAVE_CUDA
 #include "nccl_ofi_cuda.h"
+#elif HAVE_ROCM
+#include "nccl_ofi_rocm.h"
 #endif
 #include "nccl_ofi_param.h"
 #include "nccl_ofi_sendrecv.h"
@@ -550,7 +552,7 @@ static int register_mr_buffers(struct fid_domain *domain, struct fid_ep *ep,
 		mr_attr.access |= FI_READ;
 		mr_attr.iface = FI_HMEM_SYSTEM;
 		break;
-#if HAVE_CUDA
+#if HAVE_CUDA || HAVE_ROCM
 	case NCCL_PTR_CUDA:
 		mr_attr.access |= FI_REMOTE_READ;
 		mr_attr.iface = FI_HMEM_CUDA;
@@ -689,7 +691,7 @@ static int reg_mr_base(struct fid_domain *domain, struct fid_ep *ep,
 	/* Validate type of buffer */
 	bool valid_buffer_type = false;
 	if (type == NCCL_PTR_HOST) valid_buffer_type = true;
-#if HAVE_CUDA
+#if HAVE_CUDA || HAVE_ROCM
 	if (type == NCCL_PTR_CUDA) valid_buffer_type = true;
 #endif
 #if HAVE_NEURON
diff --git a/src/nccl_ofi_topo.c b/src/nccl_ofi_topo.c
index b8e0d9e35..dbe4969d5 100644
--- a/src/nccl_ofi_topo.c
+++ b/src/nccl_ofi_topo.c
@@ -20,6 +20,9 @@
 #if HAVE_CUDA
 static const uint8_t target_class_id = 0x03;	/* Display controller class */
 static const unsigned short target_vendor_id = 0x10de;	/* NVIDIA */
+#elif HAVE_ROCM
+static const uint8_t target_class_id = 0x03;	/* Display controller class */
+static const unsigned short target_vendor_id = 0x1002;	/* AMD */
 #else
 static const uint8_t target_class_id = 0x08;	/* System peripheral */
 static const unsigned short target_vendor_id = 0x1d0f;	/* Amazon */
diff --git a/src/platform-aws.c b/src/platform-aws.c
index f980efd0a..edd43dea2 100644
--- a/src/platform-aws.c
+++ b/src/platform-aws.c
@@ -264,7 +264,7 @@ static int validate_rdma_write(struct fid_ep *ep)
 }
 
 
-#if HAVE_CUDA
+#if HAVE_CUDA || HAVE_ROCM
 static int configure_nccl_proto(void)
 {
 	int ret;