Add ROCm support based on configure option.
ROCm provides an interface similar to CUDA for working with AMD GPUs.
Provide a compile-time option to build with ROCm instead of CUDA.

1. Add a --with-rocm= flag to ./configure (see the sketch below).
2. Rename the CUDA calls to generic "gpu" calls that are independent of
   the underlying framework.
3. Switch between _rocm and _cuda files at compile time to make the
   appropriate calls.
4. When building for RCCL (AMD's NCCL), name the plugin librccl-net.so
   for binary compatibility.
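
A minimal usage sketch for the new option (install paths are illustrative
assumptions, not taken from this change):

   # Configure against a ROCm install instead of CUDA (path is an example)
   ./configure --with-rocm=/opt/rocm
   make
   # libtool keeps the RCCL-compatible plugin in .libs until "make install"
   ls src/.libs/librccl-net.so*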

Tested on:

1. HPE Cray EX with EX235a (Bard Peak) AMD GPU nodes + 200 Gb Slingshot
   adapters.
2. HPE Cray EX with NVIDIA A100 SXM4 80GB GPUs + 200 Gb Slingshot
   adapters.

Signed-off-by: Ryan Hankins <ryan.hankins@hpe.com>
ryanhankins committed Aug 15, 2024
1 parent 8e836e5 commit b1a22d5
Showing 16 changed files with 240 additions and 40 deletions.
26 changes: 23 additions & 3 deletions configure.ac
@@ -93,10 +93,30 @@ CHECK_PKG_NEURON([AS_IF([test -n "${want_cuda}"],
[AC_MSG_ERROR([Cannot enable both CUDA and neuron.])],
[want_cuda=no])
have_device_interface=neuron])
CHECK_PKG_CUDA([have_device_interface=cuda])

# Select CUDA if Neuron wasn't specified and --with-rocm was not used.
CHECK_PKG_CUDA(AS_IF([test "${have_device_interface}" = "no"],
AS_IF([test -z "$with_rocm"], [have_device_interface=cuda])))
# If neither CUDA nor Neuron is being used, select ROCm
CHECK_PKG_ROCM(AS_IF([test "${have_device_interface}" = "no"], [have_device_interface=rocm]))
AS_IF([test "${have_device_interface}" = "no"],
[AC_MSG_ERROR([NCCL OFI Plugin requires either CUDA or Neuron runtime.])])
[AC_MSG_ERROR([NCCL OFI Plugin requires either CUDA, ROCm or Neuron runtime.])])

do_cuda=0
do_rocm=0
AS_IF([test -n "$with_rocm"],
[AS_IF([test "$have_device_interface" = "rocm"],
[enable_tests="no"
do_rocm=1
])],
[AS_IF([test "$have_device_interface" = "cuda"], [do_cuda=1])])

AC_DEFINE_UNQUOTED([HAVE_CUDA], [${do_cuda}], [Defined to 1 if CUDA is available])
AM_CONDITIONAL([HAVE_CUDA], [test ${do_cuda} = 1])

AC_DEFINE_UNQUOTED([HAVE_ROCM], [${do_rocm}], [Defined to 1 if ROCm is available])
AM_CONDITIONAL([HAVE_ROCM], [test ${do_rocm} = 1])
AS_IF([test ${do_rocm} = 1],
      [AC_DEFINE_UNQUOTED([__HIP_PLATFORM_AMD__], [1], [Select AMD/ROCm HIP APIs])])

CHECK_PKG_HWLOC([],
[AC_MSG_ERROR([Could not find the hwloc library. Use --with-hwloc to provide the path to non-standard hwloc installation.])])
2 changes: 1 addition & 1 deletion include/nccl-headers/error.h
@@ -5,7 +5,7 @@
#ifndef NCCL_HEADERS_ERROR_H
#define NCCL_HEADERS_ERROR_H

#if HAVE_CUDA
#if HAVE_CUDA || HAVE_ROCM
#include "nccl-headers/nvidia/err.h"
#elif HAVE_NEURON
#include "nccl-headers/neuron/error.h"
2 changes: 1 addition & 1 deletion include/nccl-headers/net.h
@@ -5,7 +5,7 @@
#ifndef NCCL_HEADERS_NET_H
#define NCCL_HEADERS_NET_H

#if HAVE_CUDA
#if HAVE_CUDA || HAVE_ROCM
#include "nccl-headers/nvidia/net.h"
#elif HAVE_NEURON
#include "nccl-headers/neuron/net.h"
2 changes: 2 additions & 0 deletions include/nccl_ofi_cuda.h
@@ -41,8 +41,10 @@ extern int nccl_net_ofi_gpuDeviceGetCount(int* count);

#if CUDA_VERSION >= 11030
extern int nccl_net_ofi_gpuFlushGPUDirectRDMAWrites();
#define HAVE_FLUSH_GPU_DIRECT_RDMA_WRITE 1
#else
extern void *nccl_net_ofi_gpuFlushGPUDirectRDMAWrites;
#define HAVE_FLUSH_GPU_DIRECT_RDMA_WRITE 0
#endif

#ifdef _cplusplus
49 changes: 49 additions & 0 deletions include/nccl_ofi_rocm.h
@@ -0,0 +1,49 @@
/*
* Copyright (c) 2024 Hewlett Packard Enterprise Development LP
* Copyright (c) 2018-2023 Amazon.com, Inc. or its affiliates. All rights reserved.
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
*/

#ifndef NCCL_OFI_ROCM_H_
#define NCCL_OFI_ROCM_H_

#ifdef __cplusplus
extern "C" {
#endif

#include <hip/hip_runtime_api.h>

/*
* Error checking is currently just success or failure.
*/
enum {
GPU_SUCCESS = 0,
GPU_ERROR = 999 /* Match hipErrorUnknown */
};

int nccl_net_ofi_gpu_init(void);

/*
* @brief Gets the GPU device associated with the buffer
*
* @param data
* Pointer to GPU buffer.
*
* @param dev_id
*        Returned GPU device ID associated with the buffer.
*
* @return 0 on success
*         non-zero on error
*/
int nccl_net_ofi_get_cuda_device(void *data, int *dev_id);
int nccl_net_ofi_gpuDriverGetVersion(int *driverVersion);
int nccl_net_ofi_gpuCtxGetDevice(int *device);
int nccl_net_ofi_gpuDeviceGetCount(int* count);

extern void *nccl_net_ofi_gpuFlushGPUDirectRDMAWrites;
#define HAVE_FLUSH_GPU_DIRECT_RDMA_WRITE 0

#ifdef __cplusplus
} // End extern "C"
#endif

#endif // End NCCL_OFI_ROCM_H_
3 changes: 0 additions & 3 deletions m4/check_pkg_cuda.m4
@@ -49,9 +49,6 @@ AC_DEFUN([CHECK_PKG_CUDA], [
CPPFLAGS="${check_pkg_CPPFLAGS_save}"
$2])
AC_DEFINE_UNQUOTED([HAVE_CUDA], [${check_pkg_define}], [Defined to 1 if CUDA is available])
AM_CONDITIONAL([HAVE_CUDA], [test "${check_pkg_found}" = "yes"])
AC_SUBST([CUDA_LDFLAGS])
AC_SUBST([CUDA_LIBS])
52 changes: 52 additions & 0 deletions m4/check_pkg_rocm.m4
@@ -0,0 +1,52 @@
# -*- autoconf -*-
#
# Copyright (c) 2024 Hewlett Packard Enterprise Development LP
# Copyright (c) 2023 Amazon.com, Inc. or its affiliates. All rights reserved.
#
# See LICENSE.txt for license information
#

AC_DEFUN([CHECK_PKG_ROCM], [
check_pkg_found="yes"
check_pkg_CPPFLAGS_save="${CPPFLAGS}"
check_pkg_LDFLAGS_save="${LDFLAGS}"
check_pkg_LIBS_save="${LIBS}"
AC_ARG_WITH([rocm],
[AS_HELP_STRING([--with-rocm=PATH], [Path to non-standard ROCm installation])])
AS_IF([test -z "${with-rocm}" -o "{with_rocm}" = "yes"],
[],
[test "${with_rocm}" = "no"],
[check_pkg_found=no],
[AS_IF([test -d ${with_rocm}/lib64], [check_pkg_libdir="lib64"], [check_pkg_libdir="lib"])
CPPFLAGS="-I${with_rocm}/include ${CPPFLAGS}"
LDFLAGS="-L${with_rocm}/${check_pkg_libdir} ${LDFLAGS}"])
AS_IF([test "${check_pkg_found}" = "yes"],
[AC_CHECK_LIB([amdhip64], [hipMemAllocHost], [], [check_pkg_found=no])])
AS_IF([test "${check_pkg_found}" = "yes"],
[AC_CHECK_HEADERS([hip/hip_runtime_api.h], [], [check_pkg_found=no], [#define __HIP_PLATFORM_AMD__])])
AS_IF([test "${check_pkg_found}" = "yes"],
[check_pkg_define="yes"],
[check_pkg_define="no"
CPPFLAGS="${check_pkg_CPPFLAGS_save}"
LDFLAGS="${check_pkg_LDFLAGS_save}"
LIBS="${check_pkg_LIBS_save}"
])
AS_IF([test -n "${with_rocm}"],
[AS_IF([test "${check_pkg_define}" = "yes"],
[$1], [$2] )
], [$2]
)
AS_UNSET([check_pkg_found])
AS_UNSET([check_pkg_define])
AS_UNSET([check_pkg_CPPFLAGS_save])
AS_UNSET([check_pkg_LDFLAGS_save])
AS_UNSET([check_pkg_LIBS_save])
])
54 changes: 30 additions & 24 deletions src/Makefile.am
@@ -26,52 +26,58 @@ sources = \
nccl_ofi_ep_addr_list.c \
tracepoint.c

tuner_sources = \
tuner/nccl_ofi_regions.c \
tuner/nccl_ofi_tuner.c

if WANT_PLATFORM_AWS
sources += platform-aws.c
endif

if ENABLE_NEURON
sources += nccl_ofi_interface_neuron.c
else
sources += nccl_ofi_cuda.c \
nccl_ofi_interface_nvidia.c
endif

# Build an internal-only library that can be used by unit tests as
# well as the actual nccl_net.so / nccom_net.so libraries. This saves
# us writing dlopen() handlers for simple unit tests.
noinst_LTLIBRARIES = libinternal_net_plugin.la
libinternal_net_plugin_la_SOURCES = $(sources)
libinternal_net_plugin_la_LDFLAGS = -avoid-version

if ENABLE_NEURON
lib_LTLIBRARIES = libnccom-net.la
libnccom_net_la_SOURCES =
libnccom_net_la_LIBADD = libinternal_net_plugin.la
libnccom_net_la_LDFLAGS = -module -avoid-version
endif

if HAVE_CUDA
sources += nccl_ofi_cuda.c nccl_ofi_interface_nvidia.c
if WANT_PLATFORM_AWS
# NCCL tuner plugin
lib_LTLIBRARIES = libnccl-net.la libnccl-ofi-tuner.la
libnccl_ofi_tuner_la_SOURCES = $(tuner_sources)
libnccl_ofi_tuner_la_LDFLAGS = -module -avoid-version
else
lib_LTLIBRARIES = libnccl-net.la
endif

libnccl_net_la_SOURCES =
libnccl_net_la_LIBADD = libinternal_net_plugin.la
libnccl_net_la_LDFLAGS = -module -avoid-version
endif

if HAVE_ROCM
sources += nccl_ofi_rocm.c nccl_ofi_interface_nvidia.c

lib_LTLIBRARIES = librccl-net.la
librccl_net_la_SOURCES =
librccl_net_la_LIBADD = libinternal_net_plugin.la
librccl_net_la_LDFLAGS = -module -avoid-version
endif

# Build an internal-only library that can be used by unit tests as
# well as the actual nccl_net.so / nccom_net.so libraries. This saves
# us writing dlopen() handlers for simple unit tests.
noinst_LTLIBRARIES = libinternal_net_plugin.la
libinternal_net_plugin_la_SOURCES = $(sources)
libinternal_net_plugin_la_LDFLAGS = -avoid-version

#
# Tuner
#
noinst_LTLIBRARIES += libinternal_tuner_plugin.la
tuner_sources = \
tuner/nccl_ofi_regions.c \
tuner/nccl_ofi_tuner.c
libinternal_tuner_plugin_la_SOURCES = $(tuner_sources)
libinternal_tuner_plugin_la_LDFLAGS = -avoid-version

if HAVE_CUDA
if WANT_PLATFORM_AWS
# NCCL tuner plugin
lib_LTLIBRARIES += libnccl-ofi-tuner.la
libnccl_ofi_tuner_la_SOURCES = $(tuner_sources)
libnccl_ofi_tuner_la_LDFLAGS = -module -avoid-version
endif
endif
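
A hedged summary of the build artifacts implied by the Makefile fragments
above (paths are illustrative; libtool keeps intermediates under .libs until
"make install"):

   # CUDA build: NCCL plugin, plus the tuner plugin on AWS platforms
   ls src/.libs/libnccl-net.so* src/.libs/libnccl-ofi-tuner.so*
   # ROCm build: RCCL-compatible plugin; configure also disables the unit tests
   ls src/.libs/librccl-net.so*
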
2 changes: 1 addition & 1 deletion src/nccl_ofi_api.c
@@ -297,7 +297,7 @@ ncclResult_t nccl_net_ofi_regMr(void *comm, void *data, size_t size, int type,
/* Validate type of buffer */
bool valid_buffer_type = false;
if (type == NCCL_PTR_HOST) valid_buffer_type = true;
#if HAVE_CUDA
#if HAVE_CUDA || HAVE_ROCM
if (type == NCCL_PTR_CUDA) valid_buffer_type = true;
#endif
#if HAVE_NEURON
6 changes: 4 additions & 2 deletions src/nccl_ofi_net.c
@@ -19,6 +19,8 @@
#include "nccl_ofi_tracepoint.h"
#if HAVE_CUDA
#include "nccl_ofi_cuda.h"
#elif HAVE_ROCM
#include "nccl_ofi_rocm.h"
#endif
#include "nccl_ofi_sendrecv.h"
#include "nccl_ofi_rdma.h"
@@ -151,7 +153,7 @@ int nccl_net_ofi_create_plugin(nccl_net_ofi_plugin_t **plugin_p)
assert(NCCL_OFI_IS_POWER_OF_TWO(system_page_size));
assert(system_page_size > 0);

#if HAVE_CUDA
#if HAVE_CUDA || HAVE_ROCM
ret = nccl_net_ofi_gpu_init();
if (ret != 0) {
NCCL_OFI_WARN("CUDA initialization failed.");
@@ -167,7 +169,7 @@ int nccl_net_ofi_create_plugin(nccl_net_ofi_plugin_t **plugin_p)

NCCL_OFI_INFO(NCCL_INIT | NCCL_NET, "Using CUDA driver version %d", cuda_version);
if (ofi_nccl_cuda_flush_enable()) {
if (nccl_net_ofi_gpuFlushGPUDirectRDMAWrites == NULL) {
if (!HAVE_FLUSH_GPU_DIRECT_RDMA_WRITE) {
NCCL_OFI_WARN("CUDA flush requested, but cuFlushGPUDirectRDMAWrites not found.");
cuda_flush = false;
} else {
4 changes: 3 additions & 1 deletion src/nccl_ofi_ofiutils.c
@@ -19,6 +19,8 @@
#include "nccl_ofi_tracepoint.h"
#if HAVE_CUDA
#include "nccl_ofi_cuda.h"
#elif HAVE_ROCM
#include "nccl_ofi_rocm.h"
#endif
#include "nccl_ofi_math.h"
#include "nccl_ofi_ofiutils.h"
@@ -342,7 +344,7 @@ int nccl_ofi_ofiutils_init_connection(int api_version, struct fi_info *info, str
* using the Libfabric 1.18 API with HMEM support.
*/
if (api_version == FI_VERSION(1,18) && support_gdr != GDR_UNSUPPORTED) {
#if (HAVE_CUDA && HAVE_DECL_FI_OPT_CUDA_API_PERMITTED)
#if ((HAVE_CUDA || HAVE_ROCM) && HAVE_DECL_FI_OPT_CUDA_API_PERMITTED)
bool optval = false;
ret = fi_setopt(&(*ep)->fid, FI_OPT_ENDPOINT,
FI_OPT_CUDA_API_PERMITTED, &optval,
4 changes: 3 additions & 1 deletion src/nccl_ofi_rdma.c
@@ -14,6 +14,8 @@
#include "nccl_ofi.h"
#if HAVE_CUDA
#include "nccl_ofi_cuda.h"
#elif HAVE_ROCM
#include "nccl_ofi_rocm.h"
#endif
#include "nccl_ofi_ep_addr_list.h"
#include "nccl_ofi_param.h"
@@ -385,7 +387,7 @@ static int set_mr_req_attr(nccl_ofi_idpool_t *key_pool, int dev_id,
mr_attr->access |= FI_READ;
mr_attr->iface = FI_HMEM_SYSTEM;
break;
#if HAVE_CUDA
#if HAVE_CUDA || HAVE_ROCM
case NCCL_PTR_CUDA:
mr_attr->access |= FI_REMOTE_READ;
mr_attr->iface = FI_HMEM_CUDA;
63 changes: 63 additions & 0 deletions src/nccl_ofi_rocm.c
@@ -0,0 +1,63 @@
/*
* Copyright (c) 2024 Hewlett Packard Enterprise Development LP
* Copyright (c) 2018-2023 Amazon.com, Inc. or its affiliates. All rights reserved.
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
*/

#include "config.h"

#include <dlfcn.h>

#include "nccl_ofi.h"
#include "nccl_ofi_rocm.h"

int nccl_net_ofi_gpuDriverGetVersion(int *driverVersion) {
return hipDriverGetVersion(driverVersion) == hipSuccess ? GPU_SUCCESS : GPU_ERROR;
}

int nccl_net_ofi_gpuCtxGetDevice(int *device) {
return hipGetDevice(device) == hipSuccess ? GPU_SUCCESS : GPU_ERROR;
}

int nccl_net_ofi_gpuDeviceGetCount(int *count) {
return hipGetDeviceCount(count) == hipSuccess ? GPU_SUCCESS : GPU_ERROR;
}

void *nccl_net_ofi_gpuFlushGPUDirectRDMAWrites = NULL;

int
nccl_net_ofi_gpu_init(void)
{
return 0;
}

int nccl_net_ofi_get_cuda_device(void *data, int *dev_id)
{
int ret = 0;
int cuda_device = -1;
unsigned int mem_type;
unsigned int device_ordinal;
hipError_t cuda_ret_mem = hipPointerGetAttribute(&mem_type,
HIP_POINTER_ATTRIBUTE_MEMORY_TYPE,
(hipDeviceptr_t) data);
hipError_t cuda_ret_dev = hipPointerGetAttribute(&device_ordinal,
HIP_POINTER_ATTRIBUTE_DEVICE_ORDINAL,
(hipDeviceptr_t) data);

if (cuda_ret_mem != hipSuccess || cuda_ret_dev != hipSuccess) {
ret = -ENOTSUP;
NCCL_OFI_WARN("Invalid buffer pointer provided");
goto exit;
}

if (mem_type == hipMemoryTypeDevice) {
cuda_device = device_ordinal;
} else {
ret = -EINVAL;
NCCL_OFI_WARN("Invalid type of buffer provided. Only device memory is expected for NCCL_PTR_CUDA type");
}

exit:
*dev_id = cuda_device;
return ret;
}
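
At run time the ROCm build is consumed the same way as the CUDA build; a
minimal sketch, assuming an install prefix of /opt/aws-ofi-nccl (the prefix
and RCCL's plugin search behavior are assumptions, not part of this commit):

   # Make librccl-net.so visible to RCCL's plugin loader
   export LD_LIBRARY_PATH=/opt/aws-ofi-nccl/lib:$LD_LIBRARY_PATH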