-
Notifications
You must be signed in to change notification settings - Fork 58
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add ROCm support based on configure option.
ROCm provides an interface similar to CUDA, to work with AMD GPUs. Provide a compile time option to build with ROCm instead of CUDA. 1. Add --with-rocm= flag to ./configure. 2. Make all CUDA calls "gpu" calls, which are independent of the underlying framework. 3. Switch between _rocm and _cuda files at compile time to make the appropriate calls. 4. When building for RCCL (AMD's NCCL), generate a rccl-net.so-named plugin for binary compatibility. Tested on: 1. HPE Cray EX with EX235A BardPeak GPUs + 200Gb Slingshot adapters. 2. HPE Cray EX with NVIDIA A100 SXM4 80GB GPUs + 200 Gb Slingshot adapters. Signed-off-by: Ryan Hankins <ryan.hankins@hpe.com>
- Loading branch information
1 parent
8e836e5
commit b1a22d5
Showing
16 changed files
with
240 additions
and
40 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
/* | ||
* Copyright (c) 2024 Hewlett Packard Enterprise Development LP | ||
* Copyright (c) 2018-2023 Amazon.com, Inc. or its affiliates. All rights reserved. | ||
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. | ||
*/ | ||
|
||
#ifndef NCCL_OFI_ROCM_H_
#define NCCL_OFI_ROCM_H_

/*
 * The HIP header is included before the extern "C" block on purpose:
 * system headers should not be wrapped in a C-linkage region by their
 * includers.
 */
#include <hip/hip_runtime_api.h>

#ifdef __cplusplus
extern "C" {
#endif

/*
 * Error checking is currently just success or failure.
 */
enum {
	GPU_SUCCESS = 0,
	GPU_ERROR = 999 /* Match hipErrorUnknown */
};

/*
 * @brief	Initialize the GPU backend
 *
 * @return	0 on success
 *		non-zero on error
 */
int nccl_net_ofi_gpu_init(void);

/*
 * @brief	Gets the GPU device associated with the buffer
 *
 * @param	data
 *		Pointer to GPU buffer.
 * @param	dev_id
 *		Output: valid GPU device ID on success, -1 on error.
 *
 * @return	0 on success
 *		non-zero (negative errno value) on error
 */
int nccl_net_ofi_get_cuda_device(void *data, int *dev_id);

/*
 * Thin wrappers over the GPU runtime. Each stores its result through the
 * supplied pointer and returns GPU_SUCCESS or GPU_ERROR.
 */
int nccl_net_ofi_gpuDriverGetVersion(int *driverVersion);
int nccl_net_ofi_gpuCtxGetDevice(int *device);
int nccl_net_ofi_gpuDeviceGetCount(int *count);

/*
 * NOTE(review): presumably HAVE_FLUSH_GPU_DIRECT_RDMA_WRITE == 0 tells shared
 * code that the flush entry point below is not usable on this backend --
 * confirm against the users of this macro.
 */
extern void *nccl_net_ofi_gpuFlushGPUDirectRDMAWrites;
#define HAVE_FLUSH_GPU_DIRECT_RDMA_WRITE 0

#ifdef __cplusplus
} // End extern "C"
#endif

#endif // End NCCL_OFI_ROCM_H_
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,52 @@ | ||
# -*- autoconf -*- | ||
# | ||
# Copyright (c) 2024 Hewlett Packard Enterprise Development LP | ||
# Copyright (c) 2023 Amazon.com, Inc. or its affiliates. All rights reserved. | ||
# | ||
# See LICENSE.txt for license information | ||
# | ||
|
||
dnl CHECK_PKG_ROCM([action-if-found], [action-if-not-found])
dnl
dnl Probes for ROCm (libamdhip64 and hip/hip_runtime_api.h) and registers the
dnl --with-rocm=PATH option. ROCm is opt-in: action-if-found runs only when
dnl --with-rocm was given (with or without a path) and both probes succeed.
dnl In every other case CPPFLAGS, LDFLAGS and LIBS are restored to their
dnl values on entry and action-if-not-found runs.
AC_DEFUN([CHECK_PKG_ROCM], [
  check_pkg_found="yes"
  check_pkg_CPPFLAGS_save="${CPPFLAGS}"
  check_pkg_LDFLAGS_save="${LDFLAGS}"
  check_pkg_LIBS_save="${LIBS}"

  AC_ARG_WITH([rocm],
     [AS_HELP_STRING([--with-rocm=PATH], [Path to non-standard ROCm installation])])

  dnl Unset or "yes": probe the default search paths only.
  dnl "no": ROCm explicitly disabled.
  dnl Anything else is treated as an installation prefix.
  AS_IF([test -z "${with_rocm}" || test "${with_rocm}" = "yes"],
        [],
        [test "${with_rocm}" = "no"],
        [check_pkg_found=no],
        [AS_IF([test -d "${with_rocm}/lib64"], [check_pkg_libdir="lib64"], [check_pkg_libdir="lib"])
         CPPFLAGS="-I${with_rocm}/include ${CPPFLAGS}"
         LDFLAGS="-L${with_rocm}/${check_pkg_libdir} ${LDFLAGS}"])

  AS_IF([test "${check_pkg_found}" = "yes"],
        [AC_CHECK_LIB([amdhip64], [hipMemAllocHost], [], [check_pkg_found=no])])

  AS_IF([test "${check_pkg_found}" = "yes"],
        [AC_CHECK_HEADERS([hip/hip_runtime_api.h], [], [check_pkg_found=no], [#define __HIP_PLATFORM_AMD__])])

  dnl Keep the modified flags only when ROCm was both requested and found.
  AS_IF([test "${check_pkg_found}" = "yes" && test -n "${with_rocm}"],
        [check_pkg_define="yes"],
        [check_pkg_define="no"
         CPPFLAGS="${check_pkg_CPPFLAGS_save}"
         LDFLAGS="${check_pkg_LDFLAGS_save}"
         LIBS="${check_pkg_LIBS_save}"
        ])

  AS_IF([test "${check_pkg_define}" = "yes"],
        [$1],
        [$2])

  AS_UNSET([check_pkg_found])
  AS_UNSET([check_pkg_define])
  AS_UNSET([check_pkg_libdir])
  AS_UNSET([check_pkg_CPPFLAGS_save])
  AS_UNSET([check_pkg_LDFLAGS_save])
  AS_UNSET([check_pkg_LIBS_save])
])
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,63 @@ | ||
/* | ||
* Copyright (c) 2024 Hewlett Packard Enterprise Development LP | ||
* Copyright (c) 2018-2023 Amazon.com, Inc. or its affiliates. All rights reserved. | ||
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. | ||
*/ | ||
|
||
#include "config.h" | ||
|
||
#include <dlfcn.h> | ||
|
||
#include "nccl_ofi.h" | ||
#include "nccl_ofi_rocm.h" | ||
|
||
int nccl_net_ofi_gpuDriverGetVersion(int *driverVersion) { | ||
return hipDriverGetVersion(driverVersion) == hipSuccess ? GPU_SUCCESS : GPU_ERROR; | ||
} | ||
|
||
int nccl_net_ofi_gpuCtxGetDevice(int *device) { | ||
return hipGetDevice(device) == hipSuccess ? GPU_SUCCESS : GPU_ERROR; | ||
} | ||
|
||
int nccl_net_ofi_gpuDeviceGetCount(int *count) { | ||
return hipGetDeviceCount(count) == hipSuccess ? GPU_SUCCESS : GPU_ERROR; | ||
} | ||
|
||
/*
 * Defined as NULL here; nothing in this file assigns it afterwards.
 * NOTE(review): presumably a placeholder so that shared code referencing
 * this symbol still links in ROCm builds -- confirm against its users.
 */
void *nccl_net_ofi_gpuFlushGPUDirectRDMAWrites = NULL; | ||
|
||
/*
 * Backend initialization hook. This implementation performs no work and
 * always reports success (0).
 */
int nccl_net_ofi_gpu_init(void)
{
	const int status = 0;

	return status;
}
|
||
int nccl_net_ofi_get_cuda_device(void *data, int *dev_id) | ||
{ | ||
int ret = 0; | ||
int cuda_device = -1; | ||
unsigned int mem_type; | ||
unsigned int device_ordinal; | ||
hipError_t cuda_ret_mem = hipPointerGetAttribute(&device_ordinal, | ||
HIP_POINTER_ATTRIBUTE_DEVICE_ORDINAL, | ||
(hipDeviceptr_t) data); | ||
hipError_t cuda_ret_dev = hipPointerGetAttribute(&mem_type, | ||
HIP_POINTER_ATTRIBUTE_MEMORY_TYPE, | ||
(hipDeviceptr_t) data); | ||
|
||
if (cuda_ret_mem != hipSuccess || cuda_ret_dev != hipSuccess) { | ||
ret = -ENOTSUP; | ||
NCCL_OFI_WARN("Invalid buffer pointer provided"); | ||
goto exit; | ||
} | ||
|
||
if (mem_type == hipMemoryTypeDevice) { | ||
cuda_device = device_ordinal; | ||
} else { | ||
ret = -EINVAL; | ||
NCCL_OFI_WARN("Invalid type of buffer provided. Only device memory is expected for NCCL_PTR_CUDA type"); | ||
} | ||
|
||
exit: | ||
*dev_id = cuda_device; | ||
return ret; | ||
} |
Oops, something went wrong.