From cd45df6425713032642dc7ed7a9cae036142b5b1 Mon Sep 17 00:00:00 2001
From: Nicholas Sielicki
Date: Mon, 25 Nov 2024 16:02:01 -0800
Subject: [PATCH] tree: cleanup "gdr_support" variable

In the default case, we lazily create all fabric resources at the time
of communicator creation, so that they end up owned by the correct
thread and/or resident on the correct cpu socket and memory domain.

Previously, there existed an ugly dependency chain in our init: while
the large majority of the provider properties that we care about can be
extracted from fi_getinfo responses, some can only be effectively
queried by attempting mutations against an existing
endpoint/domain/etc. and seeing whether they fail. A further subset of
these properties needs to be exposed back to nccl by nccl-net-ofi at
getProperties time, prior to communicator instantiation.

To work around this, late in init we pick a device, instantiate it,
query the attributes we need for getProperties, and then tear it all
down. This is expensive and delays our init, and it exposes us to bugs
from incomplete teardown.

The sole case in the codebase where this is still necessary is
detecting gdr support for FI_HMEM_CUDA. With dmabuf now the default, it
is relatively safe to skip that probe and optimistically assume support
when both cuda device attributes are true and FI_HMEM is available in
the provider.

Signed-off-by: Nicholas Sielicki
---
 include/nccl_ofi.h                       |   6 +-
 include/nccl_ofi_cuda.h                  |  17 ++++
 include/nccl_ofi_param.h                 |   3 +
 m4/check_pkg_cuda.m4                     |  10 +-
 src/nccl_ofi_cuda.c                      |  66 +++++++++++---
 src/nccl_ofi_net.c                       |  94 +++++-------------
 src/nccl_ofi_ofiutils.c                  |  54 +++--------
 src/nccl_ofi_rdma.c                      |  22 ++---
 src/nccl_ofi_sendrecv.c                  | 111 ++++++++++-------------
 src/platform-aws.c                       |  24 +++--
 tests/functional/nccl_connection.c       |  23 -----
 tests/functional/nccl_message_transfer.c |  45 ++-------
 tests/functional/ring.c                  |  67 +++++---------
 13 files changed, 218 insertions(+), 324 deletions(-)

diff --git a/include/nccl_ofi.h b/include/nccl_ofi.h
index 98ea8b1a2..456add197 100644
--- a/include/nccl_ofi.h
+++ b/include/nccl_ofi.h
@@ -81,15 +81,11 @@ extern "C" {
 /* Initial number of entries in the MR cache of a device */
 #define NCCL_OFI_MR_CACHE_INIT_SIZE 128
 
-/* Indicates if GPUDirect is supported by libfabric provider */
-enum gdr_support_level_t {GDR_UNKNOWN, GDR_SUPPORTED, GDR_UNSUPPORTED};
-extern enum gdr_support_level_t support_gdr;
-
-
 /* Indicates if the cudaDeviceFlushGPUDirectRDMAWrites function should be used
  * to flush data to the GPU. Note, CUDA flush support is not supported on all
  * platforms and should be disabled by default */
 extern bool cuda_flush;
+extern bool gdr_flush_disabled;
 
 /* number of duplicate providers to create for each discovered
  * provider, including renaming to cause NCCL to create additional
diff --git a/include/nccl_ofi_cuda.h b/include/nccl_ofi_cuda.h
index 4523f0614..0ecc23c6e 100644
--- a/include/nccl_ofi_cuda.h
+++ b/include/nccl_ofi_cuda.h
@@ -66,6 +66,23 @@ bool nccl_net_ofi_cuda_have_dma_buf_attr(void);
  */
 bool nccl_net_ofi_cuda_have_gdr_support_attr(void);
 
+/*
+ * @brief query CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_FLUSH_WRITES_OPTIONS
+ *
+ * @return true if attr is fetched successfully and true.
+ *         false otherwise
+ */
+bool nccl_net_ofi_cuda_have_gdr_flush_support_attr(void);
+
+/*
+ * @brief test whether gdrcopy can possibly be supported, depending on the
+ *        linked libfabric version and the properties exposed by cuda.
+ *
+ * @return true if gdrcopy use is plausible given the linked libfabric
+ *         version and the cuda device attributes. false otherwise
+ */
+bool nccl_net_ofi_cuda_gdr_viable(void);
+
 #ifdef __cplusplus
 } // End extern "C"
 #endif
diff --git a/include/nccl_ofi_param.h b/include/nccl_ofi_param.h
index fe02afdf8..cea2695f0 100644
--- a/include/nccl_ofi_param.h
+++ b/include/nccl_ofi_param.h
@@ -281,6 +281,9 @@ OFI_NCCL_PARAM_INT(disable_gdr_required_check, "DISABLE_GDR_REQUIRED_CHECK", 0);
  */
 OFI_NCCL_PARAM_INT(disable_dmabuf, "DISABLE_DMABUF", 0);
 
+/* Largely exists for parity with DISABLE_DMABUF, but usage of this is discouraged. */
+OFI_NCCL_PARAM_INT(disable_gdrcopy, "DISABLE_GDRCOPY", 0);
+
 /*
  * Messages sized larger than this threshold will be striped across multiple rails
  */
diff --git a/m4/check_pkg_cuda.m4 b/m4/check_pkg_cuda.m4
index 8aade58fc..42cf4b5f2 100644
--- a/m4/check_pkg_cuda.m4
+++ b/m4/check_pkg_cuda.m4
@@ -53,15 +53,15 @@ AC_DEFUN([CHECK_PKG_CUDA], [
                       [check_pkg_found=no],
                       [-ldl -lrt])])
 
-  check_cuda_gdr_flush_define=0
+  check_cuda_gdr_define=0
   AS_IF([test "${check_pkg_found}" = "yes"],
         [
-         AC_MSG_CHECKING([if CUDA 11.3+ is available for GDR Write Flush support])
+         AC_MSG_CHECKING([if CUDA 11.3+ is available for GDR + GDR Write Flush support])
          AC_COMPILE_IFELSE([AC_LANG_PROGRAM([
                #include <cuda.h>
               _Static_assert(CUDA_VERSION >= 11030, "cudart>=11030 required for cuFlushGPUDirectRDMAWrites");
-         ])],[ check_cuda_gdr_flush_define=1 chk_result=yes ],
-              [ check_cuda_gdr_flush_define=0 chk_result=no ])
+         ])],[ check_cuda_gdr_define=1 chk_result=yes ],
+              [ check_cuda_gdr_define=0 chk_result=no ])
          AC_MSG_RESULT(${chk_result})
         ])
 
@@ -85,7 +85,7 @@ AC_DEFUN([CHECK_PKG_CUDA], [
 
   AC_DEFINE_UNQUOTED([HAVE_CUDA], [${check_pkg_define}], [Defined to 1 if CUDA is available])
   AC_DEFINE_UNQUOTED([HAVE_CUDA_DMABUF_SUPPORT], [${check_cuda_dmabuf_define}], [Defined to 1 if CUDA DMA-BUF support is available])
-  AC_DEFINE_UNQUOTED([HAVE_CUDA_GDRFLUSH_SUPPORT], [${check_cuda_gdr_flush_define}], [Defined to 1 if CUDA cuFlushGPUDirectRDMAWrites support is available])
+  AC_DEFINE_UNQUOTED([HAVE_CUDA_GDR_SUPPORT], [${check_cuda_gdr_define}], [Defined to 1 if CUDA GDR and cuFlushGPUDirectRDMAWrites support is available])
 
   AM_CONDITIONAL([HAVE_CUDA], [test "${check_pkg_found}" = "yes"])
   AC_SUBST([CUDA_LDFLAGS])
diff --git a/src/nccl_ofi_cuda.c b/src/nccl_ofi_cuda.c
index 17c0f9009..6d914ce7d 100644
--- a/src/nccl_ofi_cuda.c
+++ b/src/nccl_ofi_cuda.c
@@ -69,19 +69,23 @@ int nccl_net_ofi_cuda_init(void)
 	RESOLVE_CUDA_FUNCTION(cuCtxGetDevice);
 	RESOLVE_CUDA_FUNCTION(cuDeviceGetAttribute);
 
-	if (HAVE_CUDA_GDRFLUSH_SUPPORT && nccl_net_ofi_cuda_have_gdr_support_attr() && ofi_nccl_cuda_flush_enable()) {
-		NCCL_OFI_WARN("CUDA flush enabled");
-		cuda_flush = true;
-	} else {
+	cuda_flush = ofi_nccl_cuda_flush_enable();
+	gdr_flush_disabled = ofi_nccl_gdr_flush_disable();
+
+#if HAVE_CUDA_GDR_SUPPORT
+	if (!(nccl_net_ofi_cuda_gdr_viable() &&
+	      nccl_net_ofi_cuda_have_gdr_flush_support_attr())) {
+		gdr_flush_disabled = true;
 		cuda_flush = false;
 	}
+#endif
 
 	return 0;
 }
 
 int nccl_net_ofi_cuda_flush_gpudirect_rdma_writes(void)
 {
-#if HAVE_CUDA_GDRFLUSH_SUPPORT
+#if HAVE_CUDA_GDR_SUPPORT
 	static_assert(CUDA_VERSION >= 11030, "Requires cudart>=11.3");
 	cudaError_t ret = cudaDeviceFlushGPUDirectRDMAWrites(cudaFlushGPUDirectRDMAWritesTargetCurrentDevice,
 							     cudaFlushGPUDirectRDMAWritesToOwner);
@@ -129,9 +133,9 @@ int nccl_net_ofi_get_cuda_device_for_addr(void *data, int *dev_id)
 	};
 }
 
-bool nccl_net_ofi_cuda_have_gdr_support_attr(void)
+bool nccl_net_ofi_cuda_have_gdr_flush_support_attr(void)
 {
-#if HAVE_CUDA_GDRFLUSH_SUPPORT
+#if HAVE_CUDA_GDR_SUPPORT
 	if (pfn_cuCtxGetDevice == NULL || pfn_cuDeviceGetAttribute == NULL) {
 		return false;
 	}
@@ -143,13 +147,29 @@ bool nccl_net_ofi_cuda_have_gdr_support_attr(void)
 	}
 
 	int supported;
-	result = pfn_cuDeviceGetAttribute(&supported, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED, dev);
-	if (result != CUDA_SUCCESS || !((bool)supported)) {
+	result = pfn_cuDeviceGetAttribute(&supported, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_FLUSH_WRITES_OPTIONS, dev);
+	return result == CUDA_SUCCESS && ((supported & CU_FLUSH_GPU_DIRECT_RDMA_WRITES_OPTION_HOST) != 0);
+#else
+	return false;
+#endif
+}
+
+bool nccl_net_ofi_cuda_have_gdr_support_attr(void)
+{
+#if HAVE_CUDA_GDR_SUPPORT
+	if (pfn_cuCtxGetDevice == NULL || pfn_cuDeviceGetAttribute == NULL) {
 		return false;
 	}
-	result = pfn_cuDeviceGetAttribute(&supported, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_FLUSH_WRITES_OPTIONS, dev);
-	return result == CUDA_SUCCESS && ((supported & CU_FLUSH_GPU_DIRECT_RDMA_WRITES_OPTION_HOST) != 0);
+	CUdevice dev;
+	CUresult result = pfn_cuCtxGetDevice(&dev);
+	if (result != CUDA_SUCCESS) {
+		return false;
+	}
+
+	int supported;
+	result = pfn_cuDeviceGetAttribute(&supported, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED, dev);
+	return result == CUDA_SUCCESS && (bool)supported;
 #else
 	return false;
 #endif
 }
@@ -179,3 +199,27 @@ bool nccl_net_ofi_cuda_have_dma_buf_attr(void)
 	return false;
 #endif
 }
+
+bool nccl_net_ofi_cuda_gdr_viable(void)
+{
+	/* Disable GDR if building against too-old libfabric. */
+	if (FI_VERSION_LT(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), FI_VERSION(1, 18))) {
+		NCCL_OFI_TRACE(NCCL_INIT | NCCL_NET, "Will not use GDR, requires Libfabric 1.18 or greater.");
+		return false;
+	}
+
+	/* Disable GDR if explicitly disabled by user. */
+	if (ofi_nccl_disable_gdrcopy()) {
+		NCCL_OFI_TRACE(NCCL_INIT | NCCL_NET, "Will not attempt to use GDRCopy, explicitly disabled by user.");
+		return false;
+	}
+
+	/* Disable GDR if CUDA does not report GDR support in device attributes. */
+	if (!nccl_net_ofi_cuda_have_gdr_support_attr()) {
+		NCCL_OFI_TRACE(NCCL_INIT | NCCL_NET,
+			       "Will not attempt to use GDRCopy, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED was false.");
+		return false;
+	}
+
+	return true;
+}
diff --git a/src/nccl_ofi_net.c b/src/nccl_ofi_net.c
index ac285d47c..f0b199080 100644
--- a/src/nccl_ofi_net.c
+++ b/src/nccl_ofi_net.c
@@ -30,13 +30,11 @@
 #include "nccl_ofi_ofiutils.h"
 #include "nccl_ofi_system.h"
 
-/* Indicates if GPUDirect is supported by libfabric provider */
-enum gdr_support_level_t support_gdr = GDR_UNKNOWN;
-
 /* Indicates if the cudaDeviceFlushGPUDirectRDMAWrites function should be used
  * to flush data to the GPU. Note, CUDA flush support is not supported on all
  * platforms and should be disabled by default */
 bool cuda_flush = false;
+bool gdr_flush_disabled = true;
 
 /* number of duplicate providers to create for each discovered
  * provider, including renaming to cause NCCL to create additional
@@ -136,9 +134,6 @@ int nccl_net_ofi_create_plugin(nccl_net_ofi_plugin_t **plugin_p)
 	int ret = 0;
 	const char *provider_filter = NULL;
 	nccl_net_ofi_plugin_t *plugin;
-	nccl_net_ofi_ep_t *base_ep = NULL;
-	nccl_net_ofi_device_t *device = NULL;
-	nccl_ofi_properties_t properties;
 
 	NCCL_OFI_INFO(NCCL_INIT | NCCL_NET, "Initializing " PACKAGE_STRING);
 
@@ -162,6 +157,11 @@ int nccl_net_ofi_create_plugin(nccl_net_ofi_plugin_t **plugin_p)
 	 */
 	mr_cache_alignment = NCCL_OFI_MIN(system_page_size, NCCL_OFI_CACHE_PAGE_SIZE);
 
+	/* configuration parameters */
+	nic_dup_conns = ofi_nccl_nic_dup_conns();
+	net_latency = (float)ofi_nccl_net_latency();
+	cq_read_count = ofi_nccl_cq_read_count();
+
 #if HAVE_CUDA
 	ret = nccl_net_ofi_cuda_init();
 	if (ret != 0) {
@@ -170,17 +170,22 @@ int nccl_net_ofi_create_plugin(nccl_net_ofi_plugin_t **plugin_p)
 	}
 #endif
 
-	/* configuration parameters */
-	nic_dup_conns = ofi_nccl_nic_dup_conns();
-	net_latency = (float)ofi_nccl_net_latency();
-	cq_read_count = ofi_nccl_cq_read_count();
-
 	if (platform_init) {
 		ret = platform_init(&provider_filter);
 		if (ret != 0)
 			goto exit;
 	}
 
+#if HAVE_CUDA
+	if (nic_dup_conns > 0 && nccl_net_ofi_cuda_have_gdr_support_attr()) {
+		NCCL_OFI_WARN(
+			"NCCL_OFI_NIC_DUP_CONNS set on platform that supports GPUDirect RDMA. This configuration is not "
+			"supported.");
+		ret = -ENOTSUP;
+		goto exit;
+	}
+#endif
+
 	/* This is ugly, but here's the basic protocol selection
 	 * logic:
 	 * 1. if the user set NCCL_OFI_PROTOCOL, use that.
@@ -285,55 +290,6 @@ int nccl_net_ofi_create_plugin(nccl_net_ofi_plugin_t **plugin_p)
 		goto exit;
 	}
 
-	/* In order to set endpoint options and potentially NCCL configuration
-	 * options (such as NCCL_PROTO) during the plugin initialization
-	 * process, we need to create an endpoint and call the platform hook
-	 * "platform_config_endpoint" using "get_ep". This code makes the
-	 * assumption that the thread calling "nccl_net_ofi_init" will make
-	 * communication calls. As well, since without this code the endpoint
-	 * would be created the first time "get_ep" in called during a listen or
-	 * connect call, creating the endpoint earlier would not be a waste of
-	 * resources. This initialization happens once per process, and thus it
-	 * does not matter which device is used to create the endpoint.
-	 */
-	device = plugin->get_device(plugin, 0);
-
-	ret = device->get_ep(device, &base_ep);
-	if (ret != 0) {
-		goto exit;
-	}
-	ret = device->get_properties(device, &properties);
-	if (ret != 0) {
-		goto exit;
-	}
-	NCCL_OFI_INFO(NCCL_NET | NCCL_INIT, "Support for global registrations: %s",
-		      (properties.regIsGlobal == 0) ? "false" : "true");
-	NCCL_OFI_INFO(NCCL_NET | NCCL_INIT, "Support for DMA-BUF registrations: %s",
-		      (properties.dmabuf_support == 0) ? "false" : "true");
-	/* Cause release to not actually free the resources, to speed
-	 * up initialization, since the very same resources will be
-	 * recreated by NCCL soon after initialization to do real
-	 * communication.
-	 */
-	base_ep->ref_cnt++;
-	ret = base_ep->release_ep(base_ep);
-	base_ep->ref_cnt--;
-	if (ret != 0) {
-		goto exit;
-	}
-
-	assert(support_gdr != GDR_UNKNOWN);
-
-	/* we don't actually know if GDR is supported until we've
-	 * created the first endpoint, so this check needs to be way
-	 * down here
-	 */
-	if (nic_dup_conns > 0 && support_gdr != GDR_UNSUPPORTED) {
-		NCCL_OFI_WARN("NCCL_OFI_NIC_DUP_CONNS set on platform that supports GPUDirect RDMA. This configuration is not supported.");
-		ret = -ENOTSUP;
-		goto exit;
-	}
-
 	*plugin_p = plugin;
 
 exit:
@@ -416,12 +372,7 @@ static int set_nic_props_default(int dev_id, struct fi_info *nic_prov,
 	 */
 	props->max_group_receives = NCCL_OFI_MAX_RECVS;
 
-	if (support_gdr == GDR_SUPPORTED) {
-		props->hmem_support = true;
-	} else {
-		props->hmem_support = false;
-	}
-
+	props->hmem_support = false;
 	props->dmabuf_support = false;
 
 	/* Should be successful for ptrSupport invocation */
@@ -580,14 +531,19 @@ int nccl_net_ofi_info_properties(nccl_net_ofi_plugin_t *plugin, struct fi_info *
 
 	props->max_mr_key_size = nic_prov->domain_attr->mr_key_size;
 
+	props->hmem_support = ((nic_prov->caps & FI_HMEM) != 0) &&
+			      FI_VERSION_GE(nic_prov->fabric_attr->api_version, FI_VERSION(1, 18)) &&
+			      (HAVE_NEURON || nccl_net_ofi_cuda_have_gdr_support_attr());
 	props->dmabuf_support = ((nic_prov->caps & FI_HMEM) != 0) &&
 			FI_VERSION_GE(nic_prov->fabric_attr->api_version, FI_VERSION(1, 20)) &&
 			nccl_ofi_dmabuf_viable()
 		;
-	if (props->dmabuf_support) {
-		NCCL_OFI_TRACE(NCCL_INIT | NCCL_NET, "DMA-BUF support is advertised in properties.");
-	}
+
+	NCCL_OFI_TRACE(NCCL_INIT | NCCL_NET,
+		       "NCCL properties: dmabuf=%s hmem=%s",
+		       props->dmabuf_support ? "yes" : "no",
+		       props->hmem_support ? "yes" : "no");
 
 	goto exit;
 
 error:
diff --git a/src/nccl_ofi_ofiutils.c b/src/nccl_ofi_ofiutils.c
index a7dafab8b..a555d6966 100644
--- a/src/nccl_ofi_ofiutils.c
+++ b/src/nccl_ofi_ofiutils.c
@@ -4,22 +4,23 @@
  */
 
 #include "config.h"
-
+#include
+#include
 #include
 #include
 #include
 #include
-#include
-#include
 #include
-#include
+#include
 
 #include "nccl_ofi.h"
-#include "nccl_ofi_param.h"
-#include "nccl_ofi_tracepoint.h"
+
+#include "nccl_ofi_cuda.h"
 #include "nccl_ofi_math.h"
+#include "nccl_ofi_param.h"
 #include "nccl_ofi_ofiutils.h"
 #include "nccl_ofi_platform.h"
+#include "nccl_ofi_tracepoint.h"
 
 #define EFA_PROVIDER_NAME "efa"
 #define IS_EFA_PROVIDER(NAME) (strcmp((NAME), EFA_PROVIDER_NAME)==0)
@@ -364,53 +365,20 @@ int nccl_ofi_ofiutils_init_connection(struct fi_info *info, struct fid_domain *d
 	 * disabling CUDA in old Libfabric, just require newer
 	 * Libfabric.
 	 */
-	if (FI_VERSION_GE(info->fabric_attr->api_version,
-			  FI_VERSION(1, 18)) && support_gdr != GDR_UNSUPPORTED) {
 #if (HAVE_CUDA && HAVE_DECL_FI_OPT_CUDA_API_PERMITTED)
+	if (FI_VERSION_GE(info->fabric_attr->api_version, FI_VERSION(1, 18)) && nccl_net_ofi_cuda_have_gdr_support_attr()) {
 		bool optval = false;
 		ret = fi_setopt(&(*ep)->fid, FI_OPT_ENDPOINT,
 				FI_OPT_CUDA_API_PERMITTED, &optval,
 				sizeof(optval));
-		if (ret == -FI_EOPNOTSUPP || ret == -FI_ENOPROTOOPT) {
-			if (support_gdr == GDR_SUPPORTED) {
-				/* If we got here, that means we previously said
-				 * we definitely had GDR support, but now don't.
-				 * Since we may have already told NCCL that we
-				 * support GDR, we should just abort.
-				 */
-				NCCL_OFI_WARN("GDR support reported to NCCL but then couldn't be configured on an endpoint. Cannot continue.");
-				goto error;
-			} else {
-				NCCL_OFI_INFO(NCCL_INIT | NCCL_NET, "Could not disable CUDA API usage for HMEM, disabling GDR");
-				/* If we can't disable CUDA, then we don't really
-				 * have GDR, so disable GDR support from the NCCL
-				 * point of view.
-				 */
-				support_gdr = GDR_UNSUPPORTED;
-			}
-		} else if (ret == 0) {
-			NCCL_OFI_TRACE(NCCL_INIT | NCCL_NET, "Set endpoint option FI_OPT_CUDA_API_PERMITTED. GDR Supported");
-			/* we were able to disable CUDA, so we can do GDR */
-			support_gdr = GDR_SUPPORTED;
-		} else {
+		if (ret != 0) {
 			NCCL_OFI_WARN("Failed to set FI_OPT_CUDA_API_PERMITTED. RC: %d, ERROR: %s",
 				      ret, fi_strerror(-ret));
 			goto error;
 		}
-#elif HAVE_NEURON
-		/*
-		 * Provider discovery for Neuron will have been successful only
-		 * if HMEM capabilities were guaranteed by the libfabric
-		 * provider. Unlike CUDA, we do not need to handle the
-		 * runtime/endpoint deadlock with fi_setopt(), so move the flag
-		 * to supported.
-		 */
-		support_gdr = GDR_SUPPORTED;
-#else
-		NCCL_OFI_WARN("Using Libfabric 1.18 API with GPUDirect RDMA support, and FI_OPT_CUDA_API_PERMITTED is not declared.");
-		goto error;
-#endif
 	}
+#endif
+
 	/* Run platform-specific endpoint configuration hook if declared */
 	if (platform_config_endpoint) {
 		ret = platform_config_endpoint(info, *ep);
diff --git a/src/nccl_ofi_rdma.c b/src/nccl_ofi_rdma.c
index 65466806a..53dd1f14c 100644
--- a/src/nccl_ofi_rdma.c
+++ b/src/nccl_ofi_rdma.c
@@ -3620,7 +3620,8 @@ static int recv(nccl_net_ofi_recv_comm_t *recv_comm, int n, void **buffers,
 
 static inline bool is_flush_buff_enabled(void)
 {
-	return !ofi_nccl_gdr_flush_disable() && support_gdr == GDR_SUPPORTED && !cuda_flush;
+	const bool gdr_flush_disabled = (bool)ofi_nccl_gdr_flush_disable();
+	return !cuda_flush && !gdr_flush_disabled;
 }
 
 /*
@@ -4161,6 +4162,7 @@ static int flush(nccl_net_ofi_recv_comm_t *recv_comm, int n, void **buffers,
 		 int *sizes, nccl_net_ofi_mr_handle_t **mhandles,
 		 nccl_net_ofi_req_t **base_req)
 {
+	const bool gdr_flush_disabled = (bool)ofi_nccl_gdr_flush_disable();
 	int ret = 0;
 	int flush_n = 0;
 	bool network_busy = false;
@@ -4193,8 +4195,9 @@ static int flush(nccl_net_ofi_recv_comm_t *recv_comm, int n, void **buffers,
 		goto error;
 	}
 
-	if (ofi_nccl_gdr_flush_disable() || support_gdr == GDR_UNSUPPORTED)
+	if (gdr_flush_disabled) {
 		goto exit;
+	}
 
 #if HAVE_CUDA
 	if (cuda_flush) {
@@ -7719,20 +7722,7 @@ int nccl_net_ofi_rdma_init(const char *provider_filter,
 	ret = nccl_ofi_ofiutils_get_providers(provider_filter, api_version, hints,
 					      &provider_list, &num_providers);
 	if (ret == 0) {
-		NCCL_OFI_TRACE(NCCL_INIT | NCCL_NET, "Using Libfabric %u.%u API, with %s support",
-			       FI_MAJOR(api_version),
-			       FI_MINOR(api_version),
-			       FI_VERSION_GE(api_version, FI_VERSION(1, 20)) ? "DMA-BUF" : "GPUDirect RDMA");
-		/* The 1.18 API allows providers to use CUDA to
-		 * support HMEM pointers, so just having HMEM doesn't
-		 * tell us anything about the usability of CUDA
-		 * pointers with NCCL. So leave the state unknown
-		 * until we create an endpoint and try to disable
-		 * CUDA
-		 */
-		NCCL_OFI_TRACE(NCCL_INIT | NCCL_NET,
-			       "Using Libfabric 1.18 API, with GPUDirect RDMA support");
-		support_gdr = GDR_UNKNOWN;
+		NCCL_OFI_INFO(NCCL_INIT | NCCL_NET, "Using Libfabric %u.%u API", FI_MAJOR(api_version), FI_MINOR(api_version));
 	} else {
 		NCCL_OFI_WARN("OFI fi_getinfo() call failed: %s", fi_strerror(ret));
 		goto error;
diff --git a/src/nccl_ofi_sendrecv.c b/src/nccl_ofi_sendrecv.c
index b2b5e3357..483944de6 100644
--- a/src/nccl_ofi_sendrecv.c
+++ b/src/nccl_ofi_sendrecv.c
@@ -1032,7 +1032,7 @@ static int sendrecv_recv_comm_close(nccl_net_ofi_recv_comm_t *recv_comm)
 		goto exit;
 	}
 
-	if (!ofi_nccl_gdr_flush_disable() && support_gdr == GDR_SUPPORTED && !cuda_flush) {
+	if (!ofi_nccl_gdr_flush_disable() && !cuda_flush) {
 		NCCL_OFI_TRACE(NCCL_NET, "De-registering buffer for flush operations");
 		/* Deregister Flush buffer memory region */
 		mr_handle = (struct fid_mr *)r_comm->flush_buff.mr_handle;
@@ -1078,8 +1078,9 @@ static int sendrecv_recv_comm_flush(nccl_net_ofi_recv_comm_t *recv_comm, int n,
 	int flush_n = -1;
 	struct fid_mr **mr_handles = (struct fid_mr **)mhandles;
 
-	if (ofi_nccl_gdr_flush_disable() || support_gdr == GDR_UNSUPPORTED)
+	if (gdr_flush_disabled) {
 		goto exit;
+	}
 
 #if HAVE_CUDA
 	if (cuda_flush) {
@@ -1347,7 +1348,7 @@ static nccl_net_ofi_sendrecv_recv_comm_t *sendrecv_recv_comm_prepare(nccl_net_of
 	 * Setup flush resources if using GPUDirect RDMA unless user disables
 	 * flush operations
 	 */
-	if (!ofi_nccl_gdr_flush_disable() && support_gdr == GDR_SUPPORTED && !cuda_flush) {
+	if (!ofi_nccl_gdr_flush_disable() && !cuda_flush) {
 		r_comm->flush_buff.size = NCCL_OFI_FLUSH_SIZE;
 		ret = sendrecv_recv_comm_alloc_and_reg_flush_buff(domain, ep->ofi_ep, key_pool,
 								  &r_comm->flush_buff, dev_id);
@@ -2519,6 +2520,32 @@ static int nccl_net_ofi_sendrecv_plugin_create(size_t num_devices,
 }
 
 
+static uint32_t sendrecv_get_required_api(void)
+{
+	const uint32_t lib_api = fi_version();
+	if (nccl_ofi_dmabuf_viable()) {
+		return NCCL_OFI_MIN(lib_api, FI_VERSION(1, 20));
+	}
+#if HAVE_NEURON
+	else {
+		/* XXX: neuron will not request libfabric<1.18, not because I know of any
+		 * specific 1.18 behavior neuron relies on relative to 1.6, but because it's
+		 * what the previous code did.
+		 */
+		return NCCL_OFI_MIN(lib_api, FI_VERSION(1, 18));
+	}
+#elif HAVE_CUDA
+	else if (nccl_net_ofi_cuda_gdr_viable()) {
+		/* need at least 1.18 for gdrcopy */
+		return NCCL_OFI_MIN(lib_api, FI_VERSION(1, 18));
+	} else {
+		/* 1.6 otherwise. */
+		return NCCL_OFI_MIN(lib_api, FI_VERSION(1, 6));
+	}
+#endif
+}
+
+
 int nccl_net_ofi_sendrecv_init(const char *provider_filter,
 			       nccl_net_ofi_plugin_t **plugin_p)
 {
@@ -2526,75 +2553,37 @@ int nccl_net_ofi_sendrecv_init(const char *provider_filter,
 	struct fi_info *provider_list = NULL;
 	unsigned int num_providers;
 	nccl_net_ofi_sendrecv_plugin_t *plugin = NULL;
-	struct fi_info *hints;
-	hints = fi_allocinfo();
+	uint32_t required_api = sendrecv_get_required_api();
+	NCCL_OFI_TRACE(NCCL_INIT | NCCL_NET, "Selected libfabric %u.%u API", FI_MAJOR(required_api), FI_MINOR(required_api));
+
+	struct fi_info *hints = fi_allocinfo();
 	if (hints == NULL) {
 		NCCL_OFI_WARN("Allocation of fi_info failed");
 		ret = -FI_ENOMEM;
 		goto error;
 	}
-
-	if (nccl_ofi_dmabuf_viable()) {
-		sendrecv_get_hints(hints, true);
-		ret = nccl_ofi_ofiutils_get_providers(provider_filter,
-						      FI_VERSION(1, 20),
-						      hints,
-						      &provider_list,
-						      &num_providers);
-		if (ret == 0) {
-			NCCL_OFI_TRACE(NCCL_INIT | NCCL_NET, "Using Libfabric 1.20 API, with DMA-BUF support");
-			support_gdr = GDR_UNKNOWN;
-			goto found;
-		}
-	}
 
 	sendrecv_get_hints(hints, true);
-	ret = nccl_ofi_ofiutils_get_providers(provider_filter, FI_VERSION(1, 18), hints,
-					      &provider_list, &num_providers);
-	if (ret == 0) {
-		/* The 1.18 API allows providers to use CUDA to
-		 * support HMEM pointers, so just having HMEM doesn't
-		 * tell us anything about the usability of CUDA
-		 * pointers with NCCL. So leave the state unknown
-		 * until we create an endpoint and try to disable
-		 * CUDA
-		 */
-		NCCL_OFI_TRACE(NCCL_INIT | NCCL_NET,
-			       "Using Libfabric 1.18 API, with GPUDirect RDMA support");
-		support_gdr = GDR_UNKNOWN;
-		goto found;
-	}
-
-	sendrecv_get_hints(hints, true);
-	ret = nccl_ofi_ofiutils_get_providers(provider_filter, FI_VERSION(1, 6), hints,
-					      &provider_list, &num_providers);
-	if (ret == 0) {
-		NCCL_OFI_TRACE(NCCL_INIT | NCCL_NET,
-			       "Using Libfabric 1.6 API, with GPUDirect RDMA support");
-		support_gdr = GDR_SUPPORTED;
-		goto found;
-	}
-
-	sendrecv_get_hints(hints, false);
-	ret = nccl_ofi_ofiutils_get_providers(provider_filter, FI_VERSION(1, 6), hints,
-					      &provider_list, &num_providers);
+	ret = nccl_ofi_ofiutils_get_providers(provider_filter, required_api, hints, &provider_list, &num_providers);
+	fi_freeinfo(hints);
 	if (ret == 0) {
 		NCCL_OFI_TRACE(NCCL_INIT | NCCL_NET,
-			       "Using Libfabric 1.6 API, without GPUDirect RDMA support");
-		support_gdr = GDR_UNSUPPORTED;
-		goto found;
-	}
-
-	ret = -FI_ENODATA;
-found:
-	fi_freeinfo(hints);
-	if (ret != 0 && ret != -FI_ENODATA) {
-		NCCL_OFI_WARN("OFI fi_getinfo() call failed: %s", fi_strerror(ret));
+			       "Successfully found providers at libfabric %u.%u API",
+			       FI_MAJOR(required_api),
+			       FI_MINOR(required_api));
+	} else if (ret == -FI_ENODATA) {
+		NCCL_OFI_WARN("OFI fi_getinfo() @ libfabric %u.%u api failed to resolve any providers",
+			      FI_MAJOR(required_api),
+			      FI_MINOR(required_api));
+		assert(provider_list == NULL);
 		goto error;
-	}
-	if (provider_list == NULL) {
+	} else {
+		NCCL_OFI_WARN("OFI fi_getinfo() @ libfabric %u.%u api failed unexpectedly: %s",
+			      FI_MAJOR(required_api),
+			      FI_MINOR(required_api),
+			      fi_strerror(ret));
 		ret = -FI_ENODATA;
+		assert(provider_list == NULL);
 		goto error;
 	}
diff --git a/src/platform-aws.c b/src/platform-aws.c
index 787305fdf..c35c2a0a7 100644
--- a/src/platform-aws.c
+++ b/src/platform-aws.c
@@ -26,6 +26,10 @@
 #include "nccl_ofi_pthread.h"
 #include "nccl_ofi_system.h"
 
+#if HAVE_CUDA
+#include "nccl_ofi_cuda.h"
+#endif
+
 struct ec2_platform_data {
 	const char* name;
 	const char* topology;
@@ -582,16 +586,6 @@ int platform_config_endpoint(struct fi_info *info, struct fid_ep* endpoint) {
 		goto exit;
 	}
 
-	if (ofi_nccl_disable_gdr_required_check() == 0) {
-		/* Ensure GDR is enabled on GDR-supported instances */
-		struct ec2_platform_data *platform_data = get_platform_data();
-		if (platform_data && platform_data->gdr_required && support_gdr != GDR_SUPPORTED) {
-			NCCL_OFI_WARN("GDR disabled on GDR-supported instance type %s", platform_data->name);
-			ret = -EINVAL;
-			goto exit;
-		}
-	}
-
 	/* If the selected communication protocol is RDMA write and the user did
 	 * not disable the native RDMA support check, validate that the
 	 * FI_OPT_EFA_EMULATED_WRITE endpoint option can be accessed, and that
@@ -611,6 +605,16 @@ int platform_config_endpoint(struct fi_info *info, struct fid_ep* endpoint) {
 	static bool need_ordering = false;
 	static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
 
+	/* Ensure GDR support via cuda attr on GDR-supported instances */
+	if (ofi_nccl_disable_gdr_required_check() == 0) {
+		struct ec2_platform_data *platform_data = get_platform_data();
+		if (platform_data && platform_data->gdr_required && !nccl_net_ofi_cuda_have_gdr_support_attr()) {
+			NCCL_OFI_WARN("GDR disabled on GDR-supported instance type %s", platform_data->name);
+			ret = -EINVAL;
+			goto exit;
+		}
+	}
+
 	/* During initialization, try to set
 	 * FI_OPT_EFA_{SENDRECV,WRTIE}_IN_ORDER_ALIGNED_128_BYTES to
 	 * true to see if the LL/LL128 protocol is supported. After
diff --git a/tests/functional/nccl_connection.c b/tests/functional/nccl_connection.c
index 3874ec975..8b2762b51 100644
--- a/tests/functional/nccl_connection.c
+++ b/tests/functional/nccl_connection.c
@@ -28,9 +28,6 @@ int main(int argc, char* argv[])
 
 	ofi_log_function = logger;
 
-	/* Indicates if NICs support GPUDirect */
-	int *test_support_gdr = NULL;
-
 	MPI_Init(&argc, &argv);
 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 	MPI_Comm_size(MPI_COMM_WORLD, &size);
@@ -67,21 +64,11 @@ int main(int argc, char* argv[])
 	OFINCCLCHECKGOTO(extNet->devices(&ndev), res, exit);
 	NCCL_OFI_INFO(NCCL_INIT, "Received %d network devices", ndev);
 
-	test_support_gdr = (int *)malloc(sizeof(int) * ndev);
-	if (test_support_gdr == NULL) {
-		NCCL_OFI_WARN("Failed to allocate memory");
-		res = ncclInternalError;
-		goto exit;
-	}
-
 	/* Get Properties for the device */
 	for (int dev = 0; dev < ndev; dev++) {
 		test_nccl_properties_t props = {};
 		OFINCCLCHECKGOTO(extNet->getProperties(dev, &props), res, exit);
 		print_dev_props(dev, &props);
-
-		/* Set CUDA support */
-		test_support_gdr[dev] = is_gdr_supported_nic(props.ptrSupport);
 	}
 
 	/* Test all devices */
@@ -95,11 +82,6 @@ int main(int argc, char* argv[])
 
 		NCCL_OFI_TRACE(NCCL_INIT, "Rank %d uses %d device for communication", rank, dev);
 
-		if (test_support_gdr[dev] == 1) {
-			NCCL_OFI_INFO(NCCL_INIT | NCCL_NET,
-				      "Network supports communication using CUDA buffers. Dev: %d", dev);
-		}
-
 		/* Listen API */
 		NCCL_OFI_INFO(NCCL_INIT, "Server: Listening on dev %d", dev);
 		OFINCCLCHECKGOTO(extNet->listen(dev, (void *)&handle, (void **)&lComm), res, exit);
@@ -173,10 +155,5 @@ int main(int argc, char* argv[])
 	NCCL_OFI_INFO(NCCL_NET, "Test completed successfully for rank %d", rank);
 
 exit:
-	if (test_support_gdr) {
-		free(test_support_gdr);
-		test_support_gdr = NULL;
-	}
-
 	return res;
 }
diff --git a/tests/functional/nccl_message_transfer.c b/tests/functional/nccl_message_transfer.c
index 3b2421ba0..521467e66 100644
--- a/tests/functional/nccl_message_transfer.c
+++ b/tests/functional/nccl_message_transfer.c
@@ -17,7 +17,7 @@ int main(int argc, char* argv[])
 {
 	ncclResult_t res = ncclSuccess;
 	int rank, proc_name_len, num_ranks = 0, local_rank = 0, peer_rank = 0;
-	int buffer_type = NCCL_PTR_HOST;
+	int buffer_type = NCCL_PTR_CUDA;
 	test_nccl_properties_t props = {};
 
 	/* Plugin defines */
@@ -42,9 +42,6 @@ int main(int argc, char* argv[])
 	char *expected_buf = NULL;
 	int done, received_size;
 
-	/* Indicates if NICs support GPUDirect */
-	int *test_support_gdr = NULL;
-
 	/* All processors IDs, used to find out the local rank */
 	char *all_proc_name = NULL;
 
@@ -102,7 +99,7 @@ int main(int argc, char* argv[])
 		}
 	}
 
-	/* Set CUDA device for subsequent device memory allocation, in case GDR is used */
+	/* Set CUDA device for subsequent device memory allocation */
 	NCCL_OFI_TRACE(NCCL_NET, "Using CUDA device %d for memory allocation", local_rank);
 
 	/* Get external Network from NCCL-OFI library */
@@ -121,20 +118,10 @@ int main(int argc, char* argv[])
 	OFINCCLCHECKGOTO(extNet->devices(&ndev), res, exit);
 	NCCL_OFI_INFO(NCCL_NET, "Received %d network devices", ndev);
 
-	test_support_gdr = (int *)malloc(sizeof(int) * ndev);
-	if (test_support_gdr == NULL) {
-		NCCL_OFI_WARN("Failed to allocate memory");
-		res = ncclInternalError;
-		goto exit;
-	}
-
 	/* Get Properties for the device */
 	for (int dev = 0; dev < ndev; dev++) {
 		OFINCCLCHECKGOTO(extNet->getProperties(dev, &props), res, exit);
 		print_dev_props(dev, &props);
-
-		/* Set CUDA support */
-		test_support_gdr[dev] = is_gdr_supported_nic(props.ptrSupport);
 	}
 
 	/* Test all devices */
@@ -147,12 +134,7 @@ int main(int argc, char* argv[])
 		}
 
 		NCCL_OFI_TRACE(NCCL_INIT, "Rank %d uses %d device for communication", rank, dev);
-
-		if (test_support_gdr[dev] == 1) {
-			NCCL_OFI_INFO(NCCL_INIT | NCCL_NET,
-				      "Network supports communication using CUDA buffers. Dev: %d", dev);
-			buffer_type = NCCL_PTR_CUDA;
-		}
+		buffer_type = NCCL_PTR_CUDA;
 
 		/* Listen API */
 		NCCL_OFI_INFO(NCCL_NET, "Server: Listening on dev %d", dev);
@@ -310,7 +292,7 @@ int main(int argc, char* argv[])
 			goto exit;
 		}
 
-		if ((rank == 1) && (buffer_type == NCCL_PTR_CUDA)) {
+		if (rank == 1) {
 			NCCL_OFI_TRACE(NCCL_NET,
 				       "Issue flush for data consistency. Request idx: %d",
 				       idx);
@@ -337,14 +319,12 @@ int main(int argc, char* argv[])
 					extNet->deregMr((void *)sComm, mhandle[idx]),
 					res, exit);
 			} else {
-				if ((buffer_type == NCCL_PTR_CUDA) && !ofi_nccl_gdr_flush_disable()) {
-					/* Data validation may fail if flush operations are disabled */
-				} else {
-					OFINCCLCHECKGOTO(
-						validate_data(recv_buf[idx], expected_buf,
-							      send_sizes[szidx], buffer_type),
-						res, exit);
-				}
+				OFINCCLCHECKGOTO(validate_data(recv_buf[idx],
+							       expected_buf,
+							       send_sizes[szidx],
+							       buffer_type),
+						 res,
+						 exit);
 				OFINCCLCHECKGOTO(
 					extNet->deregMr((void *)rComm, mhandle[idx]),
 					res, exit);
@@ -423,11 +403,6 @@ exit:;
 		expected_buf = NULL;
 	}
 
-	if (test_support_gdr) {
-		free(test_support_gdr);
-		test_support_gdr = NULL;
-	}
-
 	if (all_proc_name) {
 		free(all_proc_name);
 		all_proc_name = NULL;
diff --git a/tests/functional/ring.c b/tests/functional/ring.c
index 77aac75a6..56eb3fdbc 100644
--- a/tests/functional/ring.c
+++ b/tests/functional/ring.c
@@ -12,7 +12,7 @@ int main(int argc, char *argv[])
 {
 	ncclResult_t res = ncclSuccess;
 	int rank, size, next, prev, proc_name_len, local_rank = 0;
-	int buffer_type = NCCL_PTR_HOST;
+	int buffer_type = NCCL_PTR_CUDA;
 
 	/* Plugin defines */
 	int ndev;
@@ -41,9 +41,6 @@ int main(int argc, char *argv[])
 	char *expected_buf = NULL;
 	int done, received_size;
 
-	/* Indicates if NICs support GPUDirect */
-	int *test_support_gdr = NULL;
-
 	/* All processors IDs, used to find out the local rank */
 	char *all_proc_name = NULL;
 
@@ -96,7 +93,7 @@ int main(int argc, char *argv[])
 		}
 	}
 
-	/* Set CUDA device for subsequent device memory allocation, in case GDR is used */
+	/* Set CUDA device for subsequent device memory allocation */
 	NCCL_OFI_TRACE(NCCL_NET, "Using CUDA device %d for memory allocation", local_rank);
 
 	/* Allocate and populate expected buffer */
@@ -127,22 +124,11 @@ int main(int argc, char *argv[])
 	OFINCCLCHECKGOTO(extNet->devices(&ndev), res, exit);
 	NCCL_OFI_INFO(NCCL_NET, "Received %d network devices", ndev);
 
-	/* Indicates if NICs support GPUDirect */
-	test_support_gdr = (int *)malloc(sizeof(int) * ndev);
-	if (test_support_gdr == NULL) {
-		NCCL_OFI_WARN("Failed to allocate memory");
-		res = ncclInternalError;
-		goto exit;
-	}
-
 	/* Get Properties for the device */
 	for (int dev = 0; dev < ndev; dev++) {
 		test_nccl_properties_t props = {};
 		OFINCCLCHECKGOTO(extNet->getProperties(dev, &props), res, exit);
 		print_dev_props(dev, &props);
-
-		/* Set CUDA support */
-		test_support_gdr[dev] = is_gdr_supported_nic(props.ptrSupport);
 	}
 
 	/* Test all devices */
@@ -152,12 +138,6 @@ int main(int argc, char *argv[])
 
 		NCCL_OFI_TRACE(NCCL_INIT, "Rank %d uses %d device for communication", rank, dev);
 
-		if (test_support_gdr[dev] == 1) {
-			NCCL_OFI_INFO(NCCL_INIT | NCCL_NET,
-				      "Network supports communication using CUDA buffers. Dev: %d", dev);
-			buffer_type = NCCL_PTR_CUDA;
-		}
-
 		/* Listen API */
 		NCCL_OFI_INFO(NCCL_NET, "Server: Listening on device %d", dev);
 		OFINCCLCHECKGOTO(extNet->listen(dev, (void *)&handle, (void **)&lComm), res, exit);
@@ -251,28 +231,28 @@ int main(int argc, char *argv[])
 			if (done) {
 				inflight_reqs--;
 				req_completed_recv[idx] = 1;
-
-				/* Invoke flush operations unless user has explicitly disabled it */
-				if (buffer_type == NCCL_PTR_CUDA) {
-					NCCL_OFI_TRACE(NCCL_NET,
-						       "Issue flush for data consistency. Request idx: %d",
-						       idx);
-					nccl_net_ofi_req_t *iflush_req = NULL;
-					OFINCCLCHECKGOTO(extNet->iflush((void *)rComm, nrecv,
-							 (void **)&recv_buf[idx], sizes,
-							 &recv_mhandle[idx], (void **)&iflush_req), res, exit);
-					done = 0;
-					if (iflush_req) {
-						while (!done) {
-							OFINCCLCHECKGOTO(extNet->test((void *)iflush_req, &done, NULL), res, exit);
-						}
+				NCCL_OFI_TRACE(NCCL_NET, "Issue flush for data consistency. Request idx: %d", idx);
+				nccl_net_ofi_req_t *iflush_req = NULL;
+				OFINCCLCHECKGOTO(extNet->iflush((void *)rComm,
+								nrecv,
+								(void **)&recv_buf[idx],
+								sizes,
+								&recv_mhandle[idx],
+								(void **)&iflush_req),
+						 res,
+						 exit);
+				done = 0;
+				if (iflush_req) {
+					while (!done) {
+						OFINCCLCHECKGOTO(extNet->test((void *)iflush_req, &done, NULL),
+								 res,
+								 exit);
 					}
 				}
 
-				if ((buffer_type == NCCL_PTR_CUDA) && !ofi_nccl_gdr_flush_disable()) {
-					/* Data validation may fail if flush operations are disabled */
-				} else
-					OFINCCLCHECKGOTO(validate_data(recv_buf[idx], expected_buf, SEND_SIZE, buffer_type), res, exit);
+				OFINCCLCHECKGOTO(validate_data(recv_buf[idx], expected_buf, SEND_SIZE, buffer_type),
+						 res,
+						 exit);
 
 				/* Deregister memory handle */
 				OFINCCLCHECKGOTO(extNet->deregMr((void *)rComm, recv_mhandle[idx]), res, exit);
@@ -340,11 +320,6 @@ exit:;
 		expected_buf = NULL;
 	}
 
-	if (test_support_gdr) {
-		free(test_support_gdr);
-		test_support_gdr = NULL;
-	}
-
 	if (all_proc_name) {
 		free(all_proc_name);
 		all_proc_name = NULL;
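
Note for reviewers (not part of the applied diff): after this change, the
properties reported to NCCL are a pure function of the fi_getinfo response
plus CUDA device attributes; no temporary endpoint is created and torn down
to probe them. The following minimal, self-contained C sketch restates that
derivation. It assumes only the libfabric headers; the two boolean
parameters are hypothetical stand-ins for
nccl_net_ofi_cuda_have_gdr_support_attr() and nccl_ofi_dmabuf_viable() from
the tree, and the HAVE_NEURON branch of the real code is omitted for
brevity.

/* sketch.c: property derivation without a probe endpoint. Mirrors the
 * logic added to nccl_net_ofi_info_properties() in src/nccl_ofi_net.c.
 */
#include <stdbool.h>
#include <stdint.h>
#include <rdma/fabric.h>

struct props_sketch {
	bool hmem_support;
	bool dmabuf_support;
};

static struct props_sketch derive_props(const struct fi_info *nic_prov,
					bool cuda_gdr_attr,  /* stand-in: nccl_net_ofi_cuda_have_gdr_support_attr() */
					bool dmabuf_viable)  /* stand-in: nccl_ofi_dmabuf_viable() */
{
	struct props_sketch p;
	bool have_hmem = (nic_prov->caps & FI_HMEM) != 0;
	uint32_t api = nic_prov->fabric_attr->api_version;

	/* GDR is optimistically assumed whenever the provider offers FI_HMEM
	 * at the 1.18+ API and CUDA reports
	 * CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED; no fi_setopt() probe
	 * against a live endpoint is performed. */
	p.hmem_support = have_hmem && FI_VERSION_GE(api, FI_VERSION(1, 18)) && cuda_gdr_attr;

	/* DMA-BUF additionally requires the 1.20 API and a viable dmabuf path. */
	p.dmabuf_support = have_hmem && FI_VERSION_GE(api, FI_VERSION(1, 20)) && dmabuf_viable;

	return p;
}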