diff --git a/include/nccl_ofi.h b/include/nccl_ofi.h index 98ea8b1a2..456add197 100644 --- a/include/nccl_ofi.h +++ b/include/nccl_ofi.h @@ -81,15 +81,11 @@ extern "C" { /* Initial number of entries in the MR cache of a device */ #define NCCL_OFI_MR_CACHE_INIT_SIZE 128 -/* Indicates if GPUDirect is supported by libfabric provider */ -enum gdr_support_level_t {GDR_UNKNOWN, GDR_SUPPORTED, GDR_UNSUPPORTED}; -extern enum gdr_support_level_t support_gdr; - - /* Indicates if the cudaDeviceFlushGPUDirectRDMAWrites function should be used * to flush data to the GPU. Note, CUDA flush support is not supported on all * platforms and should be disabled by default */ extern bool cuda_flush; +extern bool gdr_flush_disabled; /* number of duplicate providers to create for each discovered * provider, including renaming to cause NCCL to create additional diff --git a/include/nccl_ofi_cuda.h b/include/nccl_ofi_cuda.h index 4523f0614..0ecc23c6e 100644 --- a/include/nccl_ofi_cuda.h +++ b/include/nccl_ofi_cuda.h @@ -66,6 +66,23 @@ bool nccl_net_ofi_cuda_have_dma_buf_attr(void); */ bool nccl_net_ofi_cuda_have_gdr_support_attr(void); +/* + * @brief query CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_FLUSH_WRITES_OPTIONS + + * @return true if attr is fetched successfully and true. + * false otherwise + */ +bool nccl_net_ofi_cuda_have_gdr_flush_support_attr(void); + +/* + * @brief test whether gdrcopy can possibly be supported, depending on the + * linked libfabric version and the properties exposed by cuda. + * + * @return true if attr is fetched successfully and true. + * false otherwise + */ +bool nccl_net_ofi_cuda_gdr_viable(void); + #ifdef __cplusplus } // End extern "C" #endif diff --git a/include/nccl_ofi_param.h b/include/nccl_ofi_param.h index fe02afdf8..cea2695f0 100644 --- a/include/nccl_ofi_param.h +++ b/include/nccl_ofi_param.h @@ -281,6 +281,9 @@ OFI_NCCL_PARAM_INT(disable_gdr_required_check, "DISABLE_GDR_REQUIRED_CHECK", 0); */ OFI_NCCL_PARAM_INT(disable_dmabuf, "DISABLE_DMABUF", 0); +/* Largely exists for parity with DISABLE_DMABUF, but usage of this is discouraged. */ +OFI_NCCL_PARAM_INT(disable_gdrcopy, "DISABLE_GDRCOPY", 0); + /* * Messages sized larger than this threshold will be striped across multiple rails */ diff --git a/m4/check_pkg_cuda.m4 b/m4/check_pkg_cuda.m4 index 8aade58fc..42cf4b5f2 100644 --- a/m4/check_pkg_cuda.m4 +++ b/m4/check_pkg_cuda.m4 @@ -53,15 +53,15 @@ AC_DEFUN([CHECK_PKG_CUDA], [ [check_pkg_found=no], [-ldl -lrt])]) - check_cuda_gdr_flush_define=0 + check_cuda_gdr_define=0 AS_IF([test "${check_pkg_found}" = "yes"], [ - AC_MSG_CHECKING([if CUDA 11.3+ is available for GDR Write Flush support]) + AC_MSG_CHECKING([if CUDA 11.3+ is available for GDR + GDR Write Flush support]) AC_COMPILE_IFELSE([AC_LANG_PROGRAM([ #include _Static_assert(CUDA_VERSION >= 11030, "cudart>=11030 required for cuFlushGPUDirectRDMAWrites"); - ])],[ check_cuda_gdr_flush_define=1 chk_result=yes ], - [ check_cuda_gdr_flush_define=0 chk_result=no ]) + ])],[ check_cuda_gdr_define=1 chk_result=yes ], + [ check_cuda_gdr_define=0 chk_result=no ]) AC_MSG_RESULT(${chk_result}) ]) @@ -85,7 +85,7 @@ AC_DEFUN([CHECK_PKG_CUDA], [ AC_DEFINE_UNQUOTED([HAVE_CUDA], [${check_pkg_define}], [Defined to 1 if CUDA is available]) AC_DEFINE_UNQUOTED([HAVE_CUDA_DMABUF_SUPPORT], [${check_cuda_dmabuf_define}], [Defined to 1 if CUDA DMA-BUF support is available]) - AC_DEFINE_UNQUOTED([HAVE_CUDA_GDRFLUSH_SUPPORT], [${check_cuda_gdr_flush_define}], [Defined to 1 if CUDA cuFlushGPUDirectRDMAWrites support is available]) + AC_DEFINE_UNQUOTED([HAVE_CUDA_GDR_SUPPORT], [${check_cuda_gdr_define}], [Defined to 1 if CUDA cuFlushGPUDirectRDMAWrites support is available]) AM_CONDITIONAL([HAVE_CUDA], [test "${check_pkg_found}" = "yes"]) AC_SUBST([CUDA_LDFLAGS]) diff --git a/src/nccl_ofi_cuda.c b/src/nccl_ofi_cuda.c index 17c0f9009..6d914ce7d 100644 --- a/src/nccl_ofi_cuda.c +++ b/src/nccl_ofi_cuda.c @@ -69,19 +69,23 @@ int nccl_net_ofi_cuda_init(void) RESOLVE_CUDA_FUNCTION(cuCtxGetDevice); RESOLVE_CUDA_FUNCTION(cuDeviceGetAttribute); - if (HAVE_CUDA_GDRFLUSH_SUPPORT && nccl_net_ofi_cuda_have_gdr_support_attr() && ofi_nccl_cuda_flush_enable()) { - NCCL_OFI_WARN("CUDA flush enabled"); - cuda_flush = true; - } else { + cuda_flush = ofi_nccl_cuda_flush_enable(); + gdr_flush_disabled = ofi_nccl_gdr_flush_disable(); + +#if HAVE_CUDA_GDR_SUPPORT + if (!(nccl_net_ofi_cuda_gdr_viable() && + nccl_net_ofi_cuda_have_gdr_flush_support_attr())) { + gdr_flush_disabled = true; cuda_flush = false; } +#endif return 0; } int nccl_net_ofi_cuda_flush_gpudirect_rdma_writes(void) { -#if HAVE_CUDA_GDRFLUSH_SUPPORT +#if HAVE_CUDA_GDR_SUPPORT static_assert(CUDA_VERSION >= 11030, "Requires cudart>=11.3"); cudaError_t ret = cudaDeviceFlushGPUDirectRDMAWrites(cudaFlushGPUDirectRDMAWritesTargetCurrentDevice, cudaFlushGPUDirectRDMAWritesToOwner); @@ -129,9 +133,9 @@ int nccl_net_ofi_get_cuda_device_for_addr(void *data, int *dev_id) }; } -bool nccl_net_ofi_cuda_have_gdr_support_attr(void) +bool nccl_net_ofi_cuda_have_gdr_flush_support_attr(void) { -#if HAVE_CUDA_GDRFLUSH_SUPPORT +#if HAVE_CUDA_GDR_SUPPORT if (pfn_cuCtxGetDevice == NULL || pfn_cuDeviceGetAttribute == NULL) { return false; } @@ -143,13 +147,29 @@ bool nccl_net_ofi_cuda_have_gdr_support_attr(void) } int supported; - result = pfn_cuDeviceGetAttribute(&supported, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED, dev); - if (result != CUDA_SUCCESS || !((bool)supported)) { + result = pfn_cuDeviceGetAttribute(&supported, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_FLUSH_WRITES_OPTIONS, dev); + return result == CUDA_SUCCESS && ((supported & CU_FLUSH_GPU_DIRECT_RDMA_WRITES_OPTION_HOST) != 0); +#else + return false; +#endif +} + +bool nccl_net_ofi_cuda_have_gdr_support_attr(void) +{ +#if HAVE_CUDA_GDR_SUPPORT + if (pfn_cuCtxGetDevice == NULL || pfn_cuDeviceGetAttribute == NULL) { return false; } - result = pfn_cuDeviceGetAttribute(&supported, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_FLUSH_WRITES_OPTIONS, dev); - return result == CUDA_SUCCESS && ((supported & CU_FLUSH_GPU_DIRECT_RDMA_WRITES_OPTION_HOST) != 0); + CUdevice dev; + CUresult result = pfn_cuCtxGetDevice(&dev); + if (result != CUDA_SUCCESS) { + return false; + } + + int supported; + result = pfn_cuDeviceGetAttribute(&supported, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED, dev); + return result == CUDA_SUCCESS && (bool)supported; #else return false; #endif @@ -179,3 +199,27 @@ bool nccl_net_ofi_cuda_have_dma_buf_attr(void) return false; #endif } + +bool nccl_net_ofi_cuda_gdr_viable(void) +{ + /* Disable GDR if building against too-old libfabric. */ + if (FI_VERSION_LT(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), FI_VERSION(1, 18))) { + NCCL_OFI_TRACE(NCCL_INIT | NCCL_NET, "Will not use GDR, requires Libfabric 1.18 or greater."); + return false; + } + + /* Disable GDR if explicitly disabled by user. */ + if (ofi_nccl_disable_gdrcopy()) { + NCCL_OFI_TRACE(NCCL_INIT | NCCL_NET, "Will not attempt to use GDRCopy, explicitly disabled by user."); + return false; + } + + /* Disable GDR if CUDA does not report GDR support in device attributes. */ + if (!nccl_net_ofi_cuda_have_gdr_support_attr()) { + NCCL_OFI_TRACE(NCCL_INIT | NCCL_NET, + "Will not attempt to use GDRCopy, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED was false."); + return false; + } + + return true; +} diff --git a/src/nccl_ofi_net.c b/src/nccl_ofi_net.c index ac285d47c..f0b199080 100644 --- a/src/nccl_ofi_net.c +++ b/src/nccl_ofi_net.c @@ -30,13 +30,11 @@ #include "nccl_ofi_ofiutils.h" #include "nccl_ofi_system.h" -/* Indicates if GPUDirect is supported by libfabric provider */ -enum gdr_support_level_t support_gdr = GDR_UNKNOWN; - /* Indicates if the cudaDeviceFlushGPUDirectRDMAWrites function should be used * to flush data to the GPU. Note, CUDA flush support is not supported on all * platforms and should be disabled by default */ bool cuda_flush = false; +bool gdr_flush_disabled = true; /* number of duplicate providers to create for each discovered * provider, including renaming to cause NCCL to create additional @@ -136,9 +134,6 @@ int nccl_net_ofi_create_plugin(nccl_net_ofi_plugin_t **plugin_p) int ret = 0; const char *provider_filter = NULL; nccl_net_ofi_plugin_t *plugin; - nccl_net_ofi_ep_t *base_ep = NULL; - nccl_net_ofi_device_t *device = NULL; - nccl_ofi_properties_t properties; NCCL_OFI_INFO(NCCL_INIT | NCCL_NET, "Initializing " PACKAGE_STRING); @@ -162,6 +157,11 @@ int nccl_net_ofi_create_plugin(nccl_net_ofi_plugin_t **plugin_p) */ mr_cache_alignment = NCCL_OFI_MIN(system_page_size, NCCL_OFI_CACHE_PAGE_SIZE); + /* configuration parameters */ + nic_dup_conns = ofi_nccl_nic_dup_conns(); + net_latency = (float)ofi_nccl_net_latency(); + cq_read_count = ofi_nccl_cq_read_count(); + #if HAVE_CUDA ret = nccl_net_ofi_cuda_init(); if (ret != 0) { @@ -170,17 +170,22 @@ int nccl_net_ofi_create_plugin(nccl_net_ofi_plugin_t **plugin_p) } #endif - /* configuration parameters */ - nic_dup_conns = ofi_nccl_nic_dup_conns(); - net_latency = (float)ofi_nccl_net_latency(); - cq_read_count = ofi_nccl_cq_read_count(); - if (platform_init) { ret = platform_init(&provider_filter); if (ret != 0) goto exit; } +#if HAVE_CUDA + if (nic_dup_conns > 0 && nccl_net_ofi_cuda_have_gdr_support_attr()) { + NCCL_OFI_WARN( + "NCCL_OFI_NIC_DUP_CONNS set on platform that supports GPUDirect RDMA. This configuration is not " + "supported."); + ret = -ENOTSUP; + goto exit; + } +#endif + /* This is ugly, but here's the basic protocol selection * logic: * 1. if the user set NCCL_OFI_PROTOCOL, use that. @@ -285,55 +290,6 @@ int nccl_net_ofi_create_plugin(nccl_net_ofi_plugin_t **plugin_p) goto exit; } - /* In order to set endpoint options and potentially NCCL configuration - * options (such as NCCL_PROTO) during the plugin initialization - * process, we need to create an endpoint and call the platform hook - * "platform_config_endpoint" using "get_ep". This code makes the - * assumption that the thread calling "nccl_net_ofi_init" will make - * communication calls. As well, since without this code the endpoint - * would be created the first time "get_ep" in called during a listen or - * connect call, creating the endpoint earlier would not be a waste of - * resources. This initialization happens once per process, and thus it - * does not matter which device is used to create the endpoint. - */ - device = plugin->get_device(plugin, 0); - - ret = device->get_ep(device, &base_ep); - if (ret != 0) { - goto exit; - } - ret = device->get_properties(device, &properties); - if (ret != 0) { - goto exit; - } - NCCL_OFI_INFO(NCCL_NET | NCCL_INIT, "Support for global registrations: %s", - (properties.regIsGlobal == 0) ? "false" : "true"); - NCCL_OFI_INFO(NCCL_NET | NCCL_INIT, "Support for DMA-BUF registrations: %s", - (properties.dmabuf_support == 0) ? "false" : "true"); - /* Cause release to not actually free the resources, to speed - * up initialization, since the very same resources will be - * recreated by NCCL soon after initialization to do real - * communication. - */ - base_ep->ref_cnt++; - ret = base_ep->release_ep(base_ep); - base_ep->ref_cnt--; - if (ret != 0) { - goto exit; - } - - assert(support_gdr != GDR_UNKNOWN); - - /* we don't actually know if GDR is supported until we've - * created the first endpoint, so this check needs to be way - * down here - */ - if (nic_dup_conns > 0 && support_gdr != GDR_UNSUPPORTED) { - NCCL_OFI_WARN("NCCL_OFI_NIC_DUP_CONNS set on platform that supports GPUDirect RDMA. This configuration is not supported."); - ret = -ENOTSUP; - goto exit; - } - *plugin_p = plugin; exit: @@ -416,12 +372,7 @@ static int set_nic_props_default(int dev_id, struct fi_info *nic_prov, */ props->max_group_receives = NCCL_OFI_MAX_RECVS; - if (support_gdr == GDR_SUPPORTED) { - props->hmem_support = true; - } else { - props->hmem_support = false; - } - + props->hmem_support = false; props->dmabuf_support = false; /* Should be successful for ptrSupport invocation */ @@ -580,14 +531,19 @@ int nccl_net_ofi_info_properties(nccl_net_ofi_plugin_t *plugin, struct fi_info * props->max_mr_key_size = nic_prov->domain_attr->mr_key_size; + props->hmem_support = ((nic_prov->caps & FI_HMEM) != 0) && + FI_VERSION_GE(nic_prov->fabric_attr->api_version, FI_VERSION(1, 18)) && + (HAVE_NEURON || nccl_net_ofi_cuda_have_gdr_support_attr()); props->dmabuf_support = ((nic_prov->caps & FI_HMEM) != 0) && FI_VERSION_GE(nic_prov->fabric_attr->api_version, FI_VERSION(1, 20)) && nccl_ofi_dmabuf_viable() ; - if (props->dmabuf_support) { - NCCL_OFI_TRACE(NCCL_INIT | NCCL_NET, "DMA-BUF support is advertised in properties."); - } + + NCCL_OFI_TRACE(NCCL_INIT | NCCL_NET, + "NCCL properties: dmabuf=%s hmem=%s", + props->dmabuf_support ? "yes" : "no", + props->hmem_support ? "yes" : "no"); goto exit; error: diff --git a/src/nccl_ofi_ofiutils.c b/src/nccl_ofi_ofiutils.c index a7dafab8b..a555d6966 100644 --- a/src/nccl_ofi_ofiutils.c +++ b/src/nccl_ofi_ofiutils.c @@ -4,22 +4,23 @@ */ #include "config.h" - +#include +#include #include #include #include #include -#include -#include #include -#include +#include #include "nccl_ofi.h" -#include "nccl_ofi_param.h" -#include "nccl_ofi_tracepoint.h" + +#include "nccl_ofi_cuda.h" #include "nccl_ofi_math.h" +#include "nccl_ofi_param.h" #include "nccl_ofi_ofiutils.h" #include "nccl_ofi_platform.h" +#include "nccl_ofi_tracepoint.h" #define EFA_PROVIDER_NAME "efa" #define IS_EFA_PROVIDER(NAME) (strcmp((NAME), EFA_PROVIDER_NAME)==0) @@ -364,53 +365,20 @@ int nccl_ofi_ofiutils_init_connection(struct fi_info *info, struct fid_domain *d * disabling CUDA in old Libfabric, just require newer * Libfabric. */ - if (FI_VERSION_GE(info->fabric_attr->api_version, - FI_VERSION(1, 18)) && support_gdr != GDR_UNSUPPORTED) { #if (HAVE_CUDA && HAVE_DECL_FI_OPT_CUDA_API_PERMITTED) + if (FI_VERSION_GE(info->fabric_attr->api_version, FI_VERSION(1, 18)) && nccl_net_ofi_cuda_have_gdr_support_attr()) { bool optval = false; ret = fi_setopt(&(*ep)->fid, FI_OPT_ENDPOINT, FI_OPT_CUDA_API_PERMITTED, &optval, sizeof(optval)); - if (ret == -FI_EOPNOTSUPP || ret == -FI_ENOPROTOOPT) { - if (support_gdr == GDR_SUPPORTED) { - /* If we got here, that means we previously said - * we definitely had GDR support, but now don't. - * Since we may have already told NCCL that we - * support GDR, we should just abort. - */ - NCCL_OFI_WARN("GDR support reported to NCCL but then couldn't be configured on an endpoint. Cannot continue."); - goto error; - } else { - NCCL_OFI_INFO(NCCL_INIT | NCCL_NET, "Could not disable CUDA API usage for HMEM, disabling GDR"); - /* If we can't disable CUDA, then we don't really - * have GDR, so disable GDR support from the NCCL - * point of view. - */ - support_gdr = GDR_UNSUPPORTED; - } - } else if (ret == 0) { - NCCL_OFI_TRACE(NCCL_INIT | NCCL_NET, "Set endpoint option FI_OPT_CUDA_API_PERMITTED. GDR Supported"); - /* we were able to disable CUDA, so we can do GDR */ - support_gdr = GDR_SUPPORTED; - } else { + if (ret != 0) { NCCL_OFI_WARN("Failed to set FI_OPT_CUDA_API_PERMITTED. RC: %d, ERROR: %s", ret, fi_strerror(-ret)); goto error; } -#elif HAVE_NEURON - /* - * Provider discovery for Neuron will have been successful only - * if HMEM capabilities were guaranteed by the libfabric - * provider. Unlike CUDA, we do not need to handle the - * runtime/endpoint deadlock with fi_setopt(), so move the flag - * to supported. - */ - support_gdr = GDR_SUPPORTED; -#else - NCCL_OFI_WARN("Using Libfabric 1.18 API with GPUDirect RDMA support, and FI_OPT_CUDA_API_PERMITTED is not declared."); - goto error; -#endif } +#endif + /* Run platform-specific endpoint configuration hook if declared */ if (platform_config_endpoint) { ret = platform_config_endpoint(info, *ep); diff --git a/src/nccl_ofi_rdma.c b/src/nccl_ofi_rdma.c index 65466806a..53dd1f14c 100644 --- a/src/nccl_ofi_rdma.c +++ b/src/nccl_ofi_rdma.c @@ -3620,7 +3620,8 @@ static int recv(nccl_net_ofi_recv_comm_t *recv_comm, int n, void **buffers, static inline bool is_flush_buff_enabled(void) { - return !ofi_nccl_gdr_flush_disable() && support_gdr == GDR_SUPPORTED && !cuda_flush; + static __thread const bool gdr_flush_disabled = (bool)ofi_nccl_gdr_flush_disable(); + return !cuda_flush && !gdr_flush_disabled; } /* @@ -4161,6 +4162,7 @@ static int flush(nccl_net_ofi_recv_comm_t *recv_comm, int n, void **buffers, int *sizes, nccl_net_ofi_mr_handle_t **mhandles, nccl_net_ofi_req_t **base_req) { + static __thread const bool gdr_flush_disabled = (bool)ofi_nccl_gdr_flush_disable(); int ret = 0; int flush_n = 0; bool network_busy = false; @@ -4193,8 +4195,9 @@ static int flush(nccl_net_ofi_recv_comm_t *recv_comm, int n, void **buffers, goto error; } - if (ofi_nccl_gdr_flush_disable() || support_gdr == GDR_UNSUPPORTED) + if (gdr_flush_disabled) { goto exit; + } #if HAVE_CUDA if (cuda_flush) { @@ -7719,20 +7722,7 @@ int nccl_net_ofi_rdma_init(const char *provider_filter, ret = nccl_ofi_ofiutils_get_providers(provider_filter, api_version, hints, &provider_list, &num_providers); if (ret == 0) { - NCCL_OFI_TRACE(NCCL_INIT | NCCL_NET, "Using Libfabric %u.%u API, with %s support", - FI_MAJOR(api_version), - FI_MINOR(api_version), - FI_VERSION_GE(api_version, FI_VERSION(1, 20)) ? "DMA-BUF" : "GPUDirect RDMA"); - /* The 1.18 API allows providers to use CUDA to - * support HMEM pointers, so just having HMEM doesn't - * tell us anything about the usability of CUDA - * pointers with NCCL. So leave the state unknown - * until we create an endpoint and try to disable - * CUDA - */ - NCCL_OFI_TRACE(NCCL_INIT | NCCL_NET, - "Using Libfabric 1.18 API, with GPUDirect RDMA support"); - support_gdr = GDR_UNKNOWN; + NCCL_OFI_INFO(NCCL_INIT | NCCL_NET, "Using Libfabric %u.%u API", FI_MAJOR(api_version), FI_MINOR(api_version)); } else { NCCL_OFI_WARN("OFI fi_getinfo() call failed: %s", fi_strerror(ret)); goto error; diff --git a/src/nccl_ofi_sendrecv.c b/src/nccl_ofi_sendrecv.c index b2b5e3357..483944de6 100644 --- a/src/nccl_ofi_sendrecv.c +++ b/src/nccl_ofi_sendrecv.c @@ -1032,7 +1032,7 @@ static int sendrecv_recv_comm_close(nccl_net_ofi_recv_comm_t *recv_comm) goto exit; } - if (!ofi_nccl_gdr_flush_disable() && support_gdr == GDR_SUPPORTED && !cuda_flush) { + if (!ofi_nccl_gdr_flush_disable() && !cuda_flush) { NCCL_OFI_TRACE(NCCL_NET, "De-registering buffer for flush operations"); /* Deregister Flush buffer memory region */ mr_handle = (struct fid_mr *)r_comm->flush_buff.mr_handle; @@ -1078,8 +1078,9 @@ static int sendrecv_recv_comm_flush(nccl_net_ofi_recv_comm_t *recv_comm, int n, int flush_n = -1; struct fid_mr **mr_handles = (struct fid_mr **)mhandles; - if (ofi_nccl_gdr_flush_disable() || support_gdr == GDR_UNSUPPORTED) + if (gdr_flush_disabled) { goto exit; + } #if HAVE_CUDA if (cuda_flush) { @@ -1347,7 +1348,7 @@ static nccl_net_ofi_sendrecv_recv_comm_t *sendrecv_recv_comm_prepare(nccl_net_of * Setup flush resources if using GPUDirect RDMA unless user disables * flush operations */ - if (!ofi_nccl_gdr_flush_disable() && support_gdr == GDR_SUPPORTED && !cuda_flush) { + if (!ofi_nccl_gdr_flush_disable() && !cuda_flush) { r_comm->flush_buff.size = NCCL_OFI_FLUSH_SIZE; ret = sendrecv_recv_comm_alloc_and_reg_flush_buff(domain, ep->ofi_ep, key_pool, &r_comm->flush_buff, dev_id); @@ -2519,6 +2520,32 @@ static int nccl_net_ofi_sendrecv_plugin_create(size_t num_devices, } +static uint32_t sendrecv_get_required_api(void) +{ + const uint32_t lib_api = fi_version(); + if (nccl_ofi_dmabuf_viable()) { + return NCCL_OFI_MIN(lib_api, FI_VERSION(1, 20)); + } +#if HAVE_NEURON + else { + /* XXX: neuron will not request libfabric<1.18, not because I know of any + * specific 1.18 behavior neuron relies on relative to 1.6, but because it's + * what the previous code did. + */ + return NCCL_OFI_MIN(lib_api, FI_VERSION(1, 18)); + } +#elif HAVE_CUDA + else if (nccl_net_ofi_cuda_gdr_viable()) { + /* need at least 1.18 for gdrcopy */ + return NCCL_OFI_MIN(lib_api, FI_VERSION(1, 18)); + } else { + /* 1.6 otherwise. */ + return NCCL_OFI_MIN(lib_api, FI_VERSION(1, 6)); + } +#endif +} + + int nccl_net_ofi_sendrecv_init(const char *provider_filter, nccl_net_ofi_plugin_t **plugin_p) { @@ -2526,75 +2553,37 @@ int nccl_net_ofi_sendrecv_init(const char *provider_filter, struct fi_info *provider_list = NULL; unsigned int num_providers; nccl_net_ofi_sendrecv_plugin_t *plugin = NULL; - struct fi_info *hints; - hints = fi_allocinfo(); + uint32_t required_api = sendrecv_get_required_api(); + NCCL_OFI_TRACE(NCCL_INIT | NCCL_NET, "Selected libfabric %u.%u API", FI_MAJOR(required_api), FI_MINOR(required_api)); + + struct fi_info *hints = fi_allocinfo(); if (hints == NULL) { NCCL_OFI_WARN("Allocation of fi_info failed"); ret = -FI_ENOMEM; goto error; } - - if (nccl_ofi_dmabuf_viable()) { - sendrecv_get_hints(hints, true); - ret = nccl_ofi_ofiutils_get_providers(provider_filter, - FI_VERSION(1, 20), - hints, - &provider_list, - &num_providers); - if (ret == 0) { - NCCL_OFI_TRACE(NCCL_INIT | NCCL_NET, "Using Libfabric 1.20 API, with DMA-BUF support"); - support_gdr = GDR_UNKNOWN; - goto found; - } - } - sendrecv_get_hints(hints, true); - ret = nccl_ofi_ofiutils_get_providers(provider_filter, FI_VERSION(1, 18), hints, - &provider_list, &num_providers); - if (ret == 0) { - /* The 1.18 API allows providers to use CUDA to - * support HMEM pointers, so just having HMEM doesn't - * tell us anything about the usability of CUDA - * pointers with NCCL. So leave the state unknown - * until we create an endpoint and try to disable - * CUDA - */ - NCCL_OFI_TRACE(NCCL_INIT | NCCL_NET, - "Using Libfabric 1.18 API, with GPUDirect RDMA support"); - support_gdr = GDR_UNKNOWN; - goto found; - } - - sendrecv_get_hints(hints, true); - ret = nccl_ofi_ofiutils_get_providers(provider_filter, FI_VERSION(1, 6), hints, - &provider_list, &num_providers); - if (ret == 0) { - NCCL_OFI_TRACE(NCCL_INIT | NCCL_NET, - "Using Libfabric 1.6 API, with GPUDirect RDMA support"); - support_gdr = GDR_SUPPORTED; - goto found; - } - - sendrecv_get_hints(hints, false); - ret = nccl_ofi_ofiutils_get_providers(provider_filter, FI_VERSION(1, 6), hints, - &provider_list, &num_providers); + ret = nccl_ofi_ofiutils_get_providers(provider_filter, required_api, hints, &provider_list, &num_providers); + fi_freeinfo(hints); if (ret == 0) { NCCL_OFI_TRACE(NCCL_INIT | NCCL_NET, - "Using Libfabric 1.6 API, without GPUDirect RDMA support"); - support_gdr = GDR_UNSUPPORTED; - goto found; - } - - ret = -FI_ENODATA; -found: - fi_freeinfo(hints); - if (ret != 0 && ret != -FI_ENODATA) { - NCCL_OFI_WARN("OFI fi_getinfo() call failed: %s", fi_strerror(ret)); + "Successfully found providers at libfabric %u.%u API", + FI_MAJOR(required_api), + FI_MINOR(required_api)); + } else if (ret == -FI_ENODATA) { + NCCL_OFI_WARN("OFI fi_getinfo() @ libfabric %u.%u api failed to resolve any providers", + FI_MAJOR(required_api), + FI_MINOR(required_api)); + assert(provider_list == NULL); goto error; - } - if (provider_list == NULL) { + } else { + NCCL_OFI_WARN("OFI fi_getinfo() @ libfabric %u.%u api failed unexpectedly: %s", + FI_MAJOR(required_api), + FI_MINOR(required_api), + fi_strerror(ret)); ret = -FI_ENODATA; + assert(provider_list == NULL); goto error; } diff --git a/src/platform-aws.c b/src/platform-aws.c index 787305fdf..c35c2a0a7 100644 --- a/src/platform-aws.c +++ b/src/platform-aws.c @@ -26,6 +26,10 @@ #include "nccl_ofi_pthread.h" #include "nccl_ofi_system.h" +#if HAVE_CUDA +#include "nccl_ofi_cuda.h" +#endif + struct ec2_platform_data { const char* name; const char* topology; @@ -582,16 +586,6 @@ int platform_config_endpoint(struct fi_info *info, struct fid_ep* endpoint) { goto exit; } - if (ofi_nccl_disable_gdr_required_check() == 0) { - /* Ensure GDR is enabled on GDR-supported instances */ - struct ec2_platform_data *platform_data = get_platform_data(); - if (platform_data && platform_data->gdr_required && support_gdr != GDR_SUPPORTED) { - NCCL_OFI_WARN("GDR disabled on GDR-supported instance type %s", platform_data->name); - ret = -EINVAL; - goto exit; - } - } - /* If the selected communication protocol is RDMA write and the user did * not disable the native RDMA support check, validate that the * FI_OPT_EFA_EMULATED_WRITE endpoint option can be accessed, and that @@ -611,6 +605,16 @@ int platform_config_endpoint(struct fi_info *info, struct fid_ep* endpoint) { static bool need_ordering = false; static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER; + /* Ensure GDR support via cuda attr on GDR-supported instances */ + if (ofi_nccl_disable_gdr_required_check() == 0) { + struct ec2_platform_data *platform_data = get_platform_data(); + if (platform_data && platform_data->gdr_required && !nccl_net_ofi_cuda_have_gdr_support_attr()) { + NCCL_OFI_WARN("GDR disabled on GDR-supported instance type %s", platform_data->name); + ret = -EINVAL; + goto exit; + } + } + /* During initialization, try to set * FI_OPT_EFA_{SENDRECV,WRTIE}_IN_ORDER_ALIGNED_128_BYTES to * true to see if the LL/LL128 protocol is supported. After diff --git a/tests/functional/nccl_connection.c b/tests/functional/nccl_connection.c index 3874ec975..8b2762b51 100644 --- a/tests/functional/nccl_connection.c +++ b/tests/functional/nccl_connection.c @@ -28,9 +28,6 @@ int main(int argc, char* argv[]) ofi_log_function = logger; - /* Indicates if NICs support GPUDirect */ - int *test_support_gdr = NULL; - MPI_Init(&argc, &argv); MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &size); @@ -67,21 +64,11 @@ int main(int argc, char* argv[]) OFINCCLCHECKGOTO(extNet->devices(&ndev), res, exit); NCCL_OFI_INFO(NCCL_INIT, "Received %d network devices", ndev); - test_support_gdr = (int *)malloc(sizeof(int) * ndev); - if (test_support_gdr == NULL) { - NCCL_OFI_WARN("Failed to allocate memory"); - res = ncclInternalError; - goto exit; - } - /* Get Properties for the device */ for (int dev = 0; dev < ndev; dev++) { test_nccl_properties_t props = {}; OFINCCLCHECKGOTO(extNet->getProperties(dev, &props), res, exit); print_dev_props(dev, &props); - - /* Set CUDA support */ - test_support_gdr[dev] = is_gdr_supported_nic(props.ptrSupport); } /* Test all devices */ @@ -95,11 +82,6 @@ int main(int argc, char* argv[]) NCCL_OFI_TRACE(NCCL_INIT, "Rank %d uses %d device for communication", rank, dev); - if (test_support_gdr[dev] == 1) { - NCCL_OFI_INFO(NCCL_INIT | NCCL_NET, - "Network supports communication using CUDA buffers. Dev: %d", dev); - } - /* Listen API */ NCCL_OFI_INFO(NCCL_INIT, "Server: Listening on dev %d", dev); OFINCCLCHECKGOTO(extNet->listen(dev, (void *)&handle, (void **)&lComm), res, exit); @@ -173,10 +155,5 @@ int main(int argc, char* argv[]) NCCL_OFI_INFO(NCCL_NET, "Test completed successfully for rank %d", rank); exit: - if (test_support_gdr) { - free(test_support_gdr); - test_support_gdr = NULL; - } - return res; } diff --git a/tests/functional/nccl_message_transfer.c b/tests/functional/nccl_message_transfer.c index 3b2421ba0..521467e66 100644 --- a/tests/functional/nccl_message_transfer.c +++ b/tests/functional/nccl_message_transfer.c @@ -17,7 +17,7 @@ int main(int argc, char* argv[]) { ncclResult_t res = ncclSuccess; int rank, proc_name_len, num_ranks = 0, local_rank = 0, peer_rank = 0; - int buffer_type = NCCL_PTR_HOST; + int buffer_type = NCCL_PTR_CUDA; test_nccl_properties_t props = {}; /* Plugin defines */ @@ -42,9 +42,6 @@ int main(int argc, char* argv[]) char *expected_buf = NULL; int done, received_size; - /* Indicates if NICs support GPUDirect */ - int *test_support_gdr = NULL; - /* All processors IDs, used to find out the local rank */ char *all_proc_name = NULL; @@ -102,7 +99,7 @@ int main(int argc, char* argv[]) } } - /* Set CUDA device for subsequent device memory allocation, in case GDR is used */ + /* Set CUDA device for subsequent device memory allocation */ NCCL_OFI_TRACE(NCCL_NET, "Using CUDA device %d for memory allocation", local_rank); /* Get external Network from NCCL-OFI library */ @@ -121,20 +118,10 @@ int main(int argc, char* argv[]) OFINCCLCHECKGOTO(extNet->devices(&ndev), res, exit); NCCL_OFI_INFO(NCCL_NET, "Received %d network devices", ndev); - test_support_gdr = (int *)malloc(sizeof(int) * ndev); - if (test_support_gdr == NULL) { - NCCL_OFI_WARN("Failed to allocate memory"); - res = ncclInternalError; - goto exit; - } - /* Get Properties for the device */ for (int dev = 0; dev < ndev; dev++) { OFINCCLCHECKGOTO(extNet->getProperties(dev, &props), res, exit); print_dev_props(dev, &props); - - /* Set CUDA support */ - test_support_gdr[dev] = is_gdr_supported_nic(props.ptrSupport); } /* Test all devices */ @@ -147,12 +134,7 @@ int main(int argc, char* argv[]) } NCCL_OFI_TRACE(NCCL_INIT, "Rank %d uses %d device for communication", rank, dev); - - if (test_support_gdr[dev] == 1) { - NCCL_OFI_INFO(NCCL_INIT | NCCL_NET, - "Network supports communication using CUDA buffers. Dev: %d", dev); - buffer_type = NCCL_PTR_CUDA; - } + buffer_type = NCCL_PTR_CUDA; /* Listen API */ NCCL_OFI_INFO(NCCL_NET, "Server: Listening on dev %d", dev); @@ -310,7 +292,7 @@ int main(int argc, char* argv[]) goto exit; } - if ((rank == 1) && (buffer_type == NCCL_PTR_CUDA)) { + if (rank == 1) { NCCL_OFI_TRACE(NCCL_NET, "Issue flush for data consistency. Request idx: %d", idx); @@ -337,14 +319,12 @@ int main(int argc, char* argv[]) extNet->deregMr((void *)sComm, mhandle[idx]), res, exit); } else { - if ((buffer_type == NCCL_PTR_CUDA) && !ofi_nccl_gdr_flush_disable()) { - /* Data validation may fail if flush operations are disabled */ - } else { - OFINCCLCHECKGOTO( - validate_data(recv_buf[idx], expected_buf, - send_sizes[szidx], buffer_type), - res, exit); - } + OFINCCLCHECKGOTO(validate_data(recv_buf[idx], + expected_buf, + send_sizes[szidx], + buffer_type), + res, + exit); OFINCCLCHECKGOTO( extNet->deregMr((void *)rComm, mhandle[idx]), res, exit); @@ -423,11 +403,6 @@ exit:; expected_buf = NULL; } - if (test_support_gdr) { - free(test_support_gdr); - test_support_gdr = NULL; - } - if (all_proc_name) { free(all_proc_name); all_proc_name = NULL; diff --git a/tests/functional/ring.c b/tests/functional/ring.c index 77aac75a6..56eb3fdbc 100644 --- a/tests/functional/ring.c +++ b/tests/functional/ring.c @@ -12,7 +12,7 @@ int main(int argc, char *argv[]) { ncclResult_t res = ncclSuccess; int rank, size, next, prev, proc_name_len, local_rank = 0; - int buffer_type = NCCL_PTR_HOST; + int buffer_type = NCCL_PTR_CUDA; /* Plugin defines */ int ndev; @@ -41,9 +41,6 @@ int main(int argc, char *argv[]) char *expected_buf = NULL; int done, received_size; - /* Indicates if NICs support GPUDirect */ - int *test_support_gdr = NULL; - /* All processors IDs, used to find out the local rank */ char *all_proc_name = NULL; @@ -96,7 +93,7 @@ int main(int argc, char *argv[]) } } - /* Set CUDA device for subsequent device memory allocation, in case GDR is used */ + /* Set CUDA device for subsequent device memory allocation */ NCCL_OFI_TRACE(NCCL_NET, "Using CUDA device %d for memory allocation", local_rank); /* Allocate and populate expected buffer */ @@ -127,22 +124,11 @@ int main(int argc, char *argv[]) OFINCCLCHECKGOTO(extNet->devices(&ndev), res, exit); NCCL_OFI_INFO(NCCL_NET, "Received %d network devices", ndev); - /* Indicates if NICs support GPUDirect */ - test_support_gdr = (int *)malloc(sizeof(int) * ndev); - if (test_support_gdr == NULL) { - NCCL_OFI_WARN("Failed to allocate memory"); - res = ncclInternalError; - goto exit; - } - /* Get Properties for the device */ for (int dev = 0; dev < ndev; dev++) { test_nccl_properties_t props = {}; OFINCCLCHECKGOTO(extNet->getProperties(dev, &props), res, exit); print_dev_props(dev, &props); - - /* Set CUDA support */ - test_support_gdr[dev] = is_gdr_supported_nic(props.ptrSupport); } /* Test all devices */ @@ -152,12 +138,6 @@ int main(int argc, char *argv[]) NCCL_OFI_TRACE(NCCL_INIT, "Rank %d uses %d device for communication", rank, dev); - if (test_support_gdr[dev] == 1) { - NCCL_OFI_INFO(NCCL_INIT | NCCL_NET, - "Network supports communication using CUDA buffers. Dev: %d", dev); - buffer_type = NCCL_PTR_CUDA; - } - /* Listen API */ NCCL_OFI_INFO(NCCL_NET, "Server: Listening on device %d", dev); OFINCCLCHECKGOTO(extNet->listen(dev, (void *)&handle, (void **)&lComm), res, exit); @@ -251,28 +231,28 @@ int main(int argc, char *argv[]) if (done) { inflight_reqs--; req_completed_recv[idx] = 1; - - /* Invoke flush operations unless user has explicitly disabled it */ - if (buffer_type == NCCL_PTR_CUDA) { - NCCL_OFI_TRACE(NCCL_NET, - "Issue flush for data consistency. Request idx: %d", - idx); - nccl_net_ofi_req_t *iflush_req = NULL; - OFINCCLCHECKGOTO(extNet->iflush((void *)rComm, nrecv, - (void **)&recv_buf[idx], sizes, - &recv_mhandle[idx], (void **)&iflush_req), res, exit); - done = 0; - if (iflush_req) { - while (!done) { - OFINCCLCHECKGOTO(extNet->test((void *)iflush_req, &done, NULL), res, exit); - } + NCCL_OFI_TRACE(NCCL_NET, "Issue flush for data consistency. Request idx: %d", idx); + nccl_net_ofi_req_t *iflush_req = NULL; + OFINCCLCHECKGOTO(extNet->iflush((void *)rComm, + nrecv, + (void **)&recv_buf[idx], + sizes, + &recv_mhandle[idx], + (void **)&iflush_req), + res, + exit); + done = 0; + if (iflush_req) { + while (!done) { + OFINCCLCHECKGOTO(extNet->test((void *)iflush_req, &done, NULL), + res, + exit); } } - if ((buffer_type == NCCL_PTR_CUDA) && !ofi_nccl_gdr_flush_disable()) { - /* Data validation may fail if flush operations are disabled */ - } else - OFINCCLCHECKGOTO(validate_data(recv_buf[idx], expected_buf, SEND_SIZE, buffer_type), res, exit); + OFINCCLCHECKGOTO(validate_data(recv_buf[idx], expected_buf, SEND_SIZE, buffer_type), + res, + exit); /* Deregister memory handle */ OFINCCLCHECKGOTO(extNet->deregMr((void *)rComm, recv_mhandle[idx]), res, exit); @@ -340,11 +320,6 @@ exit:; expected_buf = NULL; } - if (test_support_gdr) { - free(test_support_gdr); - test_support_gdr = NULL; - } - if (all_proc_name) { free(all_proc_name); all_proc_name = NULL;