Skip to content

Commit

Permalink
tree: cleanup "gdr_support" variable
Browse files Browse the repository at this point in the history
In the default case, we lazily create all fabric resources at the time
of communicator creation, such that they end up owned by the correct
thread and/or are resident on the correct cpu socket and memory domain.

Previously, there existed an ugly dependency chain in our init: while
the large majority of the provider properties that we care about can be
extracted from fi_getinfo responses, some can only be effectively
queried by attempting mutations against an existing endpoint/domain/etc
and seeing if it failed or not. A further subset of these properties
need to be exposed back by nccl-net-ofi to nccl, at the time of
getProperties, and prior to communicator instantiation.

To work around this, late in init we pick a device, instantiate it,
query the attributes we need for getProperties, and then tear it all
down. This is expensive and delays our init, as well as exposing us to
bugs from incomplete teardown.

The sole case in the codebase where this is necessary today is
around detecting gdr support for FI_HMEM_CUDA. With dmabuf now as the
default, it's relatively safe to just avoid the call and optimistically
assume support when both cuda properties are true and when FI_HMEM is
available in the provider.

Signed-off-by: Nicholas Sielicki <nslick@amazon.com>
  • Loading branch information
aws-nslick committed Nov 26, 2024
1 parent bcb2e96 commit cd45df6
Show file tree
Hide file tree
Showing 13 changed files with 218 additions and 324 deletions.
6 changes: 1 addition & 5 deletions include/nccl_ofi.h
Original file line number Diff line number Diff line change
Expand Up @@ -81,15 +81,11 @@ extern "C" {
/* Initial number of entries in the MR cache of a device */
#define NCCL_OFI_MR_CACHE_INIT_SIZE 128

/* Indicates if GPUDirect is supported by libfabric provider */
enum gdr_support_level_t {GDR_UNKNOWN, GDR_SUPPORTED, GDR_UNSUPPORTED};
extern enum gdr_support_level_t support_gdr;


/* Indicates if the cudaDeviceFlushGPUDirectRDMAWrites function should be used
* to flush data to the GPU. Note, CUDA flush support is not supported on all
* platforms and should be disabled by default */
extern bool cuda_flush;
extern bool gdr_flush_disabled;

/* number of duplicate providers to create for each discovered
* provider, including renaming to cause NCCL to create additional
Expand Down
17 changes: 17 additions & 0 deletions include/nccl_ofi_cuda.h
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,23 @@ bool nccl_net_ofi_cuda_have_dma_buf_attr(void);
*/
bool nccl_net_ofi_cuda_have_gdr_support_attr(void);

/*
 * @brief query CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_FLUSH_WRITES_OPTIONS
 * @return true if the attribute is fetched successfully and reports that
 *         host-initiated flushes are supported; false otherwise.
 */
bool nccl_net_ofi_cuda_have_gdr_flush_support_attr(void);

/*
 * @brief test whether gdrcopy can possibly be supported, depending on the
 * linked libfabric version and the properties exposed by cuda.
 *
 * @return true if GDR is potentially viable on this build/device;
 *         false otherwise.
 */
bool nccl_net_ofi_cuda_gdr_viable(void);

#ifdef __cplusplus
} // End extern "C"
#endif
Expand Down
3 changes: 3 additions & 0 deletions include/nccl_ofi_param.h
Original file line number Diff line number Diff line change
Expand Up @@ -281,6 +281,9 @@ OFI_NCCL_PARAM_INT(disable_gdr_required_check, "DISABLE_GDR_REQUIRED_CHECK", 0);
*/
OFI_NCCL_PARAM_INT(disable_dmabuf, "DISABLE_DMABUF", 0);

/* Largely exists for parity with DISABLE_DMABUF, but usage of this is discouraged. */
OFI_NCCL_PARAM_INT(disable_gdrcopy, "DISABLE_GDRCOPY", 0);

/*
* Messages sized larger than this threshold will be striped across multiple rails
*/
Expand Down
10 changes: 5 additions & 5 deletions m4/check_pkg_cuda.m4
Original file line number Diff line number Diff line change
Expand Up @@ -53,15 +53,15 @@ AC_DEFUN([CHECK_PKG_CUDA], [
[check_pkg_found=no],
[-ldl -lrt])])
check_cuda_gdr_flush_define=0
check_cuda_gdr_define=0
AS_IF([test "${check_pkg_found}" = "yes"],
[
AC_MSG_CHECKING([if CUDA 11.3+ is available for GDR Write Flush support])
AC_MSG_CHECKING([if CUDA 11.3+ is available for GDR + GDR Write Flush support])
AC_COMPILE_IFELSE([AC_LANG_PROGRAM([
#include <cuda.h>
_Static_assert(CUDA_VERSION >= 11030, "cudart>=11030 required for cuFlushGPUDirectRDMAWrites");
])],[ check_cuda_gdr_flush_define=1 chk_result=yes ],
[ check_cuda_gdr_flush_define=0 chk_result=no ])
])],[ check_cuda_gdr_define=1 chk_result=yes ],
[ check_cuda_gdr_define=0 chk_result=no ])
AC_MSG_RESULT(${chk_result})
])
Expand All @@ -85,7 +85,7 @@ AC_DEFUN([CHECK_PKG_CUDA], [
AC_DEFINE_UNQUOTED([HAVE_CUDA], [${check_pkg_define}], [Defined to 1 if CUDA is available])
AC_DEFINE_UNQUOTED([HAVE_CUDA_DMABUF_SUPPORT], [${check_cuda_dmabuf_define}], [Defined to 1 if CUDA DMA-BUF support is available])
AC_DEFINE_UNQUOTED([HAVE_CUDA_GDRFLUSH_SUPPORT], [${check_cuda_gdr_flush_define}], [Defined to 1 if CUDA cuFlushGPUDirectRDMAWrites support is available])
AC_DEFINE_UNQUOTED([HAVE_CUDA_GDR_SUPPORT], [${check_cuda_gdr_define}], [Defined to 1 if CUDA GPUDirect RDMA support is available])
AM_CONDITIONAL([HAVE_CUDA], [test "${check_pkg_found}" = "yes"])
AC_SUBST([CUDA_LDFLAGS])
Expand Down
66 changes: 55 additions & 11 deletions src/nccl_ofi_cuda.c
Original file line number Diff line number Diff line change
Expand Up @@ -69,19 +69,23 @@ int nccl_net_ofi_cuda_init(void)
RESOLVE_CUDA_FUNCTION(cuCtxGetDevice);
RESOLVE_CUDA_FUNCTION(cuDeviceGetAttribute);

if (HAVE_CUDA_GDRFLUSH_SUPPORT && nccl_net_ofi_cuda_have_gdr_support_attr() && ofi_nccl_cuda_flush_enable()) {
NCCL_OFI_WARN("CUDA flush enabled");
cuda_flush = true;
} else {
cuda_flush = ofi_nccl_cuda_flush_enable();
gdr_flush_disabled = ofi_nccl_gdr_flush_disable();

#if HAVE_CUDA_GDR_SUPPORT
if (!(nccl_net_ofi_cuda_gdr_viable() &&
nccl_net_ofi_cuda_have_gdr_flush_support_attr())) {
gdr_flush_disabled = true;
cuda_flush = false;
}
#endif

return 0;
}

int nccl_net_ofi_cuda_flush_gpudirect_rdma_writes(void)
{
#if HAVE_CUDA_GDRFLUSH_SUPPORT
#if HAVE_CUDA_GDR_SUPPORT
static_assert(CUDA_VERSION >= 11030, "Requires cudart>=11.3");
cudaError_t ret = cudaDeviceFlushGPUDirectRDMAWrites(cudaFlushGPUDirectRDMAWritesTargetCurrentDevice,
cudaFlushGPUDirectRDMAWritesToOwner);
Expand Down Expand Up @@ -129,9 +133,9 @@ int nccl_net_ofi_get_cuda_device_for_addr(void *data, int *dev_id)
};
}

bool nccl_net_ofi_cuda_have_gdr_support_attr(void)
bool nccl_net_ofi_cuda_have_gdr_flush_support_attr(void)
{
#if HAVE_CUDA_GDRFLUSH_SUPPORT
#if HAVE_CUDA_GDR_SUPPORT
if (pfn_cuCtxGetDevice == NULL || pfn_cuDeviceGetAttribute == NULL) {
return false;
}
Expand All @@ -143,13 +147,29 @@ bool nccl_net_ofi_cuda_have_gdr_support_attr(void)
}

int supported;
result = pfn_cuDeviceGetAttribute(&supported, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED, dev);
if (result != CUDA_SUCCESS || !((bool)supported)) {
result = pfn_cuDeviceGetAttribute(&supported, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_FLUSH_WRITES_OPTIONS, dev);
return result == CUDA_SUCCESS && ((supported & CU_FLUSH_GPU_DIRECT_RDMA_WRITES_OPTION_HOST) != 0);
#else
return false;
#endif
}

bool nccl_net_ofi_cuda_have_gdr_support_attr(void)
{
#if HAVE_CUDA_GDR_SUPPORT
if (pfn_cuCtxGetDevice == NULL || pfn_cuDeviceGetAttribute == NULL) {
return false;
}

result = pfn_cuDeviceGetAttribute(&supported, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_FLUSH_WRITES_OPTIONS, dev);
return result == CUDA_SUCCESS && ((supported & CU_FLUSH_GPU_DIRECT_RDMA_WRITES_OPTION_HOST) != 0);
CUdevice dev;
CUresult result = pfn_cuCtxGetDevice(&dev);
if (result != CUDA_SUCCESS) {
return false;
}

int supported;
result = pfn_cuDeviceGetAttribute(&supported, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED, dev);
return result == CUDA_SUCCESS && (bool)supported;
#else
return false;
#endif
Expand Down Expand Up @@ -179,3 +199,27 @@ bool nccl_net_ofi_cuda_have_dma_buf_attr(void)
return false;
#endif
}

/* Decide whether GDR (GPUDirect RDMA via gdrcopy) is potentially usable.
 * Returns false on the first disqualifying condition, logging the reason;
 * otherwise returns true. Note this is an optimistic check: it does not
 * probe the provider by mutating an endpoint/domain — TODO confirm callers
 * treat a true result as "viable", not "guaranteed supported". */
bool nccl_net_ofi_cuda_gdr_viable(void)
{
/* Disable GDR if building against too-old libfabric. */
/* FI_MAJOR_VERSION/FI_MINOR_VERSION are compile-time constants, so this
 * gates on the libfabric headers the plugin was built against. */
if (FI_VERSION_LT(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), FI_VERSION(1, 18))) {
NCCL_OFI_TRACE(NCCL_INIT | NCCL_NET, "Will not use GDR, requires Libfabric 1.18 or greater.");
return false;
}

/* Disable GDR if explicitly disabled by user. */
/* Reads the OFI_NCCL_DISABLE_GDRCOPY environment parameter. */
if (ofi_nccl_disable_gdrcopy()) {
NCCL_OFI_TRACE(NCCL_INIT | NCCL_NET, "Will not attempt to use GDRCopy, explicitly disabled by user.");
return false;
}

/* Disable GDR if CUDA does not report GDR support in device attributes. */
/* Queries CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED on the current
 * device; false also covers the case where CUDA GDR support was compiled
 * out or the driver entry points were not resolved. */
if (!nccl_net_ofi_cuda_have_gdr_support_attr()) {
NCCL_OFI_TRACE(NCCL_INIT | NCCL_NET,
"Will not attempt to use GDRCopy, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED was false.");
return false;
}

return true;
}
94 changes: 25 additions & 69 deletions src/nccl_ofi_net.c
Original file line number Diff line number Diff line change
Expand Up @@ -30,13 +30,11 @@
#include "nccl_ofi_ofiutils.h"
#include "nccl_ofi_system.h"

/* Indicates if GPUDirect is supported by libfabric provider */
enum gdr_support_level_t support_gdr = GDR_UNKNOWN;

/* Indicates if the cudaDeviceFlushGPUDirectRDMAWrites function should be used
* to flush data to the GPU. Note, CUDA flush support is not supported on all
* platforms and should be disabled by default */
bool cuda_flush = false;
bool gdr_flush_disabled = true;

/* number of duplicate providers to create for each discovered
* provider, including renaming to cause NCCL to create additional
Expand Down Expand Up @@ -136,9 +134,6 @@ int nccl_net_ofi_create_plugin(nccl_net_ofi_plugin_t **plugin_p)
int ret = 0;
const char *provider_filter = NULL;
nccl_net_ofi_plugin_t *plugin;
nccl_net_ofi_ep_t *base_ep = NULL;
nccl_net_ofi_device_t *device = NULL;
nccl_ofi_properties_t properties;

NCCL_OFI_INFO(NCCL_INIT | NCCL_NET, "Initializing " PACKAGE_STRING);

Expand All @@ -162,6 +157,11 @@ int nccl_net_ofi_create_plugin(nccl_net_ofi_plugin_t **plugin_p)
*/
mr_cache_alignment = NCCL_OFI_MIN(system_page_size, NCCL_OFI_CACHE_PAGE_SIZE);

/* configuration parameters */
nic_dup_conns = ofi_nccl_nic_dup_conns();
net_latency = (float)ofi_nccl_net_latency();
cq_read_count = ofi_nccl_cq_read_count();

#if HAVE_CUDA
ret = nccl_net_ofi_cuda_init();
if (ret != 0) {
Expand All @@ -170,17 +170,22 @@ int nccl_net_ofi_create_plugin(nccl_net_ofi_plugin_t **plugin_p)
}
#endif

/* configuration parameters */
nic_dup_conns = ofi_nccl_nic_dup_conns();
net_latency = (float)ofi_nccl_net_latency();
cq_read_count = ofi_nccl_cq_read_count();

if (platform_init) {
ret = platform_init(&provider_filter);
if (ret != 0)
goto exit;
}

#if HAVE_CUDA
if (nic_dup_conns > 0 && nccl_net_ofi_cuda_have_gdr_support_attr()) {
NCCL_OFI_WARN(
"NCCL_OFI_NIC_DUP_CONNS set on platform that supports GPUDirect RDMA. This configuration is not "
"supported.");
ret = -ENOTSUP;
goto exit;
}
#endif

/* This is ugly, but here's the basic protocol selection
* logic:
* 1. if the user set NCCL_OFI_PROTOCOL, use that.
Expand Down Expand Up @@ -285,55 +290,6 @@ int nccl_net_ofi_create_plugin(nccl_net_ofi_plugin_t **plugin_p)
goto exit;
}

/* In order to set endpoint options and potentially NCCL configuration
* options (such as NCCL_PROTO) during the plugin initialization
* process, we need to create an endpoint and call the platform hook
* "platform_config_endpoint" using "get_ep". This code makes the
* assumption that the thread calling "nccl_net_ofi_init" will make
* communication calls. As well, since without this code the endpoint
* would be created the first time "get_ep" in called during a listen or
* connect call, creating the endpoint earlier would not be a waste of
* resources. This initialization happens once per process, and thus it
* does not matter which device is used to create the endpoint.
*/
device = plugin->get_device(plugin, 0);

ret = device->get_ep(device, &base_ep);
if (ret != 0) {
goto exit;
}
ret = device->get_properties(device, &properties);
if (ret != 0) {
goto exit;
}
NCCL_OFI_INFO(NCCL_NET | NCCL_INIT, "Support for global registrations: %s",
(properties.regIsGlobal == 0) ? "false" : "true");
NCCL_OFI_INFO(NCCL_NET | NCCL_INIT, "Support for DMA-BUF registrations: %s",
(properties.dmabuf_support == 0) ? "false" : "true");
/* Cause release to not actually free the resources, to speed
* up initialization, since the very same resources will be
* recreated by NCCL soon after initialization to do real
* communication.
*/
base_ep->ref_cnt++;
ret = base_ep->release_ep(base_ep);
base_ep->ref_cnt--;
if (ret != 0) {
goto exit;
}

assert(support_gdr != GDR_UNKNOWN);

/* we don't actually know if GDR is supported until we've
* created the first endpoint, so this check needs to be way
* down here
*/
if (nic_dup_conns > 0 && support_gdr != GDR_UNSUPPORTED) {
NCCL_OFI_WARN("NCCL_OFI_NIC_DUP_CONNS set on platform that supports GPUDirect RDMA. This configuration is not supported.");
ret = -ENOTSUP;
goto exit;
}

*plugin_p = plugin;

exit:
Expand Down Expand Up @@ -416,12 +372,7 @@ static int set_nic_props_default(int dev_id, struct fi_info *nic_prov,
*/
props->max_group_receives = NCCL_OFI_MAX_RECVS;

if (support_gdr == GDR_SUPPORTED) {
props->hmem_support = true;
} else {
props->hmem_support = false;
}

props->hmem_support = false;
props->dmabuf_support = false;

/* Should be successful for ptrSupport invocation */
Expand Down Expand Up @@ -580,14 +531,19 @@ int nccl_net_ofi_info_properties(nccl_net_ofi_plugin_t *plugin, struct fi_info *

props->max_mr_key_size = nic_prov->domain_attr->mr_key_size;

props->hmem_support = ((nic_prov->caps & FI_HMEM) != 0) &&
FI_VERSION_GE(nic_prov->fabric_attr->api_version, FI_VERSION(1, 18)) &&
(HAVE_NEURON || nccl_net_ofi_cuda_have_gdr_support_attr());

props->dmabuf_support = ((nic_prov->caps & FI_HMEM) != 0) &&
FI_VERSION_GE(nic_prov->fabric_attr->api_version, FI_VERSION(1, 20)) &&
nccl_ofi_dmabuf_viable()
;
if (props->dmabuf_support) {
NCCL_OFI_TRACE(NCCL_INIT | NCCL_NET, "DMA-BUF support is advertised in properties.");
}

NCCL_OFI_TRACE(NCCL_INIT | NCCL_NET,
"NCCL properties: dmabuf=%s hmem=%s",
props->dmabuf_support ? "yes" : "no",
props->hmem_support ? "yes" : "no");

goto exit;
error:
Expand Down
Loading

0 comments on commit cd45df6

Please sign in to comment.