Skip to content

Commit

Permalink
tree: cleanup "gdr_support" variable
Browse files Browse the repository at this point in the history
In the default case, we lazily create all fabric resources at the time
of communicator creation, such that they end up owned by the correct
thread and/or are resident on the correct cpu socket and memory domain.

Previously, there existed an ugly dependency chain in our init: while
the large majority of the provider properties that we care about can be
extracted from fi_getinfo responses, some can only be effectively
queried by attempting mutations against an existing endpoint/domain/etc
and seeing if it failed or not. A further subset of these properties
need to be exposed back by nccl-net-ofi to nccl, at the time of
getProperties, and prior to communicator instantiation.

To work around this, late in init we pick a device, instantiate it,
query the attributes we need for getProperties, and then tear it all
down. This is expensive and delays our init, as well as exposing us to
bugs from incomplete teardown.

The sole case in the codebase where this is necessary today is
around detecting gdr support for FI_HMEM_CUDA. With dmabuf now as the
default, it's relatively safe to just avoid the call and optimistically
assume support when both cuda properties are true and when FI_HMEM is
available in the provider.

Signed-off-by: Nicholas Sielicki <nslick@amazon.com>
  • Loading branch information
aws-nslick committed Nov 26, 2024
1 parent bcb2e96 commit cd45df6
Show file tree
Hide file tree
Showing 13 changed files with 218 additions and 324 deletions.
6 changes: 1 addition & 5 deletions include/nccl_ofi.h
Original file line number Diff line number Diff line change
Expand Up @@ -81,15 +81,11 @@ extern "C" {
/* Initial number of entries in the MR cache of a device */
#define NCCL_OFI_MR_CACHE_INIT_SIZE 128

/* Indicates if GPUDirect is supported by libfabric provider */
enum gdr_support_level_t {GDR_UNKNOWN, GDR_SUPPORTED, GDR_UNSUPPORTED};
extern enum gdr_support_level_t support_gdr;


/* Indicates if the cudaDeviceFlushGPUDirectRDMAWrites function should be used
* to flush data to the GPU. Note, CUDA flush support is not supported on all
* platforms and should be disabled by default */
extern bool cuda_flush;
extern bool gdr_flush_disabled;

/* number of duplicate providers to create for each discovered
* provider, including renaming to cause NCCL to create additional
Expand Down
17 changes: 17 additions & 0 deletions include/nccl_ofi_cuda.h
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,23 @@ bool nccl_net_ofi_cuda_have_dma_buf_attr(void);
*/
bool nccl_net_ofi_cuda_have_gdr_support_attr(void);

/*
 * @brief query CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_FLUSH_WRITES_OPTIONS
 * @return true if the attribute is fetched successfully and reports that
 *         host-initiated flushes are supported; false otherwise.
 */
bool nccl_net_ofi_cuda_have_gdr_flush_support_attr(void);

/*
 * @brief test whether gdrcopy can possibly be supported, depending on the
 * linked libfabric version and the properties exposed by cuda.
 *
 * @return true if GDR is potentially viable on this build/device;
 *         false otherwise.
 */
bool nccl_net_ofi_cuda_gdr_viable(void);

#ifdef __cplusplus
} // End extern "C"
#endif
Expand Down
3 changes: 3 additions & 0 deletions include/nccl_ofi_param.h
Original file line number Diff line number Diff line change
Expand Up @@ -281,6 +281,9 @@ OFI_NCCL_PARAM_INT(disable_gdr_required_check, "DISABLE_GDR_REQUIRED_CHECK", 0);
*/
OFI_NCCL_PARAM_INT(disable_dmabuf, "DISABLE_DMABUF", 0);

/* Largely exists for parity with DISABLE_DMABUF, but usage of this is discouraged. */
OFI_NCCL_PARAM_INT(disable_gdrcopy, "DISABLE_GDRCOPY", 0);

/*
* Messages sized larger than this threshold will be striped across multiple rails
*/
Expand Down
10 changes: 5 additions & 5 deletions m4/check_pkg_cuda.m4
Original file line number Diff line number Diff line change
Expand Up @@ -53,15 +53,15 @@ AC_DEFUN([CHECK_PKG_CUDA], [
[check_pkg_found=no],
[-ldl -lrt])])
check_cuda_gdr_flush_define=0
check_cuda_gdr_define=0
AS_IF([test "${check_pkg_found}" = "yes"],
[
AC_MSG_CHECKING([if CUDA 11.3+ is available for GDR Write Flush support])
AC_MSG_CHECKING([if CUDA 11.3+ is available for GDR + GDR Write Flush support])
AC_COMPILE_IFELSE([AC_LANG_PROGRAM([
#include <cuda.h>
_Static_assert(CUDA_VERSION >= 11030, "cudart>=11030 required for cuFlushGPUDirectRDMAWrites");
])],[ check_cuda_gdr_flush_define=1 chk_result=yes ],
[ check_cuda_gdr_flush_define=0 chk_result=no ])
])],[ check_cuda_gdr_define=1 chk_result=yes ],
[ check_cuda_gdr_define=0 chk_result=no ])
AC_MSG_RESULT(${chk_result})
])
Expand All @@ -85,7 +85,7 @@ AC_DEFUN([CHECK_PKG_CUDA], [
AC_DEFINE_UNQUOTED([HAVE_CUDA], [${check_pkg_define}], [Defined to 1 if CUDA is available])
AC_DEFINE_UNQUOTED([HAVE_CUDA_DMABUF_SUPPORT], [${check_cuda_dmabuf_define}], [Defined to 1 if CUDA DMA-BUF support is available])
AC_DEFINE_UNQUOTED([HAVE_CUDA_GDRFLUSH_SUPPORT], [${check_cuda_gdr_flush_define}], [Defined to 1 if CUDA cuFlushGPUDirectRDMAWrites support is available])
AC_DEFINE_UNQUOTED([HAVE_CUDA_GDR_SUPPORT], [${check_cuda_gdr_define}], [Defined to 1 if CUDA GPUDirect RDMA support is available])
AM_CONDITIONAL([HAVE_CUDA], [test "${check_pkg_found}" = "yes"])
AC_SUBST([CUDA_LDFLAGS])
Expand Down
66 changes: 55 additions & 11 deletions src/nccl_ofi_cuda.c
Original file line number Diff line number Diff line change
Expand Up @@ -69,19 +69,23 @@ int nccl_net_ofi_cuda_init(void)
RESOLVE_CUDA_FUNCTION(cuCtxGetDevice);
RESOLVE_CUDA_FUNCTION(cuDeviceGetAttribute);

if (HAVE_CUDA_GDRFLUSH_SUPPORT && nccl_net_ofi_cuda_have_gdr_support_attr() && ofi_nccl_cuda_flush_enable()) {
NCCL_OFI_WARN("CUDA flush enabled");
cuda_flush = true;
} else {
cuda_flush = ofi_nccl_cuda_flush_enable();
gdr_flush_disabled = ofi_nccl_gdr_flush_disable();

#if HAVE_CUDA_GDR_SUPPORT
if (!(nccl_net_ofi_cuda_gdr_viable() &&
nccl_net_ofi_cuda_have_gdr_flush_support_attr())) {
gdr_flush_disabled = true;
cuda_flush = false;
}
#endif

return 0;
}

int nccl_net_ofi_cuda_flush_gpudirect_rdma_writes(void)
{
#if HAVE_CUDA_GDRFLUSH_SUPPORT
#if HAVE_CUDA_GDR_SUPPORT
static_assert(CUDA_VERSION >= 11030, "Requires cudart>=11.3");
cudaError_t ret = cudaDeviceFlushGPUDirectRDMAWrites(cudaFlushGPUDirectRDMAWritesTargetCurrentDevice,
cudaFlushGPUDirectRDMAWritesToOwner);
Expand Down Expand Up @@ -129,9 +133,9 @@ int nccl_net_ofi_get_cuda_device_for_addr(void *data, int *dev_id)
};
}

bool nccl_net_ofi_cuda_have_gdr_support_attr(void)
bool nccl_net_ofi_cuda_have_gdr_flush_support_attr(void)
{
#if HAVE_CUDA_GDRFLUSH_SUPPORT
#if HAVE_CUDA_GDR_SUPPORT
if (pfn_cuCtxGetDevice == NULL || pfn_cuDeviceGetAttribute == NULL) {
return false;
}
Expand All @@ -143,13 +147,29 @@ bool nccl_net_ofi_cuda_have_gdr_support_attr(void)
}

int supported;
result = pfn_cuDeviceGetAttribute(&supported, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED, dev);
if (result != CUDA_SUCCESS || !((bool)supported)) {
result = pfn_cuDeviceGetAttribute(&supported, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_FLUSH_WRITES_OPTIONS, dev);
return result == CUDA_SUCCESS && ((supported & CU_FLUSH_GPU_DIRECT_RDMA_WRITES_OPTION_HOST) != 0);
#else
return false;
#endif
}

bool nccl_net_ofi_cuda_have_gdr_support_attr(void)
{
#if HAVE_CUDA_GDR_SUPPORT
if (pfn_cuCtxGetDevice == NULL || pfn_cuDeviceGetAttribute == NULL) {
return false;
}

result = pfn_cuDeviceGetAttribute(&supported, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_FLUSH_WRITES_OPTIONS, dev);
return result == CUDA_SUCCESS && ((supported & CU_FLUSH_GPU_DIRECT_RDMA_WRITES_OPTION_HOST) != 0);
CUdevice dev;
CUresult result = pfn_cuCtxGetDevice(&dev);
if (result != CUDA_SUCCESS) {
return false;
}

int supported;
result = pfn_cuDeviceGetAttribute(&supported, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED, dev);
return result == CUDA_SUCCESS && (bool)supported;
#else
return false;
#endif
Expand Down Expand Up @@ -179,3 +199,27 @@ bool nccl_net_ofi_cuda_have_dma_buf_attr(void)
return false;
#endif
}

/* Decide whether GDR (GPUDirect RDMA via gdrcopy) is potentially usable.
 * Returns false on the first disqualifying condition, logging the reason;
 * otherwise returns true. Note this is an optimistic check: it does not
 * probe the provider by mutating an endpoint/domain — TODO confirm callers
 * treat a true result as "viable", not "guaranteed supported". */
bool nccl_net_ofi_cuda_gdr_viable(void)
{
/* Disable GDR if building against too-old libfabric. */
/* FI_MAJOR_VERSION/FI_MINOR_VERSION are compile-time constants, so this
 * gates on the libfabric headers the plugin was built against. */
if (FI_VERSION_LT(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), FI_VERSION(1, 18))) {
NCCL_OFI_TRACE(NCCL_INIT | NCCL_NET, "Will not use GDR, requires Libfabric 1.18 or greater.");
return false;
}

/* Disable GDR if explicitly disabled by user. */
/* Reads the OFI_NCCL_DISABLE_GDRCOPY environment parameter. */
if (ofi_nccl_disable_gdrcopy()) {
NCCL_OFI_TRACE(NCCL_INIT | NCCL_NET, "Will not attempt to use GDRCopy, explicitly disabled by user.");
return false;
}

/* Disable GDR if CUDA does not report GDR support in device attributes. */
/* Queries CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED on the current
 * device; false also covers the case where CUDA GDR support was compiled
 * out or the driver entry points were not resolved. */
if (!nccl_net_ofi_cuda_have_gdr_support_attr()) {
NCCL_OFI_TRACE(NCCL_INIT | NCCL_NET,
"Will not attempt to use GDRCopy, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED was false.");
return false;
}

return true;
}
94 changes: 25 additions & 69 deletions src/nccl_ofi_net.c
Original file line number Diff line number Diff line change
Expand Up @@ -30,13 +30,11 @@
#include "nccl_ofi_ofiutils.h"
#include "nccl_ofi_system.h"

/* Indicates if GPUDirect is supported by libfabric provider */
enum gdr_support_level_t support_gdr = GDR_UNKNOWN;

/* Indicates if the cudaDeviceFlushGPUDirectRDMAWrites function should be used
* to flush data to the GPU. Note, CUDA flush support is not supported on all
* platforms and should be disabled by default */
bool cuda_flush = false;
bool gdr_flush_disabled = true;

/* number of duplicate providers to create for each discovered
* provider, including renaming to cause NCCL to create additional
Expand Down Expand Up @@ -136,9 +134,6 @@ int nccl_net_ofi_create_plugin(nccl_net_ofi_plugin_t **plugin_p)
int ret = 0;
const char *provider_filter = NULL;
nccl_net_ofi_plugin_t *plugin;
nccl_net_ofi_ep_t *base_ep = NULL;
nccl_net_ofi_device_t *device = NULL;
nccl_ofi_properties_t properties;

NCCL_OFI_INFO(NCCL_INIT | NCCL_NET, "Initializing " PACKAGE_STRING);

Expand All @@ -162,6 +157,11 @@ int nccl_net_ofi_create_plugin(nccl_net_ofi_plugin_t **plugin_p)
*/
mr_cache_alignment = NCCL_OFI_MIN(system_page_size, NCCL_OFI_CACHE_PAGE_SIZE);

/* configuration parameters */
nic_dup_conns = ofi_nccl_nic_dup_conns();
net_latency = (float)ofi_nccl_net_latency();
cq_read_count = ofi_nccl_cq_read_count();

#if HAVE_CUDA
ret = nccl_net_ofi_cuda_init();
if (ret != 0) {
Expand All @@ -170,17 +170,22 @@ int nccl_net_ofi_create_plugin(nccl_net_ofi_plugin_t **plugin_p)
}
#endif

/* configuration parameters */
nic_dup_conns = ofi_nccl_nic_dup_conns();
net_latency = (float)ofi_nccl_net_latency();
cq_read_count = ofi_nccl_cq_read_count();

if (platform_init) {
ret = platform_init(&provider_filter);
if (ret != 0)
goto exit;
}

#if HAVE_CUDA
if (nic_dup_conns > 0 && nccl_net_ofi_cuda_have_gdr_support_attr()) {
NCCL_OFI_WARN(
"NCCL_OFI_NIC_DUP_CONNS set on platform that supports GPUDirect RDMA. This configuration is not "
"supported.");
ret = -ENOTSUP;
goto exit;
}
#endif

/* This is ugly, but here's the basic protocol selection
* logic:
* 1. if the user set NCCL_OFI_PROTOCOL, use that.
Expand Down Expand Up @@ -285,55 +290,6 @@ int nccl_net_ofi_create_plugin(nccl_net_ofi_plugin_t **plugin_p)
goto exit;
}

/* In order to set endpoint options and potentially NCCL configuration
* options (such as NCCL_PROTO) during the plugin initialization
* process, we need to create an endpoint and call the platform hook
* "platform_config_endpoint" using "get_ep". This code makes the
* assumption that the thread calling "nccl_net_ofi_init" will make
* communication calls. As well, since without this code the endpoint
* would be created the first time "get_ep" in called during a listen or
* connect call, creating the endpoint earlier would not be a waste of
* resources. This initialization happens once per process, and thus it
* does not matter which device is used to create the endpoint.
*/
device = plugin->get_device(plugin, 0);

ret = device->get_ep(device, &base_ep);
if (ret != 0) {
goto exit;
}
ret = device->get_properties(device, &properties);
if (ret != 0) {
goto exit;
}
NCCL_OFI_INFO(NCCL_NET | NCCL_INIT, "Support for global registrations: %s",
(properties.regIsGlobal == 0) ? "false" : "true");
NCCL_OFI_INFO(NCCL_NET | NCCL_INIT, "Support for DMA-BUF registrations: %s",
(properties.dmabuf_support == 0) ? "false" : "true");
/* Cause release to not actually free the resources, to speed
* up initialization, since the very same resources will be
* recreated by NCCL soon after initialization to do real
* communication.
*/
base_ep->ref_cnt++;
ret = base_ep->release_ep(base_ep);
base_ep->ref_cnt--;
if (ret != 0) {
goto exit;
}

assert(support_gdr != GDR_UNKNOWN);

/* we don't actually know if GDR is supported until we've
* created the first endpoint, so this check needs to be way
* down here
*/
if (nic_dup_conns > 0 && support_gdr != GDR_UNSUPPORTED) {
NCCL_OFI_WARN("NCCL_OFI_NIC_DUP_CONNS set on platform that supports GPUDirect RDMA. This configuration is not supported.");
ret = -ENOTSUP;
goto exit;
}

*plugin_p = plugin;

exit:
Expand Down Expand Up @@ -416,12 +372,7 @@ static int set_nic_props_default(int dev_id, struct fi_info *nic_prov,
*/
props->max_group_receives = NCCL_OFI_MAX_RECVS;

if (support_gdr == GDR_SUPPORTED) {
props->hmem_support = true;
} else {
props->hmem_support = false;
}

props->hmem_support = false;
props->dmabuf_support = false;

/* Should be successful for ptrSupport invocation */
Expand Down Expand Up @@ -580,14 +531,19 @@ int nccl_net_ofi_info_properties(nccl_net_ofi_plugin_t *plugin, struct fi_info *

props->max_mr_key_size = nic_prov->domain_attr->mr_key_size;

props->hmem_support = ((nic_prov->caps & FI_HMEM) != 0) &&
FI_VERSION_GE(nic_prov->fabric_attr->api_version, FI_VERSION(1, 18)) &&
(HAVE_NEURON || nccl_net_ofi_cuda_have_gdr_support_attr());

props->dmabuf_support = ((nic_prov->caps & FI_HMEM) != 0) &&
FI_VERSION_GE(nic_prov->fabric_attr->api_version, FI_VERSION(1, 20)) &&
nccl_ofi_dmabuf_viable()
;
if (props->dmabuf_support) {
NCCL_OFI_TRACE(NCCL_INIT | NCCL_NET, "DMA-BUF support is advertised in properties.");
}

NCCL_OFI_TRACE(NCCL_INIT | NCCL_NET,
"NCCL properties: dmabuf=%s hmem=%s",
props->dmabuf_support ? "yes" : "no",
props->hmem_support ? "yes" : "no");

goto exit;
error:
Expand Down
Loading

0 comments on commit cd45df6

Please sign in to comment.