aws · aws-nslick · Nov 26, 2024
@@ -81,15 +81,11 @@ extern "C" {
 /* Initial number of entries in the MR cache of a device */
 #define NCCL_OFI_MR_CACHE_INIT_SIZE     128
 
-/* Indicates if GPUDirect is supported by libfabric provider */
-enum gdr_support_level_t {GDR_UNKNOWN, GDR_SUPPORTED, GDR_UNSUPPORTED};
-extern enum gdr_support_level_t support_gdr;
-
-
 /* Indicates if the cudaDeviceFlushGPUDirectRDMAWrites function should be used
  * to flush data to the GPU. Note, CUDA flush support is not supported on all
  * platforms and should be disabled by default */
 extern bool cuda_flush;
+extern bool gdr_flush_disabled;
 
 /* number of duplicate providers to create for each discovered
  * provider, including renaming to cause NCCL to create additional

@@ -66,6 +66,23 @@ bool nccl_net_ofi_cuda_have_dma_buf_attr(void);
  */
 bool nccl_net_ofi_cuda_have_gdr_support_attr(void);
 
+/*
+ * @brief	query CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_FLUSH_WRITES_OPTIONS
+ *
+ * @return	true if the attribute is fetched successfully and includes
+ *		    CU_FLUSH_GPU_DIRECT_RDMA_WRITES_OPTION_HOST; false otherwise
+ */
+bool nccl_net_ofi_cuda_have_gdr_flush_support_attr(void);
+
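+/*
+ * @brief test whether gdrcopy can possibly be supported, depending on the libfabric
+ * version built against, the DISABLE_GDRCOPY parameter, and the properties exposed by cuda.
+ *
+ * @return	true if built against libfabric 1.18 or newer, not disabled by the user,
+ *		    and CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED is reported; false otherwise
+ */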
+bool nccl_net_ofi_cuda_gdr_viable(void);
+
 #ifdef __cplusplus
 }  // End extern "C"
 #endif

@@ -281,6 +281,9 @@ OFI_NCCL_PARAM_INT(disable_gdr_required_check, "DISABLE_GDR_REQUIRED_CHECK", 0);
  */
 OFI_NCCL_PARAM_INT(disable_dmabuf, "DISABLE_DMABUF", 0);
 
+/* When non-zero, nccl_net_ofi_cuda_gdr_viable() returns false. Largely exists for parity with DISABLE_DMABUF, but usage of this is discouraged. */
+OFI_NCCL_PARAM_INT(disable_gdrcopy, "DISABLE_GDRCOPY", 0);
+
 /*
  * Messages sized larger than this threshold will be striped across multiple rails
  */

@@ -53,15 +53,15 @@ AC_DEFUN([CHECK_PKG_CUDA], [
          [check_pkg_found=no],
          [-ldl -lrt])])
 
-  check_cuda_gdr_flush_define=0
+  check_cuda_gdr_define=0
   AS_IF([test "${check_pkg_found}" = "yes"],
         [
-        AC_MSG_CHECKING([if CUDA 11.3+ is available for GDR Write Flush support])
+        AC_MSG_CHECKING([if CUDA 11.3+ is available for GDR + GDR Write Flush support])
         AC_COMPILE_IFELSE([AC_LANG_PROGRAM([
         #include <cuda.h>
         _Static_assert(CUDA_VERSION >= 11030, "cudart>=11030 required for cuFlushGPUDirectRDMAWrites");
-        ])],[ check_cuda_gdr_flush_define=1 chk_result=yes ],
-            [ check_cuda_gdr_flush_define=0 chk_result=no ])
+        ])],[ check_cuda_gdr_define=1 chk_result=yes ],
+            [ check_cuda_gdr_define=0 chk_result=no ])
         AC_MSG_RESULT(${chk_result})
         ])
 
@@ -85,7 +85,7 @@ AC_DEFUN([CHECK_PKG_CUDA], [
 
   AC_DEFINE_UNQUOTED([HAVE_CUDA], [${check_pkg_define}], [Defined to 1 if CUDA is available])
   AC_DEFINE_UNQUOTED([HAVE_CUDA_DMABUF_SUPPORT], [${check_cuda_dmabuf_define}], [Defined to 1 if CUDA DMA-BUF support is available])
-  AC_DEFINE_UNQUOTED([HAVE_CUDA_GDRFLUSH_SUPPORT], [${check_cuda_gdr_flush_define}], [Defined to 1 if CUDA cuFlushGPUDirectRDMAWrites support is available])
+  AC_DEFINE_UNQUOTED([HAVE_CUDA_GDR_SUPPORT], [${check_cuda_gdr_define}], [Defined to 1 if CUDA cuFlushGPUDirectRDMAWrites support is available])
   AM_CONDITIONAL([HAVE_CUDA], [test "${check_pkg_found}" = "yes"])
 
   AC_SUBST([CUDA_LDFLAGS])

@@ -69,19 +69,23 @@ int nccl_net_ofi_cuda_init(void)
 	RESOLVE_CUDA_FUNCTION(cuCtxGetDevice);
 	RESOLVE_CUDA_FUNCTION(cuDeviceGetAttribute);
 
-	if (HAVE_CUDA_GDRFLUSH_SUPPORT && nccl_net_ofi_cuda_have_gdr_support_attr() && ofi_nccl_cuda_flush_enable()) {
-		NCCL_OFI_WARN("CUDA flush enabled");
-		cuda_flush = true;
-	} else {
+	cuda_flush = ofi_nccl_cuda_flush_enable();
+	gdr_flush_disabled = ofi_nccl_gdr_flush_disable();
+
+#if HAVE_CUDA_GDR_SUPPORT
+	if (!(nccl_net_ofi_cuda_gdr_viable() &&
+		  nccl_net_ofi_cuda_have_gdr_flush_support_attr())) {
+		gdr_flush_disabled = true;
 		cuda_flush = false;
 	}
+#endif
 
 	return 0;
 }
 
 int nccl_net_ofi_cuda_flush_gpudirect_rdma_writes(void)
 {
-#if HAVE_CUDA_GDRFLUSH_SUPPORT
+#if HAVE_CUDA_GDR_SUPPORT
 	static_assert(CUDA_VERSION >= 11030, "Requires cudart>=11.3");
 	cudaError_t ret = cudaDeviceFlushGPUDirectRDMAWrites(cudaFlushGPUDirectRDMAWritesTargetCurrentDevice,
 	                                                     cudaFlushGPUDirectRDMAWritesToOwner);
@@ -129,9 +133,9 @@ int nccl_net_ofi_get_cuda_device_for_addr(void *data, int *dev_id)
 	};
 }
 
-bool nccl_net_ofi_cuda_have_gdr_support_attr(void)
+bool nccl_net_ofi_cuda_have_gdr_flush_support_attr(void)
 {
-#if HAVE_CUDA_GDRFLUSH_SUPPORT
+#if HAVE_CUDA_GDR_SUPPORT
 	if (pfn_cuCtxGetDevice == NULL || pfn_cuDeviceGetAttribute == NULL) {
 		return false;
 	}
@@ -143,13 +147,29 @@ bool nccl_net_ofi_cuda_have_gdr_support_attr(void)
 	}
 
 	int supported;
-	result = pfn_cuDeviceGetAttribute(&supported, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED, dev);
-	if (result != CUDA_SUCCESS || !((bool)supported)) {
+	result = pfn_cuDeviceGetAttribute(&supported, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_FLUSH_WRITES_OPTIONS, dev);
+	return result == CUDA_SUCCESS && ((supported & CU_FLUSH_GPU_DIRECT_RDMA_WRITES_OPTION_HOST) != 0);
+#else
+	return false;
+#endif
+}
+
+bool nccl_net_ofi_cuda_have_gdr_support_attr(void)
+{
+#if HAVE_CUDA_GDR_SUPPORT
+	if (pfn_cuCtxGetDevice == NULL || pfn_cuDeviceGetAttribute == NULL) {
 		return false;
 	}
 
-	result = pfn_cuDeviceGetAttribute(&supported, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_FLUSH_WRITES_OPTIONS, dev);
-	return result == CUDA_SUCCESS && ((supported & CU_FLUSH_GPU_DIRECT_RDMA_WRITES_OPTION_HOST) != 0);
+	CUdevice dev;
+	CUresult result = pfn_cuCtxGetDevice(&dev);
+	if (result != CUDA_SUCCESS) {
+		return false;
+	}
+
+	int supported;
+	result = pfn_cuDeviceGetAttribute(&supported, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED, dev);
+	return result == CUDA_SUCCESS && (bool)supported;
 #else
 	return false;
 #endif
@@ -179,3 +199,27 @@ bool nccl_net_ofi_cuda_have_dma_buf_attr(void)
 	return false;
 #endif
 }
+
+bool nccl_net_ofi_cuda_gdr_viable(void)
+{
+	/* Disable GDR if building against too-old libfabric. */
+	if (FI_VERSION_LT(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), FI_VERSION(1, 18))) {
+		NCCL_OFI_TRACE(NCCL_INIT | NCCL_NET, "Will not use GDR, requires Libfabric 1.18 or greater.");
+		return false;
+	}
+
+	/* Disable GDR if explicitly disabled by user. */
+	if (ofi_nccl_disable_gdrcopy()) {
+		NCCL_OFI_TRACE(NCCL_INIT | NCCL_NET, "Will not attempt to use GDRCopy, explicitly disabled by user.");
+		return false;
+	}
+
+	/* Disable GDR if CUDA does not report GDR support in device attributes. */
+	if (!nccl_net_ofi_cuda_have_gdr_support_attr()) {
+		NCCL_OFI_TRACE(NCCL_INIT | NCCL_NET,
+		               "Will not attempt to use GDRCopy, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED was false.");
+		return false;
+	}
+
+	return true;
+}
@@ -30,13 +30,11 @@
 #include "nccl_ofi_ofiutils.h"
 #include "nccl_ofi_system.h"
 
-/* Indicates if GPUDirect is supported by libfabric provider */
-enum gdr_support_level_t support_gdr = GDR_UNKNOWN;
-
 /* Indicates if the cudaDeviceFlushGPUDirectRDMAWrites function should be used
  * to flush data to the GPU. Note, CUDA flush support is not supported on all
  * platforms and should be disabled by default */
 bool cuda_flush = false;
+bool gdr_flush_disabled = true;
 
 /* number of duplicate providers to create for each discovered
  * provider, including renaming to cause NCCL to create additional
@@ -136,9 +134,6 @@ int nccl_net_ofi_create_plugin(nccl_net_ofi_plugin_t **plugin_p)
 	int ret = 0;
 	const char *provider_filter = NULL;
 	nccl_net_ofi_plugin_t *plugin;
-	nccl_net_ofi_ep_t *base_ep = NULL;
-	nccl_net_ofi_device_t *device = NULL;
-	nccl_ofi_properties_t properties;
 
 	NCCL_OFI_INFO(NCCL_INIT | NCCL_NET, "Initializing " PACKAGE_STRING);
 
@@ -162,6 +157,11 @@ int nccl_net_ofi_create_plugin(nccl_net_ofi_plugin_t **plugin_p)
 	 */
 	mr_cache_alignment = NCCL_OFI_MIN(system_page_size, NCCL_OFI_CACHE_PAGE_SIZE);
 
+	/* configuration parameters */
+	nic_dup_conns = ofi_nccl_nic_dup_conns();
+	net_latency = (float)ofi_nccl_net_latency();
+	cq_read_count = ofi_nccl_cq_read_count();
+
 #if HAVE_CUDA
 	ret = nccl_net_ofi_cuda_init();
 	if (ret != 0) {
@@ -170,17 +170,22 @@ int nccl_net_ofi_create_plugin(nccl_net_ofi_plugin_t **plugin_p)
 	}
 #endif
 
-	/* configuration parameters */
-	nic_dup_conns = ofi_nccl_nic_dup_conns();
-	net_latency = (float)ofi_nccl_net_latency();
-	cq_read_count = ofi_nccl_cq_read_count();
-
 	if (platform_init) {
 		ret = platform_init(&provider_filter);
 		if (ret != 0)
 			goto exit;
 	}
 
+#if HAVE_CUDA
+	if (nic_dup_conns > 0 && nccl_net_ofi_cuda_have_gdr_support_attr()) {
+		NCCL_OFI_WARN(
+			"NCCL_OFI_NIC_DUP_CONNS set on platform that supports GPUDirect RDMA.  This configuration is not "
+		        "supported.");
+		ret = -ENOTSUP;
+		goto exit;
+	}
+#endif
+
 	/* This is ugly, but here's the basic protocol selection
 	 * logic:
 	 *   1. if the user set NCCL_OFI_PROTOCOL, use that.
@@ -285,55 +290,6 @@ int nccl_net_ofi_create_plugin(nccl_net_ofi_plugin_t **plugin_p)
 		goto exit;
 	}
 
-	/* In order to set endpoint options and potentially NCCL configuration
-	 * options (such as NCCL_PROTO) during the plugin initialization
-	 * process, we need to create an endpoint and call the platform hook
-	 * "platform_config_endpoint" using "get_ep". This code makes the
-	 * assumption that the thread calling "nccl_net_ofi_init" will make
-	 * communication calls. As well, since without this code the endpoint
-	 * would be created the first time "get_ep" in called during a listen or
-	 * connect call, creating the endpoint earlier would not be a waste of
-	 * resources. This initialization happens once per process, and thus it
-	 * does not matter which device is used to create the endpoint.
-	 */
-	device = plugin->get_device(plugin, 0);
-
-	ret = device->get_ep(device, &base_ep);
-	if (ret != 0) {
-		goto exit;
-	}
-	ret = device->get_properties(device, &properties);
-	if (ret != 0) {
-		goto exit;
-	}
-	NCCL_OFI_INFO(NCCL_NET | NCCL_INIT, "Support for global registrations: %s",
-		      (properties.regIsGlobal == 0) ? "false" : "true");
-	NCCL_OFI_INFO(NCCL_NET | NCCL_INIT, "Support for DMA-BUF registrations: %s",
-		      (properties.dmabuf_support == 0) ? "false" : "true");
-	/* Cause release to not actually free the resources, to speed
-	 * up initialization, since the very same resources will be
-	 * recreated by NCCL soon after initialization to do real
-	 * communication.
-	 */
-	base_ep->ref_cnt++;
-	ret = base_ep->release_ep(base_ep);
-	base_ep->ref_cnt--;
-	if (ret != 0) {
-		goto exit;
-	}
-
-	assert(support_gdr != GDR_UNKNOWN);
-
-	/* we don't actually know if GDR is supported until we've
-	 * created the first endpoint, so this check needs to be way
-	 * down here
-	 */
-	if (nic_dup_conns > 0 && support_gdr != GDR_UNSUPPORTED) {
-		NCCL_OFI_WARN("NCCL_OFI_NIC_DUP_CONNS set on platform that supports GPUDirect RDMA.  This configuration is not supported.");
-		ret = -ENOTSUP;
-		goto exit;
-	}
-
 	*plugin_p = plugin;
 
  exit:
@@ -416,12 +372,7 @@ static int set_nic_props_default(int dev_id, struct fi_info *nic_prov,
 	 */
 	props->max_group_receives = NCCL_OFI_MAX_RECVS;
 
-	if (support_gdr == GDR_SUPPORTED) {
-		props->hmem_support = true;
-	} else {
-		props->hmem_support = false;
-	}
-
+	props->hmem_support = false;
 	props->dmabuf_support = false;
 
 	/* Should be successful for ptrSupport invocation */
@@ -580,14 +531,19 @@ int nccl_net_ofi_info_properties(nccl_net_ofi_plugin_t *plugin, struct fi_info *
 
 	props->max_mr_key_size = nic_prov->domain_attr->mr_key_size;
 
+	props->hmem_support = ((nic_prov->caps & FI_HMEM) != 0) &&
+	                      FI_VERSION_GE(nic_prov->fabric_attr->api_version, FI_VERSION(1, 18)) &&
+	                      (HAVE_NEURON || nccl_net_ofi_cuda_have_gdr_support_attr());
 
 	props->dmabuf_support = ((nic_prov->caps & FI_HMEM) != 0) &&
 		FI_VERSION_GE(nic_prov->fabric_attr->api_version, FI_VERSION(1, 20)) &&
 		nccl_ofi_dmabuf_viable()
 		;
-	if (props->dmabuf_support) {
-		NCCL_OFI_TRACE(NCCL_INIT | NCCL_NET, "DMA-BUF support is advertised in properties.");
-	}
+
+	NCCL_OFI_TRACE(NCCL_INIT | NCCL_NET,
+	               "NCCL properties: dmabuf=%s hmem=%s",
+	               props->dmabuf_support ? "yes" : "no",
+	               props->hmem_support ? "yes" : "no");
 
 	goto exit;
 error: