Skip to content

Commit

Permalink
rdma: add separate bounce buffer freelist for data (eager) messages
Browse files Browse the repository at this point in the history
Separate the bounce buffer freelists into a smaller-sized freelist for
control messages and a larger-sized freelist for data (eager) messages.

Also refactor bounce buffer freelists to be per-rail instead of shared
across all rails. Parameters `min_posted_bounce_buffers` and
`max_posted_bounce_buffers` are now per-rail values.

Signed-off-by: Eric Raut <eraut@amazon.com>
  • Loading branch information
rauteric committed Oct 3, 2024
1 parent 3fd8ae3 commit bf176e6
Show file tree
Hide file tree
Showing 3 changed files with 118 additions and 65 deletions.
8 changes: 4 additions & 4 deletions include/nccl_ofi_param.h
Original file line number Diff line number Diff line change
Expand Up @@ -239,18 +239,18 @@ OFI_NCCL_PARAM_INT(disable_dmabuf, "DISABLE_DMABUF", 0);
OFI_NCCL_PARAM_UINT(round_robin_threshold, "ROUND_ROBIN_THRESHOLD", (256 * 1024));

/*
* Minimum bounce buffers posted per endpoint. The plugin will attempt to post
* Minimum bounce buffers posted per rail. The plugin will attempt to post
* more bounce buffers if we dip below this threshold, allocating new bounce
* buffers if needed.
*/
OFI_NCCL_PARAM_INT(rdma_min_posted_bounce_buffers, "RDMA_MIN_POSTED_BOUNCE_BUFFERS", 64);
OFI_NCCL_PARAM_INT(rdma_min_posted_bounce_buffers, "RDMA_MIN_POSTED_BOUNCE_BUFFERS", 16);

/*
* Maximum bounce buffers posted per endpoint. The plugin will not attempt to
* Maximum bounce buffers posted per rail. The plugin will not attempt to
* post more bounce buffers if we reach this threshold, returning available
* buffers to the free list if needed
*/
OFI_NCCL_PARAM_INT(rdma_max_posted_bounce_buffers, "RDMA_MAX_POSTED_BOUNCE_BUFFERS", 128);
OFI_NCCL_PARAM_INT(rdma_max_posted_bounce_buffers, "RDMA_MAX_POSTED_BOUNCE_BUFFERS", 32);

/*
* Internode network latency reported to NCCL. Defaults to 0, unless the configured
Expand Down
13 changes: 6 additions & 7 deletions include/nccl_ofi_rdma.h
Original file line number Diff line number Diff line change
Expand Up @@ -690,6 +690,12 @@ struct nccl_net_ofi_ep_rail {
size_t max_bounce_posted;
/* Mutex for bounce buffer operations */
pthread_mutex_t bounce_mutex;
/* Bounce request fl */
nccl_ofi_freelist_t *bounce_buff_reqs_fl;
/* Buffer freelist */
nccl_ofi_freelist_t *bounce_buff_fl;
/* Size of bounce buffers */
size_t buff_size;
};

/*
Expand Down Expand Up @@ -734,13 +740,6 @@ struct nccl_net_ofi_rdma_ep {
/* Pending requests queue */
nccl_ofi_deque_t *pending_reqs_queue;

/* Free list of bounce buffers */
nccl_ofi_freelist_t *bounce_buff_fl;
/* Free list of bounce buffer requests */
nccl_ofi_freelist_t *bounce_buff_reqs_fl;
/* Size of bounce buffers */
size_t bounce_buff_size;

/* True if this ep is stored in the thread-local store */
bool thread_local_ep;

Expand Down
162 changes: 108 additions & 54 deletions src/nccl_ofi_rdma.c
Original file line number Diff line number Diff line change
Expand Up @@ -2104,18 +2104,17 @@ static inline int free_bounce_req(nccl_net_ofi_rdma_req_t *req,
{
assert(!dec_inflight_reqs);
rdma_req_bounce_data_t *bounce_data = get_bounce_data(req);
nccl_net_ofi_rdma_ep_t *ep = bounce_data->ep;
/* Free buffer */
if (bounce_data->bounce_fl_item) {
nccl_ofi_freelist_entry_free(ep->bounce_buff_fl, bounce_data->bounce_fl_item);
nccl_ofi_freelist_entry_free(bounce_data->rail->bounce_buff_fl, bounce_data->bounce_fl_item);
}
return free_base_req(NULL, ep->bounce_buff_reqs_fl, req, false);
return free_base_req(NULL, bounce_data->rail->bounce_buff_reqs_fl, req, false);
}

static inline nccl_net_ofi_rdma_req_t *alloc_bounce_req(nccl_net_ofi_rdma_ep_t *ep,
nccl_net_ofi_ep_rail_t *rail)
{
nccl_net_ofi_rdma_req_t *req = allocate_req(ep->bounce_buff_reqs_fl);
nccl_net_ofi_rdma_req_t *req = allocate_req(rail->bounce_buff_reqs_fl);
if (!req) return NULL;

req->comm = NULL;
Expand All @@ -2125,9 +2124,11 @@ static inline nccl_net_ofi_rdma_req_t *alloc_bounce_req(nccl_net_ofi_rdma_ep_t *

rdma_req_bounce_data_t *bounce_data = get_bounce_data(req);

nccl_ofi_freelist_t *bounce_buff_fl = rail->bounce_buff_fl;

nccl_net_ofi_rdma_bounce_fl_item_t *bounce_fl_item =
(nccl_net_ofi_rdma_bounce_fl_item_t *)nccl_ofi_freelist_entry_alloc(
ep->bounce_buff_fl);
bounce_buff_fl);
if (!bounce_fl_item) {
NCCL_OFI_WARN("Failed to allocate bounce_fl_item");
req->free(req, false);
Expand All @@ -2136,7 +2137,7 @@ static inline nccl_net_ofi_rdma_req_t *alloc_bounce_req(nccl_net_ofi_rdma_ep_t *
assert(NCCL_OFI_IS_PTR_ALIGNED(&bounce_fl_item->bounce_msg, BOUNCE_BUFFER_ALIGNMENT));

bounce_data->bounce_fl_item = bounce_fl_item;
bounce_data->buff_len = ep->bounce_buff_size;
bounce_data->buff_len = rail->buff_size;
bounce_data->rail = rail;
bounce_data->ep = ep;
return req;
Expand Down Expand Up @@ -5091,8 +5092,7 @@ static int post_bounce_buffer(nccl_net_ofi_rdma_req_t *req,
/* Reset memcheck guards of bounce buffer freelist entry to
* accessible but undefined to cover cases where the buffer
* gets re-posted */
nccl_net_ofi_rdma_ep_t *ep = bounce_data->ep;
nccl_ofi_freelist_entry_set_undefined(ep->bounce_buff_fl,
nccl_ofi_freelist_entry_set_undefined(ep_rail->bounce_buff_fl,
bounce_fl_item);

req->state = NCCL_OFI_RDMA_REQ_CREATED;
Expand Down Expand Up @@ -5711,59 +5711,123 @@ static inline nccl_net_ofi_rdma_send_comm_t *calloc_rdma_send_comm(int num_rails
/*
* @brief Initialize bounce buffer data of endpoint
*
* @param ep
* Endpoint with bounce buffer and bounce requests not being
* @param ep_rail
* Endpoint rail with bounce buffer and bounce requests not being
* initialized yet.
* @param ep
* Corresponding endpoint
* @return 0, on success
* non-zero, on error
*/
static inline int init_bounce_buffers(nccl_net_ofi_rdma_ep_t *ep)
static inline int init_bounce_buffers_rail(nccl_net_ofi_ep_rail_t *ep_rail, nccl_net_ofi_rdma_ep_t *ep,
size_t buff_size, size_t entry_alignment)
{
int ret = 0;

ret = nccl_ofi_freelist_init(sizeof(nccl_net_ofi_rdma_req_t),
ofi_nccl_rdma_min_posted_bounce_buffers(), 16, 0,
&ep->bounce_buff_reqs_fl);
&ep_rail->bounce_buff_reqs_fl);
if (ret != 0) {
NCCL_OFI_WARN("Failed to init bounce_buff_reqs_fl");
return ret;
}

ret = nccl_ofi_freelist_init_mr(sizeof(nccl_net_ofi_rdma_bounce_fl_item_t) + ep->bounce_buff_size,
ep_rail->buff_size = buff_size;
ret = nccl_ofi_freelist_init_mr(buff_size,
ofi_nccl_rdma_min_posted_bounce_buffers(), 16, 0,
freelist_regmr_host_fn, freelist_deregmr_host_fn,
ep, 0, BOUNCE_BUFFER_ALIGNMENT, &ep->bounce_buff_fl);
ep, 0, entry_alignment, &ep_rail->bounce_buff_fl);
if (ret != 0) {
NCCL_OFI_WARN("Failed to init bounce_buff_fl");
if (nccl_ofi_freelist_fini(ep->bounce_buff_reqs_fl))
NCCL_OFI_WARN("Also failed to freelist_fini bounce_buff_reqs_fl");
goto error;
}

ep_rail->min_bounce_posted = ofi_nccl_rdma_min_posted_bounce_buffers();
ep_rail->max_bounce_posted = ofi_nccl_rdma_max_posted_bounce_buffers();
ep_rail->num_bounce_posted = 0;

ret = nccl_net_ofi_mutex_init(&ep_rail->bounce_mutex, NULL);
if (ret != 0) {
goto error;
}

return ret;

error:
if (ep_rail->bounce_buff_reqs_fl != NULL) {
nccl_ofi_freelist_fini(ep_rail->bounce_buff_reqs_fl);
ep_rail->bounce_buff_reqs_fl = NULL;
}
if (ep_rail->bounce_buff_fl != NULL) {
nccl_ofi_freelist_fini(ep_rail->bounce_buff_fl);
ep_rail->bounce_buff_fl = NULL;
}

return ret;
}

static inline int fini_bounce_buffers_rail(nccl_net_ofi_ep_rail_t *ep_rail)
{
int ret = nccl_net_ofi_mutex_destroy(&ep_rail->bounce_mutex);
if (ret != 0) {
return ret;
}

/*
* The *_bounce_posted limits are used in the progress engine to
* determine if the receive queue is hydrated with sufficient buffers.
* The parameters account for all the rails, so scale down bounds to
* what a single rail would need for the control endpoint.
*/
ep->control_rail.min_bounce_posted = NCCL_OFI_DIV_CEIL(
ofi_nccl_rdma_min_posted_bounce_buffers(), ep->num_rails
);
ep->control_rail.max_bounce_posted = NCCL_OFI_DIV_CEIL(
ofi_nccl_rdma_max_posted_bounce_buffers(), ep->num_rails
);
ep->control_rail.num_bounce_posted = 0;
ret = nccl_net_ofi_mutex_init(&ep->control_rail.bounce_mutex, NULL);
if (ep_rail->bounce_buff_fl) {
ret = nccl_ofi_freelist_fini(ep_rail->bounce_buff_fl);
ep_rail->bounce_buff_fl = NULL;
if (ret != 0) {
return ret;
}
}

if (ep_rail->bounce_buff_reqs_fl) {
ret = nccl_ofi_freelist_fini(ep_rail->bounce_buff_reqs_fl);
ep_rail->bounce_buff_reqs_fl = NULL;
if (ret != 0) {
return ret;
}
}

return ret;
}

static inline int init_bounce_buffers(nccl_net_ofi_rdma_ep_t *ep)
{
int ret = 0;
/* Control rail */
{
size_t buff_size = sizeof(nccl_net_ofi_rdma_bounce_fl_item_t) +
NCCL_OFI_MAX(NCCL_OFI_MAX(
sizeof(nccl_net_ofi_rdma_ctrl_msg_t),
sizeof(nccl_ofi_rdma_connection_info_t)),
sizeof(nccl_net_ofi_rdma_close_msg_t));
ret = init_bounce_buffers_rail(&ep->control_rail, ep,
buff_size, BOUNCE_BUFFER_ALIGNMENT);
if (ret != 0) {
return ret;
}
}

/* Data rails */
for (int rail_id = 0; rail_id < ep->num_rails; ++rail_id) {
size_t buff_size = sizeof(nccl_net_ofi_rdma_bounce_fl_item_t)
+ eager_max_size;

nccl_net_ofi_ep_rail_t *rail = get_rail(ep, rail_id);
rail->min_bounce_posted = NCCL_OFI_DIV_CEIL(
ofi_nccl_rdma_min_posted_bounce_buffers(), ep->num_rails
);
rail->max_bounce_posted = NCCL_OFI_DIV_CEIL(
ofi_nccl_rdma_max_posted_bounce_buffers(), ep->num_rails
);
nccl_net_ofi_mutex_init(&rail->bounce_mutex, NULL);

ret = init_bounce_buffers_rail(rail, ep, buff_size,
BOUNCE_BUFFER_ALIGNMENT);
if (ret != 0) {

/* Cleanup previously-established rails */
fini_bounce_buffers_rail(&ep->control_rail);
for (int i = 0; i < rail_id; ++i) {
fini_bounce_buffers_rail(get_rail(ep, i));
}

return ret;
}
}

return ret;
Expand All @@ -5781,24 +5845,17 @@ static inline int init_bounce_buffers(nccl_net_ofi_rdma_ep_t *ep)
static inline int fini_bounce_buffers(nccl_net_ofi_rdma_ep_t *ep)
{
int ret = 0;
ret = nccl_ofi_freelist_fini(ep->bounce_buff_fl);
if (ret != 0) {
NCCL_OFI_WARN("Failed to fini bounce_buff_fl");
return ret;
}

ret = nccl_ofi_freelist_fini(ep->bounce_buff_reqs_fl);
if (ret != 0) {
NCCL_OFI_WARN("Failed to fini bounce_buff_reqs_fl");
return ret;
}

/* Data rails */
for (int rail_id = 0; rail_id < ep->num_rails; ++rail_id) {
nccl_net_ofi_ep_rail_t *rail = get_rail(ep, rail_id);
nccl_net_ofi_mutex_destroy(&rail->bounce_mutex);
ret = fini_bounce_buffers_rail(get_rail(ep, rail_id));
if (ret != 0) {
return ret;
}
}

nccl_net_ofi_mutex_destroy(&ep->control_rail.bounce_mutex);
/* Control rail */
ret = fini_bounce_buffers_rail(&ep->control_rail);

return ret;
}
Expand Down Expand Up @@ -6606,9 +6663,6 @@ static int create_ep(nccl_net_ofi_rdma_device_t *device,
/* Initialize number of rail */
ep->num_rails = num_rails;

ep->bounce_buff_size = NCCL_OFI_MAX(NCCL_OFI_MAX(sizeof(nccl_net_ofi_rdma_ctrl_msg_t), eager_max_size),
sizeof(nccl_ofi_rdma_connection_info_t));

ep->rails = (nccl_net_ofi_ep_rail_t *)calloc(ep->num_rails,
sizeof(nccl_net_ofi_ep_rail_t));
if (!ep->rails) {
Expand Down

0 comments on commit bf176e6

Please sign in to comment.