Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

rdma: add separate bounce buffer freelist for data (eager) messages #614

Draft
wants to merge 1 commit into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 17 additions & 4 deletions include/nccl_ofi_param.h
Original file line number Diff line number Diff line change
Expand Up @@ -239,18 +239,31 @@ OFI_NCCL_PARAM_INT(disable_dmabuf, "DISABLE_DMABUF", 0);
OFI_NCCL_PARAM_UINT(min_stripe_size, "MIN_STRIPE_SIZE", (64 * 1024));

/*
* Minimum bounce buffers posted per endpoint. The plugin will attempt to post
 * Minimum ctrl recv buffers posted per endpoint rail. The plugin will attempt to post
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

per what rail? device, endpoint, etc?

* more buffers if we dip below this threshold, allocating new buffers if needed.
*/
OFI_NCCL_PARAM_INT(rdma_min_posted_ctrl_recv_buffers, "RDMA_MIN_POSTED_CTRL_RECV_BUFFERS", 64);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Shouldn't this be a function of max outstanding requests, which today we have at 128?

Copy link
Contributor Author

@rauteric rauteric Oct 15, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's a bit tricky, because the outstanding requests max (NCCL_OFI_MAX_REQUESTS) is

  1. A per-communicator value, whereas this parameter is a per-endpoint per-rail value, so we'd have to multiply this parameter by the number of communicators, which we don't know in advance
  2. Much higher than NCCL actually keeps in flight typically, so setting this param to NCCL_OFI_MAX_REQUESTS x num_comms will be overkill.

That said, we may need to tune this value, but the current value in this PR is already significantly higher than what is in master today (which is 16-32 per rail, and shared between eager and ctrl recv buffers).


/*
* Maximum ctrl recv buffers posted per rail. The plugin will not attempt to
* post more buffers if we reach this threshold, returning available buffers to
 * the free list if needed.
*/
OFI_NCCL_PARAM_INT(rdma_max_posted_ctrl_recv_buffers, "RDMA_MAX_POSTED_CTRL_RECV_BUFFERS", 128);

/*
* Minimum (eager) bounce buffers posted per rail. The plugin will attempt to post
* more bounce buffers if we dip below this threshold, allocating new bounce
* buffers if needed.
*/
OFI_NCCL_PARAM_INT(rdma_min_posted_bounce_buffers, "RDMA_MIN_POSTED_BOUNCE_BUFFERS", 64);
OFI_NCCL_PARAM_INT(rdma_min_posted_bounce_buffers, "RDMA_MIN_POSTED_BOUNCE_BUFFERS", 16);
bwbarrett marked this conversation as resolved.
Show resolved Hide resolved

/*
* Maximum bounce buffers posted per endpoint. The plugin will not attempt to
* Maximum (eager) bounce buffers posted per rail. The plugin will not attempt to
* post more bounce buffers if we reach this threshold, returning available
 * buffers to the free list if needed.
*/
OFI_NCCL_PARAM_INT(rdma_max_posted_bounce_buffers, "RDMA_MAX_POSTED_BOUNCE_BUFFERS", 128);
OFI_NCCL_PARAM_INT(rdma_max_posted_bounce_buffers, "RDMA_MAX_POSTED_BOUNCE_BUFFERS", 32);

/*
* Internode network latency reported to NCCL. Defaults to 0, unless the configured
Expand Down
13 changes: 6 additions & 7 deletions include/nccl_ofi_rdma.h
Original file line number Diff line number Diff line change
Expand Up @@ -688,6 +688,12 @@ struct nccl_net_ofi_ep_rail {
size_t max_bounce_posted;
/* Mutex for bounce buffer operations */
pthread_mutex_t bounce_mutex;
/* Bounce request fl */
nccl_ofi_freelist_t *bounce_buff_reqs_fl;
/* Buffer freelist */
nccl_ofi_freelist_t *bounce_buff_fl;
rauteric marked this conversation as resolved.
Show resolved Hide resolved
/* Size of bounce buffers */
size_t buff_size;
};

/*
Expand Down Expand Up @@ -722,13 +728,6 @@ struct nccl_net_ofi_rdma_ep {
/* Pending requests queue */
nccl_ofi_deque_t *pending_reqs_queue;

/* Free list of bounce buffers */
nccl_ofi_freelist_t *bounce_buff_fl;
/* Free list of bounce buffer requests */
nccl_ofi_freelist_t *bounce_buff_reqs_fl;
/* Size of bounce buffers */
size_t bounce_buff_size;

/* true if the current endpoint is a endpoint_per_communicator
receive communicator */
bool is_endpoint_per_communicator_ep;
Expand Down
173 changes: 117 additions & 56 deletions src/nccl_ofi_rdma.c
Original file line number Diff line number Diff line change
Expand Up @@ -2152,18 +2152,17 @@ static inline int free_bounce_req(nccl_net_ofi_rdma_req_t *req,
{
assert(!dec_inflight_reqs);
rdma_req_bounce_data_t *bounce_data = get_bounce_data(req);
nccl_net_ofi_rdma_ep_t *ep = bounce_data->ep;
/* Free buffer */
if (bounce_data->bounce_fl_item) {
nccl_ofi_freelist_entry_free(ep->bounce_buff_fl, bounce_data->bounce_fl_item);
nccl_ofi_freelist_entry_free(bounce_data->rail->bounce_buff_fl, bounce_data->bounce_fl_item);
}
return free_base_req(NULL, ep->bounce_buff_reqs_fl, req, false);
return free_base_req(NULL, bounce_data->rail->bounce_buff_reqs_fl, req, false);
}

static inline nccl_net_ofi_rdma_req_t *alloc_bounce_req(nccl_net_ofi_rdma_ep_t *ep,
nccl_net_ofi_ep_rail_t *rail)
{
nccl_net_ofi_rdma_req_t *req = allocate_req(ep->bounce_buff_reqs_fl);
nccl_net_ofi_rdma_req_t *req = allocate_req(rail->bounce_buff_reqs_fl);
if (!req) return NULL;

req->comm = NULL;
Expand All @@ -2173,9 +2172,11 @@ static inline nccl_net_ofi_rdma_req_t *alloc_bounce_req(nccl_net_ofi_rdma_ep_t *

rdma_req_bounce_data_t *bounce_data = get_bounce_data(req);

nccl_ofi_freelist_t *bounce_buff_fl = rail->bounce_buff_fl;

nccl_net_ofi_rdma_bounce_fl_item_t *bounce_fl_item =
(nccl_net_ofi_rdma_bounce_fl_item_t *)nccl_ofi_freelist_entry_alloc(
ep->bounce_buff_fl);
bounce_buff_fl);
if (!bounce_fl_item) {
NCCL_OFI_WARN("Failed to allocate bounce_fl_item");
req->free(req, false);
Expand All @@ -2184,7 +2185,7 @@ static inline nccl_net_ofi_rdma_req_t *alloc_bounce_req(nccl_net_ofi_rdma_ep_t *
assert(NCCL_OFI_IS_PTR_ALIGNED(&bounce_fl_item->bounce_msg, BOUNCE_BUFFER_ALIGNMENT));

bounce_data->bounce_fl_item = bounce_fl_item;
bounce_data->buff_len = ep->bounce_buff_size;
bounce_data->buff_len = rail->buff_size;
bounce_data->rail = rail;
bounce_data->ep = ep;
return req;
Expand Down Expand Up @@ -5131,8 +5132,7 @@ static int post_bounce_buffer(nccl_net_ofi_rdma_req_t *req,
/* Reset memcheck guards of bounce buffer freelist entry to
* accessible but undefined to cover cases where the buffer
* gets re-posted */
nccl_net_ofi_rdma_ep_t *ep = bounce_data->ep;
nccl_ofi_freelist_entry_set_undefined(ep->bounce_buff_fl,
nccl_ofi_freelist_entry_set_undefined(ep_rail->bounce_buff_fl,
bounce_fl_item);

req->state = NCCL_OFI_RDMA_REQ_CREATED;
Expand Down Expand Up @@ -5753,59 +5753,130 @@ static inline nccl_net_ofi_rdma_send_comm_t *calloc_rdma_send_comm(int num_rails
/*
* @brief Initialize bounce buffer data of endpoint
*
* @param ep
* Endpoint with bounce buffer and bounce requests not being
* @param ep_rail
* Endpoint rail with bounce buffer and bounce requests not being
* initialized yet.
* @param ep
* Corresponding endpoint
* @return 0, on success
* non-zero, on error
*/
static inline int init_bounce_buffers(nccl_net_ofi_rdma_ep_t *ep)
static inline int init_bounce_buffers_rail(nccl_net_ofi_ep_rail_t *ep_rail, nccl_net_ofi_rdma_ep_t *ep,
size_t buff_size, size_t entry_alignment,
size_t min_posted_count, size_t max_posted_count)
{
int ret = 0;

ret = nccl_ofi_freelist_init(sizeof(nccl_net_ofi_rdma_req_t),
ofi_nccl_rdma_min_posted_bounce_buffers(), 16, 0,
&ep->bounce_buff_reqs_fl);
max_posted_count, 16, 0,
&ep_rail->bounce_buff_reqs_fl);
if (ret != 0) {
NCCL_OFI_WARN("Failed to init bounce_buff_reqs_fl");
return ret;
}

ret = nccl_ofi_freelist_init_mr(sizeof(nccl_net_ofi_rdma_bounce_fl_item_t) + ep->bounce_buff_size,
ofi_nccl_rdma_min_posted_bounce_buffers(), 16, 0,
ep_rail->buff_size = buff_size;
ret = nccl_ofi_freelist_init_mr(buff_size,
max_posted_count, 16, 0,
freelist_regmr_host_fn, freelist_deregmr_host_fn,
ep, 0, BOUNCE_BUFFER_ALIGNMENT, &ep->bounce_buff_fl);
ep, 0, entry_alignment, &ep_rail->bounce_buff_fl);
if (ret != 0) {
NCCL_OFI_WARN("Failed to init bounce_buff_fl");
if (nccl_ofi_freelist_fini(ep->bounce_buff_reqs_fl))
NCCL_OFI_WARN("Also failed to freelist_fini bounce_buff_reqs_fl");
goto error;
}

ep_rail->min_bounce_posted = min_posted_count;
ep_rail->max_bounce_posted = max_posted_count;
ep_rail->num_bounce_posted = 0;

assert(ep_rail->max_bounce_posted >= ep_rail->min_bounce_posted);

ret = nccl_net_ofi_mutex_init(&ep_rail->bounce_mutex, NULL);
if (ret != 0) {
goto error;
}

return ret;

error:
if (ep_rail->bounce_buff_reqs_fl != NULL) {
nccl_ofi_freelist_fini(ep_rail->bounce_buff_reqs_fl);
ep_rail->bounce_buff_reqs_fl = NULL;
}
if (ep_rail->bounce_buff_fl != NULL) {
nccl_ofi_freelist_fini(ep_rail->bounce_buff_fl);
ep_rail->bounce_buff_fl = NULL;
}

return ret;
}

static inline int fini_bounce_buffers_rail(nccl_net_ofi_ep_rail_t *ep_rail)
{
int ret = nccl_net_ofi_mutex_destroy(&ep_rail->bounce_mutex);
if (ret != 0) {
return ret;
}

/*
* The *_bounce_posted limits are used in the progress engine to
* determine if the receive queue is hydrated with sufficient buffers.
* The parameters account for all the rails, so scale down bounds to
* what a single rail would need for the control endpoint.
*/
ep->control_rail.min_bounce_posted = NCCL_OFI_DIV_CEIL(
ofi_nccl_rdma_min_posted_bounce_buffers(), ep->num_rails
);
ep->control_rail.max_bounce_posted = NCCL_OFI_DIV_CEIL(
ofi_nccl_rdma_max_posted_bounce_buffers(), ep->num_rails
);
ep->control_rail.num_bounce_posted = 0;
ret = nccl_net_ofi_mutex_init(&ep->control_rail.bounce_mutex, NULL);
if (ep_rail->bounce_buff_fl) {
ret = nccl_ofi_freelist_fini(ep_rail->bounce_buff_fl);
ep_rail->bounce_buff_fl = NULL;
if (ret != 0) {
return ret;
}
}

if (ep_rail->bounce_buff_reqs_fl) {
ret = nccl_ofi_freelist_fini(ep_rail->bounce_buff_reqs_fl);
ep_rail->bounce_buff_reqs_fl = NULL;
if (ret != 0) {
return ret;
}
}

return ret;
}

static inline int init_bounce_buffers(nccl_net_ofi_rdma_ep_t *ep)
{
int ret = 0;
/* Control rail */
{
size_t buff_size = sizeof(nccl_net_ofi_rdma_bounce_fl_item_t) +
NCCL_OFI_MAX(NCCL_OFI_MAX(
sizeof(nccl_net_ofi_rdma_ctrl_msg_t),
sizeof(nccl_ofi_rdma_connection_info_t)),
sizeof(nccl_net_ofi_rdma_close_msg_t));
ret = init_bounce_buffers_rail(&ep->control_rail, ep,
buff_size, BOUNCE_BUFFER_ALIGNMENT,
ofi_nccl_rdma_min_posted_ctrl_recv_buffers(),
ofi_nccl_rdma_max_posted_ctrl_recv_buffers());
if (ret != 0) {
return ret;
}
}

/* Data rails */
for (int rail_id = 0; rail_id < ep->num_rails; ++rail_id) {
size_t buff_size = sizeof(nccl_net_ofi_rdma_bounce_fl_item_t)
+ eager_max_size;

nccl_net_ofi_ep_rail_t *rail = get_rail(ep, rail_id);
rail->min_bounce_posted = NCCL_OFI_DIV_CEIL(
ofi_nccl_rdma_min_posted_bounce_buffers(), ep->num_rails
);
rail->max_bounce_posted = NCCL_OFI_DIV_CEIL(
ofi_nccl_rdma_max_posted_bounce_buffers(), ep->num_rails
);
nccl_net_ofi_mutex_init(&rail->bounce_mutex, NULL);

ret = init_bounce_buffers_rail(rail, ep, buff_size,
BOUNCE_BUFFER_ALIGNMENT,
ofi_nccl_rdma_min_posted_bounce_buffers(),
ofi_nccl_rdma_max_posted_bounce_buffers());
if (ret != 0) {

/* Cleanup previously-established rails */
fini_bounce_buffers_rail(&ep->control_rail);
for (int i = 0; i < rail_id; ++i) {
fini_bounce_buffers_rail(get_rail(ep, i));
}

return ret;
}
}

return ret;
Expand All @@ -5823,24 +5894,17 @@ static inline int init_bounce_buffers(nccl_net_ofi_rdma_ep_t *ep)
static inline int fini_bounce_buffers(nccl_net_ofi_rdma_ep_t *ep)
{
int ret = 0;
ret = nccl_ofi_freelist_fini(ep->bounce_buff_fl);
if (ret != 0) {
NCCL_OFI_WARN("Failed to fini bounce_buff_fl");
return ret;
}

ret = nccl_ofi_freelist_fini(ep->bounce_buff_reqs_fl);
if (ret != 0) {
NCCL_OFI_WARN("Failed to fini bounce_buff_reqs_fl");
return ret;
}

/* Data rails */
for (int rail_id = 0; rail_id < ep->num_rails; ++rail_id) {
nccl_net_ofi_ep_rail_t *rail = get_rail(ep, rail_id);
nccl_net_ofi_mutex_destroy(&rail->bounce_mutex);
ret = fini_bounce_buffers_rail(get_rail(ep, rail_id));
if (ret != 0) {
return ret;
}
}

nccl_net_ofi_mutex_destroy(&ep->control_rail.bounce_mutex);
/* Control rail */
ret = fini_bounce_buffers_rail(&ep->control_rail);

return ret;
}
Expand Down Expand Up @@ -6722,9 +6786,6 @@ static int nccl_net_ofi_rdma_device_create_endpoint(nccl_net_ofi_device_t *base_
goto error;
}

ep->bounce_buff_size = NCCL_OFI_MAX(NCCL_OFI_MAX(sizeof(nccl_net_ofi_rdma_ctrl_msg_t), eager_max_size),
sizeof(nccl_ofi_rdma_connection_info_t));

ep->is_endpoint_per_communicator_ep = false;

ret = init_rail_ofi_resources(device, ep);
Expand Down
Loading