Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Switch from fine grained locking throughout the code base to device and domain level locking #743

Draft
wants to merge 6 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 3 additions & 10 deletions include/nccl_ofi_rdma.h
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,7 @@ typedef uint16_t nccl_ofi_rdma_msg_type_t;
* allocate an RDMA memory registration handle with `num_rails`+`num_control_rails` rails.
*/
typedef struct nccl_net_ofi_rdma_mr_handle {
struct nccl_net_ofi_rdma_device *device;

int num_rails;

Expand Down Expand Up @@ -408,12 +409,6 @@ typedef struct nccl_net_ofi_rdma_req {
/* Size of completed request */
size_t size;

/*
* Protect updating critical fields such as size and ncompls when
* network xfer happened over multiple rails
*/
pthread_mutex_t req_lock;

/* State of request */
nccl_net_ofi_rdma_req_state_t state;

Expand Down Expand Up @@ -544,7 +539,6 @@ typedef struct nccl_net_ofi_rdma_send_comm {

nccl_ofi_deque_elem_t cleanup_list_elem;

pthread_mutex_t ctrl_recv_lock;
bool received_close_message;
/* Counters for total sent and received control messages */
uint64_t n_ctrl_received;
Expand Down Expand Up @@ -624,7 +618,6 @@ typedef struct nccl_net_ofi_rdma_recv_comm {
nccl_ofi_deque_elem_t cleanup_list_elem;

/* Counters for total sent and received control messages */
pthread_mutex_t ctrl_counter_lock;
uint64_t n_ctrl_sent;
uint64_t n_ctrl_delivered;

Expand Down Expand Up @@ -701,8 +694,6 @@ struct nccl_net_ofi_ep_rail {
size_t min_bounce_posted;
/* Maximum posted bounce buffers (see RDMA_MAX_POSTED_BOUNCE_BUFFERS) */
size_t max_bounce_posted;
/* Mutex for bounce buffer operations */
pthread_mutex_t bounce_mutex;
};

/*
Expand Down Expand Up @@ -841,6 +832,8 @@ typedef struct nccl_net_ofi_rdma_domain {

/* List of endpoints and set of addresses they have connections to */
nccl_ofi_ep_addr_list_t *ep_addr_list;

pthread_mutex_t rdma_domain_lock;
} nccl_net_ofi_rdma_domain_t;


Expand Down
3 changes: 3 additions & 0 deletions include/nccl_ofi_sendrecv.h
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,9 @@ typedef struct nccl_net_ofi_sendrecv_ep {
/* Current available tag ID */
uint64_t tag;

/* Copy of the device's max_tag, kept here to avoid reading device information */
uint64_t max_tag;

/* Endpoint handle to communicate to */
struct fid_ep *ofi_ep;

Expand Down
Loading