Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 21 additions & 2 deletions src/components/ec/cuda/ec_cuda_executor_persistent.c
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,12 @@ ucc_cuda_executor_persistent_task_post(ucc_ee_executor_t *executor,
memcpy(ee_task->subtasks[0], task_args,
sizeof(ucc_ee_executor_task_args_t));
}
ucc_memory_cpu_store_fence();
/* tasks[] and pidx live in device-mapped (zero-copy) host memory shared
* with the persistent kernel. Use the bus (outer-shareable) store fence so
* the task args are guaranteed visible to the GPU before the updated pidx
* that publishes them; the inner-shareable CPU fence does not order stores
* against the GPU's shareability domain on weakly-ordered CPUs (aarch64). */
ucc_memory_bus_store_fence();
eee->pidx += ee_task->num_subtasks;
if (ucc_ec_cuda.thread_mode == UCC_THREAD_MULTIPLE) {
ucc_spin_unlock(&eee->tasks_lock);
Expand Down Expand Up @@ -163,7 +168,21 @@ ucc_status_t ucc_cuda_executor_persistent_stop(ucc_ee_executor_t *executor)
(*st != UCC_EC_CUDA_EXECUTOR_SHUTDOWN));
*st = UCC_EC_CUDA_EXECUTOR_SHUTDOWN;
eee->pidx = -1;
while(*st != UCC_EC_CUDA_EXECUTOR_SHUTDOWN_ACK) { }
/* state/pidx live in device-mapped (zero-copy) host memory the persistent
* kernel polls. Publish these stores to the GPU before spinning: on
* weakly-ordered CPUs (aarch64/Grace) the store to pidx may sit in the CPU
* store buffer while this loop spins, and the cheaper inner-shareable CPU
* store fence only orders against other CPUs, not the GPU. The bus
* (outer-shareable) store fence drains/orders the store to the domain the
* GPU observes; without it the kernel never sees pidx == -1, never writes
* SHUTDOWN_ACK, and this loop hangs forever. */
ucc_memory_bus_store_fence();
/* No load fence here: st is volatile (re-read every iteration) and points
* to coherent device-mapped memory, so the kernel's SHUTDOWN_ACK write
* becomes visible on its own. A fence would only order accesses, not force
* a cache re-read, and there is no dependent load after the poll. */
while (*st != UCC_EC_CUDA_EXECUTOR_SHUTDOWN_ACK) {
}
eee->super.ee_context = NULL;
eee->state = UCC_EC_CUDA_EXECUTOR_INITIALIZED;

Expand Down
9 changes: 9 additions & 0 deletions src/components/tl/cuda/allreduce/allreduce.c
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,15 @@ ucc_status_t ucc_tl_cuda_allreduce_init(ucc_base_coll_args_t *coll_args,
{
ucc_status_t status = UCC_ERR_NOT_IMPLEMENTED;
#ifdef HAVE_NVLS
ucc_tl_cuda_team_t *cuda_team = ucc_derived_of(team, ucc_tl_cuda_team_t);

/* NVLS is the only allreduce algorithm in TL/CUDA. If NVLS did not
* initialize for this team (e.g. it fell back because the peer fd import
* was denied), report NOT_SUPPORTED so the collective is served by another
* transport instead of dispatching to uninitialized NVLS resources. */
if (!cuda_team->nvls.enabled) {
return UCC_ERR_NOT_SUPPORTED;
}
/* Use NVLS algorithm as default */
status = ucc_tl_cuda_allreduce_nvls_init(coll_args, team, task_h);
#else
Expand Down
177 changes: 155 additions & 22 deletions src/components/tl/cuda/tl_cuda_nvls.c
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@
#include <sys/syscall.h> // for pidfd_open and pidfd_getfd
#include <sys/prctl.h> // for prctl()
#include <unistd.h> // for close()
#include <errno.h> // for EPERM/EACCES
#include <string.h> // for strerror()

/* RHEL 8 glibc headers (kernel 4.18) don't define pidfd syscall numbers */
#ifndef SYS_pidfd_open
Expand Down Expand Up @@ -195,6 +197,41 @@ static ucc_status_t ucc_tl_cuda_nvls_share_handles(
return status;
}

/* Log a peer-fd import failure. Always emits a debug line with the details;
* when the failure is a permission denial (the common single-node case: Yama
* ptrace_scope, missing CAP_SYS_PTRACE, or a seccomp filter blocking
* pidfd_getfd) it additionally emits a single, actionable warning (once per
* process) telling the user how to enable NVLS instead of silently losing it. */
static void ucc_tl_cuda_nvls_report_peer_import_denied(
ucc_tl_cuda_team_t *team, const char *what, int err)
{
static volatile int warned = 0;

tl_debug(
UCC_TL_TEAM_LIB(team),
"%s: %s (errno=%d); NVLS not available, falling back",
what,
strerror(err),
err);

if ((err != EPERM && err != EACCES) || warned) {
return;
}
warned = 1;
tl_warn(
UCC_TL_TEAM_LIB(team),
"NVLS disabled: importing a peer process file descriptor was denied "
"(%s). Single-node NVLS needs permission to access peer GPU memory "
"handles across processes. To enable NVLS, relax ptrace "
"restrictions on the host/container, e.g. host: "
"'sysctl -w kernel.yama.ptrace_scope=0'; docker: add "
"'--cap-add=SYS_PTRACE' (and if pidfd_getfd is blocked by seccomp, "
"'--security-opt seccomp=unconfined'); enroot: run with "
"'--container-remap-root' or set 'kernel.yama.ptrace_scope=0' on the "
"host. Collectives fall back to another transport in the meantime.",
strerror(err));
}

Comment thread
ikryukov marked this conversation as resolved.
static ucc_status_t ucc_tl_cuda_nvls_import_handle_posix(
struct ucc_tl_cuda_team *team, ucc_tl_cuda_nvls_handle_t *share_data,
CUmemGenericAllocationHandle *mc_handle)
Expand All @@ -217,20 +254,21 @@ static ucc_status_t ucc_tl_cuda_nvls_import_handle_posix(

pid_fd = syscall(SYS_pidfd_open, target_pid, 0);
if (pid_fd < 0) {
tl_error(
UCC_TL_TEAM_LIB(team),
"failed to open pidfd for pid %d",
target_pid);
/* Expected fallback condition (e.g. restricted ptrace/seccomp in a
* container): report and let the team fall back to another transport
* instead of emitting a scary error. */
ucc_tl_cuda_nvls_report_peer_import_denied(
team, "failed to open peer pidfd", errno);
return UCC_ERR_NO_RESOURCE;
}

peer_fd = syscall(SYS_pidfd_getfd, pid_fd, export_handle, 0);
if (peer_fd < 0) {
tl_error(
UCC_TL_TEAM_LIB(team),
"failed to get peer fd: %s (errno=%d)",
strerror(errno),
errno);
/* EPERM here typically means the container lacks the permissions to
* import a peer's fd (Yama ptrace_scope, missing CAP_SYS_PTRACE, or a
* seccomp filter). Supported fallback condition. */
ucc_tl_cuda_nvls_report_peer_import_denied(
team, "failed to get peer fd", errno);
close(pid_fd);
return UCC_ERR_NO_RESOURCE;
}
Expand All @@ -255,9 +293,10 @@ static ucc_status_t ucc_tl_cuda_nvls_import_handle_posix(
}

if (status != UCC_OK) {
tl_error(
tl_debug(
UCC_TL_TEAM_LIB(team),
"failed to import POSIX file descriptor handle from rank 0");
"failed to import POSIX file descriptor handle from rank 0; "
"NVLS not available, falling back");
return status;
}

Expand All @@ -281,9 +320,10 @@ static ucc_status_t ucc_tl_cuda_nvls_import_handle_fabric(
mc_handle, &share_data->data.fabric, CU_MEM_HANDLE_TYPE_FABRIC));

if (status != UCC_OK) {
tl_error(
tl_debug(
UCC_TL_TEAM_LIB(team),
"failed to import fabric handle from rank 0. status (%d) %s",
"failed to import fabric handle from rank 0. status (%d) %s; "
"NVLS not available, falling back",
status,
ucc_status_string(status));
return status;
Expand Down Expand Up @@ -427,9 +467,10 @@ ucc_status_t ucc_tl_cuda_nvls_init(
&nvls->local_handle.data.posix.handle);
}
if (status != UCC_OK) {
tl_error(
tl_debug(
UCC_TL_TEAM_LIB(team),
"failed to create multicast object. status (%d) %s",
"failed to create multicast object. status (%d) %s; "
"NVLS not available, falling back",
status,
ucc_status_string(status));
/* Keep going to unblock peers waiting in the allgather;
Expand Down Expand Up @@ -480,14 +521,22 @@ ucc_status_t ucc_tl_cuda_nvls_init(
team->state = UCC_TL_CUDA_NVLS_STATE_IMPORT_HANDLE;
// fall through
case UCC_TL_CUDA_NVLS_STATE_IMPORT_HANDLE:
/* Optimistically assume this rank is ready; cleared below on any local
* import failure so STATE_SYNC_STATUS can disable NVLS team-wide. */
nvls->init_ready = 1;
/* Non-root ranks check the status field broadcast by rank 0 before
* attempting to import a potentially garbage handle. */
if (UCC_TL_TEAM_RANK(team) != 0) {
if (nvls->share_data[0].status != UCC_OK) {
tl_warn(UCC_TL_TEAM_LIB(team),
"NVLS: rank 0 failed to create multicast object "
"(status=%d); disabling NVLS for this team",
nvls->share_data[0].status);
/* Rank 0 failed to create the multicast object. Every non-root
* rank observes this via the broadcast status, and rank 0 bails
* out below through status_supported, so the failure is already
* symmetric and it is safe to clean up directly. */
tl_debug(
UCC_TL_TEAM_LIB(team),
"NVLS: rank 0 failed to create multicast object "
"(status=%d); disabling NVLS for this team",
nvls->share_data[0].status);
status = nvls->share_data[0].status;
nvls->status_supported = status;
goto cleanup;
Expand All @@ -500,17 +549,90 @@ ucc_status_t ucc_tl_cuda_nvls_init(
team, &nvls->share_data[0], &mc_handle);
}
if (status != UCC_OK) {
goto cleanup;
/* Import failed on this rank only (e.g. pidfd_getfd EPERM in a
* restricted container). Do NOT clean up directly: rank 0 and
* the ranks that imported successfully would block forever in
* the collective cuMulticastBindAddr barrier. Record the local
* failure and let STATE_SYNC_STATUS propagate it so all ranks
* disable NVLS together. */
nvls->init_ready = 0;
} else {
nvls->mc_handle = mc_handle;
}
nvls->mc_handle = mc_handle;
}
if (nvls->status_supported != UCC_OK) {
// Propagate the supported status to the caller
/* Rank 0 local creation failure: non-root ranks already saw this via
* the broadcast status, so cleaning up here keeps the team
* symmetric. */
status = nvls->status_supported;
goto cleanup;
}
team->state = UCC_TL_CUDA_NVLS_STATE_SYNC_STATUS;
// fall through
case UCC_TL_CUDA_NVLS_STATE_SYNC_STATUS:
{
/* Collectively agree on whether every rank imported the multicast
* handle. If any rank failed, all ranks skip the multicast binding and
* fall back together; otherwise the ranks that succeeded would deadlock
* in cuMulticastBindAddr waiting for the failed rank to call
* cuMulticastAddDevice. */
ucc_rank_t r;

if (nvls->init_sync_data == NULL) {
nvls->init_sync_data = (char *)ucc_malloc(
UCC_TL_TEAM_SIZE(team), "nvls_init_sync");
if (!nvls->init_sync_data) {
status = UCC_ERR_NO_MEMORY;
goto cleanup;
}
nvls->init_sync_data[UCC_TL_TEAM_RANK(team)] = (char)
nvls->init_ready;
}

if (team->oob_req == NULL) {
status = team->oob.allgather(
&nvls->init_sync_data[UCC_TL_TEAM_RANK(team)],
nvls->init_sync_data,
1,
team->oob.coll_info,
&team->oob_req);
if (status != UCC_OK) {
tl_error(
UCC_TL_TEAM_LIB(team),
"failed to initiate NVLS init status exchange");
goto cleanup;
}
}

status = team->oob.req_test(team->oob_req);
if (status > 0) {
return UCC_INPROGRESS;
}
if (status < 0) {
tl_error(UCC_TL_TEAM_LIB(team), "NVLS init status exchange failed");
team->oob.req_free(team->oob_req);
team->oob_req = NULL;
goto cleanup;
}
team->oob.req_free(team->oob_req);
team->oob_req = NULL;

for (r = 0; r < UCC_TL_TEAM_SIZE(team); r++) {
if (nvls->init_sync_data[r] == 0) {
tl_debug(
UCC_TL_TEAM_LIB(team),
"NVLS: rank %u could not initialize NVLS; disabling "
"NVLS for the whole team and falling back",
r);
status = UCC_ERR_NOT_SUPPORTED;
goto cleanup;
}
}
ucc_free(nvls->init_sync_data);
nvls->init_sync_data = NULL;
team->state = UCC_TL_CUDA_NVLS_STATE_ADD_DEVICE;
// fall through
}
case UCC_TL_CUDA_NVLS_STATE_ADD_DEVICE:
{
// Allocate physical memory
Expand Down Expand Up @@ -686,6 +808,9 @@ ucc_status_t ucc_tl_cuda_nvls_init(
ucc_free(nvls->barrier_data);
nvls->barrier_data = NULL;

/* NVLS is fully initialized for this team; only now may collectives be
* routed to the NVLS algorithms (see ucc_tl_cuda_get_supported_colls). */
nvls->enabled = 1;
tl_debug(UCC_TL_TEAM_LIB(team),
"NVLS init: rank %d OOB barrier complete — team ready",
UCC_TL_TEAM_RANK(team));
Expand All @@ -707,6 +832,10 @@ ucc_status_t ucc_tl_cuda_nvls_init(
ucc_free(nvls->barrier_data);
nvls->barrier_data = NULL;
}
if (nvls->init_sync_data) {
ucc_free(nvls->init_sync_data);
nvls->init_sync_data = NULL;
}

// Clean up CUDA resources - check local variables for partial allocations
// Unmap and free multicast VA if it was reserved/mapped
Expand Down Expand Up @@ -807,5 +936,9 @@ ucc_status_t ucc_tl_cuda_nvls_destroy(ucc_tl_cuda_team_t *team)
ucc_free(team->nvls.barrier_data);
team->nvls.barrier_data = NULL;
}
if (team->nvls.init_sync_data) {
ucc_free(team->nvls.init_sync_data);
team->nvls.init_sync_data = NULL;
}
return UCC_OK;
}
13 changes: 13 additions & 0 deletions src/components/tl/cuda/tl_cuda_nvls.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ typedef enum {
UCC_TL_CUDA_NVLS_STATE_INIT,
UCC_TL_CUDA_NVLS_STATE_SHARE_HANDLES,
UCC_TL_CUDA_NVLS_STATE_IMPORT_HANDLE,
UCC_TL_CUDA_NVLS_STATE_SYNC_STATUS,
UCC_TL_CUDA_NVLS_STATE_ADD_DEVICE,
UCC_TL_CUDA_NVLS_STATE_BARRIER,
} ucc_tl_cuda_nvls_state_t;
Expand Down Expand Up @@ -74,6 +75,18 @@ typedef struct ucc_tl_cuda_nvls {
size_t gran;
/* temporary buffer for STATE_BARRIER */
char *barrier_data;
/* Whether this rank locally succeeded in importing the multicast handle.
* Exchanged across ranks in STATE_SYNC_STATUS so a per-rank import failure
* (e.g. pidfd_getfd EPERM) disables NVLS on the whole team instead of
* deadlocking the ranks that succeeded in cuMulticastBindAddr. */
int init_ready;
/* temporary buffer for STATE_SYNC_STATUS allgather */
char *init_sync_data;
/* Set to 1 only when NVLS initialization fully succeeded for this team.
* Gates advertising the NVLS collectives: the hardware may support
* multicast while NVLS init fell back (e.g. peer fd import denied), and in
* that case collectives must not be routed to the NVLS algorithms. */
int enabled;
} ucc_tl_cuda_nvls_t;

typedef struct ucc_tl_cuda_nvls_control {
Expand Down
Loading
Loading