From 10db7720971660d471a324aea2cfd1dae0241944 Mon Sep 17 00:00:00 2001
From: Ilya Kryukov <ikryukov@nvidia.com>
Date: Wed, 1 Jul 2026 13:34:13 +0000
Subject: [PATCH 1/5] TL/CUDA: fix NVLS init deadlock on import fail

When a rank failed to import the multicast handle during NVLS team
initialization (e.g. pidfd_getfd returned EPERM in a container without
CAP_SYS_PTRACE or with a restrictive seccomp filter), it bailed out and
fell back while the other ranks proceeded into the collective
cuMulticastBindAddr barrier and blocked forever, deadlocking the team
(observed as a hang in the first DDP collective in test_c10d_ucc).

Add a STATE_SYNC_STATUS step that allgathers each rank's import status
after STATE_IMPORT_HANDLE. If any rank failed, all ranks disable NVLS
together and fall back symmetrically, avoiding the deadlock.

Also downgrade the expected NVLS init/fallback failures (pidfd open,
peer fd import, POSIX/fabric handle import, multicast object creation,
the rank 0 creation-failure notice, top-level NVLS init failure) from
tl_error/tl_warn to tl_debug, matching how TL/SHARP reports failed
initialization, so a supported fallback does not emit a spurious error.
---
 src/components/tl/cuda/tl_cuda_nvls.c | 139 ++++++++++++++++++++++----
 src/components/tl/cuda/tl_cuda_nvls.h |   8 ++
 src/components/tl/cuda/tl_cuda_team.c |  10 +-
 3 files changed, 137 insertions(+), 20 deletions(-)

diff --git a/src/components/tl/cuda/tl_cuda_nvls.c b/src/components/tl/cuda/tl_cuda_nvls.c
index 55da6f13ac..4fbda786a1 100644
--- a/src/components/tl/cuda/tl_cuda_nvls.c
+++ b/src/components/tl/cuda/tl_cuda_nvls.c
@@ -217,18 +217,29 @@ static ucc_status_t ucc_tl_cuda_nvls_import_handle_posix(
 
     pid_fd        = syscall(SYS_pidfd_open, target_pid, 0);
     if (pid_fd < 0) {
-        tl_error(
+        /* Expected fallback condition (e.g. restricted ptrace/seccomp in a
+         * container): log at debug level and let the team fall back to
+         * another transport instead of emitting a scary error. */
+        tl_debug(
             UCC_TL_TEAM_LIB(team),
-            "failed to open pidfd for pid %d",
-            target_pid);
+            "failed to open pidfd for pid %d: %s (errno=%d); "
+            "NVLS not available, falling back",
+            target_pid,
+            strerror(errno),
+            errno);
         return UCC_ERR_NO_RESOURCE;
     }
 
     peer_fd = syscall(SYS_pidfd_getfd, pid_fd, export_handle, 0);
     if (peer_fd < 0) {
-        tl_error(
+        /* EPERM here typically means the container lacks the permissions to
+         * import a peer's fd (Yama ptrace_scope, missing CAP_SYS_PTRACE, or a
+         * seccomp filter). This is a supported fallback condition, so log at
+         * debug level rather than error. */
+        tl_debug(
             UCC_TL_TEAM_LIB(team),
-            "failed to get peer fd: %s (errno=%d)",
+            "failed to get peer fd: %s (errno=%d); "
+            "NVLS not available, falling back",
             strerror(errno),
             errno);
         close(pid_fd);
@@ -255,9 +266,10 @@ static ucc_status_t ucc_tl_cuda_nvls_import_handle_posix(
     }
 
     if (status != UCC_OK) {
-        tl_error(
+        tl_debug(
             UCC_TL_TEAM_LIB(team),
-            "failed to import POSIX file descriptor handle from rank 0");
+            "failed to import POSIX file descriptor handle from rank 0; "
+            "NVLS not available, falling back");
         return status;
     }
 
@@ -281,9 +293,10 @@ static ucc_status_t ucc_tl_cuda_nvls_import_handle_fabric(
         mc_handle, &share_data->data.fabric, CU_MEM_HANDLE_TYPE_FABRIC));
 
     if (status != UCC_OK) {
-        tl_error(
+        tl_debug(
             UCC_TL_TEAM_LIB(team),
-            "failed to import fabric handle from rank 0. status (%d) %s",
+            "failed to import fabric handle from rank 0. status (%d) %s; "
+            "NVLS not available, falling back",
             status,
             ucc_status_string(status));
         return status;
@@ -427,9 +440,10 @@ ucc_status_t ucc_tl_cuda_nvls_init(
                     &nvls->local_handle.data.posix.handle);
             }
             if (status != UCC_OK) {
-                tl_error(
+                tl_debug(
                     UCC_TL_TEAM_LIB(team),
-                    "failed to create multicast object. status (%d) %s",
+                    "failed to create multicast object. status (%d) %s; "
+                    "NVLS not available, falling back",
                     status,
                     ucc_status_string(status));
                 /* Keep going to unblock peers waiting in the allgather;
@@ -480,14 +494,22 @@ ucc_status_t ucc_tl_cuda_nvls_init(
         team->state = UCC_TL_CUDA_NVLS_STATE_IMPORT_HANDLE;
         // fall through
     case UCC_TL_CUDA_NVLS_STATE_IMPORT_HANDLE:
+        /* Optimistically assume this rank is ready; cleared below on any local
+         * import failure so STATE_SYNC_STATUS can disable NVLS team-wide. */
+        nvls->init_ready = 1;
         /* Non-root ranks check the status field broadcast by rank 0 before
          * attempting to import a potentially garbage handle. */
         if (UCC_TL_TEAM_RANK(team) != 0) {
             if (nvls->share_data[0].status != UCC_OK) {
-                tl_warn(UCC_TL_TEAM_LIB(team),
-                        "NVLS: rank 0 failed to create multicast object "
-                        "(status=%d); disabling NVLS for this team",
-                        nvls->share_data[0].status);
+                /* Rank 0 failed to create the multicast object. Every non-root
+                 * rank observes this via the broadcast status, and rank 0 bails
+                 * out below through status_supported, so the failure is already
+                 * symmetric and it is safe to clean up directly. */
+                tl_debug(
+                    UCC_TL_TEAM_LIB(team),
+                    "NVLS: rank 0 failed to create multicast object "
+                    "(status=%d); disabling NVLS for this team",
+                    nvls->share_data[0].status);
                 status = nvls->share_data[0].status;
                 nvls->status_supported = status;
                 goto cleanup;
@@ -500,17 +522,90 @@ ucc_status_t ucc_tl_cuda_nvls_init(
                     team, &nvls->share_data[0], &mc_handle);
             }
             if (status != UCC_OK) {
-                goto cleanup;
+                /* Import failed on this rank only (e.g. pidfd_getfd EPERM in a
+                 * restricted container). Do NOT clean up directly: rank 0 and
+                 * the ranks that imported successfully would block forever in
+                 * the collective cuMulticastBindAddr barrier. Record the local
+                 * failure and let STATE_SYNC_STATUS propagate it so all ranks
+                 * disable NVLS together. */
+                nvls->init_ready = 0;
+            } else {
+                nvls->mc_handle = mc_handle;
             }
-            nvls->mc_handle = mc_handle;
         }
         if (nvls->status_supported != UCC_OK) {
-            // Propagate the supported status to the caller
+            /* Rank 0 local creation failure: non-root ranks already saw this via
+             * the broadcast status, so cleaning up here keeps the team
+             * symmetric. */
             status = nvls->status_supported;
             goto cleanup;
         }
+        team->state = UCC_TL_CUDA_NVLS_STATE_SYNC_STATUS;
+        // fall through
+    case UCC_TL_CUDA_NVLS_STATE_SYNC_STATUS:
+    {
+        /* Collectively agree on whether every rank imported the multicast
+         * handle. If any rank failed, all ranks skip the multicast binding and
+         * fall back together; otherwise the ranks that succeeded would deadlock
+         * in cuMulticastBindAddr waiting for the failed rank to call
+         * cuMulticastAddDevice. */
+        ucc_rank_t r;
+
+        if (nvls->init_sync_data == NULL) {
+            nvls->init_sync_data = (char *)ucc_malloc(
+                UCC_TL_TEAM_SIZE(team), "nvls_init_sync");
+            if (!nvls->init_sync_data) {
+                status = UCC_ERR_NO_MEMORY;
+                goto cleanup;
+            }
+            nvls->init_sync_data[UCC_TL_TEAM_RANK(team)] = (char)
+                                                               nvls->init_ready;
+        }
+
+        if (team->oob_req == NULL) {
+            status = team->oob.allgather(
+                &nvls->init_sync_data[UCC_TL_TEAM_RANK(team)],
+                nvls->init_sync_data,
+                1,
+                team->oob.coll_info,
+                &team->oob_req);
+            if (status != UCC_OK) {
+                tl_error(
+                    UCC_TL_TEAM_LIB(team),
+                    "failed to initiate NVLS init status exchange");
+                goto cleanup;
+            }
+        }
+
+        status = team->oob.req_test(team->oob_req);
+        if (status > 0) {
+            return UCC_INPROGRESS;
+        }
+        if (status < 0) {
+            tl_error(UCC_TL_TEAM_LIB(team), "NVLS init status exchange failed");
+            team->oob.req_free(team->oob_req);
+            team->oob_req = NULL;
+            goto cleanup;
+        }
+        team->oob.req_free(team->oob_req);
+        team->oob_req = NULL;
+
+        for (r = 0; r < UCC_TL_TEAM_SIZE(team); r++) {
+            if (nvls->init_sync_data[r] == 0) {
+                tl_debug(
+                    UCC_TL_TEAM_LIB(team),
+                    "NVLS: rank %u could not initialize NVLS; disabling "
+                    "NVLS for the whole team and falling back",
+                    r);
+                status = UCC_ERR_NOT_SUPPORTED;
+                goto cleanup;
+            }
+        }
+        ucc_free(nvls->init_sync_data);
+        nvls->init_sync_data = NULL;
         team->state = UCC_TL_CUDA_NVLS_STATE_ADD_DEVICE;
         // fall through
+    }
     case UCC_TL_CUDA_NVLS_STATE_ADD_DEVICE:
     {
         // Allocate physical memory
@@ -707,6 +802,10 @@ ucc_status_t ucc_tl_cuda_nvls_init(
         ucc_free(nvls->barrier_data);
         nvls->barrier_data = NULL;
     }
+    if (nvls->init_sync_data) {
+        ucc_free(nvls->init_sync_data);
+        nvls->init_sync_data = NULL;
+    }
 
     // Clean up CUDA resources - check local variables for partial allocations
     // Unmap and free multicast VA if it was reserved/mapped
@@ -807,5 +906,9 @@ ucc_status_t ucc_tl_cuda_nvls_destroy(ucc_tl_cuda_team_t *team)
         ucc_free(team->nvls.barrier_data);
         team->nvls.barrier_data = NULL;
     }
+    if (team->nvls.init_sync_data) {
+        ucc_free(team->nvls.init_sync_data);
+        team->nvls.init_sync_data = NULL;
+    }
     return UCC_OK;
 }
diff --git a/src/components/tl/cuda/tl_cuda_nvls.h b/src/components/tl/cuda/tl_cuda_nvls.h
index 934a9a7f80..bbf3284ea7 100644
--- a/src/components/tl/cuda/tl_cuda_nvls.h
+++ b/src/components/tl/cuda/tl_cuda_nvls.h
@@ -39,6 +39,7 @@ typedef enum {
     UCC_TL_CUDA_NVLS_STATE_INIT,
     UCC_TL_CUDA_NVLS_STATE_SHARE_HANDLES,
     UCC_TL_CUDA_NVLS_STATE_IMPORT_HANDLE,
+    UCC_TL_CUDA_NVLS_STATE_SYNC_STATUS,
     UCC_TL_CUDA_NVLS_STATE_ADD_DEVICE,
     UCC_TL_CUDA_NVLS_STATE_BARRIER,
 } ucc_tl_cuda_nvls_state_t;
@@ -74,6 +75,13 @@ typedef struct ucc_tl_cuda_nvls {
     size_t                       gran;
     /* temporary buffer for STATE_BARRIER */
     char                        *barrier_data;
+    /* Whether this rank locally succeeded in importing the multicast handle.
+     * Exchanged across ranks in STATE_SYNC_STATUS so a per-rank import failure
+     * (e.g. pidfd_getfd EPERM) disables NVLS on the whole team instead of
+     * deadlocking the ranks that succeeded in cuMulticastBindAddr. */
+    int                          init_ready;
+    /* temporary buffer for STATE_SYNC_STATUS allgather */
+    char                        *init_sync_data;
 } ucc_tl_cuda_nvls_t;
 
 typedef struct ucc_tl_cuda_nvls_control {
diff --git a/src/components/tl/cuda/tl_cuda_team.c b/src/components/tl/cuda/tl_cuda_team.c
index 85d875e4b4..fb14928e31 100644
--- a/src/components/tl/cuda/tl_cuda_team.c
+++ b/src/components/tl/cuda/tl_cuda_team.c
@@ -470,8 +470,14 @@ ucc_status_t ucc_tl_cuda_team_create_test(ucc_base_team_t *tl_team)
     case UCC_OK:
         break;
     default:
-        tl_error(lib,
-            "failed to initialize NVLS with status (%d) %s",
+        /* NVLS init failure is a supported fallback condition (the team is
+         * marked NOT_SUPPORTED below so the CL retries another TL). Log at
+         * debug level to avoid emitting an error when a fallback exists,
+         * mirroring how TL/SHARP reports failed SHARP initialization. */
+        tl_debug(
+            lib,
+            "failed to initialize NVLS with status (%d) %s; "
+            "falling back to another transport",
             status,
             ucc_status_string(status));
         // For multi-node teams in NVLS-only mode, no IPC resources were allocated

From aa574d7d267b91893a0d3edad8e881440174179d Mon Sep 17 00:00:00 2001
From: Ilya Kryukov <ikryukov@nvidia.com>
Date: Thu, 2 Jul 2026 14:48:09 +0000
Subject: [PATCH 2/5] EC/CUDA: fix persistent executor shutdown hang

ucc_cuda_executor_persistent_stop() signals the persistent GPU kernel to
exit by writing eee->pidx = -1 and the SHUTDOWN state into device-mapped
(cudaHostAllocMapped, zero-copy) memory, then busy-waits for the kernel to
write SHUTDOWN_ACK back.

The shutdown flag was published without any memory barrier that orders the
store against the GPU. On strongly-ordered CPUs (x86) this happens to work,
but on weakly-ordered CPUs (aarch64, e.g. Grace/GB200/VR200) the
inner-shareable CPU fences used elsewhere do not order against the GPU's
shareability domain, so the persistent kernel may never observe pidx == -1.
It then never exits and never acknowledges the shutdown, leaving the CPU
spinning forever in the stop loop. This stalls the UCC progress thread and,
with the PyTorch UCC backend, manifests as a hang in the first/teardown
collective (e.g. test_ddp_checkpointing_dynamic_module hanging in an
all_gather/barrier).

Publish the shutdown flag with ucc_memory_bus_store_fence() (outer-shareable
on aarch64, sfence on x86), which is defined specifically to synchronize
write-back and device-mapped memory. The ack is a volatile flag in coherent
device-mapped memory, so it is observed without a load fence in the wait loop.
---
 .../ec/cuda/ec_cuda_executor_persistent.c        | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/src/components/ec/cuda/ec_cuda_executor_persistent.c b/src/components/ec/cuda/ec_cuda_executor_persistent.c
index 54e4cafd44..1c75c5a276 100644
--- a/src/components/ec/cuda/ec_cuda_executor_persistent.c
+++ b/src/components/ec/cuda/ec_cuda_executor_persistent.c
@@ -163,7 +163,21 @@ ucc_status_t ucc_cuda_executor_persistent_stop(ucc_ee_executor_t *executor)
                (*st != UCC_EC_CUDA_EXECUTOR_SHUTDOWN));
     *st = UCC_EC_CUDA_EXECUTOR_SHUTDOWN;
     eee->pidx = -1;
-    while(*st != UCC_EC_CUDA_EXECUTOR_SHUTDOWN_ACK) { }
+    /* state/pidx live in device-mapped (zero-copy) host memory the persistent
+     * kernel polls. Publish these stores to the GPU before spinning: on
+     * weakly-ordered CPUs (aarch64/Grace) the store to pidx may sit in the CPU
+     * store buffer while this loop spins, and the cheaper inner-shareable CPU
+     * store fence only orders against other CPUs, not the GPU. The bus
+     * (outer-shareable) store fence drains/orders the store to the domain the
+     * GPU observes; without it the kernel never sees pidx == -1, never writes
+     * SHUTDOWN_ACK, and this loop hangs forever. */
+    ucc_memory_bus_store_fence();
+    /* No load fence here: st is volatile (re-read every iteration) and points
+     * to coherent device-mapped memory, so the kernel's SHUTDOWN_ACK write
+     * becomes visible on its own. A fence would only order accesses, not force
+     * a cache re-read, and there is no dependent load after the poll. */
+    while (*st != UCC_EC_CUDA_EXECUTOR_SHUTDOWN_ACK) {
+    }
     eee->super.ee_context = NULL;
     eee->state = UCC_EC_CUDA_EXECUTOR_INITIALIZED;
 

From 4c9903acbf30cb7f719bf9ca954163abd089ea4a Mon Sep 17 00:00:00 2001
From: Ilya Kryukov <ikryukov@nvidia.com>
Date: Wed, 1 Jul 2026 13:34:13 +0000
Subject: [PATCH 3/5] EC/CUDA: bus fence when posting persistent tasks

task_post() writes task args into device-mapped (zero-copy) memory and then
publishes them by advancing pidx, which the persistent GPU kernel polls. The
ordering "task args visible before pidx" was enforced with the inner-shareable
CPU store fence, which does not order stores against the GPU's shareability
domain on weakly-ordered CPUs (aarch64/Grace). Use ucc_memory_bus_store_fence()
(outer-shareable), consistent with the shutdown path, so the kernel cannot
observe an advanced pidx with stale task args.
---
 src/components/ec/cuda/ec_cuda_executor_persistent.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/components/ec/cuda/ec_cuda_executor_persistent.c b/src/components/ec/cuda/ec_cuda_executor_persistent.c
index 1c75c5a276..ef72d610c1 100644
--- a/src/components/ec/cuda/ec_cuda_executor_persistent.c
+++ b/src/components/ec/cuda/ec_cuda_executor_persistent.c
@@ -70,7 +70,12 @@ ucc_cuda_executor_persistent_task_post(ucc_ee_executor_t *executor,
         memcpy(ee_task->subtasks[0], task_args,
                sizeof(ucc_ee_executor_task_args_t));
     }
-    ucc_memory_cpu_store_fence();
+    /* tasks[] and pidx live in device-mapped (zero-copy) host memory shared
+     * with the persistent kernel. Use the bus (outer-shareable) store fence so
+     * the task args are guaranteed visible to the GPU before the updated pidx
+     * that publishes them; the inner-shareable CPU fence does not order stores
+     * against the GPU's shareability domain on weakly-ordered CPUs (aarch64). */
+    ucc_memory_bus_store_fence();
     eee->pidx += ee_task->num_subtasks;
     if (ucc_ec_cuda.thread_mode == UCC_THREAD_MULTIPLE) {
         ucc_spin_unlock(&eee->tasks_lock);

From 357b42dda7751da437f7cda2fc0c2d34971a1a03 Mon Sep 17 00:00:00 2001
From: Ilya Kryukov <ikryukov@nvidia.com>
Date: Wed, 1 Jul 2026 13:34:13 +0000
Subject: [PATCH 4/5] TL/CUDA: gate NVLS allreduce on init success

ucc_tl_cuda_get_supported_colls() advertised NVLS ALLREDUCE based on the
static hardware capability check (cuMulticast attributes), not on whether
NVLS actually initialized for the team. When NVLS init falls back for a
single-node team (e.g. peer fd import denied, team size over the NVLS peer
limit, non-uniform ppn), the TL/CUDA team is still created, so allreduce
was routed to ucc_tl_cuda_allreduce_nvls_init with no NVLS resources set up,
producing wrong results / crashes instead of falling back.

Track whether NVLS finished initializing (nvls.enabled, set only after the
final NVLS barrier) and:
- gate advertising ALLREDUCE in get_supported_colls on nvls.enabled, so the
  score map routes allreduce to another TL when NVLS is unavailable;
- defensively return UCC_ERR_NOT_SUPPORTED from ucc_tl_cuda_allreduce_init
  when NVLS is not enabled.
---
 src/components/tl/cuda/allreduce/allreduce.c |  9 ++++++++
 src/components/tl/cuda/tl_cuda_nvls.c        |  3 +++
 src/components/tl/cuda/tl_cuda_nvls.h        |  5 +++++
 src/components/tl/cuda/tl_cuda_team.c        | 23 ++++++++------------
 4 files changed, 26 insertions(+), 14 deletions(-)

diff --git a/src/components/tl/cuda/allreduce/allreduce.c b/src/components/tl/cuda/allreduce/allreduce.c
index 29f52c42e0..e7fe8a4ce3 100644
--- a/src/components/tl/cuda/allreduce/allreduce.c
+++ b/src/components/tl/cuda/allreduce/allreduce.c
@@ -31,6 +31,15 @@ ucc_status_t ucc_tl_cuda_allreduce_init(ucc_base_coll_args_t *coll_args,
 {
     ucc_status_t        status  = UCC_ERR_NOT_IMPLEMENTED;
 #ifdef HAVE_NVLS
+    ucc_tl_cuda_team_t *cuda_team = ucc_derived_of(team, ucc_tl_cuda_team_t);
+
+    /* NVLS is the only allreduce algorithm in TL/CUDA. If NVLS did not
+     * initialize for this team (e.g. it fell back because the peer fd import
+     * was denied), report NOT_SUPPORTED so the collective is served by another
+     * transport instead of dispatching to uninitialized NVLS resources. */
+    if (!cuda_team->nvls.enabled) {
+        return UCC_ERR_NOT_SUPPORTED;
+    }
     /* Use NVLS algorithm as default */
     status = ucc_tl_cuda_allreduce_nvls_init(coll_args, team, task_h);
 #else
diff --git a/src/components/tl/cuda/tl_cuda_nvls.c b/src/components/tl/cuda/tl_cuda_nvls.c
index 4fbda786a1..6fe707bf48 100644
--- a/src/components/tl/cuda/tl_cuda_nvls.c
+++ b/src/components/tl/cuda/tl_cuda_nvls.c
@@ -781,6 +781,9 @@ ucc_status_t ucc_tl_cuda_nvls_init(
         ucc_free(nvls->barrier_data);
         nvls->barrier_data = NULL;
 
+        /* NVLS is fully initialized for this team; only now may collectives be
+         * routed to the NVLS algorithms (see ucc_tl_cuda_get_supported_colls). */
+        nvls->enabled      = 1;
         tl_debug(UCC_TL_TEAM_LIB(team),
                  "NVLS init: rank %d OOB barrier complete — team ready",
                  UCC_TL_TEAM_RANK(team));
diff --git a/src/components/tl/cuda/tl_cuda_nvls.h b/src/components/tl/cuda/tl_cuda_nvls.h
index bbf3284ea7..2b6ad338f6 100644
--- a/src/components/tl/cuda/tl_cuda_nvls.h
+++ b/src/components/tl/cuda/tl_cuda_nvls.h
@@ -82,6 +82,11 @@ typedef struct ucc_tl_cuda_nvls {
     int                          init_ready;
     /* temporary buffer for STATE_SYNC_STATUS allgather */
     char                        *init_sync_data;
+    /* Set to 1 only when NVLS initialization fully succeeded for this team.
+     * Gates advertising the NVLS collectives: the hardware may support
+     * multicast while NVLS init fell back (e.g. peer fd import denied), and in
+     * that case collectives must not be routed to the NVLS algorithms. */
+    int                          enabled;
 } ucc_tl_cuda_nvls_t;
 
 typedef struct ucc_tl_cuda_nvls_control {
diff --git a/src/components/tl/cuda/tl_cuda_team.c b/src/components/tl/cuda/tl_cuda_team.c
index fb14928e31..f2752a90f9 100644
--- a/src/components/tl/cuda/tl_cuda_team.c
+++ b/src/components/tl/cuda/tl_cuda_team.c
@@ -25,9 +25,6 @@ static uint64_t ucc_tl_cuda_get_supported_colls(const ucc_tl_cuda_team_t *team)
 {
     const int is_multinode = !ucc_team_map_is_single_node(
         team->super.super.params.team, team->super.super.params.map);
-#ifdef HAVE_NVLS
-    ucc_status_t status;
-#endif
     // Base TL/CUDA collectives that are supported without NVLS
     uint64_t base_tl_cuda_colls =
         (UCC_COLL_TYPE_ALLTOALL | UCC_COLL_TYPE_ALLTOALLV |
@@ -36,19 +33,17 @@ static uint64_t ucc_tl_cuda_get_supported_colls(const ucc_tl_cuda_team_t *team)
          UCC_COLL_TYPE_REDUCE_SCATTERV);
 
 #ifdef HAVE_NVLS
-    // With NVLS compiled in, ALLREDUCE may be supported via NVLS.
-    // For multi-node teams, advertise ONLY NVLS ALLREDUCE if supported;
-    // otherwise advertise nothing for TL/CUDA (prevent non-NVLS colls).
-    // For single-node teams, advertise base TL/CUDA colls and add ALLREDUCE
-    // only if NVLS is supported.
-    status = ucc_tl_cuda_nvls_check_support(
-        ucc_derived_of(team->super.super.context->lib, ucc_tl_cuda_lib_t),
-        UCC_TL_CUDA_TEAM_CTX(team)->device,
-        is_multinode);
+    // ALLREDUCE is served by the NVLS algorithms, so advertise it only when
+    // NVLS actually initialized for this team (team->nvls.enabled). The device
+    // may support multicast while NVLS init fell back (e.g. peer fd import
+    // denied in a restricted container); in that case allreduce must NOT be
+    // routed to the NVLS path.
+    // For multi-node teams TL/CUDA supports ONLY NVLS ALLREDUCE, so without a
+    // working NVLS it advertises nothing.
     if (is_multinode) {
-        return (status == UCC_OK) ? UCC_COLL_TYPE_ALLREDUCE : 0;
+        return team->nvls.enabled ? UCC_COLL_TYPE_ALLREDUCE : 0;
     }
-    return (status == UCC_OK) ? (base_tl_cuda_colls | UCC_COLL_TYPE_ALLREDUCE)
+    return team->nvls.enabled ? (base_tl_cuda_colls | UCC_COLL_TYPE_ALLREDUCE)
                               : base_tl_cuda_colls;
 #else
     if (is_multinode) {

From cef80e20f4e975e7f20aff1c218d0071e69a57f1 Mon Sep 17 00:00:00 2001
From: Ilya Kryukov <ikryukov@nvidia.com>
Date: Wed, 1 Jul 2026 13:34:13 +0000
Subject: [PATCH 5/5] TL/CUDA: warn when NVLS peer fd import denied

When single-node NVLS falls back because importing a peer process file
descriptor is denied (EPERM/EACCES from pidfd_open/pidfd_getfd due to Yama
ptrace_scope, missing CAP_SYS_PTRACE, or a seccomp filter), emit a single
per-process warning that explains the cause and how to enable NVLS (host
sysctl kernel.yama.ptrace_scope=0, docker --cap-add=SYS_PTRACE /
--security-opt seccomp=unconfined, enroot --container-remap-root). The
per-occurrence detail stays at debug level to avoid spam.
---
 src/components/tl/cuda/tl_cuda_nvls.c | 61 +++++++++++++++++++--------
 1 file changed, 44 insertions(+), 17 deletions(-)

diff --git a/src/components/tl/cuda/tl_cuda_nvls.c b/src/components/tl/cuda/tl_cuda_nvls.c
index 6fe707bf48..e47ddbf545 100644
--- a/src/components/tl/cuda/tl_cuda_nvls.c
+++ b/src/components/tl/cuda/tl_cuda_nvls.c
@@ -15,6 +15,8 @@
 #include <sys/syscall.h> // for pidfd_open and pidfd_getfd
 #include <sys/prctl.h>   // for prctl()
 #include <unistd.h>      // for close()
+#include <errno.h>       // for EPERM/EACCES
+#include <string.h>      // for strerror()
 
 /* RHEL 8 glibc headers (kernel 4.18) don't define pidfd syscall numbers */
 #ifndef SYS_pidfd_open
@@ -195,6 +197,41 @@ static ucc_status_t ucc_tl_cuda_nvls_share_handles(
     return status;
 }
 
+/* Report a failed peer-fd import. @what: short description of the failed step;
+ * @err: errno value passed by the caller. Always logs the details at debug
+ * level. If err is EPERM or EACCES (e.g. Yama ptrace_scope, missing
+ * CAP_SYS_PTRACE, or a seccomp filter), also emits one actionable warning per
+ * process; the guard is a plain static flag, so racing threads may repeat it. */
+static void ucc_tl_cuda_nvls_report_peer_import_denied(
+    ucc_tl_cuda_team_t *team, const char *what, int err)
+{
+    static volatile int warned = 0;
+
+    tl_debug(
+        UCC_TL_TEAM_LIB(team),
+        "%s: %s (errno=%d); NVLS not available, falling back",
+        what,
+        strerror(err),
+        err);
+
+    if ((err != EPERM && err != EACCES) || warned) {
+        return;
+    }
+    warned = 1;
+    tl_warn(
+        UCC_TL_TEAM_LIB(team),
+        "NVLS disabled: importing a peer process file descriptor was denied "
+        "(%s). Single-node NVLS needs permission to access peer GPU memory "
+        "handles across processes. To enable NVLS, relax ptrace "
+        "restrictions on the host/container, e.g. host: "
+        "'sysctl -w kernel.yama.ptrace_scope=0'; docker: add "
+        "'--cap-add=SYS_PTRACE' (and if pidfd_getfd is blocked by seccomp, "
+        "'--security-opt seccomp=unconfined'); enroot: run with "
+        "'--container-remap-root' or set 'kernel.yama.ptrace_scope=0' on the "
+        "host. Collectives fall back to another transport in the meantime.",
+        strerror(err));
+}
+
 static ucc_status_t ucc_tl_cuda_nvls_import_handle_posix(
     struct ucc_tl_cuda_team *team, ucc_tl_cuda_nvls_handle_t *share_data,
     CUmemGenericAllocationHandle *mc_handle)
@@ -218,15 +255,10 @@ static ucc_status_t ucc_tl_cuda_nvls_import_handle_posix(
     pid_fd        = syscall(SYS_pidfd_open, target_pid, 0);
     if (pid_fd < 0) {
         /* Expected fallback condition (e.g. restricted ptrace/seccomp in a
-         * container): log at debug level and let the team fall back to
-         * another transport instead of emitting a scary error. */
-        tl_debug(
-            UCC_TL_TEAM_LIB(team),
-            "failed to open pidfd for pid %d: %s (errno=%d); "
-            "NVLS not available, falling back",
-            target_pid,
-            strerror(errno),
-            errno);
+         * container): report and let the team fall back to another transport
+         * instead of emitting a scary error. */
+        ucc_tl_cuda_nvls_report_peer_import_denied(
+            team, "failed to open peer pidfd", errno);
         return UCC_ERR_NO_RESOURCE;
     }
 
@@ -234,14 +266,9 @@ static ucc_status_t ucc_tl_cuda_nvls_import_handle_posix(
     if (peer_fd < 0) {
         /* EPERM here typically means the container lacks the permissions to
          * import a peer's fd (Yama ptrace_scope, missing CAP_SYS_PTRACE, or a
-         * seccomp filter). This is a supported fallback condition, so log at
-         * debug level rather than error. */
-        tl_debug(
-            UCC_TL_TEAM_LIB(team),
-            "failed to get peer fd: %s (errno=%d); "
-            "NVLS not available, falling back",
-            strerror(errno),
-            errno);
+         * seccomp filter). Supported fallback condition. */
+        ucc_tl_cuda_nvls_report_peer_import_denied(
+            team, "failed to get peer fd", errno);
         close(pid_fd);
         return UCC_ERR_NO_RESOURCE;
     }