Merge branch 'branch-25.12' into pinned_host_buffer

nirandaperera · web-flow · commit b5999edd0d52 · 2025-10-15T11:31:38.000-07:00
diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile
@@ -34,8 +34,7 @@ ENV HISTFILE="/home/coder/.cache/._bash_history"
 ENV AWS_ROLE_ARN="arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs"
 ENV SCCACHE_REGION="us-east-2"
 ENV SCCACHE_BUCKET="rapids-sccache-devs"
-# 2hr (1 minute longer than sccache-dist request timeout)
-ENV SCCACHE_IDLE_TIMEOUT=7200
+ENV SCCACHE_IDLE_TIMEOUT=0
 
 ###
 # sccache-dist configuration
@@ -46,13 +45,8 @@ ENV DEVCONTAINER_UTILS_ENABLE_SCCACHE_DIST=1
 ENV SCCACHE_DIST_FALLBACK_TO_LOCAL_COMPILE=true
 # Retry transient errors 4 times (for a total of 5 attempts)
 ENV SCCACHE_DIST_MAX_RETRIES=4
-ENV SCCACHE_DIST_CONNECT_TIMEOUT=30
-ENV SCCACHE_DIST_CONNECTION_POOL=false
 # 1hr 59min (to accommodate debug builds)
 ENV SCCACHE_DIST_REQUEST_TIMEOUT=7140
-ENV SCCACHE_DIST_KEEPALIVE_ENABLED=true
-ENV SCCACHE_DIST_KEEPALIVE_INTERVAL=20
-ENV SCCACHE_DIST_KEEPALIVE_TIMEOUT=600
 ENV SCCACHE_DIST_URL="https://${TARGETARCH}.linux.sccache.rapids.nvidia.com"
 
 # Build as much in parallel as possible
diff --git a/conda/environments/all_cuda-129_arch-x86_64.yaml b/conda/environments/all_cuda-129_arch-x86_64.yaml
@@ -42,7 +42,7 @@ dependencies:
 - pytest
 - python>=3.10,<3.14
 - rapids-build-backend>=0.4.0,<0.5.0.dev0
-- ray-default==2.42.*,>=0.0.0a0
+- ray-default>=2.49
 - rmm==25.12.*,>=0.0.0a0
 - scikit-build-core>=0.10.0
 - sphinx
diff --git a/conda/environments/all_cuda-130_arch-x86_64.yaml b/conda/environments/all_cuda-130_arch-x86_64.yaml
@@ -42,7 +42,7 @@ dependencies:
 - pytest
 - python>=3.10,<3.14
 - rapids-build-backend>=0.4.0,<0.5.0.dev0
-- ray-default==2.42.*,>=0.0.0a0
+- ray-default>=2.49
 - rmm==25.12.*,>=0.0.0a0
 - scikit-build-core>=0.10.0
 - sphinx
diff --git a/cpp/compute-sanitizer-suppressions.xml b/cpp/compute-sanitizer-suppressions.xml
@@ -251,7 +251,7 @@
       </frame>
       <frame>
         <func>std::invoke_result&lt;rapidsmpf::buffer_copy</func>
-        <path>/home/mkristensen/repos/rapidsmpf/cpp/include/rapidsmpf/buffer/buffer.hpp</path>
+        <path>*/buffer/buffer.hpp</path>
         <module>.*/librapidsmpf.so</module>
       </frame>
       <frame>
diff --git a/cpp/src/communicator/ucxx.cpp b/cpp/src/communicator/ucxx.cpp
@@ -1172,21 +1172,19 @@ std::unique_ptr<Communicator::Future> UCXX::recv_sync_host_data(
 std::pair<std::unique_ptr<std::vector<uint8_t>>, Rank> UCXX::recv_any(Tag tag) {
     progress_worker();
     auto probe = shared_resources_->get_worker()->tagProbe(
-        ::ucxx::Tag(static_cast<int>(tag)), UserTagMask
+        ::ucxx::Tag(static_cast<int>(tag)), UserTagMask, true
     );
-    auto msg_available = probe.first;
-    auto info = probe.second;
-    auto sender_rank = static_cast<Rank>(info.senderTag >> 32);
+    auto msg_available = probe->isMatched();
     if (!msg_available) {
         return {nullptr, 0};
     }
+    auto info = probe->getInfo();
+    auto sender_rank = static_cast<Rank>(info.senderTag >> 32);
     auto msg = std::make_unique<std::vector<uint8_t>>(
         info.length
     );  // TODO: choose between host and device
 
-    auto req = shared_resources_->get_worker()->tagRecv(
-        msg->data(), msg->size(), ::ucxx::Tag(static_cast<int>(tag)), UserTagMask
-    );
+    auto req = shared_resources_->get_worker()->tagRecvWithHandle(msg->data(), probe);
 
     while (!req->isCompleted()) {
         progress_worker();
@@ -1199,23 +1197,18 @@ std::pair<std::unique_ptr<std::vector<uint8_t>>, Rank> UCXX::recv_any(Tag tag) {
 std::unique_ptr<std::vector<uint8_t>> UCXX::recv_from(Rank src, Tag tag) {
     progress_worker();
     auto probe = shared_resources_->get_worker()->tagProbe(
-        tag_with_rank(src, static_cast<int>(tag)), ::ucxx::TagMaskFull
+        tag_with_rank(src, static_cast<int>(tag)), ::ucxx::TagMaskFull, true
     );
-    auto msg_available = probe.first;
-    auto info = probe.second;
+    auto msg_available = probe->isMatched();
     if (!msg_available) {
         return nullptr;
     }
+    auto info = probe->getInfo();
     auto msg = std::make_unique<std::vector<uint8_t>>(
         info.length
     );  // TODO: choose between host and device
 
-    auto req = shared_resources_->get_worker()->tagRecv(
-        msg->data(),
-        msg->size(),
-        tag_with_rank(src, static_cast<int>(tag)),
-        ::ucxx::TagMaskFull
-    );
+    auto req = shared_resources_->get_worker()->tagRecvWithHandle(msg->data(), probe);
 
     while (!req->isCompleted()) {
         progress_worker();
diff --git a/cpp/src/cupti.cpp b/cpp/src/cupti.cpp
@@ -71,32 +71,35 @@ CuptiMonitor::~CuptiMonitor() {
 }
 
 void CuptiMonitor::start_monitoring() {
-    std::lock_guard<std::mutex> lock(mutex_);
-
     if (monitoring_active_.load()) {
         return;
     }
 
-    CUptiResult cupti_err = subscribe();
-    if (cupti_err != CUPTI_SUCCESS) {
-        throw std::runtime_error(
-            "Failed to initialize CUPTI: " + std::to_string(cupti_err)
-        );
+    {
+        std::lock_guard<std::mutex> lock(mutex_);
+
+        CUptiResult cupti_err = subscribe();
+        if (cupti_err != CUPTI_SUCCESS) {
+            throw std::runtime_error(
+                "Failed to initialize CUPTI: " + std::to_string(cupti_err)
+            );
+        }
     }
 
     monitoring_active_.store(true);
 
-    // Capture initial memory state
-    capture_memory_usage_impl();
+    {
+        std::lock_guard<std::mutex> lock(mutex_);
+        // Capture initial memory state
+        capture_memory_usage_impl();
 
-    if (enable_periodic_sampling_) {
-        sampling_thread_ = std::thread(&CuptiMonitor::periodic_memory_sampling, this);
+        if (enable_periodic_sampling_) {
+            sampling_thread_ = std::thread(&CuptiMonitor::periodic_memory_sampling, this);
+        }
     }
 }
 
 void CuptiMonitor::stop_monitoring() {
-    std::lock_guard<std::mutex> lock(mutex_);
-
     if (!monitoring_active_.load()) {
         return;
     }
@@ -107,8 +110,11 @@ void CuptiMonitor::stop_monitoring() {
         sampling_thread_.join();
     }
 
-    // Capture final memory state
-    capture_memory_usage_impl();
+    {
+        std::lock_guard<std::mutex> lock(mutex_);
+        // Capture final memory state
+        capture_memory_usage_impl();
+    }
 
     unsubscribe();
 }
diff --git a/dependencies.yaml b/dependencies.yaml
@@ -295,14 +295,10 @@ dependencies:
     specific:
       - output_types: conda
         matrices:
-          - matrix:
-              arch: x86_64
-              py: "3.13"
-            packages:
           - matrix:
               arch: x86_64
             packages:
-              - ray-default==2.42.*,>=0.0.0a0
+              - ray-default>=2.49
           - matrix:
               arch: aarch64
             packages:
@@ -415,18 +411,10 @@ dependencies:
     specific:
       - output_types: conda
         matrices:
-          - matrix:
-              arch: x86_64
-              py: "3.13"
-            packages:
-              - pip:
-                # Ray for Python 3.13 not available from conda-forge, for
-                # now install it from PyPI just to build docs
-                - ray==2.45.*,>=0.0.0a0
           - matrix:
               arch: x86_64
             packages:
-              - ray-default==2.42.*,>=0.0.0a0
+              - ray-default>=2.49
           - matrix:
               arch: aarch64
             packages: