Skip to content

Commit b5999ed

Browse files
Merge branch 'branch-25.12' into pinned_host_buffer
2 parents c4e48de + c9e10a0 commit b5999ed

File tree

7 files changed

+36
-55
lines changed

7 files changed

+36
-55
lines changed

.devcontainer/Dockerfile

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -34,8 +34,7 @@ ENV HISTFILE="/home/coder/.cache/._bash_history"
3434
ENV AWS_ROLE_ARN="arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs"
3535
ENV SCCACHE_REGION="us-east-2"
3636
ENV SCCACHE_BUCKET="rapids-sccache-devs"
37-
# 2hr (1 minute longer than sccache-dist request timeout)
38-
ENV SCCACHE_IDLE_TIMEOUT=7200
37+
ENV SCCACHE_IDLE_TIMEOUT=0
3938

4039
###
4140
# sccache-dist configuration
@@ -46,13 +45,8 @@ ENV DEVCONTAINER_UTILS_ENABLE_SCCACHE_DIST=1
4645
ENV SCCACHE_DIST_FALLBACK_TO_LOCAL_COMPILE=true
4746
# Retry transient errors 4 times (for a total of 5 attempts)
4847
ENV SCCACHE_DIST_MAX_RETRIES=4
49-
ENV SCCACHE_DIST_CONNECT_TIMEOUT=30
50-
ENV SCCACHE_DIST_CONNECTION_POOL=false
5148
# 1hr 59min (to accommodate debug builds)
5249
ENV SCCACHE_DIST_REQUEST_TIMEOUT=7140
53-
ENV SCCACHE_DIST_KEEPALIVE_ENABLED=true
54-
ENV SCCACHE_DIST_KEEPALIVE_INTERVAL=20
55-
ENV SCCACHE_DIST_KEEPALIVE_TIMEOUT=600
5650
ENV SCCACHE_DIST_URL="https://${TARGETARCH}.linux.sccache.rapids.nvidia.com"
5751

5852
# Build as much in parallel as possible

conda/environments/all_cuda-129_arch-x86_64.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ dependencies:
4242
- pytest
4343
- python>=3.10,<3.14
4444
- rapids-build-backend>=0.4.0,<0.5.0.dev0
45-
- ray-default==2.42.*,>=0.0.0a0
45+
- ray-default>=2.49
4646
- rmm==25.12.*,>=0.0.0a0
4747
- scikit-build-core>=0.10.0
4848
- sphinx

conda/environments/all_cuda-130_arch-x86_64.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ dependencies:
4242
- pytest
4343
- python>=3.10,<3.14
4444
- rapids-build-backend>=0.4.0,<0.5.0.dev0
45-
- ray-default==2.42.*,>=0.0.0a0
45+
- ray-default>=2.49
4646
- rmm==25.12.*,>=0.0.0a0
4747
- scikit-build-core>=0.10.0
4848
- sphinx

cpp/compute-sanitizer-suppressions.xml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -251,7 +251,7 @@
251251
</frame>
252252
<frame>
253253
<func>std::invoke_result&lt;rapidsmpf::buffer_copy</func>
254-
<path>/home/mkristensen/repos/rapidsmpf/cpp/include/rapidsmpf/buffer/buffer.hpp</path>
254+
<path>*/buffer/buffer.hpp</path>
255255
<module>.*/librapidsmpf.so</module>
256256
</frame>
257257
<frame>

cpp/src/communicator/ucxx.cpp

Lines changed: 9 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1172,21 +1172,19 @@ std::unique_ptr<Communicator::Future> UCXX::recv_sync_host_data(
11721172
std::pair<std::unique_ptr<std::vector<uint8_t>>, Rank> UCXX::recv_any(Tag tag) {
11731173
progress_worker();
11741174
auto probe = shared_resources_->get_worker()->tagProbe(
1175-
::ucxx::Tag(static_cast<int>(tag)), UserTagMask
1175+
::ucxx::Tag(static_cast<int>(tag)), UserTagMask, true
11761176
);
1177-
auto msg_available = probe.first;
1178-
auto info = probe.second;
1179-
auto sender_rank = static_cast<Rank>(info.senderTag >> 32);
1177+
auto msg_available = probe->isMatched();
11801178
if (!msg_available) {
11811179
return {nullptr, 0};
11821180
}
1181+
auto info = probe->getInfo();
1182+
auto sender_rank = static_cast<Rank>(info.senderTag >> 32);
11831183
auto msg = std::make_unique<std::vector<uint8_t>>(
11841184
info.length
11851185
); // TODO: choose between host and device
11861186

1187-
auto req = shared_resources_->get_worker()->tagRecv(
1188-
msg->data(), msg->size(), ::ucxx::Tag(static_cast<int>(tag)), UserTagMask
1189-
);
1187+
auto req = shared_resources_->get_worker()->tagRecvWithHandle(msg->data(), probe);
11901188

11911189
while (!req->isCompleted()) {
11921190
progress_worker();
@@ -1199,23 +1197,18 @@ std::pair<std::unique_ptr<std::vector<uint8_t>>, Rank> UCXX::recv_any(Tag tag) {
11991197
std::unique_ptr<std::vector<uint8_t>> UCXX::recv_from(Rank src, Tag tag) {
12001198
progress_worker();
12011199
auto probe = shared_resources_->get_worker()->tagProbe(
1202-
tag_with_rank(src, static_cast<int>(tag)), ::ucxx::TagMaskFull
1200+
tag_with_rank(src, static_cast<int>(tag)), ::ucxx::TagMaskFull, true
12031201
);
1204-
auto msg_available = probe.first;
1205-
auto info = probe.second;
1202+
auto msg_available = probe->isMatched();
12061203
if (!msg_available) {
12071204
return nullptr;
12081205
}
1206+
auto info = probe->getInfo();
12091207
auto msg = std::make_unique<std::vector<uint8_t>>(
12101208
info.length
12111209
); // TODO: choose between host and device
12121210

1213-
auto req = shared_resources_->get_worker()->tagRecv(
1214-
msg->data(),
1215-
msg->size(),
1216-
tag_with_rank(src, static_cast<int>(tag)),
1217-
::ucxx::TagMaskFull
1218-
);
1211+
auto req = shared_resources_->get_worker()->tagRecvWithHandle(msg->data(), probe);
12191212

12201213
while (!req->isCompleted()) {
12211214
progress_worker();

cpp/src/cupti.cpp

Lines changed: 21 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -71,32 +71,35 @@ CuptiMonitor::~CuptiMonitor() {
7171
}
7272

7373
void CuptiMonitor::start_monitoring() {
74-
std::lock_guard<std::mutex> lock(mutex_);
75-
7674
if (monitoring_active_.load()) {
7775
return;
7876
}
7977

80-
CUptiResult cupti_err = subscribe();
81-
if (cupti_err != CUPTI_SUCCESS) {
82-
throw std::runtime_error(
83-
"Failed to initialize CUPTI: " + std::to_string(cupti_err)
84-
);
78+
{
79+
std::lock_guard<std::mutex> lock(mutex_);
80+
81+
CUptiResult cupti_err = subscribe();
82+
if (cupti_err != CUPTI_SUCCESS) {
83+
throw std::runtime_error(
84+
"Failed to initialize CUPTI: " + std::to_string(cupti_err)
85+
);
86+
}
8587
}
8688

8789
monitoring_active_.store(true);
8890

89-
// Capture initial memory state
90-
capture_memory_usage_impl();
91+
{
92+
std::lock_guard<std::mutex> lock(mutex_);
93+
// Capture initial memory state
94+
capture_memory_usage_impl();
9195

92-
if (enable_periodic_sampling_) {
93-
sampling_thread_ = std::thread(&CuptiMonitor::periodic_memory_sampling, this);
96+
if (enable_periodic_sampling_) {
97+
sampling_thread_ = std::thread(&CuptiMonitor::periodic_memory_sampling, this);
98+
}
9499
}
95100
}
96101

97102
void CuptiMonitor::stop_monitoring() {
98-
std::lock_guard<std::mutex> lock(mutex_);
99-
100103
if (!monitoring_active_.load()) {
101104
return;
102105
}
@@ -107,8 +110,11 @@ void CuptiMonitor::stop_monitoring() {
107110
sampling_thread_.join();
108111
}
109112

110-
// Capture final memory state
111-
capture_memory_usage_impl();
113+
{
114+
std::lock_guard<std::mutex> lock(mutex_);
115+
// Capture final memory state
116+
capture_memory_usage_impl();
117+
}
112118

113119
unsubscribe();
114120
}

dependencies.yaml

Lines changed: 2 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -295,14 +295,10 @@ dependencies:
295295
specific:
296296
- output_types: conda
297297
matrices:
298-
- matrix:
299-
arch: x86_64
300-
py: "3.13"
301-
packages:
302298
- matrix:
303299
arch: x86_64
304300
packages:
305-
- ray-default==2.42.*,>=0.0.0a0
301+
- ray-default>=2.49
306302
- matrix:
307303
arch: aarch64
308304
packages:
@@ -415,18 +411,10 @@ dependencies:
415411
specific:
416412
- output_types: conda
417413
matrices:
418-
- matrix:
419-
arch: x86_64
420-
py: "3.13"
421-
packages:
422-
- pip:
423-
# Ray for Python 3.13 not available from conda-forge, for
424-
# now install it from PyPI just to build docs
425-
- ray==2.45.*,>=0.0.0a0
426414
- matrix:
427415
arch: x86_64
428416
packages:
429-
- ray-default==2.42.*,>=0.0.0a0
417+
- ray-default>=2.49
430418
- matrix:
431419
arch: aarch64
432420
packages:

0 commit comments

Comments
 (0)