Skip to content

Temporarily disable cross-device memory access from kernels#3112

Merged
maleadt merged 2 commits into master from
tb/multi
Apr 21, 2026
Merged

Temporarily disable cross-device memory access from kernels#3112
maleadt merged 2 commits into master from
tb/multi

Conversation

@maleadt
Copy link
Copy Markdown
Member

@maleadt maleadt commented Apr 21, 2026

Calling cuMemPoolSetAccess somehow seems to break cuMemcpyPeerAsync. MWE:

// Reproducer for a suspected CUDA driver bug on multi-GPU peer copies from
// stream-ordered memory pools. See comments in the main loop for the expected
// behavior.
//
// Build:
//   gcc -O0 -g -o repro repro.c -I/usr/local/cuda-13.2/targets/x86_64-linux/include -lcuda
// Run (with two P2P-capable GPUs):
//   CUDA_VISIBLE_DEVICES=0,1 ./repro
//
// On Turing sm_75 + driver 590.48.01 + CUDA 13.2 we observe that after
// `cuMemPoolSetAccess` has been called on a custom device memory pool,
// `cuMemcpyPeerAsync` into allocations from that pool silently fails to
// write the data.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cuda.h>

// Abort the process if a CUDA driver API call fails. Evaluates `call` once;
// on any result other than CUDA_SUCCESS, prints the file, line, stringified
// call, and the driver's error name/description (falling back to "?" when
// cuGetErrorName/cuGetErrorString do not recognize the code), then exit(1)s.
// Wrapped in do/while(0) so it behaves as a single statement after `if`.
#define CHECK(call) do { \
    CUresult _r = (call); \
    if (_r != CUDA_SUCCESS) { \
        const char *err_name = NULL, *err_str = NULL; \
        cuGetErrorName(_r, &err_name); \
        cuGetErrorString(_r, &err_str); \
        fprintf(stderr, "%s:%d: %s failed: %s (%s)\n", \
                __FILE__, __LINE__, #call, \
                err_name ? err_name : "?", err_str ? err_str : "?"); \
        exit(1); \
    } \
} while (0)

// Create a custom memory pool for `dev` and set it as the default pool.
// Build a device-local, stream-ordered memory pool for `dev`, install it as
// that device's default pool, and raise the release threshold to the maximum
// so freed allocations stay cached in the pool rather than being returned to
// the driver.
static CUmemoryPool make_pool(CUdevice dev) {
    CUmemPoolProps pool_props;
    memset(&pool_props, 0, sizeof(pool_props));
    pool_props.allocType = CU_MEM_ALLOCATION_TYPE_PINNED;
    pool_props.handleTypes = CU_MEM_HANDLE_TYPE_NONE;
    pool_props.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
    pool_props.location.id = dev;

    CUmemoryPool new_pool;
    CHECK(cuMemPoolCreate(&new_pool, &pool_props));
    CHECK(cuDeviceSetMemPool(dev, new_pool));

    // (cuuint64_t)-1 == UINT64_MAX: never trim the pool back to the OS.
    cuuint64_t keep_everything = (cuuint64_t)-1;
    CHECK(cuMemPoolSetAttribute(new_pool, CU_MEMPOOL_ATTR_RELEASE_THRESHOLD,
                                &keep_everything));
    return new_pool;
}

// Grant `peer` read/write access to allocations from `pool`.
// Allow device `peer` to read and write allocations that come out of `pool`.
static void grant_pool_access(CUmemoryPool pool, CUdevice peer) {
    CUmemAccessDesc access;
    memset(&access, 0, sizeof(access));
    access.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
    access.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
    access.location.id = peer;
    CHECK(cuMemPoolSetAccess(pool, &access, 1));
}

// Driver-bug reproducer: repeatedly peer-copy a buffer from dev0 to dev1,
// where both buffers come from custom stream-ordered memory pools, and verify
// the copied data on the host. Pass --no-pool-access to skip the
// cuMemPoolSetAccess grants and compare failure rates between the two modes.
// Exit status: 0 = all copies verified, 1 = unusable setup (too few devices
// or no bidirectional P2P), 2 = at least one copy returned wrong data.
int main(int argc, char **argv) {
    CHECK(cuInit(0));

    int ndev;
    CHECK(cuDeviceGetCount(&ndev));
    if (ndev < 2) {
        fprintf(stderr, "need at least 2 devices, have %d\n", ndev);
        return 1;
    }

    CUdevice dev0, dev1;
    CHECK(cuDeviceGet(&dev0, 0));
    CHECK(cuDeviceGet(&dev1, 1));

    // The repro exercises copies dev0 -> dev1 plus pool-access grants on both
    // pools, so P2P capability is required in both directions.
    int p2p_01 = 0, p2p_10 = 0;
    CHECK(cuDeviceCanAccessPeer(&p2p_01, dev0, dev1));
    CHECK(cuDeviceCanAccessPeer(&p2p_10, dev1, dev0));
    if (!p2p_01 || !p2p_10) {
        fprintf(stderr, "need bidirectional P2P; 0->1=%d 1->0=%d\n",
                p2p_01, p2p_10);
        return 1;
    }

    // Retain primary contexts for each device.
    CUcontext ctx0, ctx1;
    CHECK(cuDevicePrimaryCtxRetain(&ctx0, dev0));
    CHECK(cuDevicePrimaryCtxRetain(&ctx1, dev1));

    // Enable context-level peer access in both directions. This is what
    // cuMemcpyPeerAsync relies on; the pool-level grants below are a separate
    // mechanism (needed for kernel-originated cross-device access).
    CHECK(cuCtxSetCurrent(ctx0));
    CHECK(cuCtxEnablePeerAccess(ctx1, 0));
    CHECK(cuCtxSetCurrent(ctx1));
    CHECK(cuCtxEnablePeerAccess(ctx0, 0));

    // Create custom pools for both devices, each created with its own
    // device's context current.
    CHECK(cuCtxSetCurrent(ctx0));
    CUmemoryPool pool0 = make_pool(dev0);
    CHECK(cuCtxSetCurrent(ctx1));
    CUmemoryPool pool1 = make_pool(dev1);

    // NVIDIA's documented pattern: grant pool-level peer access ONCE, before
    // any allocations come out of the pool. Per the programming guide,
    // "once a pool is made accessible from a given GPU, it should remain
    // accessible from that GPU for the lifetime of the pool."
    // Suspected bug under test: performing this grant at all makes later
    // peer-direction writes into the pool's allocations silently drop.
    int grant_pool_access_flag = 1;
    if (argc >= 2 && strcmp(argv[1], "--no-pool-access") == 0) {
        grant_pool_access_flag = 0;
    }
    if (grant_pool_access_flag) {
        grant_pool_access(pool0, dev1);
        grant_pool_access(pool1, dev0);
    }

    // Create a stream on each device.
    CHECK(cuCtxSetCurrent(ctx0));
    CUstream s0;
    CHECK(cuStreamCreate(&s0, CU_STREAM_NON_BLOCKING));
    CHECK(cuCtxSetCurrent(ctx1));
    CUstream s1;
    CHECK(cuStreamCreate(&s1, CU_STREAM_NON_BLOCKING));

    // Main loop: per iteration, allocate a on dev0 and b on dev1, initialize
    // a with host data, peer-copy a -> b on dev0's stream, and verify b
    // contains the same data by copying back to the host via dev1's stream.
    const size_t N = 25;
    const size_t bytes = N * sizeof(double);
    double host_a[N], host_b[N];
    // Nonzero, index-dependent fill so a dropped write (zeros) is detected.
    for (size_t i = 0; i < N; i++) host_a[i] = (double)(i + 1) * 1.5;

    int fails = 0;
    const int iters = 500;
    for (int it = 0; it < iters; it++) {
        // alloc a on dev0 (stream-ordered, from pool0)
        CHECK(cuCtxSetCurrent(ctx0));
        CUdeviceptr a;
        CHECK(cuMemAllocFromPoolAsync(&a, bytes, pool0, s0));

        // alloc b on dev1 (stream-ordered, from pool1)
        CHECK(cuCtxSetCurrent(ctx1));
        CUdeviceptr b;
        CHECK(cuMemAllocFromPoolAsync(&b, bytes, pool1, s1));

        // make sure b's allocation is physically done before the peer write
        // touches it from dev0's stream
        CHECK(cuStreamSynchronize(s1));

        // HtoD fill of a on dev0's stream
        CHECK(cuCtxSetCurrent(ctx0));
        CHECK(cuMemcpyHtoDAsync(a, host_a, bytes, s0));

        // peer copy a -> b on dev0's stream
        CHECK(cuMemcpyPeerAsync(b, ctx1, a, ctx0, bytes, s0));

        // wait for peer copy to complete
        CHECK(cuStreamSynchronize(s0));

        // DtoH read of b on dev1's stream
        CHECK(cuCtxSetCurrent(ctx1));
        // Clear host_b first so a dropped device write cannot be masked by
        // stale data from a previous iteration.
        memset(host_b, 0, bytes);
        CHECK(cuMemcpyDtoHAsync(host_b, b, bytes, s1));
        CHECK(cuStreamSynchronize(s1));

        // verify; only report details for the first few mismatches
        int ok = (memcmp(host_a, host_b, bytes) == 0);
        if (!ok) {
            fails++;
            if (fails <= 3) {
                fprintf(stderr, "iter %d: mismatch; first 3 values: "
                        "host_a=[%g,%g,%g] host_b=[%g,%g,%g]\n",
                        it, host_a[0], host_a[1], host_a[2],
                        host_b[0], host_b[1], host_b[2]);
            }
        }

        // free (stream-ordered)
        CHECK(cuCtxSetCurrent(ctx0));
        CHECK(cuMemFreeAsync(a, s0));
        CHECK(cuCtxSetCurrent(ctx1));
        CHECK(cuMemFreeAsync(b, s1));
    }

    printf("grant_pool_access=%d: %d/%d peer copies returned wrong data\n",
           grant_pool_access_flag, fails, iters);

    // NOTE(review): the last iteration's cuMemFreeAsync calls may still be
    // pending on s0/s1 here; an explicit cuStreamSynchronize on both streams
    // before destroying the pools would make teardown ordering unambiguous —
    // confirm whether cuMemPoolDestroy tolerates in-flight frees.
    CHECK(cuStreamDestroy(s0));
    CHECK(cuStreamDestroy(s1));
    CHECK(cuMemPoolDestroy(pool0));
    CHECK(cuMemPoolDestroy(pool1));
    CHECK(cuDevicePrimaryCtxRelease(dev0));
    CHECK(cuDevicePrimaryCtxRelease(dev1));
    return fails == 0 ? 0 : 2;
}

Replacing the memcpy by a kernel-based one doesn't help. So temporarily disabling this until I hear back from NVIDIA.

Works around #2930

maleadt and others added 2 commits April 21, 2026 02:07
`PerDevice.get!` cached `(context(), value)`, where `context()` is the
currently-active context, but subsequent lookups compared against
`device_context(id)` — the target device's context. Whenever
`get!(x, dev)` was invoked from a context belonging to a *different*
device (e.g., `pool_create(other_dev)` called from inside
`context!(context(src))`), the comparison mismatched on every later
lookup and the constructor ran again, creating a fresh value per call
and leaking the previous one.

Store `context(dev)` instead, so the cache key matches the lookup key.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
`Base.convert(::Type{CuPtr{T}}, ::Managed)` granted pool-level peer
access via `cuMemPoolSetAccess` on every cross-device pointer
conversion. This worked around an apparent CUDA driver bug by making it
much worse: a minimal C reproducer confirms that a single
`cuMemPoolSetAccess` call on a stream-ordered pool — even the
documented once-at-creation pattern, done before any allocations come
out of the pool — causes subsequent peer-direction data writes into
allocations from that pool (whether via `cuMemcpyPeerAsync` or via a
kernel on the peer device) to silently write zeros on driver 590.48.01
/ CUDA 13.2 / Turing sm_75. The API returns `CUDA_SUCCESS` and
`cuMemPoolGetAccess` reports the access is set, but the data-plane
write is dropped. `compute-sanitizer` additionally flags each call
with a bogus "HOST/HOST_NUMA pools are always read-write accessible on
the HOST" warning even though the access descriptor is
`CU_MEM_LOCATION_TYPE_DEVICE` on a device pool. Reported upstream
as NVIDIA bug #6098762.

`cuMemcpyPeerAsync` is a driver-mediated copy that only requires
context-level peer access (`cuCtxEnablePeerAccess`, already enabled
above) — not pool-level access — so removing the call fixes `copyto!`
between CuArrays on different devices without needing the
driver-bug-triggering API call. Callers that genuinely need
cross-device kernel access (e.g., cuBLASXt) already configure pool
access themselves and are unaffected by this change (though they will
still hit the driver bug in the same way the pre-fix code did).

Fixes the flaky "issue 1136: copies between devices" testset.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

[only special]
@codecov
Copy link
Copy Markdown

codecov Bot commented Apr 21, 2026

Codecov Report

✅ All modified and coverable lines are covered by tests.
✅ Project coverage is 16.56%. Comparing base (e0e295f) to head (08bb31d).
⚠️ Report is 3 commits behind head on master.

Additional details and impacted files
@@            Coverage Diff             @@
##           master    #3112      +/-   ##
==========================================
+ Coverage   10.19%   16.56%   +6.36%     
==========================================
  Files         119      120       +1     
  Lines        9198     9594     +396     
==========================================
+ Hits          938     1589     +651     
+ Misses       8260     8005     -255     

☔ View full report in Codecov by Sentry.
📢 Have feedback on the report? Share it here.

🚀 New features to boost your workflow:
  • ❄️ Test Analytics: Detect flaky tests, report on failures, and find test suite problems.

Copy link
Copy Markdown
Contributor

@github-actions github-actions Bot left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

CUDA.jl Benchmarks

Details
Benchmark suite Current: 08bb31d Previous: e0e295f Ratio
array/accumulate/Float32/1d 101551 ns 100878 ns 1.01
array/accumulate/Float32/dims=1 77097 ns 75855 ns 1.02
array/accumulate/Float32/dims=1L 1593830 ns 1585504 ns 1.01
array/accumulate/Float32/dims=2 144382 ns 143115.5 ns 1.01
array/accumulate/Float32/dims=2L 660133 ns 657101 ns 1.00
array/accumulate/Int64/1d 118831 ns 118250 ns 1.00
array/accumulate/Int64/dims=1 80265 ns 79820.5 ns 1.01
array/accumulate/Int64/dims=1L 1704679 ns 1694871 ns 1.01
array/accumulate/Int64/dims=2 156762 ns 155746 ns 1.01
array/accumulate/Int64/dims=2L 961755 ns 961802 ns 1.00
array/broadcast 20353 ns 20486 ns 0.99
array/construct 1301.7 ns 1263.9 ns 1.03
array/copy 18210 ns 17962 ns 1.01
array/copyto!/cpu_to_gpu 215057 ns 214197 ns 1.00
array/copyto!/gpu_to_cpu 283657 ns 281343 ns 1.01
array/copyto!/gpu_to_gpu 10929 ns 10794 ns 1.01
array/iteration/findall/bool 134807 ns 134478 ns 1.00
array/iteration/findall/int 149792 ns 149314.5 ns 1.00
array/iteration/findfirst/bool 81542.5 ns 81113 ns 1.01
array/iteration/findfirst/int 84112 ns 83293 ns 1.01
array/iteration/findmin/1d 85798.5 ns 84555 ns 1.01
array/iteration/findmin/2d 116649 ns 116516 ns 1.00
array/iteration/logical 199215.5 ns 197262.5 ns 1.01
array/iteration/scalar 67829 ns 67092 ns 1.01
array/permutedims/2d 52186 ns 52211 ns 1.00
array/permutedims/3d 52766 ns 52764 ns 1.00
array/permutedims/4d 51720 ns 51452 ns 1.01
array/random/rand/Float32 12958 ns 12943 ns 1.00
array/random/rand/Int64 25183 ns 24996 ns 1.01
array/random/rand!/Float32 8472 ns 8402.333333333334 ns 1.01
array/random/rand!/Int64 21843 ns 21937 ns 1.00
array/random/randn/Float32 38056.5 ns 36954 ns 1.03
array/random/randn!/Float32 31051 ns 30982 ns 1.00
array/reductions/mapreduce/Float32/1d 35035 ns 34678 ns 1.01
array/reductions/mapreduce/Float32/dims=1 40090 ns 39206 ns 1.02
array/reductions/mapreduce/Float32/dims=1L 51351 ns 51259.5 ns 1.00
array/reductions/mapreduce/Float32/dims=2 56448.5 ns 56274 ns 1.00
array/reductions/mapreduce/Float32/dims=2L 69248 ns 69346 ns 1.00
array/reductions/mapreduce/Int64/1d 42631 ns 42412 ns 1.01
array/reductions/mapreduce/Int64/dims=1 42885 ns 42188 ns 1.02
array/reductions/mapreduce/Int64/dims=1L 87135 ns 87287 ns 1.00
array/reductions/mapreduce/Int64/dims=2 59453 ns 59630 ns 1.00
array/reductions/mapreduce/Int64/dims=2L 84732 ns 84743 ns 1.00
array/reductions/reduce/Float32/1d 35232 ns 34235 ns 1.03
array/reductions/reduce/Float32/dims=1 48638 ns 39618.5 ns 1.23
array/reductions/reduce/Float32/dims=1L 51287 ns 51305 ns 1.00
array/reductions/reduce/Float32/dims=2 56586 ns 56667 ns 1.00
array/reductions/reduce/Float32/dims=2L 69400 ns 69784 ns 0.99
array/reductions/reduce/Int64/1d 42663 ns 42369 ns 1.01
array/reductions/reduce/Int64/dims=1 47447.5 ns 42478 ns 1.12
array/reductions/reduce/Int64/dims=1L 87063 ns 87248 ns 1.00
array/reductions/reduce/Int64/dims=2 59467 ns 59729 ns 1.00
array/reductions/reduce/Int64/dims=2L 84381 ns 84769 ns 1.00
array/reverse/1d 17779 ns 18015.5 ns 0.99
array/reverse/1dL 68359 ns 68638 ns 1.00
array/reverse/1dL_inplace 65696 ns 65779 ns 1.00
array/reverse/1d_inplace 8475.333333333334 ns 8649.666666666666 ns 0.98
array/reverse/2d 20773 ns 20711 ns 1.00
array/reverse/2dL 72907 ns 72634 ns 1.00
array/reverse/2dL_inplace 65831 ns 65985 ns 1.00
array/reverse/2d_inplace 9983 ns 10088 ns 0.99
array/sorting/1d 2744620 ns 2734295 ns 1.00
array/sorting/2d 1072540 ns 1068343 ns 1.00
array/sorting/by 3314456 ns 3304353 ns 1.00
cuda/synchronization/context/auto 1120.9 ns 1159.9 ns 0.97
cuda/synchronization/context/blocking 921.0555555555555 ns 896.4878048780488 ns 1.03
cuda/synchronization/context/nonblocking 7122 ns 7409.1 ns 0.96
cuda/synchronization/stream/auto 1002.5454545454545 ns 1027.578947368421 ns 0.98
cuda/synchronization/stream/blocking 793.9795918367347 ns 841.2941176470588 ns 0.94
cuda/synchronization/stream/nonblocking 7377.4 ns 7567.799999999999 ns 0.97
integration/byval/reference 143725 ns 143876 ns 1.00
integration/byval/slices=1 145722 ns 145738.5 ns 1.00
integration/byval/slices=2 284373 ns 284423 ns 1.00
integration/byval/slices=3 423145 ns 423173 ns 1.00
integration/cudadevrt 102373 ns 102437 ns 1.00
integration/volumerhs 23469620.5 ns 23470585 ns 1.00
kernel/indexing 13127 ns 13311 ns 0.99
kernel/indexing_checked 13950 ns 14095 ns 0.99
kernel/launch 2079.6666666666665 ns 2235.1111111111113 ns 0.93
kernel/occupancy 671.243670886076 ns 693.6190476190476 ns 0.97
kernel/rand 14274 ns 18172.5 ns 0.79
latency/import 3826822993 ns 3820990542 ns 1.00
latency/precompile 4595981474.5 ns 4593009584 ns 1.00
latency/ttfp 4416850028.5 ns 4397252952 ns 1.00

This comment was automatically generated by workflow using github-action-benchmark.

@maleadt maleadt merged commit 49ce6f6 into master Apr 21, 2026
2 checks passed
@maleadt maleadt deleted the tb/multi branch April 21, 2026 11:56
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

None yet

Projects

None yet

Development

Successfully merging this pull request may close these issues.

1 participant