Skip to content

Temporarily disable cross-device memory access from kernels#3112

Merged
maleadt merged 2 commits into master from
tb/multi
Apr 21, 2026
Merged

Temporarily disable cross-device memory access from kernels#3112
maleadt merged 2 commits into master from
tb/multi

Conversation

@maleadt
Copy link
Copy Markdown
Member

@maleadt maleadt commented Apr 21, 2026

Calling cuMemPoolSetAccess somehow seems to break cuMemcpyPeerAsync. MWE:

// Reproducer for a suspected CUDA driver bug on multi-GPU peer copies from
// stream-ordered memory pools. See comments in the main loop for the expected
// behavior.
//
// Build:
//   gcc -O0 -g -o repro repro.c -I/usr/local/cuda-13.2/targets/x86_64-linux/include -lcuda
// Run (with two P2P-capable GPUs):
//   CUDA_VISIBLE_DEVICES=0,1 ./repro
//
// On Turing sm_75 + driver 590.48.01 + CUDA 13.2 we observe that after
// `cuMemPoolSetAccess` has been called on a custom device memory pool,
// `cuMemcpyPeerAsync` into allocations from that pool silently fails to
// write the data.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cuda.h>

// Abort the process if a CUDA driver API call fails. Evaluates `call` once;
// on any result other than CUDA_SUCCESS, prints the file, line, stringified
// call, and the driver's error name/description (falling back to "?" when
// cuGetErrorName/cuGetErrorString do not recognize the code), then exit(1)s.
// Wrapped in do/while(0) so it behaves as a single statement after `if`.
#define CHECK(call) do { \
    CUresult _r = (call); \
    if (_r != CUDA_SUCCESS) { \
        const char *err_name = NULL, *err_str = NULL; \
        cuGetErrorName(_r, &err_name); \
        cuGetErrorString(_r, &err_str); \
        fprintf(stderr, "%s:%d: %s failed: %s (%s)\n", \
                __FILE__, __LINE__, #call, \
                err_name ? err_name : "?", err_str ? err_str : "?"); \
        exit(1); \
    } \
} while (0)

// Create a custom memory pool for `dev` and set it as the default pool.
// Build a device-local, stream-ordered memory pool for `dev`, install it as
// that device's default pool, and raise the release threshold to the maximum
// so freed allocations stay cached in the pool rather than being returned to
// the driver.
static CUmemoryPool make_pool(CUdevice dev) {
    CUmemPoolProps pool_props;
    memset(&pool_props, 0, sizeof(pool_props));
    pool_props.allocType = CU_MEM_ALLOCATION_TYPE_PINNED;
    pool_props.handleTypes = CU_MEM_HANDLE_TYPE_NONE;
    pool_props.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
    pool_props.location.id = dev;

    CUmemoryPool new_pool;
    CHECK(cuMemPoolCreate(&new_pool, &pool_props));
    CHECK(cuDeviceSetMemPool(dev, new_pool));

    // (cuuint64_t)-1 == UINT64_MAX: never trim the pool back to the OS.
    cuuint64_t keep_everything = (cuuint64_t)-1;
    CHECK(cuMemPoolSetAttribute(new_pool, CU_MEMPOOL_ATTR_RELEASE_THRESHOLD,
                                &keep_everything));
    return new_pool;
}

// Grant `peer` read/write access to allocations from `pool`.
// Allow device `peer` to read and write allocations that come out of `pool`.
static void grant_pool_access(CUmemoryPool pool, CUdevice peer) {
    CUmemAccessDesc access;
    memset(&access, 0, sizeof(access));
    access.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
    access.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
    access.location.id = peer;
    CHECK(cuMemPoolSetAccess(pool, &access, 1));
}

// Driver-bug reproducer: repeatedly peer-copy a buffer from dev0 to dev1,
// where both buffers come from custom stream-ordered memory pools, and verify
// the copied data on the host. Pass --no-pool-access to skip the
// cuMemPoolSetAccess grants and compare failure rates between the two modes.
// Exit status: 0 = all copies verified, 1 = unusable setup (too few devices
// or no bidirectional P2P), 2 = at least one copy returned wrong data.
int main(int argc, char **argv) {
    CHECK(cuInit(0));

    int ndev;
    CHECK(cuDeviceGetCount(&ndev));
    if (ndev < 2) {
        fprintf(stderr, "need at least 2 devices, have %d\n", ndev);
        return 1;
    }

    CUdevice dev0, dev1;
    CHECK(cuDeviceGet(&dev0, 0));
    CHECK(cuDeviceGet(&dev1, 1));

    // The repro exercises copies dev0 -> dev1 plus pool-access grants on both
    // pools, so P2P capability is required in both directions.
    int p2p_01 = 0, p2p_10 = 0;
    CHECK(cuDeviceCanAccessPeer(&p2p_01, dev0, dev1));
    CHECK(cuDeviceCanAccessPeer(&p2p_10, dev1, dev0));
    if (!p2p_01 || !p2p_10) {
        fprintf(stderr, "need bidirectional P2P; 0->1=%d 1->0=%d\n",
                p2p_01, p2p_10);
        return 1;
    }

    // Retain primary contexts for each device.
    CUcontext ctx0, ctx1;
    CHECK(cuDevicePrimaryCtxRetain(&ctx0, dev0));
    CHECK(cuDevicePrimaryCtxRetain(&ctx1, dev1));

    // Enable context-level peer access in both directions. This is what
    // cuMemcpyPeerAsync relies on; the pool-level grants below are a separate
    // mechanism (needed for kernel-originated cross-device access).
    CHECK(cuCtxSetCurrent(ctx0));
    CHECK(cuCtxEnablePeerAccess(ctx1, 0));
    CHECK(cuCtxSetCurrent(ctx1));
    CHECK(cuCtxEnablePeerAccess(ctx0, 0));

    // Create custom pools for both devices, each created with its own
    // device's context current.
    CHECK(cuCtxSetCurrent(ctx0));
    CUmemoryPool pool0 = make_pool(dev0);
    CHECK(cuCtxSetCurrent(ctx1));
    CUmemoryPool pool1 = make_pool(dev1);

    // NVIDIA's documented pattern: grant pool-level peer access ONCE, before
    // any allocations come out of the pool. Per the programming guide,
    // "once a pool is made accessible from a given GPU, it should remain
    // accessible from that GPU for the lifetime of the pool."
    // Suspected bug under test: performing this grant at all makes later
    // peer-direction writes into the pool's allocations silently drop.
    int grant_pool_access_flag = 1;
    if (argc >= 2 && strcmp(argv[1], "--no-pool-access") == 0) {
        grant_pool_access_flag = 0;
    }
    if (grant_pool_access_flag) {
        grant_pool_access(pool0, dev1);
        grant_pool_access(pool1, dev0);
    }

    // Create a stream on each device.
    CHECK(cuCtxSetCurrent(ctx0));
    CUstream s0;
    CHECK(cuStreamCreate(&s0, CU_STREAM_NON_BLOCKING));
    CHECK(cuCtxSetCurrent(ctx1));
    CUstream s1;
    CHECK(cuStreamCreate(&s1, CU_STREAM_NON_BLOCKING));

    // Main loop: per iteration, allocate a on dev0 and b on dev1, initialize
    // a with host data, peer-copy a -> b on dev0's stream, and verify b
    // contains the same data by copying back to the host via dev1's stream.
    const size_t N = 25;
    const size_t bytes = N * sizeof(double);
    double host_a[N], host_b[N];
    // Nonzero, index-dependent fill so a dropped write (zeros) is detected.
    for (size_t i = 0; i < N; i++) host_a[i] = (double)(i + 1) * 1.5;

    int fails = 0;
    const int iters = 500;
    for (int it = 0; it < iters; it++) {
        // alloc a on dev0 (stream-ordered, from pool0)
        CHECK(cuCtxSetCurrent(ctx0));
        CUdeviceptr a;
        CHECK(cuMemAllocFromPoolAsync(&a, bytes, pool0, s0));

        // alloc b on dev1 (stream-ordered, from pool1)
        CHECK(cuCtxSetCurrent(ctx1));
        CUdeviceptr b;
        CHECK(cuMemAllocFromPoolAsync(&b, bytes, pool1, s1));

        // make sure b's allocation is physically done before the peer write
        // touches it from dev0's stream
        CHECK(cuStreamSynchronize(s1));

        // HtoD fill of a on dev0's stream
        CHECK(cuCtxSetCurrent(ctx0));
        CHECK(cuMemcpyHtoDAsync(a, host_a, bytes, s0));

        // peer copy a -> b on dev0's stream
        CHECK(cuMemcpyPeerAsync(b, ctx1, a, ctx0, bytes, s0));

        // wait for peer copy to complete
        CHECK(cuStreamSynchronize(s0));

        // DtoH read of b on dev1's stream
        CHECK(cuCtxSetCurrent(ctx1));
        // Clear host_b first so a dropped device write cannot be masked by
        // stale data from a previous iteration.
        memset(host_b, 0, bytes);
        CHECK(cuMemcpyDtoHAsync(host_b, b, bytes, s1));
        CHECK(cuStreamSynchronize(s1));

        // verify; only report details for the first few mismatches
        int ok = (memcmp(host_a, host_b, bytes) == 0);
        if (!ok) {
            fails++;
            if (fails <= 3) {
                fprintf(stderr, "iter %d: mismatch; first 3 values: "
                        "host_a=[%g,%g,%g] host_b=[%g,%g,%g]\n",
                        it, host_a[0], host_a[1], host_a[2],
                        host_b[0], host_b[1], host_b[2]);
            }
        }

        // free (stream-ordered)
        CHECK(cuCtxSetCurrent(ctx0));
        CHECK(cuMemFreeAsync(a, s0));
        CHECK(cuCtxSetCurrent(ctx1));
        CHECK(cuMemFreeAsync(b, s1));
    }

    printf("grant_pool_access=%d: %d/%d peer copies returned wrong data\n",
           grant_pool_access_flag, fails, iters);

    // NOTE(review): the last iteration's cuMemFreeAsync calls may still be
    // pending on s0/s1 here; an explicit cuStreamSynchronize on both streams
    // before destroying the pools would make teardown ordering unambiguous —
    // confirm whether cuMemPoolDestroy tolerates in-flight frees.
    CHECK(cuStreamDestroy(s0));
    CHECK(cuStreamDestroy(s1));
    CHECK(cuMemPoolDestroy(pool0));
    CHECK(cuMemPoolDestroy(pool1));
    CHECK(cuDevicePrimaryCtxRelease(dev0));
    CHECK(cuDevicePrimaryCtxRelease(dev1));
    return fails == 0 ? 0 : 2;
}

Replacing the memcpy by a kernel-based one doesn't help. So temporarily disabling this until I hear back from NVIDIA.

Works around #2930

maleadt and others added 2 commits April 21, 2026 02:07
`PerDevice.get!` cached `(context(), value)`, where `context()` is the
currently-active context, but subsequent lookups compared against
`device_context(id)` — the target device's context. Whenever
`get!(x, dev)` was invoked from a context belonging to a *different*
device (e.g., `pool_create(other_dev)` called from inside
`context!(context(src))`), the comparison mismatched on every later
lookup and the constructor ran again, creating a fresh value per call
and leaking the previous one.

Store `context(dev)` instead, so the cache key matches the lookup key.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
`Base.convert(::Type{CuPtr{T}}, ::Managed)` granted pool-level peer
access via `cuMemPoolSetAccess` on every cross-device pointer
conversion. This worked around an apparent CUDA driver bug by making it
much worse: a minimal C reproducer confirms that a single
`cuMemPoolSetAccess` call on a stream-ordered pool — even the
documented once-at-creation pattern, done before any allocations come
out of the pool — causes subsequent peer-direction data writes into
allocations from that pool (whether via `cuMemcpyPeerAsync` or via a
kernel on the peer device) to silently write zeros on driver 590.48.01
/ CUDA 13.2 / Turing sm_75. The API returns `CUDA_SUCCESS` and
`cuMemPoolGetAccess` reports the access is set, but the data-plane
write is dropped. `compute-sanitizer` additionally flags each call
with a bogus "HOST/HOST_NUMA pools are always read-write accessible on
the HOST" warning even though the access descriptor is
`CU_MEM_LOCATION_TYPE_DEVICE` on a device pool. Reported upstream
as NVIDIA bug #6098762.

`cuMemcpyPeerAsync` is a driver-mediated copy that only requires
context-level peer access (`cuCtxEnablePeerAccess`, already enabled
above) — not pool-level access — so removing the call fixes `copyto!`
between CuArrays on different devices without needing the
driver-bug-triggering API call. Callers that genuinely need
cross-device kernel access (e.g., cuBLASXt) already configure pool
access themselves and are unaffected by this change (though they will
still hit the driver bug in the same way the pre-fix code did).

Fixes the flaky "issue 1136: copies between devices" testset.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

[only special]
@codecov
Copy link
Copy Markdown

codecov Bot commented Apr 21, 2026

Codecov Report

✅ All modified and coverable lines are covered by tests.
✅ Project coverage is 16.56%. Comparing base (e0e295f) to head (08bb31d).
⚠️ Report is 3 commits behind head on master.

Additional details and impacted files
@@            Coverage Diff             @@
##           master    #3112      +/-   ##
==========================================
+ Coverage   10.19%   16.56%   +6.36%     
==========================================
  Files         119      120       +1     
  Lines        9198     9594     +396     
==========================================
+ Hits          938     1589     +651     
+ Misses       8260     8005     -255     

☔ View full report in Codecov by Sentry.
📢 Have feedback on the report? Share it here.

🚀 New features to boost your workflow:
  • ❄️ Test Analytics: Detect flaky tests, report on failures, and find test suite problems.

Copy link
Copy Markdown
Contributor

@github-actions github-actions Bot left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

CUDA.jl Benchmarks

Details
Benchmark suite Current: 08bb31d Previous: e0e295f Ratio
array/accumulate/Float32/1d 101551 ns 100878 ns 1.01
array/accumulate/Float32/dims=1 77097 ns 75855 ns 1.02
array/accumulate/Float32/dims=1L 1593830 ns 1585504 ns 1.01
array/accumulate/Float32/dims=2 144382 ns 143115.5 ns 1.01
array/accumulate/Float32/dims=2L 660133 ns 657101 ns 1.00
array/accumulate/Int64/1d 118831 ns 118250 ns 1.00
array/accumulate/Int64/dims=1 80265 ns 79820.5 ns 1.01
array/accumulate/Int64/dims=1L 1704679 ns 1694871 ns 1.01
array/accumulate/Int64/dims=2 156762 ns 155746 ns 1.01
array/accumulate/Int64/dims=2L 961755 ns 961802 ns 1.00
array/broadcast 20353 ns 20486 ns 0.99
array/construct 1301.7 ns 1263.9 ns 1.03
array/copy 18210 ns 17962 ns 1.01
array/copyto!/cpu_to_gpu 215057 ns 214197 ns 1.00
array/copyto!/gpu_to_cpu 283657 ns 281343 ns 1.01
array/copyto!/gpu_to_gpu 10929 ns 10794 ns 1.01
array/iteration/findall/bool 134807 ns 134478 ns 1.00
array/iteration/findall/int 149792 ns 149314.5 ns 1.00
array/iteration/findfirst/bool 81542.5 ns 81113 ns 1.01
array/iteration/findfirst/int 84112 ns 83293 ns 1.01
array/iteration/findmin/1d 85798.5 ns 84555 ns 1.01
array/iteration/findmin/2d 116649 ns 116516 ns 1.00
array/iteration/logical 199215.5 ns 197262.5 ns 1.01
array/iteration/scalar 67829 ns 67092 ns 1.01
array/permutedims/2d 52186 ns 52211 ns 1.00
array/permutedims/3d 52766 ns 52764 ns 1.00
array/permutedims/4d 51720 ns 51452 ns 1.01
array/random/rand/Float32 12958 ns 12943 ns 1.00
array/random/rand/Int64 25183 ns 24996 ns 1.01
array/random/rand!/Float32 8472 ns 8402.333333333334 ns 1.01
array/random/rand!/Int64 21843 ns 21937 ns 1.00
array/random/randn/Float32 38056.5 ns 36954 ns 1.03
array/random/randn!/Float32 31051 ns 30982 ns 1.00
array/reductions/mapreduce/Float32/1d 35035 ns 34678 ns 1.01
array/reductions/mapreduce/Float32/dims=1 40090 ns 39206 ns 1.02
array/reductions/mapreduce/Float32/dims=1L 51351 ns 51259.5 ns 1.00
array/reductions/mapreduce/Float32/dims=2 56448.5 ns 56274 ns 1.00
array/reductions/mapreduce/Float32/dims=2L 69248 ns 69346 ns 1.00
array/reductions/mapreduce/Int64/1d 42631 ns 42412 ns 1.01
array/reductions/mapreduce/Int64/dims=1 42885 ns 42188 ns 1.02
array/reductions/mapreduce/Int64/dims=1L 87135 ns 87287 ns 1.00
array/reductions/mapreduce/Int64/dims=2 59453 ns 59630 ns 1.00
array/reductions/mapreduce/Int64/dims=2L 84732 ns 84743 ns 1.00
array/reductions/reduce/Float32/1d 35232 ns 34235 ns 1.03
array/reductions/reduce/Float32/dims=1 48638 ns 39618.5 ns 1.23
array/reductions/reduce/Float32/dims=1L 51287 ns 51305 ns 1.00
array/reductions/reduce/Float32/dims=2 56586 ns 56667 ns 1.00
array/reductions/reduce/Float32/dims=2L 69400 ns 69784 ns 0.99
array/reductions/reduce/Int64/1d 42663 ns 42369 ns 1.01
array/reductions/reduce/Int64/dims=1 47447.5 ns 42478 ns 1.12
array/reductions/reduce/Int64/dims=1L 87063 ns 87248 ns 1.00
array/reductions/reduce/Int64/dims=2 59467 ns 59729 ns 1.00
array/reductions/reduce/Int64/dims=2L 84381 ns 84769 ns 1.00
array/reverse/1d 17779 ns 18015.5 ns 0.99
array/reverse/1dL 68359 ns 68638 ns 1.00
array/reverse/1dL_inplace 65696 ns 65779 ns 1.00
array/reverse/1d_inplace 8475.333333333334 ns 8649.666666666666 ns 0.98
array/reverse/2d 20773 ns 20711 ns 1.00
array/reverse/2dL 72907 ns 72634 ns 1.00
array/reverse/2dL_inplace 65831 ns 65985 ns 1.00
array/reverse/2d_inplace 9983 ns 10088 ns 0.99
array/sorting/1d 2744620 ns 2734295 ns 1.00
array/sorting/2d 1072540 ns 1068343 ns 1.00
array/sorting/by 3314456 ns 3304353 ns 1.00
cuda/synchronization/context/auto 1120.9 ns 1159.9 ns 0.97
cuda/synchronization/context/blocking 921.0555555555555 ns 896.4878048780488 ns 1.03
cuda/synchronization/context/nonblocking 7122 ns 7409.1 ns 0.96
cuda/synchronization/stream/auto 1002.5454545454545 ns 1027.578947368421 ns 0.98
cuda/synchronization/stream/blocking 793.9795918367347 ns 841.2941176470588 ns 0.94
cuda/synchronization/stream/nonblocking 7377.4 ns 7567.799999999999 ns 0.97
integration/byval/reference 143725 ns 143876 ns 1.00
integration/byval/slices=1 145722 ns 145738.5 ns 1.00
integration/byval/slices=2 284373 ns 284423 ns 1.00
integration/byval/slices=3 423145 ns 423173 ns 1.00
integration/cudadevrt 102373 ns 102437 ns 1.00
integration/volumerhs 23469620.5 ns 23470585 ns 1.00
kernel/indexing 13127 ns 13311 ns 0.99
kernel/indexing_checked 13950 ns 14095 ns 0.99
kernel/launch 2079.6666666666665 ns 2235.1111111111113 ns 0.93
kernel/occupancy 671.243670886076 ns 693.6190476190476 ns 0.97
kernel/rand 14274 ns 18172.5 ns 0.79
latency/import 3826822993 ns 3820990542 ns 1.00
latency/precompile 4595981474.5 ns 4593009584 ns 1.00
latency/ttfp 4416850028.5 ns 4397252952 ns 1.00

This comment was automatically generated by workflow using github-action-benchmark.

@maleadt maleadt merged commit 49ce6f6 into master Apr 21, 2026
2 checks passed
@maleadt maleadt deleted the tb/multi branch April 21, 2026 11:56
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

None yet

Projects

None yet

Development

Successfully merging this pull request may close these issues.

1 participant