Commit e8402c0
Merge branch 'master' into fix-range-metadata
2 parents 42481a5 + 5e4118b commit e8402c0

41 files changed

Lines changed: 649 additions & 145 deletions

.buildkite/pipeline.yml

Lines changed: 0 additions & 13 deletions
@@ -113,7 +113,6 @@ steps:
 - "12.1"
 - "12.0"
 commands: |
-unset LD_LIBRARY_PATH
 echo -e "[CUDA_Runtime_jll]\nversion = \"{{matrix.cuda}}\"" >LocalPreferences.toml

 - group: ":telescope: Downstream"
@@ -186,7 +185,6 @@ steps:
 - lib
 - examples
 commands: |
-unset LD_LIBRARY_PATH
 julia -e '
 using Pkg

@@ -272,8 +270,6 @@ steps:
 - src
 - lib
 - examples
-commands: |
-unset LD_LIBRARY_PATH
 agents:
 queue: "juliagpu"
 cuda: "*"
@@ -296,8 +292,6 @@ steps:
 - src
 - lib
 - examples
-commands: |
-unset LD_LIBRARY_PATH
 agents:
 queue: "juliagpu"
 cuda: "*"
@@ -322,8 +316,6 @@ steps:
 - src
 - lib
 - examples
-commands: |
-unset LD_LIBRARY_PATH
 agents:
 queue: "juliagpu"
 cuda: "*"
@@ -340,7 +332,6 @@ steps:
 - "unified"
 - "host"
 commands: |
-unset LD_LIBRARY_PATH
 echo -e "[CUDA]\ndefault_memory = \"{{matrix.memory}}\"" >LocalPreferences.toml

 - label: "MultiGPU"
@@ -354,8 +345,6 @@ steps:
 - src
 - lib
 - examples
-commands: |
-unset LD_LIBRARY_PATH
 agents:
 queue: "juliagpu"
 cuda: "*"
@@ -376,7 +365,6 @@ steps:
 - JuliaCI/julia#v1:
 version: "1.12"
 commands: |
-unset LD_LIBRARY_PATH
 julia --project -e '
 println("--- :julia: Instantiating project")
 using Pkg
@@ -401,7 +389,6 @@ steps:
 - JuliaCI/julia#v1:
 version: "1.12"
 commands: |
-unset LD_LIBRARY_PATH
 julia --project=perf -e '
 using Pkg

CUDACore/Project.toml

Lines changed: 2 additions & 2 deletions
@@ -1,6 +1,6 @@
 name = "CUDACore"
 uuid = "bd0ed864-bdfe-4181-a5ed-ce625a5fdea2"
-version = "6.0.0"
+version = "6.1.0"

 [deps]
 Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
@@ -45,7 +45,7 @@ BFloat16s = "0.5, 0.6"
 CEnum = "0.2, 0.3, 0.4, 0.5"
 CUDA_Compiler_jll = "0.3, 0.4"
 CUDA_Driver_jll = "13"
-CUDA_Runtime_Discovery = "1"
+CUDA_Runtime_Discovery = "2"
 CUDA_Runtime_jll = "0.21"
 ChainRulesCore = "1"
 EnzymeCore = "0.8.2"

CUDACore/lib/cudadrv/state.jl

Lines changed: 6 additions & 1 deletion
@@ -459,7 +459,12 @@ function Base.get!(constructor::F, x::PerDevice{T}, dev::CuDevice) where {F <: B
     if y[id] === nothing || (y[id]::Tuple)[1] !== ctx
         Base.@lock x.lock begin
             if y[id] === nothing || (y[id]::Tuple)[1] !== ctx
-                y[id] = (context(), constructor())
+                # store the device's own context (it may be created during `constructor()`),
+                # so subsequent lookups — which compare against `device_context(id)`, not
+                # the currently-active context — hit the cache regardless of which context
+                # was active when the value was constructed.
+                value = constructor()
+                y[id] = (context(dev), value)
             end
         end
     end

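The change above is easiest to see in isolation: if the cache stores whatever context happens to be active when the value is constructed, later lookups that compare against the device's own context will always miss. Below is a minimal sketch of the pattern with simplified stand-in types; PerDeviceCache, device_context, and get_cached! are hypothetical names, not CUDACore's API.

struct PerDeviceCache
    lock::ReentrantLock
    slots::Vector{Union{Nothing, Tuple{Symbol, Any}}}   # (context, value) per device
end

device_context(id) = Symbol(:ctx_, id)   # stand-in: each device owns one context

function get_cached!(constructor, cache::PerDeviceCache, id)
    entry = cache.slots[id]
    if entry === nothing || entry[1] !== device_context(id)
        Base.@lock cache.lock begin
            entry = cache.slots[id]
            if entry === nothing || entry[1] !== device_context(id)
                # key the store on the device's own context; caching the
                # currently-active context instead (the old behaviour) made
                # later lookups miss whenever construction ran under a
                # different context.
                cache.slots[id] = (device_context(id), constructor())
            end
        end
    end
    return cache.slots[id][2]
end

# usage:
cache = PerDeviceCache(ReentrantLock(), Vector{Union{Nothing, Tuple{Symbol, Any}}}(nothing, 4))
val = get_cached!(() -> "expensive", cache, 2)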
CUDACore/lib/cudadrv/version.jl

Lines changed: 1 addition & 1 deletion
@@ -100,7 +100,7 @@ other tools. This is version separately from the CUDA Runtime, in order to ensur
 compatibility with the driver, and make sure we use the latest compatible version regardless
 of the selected runtime.
 """
-compiler_version() = CUDA_Compiler_jll.cuda_version
+compiler_version() = CUDA_Compiler.cuda_version


 ## helpers

CUDACore/src/CUDACore.jl

Lines changed: 9 additions & 7 deletions
@@ -31,15 +31,8 @@ using LLVMLoopInfo

 using CUDA_Driver_jll

-using CUDA_Compiler_jll
-
 import CUDA_Runtime_jll
 const local_toolkit = CUDA_Runtime_jll.host_platform["cuda_local"] == "true"
-const toolkit_version = if CUDA_Runtime_jll.host_platform["cuda"] == "none"
-    nothing
-else
-    parse(VersionNumber, CUDA_Runtime_jll.host_platform["cuda"])
-end
 if local_toolkit
     using CUDA_Runtime_Discovery
     const CUDA_Runtime = CUDA_Runtime_Discovery
@@ -49,6 +42,15 @@ else
 end

 import Preferences
+const local_compiler = Preferences.@load_preference("local_compiler", "false") == "true"
+
+if local_compiler
+    using CUDA_Runtime_Discovery
+    const CUDA_Compiler = CUDA_Runtime_Discovery
+else
+    import CUDA_Compiler_jll
+    const CUDA_Compiler = CUDA_Compiler_jll
+end

 using Libdl


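Since `local_compiler` is read with `Preferences.@load_preference` at module load time, it has to be set before CUDACore is loaded (or the package reloaded afterwards). A minimal sketch of opting in, assuming the semantics shown above, where the stored value is compared as the string "true":

using Preferences
import CUDACore

# writes LocalPreferences.toml; takes effect the next time CUDACore loads,
# switching CUDA_Compiler from CUDA_Compiler_jll to CUDA_Runtime_Discovery
Preferences.set_preferences!(CUDACore, "local_compiler" => "true"; force=true)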
CUDACore/src/compiler/compilation.jl

Lines changed: 16 additions & 11 deletions
@@ -32,7 +32,7 @@ function GPUCompiler.link_libraries!(@nospecialize(job::CUDACompilerJob), mod::L
         return
     end

-    lib = parse(LLVM.Module, read(libdevice))
+    lib = parse(LLVM.Module, read(CUDA_Compiler.libdevice))

     # override libdevice's triple and datalayout to avoid warnings
     triple!(lib, triple(mod))
@@ -122,6 +122,14 @@ function GPUCompiler.finish_module!(@nospecialize(job::CUDACompilerJob),
     return entry
 end

+# stamp `.version` with the ISA we want `ptxas` to validate against
+# and `.target` with the arch that `--gpu-name` will use
+function rewrite_ptx_header(asm, ptx, cap)
+    return replace(asm,
+        r"(\.version .+)" => ".version $(ptx.major).$(ptx.minor)",
+        r"\.target sm_\d+\w*" => ".target sm_$(cap.major)$(cap.minor)")
+end
+
 function GPUCompiler.mcgen(@nospecialize(job::CUDACompilerJob), mod::LLVM.Module, format)
     @assert format == LLVM.API.LLVMAssemblyFile
     asm = invoke(GPUCompiler.mcgen,
@@ -142,15 +150,12 @@ function GPUCompiler.mcgen(@nospecialize(job::CUDACompilerJob), mod::LLVM.Module
         asm = replace(asm, r"(\.target .+), debug" => s"\1")
     end

-    # if LLVM couldn't target the requested PTX ISA, bump it in the assembly.
-    if job.config.target.ptx != job.config.params.ptx
-        ptx = job.config.params.ptx
-        asm = replace(asm, r"(\.version .+)" => ".version $(ptx.major).$(ptx.minor)")
+    (; ptx, cap) = job.config.params
+    if job.config.target.ptx != ptx || job.config.target.cap != cap
+        asm = rewrite_ptx_header(asm, ptx, cap)
     end

-    # no need to bump the `.target` directive; we can do that by passing `-arch` to `ptxas`
-
-    asm
+    return asm
 end


@@ -339,7 +344,7 @@ function compile(@nospecialize(job::CompilerJob))
         "--output-file", ptxas_output,
         ptx_input
     ])
-    proc, log = run_and_collect(`$(ptxas()) $ptxas_opts`)
+    proc, log = run_and_collect(`$(CUDA_Compiler.ptxas()) $ptxas_opts`)
     log = strip(log)
     if !success(proc)
         reason = proc.termsignal > 0 ? "ptxas received signal $(proc.termsignal)" :
@@ -370,12 +375,12 @@ function compile(@nospecialize(job::CompilerJob))
     append!(nvlink_opts, [
         "--verbose", "--extra-warnings",
         "--arch", arch,
-        "--library-path", dirname(libcudadevrt),
+        "--library-path", dirname(CUDA_Compiler.libcudadevrt),
         "--library", "cudadevrt",
         "--output-file", nvlink_output,
         ptxas_output
     ])
-    proc, log = run_and_collect(`$(nvlink()) $nvlink_opts`)
+    proc, log = run_and_collect(`$(CUDA_Compiler.nvlink()) $nvlink_opts`)
     log = strip(log)
     if !success(proc)
         reason = proc.termsignal > 0 ? "nvlink received signal $(proc.termsignal)" :

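Since rewrite_ptx_header is a plain textual rewrite, its effect is easy to demonstrate on a fabricated PTX header; the input string and version numbers below are made up for illustration:

asm = """
.version 8.3
.target sm_70
.address_size 64
"""

ptx = v"8.5"   # ISA that ptxas should validate against
cap = v"9.0"   # architecture that --gpu-name will use

# the same two regex substitutions as rewrite_ptx_header above
asm = replace(asm,
    r"(\.version .+)" => ".version $(ptx.major).$(ptx.minor)",
    r"\.target sm_\d+\w*" => ".target sm_$(cap.major)$(cap.minor)")

print(asm)
# .version 8.5
# .target sm_90
# .address_size 64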
CUDACore/src/compiler/execution.jl

Lines changed: 84 additions & 12 deletions
@@ -2,11 +2,67 @@

 export @cuda, cudaconvert, cufunction, dynamic_cufunction, nextwarp, prevwarp
 @public maxthreads, registers, memory, version, KernelAdaptor
+@public AbstractBackend, LLVMBackend, DefaultBackend, kernel_convert, kernel_compile
+
+
+## backend dispatch
+
+"""
+    AbstractBackend
+
+Abstract supertype for `@cuda` backend dispatch. The default backend is
+[`LLVMBackend`](@ref), which compiles SIMT/PTX kernels via
+[`cufunction`](@ref). Other backends (e.g. Tile IR via cuTile.jl) register
+a subtype and define methods for [`kernel_convert`](@ref) and
+[`kernel_compile`](@ref); `@cuda backend=...` then routes through them.
+
+`@cuda backend=...` accepts either an `AbstractBackend` instance or a
+module that defines `DefaultBackend()` returning one (e.g.
+`@cuda backend=cuTile ...` resolves to `cuTile.DefaultBackend()`).
+"""
+abstract type AbstractBackend end
+
+"""
+    LLVMBackend()
+
+Default `@cuda` backend. Compiles SIMT/PTX kernels via [`cufunction`](@ref)
+and converts arguments via [`cudaconvert`](@ref).
+"""
+struct LLVMBackend <: AbstractBackend end
+
+"""
+    DefaultBackend()
+
+Returns the default `@cuda` backend for this module ([`LLVMBackend`](@ref)).
+This makes `@cuda backend=CUDA ...` (or `backend=CUDACore`) resolve to
+[`LLVMBackend`](@ref), mirroring the convention used by other backend
+packages (e.g. `@cuda backend=cuTile ...` resolves to `cuTile.DefaultBackend()`).
+"""
+DefaultBackend() = LLVMBackend()
+
+"""
+    kernel_convert(backend, x)
+
+Convert a host-side launch argument to its kernel-side form. The default
+implementation for [`LLVMBackend`](@ref) forwards to [`cudaconvert`](@ref);
+other backends override to produce backend-specific argument types.
+"""
+kernel_convert(::LLVMBackend, x) = cudaconvert(x)
+
+"""
+    kernel_compile(backend, f, tt::Type{<:Tuple}; kwargs...) -> AbstractKernel
+
+Compile a function for the given backend. Returns an [`AbstractKernel`](@ref)
+callable as `kernel(args...; launch_kwargs...)` to launch on the GPU. The
+default implementation for [`LLVMBackend`](@ref) is [`cufunction`](@ref).
+"""
+kernel_compile(::LLVMBackend, f::F, tt::TT=Tuple{}; kwargs...) where {F,TT} =
+    cufunction(f, tt; kwargs...)


 ## high-level @cuda interface

-const MACRO_KWARGS = [:dynamic, :launch]
+const MACRO_KWARGS = [:dynamic, :launch, :backend]
 const COMPILER_KWARGS = [:kernel, :name, :always_inline, :minthreads, :maxthreads, :blocks_per_sm, :maxregs, :fastmath, :cap, :ptx]
 const LAUNCH_KWARGS = [:cooperative, :blocks, :threads, :clustersize, :shmem, :stream]

@@ -24,6 +80,10 @@ Several keyword arguments are supported that influence the behavior of `@cuda`.
 - `launch`: whether to launch this kernel, defaults to `true`. If `false` the returned
   kernel object should be launched by calling it and passing arguments again.
 - `dynamic`: use dynamic parallelism to launch device-side kernels, defaults to `false`.
+- `backend`: which compiler backend to use, defaults to [`LLVMBackend`](@ref). Either an
+  [`AbstractBackend`](@ref) instance or a module that defines `DefaultBackend()` (e.g.
+  `backend=CUDA` resolves to `CUDA.DefaultBackend()`). Backend-specific compiler kwargs
+  not recognized by `@cuda` itself are forwarded to [`kernel_compile`](@ref).
 - arguments that influence kernel compilation: see [`cufunction`](@ref) and
   [`dynamic_cufunction`](@ref)
 - arguments that influence kernel launch: see [`CUDACore.HostKernel`](@ref) and
@@ -50,17 +110,16 @@ macro cuda(ex...)
     code = quote end
     vars, var_exprs = assign_args!(code, args)

-    # group keyword argument
+    # group keyword argument. Backend-specific compiler kwargs land in
+    # `other_kwargs` and are forwarded to `kernel_compile`; the backend
+    # validates them.
     macro_kwargs, compiler_kwargs, call_kwargs, other_kwargs =
         split_kwargs(kwargs, MACRO_KWARGS, COMPILER_KWARGS, LAUNCH_KWARGS)
-    if !isempty(other_kwargs)
-        key,val = first(other_kwargs).args
-        throw(ArgumentError("Unsupported keyword argument '$key'"))
-    end

     # handle keyword arguments that influence the macro's behavior
     dynamic = false
     launch = true
+    backend_expr = :($LLVMBackend())
     for kwarg in macro_kwargs
         key::Symbol, val = kwarg.args
         if key === :dynamic
@@ -69,6 +128,8 @@ macro cuda(ex...)
         elseif key === :launch
             isa(val, Bool) || throw(ArgumentError("`launch` keyword argument to @cuda should be a constant value"))
             launch = val::Bool
+        elseif key === :backend
+            backend_expr = val
         else
             throw(ArgumentError("Unsupported keyword argument '$key'"))
         end
@@ -79,12 +140,14 @@ macro cuda(ex...)

     # FIXME: macro hygiene wrt. escaping kwarg values (this broke with 1.5)
     #        we esc() the whole thing now, necessitating gensyms...
-    @gensym f_var kernel_f kernel_args kernel_tt kernel
+    @gensym f_var kernel_f kernel_args kernel_tt kernel backend backend_raw
     if dynamic
         # FIXME: we could probably somehow support kwargs with constant values by either
         #        saving them in a global Dict here, or trying to pick them up from the Julia
         #        IR when processing the dynamic parallelism marker
         isempty(compiler_kwargs) || error("@cuda dynamic parallelism does not support compiler keyword arguments")
+        isempty(other_kwargs) ||
+            error("@cuda dynamic parallelism does not support backend-specific compiler keyword arguments")

         # dynamic, device-side kernel launch
         push!(code.args,
@@ -105,12 +168,19 @@ macro cuda(ex...)
         # while keeping the original arguments alive
         push!(code.args,
             quote
+                # Accept either an `AbstractBackend` instance or a module
+                # providing `DefaultBackend()` (e.g. `backend=cuTile`).
+                # Inference folds the branch away on concretely-typed inputs.
+                $backend = let $backend_raw = $backend_expr
+                    $backend_raw isa $AbstractBackend ? $backend_raw : $backend_raw.DefaultBackend()
+                end
                 $f_var = $f
                 GC.@preserve $(vars...) $f_var begin
-                    $kernel_f = $cudaconvert($f_var)
-                    $kernel_args = map($cudaconvert, ($(var_exprs...),))
+                    $kernel_f = $kernel_convert($backend, $f_var)
+                    $kernel_args = map(x -> $kernel_convert($backend, x), ($(var_exprs...),))
                     $kernel_tt = Tuple{map(Core.Typeof, $kernel_args)...}
-                    $kernel = $cufunction($kernel_f, $kernel_tt; $(compiler_kwargs...))
+                    $kernel = $kernel_compile($backend, $kernel_f, $kernel_tt;
+                                              $(compiler_kwargs...), $(other_kwargs...))
                     if $launch
                         $kernel($kernel_args...; $(call_kwargs...), convert=Val(false))
                     end
@@ -239,10 +309,12 @@ The following keyword arguments are supported:
 AbstractKernel

 function Base.show(io::IO, k::AbstractKernel{F,TT}) where {F,TT}
-    print(io, "CUDACore.$(nameof(typeof(k)))($(k.f))")
+    T = typeof(k)
+    print(io, "$(parentmodule(T)).$(nameof(T))($(k.f))")
 end
 function Base.show(io::IO, ::MIME"text/plain", k::AbstractKernel{F,TT}) where {F,TT}
-    print(io, "CUDACore.$(nameof(typeof(k))) for $(k.f)($(join(TT.parameters, ", ")))")
+    T = typeof(k)
+    print(io, "$(parentmodule(T)).$(nameof(T)) for $(k.f)($(join(TT.parameters, ", ")))")
 end

 @inline @generated function (kernel::AbstractKernel{F,TT})(args::Vararg{Any,N};

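The docstrings in execution.jl spell out the extension contract; below is a minimal sketch of what a third-party backend package would implement. All names here (MyTileBackend, MyBackend) are invented for illustration, and the kernel_compile body simply defers to cufunction rather than doing real backend-specific lowering:

module MyTileBackend

import CUDACore

struct MyBackend <: CUDACore.AbstractBackend end

# lets `@cuda backend=MyTileBackend ...` resolve to an instance
DefaultBackend() = MyBackend()

# backend-specific argument conversion; this sketch just reuses cudaconvert
CUDACore.kernel_convert(::MyBackend, x) = CUDACore.cudaconvert(x)

# backend-specific compilation; must return a kernel object callable as
# `kernel(args...; launch_kwargs...)`
function CUDACore.kernel_compile(::MyBackend, f::F, tt::TT = Tuple{};
                                 kwargs...) where {F, TT}
    # a real backend would lower `f` through its own compiler here
    return CUDACore.cufunction(f, tt; kwargs...)
end

end # module

# usage, assuming a device function `my_kernel`:
#   @cuda backend=MyTileBackend threads=256 my_kernel(a, b)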