Commit e8402c0
Merge branch 'master' into fix-range-metadata
2 parents 42481a5 + 5e4118b commit e8402c0

41 files changed

Lines changed: 649 additions & 145 deletions

.buildkite/pipeline.yml

Lines changed: 0 additions & 13 deletions
@@ -113,7 +113,6 @@ steps:
 - "12.1"
 - "12.0"
 commands: |
-unset LD_LIBRARY_PATH
 echo -e "[CUDA_Runtime_jll]\nversion = \"{{matrix.cuda}}\"" >LocalPreferences.toml

 - group: ":telescope: Downstream"
@@ -186,7 +185,6 @@ steps:
 - lib
 - examples
 commands: |
-unset LD_LIBRARY_PATH
 julia -e '
 using Pkg

@@ -272,8 +270,6 @@ steps:
 - src
 - lib
 - examples
-commands: |
-unset LD_LIBRARY_PATH
 agents:
 queue: "juliagpu"
 cuda: "*"
@@ -296,8 +292,6 @@ steps:
 - src
 - lib
 - examples
-commands: |
-unset LD_LIBRARY_PATH
 agents:
 queue: "juliagpu"
 cuda: "*"
@@ -322,8 +316,6 @@ steps:
 - src
 - lib
 - examples
-commands: |
-unset LD_LIBRARY_PATH
 agents:
 queue: "juliagpu"
 cuda: "*"
@@ -340,7 +332,6 @@ steps:
 - "unified"
 - "host"
 commands: |
-unset LD_LIBRARY_PATH
 echo -e "[CUDA]\ndefault_memory = \"{{matrix.memory}}\"" >LocalPreferences.toml

 - label: "MultiGPU"
@@ -354,8 +345,6 @@ steps:
 - src
 - lib
 - examples
-commands: |
-unset LD_LIBRARY_PATH
 agents:
 queue: "juliagpu"
 cuda: "*"
@@ -376,7 +365,6 @@ steps:
 - JuliaCI/julia#v1:
 version: "1.12"
 commands: |
-unset LD_LIBRARY_PATH
 julia --project -e '
 println("--- :julia: Instantiating project")
 using Pkg
@@ -401,7 +389,6 @@ steps:
 - JuliaCI/julia#v1:
 version: "1.12"
 commands: |
-unset LD_LIBRARY_PATH
 julia --project=perf -e '
 using Pkg

CUDACore/Project.toml

Lines changed: 2 additions & 2 deletions
@@ -1,6 +1,6 @@
 name = "CUDACore"
 uuid = "bd0ed864-bdfe-4181-a5ed-ce625a5fdea2"
-version = "6.0.0"
+version = "6.1.0"

 [deps]
 Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
@@ -45,7 +45,7 @@ BFloat16s = "0.5, 0.6"
 CEnum = "0.2, 0.3, 0.4, 0.5"
 CUDA_Compiler_jll = "0.3, 0.4"
 CUDA_Driver_jll = "13"
-CUDA_Runtime_Discovery = "1"
+CUDA_Runtime_Discovery = "2"
 CUDA_Runtime_jll = "0.21"
 ChainRulesCore = "1"
 EnzymeCore = "0.8.2"

CUDACore/lib/cudadrv/state.jl

Lines changed: 6 additions & 1 deletion
@@ -459,7 +459,12 @@ function Base.get!(constructor::F, x::PerDevice{T}, dev::CuDevice) where {F <: B
     if y[id] === nothing || (y[id]::Tuple)[1] !== ctx
         Base.@lock x.lock begin
             if y[id] === nothing || (y[id]::Tuple)[1] !== ctx
-                y[id] = (context(), constructor())
+                # store the device's own context (it may be created during `constructor()`),
+                # so subsequent lookups — which compare against `device_context(id)`, not
+                # the currently-active context — hit the cache regardless of which context
+                # was active when the value was constructed.
+                value = constructor()
+                y[id] = (context(dev), value)
             end
         end
     end

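The change above is easiest to see in isolation: if the cache stores whatever context happens to be active when the value is constructed, later lookups that compare against the device's own context will always miss. Below is a minimal sketch of the pattern with simplified stand-in types; PerDeviceCache, device_context, and get_cached! are hypothetical names, not CUDACore's API.

struct PerDeviceCache
    lock::ReentrantLock
    slots::Vector{Union{Nothing, Tuple{Symbol, Any}}}   # (context, value) per device
end

device_context(id) = Symbol(:ctx_, id)   # stand-in: each device owns one context

function get_cached!(constructor, cache::PerDeviceCache, id)
    entry = cache.slots[id]
    if entry === nothing || entry[1] !== device_context(id)
        Base.@lock cache.lock begin
            entry = cache.slots[id]
            if entry === nothing || entry[1] !== device_context(id)
                # key the store on the device's own context; caching the
                # currently-active context instead (the old behaviour) made
                # later lookups miss whenever construction ran under a
                # different context.
                cache.slots[id] = (device_context(id), constructor())
            end
        end
    end
    return cache.slots[id][2]
end

# usage:
cache = PerDeviceCache(ReentrantLock(), Vector{Union{Nothing, Tuple{Symbol, Any}}}(nothing, 4))
val = get_cached!(() -> "expensive", cache, 2)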
CUDACore/lib/cudadrv/version.jl

Lines changed: 1 addition & 1 deletion
@@ -100,7 +100,7 @@ other tools. This is version separately from the CUDA Runtime, in order to ensur
 compatibility with the driver, and make sure we use the latest compatible version regardless
 of the selected runtime.
 """
-compiler_version() = CUDA_Compiler_jll.cuda_version
+compiler_version() = CUDA_Compiler.cuda_version


 ## helpers

CUDACore/src/CUDACore.jl

Lines changed: 9 additions & 7 deletions
@@ -31,15 +31,8 @@ using LLVMLoopInfo

 using CUDA_Driver_jll

-using CUDA_Compiler_jll
-
 import CUDA_Runtime_jll
 const local_toolkit = CUDA_Runtime_jll.host_platform["cuda_local"] == "true"
-const toolkit_version = if CUDA_Runtime_jll.host_platform["cuda"] == "none"
-    nothing
-else
-    parse(VersionNumber, CUDA_Runtime_jll.host_platform["cuda"])
-end
 if local_toolkit
     using CUDA_Runtime_Discovery
     const CUDA_Runtime = CUDA_Runtime_Discovery
@@ -49,6 +42,15 @@ else
 end

 import Preferences
+const local_compiler = Preferences.@load_preference("local_compiler", "false") == "true"
+
+if local_compiler
+    using CUDA_Runtime_Discovery
+    const CUDA_Compiler = CUDA_Runtime_Discovery
+else
+    import CUDA_Compiler_jll
+    const CUDA_Compiler = CUDA_Compiler_jll
+end

 using Libdl


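Since `local_compiler` is read with `Preferences.@load_preference` at module load time, it has to be set before CUDACore is loaded (or the package reloaded afterwards). A minimal sketch of opting in, assuming the semantics shown above, where the stored value is compared as the string "true":

using Preferences
import CUDACore

# writes LocalPreferences.toml; takes effect the next time CUDACore loads,
# switching CUDA_Compiler from CUDA_Compiler_jll to CUDA_Runtime_Discovery
Preferences.set_preferences!(CUDACore, "local_compiler" => "true"; force=true)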
CUDACore/src/compiler/compilation.jl

Lines changed: 16 additions & 11 deletions
@@ -32,7 +32,7 @@ function GPUCompiler.link_libraries!(@nospecialize(job::CUDACompilerJob), mod::L
         return
     end

-    lib = parse(LLVM.Module, read(libdevice))
+    lib = parse(LLVM.Module, read(CUDA_Compiler.libdevice))

     # override libdevice's triple and datalayout to avoid warnings
     triple!(lib, triple(mod))
@@ -122,6 +122,14 @@ function GPUCompiler.finish_module!(@nospecialize(job::CUDACompilerJob),
     return entry
 end

+# stamp `.version` with the ISA we want `ptxas` to validate against
+# and `.target` with the arch that `--gpu-name` will use
+function rewrite_ptx_header(asm, ptx, cap)
+    return replace(asm,
+        r"(\.version .+)" => ".version $(ptx.major).$(ptx.minor)",
+        r"\.target sm_\d+\w*" => ".target sm_$(cap.major)$(cap.minor)")
+end
+
 function GPUCompiler.mcgen(@nospecialize(job::CUDACompilerJob), mod::LLVM.Module, format)
     @assert format == LLVM.API.LLVMAssemblyFile
     asm = invoke(GPUCompiler.mcgen,
@@ -142,15 +150,12 @@ function GPUCompiler.mcgen(@nospecialize(job::CUDACompilerJob), mod::LLVM.Module
         asm = replace(asm, r"(\.target .+), debug" => s"\1")
     end

-    # if LLVM couldn't target the requested PTX ISA, bump it in the assembly.
-    if job.config.target.ptx != job.config.params.ptx
-        ptx = job.config.params.ptx
-        asm = replace(asm, r"(\.version .+)" => ".version $(ptx.major).$(ptx.minor)")
+    (; ptx, cap) = job.config.params
+    if job.config.target.ptx != ptx || job.config.target.cap != cap
+        asm = rewrite_ptx_header(asm, ptx, cap)
     end

-    # no need to bump the `.target` directive; we can do that by passing `-arch` to `ptxas`
-
-    asm
+    return asm
 end


@@ -339,7 +344,7 @@ function compile(@nospecialize(job::CompilerJob))
         "--output-file", ptxas_output,
         ptx_input
     ])
-    proc, log = run_and_collect(`$(ptxas()) $ptxas_opts`)
+    proc, log = run_and_collect(`$(CUDA_Compiler.ptxas()) $ptxas_opts`)
     log = strip(log)
     if !success(proc)
         reason = proc.termsignal > 0 ? "ptxas received signal $(proc.termsignal)" :
@@ -370,12 +375,12 @@ function compile(@nospecialize(job::CompilerJob))
     append!(nvlink_opts, [
         "--verbose", "--extra-warnings",
         "--arch", arch,
-        "--library-path", dirname(libcudadevrt),
+        "--library-path", dirname(CUDA_Compiler.libcudadevrt),
         "--library", "cudadevrt",
         "--output-file", nvlink_output,
         ptxas_output
     ])
-    proc, log = run_and_collect(`$(nvlink()) $nvlink_opts`)
+    proc, log = run_and_collect(`$(CUDA_Compiler.nvlink()) $nvlink_opts`)
     log = strip(log)
     if !success(proc)
         reason = proc.termsignal > 0 ? "nvlink received signal $(proc.termsignal)" :

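Since rewrite_ptx_header is a plain textual rewrite, its effect is easy to demonstrate on a fabricated PTX header; the input string and version numbers below are made up for illustration:

asm = """
.version 8.3
.target sm_70
.address_size 64
"""

ptx = v"8.5"   # ISA that ptxas should validate against
cap = v"9.0"   # architecture that --gpu-name will use

# the same two regex substitutions as rewrite_ptx_header above
asm = replace(asm,
    r"(\.version .+)" => ".version $(ptx.major).$(ptx.minor)",
    r"\.target sm_\d+\w*" => ".target sm_$(cap.major)$(cap.minor)")

print(asm)
# .version 8.5
# .target sm_90
# .address_size 64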
CUDACore/src/compiler/execution.jl

Lines changed: 84 additions & 12 deletions
@@ -2,11 +2,67 @@

 export @cuda, cudaconvert, cufunction, dynamic_cufunction, nextwarp, prevwarp
 @public maxthreads, registers, memory, version, KernelAdaptor
+@public AbstractBackend, LLVMBackend, DefaultBackend, kernel_convert, kernel_compile
+
+
+## backend dispatch
+
+"""
+    AbstractBackend
+
+Abstract supertype for `@cuda` backend dispatch. The default backend is
+[`LLVMBackend`](@ref), which compiles SIMT/PTX kernels via
+[`cufunction`](@ref). Other backends (e.g. Tile IR via cuTile.jl) register
+a subtype and define methods for [`kernel_convert`](@ref) and
+[`kernel_compile`](@ref); `@cuda backend=...` then routes through them.
+
+`@cuda backend=...` accepts either an `AbstractBackend` instance or a
+module that defines `DefaultBackend()` returning one (e.g.
+`@cuda backend=cuTile ...` resolves to `cuTile.DefaultBackend()`).
+"""
+abstract type AbstractBackend end
+
+"""
+    LLVMBackend()
+
+Default `@cuda` backend. Compiles SIMT/PTX kernels via [`cufunction`](@ref)
+and converts arguments via [`cudaconvert`](@ref).
+"""
+struct LLVMBackend <: AbstractBackend end
+
+"""
+    DefaultBackend()
+
+Returns the default `@cuda` backend for this module ([`LLVMBackend`](@ref)).
+This makes `@cuda backend=CUDA ...` (or `backend=CUDACore`) resolve to
+[`LLVMBackend`](@ref), mirroring the convention used by other backend
+packages (e.g. `@cuda backend=cuTile ...` resolves to `cuTile.DefaultBackend()`).
+"""
+DefaultBackend() = LLVMBackend()
+
+"""
+    kernel_convert(backend, x)
+
+Convert a host-side launch argument to its kernel-side form. The default
+implementation for [`LLVMBackend`](@ref) forwards to [`cudaconvert`](@ref);
+other backends override to produce backend-specific argument types.
+"""
+kernel_convert(::LLVMBackend, x) = cudaconvert(x)
+
+"""
+    kernel_compile(backend, f, tt::Type{<:Tuple}; kwargs...) -> AbstractKernel
+
+Compile a function for the given backend. Returns an [`AbstractKernel`](@ref)
+callable as `kernel(args...; launch_kwargs...)` to launch on the GPU. The
+default implementation for [`LLVMBackend`](@ref) is [`cufunction`](@ref).
+"""
+kernel_compile(::LLVMBackend, f::F, tt::TT=Tuple{}; kwargs...) where {F,TT} =
+    cufunction(f, tt; kwargs...)


 ## high-level @cuda interface

-const MACRO_KWARGS = [:dynamic, :launch]
+const MACRO_KWARGS = [:dynamic, :launch, :backend]
 const COMPILER_KWARGS = [:kernel, :name, :always_inline, :minthreads, :maxthreads, :blocks_per_sm, :maxregs, :fastmath, :cap, :ptx]
 const LAUNCH_KWARGS = [:cooperative, :blocks, :threads, :clustersize, :shmem, :stream]

@@ -24,6 +80,10 @@ Several keyword arguments are supported that influence the behavior of `@cuda`.
 - `launch`: whether to launch this kernel, defaults to `true`. If `false` the returned
   kernel object should be launched by calling it and passing arguments again.
 - `dynamic`: use dynamic parallelism to launch device-side kernels, defaults to `false`.
+- `backend`: which compiler backend to use, defaults to [`LLVMBackend`](@ref). Either an
+  [`AbstractBackend`](@ref) instance or a module that defines `DefaultBackend()` (e.g.
+  `backend=CUDA` resolves to `CUDA.DefaultBackend()`). Backend-specific compiler kwargs
+  not recognized by `@cuda` itself are forwarded to [`kernel_compile`](@ref).
 - arguments that influence kernel compilation: see [`cufunction`](@ref) and
   [`dynamic_cufunction`](@ref)
 - arguments that influence kernel launch: see [`CUDACore.HostKernel`](@ref) and
@@ -50,17 +110,16 @@ macro cuda(ex...)
     code = quote end
     vars, var_exprs = assign_args!(code, args)

-    # group keyword argument
+    # group keyword argument. Backend-specific compiler kwargs land in
+    # `other_kwargs` and are forwarded to `kernel_compile`; the backend
+    # validates them.
     macro_kwargs, compiler_kwargs, call_kwargs, other_kwargs =
         split_kwargs(kwargs, MACRO_KWARGS, COMPILER_KWARGS, LAUNCH_KWARGS)
-    if !isempty(other_kwargs)
-        key,val = first(other_kwargs).args
-        throw(ArgumentError("Unsupported keyword argument '$key'"))
-    end

     # handle keyword arguments that influence the macro's behavior
     dynamic = false
     launch = true
+    backend_expr = :($LLVMBackend())
     for kwarg in macro_kwargs
         key::Symbol, val = kwarg.args
         if key === :dynamic
@@ -69,6 +128,8 @@ macro cuda(ex...)
         elseif key === :launch
             isa(val, Bool) || throw(ArgumentError("`launch` keyword argument to @cuda should be a constant value"))
             launch = val::Bool
+        elseif key === :backend
+            backend_expr = val
         else
             throw(ArgumentError("Unsupported keyword argument '$key'"))
         end
@@ -79,12 +140,14 @@ macro cuda(ex...)

     # FIXME: macro hygiene wrt. escaping kwarg values (this broke with 1.5)
     #        we esc() the whole thing now, necessitating gensyms...
-    @gensym f_var kernel_f kernel_args kernel_tt kernel
+    @gensym f_var kernel_f kernel_args kernel_tt kernel backend backend_raw
     if dynamic
         # FIXME: we could probably somehow support kwargs with constant values by either
         #        saving them in a global Dict here, or trying to pick them up from the Julia
         #        IR when processing the dynamic parallelism marker
         isempty(compiler_kwargs) || error("@cuda dynamic parallelism does not support compiler keyword arguments")
+        isempty(other_kwargs) ||
+            error("@cuda dynamic parallelism does not support backend-specific compiler keyword arguments")

         # dynamic, device-side kernel launch
         push!(code.args,
@@ -105,12 +168,19 @@ macro cuda(ex...)
         # while keeping the original arguments alive
         push!(code.args,
             quote
+                # Accept either an `AbstractBackend` instance or a module
+                # providing `DefaultBackend()` (e.g. `backend=cuTile`).
+                # Inference folds the branch away on concretely-typed inputs.
+                $backend = let $backend_raw = $backend_expr
+                    $backend_raw isa $AbstractBackend ? $backend_raw : $backend_raw.DefaultBackend()
+                end
                 $f_var = $f
                 GC.@preserve $(vars...) $f_var begin
-                    $kernel_f = $cudaconvert($f_var)
-                    $kernel_args = map($cudaconvert, ($(var_exprs...),))
+                    $kernel_f = $kernel_convert($backend, $f_var)
+                    $kernel_args = map(x -> $kernel_convert($backend, x), ($(var_exprs...),))
                     $kernel_tt = Tuple{map(Core.Typeof, $kernel_args)...}
-                    $kernel = $cufunction($kernel_f, $kernel_tt; $(compiler_kwargs...))
+                    $kernel = $kernel_compile($backend, $kernel_f, $kernel_tt;
+                                              $(compiler_kwargs...), $(other_kwargs...))
                     if $launch
                         $kernel($kernel_args...; $(call_kwargs...), convert=Val(false))
                     end
@@ -239,10 +309,12 @@ The following keyword arguments are supported:
 AbstractKernel

 function Base.show(io::IO, k::AbstractKernel{F,TT}) where {F,TT}
-    print(io, "CUDACore.$(nameof(typeof(k)))($(k.f))")
+    T = typeof(k)
+    print(io, "$(parentmodule(T)).$(nameof(T))($(k.f))")
 end
 function Base.show(io::IO, ::MIME"text/plain", k::AbstractKernel{F,TT}) where {F,TT}
-    print(io, "CUDACore.$(nameof(typeof(k))) for $(k.f)($(join(TT.parameters, ", ")))")
+    T = typeof(k)
+    print(io, "$(parentmodule(T)).$(nameof(T)) for $(k.f)($(join(TT.parameters, ", ")))")
 end

 @inline @generated function (kernel::AbstractKernel{F,TT})(args::Vararg{Any,N};

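The docstrings in execution.jl spell out the extension contract; below is a minimal sketch of what a third-party backend package would implement. All names here (MyTileBackend, MyBackend) are invented for illustration, and the kernel_compile body simply defers to cufunction rather than doing real backend-specific lowering:

module MyTileBackend

import CUDACore

struct MyBackend <: CUDACore.AbstractBackend end

# lets `@cuda backend=MyTileBackend ...` resolve to an instance
DefaultBackend() = MyBackend()

# backend-specific argument conversion; this sketch just reuses cudaconvert
CUDACore.kernel_convert(::MyBackend, x) = CUDACore.cudaconvert(x)

# backend-specific compilation; must return a kernel object callable as
# `kernel(args...; launch_kwargs...)`
function CUDACore.kernel_compile(::MyBackend, f::F, tt::TT = Tuple{};
                                 kwargs...) where {F, TT}
    # a real backend would lower `f` through its own compiler here
    return CUDACore.cufunction(f, tt; kwargs...)
end

end # module

# usage, assuming a device function `my_kernel`:
#   @cuda backend=MyTileBackend threads=256 my_kernel(a, b)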