From b98c518fe1812939f8511c8325480a404de2e761 Mon Sep 17 00:00:00 2001 From: AntonOresten Date: Thu, 30 Apr 2026 09:54:32 +0000 Subject: [PATCH 1/5] Add architecture- and family-specific PTX target support --- CUDACore/src/compatibility.jl | 56 +++++++++++++++++----------- CUDACore/src/compiler/compilation.jl | 38 +++++++++++++++---- test/core/codegen.jl | 32 ++++++++++++++++ 3 files changed, 96 insertions(+), 30 deletions(-) diff --git a/CUDACore/src/compatibility.jl b/CUDACore/src/compatibility.jl index 7fd60d8723..e10899ce20 100644 --- a/CUDACore/src/compatibility.jl +++ b/CUDACore/src/compatibility.jl @@ -1,14 +1,43 @@ # compatibility of Julia, CUDA and LLVM -# NOTE: Target architectures with suffix “a”, such as sm_90a, include -# architecture-accelerated features that are supported on the specified architecture only, -# hence such targets do not follow the onion layer model. Therefore, PTX code generated for -# such targets cannot be run on later generation devices. Architecture-accelerated features -# can only be used with targets that support these features. +# PTX compilation targets come in three feature-set flavors, selected via the suffix on the +# `.target` directive (and the matching `--gpu-name` to ptxas): +# +# - Baseline (no suffix, e.g. sm_90): the forward-compatible feature set. Code compiled +# for sm_X runs on any sm_Y with Y >= X (onion model). +# - Family (`f` suffix, e.g. sm_100f): a superset of Baseline. Same-major-family-portable; +# code compiled for sm_100f runs on sm_100, sm_103, etc., but not across families. +# Introduced with CC 10.0; requires PTX >= 8.8 regardless of cap. +# - Architectural (`a` suffix, e.g. sm_90a): a strict superset of Family. Locked to one +# exact CC; code compiled for sm_90a runs only on CC 9.0 devices. Introduced with +# CC 9.0; uses the same PTX requirement as the plain target. +# +# baseline ⊆ family ⊆ architectural. Architectural unlocks the full PTX surface (wgmma, +# tcgen05, FP4/MXFP cvt, TMA, setmaxnreg, …); family unlocks the subset shared across +# devices in the same major family; baseline unlocks only the forward-portable set. const lowest = v"0" const highest = v"999" +# PTX compilation target feature set; see top-of-file note for the hierarchy and rules. +@enum PTXTargetKind Baseline Family Architectural + +# Validate that `kind` is reachable at the requested `cap`/`ptx`. The cap floors and the +# kind PTX floors are uniform across caps, so we encode them here rather than in the +# per-cap tables (which would just repeat the same rule for every entry). The `a` syntax +# was introduced in PTX 8.0; the `f` syntax in PTX 8.8. +function validate_target_kind(cap::VersionNumber, ptx::VersionNumber, kind::PTXTargetKind) + if kind === Architectural + cap >= v"9.0" || error("Architectural targets require compute capability >= 9.0; got $cap") + ptx >= v"8.0" || error("Architectural targets require PTX ISA >= 8.0; got $ptx") + end + if kind === Family + cap >= v"10.0" || error("Family targets require compute capability >= 10.0; got $cap") + ptx >= v"8.8" || error("Family targets require PTX ISA >= 8.8; got $ptx") + end + return +end + ## version range @@ -163,22 +192,11 @@ const ptx_cap_db = Dict( v"8.7" => between(v"7.4", highest), v"8.9" => between(v"7.8", highest), v"9.0" => between(v"7.8", highest), - #v"9.0a" => between(v"8.0", highest) v"10.0" => between(v"8.6", highest), - #v"10.0a"=> between(v"8.6", highest), - #v"10.0f"=> between(v"8.8", highest), v"10.1" => between(v"8.6", highest), - #v"10.1a"=> between(v"8.6", highest), - #v"10.1f"=> between(v"8.8", highest), v"10.3" => between(v"8.8", highest), - #v"10.3a"=> between(v"8.8", highest), - #v"10.3f"=> between(v"8.8", highest), v"12.0" => between(v"8.7", highest), - #v"12.0a"=> between(v"8.7", highest), - #v"12.0f"=> between(v"8.8", highest), v"12.1" => between(v"8.8", highest), - #v"12.1a"=> between(v"8.8", highest), - #v"12.1f"=> between(v"8.8", highest), ) function ptx_cap_support(ver::VersionNumber) @@ -216,17 +234,11 @@ const llvm_cap_db = Dict( v"8.7" => between(v"16", highest), v"8.9" => between(v"16", highest), v"9.0" => between(v"16", highest), - #v"9.0a" => between(v"18", highest), v"10.0" => between(v"20", highest), - #v"10.0a"=> between(v"20", highest), v"10.1" => between(v"20", highest), - #v"10.1a"=> between(v"20", highest), v"10.3" => between(v"21", highest), - #v"10.3a"=> between(v"21", highest), v"12.0" => between(v"20", highest), - #v"12.0a"=> between(v"20", highest), v"12.1" => between(v"21", highest), - #v"12.1a"=> between(v"21", highest), ) function llvm_cap_support(ver::VersionNumber) diff --git a/CUDACore/src/compiler/compilation.jl b/CUDACore/src/compiler/compilation.jl index b5118f5765..ff634572ec 100644 --- a/CUDACore/src/compiler/compilation.jl +++ b/CUDACore/src/compiler/compilation.jl @@ -3,15 +3,27 @@ Base.@kwdef struct CUDACompilerParams <: AbstractCompilerParams cap::VersionNumber ptx::VersionNumber + kind::PTXTargetKind = Baseline end function Base.hash(params::CUDACompilerParams, h::UInt) h = hash(params.cap, h) h = hash(params.ptx, h) + h = hash(params.kind, h) return h end +# Format a `(cap, kind)` tuple as the `sm_NNN[a|f]` string used by both the `.target` +# directive and the `--gpu-name` flag. The two must agree on suffix for `kind=Architectural` +# (ptxas requires exact match) and need to be in the same major family for `kind=Family`; +# emitting the same string on both sides handles all three kinds correctly. +function format_target(cap::VersionNumber, kind::PTXTargetKind) + suffix = kind === Architectural ? "a" : + kind === Family ? "f" : "" + return "sm_$(cap.major)$(cap.minor)$suffix" +end + const CUDACompilerConfig = CompilerConfig{PTXCompilerTarget, CUDACompilerParams} const CUDACompilerJob = CompilerJob{PTXCompilerTarget,CUDACompilerParams} @@ -124,10 +136,10 @@ end # stamp `.version` with the ISA we want `ptxas` to validate against # and `.target` with the arch that `--gpu-name` will use -function rewrite_ptx_header(asm, ptx, cap) +function rewrite_ptx_header(asm, ptx, cap, kind=Baseline) return replace(asm, r"(\.version .+)" => ".version $(ptx.major).$(ptx.minor)", - r"\.target sm_\d+\w*" => ".target sm_$(cap.major)$(cap.minor)") + r"\.target sm_\d+\w*" => ".target $(format_target(cap, kind))") end function GPUCompiler.mcgen(@nospecialize(job::CUDACompilerJob), mod::LLVM.Module, format) @@ -150,9 +162,9 @@ function GPUCompiler.mcgen(@nospecialize(job::CUDACompilerJob), mod::LLVM.Module asm = replace(asm, r"(\.target .+), debug" => s"\1") end - (; ptx, cap) = job.config.params - if job.config.target.ptx != ptx || job.config.target.cap != cap - asm = rewrite_ptx_header(asm, ptx, cap) + (; ptx, cap, kind) = job.config.params + if job.config.target.ptx != ptx || job.config.target.cap != cap || kind !== Baseline + asm = rewrite_ptx_header(asm, ptx, cap, kind) end return asm @@ -184,7 +196,7 @@ function compiler_config(dev; kwargs...) return config end @noinline function _compiler_config(dev; kernel=true, name=nothing, always_inline=false, - cap=nothing, ptx=nothing, kwargs...) + cap=nothing, ptx=nothing, kind=nothing, kwargs...) # determine the toolchain llvm_support = llvm_compat() cuda_support = cuda_compat() @@ -241,9 +253,18 @@ end # NVIDIA bug #3600554: ptxas segfaults with our debug info, fixed in 11.7 debuginfo = runtime_version() >= v"11.7" + # default the target feature set based on the device cap. Architectural is the + # JIT-correct choice on devices where it's available (CC >= 9.0): it's a strict + # superset of Baseline, and the cubin is per-device anyway so portability isn't on + # the table. Pre-Hopper devices have no `a` flavor and stay on Baseline. + if kind === nothing + kind = cuda_cap >= v"9.0" ? Architectural : Baseline + end + validate_target_kind(cuda_cap, cuda_ptx, kind) + # create GPUCompiler objects target = PTXCompilerTarget(; cap=llvm_cap, ptx=llvm_ptx, debuginfo, kwargs...) - params = CUDACompilerParams(; cap=cuda_cap, ptx=cuda_ptx) + params = CUDACompilerParams(; cap=cuda_cap, ptx=cuda_ptx, kind) CompilerConfig(target, params; kernel, name, always_inline) end @@ -280,7 +301,8 @@ function compile(@nospecialize(job::CompilerJob)) ptx = job.config.params.ptx cap = job.config.params.cap - arch = "sm_$(cap.major)$(cap.minor)" + kind = job.config.params.kind + arch = format_target(cap, kind) # validate use of parameter memory argtypes = filter([KernelState, job.source.specTypes.parameters...]) do dt diff --git a/test/core/codegen.jl b/test/core/codegen.jl index 02f275aa7d..f813197b46 100644 --- a/test/core/codegen.jl +++ b/test/core/codegen.jl @@ -259,6 +259,38 @@ end @test occursin(".target sm_90", asm_post) @test success(run_ptxas(asm_post, "sm_90")) + + # Architectural kind appends an `a` suffix to the .target directive (and the same + # string is what `compile()` passes to --gpu-name, since ptxas requires exact match + # for `a`-mode). + asm_arch = CUDACore.rewrite_ptx_header(asm_pre, v"8.0", v"9.0", CUDACore.Architectural) + @test occursin(".target sm_90a", asm_arch) + @test success(run_ptxas(asm_arch, "sm_90a")) + + # Family kind appends `f`. Requires PTX 8.8+ at the `.target` line. + asm_family = CUDACore.rewrite_ptx_header(asm_pre, v"8.8", v"10.0", CUDACore.Family) + @test occursin(".target sm_100f", asm_family) + @test success(run_ptxas(asm_family, "sm_100f")) +end + +@testset "CUDACompilerParams hash discriminates on kind" begin + # Without `kind` in the hash, two params differing only on kind would collide in + # the compiler cache and silently return a cubin compiled for the wrong feature set. + base = CUDACore.CUDACompilerParams(cap=v"9.0", ptx=v"8.0", kind=CUDACore.Baseline) + arch = CUDACore.CUDACompilerParams(cap=v"9.0", ptx=v"8.0", kind=CUDACore.Architectural) + @test hash(base) != hash(arch) + @test base != arch +end + +@testset "validate_target_kind" begin + # Architectural needs CC >= 9.0 and PTX >= 8.0; Family needs CC >= 10.0 and PTX >= 8.8. + @test_throws ErrorException CUDACore.validate_target_kind(v"8.6", v"8.0", CUDACore.Architectural) + @test_throws ErrorException CUDACore.validate_target_kind(v"9.0", v"7.8", CUDACore.Architectural) + @test_throws ErrorException CUDACore.validate_target_kind(v"9.0", v"8.0", CUDACore.Family) + @test_throws ErrorException CUDACore.validate_target_kind(v"10.0", v"8.7", CUDACore.Family) + @test CUDACore.validate_target_kind(v"9.0", v"8.0", CUDACore.Architectural) === nothing + @test CUDACore.validate_target_kind(v"10.0", v"8.8", CUDACore.Family) === nothing + @test CUDACore.validate_target_kind(v"5.0", v"6.2", CUDACore.Baseline) === nothing end end From 9fd5464d0d519d2674f71719b335b6cce7428503 Mon Sep 17 00:00:00 2001 From: AntonOresten Date: Thu, 30 Apr 2026 10:24:39 +0000 Subject: [PATCH 2/5] Remove `kind` compiler config kwarg --- CUDACore/src/compiler/compilation.jl | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/CUDACore/src/compiler/compilation.jl b/CUDACore/src/compiler/compilation.jl index ff634572ec..b3f6f8dec0 100644 --- a/CUDACore/src/compiler/compilation.jl +++ b/CUDACore/src/compiler/compilation.jl @@ -196,7 +196,7 @@ function compiler_config(dev; kwargs...) return config end @noinline function _compiler_config(dev; kernel=true, name=nothing, always_inline=false, - cap=nothing, ptx=nothing, kind=nothing, kwargs...) + cap=nothing, ptx=nothing, kwargs...) # determine the toolchain llvm_support = llvm_compat() cuda_support = cuda_compat() @@ -253,14 +253,11 @@ end # NVIDIA bug #3600554: ptxas segfaults with our debug info, fixed in 11.7 debuginfo = runtime_version() >= v"11.7" - # default the target feature set based on the device cap. Architectural is the + # pick the target feature set based on the device cap. Architectural is the # JIT-correct choice on devices where it's available (CC >= 9.0): it's a strict # superset of Baseline, and the cubin is per-device anyway so portability isn't on # the table. Pre-Hopper devices have no `a` flavor and stay on Baseline. - if kind === nothing - kind = cuda_cap >= v"9.0" ? Architectural : Baseline - end - validate_target_kind(cuda_cap, cuda_ptx, kind) + kind = cuda_cap >= v"9.0" ? Architectural : Baseline # create GPUCompiler objects target = PTXCompilerTarget(; cap=llvm_cap, ptx=llvm_ptx, debuginfo, kwargs...) From 494bd937ee5a6b0bd46dac932e2559299fba33c3 Mon Sep 17 00:00:00 2001 From: AntonOresten Date: Thu, 30 Apr 2026 19:28:09 +0000 Subject: [PATCH 3/5] Add `feature_set` compiler config kwarg; use symbols instead of enum type --- CUDACore/src/compatibility.jl | 38 +++++++----------- CUDACore/src/compiler/compilation.jl | 58 ++++++++++++++++------------ CUDACore/src/compiler/execution.jl | 4 +- test/core/codegen.jl | 42 ++++++++++---------- 4 files changed, 72 insertions(+), 70 deletions(-) diff --git a/CUDACore/src/compatibility.jl b/CUDACore/src/compatibility.jl index e10899ce20..e8386b49d4 100644 --- a/CUDACore/src/compatibility.jl +++ b/CUDACore/src/compatibility.jl @@ -1,5 +1,9 @@ # compatibility of Julia, CUDA and LLVM +const lowest = v"0" +const highest = v"999" + + # PTX compilation targets come in three feature-set flavors, selected via the suffix on the # `.target` directive (and the matching `--gpu-name` to ptxas): # @@ -8,32 +12,20 @@ # - Family (`f` suffix, e.g. sm_100f): a superset of Baseline. Same-major-family-portable; # code compiled for sm_100f runs on sm_100, sm_103, etc., but not across families. # Introduced with CC 10.0; requires PTX >= 8.8 regardless of cap. -# - Architectural (`a` suffix, e.g. sm_90a): a strict superset of Family. Locked to one -# exact CC; code compiled for sm_90a runs only on CC 9.0 devices. Introduced with +# - Architecture (`a` suffix, e.g. sm_90a): a superset of Family. Locked to one +# exact CC; code compiled for sm_103a runs only on CC 10.3 devices. Introduced with # CC 9.0; uses the same PTX requirement as the plain target. # -# baseline ⊆ family ⊆ architectural. Architectural unlocks the full PTX surface (wgmma, -# tcgen05, FP4/MXFP cvt, TMA, setmaxnreg, …); family unlocks the subset shared across -# devices in the same major family; baseline unlocks only the forward-portable set. - -const lowest = v"0" -const highest = v"999" - -# PTX compilation target feature set; see top-of-file note for the hierarchy and rules. -@enum PTXTargetKind Baseline Family Architectural - -# Validate that `kind` is reachable at the requested `cap`/`ptx`. The cap floors and the -# kind PTX floors are uniform across caps, so we encode them here rather than in the -# per-cap tables (which would just repeat the same rule for every entry). The `a` syntax -# was introduced in PTX 8.0; the `f` syntax in PTX 8.8. -function validate_target_kind(cap::VersionNumber, ptx::VersionNumber, kind::PTXTargetKind) - if kind === Architectural - cap >= v"9.0" || error("Architectural targets require compute capability >= 9.0; got $cap") - ptx >= v"8.0" || error("Architectural targets require PTX ISA >= 8.0; got $ptx") +function validate_feature_set(cap::VersionNumber, ptx::VersionNumber, feature_set::Symbol) + if !(feature_set in (:baseline, :family, :architecture)) + error("feature_set must be one of :baseline, :family, :architecture; got $(repr(feature_set))") end - if kind === Family - cap >= v"10.0" || error("Family targets require compute capability >= 10.0; got $cap") - ptx >= v"8.8" || error("Family targets require PTX ISA >= 8.8; got $ptx") + if feature_set === :architecture + cap >= v"9.0" || error("Architecture-specific targets require compute capability >= 9.0; got $cap") + ptx >= v"8.0" || error("Architecture-specific targets require PTX ISA >= 8.0; got $ptx") + elseif feature_set === :family + cap >= v"10.0" || error("Family-specific targets require compute capability >= 10.0; got $cap") + ptx >= v"8.8" || error("Family-specific targets require PTX ISA >= 8.8; got $ptx") end return end diff --git a/CUDACore/src/compiler/compilation.jl b/CUDACore/src/compiler/compilation.jl index b3f6f8dec0..f53dded26c 100644 --- a/CUDACore/src/compiler/compilation.jl +++ b/CUDACore/src/compiler/compilation.jl @@ -3,24 +3,24 @@ Base.@kwdef struct CUDACompilerParams <: AbstractCompilerParams cap::VersionNumber ptx::VersionNumber - kind::PTXTargetKind = Baseline + feature_set::Symbol = :baseline end function Base.hash(params::CUDACompilerParams, h::UInt) h = hash(params.cap, h) h = hash(params.ptx, h) - h = hash(params.kind, h) + h = hash(params.feature_set, h) return h end -# Format a `(cap, kind)` tuple as the `sm_NNN[a|f]` string used by both the `.target` -# directive and the `--gpu-name` flag. The two must agree on suffix for `kind=Architectural` -# (ptxas requires exact match) and need to be in the same major family for `kind=Family`; -# emitting the same string on both sides handles all three kinds correctly. -function format_target(cap::VersionNumber, kind::PTXTargetKind) - suffix = kind === Architectural ? "a" : - kind === Family ? "f" : "" +# Format a `(cap, feature_set)` tuple as the `sm_NNN[a|f]` string used by both the `.target` +# directive and the `--gpu-name` flag. The two must agree on suffix for `feature_set=:architecture` +# (ptxas requires exact match) and need to be in the same major family for `feature_set=:family`; +# emitting the same string on both sides handles all three feature sets correctly. +function format_target(cap::VersionNumber, feature_set::Symbol) + suffix = feature_set === :architecture ? "a" : + feature_set === :family ? "f" : "" return "sm_$(cap.major)$(cap.minor)$suffix" end @@ -136,10 +136,10 @@ end # stamp `.version` with the ISA we want `ptxas` to validate against # and `.target` with the arch that `--gpu-name` will use -function rewrite_ptx_header(asm, ptx, cap, kind=Baseline) +function rewrite_ptx_header(asm, ptx, cap, feature_set) return replace(asm, r"(\.version .+)" => ".version $(ptx.major).$(ptx.minor)", - r"\.target sm_\d+\w*" => ".target $(format_target(cap, kind))") + r"\.target sm_\d+\w*" => ".target $(format_target(cap, feature_set))") end function GPUCompiler.mcgen(@nospecialize(job::CUDACompilerJob), mod::LLVM.Module, format) @@ -162,9 +162,12 @@ function GPUCompiler.mcgen(@nospecialize(job::CUDACompilerJob), mod::LLVM.Module asm = replace(asm, r"(\.target .+), debug" => s"\1") end - (; ptx, cap, kind) = job.config.params - if job.config.target.ptx != ptx || job.config.target.cap != cap || kind !== Baseline - asm = rewrite_ptx_header(asm, ptx, cap, kind) + (; ptx, cap, feature_set) = job.config.params + needs_rewrite = job.config.target.ptx != ptx || + job.config.target.cap != cap || + feature_set !== :baseline + if needs_rewrite + asm = rewrite_ptx_header(asm, ptx, cap, feature_set) end return asm @@ -196,7 +199,7 @@ function compiler_config(dev; kwargs...) return config end @noinline function _compiler_config(dev; kernel=true, name=nothing, always_inline=false, - cap=nothing, ptx=nothing, kwargs...) + cap=nothing, ptx=nothing, feature_set=nothing, kwargs...) # determine the toolchain llvm_support = llvm_compat() cuda_support = cuda_compat() @@ -253,15 +256,22 @@ end # NVIDIA bug #3600554: ptxas segfaults with our debug info, fixed in 11.7 debuginfo = runtime_version() >= v"11.7" - # pick the target feature set based on the device cap. Architectural is the - # JIT-correct choice on devices where it's available (CC >= 9.0): it's a strict - # superset of Baseline, and the cubin is per-device anyway so portability isn't on - # the table. Pre-Hopper devices have no `a` flavor and stay on Baseline. - kind = cuda_cap >= v"9.0" ? Architectural : Baseline + # Pick the target feature set based on the device cap. + # Architecture-specific is chosen for devices where it's + # available (CC >= 9.0) since it's a strict superset of + # the baseline and family feature sets. + if feature_set === nothing + feature_set = if cuda_cap >= v"9.0" && cuda_ptx >= v"8.0" + :architecture + else + :baseline + end + end + validate_feature_set(cuda_cap, cuda_ptx, feature_set) # create GPUCompiler objects target = PTXCompilerTarget(; cap=llvm_cap, ptx=llvm_ptx, debuginfo, kwargs...) - params = CUDACompilerParams(; cap=cuda_cap, ptx=cuda_ptx, kind) + params = CUDACompilerParams(; cap=cuda_cap, ptx=cuda_ptx, feature_set) CompilerConfig(target, params; kernel, name, always_inline) end @@ -296,10 +306,8 @@ function compile(@nospecialize(job::CompilerJob)) push!(ptxas_opts, "--compile-only") end - ptx = job.config.params.ptx - cap = job.config.params.cap - kind = job.config.params.kind - arch = format_target(cap, kind) + (; ptx, cap, feature_set) = job.config.params + arch = format_target(cap, feature_set) # validate use of parameter memory argtypes = filter([KernelState, job.source.specTypes.parameters...]) do dt diff --git a/CUDACore/src/compiler/execution.jl b/CUDACore/src/compiler/execution.jl index 130d049e7c..7cfe96e1a9 100644 --- a/CUDACore/src/compiler/execution.jl +++ b/CUDACore/src/compiler/execution.jl @@ -63,7 +63,7 @@ kernel_compile(::LLVMBackend, f::F, tt::TT=Tuple{}; kwargs...) where {F,TT} = ## high-level @cuda interface const MACRO_KWARGS = [:dynamic, :launch, :backend] -const COMPILER_KWARGS = [:kernel, :name, :always_inline, :minthreads, :maxthreads, :blocks_per_sm, :maxregs, :fastmath, :cap, :ptx] +const COMPILER_KWARGS = [:kernel, :name, :always_inline, :minthreads, :maxthreads, :blocks_per_sm, :maxregs, :fastmath, :cap, :ptx, :feature_set] const LAUNCH_KWARGS = [:cooperative, :blocks, :threads, :clustersize, :shmem, :stream] @@ -434,6 +434,8 @@ The following keyword arguments are supported: - `always_inline`: inline all function calls in the kernel - `fastmath`: use less precise square roots and flush denormals - `cap` and `ptx`: to override the compute capability and PTX version to compile for +- `feature_set`: PTX feature set (`:baseline`, `:family`, or `:architecture`); defaults to the + most specific supported by the device The output of this function is automatically cached, i.e. you can simply call `cufunction` in a hot path without degrading performance. New code will be generated automatically, when diff --git a/test/core/codegen.jl b/test/core/codegen.jl index f813197b46..48af0d92f5 100644 --- a/test/core/codegen.jl +++ b/test/core/codegen.jl @@ -255,42 +255,42 @@ end @test !success(run_ptxas(asm_pre, "sm_75")) - asm_post = CUDACore.rewrite_ptx_header(asm_pre, v"8.0", v"9.0") + asm_post = CUDACore.rewrite_ptx_header(asm_pre, v"8.0", v"9.0", :baseline) @test occursin(".target sm_90", asm_post) @test success(run_ptxas(asm_post, "sm_90")) - # Architectural kind appends an `a` suffix to the .target directive (and the same - # string is what `compile()` passes to --gpu-name, since ptxas requires exact match - # for `a`-mode). - asm_arch = CUDACore.rewrite_ptx_header(asm_pre, v"8.0", v"9.0", CUDACore.Architectural) + # Architecture-specific feature set appends an `a` suffix to the .target directive (and the same + # string is what `compile()` passes to --gpu-name, since ptxas requires exact match for `a`-mode). + asm_arch = CUDACore.rewrite_ptx_header(asm_pre, v"8.0", v"9.0", :architecture) @test occursin(".target sm_90a", asm_arch) @test success(run_ptxas(asm_arch, "sm_90a")) - # Family kind appends `f`. Requires PTX 8.8+ at the `.target` line. - asm_family = CUDACore.rewrite_ptx_header(asm_pre, v"8.8", v"10.0", CUDACore.Family) + # Family-specific appends `f`. Requires PTX 8.8+ at the `.target` line. + asm_family = CUDACore.rewrite_ptx_header(asm_pre, v"8.8", v"10.0", :family) @test occursin(".target sm_100f", asm_family) @test success(run_ptxas(asm_family, "sm_100f")) end -@testset "CUDACompilerParams hash discriminates on kind" begin - # Without `kind` in the hash, two params differing only on kind would collide in - # the compiler cache and silently return a cubin compiled for the wrong feature set. - base = CUDACore.CUDACompilerParams(cap=v"9.0", ptx=v"8.0", kind=CUDACore.Baseline) - arch = CUDACore.CUDACompilerParams(cap=v"9.0", ptx=v"8.0", kind=CUDACore.Architectural) +@testset "CUDACompilerParams hash discriminates on feature_set" begin + # Without feature_set in the hash, two params differing only on feature_set would collide + # in the compiler cache and silently return a cubin compiled for the wrong feature set. + base = CUDACore.CUDACompilerParams(cap=v"9.0", ptx=v"8.0", feature_set=:baseline) + arch = CUDACore.CUDACompilerParams(cap=v"9.0", ptx=v"8.0", feature_set=:architecture) @test hash(base) != hash(arch) @test base != arch end -@testset "validate_target_kind" begin - # Architectural needs CC >= 9.0 and PTX >= 8.0; Family needs CC >= 10.0 and PTX >= 8.8. - @test_throws ErrorException CUDACore.validate_target_kind(v"8.6", v"8.0", CUDACore.Architectural) - @test_throws ErrorException CUDACore.validate_target_kind(v"9.0", v"7.8", CUDACore.Architectural) - @test_throws ErrorException CUDACore.validate_target_kind(v"9.0", v"8.0", CUDACore.Family) - @test_throws ErrorException CUDACore.validate_target_kind(v"10.0", v"8.7", CUDACore.Family) - @test CUDACore.validate_target_kind(v"9.0", v"8.0", CUDACore.Architectural) === nothing - @test CUDACore.validate_target_kind(v"10.0", v"8.8", CUDACore.Family) === nothing - @test CUDACore.validate_target_kind(v"5.0", v"6.2", CUDACore.Baseline) === nothing +@testset "validate_feature_set" begin + # Architecture-specific needs CC >= 9.0 and PTX >= 8.0 + # Family-specific needs CC >= 10.0 and PTX >= 8.8. + @test_throws ErrorException CUDACore.validate_feature_set(v"8.6", v"8.0", :architecture) + @test_throws ErrorException CUDACore.validate_feature_set(v"9.0", v"7.8", :architecture) + @test_throws ErrorException CUDACore.validate_feature_set(v"9.0", v"8.0", :family) + @test_throws ErrorException CUDACore.validate_feature_set(v"10.0", v"8.7", :family) + @test CUDACore.validate_feature_set(v"9.0", v"8.0", :architecture) === nothing + @test CUDACore.validate_feature_set(v"10.0", v"8.8", :family) === nothing + @test CUDACore.validate_feature_set(v"5.0", v"6.2", :baseline) === nothing end end From e8537f65f87ce822d146d975b33680bb245e8244 Mon Sep 17 00:00:00 2001 From: AntonOresten Date: Thu, 30 Apr 2026 19:59:39 +0000 Subject: [PATCH 4/5] Require opt-in through `feature_set` to avoid breaking changes --- CUDACore/src/compiler/compilation.jl | 14 +++----------- CUDACore/src/compiler/execution.jl | 3 +-- 2 files changed, 4 insertions(+), 13 deletions(-) diff --git a/CUDACore/src/compiler/compilation.jl b/CUDACore/src/compiler/compilation.jl index f53dded26c..27b41dabdf 100644 --- a/CUDACore/src/compiler/compilation.jl +++ b/CUDACore/src/compiler/compilation.jl @@ -256,17 +256,9 @@ end # NVIDIA bug #3600554: ptxas segfaults with our debug info, fixed in 11.7 debuginfo = runtime_version() >= v"11.7" - # Pick the target feature set based on the device cap. - # Architecture-specific is chosen for devices where it's - # available (CC >= 9.0) since it's a strict superset of - # the baseline and family feature sets. - if feature_set === nothing - feature_set = if cuda_cap >= v"9.0" && cuda_ptx >= v"8.0" - :architecture - else - :baseline - end - end + # Conservatively pick baseline for backward compatibility, + # requiring explicit opt-in for family- and architecture-specific instructions. + feature_set = something(feature_set, :baseline) validate_feature_set(cuda_cap, cuda_ptx, feature_set) # create GPUCompiler objects diff --git a/CUDACore/src/compiler/execution.jl b/CUDACore/src/compiler/execution.jl index 7cfe96e1a9..e5359e50fc 100644 --- a/CUDACore/src/compiler/execution.jl +++ b/CUDACore/src/compiler/execution.jl @@ -434,8 +434,7 @@ The following keyword arguments are supported: - `always_inline`: inline all function calls in the kernel - `fastmath`: use less precise square roots and flush denormals - `cap` and `ptx`: to override the compute capability and PTX version to compile for -- `feature_set`: PTX feature set (`:baseline`, `:family`, or `:architecture`); defaults to the - most specific supported by the device +- `feature_set`: PTX feature set, one of `:baseline` (default), `:family`, or `:architecture` The output of this function is automatically cached, i.e. you can simply call `cufunction` in a hot path without degrading performance. New code will be generated automatically, when From a5bbd52593d92fbdcf0c7b8f451dc63f5e77c30f Mon Sep 17 00:00:00 2001 From: AntonOresten Date: Fri, 1 May 2026 15:41:37 +0000 Subject: [PATCH 5/5] Shorten `:architecture` to `:arch` --- CUDACore/src/compatibility.jl | 6 +++--- CUDACore/src/compiler/compilation.jl | 4 ++-- CUDACore/src/compiler/execution.jl | 2 +- test/core/codegen.jl | 10 +++++----- 4 files changed, 11 insertions(+), 11 deletions(-) diff --git a/CUDACore/src/compatibility.jl b/CUDACore/src/compatibility.jl index e8386b49d4..2b30a4f843 100644 --- a/CUDACore/src/compatibility.jl +++ b/CUDACore/src/compatibility.jl @@ -17,10 +17,10 @@ const highest = v"999" # CC 9.0; uses the same PTX requirement as the plain target. # function validate_feature_set(cap::VersionNumber, ptx::VersionNumber, feature_set::Symbol) - if !(feature_set in (:baseline, :family, :architecture)) - error("feature_set must be one of :baseline, :family, :architecture; got $(repr(feature_set))") + if !(feature_set in (:baseline, :family, :arch)) + error("feature_set must be one of :baseline, :family, :arch; got $(repr(feature_set))") end - if feature_set === :architecture + if feature_set === :arch cap >= v"9.0" || error("Architecture-specific targets require compute capability >= 9.0; got $cap") ptx >= v"8.0" || error("Architecture-specific targets require PTX ISA >= 8.0; got $ptx") elseif feature_set === :family diff --git a/CUDACore/src/compiler/compilation.jl b/CUDACore/src/compiler/compilation.jl index 27b41dabdf..e4bf547dbf 100644 --- a/CUDACore/src/compiler/compilation.jl +++ b/CUDACore/src/compiler/compilation.jl @@ -15,11 +15,11 @@ function Base.hash(params::CUDACompilerParams, h::UInt) end # Format a `(cap, feature_set)` tuple as the `sm_NNN[a|f]` string used by both the `.target` -# directive and the `--gpu-name` flag. The two must agree on suffix for `feature_set=:architecture` +# directive and the `--gpu-name` flag. The two must agree on suffix for `feature_set=:arch` # (ptxas requires exact match) and need to be in the same major family for `feature_set=:family`; # emitting the same string on both sides handles all three feature sets correctly. function format_target(cap::VersionNumber, feature_set::Symbol) - suffix = feature_set === :architecture ? "a" : + suffix = feature_set === :arch ? "a" : feature_set === :family ? "f" : "" return "sm_$(cap.major)$(cap.minor)$suffix" end diff --git a/CUDACore/src/compiler/execution.jl b/CUDACore/src/compiler/execution.jl index e5359e50fc..619acaa327 100644 --- a/CUDACore/src/compiler/execution.jl +++ b/CUDACore/src/compiler/execution.jl @@ -434,7 +434,7 @@ The following keyword arguments are supported: - `always_inline`: inline all function calls in the kernel - `fastmath`: use less precise square roots and flush denormals - `cap` and `ptx`: to override the compute capability and PTX version to compile for -- `feature_set`: PTX feature set, one of `:baseline` (default), `:family`, or `:architecture` +- `feature_set`: PTX feature set, one of `:baseline` (default), `:family`, or `:arch` The output of this function is automatically cached, i.e. you can simply call `cufunction` in a hot path without degrading performance. New code will be generated automatically, when diff --git a/test/core/codegen.jl b/test/core/codegen.jl index 48af0d92f5..2096439b6b 100644 --- a/test/core/codegen.jl +++ b/test/core/codegen.jl @@ -262,7 +262,7 @@ end # Architecture-specific feature set appends an `a` suffix to the .target directive (and the same # string is what `compile()` passes to --gpu-name, since ptxas requires exact match for `a`-mode). - asm_arch = CUDACore.rewrite_ptx_header(asm_pre, v"8.0", v"9.0", :architecture) + asm_arch = CUDACore.rewrite_ptx_header(asm_pre, v"8.0", v"9.0", :arch) @test occursin(".target sm_90a", asm_arch) @test success(run_ptxas(asm_arch, "sm_90a")) @@ -276,7 +276,7 @@ end # Without feature_set in the hash, two params differing only on feature_set would collide # in the compiler cache and silently return a cubin compiled for the wrong feature set. base = CUDACore.CUDACompilerParams(cap=v"9.0", ptx=v"8.0", feature_set=:baseline) - arch = CUDACore.CUDACompilerParams(cap=v"9.0", ptx=v"8.0", feature_set=:architecture) + arch = CUDACore.CUDACompilerParams(cap=v"9.0", ptx=v"8.0", feature_set=:arch) @test hash(base) != hash(arch) @test base != arch end @@ -284,11 +284,11 @@ end @testset "validate_feature_set" begin # Architecture-specific needs CC >= 9.0 and PTX >= 8.0 # Family-specific needs CC >= 10.0 and PTX >= 8.8. - @test_throws ErrorException CUDACore.validate_feature_set(v"8.6", v"8.0", :architecture) - @test_throws ErrorException CUDACore.validate_feature_set(v"9.0", v"7.8", :architecture) + @test_throws ErrorException CUDACore.validate_feature_set(v"8.6", v"8.0", :arch) + @test_throws ErrorException CUDACore.validate_feature_set(v"9.0", v"7.8", :arch) @test_throws ErrorException CUDACore.validate_feature_set(v"9.0", v"8.0", :family) @test_throws ErrorException CUDACore.validate_feature_set(v"10.0", v"8.7", :family) - @test CUDACore.validate_feature_set(v"9.0", v"8.0", :architecture) === nothing + @test CUDACore.validate_feature_set(v"9.0", v"8.0", :arch) === nothing @test CUDACore.validate_feature_set(v"10.0", v"8.8", :family) === nothing @test CUDACore.validate_feature_set(v"5.0", v"6.2", :baseline) === nothing end