JuliaGPU · AntonOresten · Apr 30, 2026 · Apr 30, 2026 · Apr 30, 2026 · Apr 30, 2026
diff --git a/CUDACore/src/compatibility.jl b/CUDACore/src/compatibility.jl
@@ -1,15 +1,36 @@
 # compatibility of Julia, CUDA and LLVM
 
-# NOTE: Target architectures with suffix “a”, such as sm_90a, include
-# architecture-accelerated features that are supported on the specified architecture only,
-# hence such targets do not follow the onion layer model. Therefore, PTX code generated for
-# such targets cannot be run on later generation devices. Architecture-accelerated features
-# can only be used with targets that support these features.
-
 const lowest = v"0"
 const highest = v"999"
 
 
+# PTX compilation targets come in three feature-set flavors, selected via the suffix on the
+# `.target` directive (and the matching `--gpu-name` to ptxas):
+#
+#   - Baseline (no suffix, e.g. sm_90): the forward-compatible feature set. Code compiled
+#     for sm_X runs on any sm_Y with Y >= X (onion model).
+#   - Family (`f` suffix, e.g. sm_100f): a superset of Baseline. Same-major-family-portable;
+#     code compiled for sm_100f runs on sm_100, sm_103, etc., but not across families.
+#     Introduced with CC 10.0; requires PTX >= 8.8 regardless of cap.
+#   - Architecture (`a` suffix, e.g. sm_90a): a superset of Family. Locked to one
+#     exact CC; code compiled for sm_103a runs only on CC 10.3 devices. Introduced with
+#     CC 9.0; sm_90a needs PTX >= 8.0 (plain sm_90: 7.8), later caps as the plain target.
+#
+# Throw an `ErrorException` unless `feature_set` is known and `cap`/`ptx` are new enough.
+function validate_feature_set(cap::VersionNumber, ptx::VersionNumber, feature_set::Symbol)
+    feature_set in (:baseline, :family, :arch) ||
+        error("feature_set must be one of :baseline, :family, :arch; got $(repr(feature_set))")
+    if feature_set === :family
+        cap >= v"10.0" || error("Family-specific targets require compute capability >= 10.0; got $cap")
+        ptx >= v"8.8" || error("Family-specific targets require PTX ISA >= 8.8; got $ptx")
+    elseif feature_set === :arch
+        cap >= v"9.0" || error("Architecture-specific targets require compute capability >= 9.0; got $cap")
+        ptx >= v"8.0" || error("Architecture-specific targets require PTX ISA >= 8.0; got $ptx")
+    end
+    return nothing
+end
+
+
 ## version range
 
 struct VersionRange
@@ -163,22 +184,11 @@ const ptx_cap_db = Dict(
     v"8.7"   => between(v"7.4", highest),
     v"8.9"   => between(v"7.8", highest),
     v"9.0"   => between(v"7.8", highest),
-    #v"9.0a" => between(v"8.0", highest)
     v"10.0"  => between(v"8.6", highest),
-    #v"10.0a"=> between(v"8.6", highest),
-    #v"10.0f"=> between(v"8.8", highest),
     v"10.1"  => between(v"8.6", highest),
-    #v"10.1a"=> between(v"8.6", highest),
-    #v"10.1f"=> between(v"8.8", highest),
     v"10.3"  => between(v"8.8", highest),
-    #v"10.3a"=> between(v"8.8", highest),
-    #v"10.3f"=> between(v"8.8", highest),
     v"12.0"  => between(v"8.7", highest),
-    #v"12.0a"=> between(v"8.7", highest),
-    #v"12.0f"=> between(v"8.8", highest),
     v"12.1"  => between(v"8.8", highest),
-    #v"12.1a"=> between(v"8.8", highest),
-    #v"12.1f"=> between(v"8.8", highest),
 )
 
 function ptx_cap_support(ver::VersionNumber)
@@ -216,17 +226,11 @@ const llvm_cap_db = Dict(
     v"8.7"   => between(v"16", highest),
     v"8.9"   => between(v"16", highest),
     v"9.0"   => between(v"16", highest),
-    #v"9.0a" => between(v"18", highest),
     v"10.0"  => between(v"20", highest),
-    #v"10.0a"=> between(v"20", highest),
     v"10.1"  => between(v"20", highest),
-    #v"10.1a"=> between(v"20", highest),
     v"10.3"  => between(v"21", highest),
-    #v"10.3a"=> between(v"21", highest),
     v"12.0"  => between(v"20", highest),
-    #v"12.0a"=> between(v"20", highest),
     v"12.1"  => between(v"21", highest),
-    #v"12.1a"=> between(v"21", highest),
 )
 
 function llvm_cap_support(ver::VersionNumber)

diff --git a/CUDACore/src/compiler/compilation.jl b/CUDACore/src/compiler/compilation.jl
@@ -3,15 +3,27 @@
 Base.@kwdef struct CUDACompilerParams <: AbstractCompilerParams
     cap::VersionNumber
     ptx::VersionNumber
+    feature_set::Symbol = :baseline  # PTX feature set: `:baseline`, `:family` or `:arch`
 end
 
 function Base.hash(params::CUDACompilerParams, h::UInt)
     h = hash(params.cap, h)
     h = hash(params.ptx, h)
+    h = hash(params.feature_set, h)  # params differing only in feature set must hash apart
 
     return h
 end
 
+# Render `cap` plus `feature_set` as the `sm_NNN[a|f]` name shared by the `.target`
+# directive and the `--gpu-name` flag; emitting one string for both keeps them in agreement
+# (`:arch` needs an exact match for ptxas, `:family` the same major family).
+# Any `feature_set` other than `:arch` or `:family` yields no suffix.
+function format_target(cap::VersionNumber, feature_set::Symbol)
+    letter = feature_set === :arch ? "a" : feature_set === :family ? "f" : ""
+    major, minor = cap.major, cap.minor
+    return string("sm_", major, minor, letter)
+end
+
 const CUDACompilerConfig = CompilerConfig{PTXCompilerTarget, CUDACompilerParams}
 const CUDACompilerJob = CompilerJob{PTXCompilerTarget,CUDACompilerParams}
 
@@ -124,10 +136,10 @@ end
 
 # stamp `.version` with the ISA we want `ptxas` to validate against
 # and `.target` with the arch that `--gpu-name` will use
-function rewrite_ptx_header(asm, ptx, cap)
+function rewrite_ptx_header(asm, ptx, cap, feature_set)
     return replace(asm,
         r"(\.version .+)"     => ".version $(ptx.major).$(ptx.minor)",
-        r"\.target sm_\d+\w*" => ".target sm_$(cap.major)$(cap.minor)")
+        r"\.target sm_\d+\w*" => ".target $(format_target(cap, feature_set))")  # sm_NNN plus a/f suffix
 end
 
 function GPUCompiler.mcgen(@nospecialize(job::CUDACompilerJob), mod::LLVM.Module, format)
@@ -150,9 +162,12 @@ function GPUCompiler.mcgen(@nospecialize(job::CUDACompilerJob), mod::LLVM.Module
         asm = replace(asm, r"(\.target .+), debug" => s"\1")
     end
 
-    (; ptx, cap) = job.config.params
-    if job.config.target.ptx != ptx || job.config.target.cap != cap
-        asm = rewrite_ptx_header(asm, ptx, cap)
+    (; ptx, cap, feature_set) = job.config.params
+    needs_rewrite = job.config.target.ptx != ptx ||
+                    job.config.target.cap != cap ||
+                    feature_set !== :baseline
+    if needs_rewrite
+        asm = rewrite_ptx_header(asm, ptx, cap, feature_set)
     end
 
     return asm
@@ -184,7 +199,7 @@ function compiler_config(dev; kwargs...)
     return config
 end
 @noinline function _compiler_config(dev; kernel=true, name=nothing, always_inline=false,
-                                         cap=nothing, ptx=nothing, kwargs...)
+                                         cap=nothing, ptx=nothing, feature_set=nothing, kwargs...)
     # determine the toolchain
     llvm_support = llvm_compat()
     cuda_support = cuda_compat()
@@ -241,9 +256,14 @@ end
     # NVIDIA bug #3600554: ptxas segfaults with our debug info, fixed in 11.7
     debuginfo = runtime_version() >= v"11.7"
 
+    # Default to the baseline feature set to preserve existing behavior; family- and
+    # architecture-specific instructions require an explicit opt-in via `feature_set`.
+    feature_set = something(feature_set, :baseline)
+    validate_feature_set(cuda_cap, cuda_ptx, feature_set)
+
     # create GPUCompiler objects
     target = PTXCompilerTarget(; cap=llvm_cap, ptx=llvm_ptx, debuginfo, kwargs...)
-    params = CUDACompilerParams(; cap=cuda_cap, ptx=cuda_ptx)
+    params = CUDACompilerParams(; cap=cuda_cap, ptx=cuda_ptx, feature_set)
     CompilerConfig(target, params; kernel, name, always_inline)
 end
 
@@ -278,9 +298,8 @@ function compile(@nospecialize(job::CompilerJob))
         push!(ptxas_opts, "--compile-only")
     end
 
-    ptx = job.config.params.ptx
-    cap = job.config.params.cap
-    arch = "sm_$(cap.major)$(cap.minor)"
+    (; ptx, cap, feature_set) = job.config.params
+    arch = format_target(cap, feature_set)
 
     # validate use of parameter memory
     argtypes = filter([KernelState, job.source.specTypes.parameters...]) do dt

diff --git a/CUDACore/src/compiler/execution.jl b/CUDACore/src/compiler/execution.jl
@@ -63,7 +63,7 @@ kernel_compile(::LLVMBackend, f::F, tt::TT=Tuple{}; kwargs...) where {F,TT} =
 ## high-level @cuda interface
 
 const MACRO_KWARGS = [:dynamic, :launch, :backend]
-const COMPILER_KWARGS = [:kernel, :name, :always_inline, :minthreads, :maxthreads, :blocks_per_sm, :maxregs, :fastmath, :cap, :ptx]
+const COMPILER_KWARGS = [:kernel, :name, :always_inline, :minthreads, :maxthreads, :blocks_per_sm, :maxregs, :fastmath, :cap, :ptx, :feature_set]
 const LAUNCH_KWARGS = [:cooperative, :blocks, :threads, :clustersize, :shmem, :stream]
 
 
@@ -434,6 +434,7 @@ The following keyword arguments are supported:
 - `always_inline`: inline all function calls in the kernel
 - `fastmath`: use less precise square roots and flush denormals
 - `cap` and `ptx`: to override the compute capability and PTX version to compile for
+- `feature_set`: PTX feature set, one of `:baseline` (default), `:family`, or `:arch`
 
 The output of this function is automatically cached, i.e. you can simply call `cufunction`
 in a hot path without degrading performance. New code will be generated automatically, when

diff --git a/test/core/codegen.jl b/test/core/codegen.jl
@@ -255,10 +255,42 @@ end
 
     @test !success(run_ptxas(asm_pre, "sm_75"))
 
-    asm_post = CUDACore.rewrite_ptx_header(asm_pre, v"8.0", v"9.0")
+    asm_post = CUDACore.rewrite_ptx_header(asm_pre, v"8.0", v"9.0", :baseline)
     @test occursin(".target sm_90", asm_post)
 
     @test success(run_ptxas(asm_post, "sm_90"))
+
+    # Architecture-specific feature set appends an `a` suffix to the .target directive (and the same
+    # string is what `compile()` passes to --gpu-name, since ptxas requires exact match for `a`-mode).
+    asm_arch = CUDACore.rewrite_ptx_header(asm_pre, v"8.0", v"9.0", :arch)
+    @test occursin(".target sm_90a", asm_arch)
+    @test success(run_ptxas(asm_arch, "sm_90a"))
+
+    # Family-specific appends `f`. Requires PTX 8.8+ at the `.target` line.
+    asm_family = CUDACore.rewrite_ptx_header(asm_pre, v"8.8", v"10.0", :family)
+    @test occursin(".target sm_100f", asm_family)
+    @test success(run_ptxas(asm_family, "sm_100f"))
+end
+
+@testset "CUDACompilerParams hash discriminates on feature_set" begin
+    # If `feature_set` were left out of the hash, params differing only in it would collide
+    # in the compiler cache and silently hand back a cubin built for the wrong feature set.
+    params(fs) = CUDACore.CUDACompilerParams(cap=v"9.0", ptx=v"8.0", feature_set=fs)
+    baseline, arch = params(:baseline), params(:arch)
+    @test hash(baseline) != hash(arch)
+    @test baseline != arch
+end
+
+@testset "validate_feature_set" begin
+    # :arch needs CC >= 9.0 and PTX >= 8.0; :family needs CC >= 10.0 and PTX >= 8.8.
+    validate = CUDACore.validate_feature_set
+    @test_throws ErrorException validate(v"8.6", v"8.0", :arch)
+    @test_throws ErrorException validate(v"9.0", v"7.8", :arch)
+    @test_throws ErrorException validate(v"9.0", v"8.0", :family)
+    @test_throws ErrorException validate(v"10.0", v"8.7", :family)
+    @test validate(v"9.0", v"8.0", :arch) === nothing
+    @test validate(v"10.0", v"8.8", :family) === nothing
+    @test validate(v"5.0", v"6.2", :baseline) === nothing
 end
 
 end