From b98c518fe1812939f8511c8325480a404de2e761 Mon Sep 17 00:00:00 2001
From: AntonOresten <antonoresten@proton.me>
Date: Thu, 30 Apr 2026 09:54:32 +0000
Subject: [PATCH 1/5] Add architecture- and family-specific PTX target support

---
 CUDACore/src/compatibility.jl        | 56 +++++++++++++++++-----------
 CUDACore/src/compiler/compilation.jl | 38 +++++++++++++++----
 test/core/codegen.jl                 | 32 ++++++++++++++++
 3 files changed, 96 insertions(+), 30 deletions(-)

diff --git a/CUDACore/src/compatibility.jl b/CUDACore/src/compatibility.jl
index 7fd60d8723..e10899ce20 100644
--- a/CUDACore/src/compatibility.jl
+++ b/CUDACore/src/compatibility.jl
@@ -1,14 +1,43 @@
 # compatibility of Julia, CUDA and LLVM
 
-# NOTE: Target architectures with suffix “a”, such as sm_90a, include
-# architecture-accelerated features that are supported on the specified architecture only,
-# hence such targets do not follow the onion layer model. Therefore, PTX code generated for
-# such targets cannot be run on later generation devices. Architecture-accelerated features
-# can only be used with targets that support these features.
+# PTX compilation targets come in three feature-set flavors, selected via the suffix on the
+# `.target` directive (and the matching `--gpu-name` to ptxas):
+#
+#   - Baseline (no suffix, e.g. sm_90): the forward-compatible feature set. Code compiled
+#     for sm_X runs on any sm_Y with Y >= X (onion model).
+#   - Family (`f` suffix, e.g. sm_100f): a superset of Baseline. Same-major-family-portable;
+#     code compiled for sm_100f runs on sm_100, sm_103, etc., but not across families.
+#     Introduced with CC 10.0; requires PTX >= 8.8 regardless of cap.
+#   - Architectural (`a` suffix, e.g. sm_90a): a strict superset of Family. Locked to one
+#     exact CC; code compiled for sm_90a runs only on CC 9.0 devices. Introduced with
+#     CC 9.0; uses the same PTX requirement as the plain target.
+#
+# baseline ⊆ family ⊆ architectural. Architectural unlocks the full PTX surface (wgmma,
+# tcgen05, FP4/MXFP cvt, TMA, setmaxnreg, …); family unlocks the subset shared across
+# devices in the same major family; baseline unlocks only the forward-portable set.
 
 const lowest = v"0"
 const highest = v"999"
 
+# PTX compilation target feature set; see top-of-file note for the hierarchy and rules.
+@enum PTXTargetKind Baseline Family Architectural
+
+# Validate that `kind` is reachable at the requested `cap`/`ptx`. The cap floors and the
+# kind PTX floors are uniform across caps, so we encode them here rather than in the
+# per-cap tables (which would just repeat the same rule for every entry). The `a` syntax
+# was introduced in PTX 8.0; the `f` syntax in PTX 8.8.
+function validate_target_kind(cap::VersionNumber, ptx::VersionNumber, kind::PTXTargetKind)
+    if kind === Architectural
+        cap >= v"9.0" || error("Architectural targets require compute capability >= 9.0; got $cap")
+        ptx >= v"8.0" || error("Architectural targets require PTX ISA >= 8.0; got $ptx")
+    end
+    if kind === Family
+        cap >= v"10.0" || error("Family targets require compute capability >= 10.0; got $cap")
+        ptx >= v"8.8"  || error("Family targets require PTX ISA >= 8.8; got $ptx")
+    end
+    return
+end
+
 
 ## version range
 
@@ -163,22 +192,11 @@ const ptx_cap_db = Dict(
     v"8.7"   => between(v"7.4", highest),
     v"8.9"   => between(v"7.8", highest),
     v"9.0"   => between(v"7.8", highest),
-    #v"9.0a" => between(v"8.0", highest)
     v"10.0"  => between(v"8.6", highest),
-    #v"10.0a"=> between(v"8.6", highest),
-    #v"10.0f"=> between(v"8.8", highest),
     v"10.1"  => between(v"8.6", highest),
-    #v"10.1a"=> between(v"8.6", highest),
-    #v"10.1f"=> between(v"8.8", highest),
     v"10.3"  => between(v"8.8", highest),
-    #v"10.3a"=> between(v"8.8", highest),
-    #v"10.3f"=> between(v"8.8", highest),
     v"12.0"  => between(v"8.7", highest),
-    #v"12.0a"=> between(v"8.7", highest),
-    #v"12.0f"=> between(v"8.8", highest),
     v"12.1"  => between(v"8.8", highest),
-    #v"12.1a"=> between(v"8.8", highest),
-    #v"12.1f"=> between(v"8.8", highest),
 )
 
 function ptx_cap_support(ver::VersionNumber)
@@ -216,17 +234,11 @@ const llvm_cap_db = Dict(
     v"8.7"   => between(v"16", highest),
     v"8.9"   => between(v"16", highest),
     v"9.0"   => between(v"16", highest),
-    #v"9.0a" => between(v"18", highest),
     v"10.0"  => between(v"20", highest),
-    #v"10.0a"=> between(v"20", highest),
     v"10.1"  => between(v"20", highest),
-    #v"10.1a"=> between(v"20", highest),
     v"10.3"  => between(v"21", highest),
-    #v"10.3a"=> between(v"21", highest),
     v"12.0"  => between(v"20", highest),
-    #v"12.0a"=> between(v"20", highest),
     v"12.1"  => between(v"21", highest),
-    #v"12.1a"=> between(v"21", highest),
 )
 
 function llvm_cap_support(ver::VersionNumber)
diff --git a/CUDACore/src/compiler/compilation.jl b/CUDACore/src/compiler/compilation.jl
index b5118f5765..ff634572ec 100644
--- a/CUDACore/src/compiler/compilation.jl
+++ b/CUDACore/src/compiler/compilation.jl
@@ -3,15 +3,27 @@
 Base.@kwdef struct CUDACompilerParams <: AbstractCompilerParams
     cap::VersionNumber
     ptx::VersionNumber
+    kind::PTXTargetKind = Baseline
 end
 
 function Base.hash(params::CUDACompilerParams, h::UInt)
     h = hash(params.cap, h)
     h = hash(params.ptx, h)
+    h = hash(params.kind, h)
 
     return h
 end
 
+# Format a `(cap, kind)` tuple as the `sm_NNN[a|f]` string used by both the `.target`
+# directive and the `--gpu-name` flag. The two must agree on suffix for `kind=Architectural`
+# (ptxas requires exact match) and need to be in the same major family for `kind=Family`;
+# emitting the same string on both sides handles all three kinds correctly.
+function format_target(cap::VersionNumber, kind::PTXTargetKind)
+    suffix = kind === Architectural ? "a" :
+             kind === Family        ? "f" : ""
+    return "sm_$(cap.major)$(cap.minor)$suffix"
+end
+
 const CUDACompilerConfig = CompilerConfig{PTXCompilerTarget, CUDACompilerParams}
 const CUDACompilerJob = CompilerJob{PTXCompilerTarget,CUDACompilerParams}
 
@@ -124,10 +136,10 @@ end
 
 # stamp `.version` with the ISA we want `ptxas` to validate against
 # and `.target` with the arch that `--gpu-name` will use
-function rewrite_ptx_header(asm, ptx, cap)
+function rewrite_ptx_header(asm, ptx, cap, kind=Baseline)
     return replace(asm,
         r"(\.version .+)"     => ".version $(ptx.major).$(ptx.minor)",
-        r"\.target sm_\d+\w*" => ".target sm_$(cap.major)$(cap.minor)")
+        r"\.target sm_\d+\w*" => ".target $(format_target(cap, kind))")
 end
 
 function GPUCompiler.mcgen(@nospecialize(job::CUDACompilerJob), mod::LLVM.Module, format)
@@ -150,9 +162,9 @@ function GPUCompiler.mcgen(@nospecialize(job::CUDACompilerJob), mod::LLVM.Module
         asm = replace(asm, r"(\.target .+), debug" => s"\1")
     end
 
-    (; ptx, cap) = job.config.params
-    if job.config.target.ptx != ptx || job.config.target.cap != cap
-        asm = rewrite_ptx_header(asm, ptx, cap)
+    (; ptx, cap, kind) = job.config.params
+    if job.config.target.ptx != ptx || job.config.target.cap != cap || kind !== Baseline
+        asm = rewrite_ptx_header(asm, ptx, cap, kind)
     end
 
     return asm
@@ -184,7 +196,7 @@ function compiler_config(dev; kwargs...)
     return config
 end
 @noinline function _compiler_config(dev; kernel=true, name=nothing, always_inline=false,
-                                         cap=nothing, ptx=nothing, kwargs...)
+                                         cap=nothing, ptx=nothing, kind=nothing, kwargs...)
     # determine the toolchain
     llvm_support = llvm_compat()
     cuda_support = cuda_compat()
@@ -241,9 +253,18 @@ end
     # NVIDIA bug #3600554: ptxas segfaults with our debug info, fixed in 11.7
     debuginfo = runtime_version() >= v"11.7"
 
+    # default the target feature set based on the device cap. Architectural is the
+    # JIT-correct choice on devices where it's available (CC >= 9.0): it's a strict
+    # superset of Baseline, and the cubin is per-device anyway so portability isn't on
+    # the table. Pre-Hopper devices have no `a` flavor and stay on Baseline.
+    if kind === nothing
+        kind = cuda_cap >= v"9.0" ? Architectural : Baseline
+    end
+    validate_target_kind(cuda_cap, cuda_ptx, kind)
+
     # create GPUCompiler objects
     target = PTXCompilerTarget(; cap=llvm_cap, ptx=llvm_ptx, debuginfo, kwargs...)
-    params = CUDACompilerParams(; cap=cuda_cap, ptx=cuda_ptx)
+    params = CUDACompilerParams(; cap=cuda_cap, ptx=cuda_ptx, kind)
     CompilerConfig(target, params; kernel, name, always_inline)
 end
 
@@ -280,7 +301,8 @@ function compile(@nospecialize(job::CompilerJob))
 
     ptx = job.config.params.ptx
     cap = job.config.params.cap
-    arch = "sm_$(cap.major)$(cap.minor)"
+    kind = job.config.params.kind
+    arch = format_target(cap, kind)
 
     # validate use of parameter memory
     argtypes = filter([KernelState, job.source.specTypes.parameters...]) do dt
diff --git a/test/core/codegen.jl b/test/core/codegen.jl
index 02f275aa7d..f813197b46 100644
--- a/test/core/codegen.jl
+++ b/test/core/codegen.jl
@@ -259,6 +259,38 @@ end
     @test occursin(".target sm_90", asm_post)
 
     @test success(run_ptxas(asm_post, "sm_90"))
+
+    # Architectural kind appends an `a` suffix to the .target directive (and the same
+    # string is what `compile()` passes to --gpu-name, since ptxas requires exact match
+    # for `a`-mode).
+    asm_arch = CUDACore.rewrite_ptx_header(asm_pre, v"8.0", v"9.0", CUDACore.Architectural)
+    @test occursin(".target sm_90a", asm_arch)
+    @test success(run_ptxas(asm_arch, "sm_90a"))
+
+    # Family kind appends `f`. Requires PTX 8.8+ at the `.target` line.
+    asm_family = CUDACore.rewrite_ptx_header(asm_pre, v"8.8", v"10.0", CUDACore.Family)
+    @test occursin(".target sm_100f", asm_family)
+    @test success(run_ptxas(asm_family, "sm_100f"))
+end
+
+@testset "CUDACompilerParams hash discriminates on kind" begin
+    # Without `kind` in the hash, two params differing only on kind would collide in
+    # the compiler cache and silently return a cubin compiled for the wrong feature set.
+    base = CUDACore.CUDACompilerParams(cap=v"9.0", ptx=v"8.0", kind=CUDACore.Baseline)
+    arch = CUDACore.CUDACompilerParams(cap=v"9.0", ptx=v"8.0", kind=CUDACore.Architectural)
+    @test hash(base) != hash(arch)
+    @test base != arch
+end
+
+@testset "validate_target_kind" begin
+    # Architectural needs CC >= 9.0 and PTX >= 8.0; Family needs CC >= 10.0 and PTX >= 8.8.
+    @test_throws ErrorException CUDACore.validate_target_kind(v"8.6", v"8.0", CUDACore.Architectural)
+    @test_throws ErrorException CUDACore.validate_target_kind(v"9.0", v"7.8", CUDACore.Architectural)
+    @test_throws ErrorException CUDACore.validate_target_kind(v"9.0", v"8.0", CUDACore.Family)
+    @test_throws ErrorException CUDACore.validate_target_kind(v"10.0", v"8.7", CUDACore.Family)
+    @test CUDACore.validate_target_kind(v"9.0",  v"8.0", CUDACore.Architectural) === nothing
+    @test CUDACore.validate_target_kind(v"10.0", v"8.8", CUDACore.Family) === nothing
+    @test CUDACore.validate_target_kind(v"5.0",  v"6.2", CUDACore.Baseline) === nothing
 end
 
 end

From 9fd5464d0d519d2674f71719b335b6cce7428503 Mon Sep 17 00:00:00 2001
From: AntonOresten <antonoresten@proton.me>
Date: Thu, 30 Apr 2026 10:24:39 +0000
Subject: [PATCH 2/5] Remove `kind` compiler config kwarg

---
 CUDACore/src/compiler/compilation.jl | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/CUDACore/src/compiler/compilation.jl b/CUDACore/src/compiler/compilation.jl
index ff634572ec..b3f6f8dec0 100644
--- a/CUDACore/src/compiler/compilation.jl
+++ b/CUDACore/src/compiler/compilation.jl
@@ -196,7 +196,7 @@ function compiler_config(dev; kwargs...)
     return config
 end
 @noinline function _compiler_config(dev; kernel=true, name=nothing, always_inline=false,
-                                         cap=nothing, ptx=nothing, kind=nothing, kwargs...)
+                                         cap=nothing, ptx=nothing, kwargs...)
     # determine the toolchain
     llvm_support = llvm_compat()
     cuda_support = cuda_compat()
@@ -253,14 +253,11 @@ end
     # NVIDIA bug #3600554: ptxas segfaults with our debug info, fixed in 11.7
     debuginfo = runtime_version() >= v"11.7"
 
-    # default the target feature set based on the device cap. Architectural is the
+    # pick the target feature set based on the device cap. Architectural is the
     # JIT-correct choice on devices where it's available (CC >= 9.0): it's a strict
     # superset of Baseline, and the cubin is per-device anyway so portability isn't on
     # the table. Pre-Hopper devices have no `a` flavor and stay on Baseline.
-    if kind === nothing
-        kind = cuda_cap >= v"9.0" ? Architectural : Baseline
-    end
-    validate_target_kind(cuda_cap, cuda_ptx, kind)
+    kind = cuda_cap >= v"9.0" ? Architectural : Baseline
 
     # create GPUCompiler objects
     target = PTXCompilerTarget(; cap=llvm_cap, ptx=llvm_ptx, debuginfo, kwargs...)

From 494bd937ee5a6b0bd46dac932e2559299fba33c3 Mon Sep 17 00:00:00 2001
From: AntonOresten <antonoresten@proton.me>
Date: Thu, 30 Apr 2026 19:28:09 +0000
Subject: [PATCH 3/5] Add `feature_set` compiler config kwarg; use symbols
 instead of enum type

---
 CUDACore/src/compatibility.jl        | 38 +++++++-----------
 CUDACore/src/compiler/compilation.jl | 58 ++++++++++++++++------------
 CUDACore/src/compiler/execution.jl   |  4 +-
 test/core/codegen.jl                 | 42 ++++++++++----------
 4 files changed, 72 insertions(+), 70 deletions(-)

diff --git a/CUDACore/src/compatibility.jl b/CUDACore/src/compatibility.jl
index e10899ce20..e8386b49d4 100644
--- a/CUDACore/src/compatibility.jl
+++ b/CUDACore/src/compatibility.jl
@@ -1,5 +1,9 @@
 # compatibility of Julia, CUDA and LLVM
 
+const lowest = v"0"
+const highest = v"999"
+
+
 # PTX compilation targets come in three feature-set flavors, selected via the suffix on the
 # `.target` directive (and the matching `--gpu-name` to ptxas):
 #
@@ -8,32 +12,20 @@
 #   - Family (`f` suffix, e.g. sm_100f): a superset of Baseline. Same-major-family-portable;
 #     code compiled for sm_100f runs on sm_100, sm_103, etc., but not across families.
 #     Introduced with CC 10.0; requires PTX >= 8.8 regardless of cap.
-#   - Architectural (`a` suffix, e.g. sm_90a): a strict superset of Family. Locked to one
-#     exact CC; code compiled for sm_90a runs only on CC 9.0 devices. Introduced with
+#   - Architecture (`a` suffix, e.g. sm_90a): a superset of Family. Locked to one
+#     exact CC; code compiled for sm_103a runs only on CC 10.3 devices. Introduced with
 #     CC 9.0; uses the same PTX requirement as the plain target.
 #
-# baseline ⊆ family ⊆ architectural. Architectural unlocks the full PTX surface (wgmma,
-# tcgen05, FP4/MXFP cvt, TMA, setmaxnreg, …); family unlocks the subset shared across
-# devices in the same major family; baseline unlocks only the forward-portable set.
-
-const lowest = v"0"
-const highest = v"999"
-
-# PTX compilation target feature set; see top-of-file note for the hierarchy and rules.
-@enum PTXTargetKind Baseline Family Architectural
-
-# Validate that `kind` is reachable at the requested `cap`/`ptx`. The cap floors and the
-# kind PTX floors are uniform across caps, so we encode them here rather than in the
-# per-cap tables (which would just repeat the same rule for every entry). The `a` syntax
-# was introduced in PTX 8.0; the `f` syntax in PTX 8.8.
-function validate_target_kind(cap::VersionNumber, ptx::VersionNumber, kind::PTXTargetKind)
-    if kind === Architectural
-        cap >= v"9.0" || error("Architectural targets require compute capability >= 9.0; got $cap")
-        ptx >= v"8.0" || error("Architectural targets require PTX ISA >= 8.0; got $ptx")
+function validate_feature_set(cap::VersionNumber, ptx::VersionNumber, feature_set::Symbol)
+    if !(feature_set in (:baseline, :family, :architecture))
+        error("feature_set must be one of :baseline, :family, :architecture; got $(repr(feature_set))")
     end
-    if kind === Family
-        cap >= v"10.0" || error("Family targets require compute capability >= 10.0; got $cap")
-        ptx >= v"8.8"  || error("Family targets require PTX ISA >= 8.8; got $ptx")
+    if feature_set === :architecture
+        cap >= v"9.0" || error("Architecture-specific targets require compute capability >= 9.0; got $cap")
+        ptx >= v"8.0" || error("Architecture-specific targets require PTX ISA >= 8.0; got $ptx")
+    elseif feature_set === :family
+        cap >= v"10.0" || error("Family-specific targets require compute capability >= 10.0; got $cap")
+        ptx >= v"8.8"  || error("Family-specific targets require PTX ISA >= 8.8; got $ptx")
     end
     return
 end
diff --git a/CUDACore/src/compiler/compilation.jl b/CUDACore/src/compiler/compilation.jl
index b3f6f8dec0..f53dded26c 100644
--- a/CUDACore/src/compiler/compilation.jl
+++ b/CUDACore/src/compiler/compilation.jl
@@ -3,24 +3,24 @@
 Base.@kwdef struct CUDACompilerParams <: AbstractCompilerParams
     cap::VersionNumber
     ptx::VersionNumber
-    kind::PTXTargetKind = Baseline
+    feature_set::Symbol = :baseline
 end
 
 function Base.hash(params::CUDACompilerParams, h::UInt)
     h = hash(params.cap, h)
     h = hash(params.ptx, h)
-    h = hash(params.kind, h)
+    h = hash(params.feature_set, h)
 
     return h
 end
 
-# Format a `(cap, kind)` tuple as the `sm_NNN[a|f]` string used by both the `.target`
-# directive and the `--gpu-name` flag. The two must agree on suffix for `kind=Architectural`
-# (ptxas requires exact match) and need to be in the same major family for `kind=Family`;
-# emitting the same string on both sides handles all three kinds correctly.
-function format_target(cap::VersionNumber, kind::PTXTargetKind)
-    suffix = kind === Architectural ? "a" :
-             kind === Family        ? "f" : ""
+# Format a `(cap, feature_set)` tuple as the `sm_NNN[a|f]` string used by both the `.target`
+# directive and the `--gpu-name` flag. The two must agree on suffix for `feature_set=:architecture`
+# (ptxas requires exact match) and need to be in the same major family for `feature_set=:family`;
+# emitting the same string on both sides handles all three feature sets correctly.
+function format_target(cap::VersionNumber, feature_set::Symbol)
+    suffix = feature_set === :architecture ? "a" :
+             feature_set === :family       ? "f" : ""
     return "sm_$(cap.major)$(cap.minor)$suffix"
 end
 
@@ -136,10 +136,10 @@ end
 
 # stamp `.version` with the ISA we want `ptxas` to validate against
 # and `.target` with the arch that `--gpu-name` will use
-function rewrite_ptx_header(asm, ptx, cap, kind=Baseline)
+function rewrite_ptx_header(asm, ptx, cap, feature_set)
     return replace(asm,
         r"(\.version .+)"     => ".version $(ptx.major).$(ptx.minor)",
-        r"\.target sm_\d+\w*" => ".target $(format_target(cap, kind))")
+        r"\.target sm_\d+\w*" => ".target $(format_target(cap, feature_set))")
 end
 
 function GPUCompiler.mcgen(@nospecialize(job::CUDACompilerJob), mod::LLVM.Module, format)
@@ -162,9 +162,12 @@ function GPUCompiler.mcgen(@nospecialize(job::CUDACompilerJob), mod::LLVM.Module
         asm = replace(asm, r"(\.target .+), debug" => s"\1")
     end
 
-    (; ptx, cap, kind) = job.config.params
-    if job.config.target.ptx != ptx || job.config.target.cap != cap || kind !== Baseline
-        asm = rewrite_ptx_header(asm, ptx, cap, kind)
+    (; ptx, cap, feature_set) = job.config.params
+    needs_rewrite = job.config.target.ptx != ptx ||
+                    job.config.target.cap != cap ||
+                    feature_set !== :baseline
+    if needs_rewrite
+        asm = rewrite_ptx_header(asm, ptx, cap, feature_set)
     end
 
     return asm
@@ -196,7 +199,7 @@ function compiler_config(dev; kwargs...)
     return config
 end
 @noinline function _compiler_config(dev; kernel=true, name=nothing, always_inline=false,
-                                         cap=nothing, ptx=nothing, kwargs...)
+                                         cap=nothing, ptx=nothing, feature_set=nothing, kwargs...)
     # determine the toolchain
     llvm_support = llvm_compat()
     cuda_support = cuda_compat()
@@ -253,15 +256,22 @@ end
     # NVIDIA bug #3600554: ptxas segfaults with our debug info, fixed in 11.7
     debuginfo = runtime_version() >= v"11.7"
 
-    # pick the target feature set based on the device cap. Architectural is the
-    # JIT-correct choice on devices where it's available (CC >= 9.0): it's a strict
-    # superset of Baseline, and the cubin is per-device anyway so portability isn't on
-    # the table. Pre-Hopper devices have no `a` flavor and stay on Baseline.
-    kind = cuda_cap >= v"9.0" ? Architectural : Baseline
+    # Pick the target feature set based on the device cap.
+    # Architecture-specific is chosen for devices where it's
+    # available (CC >= 9.0) since it's a strict superset of
+    # the baseline and family feature sets.
+    if feature_set === nothing
+        feature_set = if cuda_cap >= v"9.0" && cuda_ptx >= v"8.0"
+            :architecture
+        else
+            :baseline
+        end
+    end
+    validate_feature_set(cuda_cap, cuda_ptx, feature_set)
 
     # create GPUCompiler objects
     target = PTXCompilerTarget(; cap=llvm_cap, ptx=llvm_ptx, debuginfo, kwargs...)
-    params = CUDACompilerParams(; cap=cuda_cap, ptx=cuda_ptx, kind)
+    params = CUDACompilerParams(; cap=cuda_cap, ptx=cuda_ptx, feature_set)
     CompilerConfig(target, params; kernel, name, always_inline)
 end
 
@@ -296,10 +306,8 @@ function compile(@nospecialize(job::CompilerJob))
         push!(ptxas_opts, "--compile-only")
     end
 
-    ptx = job.config.params.ptx
-    cap = job.config.params.cap
-    kind = job.config.params.kind
-    arch = format_target(cap, kind)
+    (; ptx, cap, feature_set) = job.config.params
+    arch = format_target(cap, feature_set)
 
     # validate use of parameter memory
     argtypes = filter([KernelState, job.source.specTypes.parameters...]) do dt
diff --git a/CUDACore/src/compiler/execution.jl b/CUDACore/src/compiler/execution.jl
index 130d049e7c..7cfe96e1a9 100644
--- a/CUDACore/src/compiler/execution.jl
+++ b/CUDACore/src/compiler/execution.jl
@@ -63,7 +63,7 @@ kernel_compile(::LLVMBackend, f::F, tt::TT=Tuple{}; kwargs...) where {F,TT} =
 ## high-level @cuda interface
 
 const MACRO_KWARGS = [:dynamic, :launch, :backend]
-const COMPILER_KWARGS = [:kernel, :name, :always_inline, :minthreads, :maxthreads, :blocks_per_sm, :maxregs, :fastmath, :cap, :ptx]
+const COMPILER_KWARGS = [:kernel, :name, :always_inline, :minthreads, :maxthreads, :blocks_per_sm, :maxregs, :fastmath, :cap, :ptx, :feature_set]
 const LAUNCH_KWARGS = [:cooperative, :blocks, :threads, :clustersize, :shmem, :stream]
 
 
@@ -434,6 +434,8 @@ The following keyword arguments are supported:
 - `always_inline`: inline all function calls in the kernel
 - `fastmath`: use less precise square roots and flush denormals
 - `cap` and `ptx`: to override the compute capability and PTX version to compile for
+- `feature_set`: PTX feature set (`:baseline`, `:family`, or `:architecture`); defaults to the
+  most specific supported by the device
 
 The output of this function is automatically cached, i.e. you can simply call `cufunction`
 in a hot path without degrading performance. New code will be generated automatically, when
diff --git a/test/core/codegen.jl b/test/core/codegen.jl
index f813197b46..48af0d92f5 100644
--- a/test/core/codegen.jl
+++ b/test/core/codegen.jl
@@ -255,42 +255,42 @@ end
 
     @test !success(run_ptxas(asm_pre, "sm_75"))
 
-    asm_post = CUDACore.rewrite_ptx_header(asm_pre, v"8.0", v"9.0")
+    asm_post = CUDACore.rewrite_ptx_header(asm_pre, v"8.0", v"9.0", :baseline)
     @test occursin(".target sm_90", asm_post)
 
     @test success(run_ptxas(asm_post, "sm_90"))
 
-    # Architectural kind appends an `a` suffix to the .target directive (and the same
-    # string is what `compile()` passes to --gpu-name, since ptxas requires exact match
-    # for `a`-mode).
-    asm_arch = CUDACore.rewrite_ptx_header(asm_pre, v"8.0", v"9.0", CUDACore.Architectural)
+    # Architecture-specific feature set appends an `a` suffix to the .target directive (and the same
+    # string is what `compile()` passes to --gpu-name, since ptxas requires exact match for `a`-mode).
+    asm_arch = CUDACore.rewrite_ptx_header(asm_pre, v"8.0", v"9.0", :architecture)
     @test occursin(".target sm_90a", asm_arch)
     @test success(run_ptxas(asm_arch, "sm_90a"))
 
-    # Family kind appends `f`. Requires PTX 8.8+ at the `.target` line.
-    asm_family = CUDACore.rewrite_ptx_header(asm_pre, v"8.8", v"10.0", CUDACore.Family)
+    # Family-specific appends `f`. Requires PTX 8.8+ at the `.target` line.
+    asm_family = CUDACore.rewrite_ptx_header(asm_pre, v"8.8", v"10.0", :family)
     @test occursin(".target sm_100f", asm_family)
     @test success(run_ptxas(asm_family, "sm_100f"))
 end
 
-@testset "CUDACompilerParams hash discriminates on kind" begin
-    # Without `kind` in the hash, two params differing only on kind would collide in
-    # the compiler cache and silently return a cubin compiled for the wrong feature set.
-    base = CUDACore.CUDACompilerParams(cap=v"9.0", ptx=v"8.0", kind=CUDACore.Baseline)
-    arch = CUDACore.CUDACompilerParams(cap=v"9.0", ptx=v"8.0", kind=CUDACore.Architectural)
+@testset "CUDACompilerParams hash discriminates on feature_set" begin
+    # Without feature_set in the hash, two params differing only on feature_set would collide
+    # in the compiler cache and silently return a cubin compiled for the wrong feature set.
+    base = CUDACore.CUDACompilerParams(cap=v"9.0", ptx=v"8.0", feature_set=:baseline)
+    arch = CUDACore.CUDACompilerParams(cap=v"9.0", ptx=v"8.0", feature_set=:architecture)
     @test hash(base) != hash(arch)
     @test base != arch
 end
 
-@testset "validate_target_kind" begin
-    # Architectural needs CC >= 9.0 and PTX >= 8.0; Family needs CC >= 10.0 and PTX >= 8.8.
-    @test_throws ErrorException CUDACore.validate_target_kind(v"8.6", v"8.0", CUDACore.Architectural)
-    @test_throws ErrorException CUDACore.validate_target_kind(v"9.0", v"7.8", CUDACore.Architectural)
-    @test_throws ErrorException CUDACore.validate_target_kind(v"9.0", v"8.0", CUDACore.Family)
-    @test_throws ErrorException CUDACore.validate_target_kind(v"10.0", v"8.7", CUDACore.Family)
-    @test CUDACore.validate_target_kind(v"9.0",  v"8.0", CUDACore.Architectural) === nothing
-    @test CUDACore.validate_target_kind(v"10.0", v"8.8", CUDACore.Family) === nothing
-    @test CUDACore.validate_target_kind(v"5.0",  v"6.2", CUDACore.Baseline) === nothing
+@testset "validate_feature_set" begin
+    # Architecture-specific needs CC >= 9.0 and PTX >= 8.0
+    # Family-specific needs CC >= 10.0 and PTX >= 8.8.
+    @test_throws ErrorException CUDACore.validate_feature_set(v"8.6", v"8.0", :architecture)
+    @test_throws ErrorException CUDACore.validate_feature_set(v"9.0", v"7.8", :architecture)
+    @test_throws ErrorException CUDACore.validate_feature_set(v"9.0", v"8.0", :family)
+    @test_throws ErrorException CUDACore.validate_feature_set(v"10.0", v"8.7", :family)
+    @test CUDACore.validate_feature_set(v"9.0",  v"8.0", :architecture) === nothing
+    @test CUDACore.validate_feature_set(v"10.0", v"8.8", :family) === nothing
+    @test CUDACore.validate_feature_set(v"5.0",  v"6.2", :baseline) === nothing
 end
 
 end

From e8537f65f87ce822d146d975b33680bb245e8244 Mon Sep 17 00:00:00 2001
From: AntonOresten <antonoresten@proton.me>
Date: Thu, 30 Apr 2026 19:59:39 +0000
Subject: [PATCH 4/5] Require opt-in through `feature_set` to avoid breaking
 changes

---
 CUDACore/src/compiler/compilation.jl | 14 +++-----------
 CUDACore/src/compiler/execution.jl   |  3 +--
 2 files changed, 4 insertions(+), 13 deletions(-)

diff --git a/CUDACore/src/compiler/compilation.jl b/CUDACore/src/compiler/compilation.jl
index f53dded26c..27b41dabdf 100644
--- a/CUDACore/src/compiler/compilation.jl
+++ b/CUDACore/src/compiler/compilation.jl
@@ -256,17 +256,9 @@ end
     # NVIDIA bug #3600554: ptxas segfaults with our debug info, fixed in 11.7
     debuginfo = runtime_version() >= v"11.7"
 
-    # Pick the target feature set based on the device cap.
-    # Architecture-specific is chosen for devices where it's
-    # available (CC >= 9.0) since it's a strict superset of
-    # the baseline and family feature sets.
-    if feature_set === nothing
-        feature_set = if cuda_cap >= v"9.0" && cuda_ptx >= v"8.0"
-            :architecture
-        else
-            :baseline
-        end
-    end
+    # Conservatively pick baseline for backward compatibility,
+    # requiring explicit opt-in for family- and architecture-specific instructions.
+    feature_set = something(feature_set, :baseline)
     validate_feature_set(cuda_cap, cuda_ptx, feature_set)
 
     # create GPUCompiler objects
diff --git a/CUDACore/src/compiler/execution.jl b/CUDACore/src/compiler/execution.jl
index 7cfe96e1a9..e5359e50fc 100644
--- a/CUDACore/src/compiler/execution.jl
+++ b/CUDACore/src/compiler/execution.jl
@@ -434,8 +434,7 @@ The following keyword arguments are supported:
 - `always_inline`: inline all function calls in the kernel
 - `fastmath`: use less precise square roots and flush denormals
 - `cap` and `ptx`: to override the compute capability and PTX version to compile for
-- `feature_set`: PTX feature set (`:baseline`, `:family`, or `:architecture`); defaults to the
-  most specific supported by the device
+- `feature_set`: PTX feature set, one of `:baseline` (default), `:family`, or `:architecture`
 
 The output of this function is automatically cached, i.e. you can simply call `cufunction`
 in a hot path without degrading performance. New code will be generated automatically, when

From a5bbd52593d92fbdcf0c7b8f451dc63f5e77c30f Mon Sep 17 00:00:00 2001
From: AntonOresten <antonoresten@proton.me>
Date: Fri, 1 May 2026 15:41:37 +0000
Subject: [PATCH 5/5] Shorten `:architecture` to `:arch`

---
 CUDACore/src/compatibility.jl        |  6 +++---
 CUDACore/src/compiler/compilation.jl |  4 ++--
 CUDACore/src/compiler/execution.jl   |  2 +-
 test/core/codegen.jl                 | 10 +++++-----
 4 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/CUDACore/src/compatibility.jl b/CUDACore/src/compatibility.jl
index e8386b49d4..2b30a4f843 100644
--- a/CUDACore/src/compatibility.jl
+++ b/CUDACore/src/compatibility.jl
@@ -17,10 +17,10 @@ const highest = v"999"
-#     CC 9.0; uses the same PTX requirement as the plain target.
+#     CC 9.0; requires PTX >= 8.0, otherwise the same PTX floor as the plain target.
 #
 function validate_feature_set(cap::VersionNumber, ptx::VersionNumber, feature_set::Symbol)
-    if !(feature_set in (:baseline, :family, :architecture))
-        error("feature_set must be one of :baseline, :family, :architecture; got $(repr(feature_set))")
+    if !(feature_set in (:baseline, :family, :arch))
+        error("feature_set must be one of :baseline, :family, :arch; got $(repr(feature_set))")
     end
-    if feature_set === :architecture
+    if feature_set === :arch
         cap >= v"9.0" || error("Architecture-specific targets require compute capability >= 9.0; got $cap")
         ptx >= v"8.0" || error("Architecture-specific targets require PTX ISA >= 8.0; got $ptx")
     elseif feature_set === :family
diff --git a/CUDACore/src/compiler/compilation.jl b/CUDACore/src/compiler/compilation.jl
index 27b41dabdf..e4bf547dbf 100644
--- a/CUDACore/src/compiler/compilation.jl
+++ b/CUDACore/src/compiler/compilation.jl
@@ -15,11 +15,11 @@ function Base.hash(params::CUDACompilerParams, h::UInt)
 end
 
 # Format a `(cap, feature_set)` tuple as the `sm_NNN[a|f]` string used by both the `.target`
-# directive and the `--gpu-name` flag. The two must agree on suffix for `feature_set=:architecture`
+# directive and the `--gpu-name` flag. The two must agree on suffix for `feature_set=:arch`
 # (ptxas requires exact match) and need to be in the same major family for `feature_set=:family`;
 # emitting the same string on both sides handles all three feature sets correctly.
 function format_target(cap::VersionNumber, feature_set::Symbol)
-    suffix = feature_set === :architecture ? "a" :
+    suffix = feature_set === :arch ? "a" :
              feature_set === :family       ? "f" : ""
     return "sm_$(cap.major)$(cap.minor)$suffix"
 end
diff --git a/CUDACore/src/compiler/execution.jl b/CUDACore/src/compiler/execution.jl
index e5359e50fc..619acaa327 100644
--- a/CUDACore/src/compiler/execution.jl
+++ b/CUDACore/src/compiler/execution.jl
@@ -434,7 +434,7 @@ The following keyword arguments are supported:
 - `always_inline`: inline all function calls in the kernel
 - `fastmath`: use less precise square roots and flush denormals
 - `cap` and `ptx`: to override the compute capability and PTX version to compile for
-- `feature_set`: PTX feature set, one of `:baseline` (default), `:family`, or `:architecture`
+- `feature_set`: PTX feature set: `:baseline` (default), `:family` (`sm_XXXf`), or `:arch` (`sm_XXXa`)
 
 The output of this function is automatically cached, i.e. you can simply call `cufunction`
 in a hot path without degrading performance. New code will be generated automatically, when
diff --git a/test/core/codegen.jl b/test/core/codegen.jl
index 48af0d92f5..2096439b6b 100644
--- a/test/core/codegen.jl
+++ b/test/core/codegen.jl
@@ -262,7 +262,7 @@ end
 
     # Architecture-specific feature set appends an `a` suffix to the .target directive (and the same
     # string is what `compile()` passes to --gpu-name, since ptxas requires exact match for `a`-mode).
-    asm_arch = CUDACore.rewrite_ptx_header(asm_pre, v"8.0", v"9.0", :architecture)
+    asm_arch = CUDACore.rewrite_ptx_header(asm_pre, v"8.0", v"9.0", :arch)
     @test occursin(".target sm_90a", asm_arch)
     @test success(run_ptxas(asm_arch, "sm_90a"))
 
@@ -276,7 +276,7 @@ end
     # Without feature_set in the hash, two params differing only on feature_set would collide
     # in the compiler cache and silently return a cubin compiled for the wrong feature set.
     base = CUDACore.CUDACompilerParams(cap=v"9.0", ptx=v"8.0", feature_set=:baseline)
-    arch = CUDACore.CUDACompilerParams(cap=v"9.0", ptx=v"8.0", feature_set=:architecture)
+    arch = CUDACore.CUDACompilerParams(cap=v"9.0", ptx=v"8.0", feature_set=:arch)
     @test hash(base) != hash(arch)
     @test base != arch
 end
@@ -284,11 +284,11 @@ end
 @testset "validate_feature_set" begin
     # Architecture-specific needs CC >= 9.0 and PTX >= 8.0
     # Family-specific needs CC >= 10.0 and PTX >= 8.8.
-    @test_throws ErrorException CUDACore.validate_feature_set(v"8.6", v"8.0", :architecture)
-    @test_throws ErrorException CUDACore.validate_feature_set(v"9.0", v"7.8", :architecture)
+    @test_throws ErrorException CUDACore.validate_feature_set(v"8.6", v"8.0", :arch)
+    @test_throws ErrorException CUDACore.validate_feature_set(v"9.0", v"7.8", :arch)
     @test_throws ErrorException CUDACore.validate_feature_set(v"9.0", v"8.0", :family)
     @test_throws ErrorException CUDACore.validate_feature_set(v"10.0", v"8.7", :family)
-    @test CUDACore.validate_feature_set(v"9.0",  v"8.0", :architecture) === nothing
+    @test CUDACore.validate_feature_set(v"9.0",  v"8.0", :arch) === nothing
     @test CUDACore.validate_feature_set(v"10.0", v"8.8", :family) === nothing
     @test CUDACore.validate_feature_set(v"5.0",  v"6.2", :baseline) === nothing
 end