Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 27 additions & 23 deletions CUDACore/src/compatibility.jl
Original file line number Diff line number Diff line change
@@ -1,15 +1,36 @@
# compatibility of Julia, CUDA and LLVM

# NOTE: Target architectures with suffix “a”, such as sm_90a, include
# architecture-accelerated features that are supported on the specified architecture only,
# hence such targets do not follow the onion layer model. Therefore, PTX code generated for
# such targets cannot be run on later generation devices. Architecture-accelerated features
# can only be used with targets that support these features.

# Sentinel versions for open-ended ranges: `highest` is the upper bound passed to
# `between(..., highest)` in the compatibility tables of this file.
# NOTE(review): `lowest` is presumably the matching open lower bound; confirm at its use sites.
const lowest = v"0"
const highest = v"999"


# PTX compilation targets come in three feature-set flavors, selected via the suffix on the
# `.target` directive (and the matching `--gpu-name` to ptxas):
#
# - Baseline (no suffix, e.g. sm_90): the forward-compatible feature set. Code compiled
# for sm_X runs on any sm_Y with Y >= X (onion model).
# - Family (`f` suffix, e.g. sm_100f): a superset of Baseline. Same-major-family-portable;
# code compiled for sm_100f runs on sm_100, sm_103, etc., but not across families.
# Introduced with CC 10.0; requires PTX >= 8.8 regardless of cap.
# - Architecture (`a` suffix, e.g. sm_90a): a superset of Family. Locked to one
# exact CC; code compiled for sm_103a runs only on CC 10.3 devices. Introduced with
# CC 9.0; requires PTX >= 8.0 (sm_90a needs 8.0 even though plain sm_90 only needs 7.8;
# later `a` targets share the PTX requirement of their plain target).
#
"""
    validate_feature_set(cap::VersionNumber, ptx::VersionNumber, feature_set::Symbol)

Check that `feature_set` is a known PTX feature set and that the compute capability
`cap` and PTX ISA version `ptx` are recent enough for it.

- `:baseline` imposes no additional requirement.
- `:arch` requires `cap >= 9.0` and `ptx >= 8.0`.
- `:family` requires `cap >= 10.0` and `ptx >= 8.8`.

Returns `nothing` when the combination is valid and throws an `ErrorException`
otherwise. The compute capability is checked before the PTX version.
"""
function validate_feature_set(cap::VersionNumber, ptx::VersionNumber, feature_set::Symbol)
    # reject anything that is not one of the three known feature sets
    if feature_set !== :baseline && feature_set !== :family && feature_set !== :arch
        error("feature_set must be one of :baseline, :family, :arch; got $(repr(feature_set))")
    end

    if feature_set === :family
        if cap < v"10.0"
            error("Family-specific targets require compute capability >= 10.0; got $cap")
        end
        if ptx < v"8.8"
            error("Family-specific targets require PTX ISA >= 8.8; got $ptx")
        end
    elseif feature_set === :arch
        if cap < v"9.0"
            error("Architecture-specific targets require compute capability >= 9.0; got $cap")
        end
        if ptx < v"8.0"
            error("Architecture-specific targets require PTX ISA >= 8.0; got $ptx")
        end
    end

    return nothing
end


## version range

struct VersionRange
Expand Down Expand Up @@ -163,22 +184,11 @@ const ptx_cap_db = Dict(
v"8.7" => between(v"7.4", highest),
v"8.9" => between(v"7.8", highest),
v"9.0" => between(v"7.8", highest),
#v"9.0a" => between(v"8.0", highest)
v"10.0" => between(v"8.6", highest),
#v"10.0a"=> between(v"8.6", highest),
#v"10.0f"=> between(v"8.8", highest),
v"10.1" => between(v"8.6", highest),
#v"10.1a"=> between(v"8.6", highest),
#v"10.1f"=> between(v"8.8", highest),
v"10.3" => between(v"8.8", highest),
#v"10.3a"=> between(v"8.8", highest),
#v"10.3f"=> between(v"8.8", highest),
v"12.0" => between(v"8.7", highest),
#v"12.0a"=> between(v"8.7", highest),
#v"12.0f"=> between(v"8.8", highest),
v"12.1" => between(v"8.8", highest),
#v"12.1a"=> between(v"8.8", highest),
#v"12.1f"=> between(v"8.8", highest),
)

function ptx_cap_support(ver::VersionNumber)
Expand Down Expand Up @@ -216,17 +226,11 @@ const llvm_cap_db = Dict(
v"8.7" => between(v"16", highest),
v"8.9" => between(v"16", highest),
v"9.0" => between(v"16", highest),
#v"9.0a" => between(v"18", highest),
v"10.0" => between(v"20", highest),
#v"10.0a"=> between(v"20", highest),
v"10.1" => between(v"20", highest),
#v"10.1a"=> between(v"20", highest),
v"10.3" => between(v"21", highest),
#v"10.3a"=> between(v"21", highest),
v"12.0" => between(v"20", highest),
#v"12.0a"=> between(v"20", highest),
v"12.1" => between(v"21", highest),
#v"12.1a"=> between(v"21", highest),
)

function llvm_cap_support(ver::VersionNumber)
Expand Down
39 changes: 29 additions & 10 deletions CUDACore/src/compiler/compilation.jl
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,27 @@
# CUDA-specific compiler parameters attached to a compiler job's configuration.
# All fields are set by keyword; only `feature_set` has a default.
Base.@kwdef struct CUDACompilerParams <: AbstractCompilerParams
# compute capability to compile for
cap::VersionNumber
# PTX ISA version to compile for
ptx::VersionNumber
# PTX feature set: `:baseline` (default), `:family`, or `:arch`
feature_set::Symbol = :baseline
end

# Hash a `CUDACompilerParams` by folding its fields into the seed `h` in declaration
# order: `cap`, then `ptx`, then `feature_set`. Every field takes part, so parameters
# that differ only in `feature_set` hash differently.
function Base.hash(params::CUDACompilerParams, h::UInt)
    return hash(params.feature_set, hash(params.ptx, hash(params.cap, h)))
end

# Format a `(cap, feature_set)` tuple as the `sm_NNN[a|f]` string used by both the `.target`
# directive and the `--gpu-name` flag. The two must agree on suffix for `feature_set=:arch`
# (ptxas requires exact match) and need to be in the same major family for `feature_set=:family`;
# emitting the same string on both sides handles all three feature sets correctly.
"""
    format_target(cap::VersionNumber, feature_set::Symbol) -> String

Build the `sm_<major><minor>[a|f]` target name for compute capability `cap`.

The suffix is `a` for `feature_set === :arch`, `f` for `feature_set === :family`,
and empty for any other value (including `:baseline`).
"""
function format_target(cap::VersionNumber, feature_set::Symbol)
    suffix = if feature_set === :arch
        "a"
    elseif feature_set === :family
        "f"
    else
        ""
    end
    return string("sm_", cap.major, cap.minor, suffix)
end

const CUDACompilerConfig = CompilerConfig{PTXCompilerTarget, CUDACompilerParams}
const CUDACompilerJob = CompilerJob{PTXCompilerTarget,CUDACompilerParams}

Expand Down Expand Up @@ -124,10 +136,10 @@ end

# stamp `.version` with the ISA we want `ptxas` to validate against
# and `.target` with the arch that `--gpu-name` will use
function rewrite_ptx_header(asm, ptx, cap)
function rewrite_ptx_header(asm, ptx, cap, feature_set)
return replace(asm,
r"(\.version .+)" => ".version $(ptx.major).$(ptx.minor)",
r"\.target sm_\d+\w*" => ".target sm_$(cap.major)$(cap.minor)")
r"\.target sm_\d+\w*" => ".target $(format_target(cap, feature_set))")
end

function GPUCompiler.mcgen(@nospecialize(job::CUDACompilerJob), mod::LLVM.Module, format)
Expand All @@ -150,9 +162,12 @@ function GPUCompiler.mcgen(@nospecialize(job::CUDACompilerJob), mod::LLVM.Module
asm = replace(asm, r"(\.target .+), debug" => s"\1")
end

(; ptx, cap) = job.config.params
if job.config.target.ptx != ptx || job.config.target.cap != cap
asm = rewrite_ptx_header(asm, ptx, cap)
(; ptx, cap, feature_set) = job.config.params
needs_rewrite = job.config.target.ptx != ptx ||
job.config.target.cap != cap ||
feature_set !== :baseline
if needs_rewrite
asm = rewrite_ptx_header(asm, ptx, cap, feature_set)
end

return asm
Expand Down Expand Up @@ -184,7 +199,7 @@ function compiler_config(dev; kwargs...)
return config
end
@noinline function _compiler_config(dev; kernel=true, name=nothing, always_inline=false,
cap=nothing, ptx=nothing, kwargs...)
cap=nothing, ptx=nothing, feature_set=nothing, kwargs...)
# determine the toolchain
llvm_support = llvm_compat()
cuda_support = cuda_compat()
Expand Down Expand Up @@ -241,9 +256,14 @@ end
# NVIDIA bug #3600554: ptxas segfaults with our debug info, fixed in 11.7
debuginfo = runtime_version() >= v"11.7"

# Conservatively pick baseline for backward compatibility,
# requiring explicit opt-in for family- and architecture-specific instructions.
feature_set = something(feature_set, :baseline)
validate_feature_set(cuda_cap, cuda_ptx, feature_set)

# create GPUCompiler objects
target = PTXCompilerTarget(; cap=llvm_cap, ptx=llvm_ptx, debuginfo, kwargs...)
params = CUDACompilerParams(; cap=cuda_cap, ptx=cuda_ptx)
params = CUDACompilerParams(; cap=cuda_cap, ptx=cuda_ptx, feature_set)
CompilerConfig(target, params; kernel, name, always_inline)
end

Expand Down Expand Up @@ -278,9 +298,8 @@ function compile(@nospecialize(job::CompilerJob))
push!(ptxas_opts, "--compile-only")
end

ptx = job.config.params.ptx
cap = job.config.params.cap
arch = "sm_$(cap.major)$(cap.minor)"
(; ptx, cap, feature_set) = job.config.params
arch = format_target(cap, feature_set)

# validate use of parameter memory
argtypes = filter([KernelState, job.source.specTypes.parameters...]) do dt
Expand Down
3 changes: 2 additions & 1 deletion CUDACore/src/compiler/execution.jl
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ kernel_compile(::LLVMBackend, f::F, tt::TT=Tuple{}; kwargs...) where {F,TT} =
## high-level @cuda interface

const MACRO_KWARGS = [:dynamic, :launch, :backend]
const COMPILER_KWARGS = [:kernel, :name, :always_inline, :minthreads, :maxthreads, :blocks_per_sm, :maxregs, :fastmath, :cap, :ptx]
const COMPILER_KWARGS = [:kernel, :name, :always_inline, :minthreads, :maxthreads, :blocks_per_sm, :maxregs, :fastmath, :cap, :ptx, :feature_set]
const LAUNCH_KWARGS = [:cooperative, :blocks, :threads, :clustersize, :shmem, :stream]


Expand Down Expand Up @@ -434,6 +434,7 @@ The following keyword arguments are supported:
- `always_inline`: inline all function calls in the kernel
- `fastmath`: use less precise square roots and flush denormals
- `cap` and `ptx`: to override the compute capability and PTX version to compile for
- `feature_set`: PTX feature set, one of `:baseline` (default), `:family`, or `:arch`

The output of this function is automatically cached, i.e. you can simply call `cufunction`
in a hot path without degrading performance. New code will be generated automatically, when
Expand Down
34 changes: 33 additions & 1 deletion test/core/codegen.jl
Original file line number Diff line number Diff line change
Expand Up @@ -255,10 +255,42 @@ end

@test !success(run_ptxas(asm_pre, "sm_75"))

asm_post = CUDACore.rewrite_ptx_header(asm_pre, v"8.0", v"9.0")
asm_post = CUDACore.rewrite_ptx_header(asm_pre, v"8.0", v"9.0", :baseline)
@test occursin(".target sm_90", asm_post)

@test success(run_ptxas(asm_post, "sm_90"))

# Architecture-specific feature set appends an `a` suffix to the .target directive (and the same
# string is what `compile()` passes to --gpu-name, since ptxas requires exact match for `a`-mode).
asm_arch = CUDACore.rewrite_ptx_header(asm_pre, v"8.0", v"9.0", :arch)
@test occursin(".target sm_90a", asm_arch)
@test success(run_ptxas(asm_arch, "sm_90a"))

# Family-specific appends `f`. Requires PTX 8.8+ at the `.target` line.
asm_family = CUDACore.rewrite_ptx_header(asm_pre, v"8.8", v"10.0", :family)
@test occursin(".target sm_100f", asm_family)
@test success(run_ptxas(asm_family, "sm_100f"))
end

@testset "CUDACompilerParams hash discriminates on feature_set" begin
# Without feature_set in the hash, two params differing only on feature_set would collide
# in the compiler cache and silently return a cubin compiled for the wrong feature set.
# Both instances share `cap` and `ptx`; only `feature_set` differs.
base = CUDACore.CUDACompilerParams(cap=v"9.0", ptx=v"8.0", feature_set=:baseline)
arch = CUDACore.CUDACompilerParams(cap=v"9.0", ptx=v"8.0", feature_set=:arch)
@test hash(base) != hash(arch)
# Equality must tell them apart as well, so that `==` and `hash` stay consistent.
@test base != arch
end

@testset "validate_feature_set" begin
# Architecture-specific needs CC >= 9.0 and PTX >= 8.0.
# Family-specific needs CC >= 10.0 and PTX >= 8.8.
# :arch with CC 8.6: compute capability too low
@test_throws ErrorException CUDACore.validate_feature_set(v"8.6", v"8.0", :arch)
# :arch with PTX 7.8: PTX ISA too low
@test_throws ErrorException CUDACore.validate_feature_set(v"9.0", v"7.8", :arch)
# :family with CC 9.0: compute capability too low
@test_throws ErrorException CUDACore.validate_feature_set(v"9.0", v"8.0", :family)
# :family with PTX 8.7: PTX ISA too low
@test_throws ErrorException CUDACore.validate_feature_set(v"10.0", v"8.7", :family)
# Values exactly at the minimum are accepted; a valid combination returns `nothing`.
@test CUDACore.validate_feature_set(v"9.0", v"8.0", :arch) === nothing
@test CUDACore.validate_feature_set(v"10.0", v"8.8", :family) === nothing
# :baseline places no extra requirement on CC or PTX.
@test CUDACore.validate_feature_set(v"5.0", v"6.2", :baseline) === nothing
end

end
Expand Down