Skip to content

Commit d95add6

Browse files
authored
Improve TTFX (#3064)
1 parent ecb27a7 commit d95add6

33 files changed

Lines changed: 296 additions & 90 deletions

CUDACore/Project.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ LazyArtifacts = "4af54fe1-eca0-43a8-85a7-787d91b784e3"
2121
Libdl = "8f399da3-3557-5675-b5ff-fb832c97cbdb"
2222
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
2323
Logging = "56ddb016-857b-54e1-b83d-db4d58db5568"
24+
PrecompileTools = "aea7be01-6a6a-4083-8856-8a6e6704d82a"
2425
Preferences = "21216c6a-2e73-6563-6e65-726566657250"
2526
Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
2627
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
@@ -59,6 +60,7 @@ LazyArtifacts = "1"
5960
Libdl = "1"
6061
LinearAlgebra = "1"
6162
Logging = "1"
63+
PrecompileTools = "1"
6264
Preferences = "1"
6365
Printf = "1"
6466
Random = "1"

CUDACore/src/CUDACore.jl

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,8 @@ using Libdl
5454

5555
using Printf
5656

57+
using PrecompileTools
58+
5759
# Julia has several notions of `sizeof`
5860
# - Base.sizeof is the size of an object in memory
5961
# - Base.aligned_sizeof is the size of an object in an array/inline alloced

CUDACore/src/compiler/compilation.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -247,7 +247,7 @@ function compile(@nospecialize(job::CompilerJob))
247247
# lower to PTX
248248
# TODO: on 1.9, this actually creates a context. cache those.
249249
asm, meta = JuliaContext() do ctx
250-
GPUCompiler.compile(:asm, job)
250+
invoke_frozen(GPUCompiler.compile, :asm, job)
251251
end
252252

253253
# check if we'll need the device runtime

CUDACore/src/initialization.jl

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,29 @@
99
const _initialized = Ref{Bool}(false)
1010
const _initialization_error = Ref{String}()
1111

12+
# World age captured at __init__ time. Running the GPU compiler infrastructure
13+
# (typeinf_local, etc.) in this world avoids recompilation of native code that was
14+
# cached during precompilation but invalidated by later method definitions.
15+
# Default to typemax(UInt) so that during precompilation (before __init__ runs)
16+
# invoke_in_world clamps to the current world and behaves normally.
17+
const _initialization_world = Ref{UInt}(typemax(UInt))
18+
19+
"""
20+
invoke_frozen(f, args...; kwargs...)
21+
22+
Invoke `f(args...; kwargs...)` in the world captured at `__init__` time.
23+
This allows precompiled native code for the GPU compiler infrastructure
24+
(typeinf_local, etc.) to be reused, avoiding expensive recompilation.
25+
"""
26+
function invoke_frozen(f, args...; kwargs...)
27+
@inline
28+
kwargs = merge(NamedTuple(), kwargs)
29+
if isempty(kwargs)
30+
return Base.invoke_in_world(_initialization_world[], f, args...)
31+
end
32+
return Base.invoke_in_world(_initialization_world[], Core.kwcall, kwargs, f, args...)
33+
end
34+
1235
"""
1336
functional(show_reason=false)
1437
@@ -207,6 +230,10 @@ function __init__()
207230
end
208231
end
209232

233+
# capture the world age so that the compiler infrastructure can be invoked
234+
# in this world, reusing precompiled native code for typeinf_local etc.
235+
_initialization_world[] = Base.get_world_counter()
236+
210237
_initialized[] = true
211238
end
212239

CUDACore/src/precompile.jl

Lines changed: 42 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,45 @@
1+
@compile_workload begin
2+
# compile a dummy kernel to PTX to precompile the GPUCompiler pipeline.
3+
# this doesn't need a GPU — it only uses LLVM.
4+
let
5+
function _precompile_vadd(a)
6+
i = threadIdx().x
7+
@inbounds a[i] += 1f0
8+
return nothing
9+
end
110

2-
# array
3-
precompile(CuArray, (Vector{Int},))
11+
llvm_support = llvm_compat()
12+
llvm_cap = maximum(filter(<=(v"7.5"), llvm_support.cap))
13+
llvm_ptx = maximum(filter(>=(v"6.2"), llvm_support.ptx))
414

5-
# compilation
6-
precompile(compiler_cache, (CuContext,))
7-
#precompile(compiler_config, (CuDevice,))
8-
precompile(compile, (CompilerJob,))
9-
precompile(link, (CompilerJob,NamedTuple{(:image, :entry), Tuple{Vector{UInt8}, String}}))
10-
precompile(create_exceptions!, (CuModule,))
11-
precompile(run_and_collect, (Cmd,))
15+
target = PTXCompilerTarget(; cap=llvm_cap, ptx=llvm_ptx, debuginfo=true)
16+
params = CUDACompilerParams(; cap=llvm_cap, ptx=llvm_ptx)
17+
config = CompilerConfig(target, params; kernel=true, name=nothing, always_inline=false)
1218

13-
# launch
14-
precompile(cudaconvert, (Function,))
15-
precompile(Core.kwfunc(cudacall), (NamedTuple{(:threads, :blocks), Tuple{Int64, Int64}},typeof(cudacall),CuFunction,Type{Tuple{}}))
16-
precompile(Core.kwfunc(launch), (NamedTuple{(:threads, :blocks), Tuple{Int64, Int64}},typeof(launch),CuFunction))
19+
tt = Tuple{CuDeviceArray{Float32,1,AS.Global}}
20+
source = methodinstance(typeof(_precompile_vadd), tt)
21+
job = CompilerJob(source, config)
22+
23+
# On Julia < 1.12, GPU compilation during precompilation leaks foreign
24+
# method instances (MIs) into native compilation, causing LLVM errors
25+
# (e.g. "Cannot select: intrinsic %llvm.nvvm.membar.sys").
26+
@static if VERSION >= v"1.12-"
27+
JuliaContext() do ctx
28+
GPUCompiler.compile(:asm, job)
29+
end
30+
end
31+
end
32+
end
33+
34+
# kernel launch infrastructure
35+
precompile(Tuple{typeof(cufunction), typeof(identity), Type{Tuple{Nothing}}})
36+
precompile(Tuple{typeof(link), CompilerJob, NamedTuple{(:image, :entry), Tuple{Vector{UInt8}, String}}})
37+
38+
# GPUCompiler compilation pipeline (specialized for CUDACore's compile/link)
39+
precompile(Tuple{typeof(GPUCompiler.actual_compilation),
40+
Dict{Any, CuFunction}, Core.MethodInstance, UInt64,
41+
CUDACompilerConfig, typeof(compile), typeof(link)})
42+
43+
# scalar reference (used by cuBLAS for alpha/beta parameters)
44+
precompile(Tuple{Type{CuRefValue{Float32}}, Float32})
45+
precompile(Tuple{typeof(pool_free), Managed{DeviceMemory}})

CUDATools/Project.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ GPUCompiler = "61eb1bfa-7361-4325-ad38-22787b887f55"
1212
LLVM = "929cbde3-209d-540e-8aea-75f648917ca0"
1313
NVML = "611af6d1-644e-4c5d-bd58-854d7d1254b9"
1414
NVTX = "5da4648a-3479-48b8-97b9-01cb529c0a1f"
15+
PrecompileTools = "aea7be01-6a6a-4083-8856-8a6e6704d82a"
1516
Preferences = "21216c6a-2e73-6563-6e65-726566657250"
1617
PrettyTables = "08abe8d2-0d0c-5749-adfa-8a2ac140af0d"
1718
Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
@@ -31,6 +32,7 @@ GPUCompiler = "1.4"
3132
LLVM = "9.3.1"
3233
NVML = "=6.0.0"
3334
NVTX = "1"
35+
PrecompileTools = "1"
3436
Preferences = "1"
3537
PrettyTables = "3"
3638
Printf = "1"

CUDATools/src/CUDATools.jl

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,4 +40,6 @@ include("reflection.jl")
4040
include("profile.jl")
4141
include("utilities.jl")
4242

43+
include("precompile.jl")
44+
4345
end

CUDATools/src/precompile.jl

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
# @profile infrastructure (GPU-dependent, can't execute during precompilation)
2+
precompile(Tuple{typeof(Profile.detect_cupti)})
3+
precompile(Tuple{typeof(Profile.profile_internally), Function})
4+
precompile(Tuple{typeof(Profile.capture), CUPTI.ActivityConfig})
5+
6+
using PrecompileTools
7+
8+
@compile_workload begin
9+
# exercise the @profile display path with a dummy result (no GPU needed).
10+
# the show method expects at least two cuCtxSynchronize entries in the host trace
11+
# to delimit the profiled region, and at least one event between them.
12+
dummy = Profile.ProfileResults(;
13+
host = (
14+
id = Int[1, 2, 3, 4],
15+
start = Float64[0.0, 0.001, 0.002, 0.010],
16+
stop = Float64[0.001, 0.002, 0.009, 0.011],
17+
name = String["cuCtxSynchronize", "cuCtxSynchronize",
18+
"cuLaunchKernel", "cuCtxSynchronize"],
19+
tid = Int[1, 1, 1, 1],
20+
),
21+
device = (
22+
id = Int[3],
23+
start = Float64[0.003],
24+
stop = Float64[0.008],
25+
name = String["kernel"],
26+
device = Int[0],
27+
context = Int[1],
28+
stream = Int[1],
29+
grid = Union{Missing,CUDACore.CuDim3}[CUDACore.CuDim3(1,1,1)],
30+
block = Union{Missing,CUDACore.CuDim3}[CUDACore.CuDim3(1,1,1)],
31+
registers = Union{Missing,Int64}[32],
32+
shared_mem = Union{Missing,@NamedTuple{static::Int64,dynamic::Int64}}[(static=0,dynamic=0)],
33+
local_mem = Union{Missing,@NamedTuple{thread::Int64,total::Int64}}[(thread=0,total=0)],
34+
size = Union{Missing,Int64}[missing],
35+
),
36+
nvtx = (
37+
id = Int[],
38+
start = Float64[],
39+
type = Symbol[],
40+
tid = Int[],
41+
name = Union{Missing,String}[],
42+
domain = Union{Missing,String}[],
43+
),
44+
)
45+
show(devnull, dummy)
46+
end

CUDATools/src/profile.jl

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -147,7 +147,7 @@ end
147147
# external profiler
148148
#
149149

150-
function profile_externally(f)
150+
function profile_externally(@nospecialize(f))
151151
# wait for the device to become idle
152152
CUDACore.cuCtxSynchronize()
153153

@@ -366,7 +366,7 @@ Base.@kwdef struct ProfileResults
366366
raw::Bool=false
367367
end
368368

369-
function profile_internally(f; concurrent=true, kwargs...)
369+
function profile_internally(@nospecialize(f); concurrent=true, kwargs...)
370370
activity_kinds = [
371371
# API calls
372372
CUPTI.CUPTI_ACTIVITY_KIND_DRIVER,
@@ -390,7 +390,7 @@ function profile_internally(f; concurrent=true, kwargs...)
390390
# wait for the device to become idle
391391
CUDACore.cuCtxSynchronize()
392392

393-
CUPTI.enable!(cfg) do
393+
CUPTI.@enable! cfg begin
394394
# perform dummy operations to "warm up" the profiler, and avoid slow first calls.
395395
# we'll skip everything up until the synchronization call during processing
396396
CuArray([1])

CUDATools/src/reflection.jl

Lines changed: 10 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -59,11 +59,9 @@ function code_sass(io::IO, job::CompilerJob; raw::Bool=false)
5959
# NVIDIA bug #4604961: CUPTI in CUDA 12.4 Update 1 does not capture profiled events
6060
# unless the activity API is first activated. This is fixed in 12.5 Update 1.
6161
if v"2024.1.1" <= CUPTI.library_version() <= v"2024.2.0"
62-
cfg = CUPTI.ActivityConfig([CUPTI.CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL,
63-
CUPTI.CUPTI_ACTIVITY_KIND_INTERNAL_LAUNCH_API])
64-
CUPTI.enable!(cfg) do
65-
# do nothing
66-
end
62+
warmup_cfg = CUPTI.ActivityConfig([CUPTI.CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL,
63+
CUPTI.CUPTI_ACTIVITY_KIND_INTERNAL_LAUNCH_API])
64+
CUPTI.@enable! warmup_cfg nothing
6765
end
6866

6967
cfg = CUPTI.CallbackConfig([CUPTI.CUPTI_CB_DOMAIN_RESOURCE]) do domain, id, data
@@ -78,9 +76,7 @@ function code_sass(io::IO, job::CompilerJob; raw::Bool=false)
7876
end
7977

8078
compiled = CUDACore.compile(job)
81-
CUPTI.enable!(cfg) do
82-
CUDACore.link(job, compiled)
83-
end
79+
CUPTI.@enable! cfg CUDACore.link(job, compiled)
8480

8581
return
8682
end
@@ -96,11 +92,9 @@ function code_sass(f::Base.Callable, io::IO=stdout; raw::Bool=false)
9692
# NVIDIA bug #4604961: CUPTI in CUDA 12.4 Update 1 does not capture profiled events
9793
# unless the activity API is first activated. This is fixed in 12.5 Update 1.
9894
if v"2024.1.1" <= CUPTI.library_version() <= v"2024.2.0"
99-
cfg = CUPTI.ActivityConfig([CUPTI.CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL,
100-
CUPTI.CUPTI_ACTIVITY_KIND_INTERNAL_LAUNCH_API])
101-
CUPTI.enable!(cfg) do
102-
# do nothing
103-
end
95+
warmup_cfg = CUPTI.ActivityConfig([CUPTI.CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL,
96+
CUPTI.CUPTI_ACTIVITY_KIND_INTERNAL_LAUNCH_API])
97+
CUPTI.@enable! warmup_cfg nothing
10498
end
10599

106100
seen_modules = Set{UInt32}()
@@ -121,7 +115,7 @@ function code_sass(f::Base.Callable, io::IO=stdout; raw::Bool=false)
121115
disassemble_cubin(io, cubin; raw)
122116
end
123117

124-
CUPTI.enable!(f, cfg)
118+
CUPTI.@enable! cfg f()
125119

126120
return
127121
end
@@ -177,7 +171,8 @@ for method in (:code_typed, :code_warntype, :code_llvm, :code_native)
177171
source = methodinstance(typeof(func), Base.to_tuple_type(types))
178172
config = CUDACore.compiler_config(device(); kernel, compiler_kwargs...)
179173
job = CompilerJob(source, config)
180-
GPUCompiler.$method($(args...); kwargs...)
174+
# use frozen world to avoid recompiling the compiler infrastructure
175+
CUDACore.invoke_frozen(GPUCompiler.$method, $(args...); kwargs...)
181176
end
182177
$method(@nospecialize(func), @nospecialize(types); kwargs...) =
183178
$method(stdout, func, types; kwargs...)

0 commit comments

Comments
 (0)