Precompile profile display.

maleadt · maleadt · commit e26dfb5cc566 · 2026-03-27T11:22:55.000+01:00
diff --git a/CUDATools/Project.toml b/CUDATools/Project.toml
@@ -12,6 +12,7 @@ GPUCompiler = "61eb1bfa-7361-4325-ad38-22787b887f55"
 LLVM = "929cbde3-209d-540e-8aea-75f648917ca0"
 NVML = "611af6d1-644e-4c5d-bd58-854d7d1254b9"
 NVTX = "5da4648a-3479-48b8-97b9-01cb529c0a1f"
+PrecompileTools = "aea7be01-6a6a-4083-8856-8a6e6704d82a"
 Preferences = "21216c6a-2e73-6563-6e65-726566657250"
 PrettyTables = "08abe8d2-0d0c-5749-adfa-8a2ac140af0d"
 Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
@@ -31,6 +32,7 @@ GPUCompiler = "1.4"
 LLVM = "9.3.1"
 NVML = "=6.0.0"
 NVTX = "1"
+PrecompileTools = "1"
 Preferences = "1"
 PrettyTables = "3"
 Printf = "1"
diff --git a/CUDATools/src/precompile.jl b/CUDATools/src/precompile.jl
@@ -1,4 +1,46 @@
-# @profile infrastructure
+# @profile infrastructure (GPU-dependent, can't execute during precompilation)
 precompile(Tuple{typeof(Profile.detect_cupti)})
 precompile(Tuple{typeof(Profile.profile_internally), Function})
 precompile(Tuple{typeof(Profile.capture), CUPTI.ActivityConfig})
+
+using PrecompileTools
+
+@compile_workload begin
+    # Precompile the @profile display path by showing a hand-built result (no GPU needed).
+    # The show method expects at least two cuCtxSynchronize entries in the host trace to
+    # delimit the profiled region, and at least one event recorded between them.
+    dummy = Profile.ProfileResults(;
+        host = (  # host trace: four entries, one per index of each column
+            id      = Int[1, 2, 3, 4],
+            start   = Float64[0.0, 0.001, 0.002, 0.010],
+            stop    = Float64[0.001, 0.002, 0.009, 0.011],
+            name    = String["cuCtxSynchronize", "cuCtxSynchronize",
+                             "cuLaunchKernel", "cuCtxSynchronize"],  # sync, sync, launch, sync
+            tid     = Int[1, 1, 1, 1],
+        ),
+        device = (  # device trace: a single entry named "kernel"
+            id      = Int[3],  # same id as the cuLaunchKernel host entry (presumably links them; confirm)
+            start   = Float64[0.003],
+            stop    = Float64[0.008],  # 0.003..0.008 falls inside the host launch span 0.002..0.009
+            name    = String["kernel"],
+            device  = Int[0],
+            context = Int[1],
+            stream  = Int[1],
+            grid            = Union{Missing,CUDACore.CuDim3}[CUDACore.CuDim3(1,1,1)],
+            block           = Union{Missing,CUDACore.CuDim3}[CUDACore.CuDim3(1,1,1)],
+            registers       = Union{Missing,Int64}[32],
+            shared_mem      = Union{Missing,@NamedTuple{static::Int64,dynamic::Int64}}[(static=0,dynamic=0)],
+            local_mem       = Union{Missing,@NamedTuple{thread::Int64,total::Int64}}[(thread=0,total=0)],
+            size            = Union{Missing,Int64}[missing],  # the only column holding `missing`
+        ),
+        nvtx = (  # no nvtx entries: every column is an empty, typed vector
+            id      = Int[],
+            start   = Float64[],
+            type    = Symbol[],
+            tid     = Int[],
+            name    = Union{Missing,String}[],
+            domain  = Union{Missing,String}[],
+        ),
+    )
+    show(devnull, dummy)  # output goes to devnull; the call is made so that it gets compiled
+end