Skip to content

Commit aa5f34e

Browse files
authored
Update benchmarks scripts. (#151)
1 parent 0e8933d commit aa5f34e

File tree

15 files changed

+191
-48
lines changed

15 files changed

+191
-48
lines changed

README.md

Lines changed: 21 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -88,20 +88,29 @@ while the latter needs valid `CuArray`s to be passed to the kernel.
8888
Run benchmarks with:
8989

9090
```bash
91-
julia --project examples/benchmarks.jl # Julia
92-
uv run python examples/benchmarks.py # Python (for comparison)
91+
julia --project=examples examples/benchmarks.jl # Julia
92+
uv run python examples/benchmarks.py # Python (for comparison)
9393
```
9494

95-
Benchmarks comparing cuTile.jl against cuTile Python on an RTX 5080:
96-
97-
| Kernel | Julia | Python | Status |
98-
|--------|-------|--------|--------|
99-
| Vector Addition | 841 GB/s | 847 GB/s | OK (=) |
100-
| Matrix Transpose | 807 GB/s | 813 GB/s | OK (-1%) |
101-
| Layer Normalization | 653 GB/s | 758 GB/s | -14% |
102-
| Matrix Multiplication | 43.1 TFLOPS | 50.3 TFLOPS | -14% |
103-
| Batch Matrix Multiply | 30.4 TFLOPS | 40.0 TFLOPS | -24% |
104-
| FFT (3-stage Cooley-Tukey) | 620 μs | 486 μs | -28% |
95+
Benchmarks comparing cuTile.jl against cuTile Python on an RTX 5080 (20 runs, 5 warmup,
96+
min time reported):
97+
98+
| Kernel | Size | Julia | Python | Status |
99+
|--------|------|-------|--------|--------|
100+
| Vector Addition | 2^27 f32 | 841 GB/s | 847 GB/s | OK (=) |
101+
| Matrix Transpose | 8192² f32 | 773 GB/s | 817 GB/s | -5% |
102+
| Layer Normalization | 4096² f32 fwd | 615 GB/s | 761 GB/s | -19% |
103+
| Matrix Multiplication | 4096³ f32 | 47.6 TFLOPS | 50.2 TFLOPS | -5% |
104+
| Batch Matrix Multiply | 1024×512×2048 ×8 f32 | 28.7 TFLOPS | 40.0 TFLOPS | -28% |
105+
| FFT (3-stage Cooley-Tukey) | 512-pt ×64 c64 | 465 μs | 486 μs | OK (+4%) |
106+
107+
With the same tileiras compiler, all kernels compile to identical register counts, block sizes, and
108+
occupancy. The remaining gap is from **1→0 indexing overhead**: Julia's 1-based `bid()` and
109+
load indices generate extra `subi` ops in the Tile IR that perturb tileiras's SASS
110+
instruction scheduling (e.g. missing `.reuse` operand collector flags on HMMA, different
111+
address computation instruction selection). This affects all kernels proportionally to loop
112+
count (layernorm 174 vs 128 IR lines across 3 loops; batchmatmul L1 hit rate 9.5% vs 41.3%
113+
from cascading scheduling differences).
105114

106115

107116
## Supported Operations

examples/batchmatmul.jl

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,11 @@ function verify(data, result)
104104
@assert isapprox(Array(result.C), expected; rtol=1e-2) "max diff: $(maximum(abs.(Array(result.C) - expected)))"
105105
end
106106

107+
function metric(data)
108+
# 2*M*K*N*Batch FLOPs (multiply-add = 2 ops)
109+
return 2 * data.M * data.K * data.N * data.Batch, "TFLOPS"
110+
end
111+
107112
#=============================================================================
108113
Reference implementations for benchmarking
109114
=============================================================================#

examples/batchmatmul.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,12 @@ def run(data, *, tm: int = 128, tn: int = 128, tk: int = 64, nruns: int = 1, war
9393
return {"C": C, "times": times}
9494

9595

96+
def metric(data):
97+
"""Return (total_flops, unit) for throughput calculation."""
98+
# 2*M*K*N*Batch FLOPs (multiply-add = 2 ops)
99+
return 2 * data["M"] * data["K"] * data["N"] * data["Batch"], "TFLOPS"
100+
101+
96102
def verify(data, result):
97103
"""Verify batch matmul results."""
98104
A_np = cp.asnumpy(data["A"]).astype(np.float32)

examples/benchmarks.jl

Lines changed: 51 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,8 @@ using CUDA
99
Configuration
1010
=============================================================================#
1111

12-
const NRUNS = 10
13-
const WARMUP = 3
12+
const NRUNS = 20
13+
const WARMUP = 5
1414

1515
#=============================================================================
1616
Benchmark Utilities
@@ -20,20 +20,45 @@ struct BenchmarkResult
2020
name::String
2121
min_ms::Float64
2222
mean_ms::Float64
23+
throughput::String # e.g. "841 GB/s" or "43.1 TFLOPS" or ""
24+
end
25+
26+
function format_throughput(total, unit::String, time_ms::Float64)
27+
if unit == "GB/s"
28+
gbps = total / (time_ms / 1000) / 1e9
29+
return "$(round(Int, gbps)) GB/s"
30+
elseif unit == "TFLOPS"
31+
tflops = total / (time_ms / 1000) / 1e12
32+
return "$(round(tflops, digits=1)) TFLOPS"
33+
elseif unit == "μs"
34+
return "$(round(Int, time_ms * 1000)) μs"
35+
else
36+
return ""
37+
end
2338
end
2439

2540
function print_table(title::String, results::Vector{BenchmarkResult})
2641
println()
27-
println("=" ^ 60)
42+
println("=" ^ 72)
2843
println(" ", title)
29-
println("=" ^ 60)
30-
println(rpad("Implementation", 20), rpad("Min (ms)", 12), "Mean (ms)")
31-
println("-" ^ 60)
44+
println("=" ^ 72)
45+
has_throughput = any(r -> !isempty(r.throughput), results)
46+
if has_throughput
47+
println(rpad("Implementation", 20), rpad("Min (ms)", 12), rpad("Mean (ms)", 12), "Throughput")
48+
else
49+
println(rpad("Implementation", 20), rpad("Min (ms)", 12), "Mean (ms)")
50+
end
51+
println("-" ^ 72)
3252
for r in results
33-
println(rpad(r.name, 20), rpad(round(r.min_ms, digits=3), 12),
34-
round(r.mean_ms, digits=3))
53+
if has_throughput
54+
println(rpad(r.name, 20), rpad(round(r.min_ms, digits=3), 12),
55+
rpad(round(r.mean_ms, digits=3), 12), r.throughput)
56+
else
57+
println(rpad(r.name, 20), rpad(round(r.min_ms, digits=3), 12),
58+
round(r.mean_ms, digits=3))
59+
end
3560
end
36-
println("-" ^ 60)
61+
println("-" ^ 72)
3762
end
3863

3964
#=============================================================================
@@ -65,6 +90,12 @@ function run_benchmark(name::String)
6590
# Prepare data with benchmark=true for larger sizes
6691
data = @invokelatest mod.prepare(; benchmark=true)
6792

93+
# Get metric info if available
94+
metric_total, metric_unit = 0, ""
95+
if isdefined(mod, :metric)
96+
metric_total, metric_unit = @invokelatest mod.metric(data)
97+
end
98+
6899
# Run cuTile
69100
result = @invokelatest mod.run(data; nruns=NRUNS, warmup=WARMUP)
70101

@@ -86,17 +117,17 @@ function run_benchmark(name::String)
86117
merge!(results, others)
87118
end
88119

89-
return results
120+
return results, metric_total, metric_unit
90121
end
91122

92123
#=============================================================================
93124
Main
94125
=============================================================================#
95126

96127
function main()
97-
println("=" ^ 60)
128+
println("=" ^ 72)
98129
println(" cuTile.jl Benchmarks")
99-
println("=" ^ 60)
130+
println("=" ^ 72)
100131
println()
101132
println("Configuration:")
102133
println(" Runs: $NRUNS (+ $WARMUP warmup)")
@@ -105,18 +136,21 @@ function main()
105136
for name in discover_benchmarks()
106137
println("\nBenchmarking $name...")
107138

108-
results = run_benchmark(name)
109-
if results === nothing
139+
ret = run_benchmark(name)
140+
if ret === nothing
110141
println(" (skipped - no prepare/run functions)")
111142
continue
112143
end
113144

145+
results, metric_total, metric_unit = ret
146+
114147
# Convert to BenchmarkResult for printing
115148
benchmark_results = BenchmarkResult[]
116149
for (impl_name, times) in results
117150
min_t = minimum(times)
118151
mean_t = sum(times) / length(times)
119-
push!(benchmark_results, BenchmarkResult(impl_name, min_t, mean_t))
152+
tp = !isempty(metric_unit) ? format_throughput(metric_total, metric_unit, min_t) : ""
153+
push!(benchmark_results, BenchmarkResult(impl_name, min_t, mean_t, tp))
120154
end
121155

122156
# Sort by min time
@@ -126,9 +160,9 @@ function main()
126160
end
127161

128162
println()
129-
println("=" ^ 60)
163+
println("=" ^ 72)
130164
println(" Benchmark Complete")
131-
println("=" ^ 60)
165+
println("=" ^ 72)
132166
end
133167

134168
if abspath(PROGRAM_FILE) == @__FILE__

examples/benchmarks.py

Lines changed: 45 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -12,31 +12,52 @@
1212
# Configuration
1313
#=============================================================================
1414

15-
NRUNS = 10
16-
WARMUP = 3
15+
NRUNS = 20
16+
WARMUP = 5
1717

1818
#=============================================================================
1919
# Benchmark Utilities
2020
#=============================================================================
2121

2222
class BenchmarkResult:
23-
def __init__(self, name: str, min_ms: float, mean_ms: float):
23+
def __init__(self, name: str, min_ms: float, mean_ms: float, throughput: str = ""):
2424
self.name = name
2525
self.min_ms = min_ms
2626
self.mean_ms = mean_ms
27+
self.throughput = throughput
28+
29+
30+
def format_throughput(total, unit: str, time_ms: float) -> str:
31+
if unit == "GB/s":
32+
gbps = total / (time_ms / 1000) / 1e9
33+
return f"{gbps:.0f} GB/s"
34+
elif unit == "TFLOPS":
35+
tflops = total / (time_ms / 1000) / 1e12
36+
return f"{tflops:.1f} TFLOPS"
37+
elif unit == "μs":
38+
return f"{time_ms * 1000:.0f} μs"
39+
else:
40+
return ""
2741

2842

2943
def print_table(title: str, results: list):
3044
"""Print formatted benchmark results table."""
3145
print()
32-
print("=" * 60)
46+
print("=" * 72)
3347
print(f" {title}")
34-
print("=" * 60)
35-
print(f"{'Implementation':<20}{'Min (ms)':<12}Mean (ms)")
36-
print("-" * 60)
48+
print("=" * 72)
49+
has_throughput = any(r.throughput for r in results)
50+
if has_throughput:
51+
print(f"{'Implementation':<20}{'Min (ms)':<12}{'Mean (ms)':<12}Throughput")
52+
else:
53+
print(f"{'Implementation':<20}{'Min (ms)':<12}Mean (ms)")
54+
print("-" * 72)
3755
for r in results:
38-
print(f"{r.name:<20}{r.min_ms:<12.3f}{r.mean_ms:.3f}")
39-
print("-" * 60)
56+
if has_throughput:
57+
print(f"{r.name:<20}{r.min_ms:<12.3f}{r.mean_ms:<12.3f}{r.throughput}")
58+
else:
59+
print(f"{r.name:<20}{r.min_ms:<12.3f}{r.mean_ms:.3f}")
60+
print("-" * 72)
4061

4162

4263
#=============================================================================
@@ -76,6 +97,10 @@ def run_benchmark(name: str):
7697
# Prepare data with benchmark=True for larger sizes
7798
data = prepare_fn(benchmark=True)
7899

100+
# Get metric info if available
101+
metric_fn = getattr(mod, "metric", None)
102+
metric_total, metric_unit = (0, "") if not metric_fn else metric_fn(data)
103+
79104
# Run cuTile
80105
result = run_fn(data, nruns=NRUNS, warmup=WARMUP)
81106

@@ -96,7 +121,7 @@ def run_benchmark(name: str):
96121
others = run_others_fn(data, nruns=NRUNS, warmup=WARMUP)
97122
results.update(others)
98123

99-
return results
124+
return results, metric_total, metric_unit
100125

101126

102127
#=============================================================================
@@ -106,9 +131,9 @@ def run_benchmark(name: str):
106131
def main():
107132
import torch # For GPU name
108133

109-
print("=" * 60)
134+
print("=" * 72)
110135
print(" cuTile Python Benchmarks")
111-
print("=" * 60)
136+
print("=" * 72)
112137
print()
113138
print("Configuration:")
114139
print(f" Runs: {NRUNS} (+ {WARMUP} warmup)")
@@ -117,27 +142,30 @@ def main():
117142
for name in discover_benchmarks():
118143
print(f"\nBenchmarking {name}...")
119144

120-
results = run_benchmark(name)
121-
if results is None:
145+
ret = run_benchmark(name)
146+
if ret is None:
122147
print(" (skipped - no prepare/run functions)")
123148
continue
124149

150+
results, metric_total, metric_unit = ret
151+
125152
# Convert to BenchmarkResult for printing
126153
benchmark_results = []
127154
for impl_name, times in results.items():
128155
min_t = min(times)
129156
mean_t = sum(times) / len(times)
130-
benchmark_results.append(BenchmarkResult(impl_name, min_t, mean_t))
157+
tp = format_throughput(metric_total, metric_unit, min_t) if metric_unit else ""
158+
benchmark_results.append(BenchmarkResult(impl_name, min_t, mean_t, tp))
131159

132160
# Sort by min time
133161
benchmark_results.sort(key=lambda r: r.min_ms)
134162

135163
print_table(name, benchmark_results)
136164

137165
print()
138-
print("=" * 60)
166+
print("=" * 72)
139167
print(" Benchmark Complete")
140-
print("=" * 60)
168+
print("=" * 72)
141169

142170

143171
if __name__ == "__main__":

examples/fft.jl

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -266,6 +266,11 @@ function verify(data, result)
266266
@assert isapprox(Array(result.output), reference, rtol=1e-4)
267267
end
268268

269+
function metric(data)
270+
# FFT is a latency benchmark; report time directly
271+
return 0, "μs"
272+
end
273+
269274
#=============================================================================
270275
Reference implementations for benchmarking
271276
=============================================================================#

examples/fft.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -176,6 +176,11 @@ def run(data, *, nruns: int = 1, warmup: int = 0):
176176
return {"output": output, "times": times}
177177

178178

179+
def metric(data):
180+
"""Return (0, unit) for FFT - latency benchmark, report time directly."""
181+
return 0, "μs"
182+
183+
179184
def verify(data, result):
180185
"""Verify FFT results."""
181186
reference = torch.fft.fft(data["input"], dim=-1)

examples/layernorm.jl

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -385,6 +385,11 @@ function test_layernorm(M, N, TILE_N; TILE_M::Int=32, eps::Float32=1f-5, name=no
385385
println(" fwd passed, bwd passed")
386386
end
387387

388+
function metric(data)
389+
# Forward: 3 reads of X + W + B reads + Y write + Mean/Rstd writes ≈ 4*M*N floats
390+
return 4 * data.M * data.N * sizeof(Float32), "GB/s"
391+
end
392+
388393
# No run_others for layernorm - no simple reference implementation to compare against
389394

390395
#=============================================================================

examples/layernorm.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -254,6 +254,12 @@ def verify(data, result):
254254
assert np.allclose(cp.asnumpy(result["DB"]), expected_DB, rtol=rtol, atol=atol), \
255255
f"DB mismatch! max diff: {np.max(np.abs(cp.asnumpy(result['DB']) - expected_DB))}"
256256

257+
def metric(data):
258+
"""Return (total_bytes, unit) for throughput calculation."""
259+
# Forward: 3 reads of X + W + B reads + Y write + Mean/Rstd writes ≈ 4*M*N floats
260+
return 4 * data["M"] * data["N"] * 4, "GB/s"
261+
262+
257263
# No run_others for layernorm - no simple reference implementation to compare against
258264

259265

examples/matmul.jl

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,11 @@ function verify(data, result)
105105
@assert isapprox(Array(result.C), expected; rtol=1e-2) "max diff: $(maximum(abs.(Array(result.C) - expected)))"
106106
end
107107

108+
function metric(data)
109+
# 2*M*N*K FLOPs (multiply-add = 2 ops)
110+
return 2 * data.M * data.N * data.K, "TFLOPS"
111+
end
112+
108113
#=============================================================================
109114
Reference implementations for benchmarking
110115
=============================================================================#

0 commit comments

Comments (0)