Skip to content

Commit ee3d62a

Browse files
authored (author name lost in page extraction)
small fix for inference roofline model (#3990)
Update [ghstack-poisoned]
1 parent b8708a2 commit ee3d62a

3 files changed

Lines changed: 10 additions & 8 deletions

File tree

benchmarks/float8/float8_inference_roofline.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -581,6 +581,7 @@ def run(
581581
M, K, N, torch.float8_e4m3fn, gemm_recipe_name
582582
)
583583
print("bf16_gemm_time_sympy", bf16_gemm_time_sympy)
584+
print("bf16_ovhd_time_sympy", bf16_ovhd_time_sympy)
584585
print("fp8_gemm_time_sympy", fp8_gemm_time_sympy)
585586
print("fp8_ovhd_time_sympy", fp8_ovhd_time_sympy)
586587
print()

docs/source/workflows/inference.md

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -121,10 +121,10 @@ torch version 2.12.0.dev20260218+cu130
121121
torchao version 0.17.0+git3075bb624
122122
...
123123
fwd_M fwd_K fwd_N r_fp8_gemm_and_ovhd_spdp b_fp8_e2e_spdp
124-
0 1024 1024 1024 0.64 0.94
125-
1 2048 2048 2048 1.75 1.21
126-
2 4096 4096 4096 1.90 1.45
127-
3 8192 8192 8192 1.94 1.75
124+
0 1024 1024 1024 1.00 0.93
125+
1 2048 2048 2048 1.75 1.20
126+
2 4096 4096 4096 1.90 1.46
127+
3 8192 8192 8192 1.94 1.76
128128
4 16384 16384 16384 1.97 1.77
129129

130130
#
@@ -137,11 +137,11 @@ torch version 2.12.0.dev20260218+cu130
137137
torchao version 0.17.0+git3075bb624
138138
...
139139
fwd_M fwd_K fwd_N r_fp8_gemm_and_ovhd_spdp b_fp8_e2e_spdp
140-
0 1024 1024 1024 0.64 0.37
141-
1 2048 2048 2048 2.39 0.74
140+
0 1024 1024 1024 1.00 0.38
141+
1 2048 2048 2048 2.39 0.73
142142
2 4096 4096 4096 2.92 1.19
143-
3 8192 8192 8192 3.34 1.78
144-
4 16384 16384 16384 3.63 2.57
143+
3 8192 8192 8192 3.34 1.80
144+
4 16384 16384 16384 3.63 2.56
145145
```
146146

147147
## Other Available Quantization Techniques

torchao/testing/training/roofline_utils.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -561,6 +561,7 @@ def get_inference_bf16_activation_mem_sympy(M, K, N, gpu_name: Optional[str] = N
561561
kernel_rw = BYTES_PER_EL_BF16 * M * K * 2
562562
# convert from bytes to seconds
563563
res_s = kernel_rw / specs["peak_mem_bw_bytes_sec"] / specs["pct_achievable_mem_bw"]
564+
res_s = sympy.Max(res_s, KERNEL_LAUNCH_OVERHEAD_SEC)
564565
return res_s
565566

566567

0 commit comments

Comments (0)