Skip to content

Commit b09e974

Browse files
committed
[wip] perf improvements
Summary: (none provided)
Test Plan: (none provided)
ghstack-source-id: 7824ff7
ghstack-comment-id: 4316156896
Pull-Request: #4331
1 parent 12cd338 commit b09e974

2 files changed

Lines changed: 34 additions & 9 deletions

File tree

torchao/prototype/gptq/api.py

Lines changed: 24 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -274,12 +274,27 @@ def _nvfp4_with_precalculated_scales_q(
274274
return data_lp_packed
275275

276276

277+
def _nvfp4_gptq_inner_loop(
278+
w_t,
279+
nvfp4_global_scale,
280+
scale,
281+
Hinv_cur_k_k,
282+
):
283+
dq = _nvfp4_with_precalculated_scales_qdq(
284+
w_t,
285+
nvfp4_global_scale,
286+
scale.squeeze(-1),
287+
)
288+
err1 = (w_t - dq) / Hinv_cur_k_k
289+
return err1
290+
291+
277292
# Set to True to torch.compile the NVFP4 quantize/dequantize functions
278293
# inside gptq_quantize. Gives ~3x speedup.
279294
_use_torch_compile = True
280295

281296
if _use_torch_compile:
282-
_nvfp4_qdq_fn = torch.compile(_nvfp4_with_precalculated_scales_qdq)
297+
_nvfp4_gptq_inner_loop_fn = torch.compile(_nvfp4_gptq_inner_loop)
283298
_nvfp4_q_fn = torch.compile(_nvfp4_with_precalculated_scales_q)
284299

285300
if torch_version_at_least("2.11.0"):
@@ -304,7 +319,7 @@ def _nvfp4_with_precalculated_scales_q(
304319
"division rounding)."
305320
)
306321
else:
307-
_nvfp4_qdq_fn = _nvfp4_with_precalculated_scales_qdq
322+
_nvfp4_gptq_inner_loop_fn = _nvfp4_gptq_inner_loop
308323
_nvfp4_q_fn = _nvfp4_with_precalculated_scales_q
309324

310325

@@ -507,21 +522,22 @@ def gptq_quantize(H: torch.Tensor, W_t: torch.Tensor, config: GPTQConfig):
507522
w_t, scale, zero_point, group_size
508523
)
509524
dq = _int4_row_dequantize_zp(q, scale, zero_point, group_size)
525+
err1 = (w_t - dq) / Hinv_cur[k, k]
526+
510527
elif isinstance(base_config, Int8WeightOnlyConfig):
511528
q = Int8Tensor.from_hp(
512529
w_t,
513530
granularity=base_config.granularity,
514531
scale=quantized_tensor.scale,
515532
)
516533
dq = q.dequantize(output_dtype=torch.float)
534+
err1 = (w_t - dq) / Hinv_cur[k, k]
535+
517536
elif isinstance(base_config, NVFP4DynamicActivationNVFP4WeightConfig):
518-
dq = _nvfp4_qdq_fn(
519-
w_t,
520-
nvfp4_global_scale,
521-
scale.squeeze(-1),
537+
Hinv_cur_k_k = Hinv_cur[k, k]
538+
err1 = _nvfp4_gptq_inner_loop_fn(
539+
w_t, nvfp4_global_scale, scale, Hinv_cur_k_k
522540
)
523-
524-
err1 = (w_t - dq) / Hinv_cur[k, k]
525541
B_cur[:, k:] -= err1.matmul(Hinv_cur[k, k:].unsqueeze(0))
526542
B_cur_Err1[:, k] = err1.flatten()
527543

torchao/prototype/mx_formats/mx_tensor.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -204,7 +204,16 @@ def _to_mx_rceil(
204204
data_lp = data_hp * rcp_fp32
205205

206206
# Note: clamp preserves NaN values
207-
data_lp = torch.clamp(data_lp, min=-max_pos, max=max_pos)
207+
if not (torch.compiler.is_compiling() or is_fake(descale)):
208+
# As of 20250317, the Pytorch eager mode cast to `torch.float8_e4m3fn`
209+
# is unsaturated. This cast is saturated in triton. If we are compute bound,
210+
# we see a speedup if we remove this redundant clamp if we are compiling
211+
# to triton.
212+
# TODO(#1912): make the saturated cast work in eager mode and remove this
213+
# workaround.
214+
# TODO(future PR): unify this code between the FLOOR and RCEIL scaling
215+
# methods
216+
data_lp = torch.clamp(data_lp, min=-max_pos, max=max_pos)
208217

209218
return exponent, data_lp
210219

0 commit comments

Comments (0)