Skip to content

Commit b49d8cb

Browse files
authored
add gptq benchmark, and speed up by ~3x with compile (#4310)
* Update [ghstack-poisoned] * Update [ghstack-poisoned] * Update [ghstack-poisoned] * Update [ghstack-poisoned] * Update [ghstack-poisoned] * Update [ghstack-poisoned] * Update [ghstack-poisoned] * Update [ghstack-poisoned] * Update [ghstack-poisoned] * Update [ghstack-poisoned]
1 parent 2c41725 commit b49d8cb

3 files changed

Lines changed: 123 additions & 2 deletions

File tree

benchmarks/benchmark_gptq.py

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
# Copyright (c) Meta Platforms, Inc. and affiliates.
2+
# All rights reserved.
3+
#
4+
# This source code is licensed under the BSD 3-Clause license found in the
5+
# LICENSE file in the root directory of this source tree.
6+
7+
import time
8+
9+
import fire
10+
import torch
11+
12+
from torchao.prototype.gptq import GPTQConfig, gptq_quantize
13+
from torchao.prototype.mx_formats.inference_workflow import (
14+
NVFP4DynamicActivationNVFP4WeightConfig,
15+
)
16+
17+
18+
def run(
    K: int = 2048,
    N: int = 4096,
    profile_fname: str | None = None,
):
    """Benchmark ``gptq_quantize`` on random CUDA data.

    Args:
        K: Input feature dimension; the Hessian proxy ``H`` is ``(K, K)``.
        N: Output feature dimension; the weight ``W_t`` is ``(N, K)``.
        profile_fname: If given, capture a ``torch.profiler`` trace of a
            single run and export it as a Chrome trace to this path.
            Otherwise, time several iterations and report the average.
    """
    print(f"K={K}, N={N}")

    # Symmetric PSD matrix H = A^T A, standing in for a real GPTQ Hessian.
    A = torch.randn(K, K, dtype=torch.float32, device="cuda")
    H = A.t() @ A

    W_t = torch.randn(N, K, dtype=torch.bfloat16, device="cuda")

    config = GPTQConfig(
        step="convert",
        base_config=NVFP4DynamicActivationNVFP4WeightConfig(
            use_dynamic_per_tensor_scale=True,
            use_triton_kernel=True,
        ),
    )

    # Warmup: pay one-time compilation/autotuning cost outside the timed region.
    print("Warmup...")
    gptq_quantize(H.clone(), W_t.clone(), config)
    torch.cuda.synchronize()

    num_runs = 5
    if profile_fname is not None:
        print("Profiling run...")
        with torch.profiler.profile(
            activities=[
                torch.profiler.ProfilerActivity.CPU,
                torch.profiler.ProfilerActivity.CUDA,
            ],
            record_shapes=True,
            with_stack=True,
        ) as prof:
            torch.cuda.synchronize()
            # perf_counter is monotonic and high-resolution; time.time() is
            # neither guaranteed for interval measurement.
            start = time.perf_counter()
            # Inputs are cloned per call — presumably gptq_quantize mutates
            # them in place (TODO confirm); cloning keeps each run independent.
            gptq_quantize(H.clone(), W_t.clone(), config)
            torch.cuda.synchronize()
            elapsed = time.perf_counter() - start
            print(f"gptq_quantize time: {elapsed:.3f}s")
        prof.export_chrome_trace(profile_fname)
        print(f"Saved: {profile_fname}")
    else:
        print(f"Timed run ({num_runs} iterations)...")
        times = []
        for _ in range(num_runs):
            torch.cuda.synchronize()
            start = time.perf_counter()
            gptq_quantize(H.clone(), W_t.clone(), config)
            torch.cuda.synchronize()
            times.append(time.perf_counter() - start)
        avg = sum(times) / len(times)
        print(f"gptq_quantize avg time: {avg:.3f}s")
75+
# CLI entry point: expose `run`'s parameters as command-line flags via fire.
if __name__ == "__main__":
    fire.Fire(run)

test/prototype/gptq/test_gptqv2.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,13 @@
1010
import torch
1111
import torch.nn.functional as F
1212

13+
from torchao.utils import torch_version_at_least

# Module-wide skip: the GPTQ prototype only works on sufficiently new PyTorch.
pytestmark = pytest.mark.skipif(
    not torch_version_at_least("2.11.0"),
    reason="GPTQ prototype requires PyTorch 2.11+",
)
19+
1320
from torchao.prototype.gptq import (
1421
GPTQConfig,
1522
gptq_quantize,

torchao/prototype/gptq/api.py

Lines changed: 40 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,14 +4,18 @@
44
# This source code is licensed under the BSD 3-Clause license found in the
55
# LICENSE file in the root directory of this source tree.
66

7+
import os
78
import types
9+
import warnings
810
from dataclasses import dataclass
911
from functools import partial
1012
from typing import Union
1113

1214
import torch
1315
import torch.nn as nn
1416

17+
from torchao.utils import torch_version_at_least
18+
1519
try:
1620
from mslk.quantize.shuffle import int4_row_quantize_zp, pack_int4
1721
except:
@@ -265,6 +269,40 @@ def _nvfp4_with_precalculated_scales_q(
265269
return data_lp_packed
266270

267271

272+
# Set to True to torch.compile the NVFP4 quantize/dequantize functions
# inside gptq_quantize. Gives ~3x speedup.
_use_torch_compile = True

if not _use_torch_compile:
    # Eager fallback: use the plain Python implementations directly.
    _nvfp4_qdq_fn = _nvfp4_with_precalculated_scales_qdq
    _nvfp4_q_fn = _nvfp4_with_precalculated_scales_q
else:
    _nvfp4_qdq_fn = torch.compile(_nvfp4_with_precalculated_scales_qdq)
    _nvfp4_q_fn = torch.compile(_nvfp4_with_precalculated_scales_q)

    if not torch_version_at_least("2.11.0"):
        # Older PyTorch lacks the inductor knob used below, so we can only warn.
        warnings.warn(
            "PyTorch < 2.11.0 detected. Upgrade to PyTorch 2.11.0+ for "
            "better GPTQ numerics with torch.compile (IEEE-compliant "
            "division rounding)."
        )
    else:
        # Triton's default f32 division uses approximate reciprocal which
        # introduces ~1 ULP error per division. In GPTQ's error propagation
        # loop this compounds across columns. IEEE-compliant division rounding
        # eliminates the drift.
        import torch._inductor.config as _inductor_config

        if os.environ.get("TORCHINDUCTOR_EMULATE_DIVISION_ROUNDING") == "0":
            # User explicitly disabled it; respect the override but warn.
            warnings.warn(
                "TORCHINDUCTOR_EMULATE_DIVISION_ROUNDING=0 may cause numerical "
                "drift in GPTQ with torch.compile. "
                "Consider unsetting it or setting it to 1."
            )
        else:
            _inductor_config.eager_numerics.division_rounding = True
304+
305+
268306
def gptq_quantize(H: torch.Tensor, W_t: torch.Tensor, config: GPTQConfig):
269307
"""
270308
This function implements the GPTQ algorithm described in this paper: https://arxiv.org/abs/2210.17323 (Algorithm 1)
@@ -472,7 +510,7 @@ def gptq_quantize(H: torch.Tensor, W_t: torch.Tensor, config: GPTQConfig):
472510
)
473511
dq = q.dequantize(output_dtype=torch.float)
474512
elif isinstance(base_config, NVFP4DynamicActivationNVFP4WeightConfig):
475-
dq = _nvfp4_with_precalculated_scales_qdq(
513+
dq = _nvfp4_qdq_fn(
476514
w_t,
477515
nvfp4_global_scale,
478516
scale.squeeze(-1),
@@ -519,7 +557,7 @@ def gptq_quantize(H: torch.Tensor, W_t: torch.Tensor, config: GPTQConfig):
519557
combined_scale = (
520558
torch.cat(group_qparams, dim=0).reshape(K // group_size, N).t().contiguous()
521559
)
522-
qdata = _nvfp4_with_precalculated_scales_q(
560+
qdata = _nvfp4_q_fn(
523561
W_t,
524562
nvfp4_global_scale,
525563
combined_scale,

0 commit comments

Comments
 (0)