
Commit 488e447

Add support for flashinfer quantize kernel option for nvfp4
Summary: Added the flashinfer option for better performance on some of the workflows we are interested in; also added a numerical equivalence test between the different quantize_to_nvfp4_kernel_choice options.

Test Plan: pytest test/prototype/mx_formats/test_nvfp4_tensor.py -k test_quantize_to_nvfp4_kernel_numerical_equivalence

Reviewers:

Subscribers:

Tasks:

Tags:

ghstack-source-id: 0db4b3f
Pull Request resolved: #3912
1 parent 32f6744 commit 488e447
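
For context, a minimal usage sketch of the new kernel option (a sketch only: import paths are taken from the files touched below, and it assumes a CUDA device with flashinfer installed; per the diff, flashinfer needs bf16/fp16/e4m3 input, a per-tensor scale, swizzled scales, and the last dim divisible by 64):

import torch
from torchao.prototype.mx_formats.config import QuantizeToNVFP4KernelChoice
from torchao.prototype.mx_formats.nvfp4_tensor import (
    NVFP4Tensor,
    per_tensor_amax_to_scale,
)

# bf16 input with K = 256 (divisible by 64), as flashinfer requires
x = torch.randn(128, 256, device="cuda", dtype=torch.bfloat16)
per_tensor_scale = per_tensor_amax_to_scale(torch.amax(torch.abs(x)))

x_nvfp4 = NVFP4Tensor.to_nvfp4(
    x,
    per_tensor_scale=per_tensor_scale,
    is_swizzled_scales=True,
    quantize_to_nvfp4_kernel_choice=QuantizeToNVFP4KernelChoice.FLASHINFER,
)
x_dq = x_nvfp4.dequantize(torch.bfloat16)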

4 files changed: 125 additions & 25 deletions

test/prototype/mx_formats/test_nvfp4_tensor.py

Lines changed: 72 additions & 21 deletions
@@ -23,6 +23,7 @@
 from torchao.quantization.utils import compute_error
 from torchao.testing.utils import skip_if_rocm
 from torchao.utils import (
+    _is_flashinfer_available,
     is_sm_at_least_100,
     torch_version_at_least,
 )
@@ -368,8 +369,10 @@ def test_nvfp4_swizzled_scales_get_scales_method():
     not is_sm_at_least_100(), reason="requires sm100+ for raw intrinsics"
 )
 @torch.no_grad()
-def test_triton_nvfp4_quantize_equivalence(M, N, use_per_tensor_scale, dtype):
-    """Test that Triton and PyTorch NVFP4 quantization produce equivalent results."""
+def test_quantize_to_nvfp4_kernel_numerical_equivalence(
+    M, N, use_per_tensor_scale, dtype
+):
+    """Test that different quantize_to_nvfp4 kernel choices produce numerically equivalent results."""
     if not use_per_tensor_scale:
         pytest.skip("MSLK triton kernel requires per_tensor_scale")

@@ -380,19 +383,16 @@ def test_triton_nvfp4_quantize_equivalence(M, N, use_per_tensor_scale, dtype):
     if use_per_tensor_scale:
         per_tensor_scale = per_tensor_amax_to_scale(torch.amax(torch.abs(x)))

-    nvfp4_pt = NVFP4Tensor.to_nvfp4(
+    # Reference: TORCH kernel choice
+    nvfp4_ref = NVFP4Tensor.to_nvfp4(
         x.clone(),
         per_tensor_scale=per_tensor_scale,
         is_swizzled_scales=True,
         quantize_to_nvfp4_kernel_choice=QuantizeToNVFP4KernelChoice.TORCH,
     )
+    ref_dequant = nvfp4_ref.dequantize(dtype)

-    nvfp4_triton = NVFP4Tensor.to_nvfp4(
-        x.clone(),
-        per_tensor_scale=per_tensor_scale,
-        is_swizzled_scales=True,
-        quantize_to_nvfp4_kernel_choice=QuantizeToNVFP4KernelChoice.MSLK,
-    )
+    other_kernel_choices = [QuantizeToNVFP4KernelChoice.MSLK]

     torch.testing.assert_close(nvfp4_pt.scale.flatten(), nvfp4_triton.scale.flatten())
     pt_unpacked = unpack_uint4(nvfp4_pt.qdata.view(torch.uint8))
@@ -404,16 +404,44 @@ def test_triton_nvfp4_quantize_equivalence(M, N, use_per_tensor_scale, dtype):
         rtol=0,
     )

-    x_pt_dequant = nvfp4_pt.dequantize(dtype)
-    x_triton_dequant = nvfp4_triton.dequantize(dtype)
-
-    sqnr = compute_error(x_pt_dequant, x_triton_dequant)
-    SQNR_THRESHOLD = 40.0
-
-    assert sqnr >= SQNR_THRESHOLD, (
-        f"SQNR {sqnr:.2f} < {SQNR_THRESHOLD} for M={M}, N={N}, "
-        f"use_per_tensor_scale={use_per_tensor_scale}, dtype={dtype}"
-    )
+    # Flashinfer requires the library and per_tensor_scale
+    if _is_flashinfer_available() and use_per_tensor_scale:
+        other_kernel_choices.append(QuantizeToNVFP4KernelChoice.FLASHINFER)
+
+    SQNR_THRESHOLD = 28.0
+    for kc in other_kernel_choices:
+        nvfp4_other = NVFP4Tensor.to_nvfp4(
+            x.clone(),
+            per_tensor_scale=per_tensor_scale,
+            is_swizzled_scales=True,
+            quantize_to_nvfp4_kernel_choice=kc,
+        )
+
+        # For kernel choices that use the same quantization algorithm as TORCH
+        # (MSLK should be bitwise identical), verify internal data matches exactly
+        if kc == QuantizeToNVFP4KernelChoice.MSLK:
+            torch.testing.assert_close(
+                nvfp4_ref.scale.flatten(),
+                nvfp4_other.scale.flatten(),
+                atol=0,
+                rtol=0,
+            )
+            ref_unpacked = unpack_uint4(nvfp4_ref.qdata)
+            other_unpacked = unpack_uint4(nvfp4_other.qdata)
+            torch.testing.assert_close(
+                ref_unpacked,
+                other_unpacked,
+                atol=0,
+                rtol=0,
+            )
+
+        # Verify dequantized values are numerically close for all kernel choices
+        other_dequant = nvfp4_other.dequantize(dtype)
+        sqnr = compute_error(ref_dequant, other_dequant)
+        assert sqnr >= SQNR_THRESHOLD, (
+            f"SQNR {sqnr:.2f} < {SQNR_THRESHOLD} between TORCH and {kc}, "
+            f"M={M}, N={N}, use_per_tensor_scale={use_per_tensor_scale}, dtype={dtype}"
+        )


 @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
@@ -430,7 +458,11 @@ def test_triton_nvfp4_quantize_equivalence(M, N, use_per_tensor_scale, dtype):
 @pytest.mark.parametrize("inpt_dtype", [torch.bfloat16, torch.float32])
 @pytest.mark.parametrize(
     "quantize_to_nvfp4_kernel_choice",
-    [QuantizeToNVFP4KernelChoice.MSLK, QuantizeToNVFP4KernelChoice.TORCH],
+    [
+        QuantizeToNVFP4KernelChoice.MSLK,
+        QuantizeToNVFP4KernelChoice.FLASHINFER,
+        QuantizeToNVFP4KernelChoice.TORCH,
+    ],
 )
 @pytest.mark.parametrize(
     "shapes",
@@ -469,6 +501,10 @@ def test_nvfp4_matmul_with_amax(
     if quant_type == "weight_only" and compile:
         pytest.skip("TODO: weight_only currently errors w/ compile")

+    if quantize_to_nvfp4_kernel_choice == QuantizeToNVFP4KernelChoice.FLASHINFER:
+        if not _is_flashinfer_available():
+            pytest.skip("flashinfer not available")
+
     m, k, n = shapes

     # Create activation tensor
@@ -544,7 +580,11 @@ def test_nvfp4_to_copy():
 @pytest.mark.parametrize("transpose", [False, True])
 @pytest.mark.parametrize(
     "quantize_to_nvfp4_kernel_choice",
-    [QuantizeToNVFP4KernelChoice.TORCH, QuantizeToNVFP4KernelChoice.MSLK],
+    [
+        QuantizeToNVFP4KernelChoice.TORCH,
+        QuantizeToNVFP4KernelChoice.MSLK,
+        QuantizeToNVFP4KernelChoice.FLASHINFER,
+    ],
 )
 @pytest.mark.parametrize("is_swizzled_scales", [False, True])
 @pytest.mark.parametrize(
@@ -570,11 +610,22 @@ def test_scale_shape_matches_qdata(
         and not is_swizzled_scales
     ):
         pytest.skip("triton kernel requires swizzled scales")
+    if quantize_to_nvfp4_kernel_choice == QuantizeToNVFP4KernelChoice.FLASHINFER:
+        if not _is_flashinfer_available():
+            pytest.skip("flashinfer not available")
+        if not is_swizzled_scales:
+            pytest.skip("flashinfer requires swizzled scales")
+        if shape[-1] % 64 != 0:
+            pytest.skip("flashinfer requires K to be divisible by 64")

     block_size = 16

     x_hp = torch.randn(*shape, device="cuda")

+    if quantize_to_nvfp4_kernel_choice == QuantizeToNVFP4KernelChoice.FLASHINFER:
+        # flashinfer only supports fp16/bf16/e4m3 input
+        x_hp = x_hp.to(torch.bfloat16)
+
     per_tensor_scale = per_tensor_amax_to_scale(torch.amax(torch.abs(x_hp)))

     x = NVFP4Tensor.to_nvfp4(
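
The 28 dB gate above is an SQNR check between dequantized outputs. A standalone sketch of the metric, assuming torchao.quantization.utils.compute_error is the usual 20·log10 ratio of signal norm to error norm (sqnr_db below is a hypothetical stand-in, not the torchao function):

import torch

def sqnr_db(ref: torch.Tensor, other: torch.Tensor) -> torch.Tensor:
    # signal-to-quantization-noise ratio in dB
    return 20 * torch.log10(torch.linalg.norm(ref) / torch.linalg.norm(ref - other))

ref = torch.randn(1024)
noisy = ref + 1e-2 * torch.randn(1024)
print(sqnr_db(ref, noisy))  # roughly 40 dB here; the test requires >= 28 dB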

torchao/prototype/mx_formats/config.py

Lines changed: 3 additions & 0 deletions
@@ -43,6 +43,9 @@ class QuantizeToNVFP4KernelChoice(str, Enum):
     MSLK = "mslk"
     """Use MSLK triton high precision to nvfp4 quantize kernel"""

+    FLASHINFER = "flashinfer"
+    """Use flashinfer bf16 to nvfp4 quantize kernel"""
+

 torch.serialization.add_safe_globals([QuantizeToNVFP4KernelChoice])

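
Because QuantizeToNVFP4KernelChoice is a str Enum already registered via torch.serialization.add_safe_globals, the new member participates in value-based lookup and serialization like the existing ones; a small sketch (the temp path is illustrative):

import torch
from torchao.prototype.mx_formats.config import QuantizeToNVFP4KernelChoice

kc = QuantizeToNVFP4KernelChoice("flashinfer")  # value-based lookup on the str Enum
assert kc is QuantizeToNVFP4KernelChoice.FLASHINFER

torch.save(kc, "/tmp/kernel_choice.pt")
loaded = torch.load("/tmp/kernel_choice.pt")  # safe under weights_only loading
assert loaded == QuantizeToNVFP4KernelChoice.FLASHINFER
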
torchao/prototype/mx_formats/inference_workflow.py

Lines changed: 22 additions & 1 deletion
@@ -5,6 +5,7 @@
 # LICENSE file in the root directory of this source tree.

 import types
+import warnings
 from dataclasses import dataclass
 from functools import partial
 from typing import Optional
@@ -208,7 +209,7 @@ class NVFP4DynamicActivationNVFP4WeightConfig(AOBaseConfig):
     set to False.

     Configuration parameters:
-    - quantize_to_nvfp4_kernel_choice: QuantizeToNVFP4KernelChoice, kernel preference for quantization (default: QuantizeToNVFP4KernelChoice.MSLK)
+    - quantize_to_nvfp4_kernel_choice: QuantizeToNVFP4KernelChoice, kernel choice for quantization (default: QuantizeToNVFP4KernelChoice.MSLK)
       Requires `MSLK <https://github.com/pytorch/MSLK>`__ to be installed.
     - use_dynamic_per_tensor_scale: bool, whether to dynamically compute per tensor scale (default: True)
     - step: Optional[QuantizationStep], the quantization step for observer-based flow
@@ -249,6 +250,17 @@ def __post_init__(self):
             # Static quantization implies use_dynamic_per_tensor_scale=False
             self.use_dynamic_per_tensor_scale = False

+        if (
+            self.quantize_to_nvfp4_kernel_choice
+            == QuantizeToNVFP4KernelChoice.FLASHINFER
+        ):
+            if self.step is None and not self.use_dynamic_per_tensor_scale:
+                raise ValueError(
+                    "FLASHINFER kernel choice requires per_tensor_scale. "
+                    "Use step='prepare'/'convert' for static quantization, "
+                    "or set use_dynamic_per_tensor_scale=True."
+                )
+

 @register_quantize_module_handler(NVFP4DynamicActivationNVFP4WeightConfig)
 def _nvfp4_inference_linear_transform(
@@ -269,6 +281,15 @@ def _nvfp4_inference_linear_transform(
         raise RuntimeError(
             f"NVFP4 only supports weight shape with last 2 dims divisible by 16, got {weight.shape}"
         )
+    if (
+        config.quantize_to_nvfp4_kernel_choice == QuantizeToNVFP4KernelChoice.FLASHINFER
+        and weight.shape[-1] % 64 != 0
+    ):
+        warnings.warn(
+            f"Skipping NVFP4 quantization for layer with K={weight.shape[-1]}: "
+            f"flashinfer requires K to be divisible by 64."
+        )
+        return module

     step = config.step
     if step == QuantizationStep.PREPARE or step == "prepare":
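
A sketch of the new __post_init__ validation from the caller's side (keyword names follow the docstring above; other constructor defaults are assumed):

from torchao.prototype.mx_formats.config import QuantizeToNVFP4KernelChoice
from torchao.prototype.mx_formats.inference_workflow import (
    NVFP4DynamicActivationNVFP4WeightConfig,
)

# OK: dynamic per-tensor scale gives flashinfer the per_tensor_scale it needs
ok = NVFP4DynamicActivationNVFP4WeightConfig(
    quantize_to_nvfp4_kernel_choice=QuantizeToNVFP4KernelChoice.FLASHINFER,
    use_dynamic_per_tensor_scale=True,
)

# Rejected: no step and no dynamic per-tensor scale -> no per_tensor_scale source
try:
    NVFP4DynamicActivationNVFP4WeightConfig(
        quantize_to_nvfp4_kernel_choice=QuantizeToNVFP4KernelChoice.FLASHINFER,
        use_dynamic_per_tensor_scale=False,
    )
except ValueError as e:
    print(e)  # "FLASHINFER kernel choice requires per_tensor_scale. ..."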

torchao/prototype/mx_formats/nvfp4_tensor.py

Lines changed: 28 additions & 3 deletions
@@ -33,7 +33,7 @@
 from torchao.quantization.quantize_.common import (
     QuantizeTensorKwargs,
 )
-from torchao.utils import TorchAOBaseTensor, fill_defaults
+from torchao.utils import TorchAOBaseTensor, _is_flashinfer_available, fill_defaults

 E4M3_EPS = torch.finfo(torch.float8_e4m3fn).tiny

@@ -98,7 +98,7 @@ class NVFP4Tensor(TorchAOBaseTensor):
         block_size (int): Block size for quantization (fixed at 16)
         orig_dtype (torch.dtype): Original tensor dtype before quantization
         is_swizzled_scales (bool): Whether scales are stored in swizzled (blocked) format
-        quantize_to_nvfp4_kernel_choice (QuantizeToNVFP4KernelChoice): Kernel preference for quantization
+        quantize_to_nvfp4_kernel_choice (QuantizeToNVFP4KernelChoice): Kernel choice for quantization
     """

     tensor_data_names = ["qdata", "scale"]
@@ -179,7 +179,7 @@ def to_nvfp4(
         act_per_tensor_scale: Optional pre-computed absolute maximum for calibration for activation
             If provided, uses per-tensor scaling. If None, uses block-wise scaling only.
         is_swizzled_scales: If True, store scales in swizzled format for faster matrix multiplication
-        quantize_to_nvfp4_kernel_choice: Kernel preference for quantization
+        quantize_to_nvfp4_kernel_choice: Kernel choice for quantization
         act_quant_kwargs: If specified, config for quantizing the activation

     Returns:
@@ -201,6 +201,31 @@ def to_nvfp4(
                 "Triton kernel requires per_tensor_scale"
             )
             blockwise_scales, data_lp = mslk_quantize_nvfp4(data_hp, per_tensor_scale)
+        elif quantize_to_nvfp4_kernel_choice == QuantizeToNVFP4KernelChoice.FLASHINFER:
+            from flashinfer import SfLayout
+            from flashinfer import nvfp4_quantize as flashinfer_nvfp4_quantize
+
+            assert _is_flashinfer_available(), (
+                "flashinfer is not available, please install flashinfer-python, apache-tvm-ffi, and nvidia-ml-py to use FLASHINFER kernel choice"
+            )
+            assert per_tensor_scale is not None, (
+                "flashinfer nvfp4_quantize requires per_tensor_scale"
+            )
+            assert is_swizzled_scales, (
+                "flashinfer nvfp4_quantize only supports swizzled scales"
+            )
+            assert K % 64 == 0, (
+                f"flashinfer nvfp4_quantize requires K (dim -1) to be divisible by 64, got {K}"
+            )
+            # flashinfer uses global_sf = (F8E4M3_MAX * F4_E2M1_MAX) / amax
+            # which is 1 / per_tensor_scale
+            global_sf = 1.0 / per_tensor_scale
+            data_lp, blockwise_scales = flashinfer_nvfp4_quantize(
+                data_hp,
+                global_sf,
+                sfLayout=SfLayout.layout_128x4,
+                do_shuffle=False,
+            )
         elif quantize_to_nvfp4_kernel_choice == QuantizeToNVFP4KernelChoice.TORCH:
             blockwise_scales, data_lp = nvfp4_quantize(
                 data_hp, block_size, per_tensor_scale
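
The scale conversion in the FLASHINFER branch follows from the two conventions named in the comment; a worked check, assuming per_tensor_amax_to_scale uses the amax / (F8E4M3_MAX * F4_E2M1_MAX) convention that the comment implies:

import torch

F8E4M3_MAX = torch.finfo(torch.float8_e4m3fn).max  # 448.0
F4_E2M1_MAX = 6.0  # largest magnitude representable in fp4 e2m1

amax = torch.tensor(100.0)

# torchao convention implied above: per_tensor_scale = amax / (448 * 6)
per_tensor_scale = amax / (F8E4M3_MAX * F4_E2M1_MAX)

# flashinfer convention: global_sf = (448 * 6) / amax
global_sf = (F8E4M3_MAX * F4_E2M1_MAX) / amax

assert torch.allclose(global_sf, 1.0 / per_tensor_scale)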
