Update on "Add support for flashinfer quantize kernel option for nvfp4"

jerryzh168 · jerryzh168 · commit 6d6cc54e440d · 2026-03-14T01:03:52.000-07:00
Summary:
Added the flashinfer option for better performance on some of the workflows
we are interested in, and added a numerical equivalence test between the
different quantize_to_nvfp4_kernel_choice options.

Test Plan:
pytest test/prototype/mx_formats/test_nvfp4_tensor.py -k test_quantize_to_nvfp4_kernel_numerical_equivalence

We'll test speedup a bit later

Reviewers:

Subscribers:

Tasks:

Tags:

[ghstack-poisoned]
diff --git a/test/prototype/mx_formats/test_nvfp4_tensor.py b/test/prototype/mx_formats/test_nvfp4_tensor.py
@@ -394,16 +394,6 @@ def test_quantize_to_nvfp4_kernel_numerical_equivalence(
 
     other_kernel_choices = [QuantizeToNVFP4KernelChoice.MSLK]
 
-    torch.testing.assert_close(nvfp4_pt.scale.flatten(), nvfp4_triton.scale.flatten())
-    pt_unpacked = unpack_uint4(nvfp4_pt.qdata.view(torch.uint8))
-    triton_unpacked = unpack_uint4(nvfp4_triton.qdata.view(torch.uint8))
-    torch.testing.assert_close(
-        pt_unpacked,
-        triton_unpacked,
-        atol=0,
-        rtol=0,
-    )
-
     # Flashinfer requires the library and per_tensor_scale
     if _is_flashinfer_available() and use_per_tensor_scale:
         other_kernel_choices.append(QuantizeToNVFP4KernelChoice.FLASHINFER)
diff --git a/torchao/prototype/mx_formats/nvfp4_tensor.py b/torchao/prototype/mx_formats/nvfp4_tensor.py
@@ -46,25 +46,30 @@ def _handle_use_triton_kernel(
 ) -> QuantizeToNVFP4KernelChoice:
     """Handle deprecated use_triton_kernel parameter.
 
-    Raises an exception if use_triton_kernel does not match
-    quantize_to_nvfp4_kernel_choice.
+    Raises ValueError if use_triton_kernel does not match
+    quantize_to_nvfp4_kernel_choice. use_triton_kernel=True corresponds to
+    MSLK, use_triton_kernel=False corresponds to TORCH or FLASHINFER.
     """
-    expected = (
-        QuantizeToNVFP4KernelChoice.MSLK
-        if use_triton_kernel
-        else QuantizeToNVFP4KernelChoice.TORCH
-    )
-    if expected != quantize_to_nvfp4_kernel_choice:
-        raise ValueError(
-            f"`use_triton_kernel={use_triton_kernel}` does not match "
-            f"`quantize_to_nvfp4_kernel_choice={quantize_to_nvfp4_kernel_choice}`. "
-            "`use_triton_kernel` is deprecated and will be removed after 0.17. "
-            "Please use `quantize_to_nvfp4_kernel_choice` instead. "
-            "`use_triton_kernel=True` is equivalent to "
-            "`quantize_to_nvfp4_kernel_choice=QuantizeToNVFP4KernelChoice.MSLK`, "
-            "`use_triton_kernel=False` is equivalent to "
-            "`quantize_to_nvfp4_kernel_choice=QuantizeToNVFP4KernelChoice.TORCH`."
-        )
+    if use_triton_kernel:
+        if quantize_to_nvfp4_kernel_choice != QuantizeToNVFP4KernelChoice.MSLK:
+            raise ValueError(
+                f"`use_triton_kernel=True` does not match "
+                f"`quantize_to_nvfp4_kernel_choice={quantize_to_nvfp4_kernel_choice}`. "
+                "`use_triton_kernel` is deprecated and will be removed after 0.17. "
+                "Please use `quantize_to_nvfp4_kernel_choice` instead. "
+                "`use_triton_kernel=True` is equivalent to "
+                "`quantize_to_nvfp4_kernel_choice=QuantizeToNVFP4KernelChoice.MSLK`."
+            )
+    else:
+        if quantize_to_nvfp4_kernel_choice == QuantizeToNVFP4KernelChoice.MSLK:
+            raise ValueError(
+                f"`use_triton_kernel=False` does not match "
+                f"`quantize_to_nvfp4_kernel_choice={quantize_to_nvfp4_kernel_choice}`. "
+                "`use_triton_kernel` is deprecated and will be removed after 0.17. "
+                "Please use `quantize_to_nvfp4_kernel_choice` instead. "
+                "`use_triton_kernel=False` is equivalent to "
+                "`quantize_to_nvfp4_kernel_choice=QuantizeToNVFP4KernelChoice.TORCH`."
+            )
     return quantize_to_nvfp4_kernel_choice