
Commit ded3400
hook up mslk's to_nvfp4 kernel to torchao's inference nvfp4 workflows
Summary:

Decent speedups across the board. Note that we slightly modify the PyTorch reference code (MSLK's global scale is the reciprocal of torchao's per-tensor scale) to keep bitwise equivalency between the torchao reference and MSLK's kernel.

Test Plan:

Performance: wins across the board in a simple microbenchmark sweep.

Before:

```
> python benchmarks/float8/float8_inference_roofline.py --recipe_name nvfp4_static --enable_fusion_modeling True --skip_printing_detailed_metrics True
   fwd_M  fwd_K  fwd_N  r_fp8_gemm_and_ovhd_spdp  b_fp8_e2e_spdp
0   1024   1024   1024                      1.00            0.28
1   2048   2048   2048                      2.36            0.52
2   4096   4096   4096                      2.89            0.90
3   8192   8192   8192                      3.32            1.41
4  16384  16384  16384                      3.62            2.14
> python benchmarks/float8/float8_inference_roofline.py --recipe_name nvfp4_static --enable_fusion_modeling True --skip_printing_detailed_metrics True
   fwd_M  fwd_K  fwd_N  r_fp8_gemm_and_ovhd_spdp  b_fp8_e2e_spdp
0   1024   1024   1024                      1.00            0.34
1   2048   2048   2048                      2.74            0.64
2   4096   4096   4096                      3.42            1.06
3   8192   8192   8192                      3.67            1.58
4  16384  16384  16384                      3.82            2.31
```

After:

```
> python benchmarks/float8/float8_inference_roofline.py --recipe_name nvfp4_static --enable_fusion_modeling True --skip_printing_detailed_metrics True
   fwd_M  fwd_K  fwd_N  r_fp8_gemm_and_ovhd_spdp  b_fp8_e2e_spdp
0   1024   1024   1024                      1.00            0.39
1   2048   2048   2048                      2.36            0.68
2   4096   4096   4096                      2.89            1.27
3   8192   8192   8192                      3.32            1.93
4  16384  16384  16384                      3.62            2.73
> python benchmarks/float8/float8_inference_roofline.py --recipe_name nvfp4_static --enable_fusion_modeling True --skip_printing_detailed_metrics True
   fwd_M  fwd_K  fwd_N  r_fp8_gemm_and_ovhd_spdp  b_fp8_e2e_spdp
0   1024   1024   1024                      1.00            0.48
1   2048   2048   2048                      2.74            0.88
2   4096   4096   4096                      3.42            1.62
3   8192   8192   8192                      3.67            2.27
4  16384  16384  16384                      3.82            2.98
```

TODO: verify e2e model accuracy.

ghstack-source-id: fc673b2
ghstack-comment-id: 4027115619
Pull-Request: #4031
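For context on the convention flip mentioned above: a minimal sketch (not library code; constants match torchao's `F8E4M3_MAX = 448.0` and `F4_E2M1_MAX = 6.0`, and the function names here are illustrative) showing that the old torchao per-tensor scale and the MSLK global scale are exact reciprocals, so switching conventions only turns divides into multiplies:

```python
import torch

F8E4M3_MAX = 448.0  # largest normal float8_e4m3fn value
F4_E2M1_MAX = 6.0   # largest fp4 e2m1 value

def old_torchao_scale(amax: torch.Tensor) -> torch.Tensor:
    # pre-commit torchao convention: data is *divided* by this scale
    return amax.to(torch.float32) / (F8E4M3_MAX * F4_E2M1_MAX)

def mslk_global_scale(amax: torch.Tensor) -> torch.Tensor:
    # MSLK convention (adopted by this commit): data is *multiplied* by this
    return (F8E4M3_MAX * F4_E2M1_MAX) / amax.to(torch.float32)

amax = torch.tensor(123.0)
# the two conventions are reciprocals of each other
assert torch.allclose(
    old_torchao_scale(amax) * mslk_global_scale(amax), torch.tensor(1.0)
)
```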

6 files changed: 94 additions & 26 deletions


benchmarks/mx_formats/cast_bench.py

Lines changed: 5 additions & 1 deletion
```diff
@@ -83,8 +83,12 @@ def to_nvfp4_reference(x_hp):
 
 
 def to_nvfp4_reference_triton_swizzle(x_hp):
+    per_tensor_scale = torch.tensor(1.0, dtype=torch.float32, device=x_hp.device)
     nvfp4_tensor = NVFP4Tensor.to_nvfp4(
-        x_hp, use_triton_kernel=True, is_swizzled_scales=True
+        x_hp,
+        per_tensor_scale=per_tensor_scale,
+        use_triton_kernel=True,
+        is_swizzled_scales=True,
     )
     return nvfp4_tensor.qdata, nvfp4_tensor.scale
 
```

docs/source/workflows/inference.md

Lines changed: 10 additions & 10 deletions
````diff
@@ -155,11 +155,11 @@ torch version 2.12.0.dev20260218+cu130
 torchao version 0.17.0+git3075bb624
 ...
    fwd_M  fwd_K  fwd_N  r_fp8_gemm_and_ovhd_spdp  b_fp8_e2e_spdp
-0   1024   1024   1024                      1.00            0.28
-1   2048   2048   2048                      2.36            0.52
-2   4096   4096   4096                      2.89            0.90
-3   8192   8192   8192                      3.32            1.41
-4  16384  16384  16384                      3.62            2.14
+0   1024   1024   1024                      1.00            0.39
+1   2048   2048   2048                      2.36            0.68
+2   4096   4096   4096                      2.89            1.27
+3   8192   8192   8192                      3.32            1.93
+4  16384  16384  16384                      3.62            2.73
 
 #
 # nvfp4 with static global scaling (user API in progress)
@@ -171,11 +171,11 @@ torch version 2.12.0.dev20260218+cu130
 torchao version 0.17.0+git3075bb624
 ...
    fwd_M  fwd_K  fwd_N  r_fp8_gemm_and_ovhd_spdp  b_fp8_e2e_spdp
-0   1024   1024   1024                      1.00            0.34
-1   2048   2048   2048                      2.74            0.64
-2   4096   4096   4096                      3.42            1.06
-3   8192   8192   8192                      3.67            1.58
-4  16384  16384  16384                      3.82            2.31
+0   1024   1024   1024                      1.00            0.48
+1   2048   2048   2048                      2.74            0.88
+2   4096   4096   4096                      3.42            1.62
+3   8192   8192   8192                      3.67            2.27
+4  16384  16384  16384                      3.82            2.98
 ```
 
 ## Other Available Quantization Techniques
````

test/prototype/mx_formats/test_inference_workflow.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -217,7 +217,7 @@ def test_inference_workflow_nvfp4(
     y_ref = m(x)
 
     if use_triton_kernel and quant_type == "dynamic":
-        with cuda_kernel_profiler("quantize_nvfp4_triton_kernel") as result:
+        with cuda_kernel_profiler("triton_quantize_nvfp4_kernel") as result:
             y_mx = m_mx(x)
         assert result["found"], "Expected quantize_nvfp4 kernel to be found"
     else:
```

test/prototype/mx_formats/test_nvfp4_tensor.py

Lines changed: 9 additions & 1 deletion
```diff
@@ -369,6 +369,8 @@ def test_nvfp4_swizzled_scales_get_scales_method():
 @torch.no_grad()
 def test_triton_nvfp4_quantize_equivalence(M, N, use_per_tensor_scale, dtype):
     """Test that Triton and PyTorch NVFP4 quantization produce equivalent results."""
+    if not use_per_tensor_scale:
+        pytest.skip("MSLK triton kernel requires per_tensor_scale")
 
     torch.manual_seed(42)
     x = torch.randn(M, N, dtype=dtype, device="cuda")
@@ -559,8 +561,14 @@ def test_scale_shape_matches_qdata(
     block_size = 16
 
     x_hp = torch.randn(*shape, device="cuda")
+
+    per_tensor_scale = per_tensor_amax_to_scale(torch.amax(torch.abs(x_hp)))
+
     x = NVFP4Tensor.to_nvfp4(
-        x_hp, is_swizzled_scales=is_swizzled_scales, use_triton_kernel=use_triton_kernel
+        x_hp,
+        per_tensor_scale=per_tensor_scale,
+        is_swizzled_scales=is_swizzled_scales,
+        use_triton_kernel=use_triton_kernel,
     )
 
     if len(shape) == 2:
```
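For reference, a minimal sketch of the calling pattern these updated tests exercise (assumes a CUDA device supported by the triton kernel and torchao built with this change; the `qdata`/`scale` attribute access mirrors cast_bench.py above):

```python
import torch
from torchao.prototype.mx_formats.nvfp4_tensor import (
    NVFP4Tensor,
    per_tensor_amax_to_scale,
)

x_hp = torch.randn(128, 64, dtype=torch.bfloat16, device="cuda")

# Global scale from the tensor's amax, in MSLK convention: (448 * 6) / amax.
per_tensor_scale = per_tensor_amax_to_scale(torch.amax(torch.abs(x_hp)))

# per_tensor_scale is now required whenever use_triton_kernel=True.
x = NVFP4Tensor.to_nvfp4(
    x_hp,
    per_tensor_scale=per_tensor_scale,
    is_swizzled_scales=True,
    use_triton_kernel=True,
)
print(x.qdata.shape, x.scale.shape)
```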

torchao/prototype/mx_formats/kernels.py

Lines changed: 53 additions & 0 deletions
```diff
@@ -1327,3 +1327,56 @@ def mxfp8_quantize_cuda(
         raise NotImplementedError(
             "`mxfp8_quantize_cuda` needs (1) torch 2.8+ and (2) torchao built from source on a machine with CUDA capability 10.0+. Please see https://github.com/pytorch/ao/issues/2932 for more details."
         )
+
+
+try:
+    from mslk.quantize.triton.fp4_quantize import (
+        triton_quantize_nvfp4 as _mslk_triton_quantize_nvfp4,
+    )
+
+    _mslk_available = True
+except ImportError:
+    _mslk_available = False
+
+
+@torch.library.custom_op("ao::mslk_quantize_nvfp4", mutates_args=())
+def mslk_quantize_nvfp4(
+    x: torch.Tensor, global_scale: torch.Tensor
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """Quantize a tensor to NVFP4 using the MSLK triton kernel.
+
+    Args:
+        x: Input tensor to quantize.
+        global_scale: Global scale in MSLK convention (1.0 / per_tensor_scale).
+
+    Returns:
+        Tuple of (blockwise_scales, quantized_data_uint8) matching TorchAO's convention.
+    """
+    assert _mslk_available, (
+        "mslk is required for NVFP4 triton quantization. "
+        "Install from https://github.com/pytorch/MSLK"
+    )
+    data_lp, blockwise_scales = _mslk_triton_quantize_nvfp4(x, global_scale)
+    return blockwise_scales, data_lp.view(torch.uint8)
+
+
+@mslk_quantize_nvfp4.register_fake
+def _(x, global_scale):
+    # Mirror the reshape logic from the real MSLK kernel
+    orig_leading_dims, orig_N = x.shape[:-2], x.shape[-1]
+    x_2d = x.reshape(-1, orig_N)
+    M, N = x_2d.shape
+
+    num_scales = N // 16
+    n_row_blocks = triton.cdiv(M, 128)
+    n_col_blocks = triton.cdiv(num_scales, 4)
+    padded_rows = n_row_blocks * 128
+    padded_cols = n_col_blocks * 4
+
+    scales = x.new_empty(padded_rows, padded_cols, dtype=torch.float8_e4m3fn)
+    xq = x.new_empty(M, N // 2, dtype=torch.uint8)
+
+    # Reshape back to match original leading dims
+    scales = scales.view(*orig_leading_dims, -1, padded_cols)
+    xq = xq.view(*orig_leading_dims, -1, N // 2)
+    return scales, xq
```
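The `register_fake` half is what lets `torch.compile` and FakeTensor tracing see correct output shapes without launching the real kernel. A small sketch of exercising just the fake path (an illustrative usage, not from the commit; it assumes `FakeTensorMode` can fabricate CUDA tensors on the current build):

```python
import torch
from torch._subclasses.fake_tensor import FakeTensorMode
from torchao.prototype.mx_formats.kernels import mslk_quantize_nvfp4

# Under FakeTensorMode, the registered fake impl runs instead of the
# triton kernel, so we can inspect output shapes/dtypes with no GPU work.
with FakeTensorMode():
    x = torch.empty(256, 128, dtype=torch.bfloat16, device="cuda")
    global_scale = torch.empty((), dtype=torch.float32, device="cuda")
    scales, xq = mslk_quantize_nvfp4(x, global_scale)
    # 128 columns -> 8 scale groups of 16; swizzling pads rows to a
    # multiple of 128 and scale columns to a multiple of 4.
    print(scales.shape, scales.dtype)  # torch.Size([256, 8]) torch.float8_e4m3fn
    print(xq.shape, xq.dtype)          # torch.Size([256, 64]) torch.uint8 (2 fp4 per byte)
```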

torchao/prototype/mx_formats/nvfp4_tensor.py

Lines changed: 16 additions & 13 deletions
```diff
@@ -15,8 +15,8 @@
 from torchao.prototype.mx_formats.kernels import (
     f4_unpacked_to_f32,
     f32_to_f4_unpacked,
+    mslk_quantize_nvfp4,
     pack_uint4,
-    triton_quantize_nvfp4,
     unpack_uint4,
 )
 from torchao.prototype.mx_formats.mx_tensor import (
@@ -155,7 +155,10 @@ def to_nvfp4(
             assert K % 16 == 0, (
                 f"Triton kernel requires K (dim -1) to be divisible by 16, got {K}"
             )
-            blockwise_scales, data_lp = triton_quantize_nvfp4(data_hp, per_tensor_scale)
+            assert per_tensor_scale is not None, (
+                "Triton kernel requires per_tensor_scale"
+            )
+            blockwise_scales, data_lp = mslk_quantize_nvfp4(data_hp, per_tensor_scale)
         else:
             blockwise_scales, data_lp = nvfp4_quantize(
                 data_hp, block_size, per_tensor_scale
@@ -245,7 +248,7 @@ def get_hp_scales(self) -> torch.Tensor:
         return (
             scale_e4m3.to(self.orig_dtype)
             if self.per_tensor_scale is None
-            else self.per_tensor_scale * scale_e4m3.to(self.orig_dtype)
+            else scale_e4m3.to(self.orig_dtype) / self.per_tensor_scale
         )
 
     @classmethod
@@ -465,7 +468,7 @@ def _addmm_nvfp4_dispatch(
     # Merge double quant scales into 1 scale for Scale_In^D
     if a.per_tensor_scale is not None:
         assert b.per_tensor_scale is not None
-        scale_result = a.per_tensor_scale * b.per_tensor_scale
+        scale_result = 1.0 / (a.per_tensor_scale * b.per_tensor_scale)
     else:
         assert b.per_tensor_scale is None and a.per_tensor_scale is None
         scale_result = None
@@ -625,17 +628,17 @@ def nvfp4_addmm(func, types, args, kwargs):
 def per_tensor_amax_to_scale(amax: torch.Tensor) -> torch.Tensor:
     """Convert per-tensor amax to per-tensor scale for NVFP4 quantization.
 
-    Divides by both F8E4M3_MAX and F4_E2M1_MAX to ensure block scales can utilize
-    the full FP8 E4M3 range (up to 448) when block_max equals tensor_max.
-    Without F4_E2M1_MAX, the maximum scale would only reach FP8_MAX / FP4_MAX.
+    Returns the global scale in MSLK convention: (F8E4M3_MAX * F4_E2M1_MAX) / amax.
+    This ensures block scales can utilize the full FP8 E4M3 range (up to 448)
+    when block_max equals tensor_max.
 
     Args:
         amax: Per-tensor absolute maximum value from calibration
 
    Returns:
        torch.Tensor: Per-tensor scale for two-level NVFP4 scaling
    """
-    return amax.to(torch.float32) / (F8E4M3_MAX * F4_E2M1_MAX)
+    return (F8E4M3_MAX * F4_E2M1_MAX) / amax.to(torch.float32)
 
 
 def nvfp4_quantize(
@@ -694,15 +697,15 @@ def nvfp4_quantize(
         # we want the per_tensor_scale ~= amax of the block_scale_fp32
         block_scale_fp32 = block_scale.to(torch.float32)
         # Quantize the blockwise scales w/ the per_tensor_scale
-        scaled_block_scales = block_scale_fp32 / per_tensor_scale
+        scaled_block_scales = block_scale_fp32 * per_tensor_scale
         scaled_block_scales_fp8 = torch.clamp(
             scaled_block_scales, min=E4M3_EPS, max=F8E4M3_MAX
         ).to(torch.float8_e4m3fn)
         scaled_block_scales_fp32 = scaled_block_scales_fp8.to(torch.float32)
-        # We "temporarily" dequant the scaled_block_scales_fp32 to get the per_tensor_scale
-        # To apply to data
-        total_scale = per_tensor_scale * scaled_block_scales_fp32
-        data_scaled = data_hp / total_scale.unsqueeze(-1)
+        # Multiply by the reciprocal of the combined scale instead of dividing,
+        # to match the MSLK triton kernel numerics: x * (global_scale / fp8_scale)
+        reciprocal_scale = per_tensor_scale / scaled_block_scales_fp32
+        data_scaled = data_hp * reciprocal_scale.unsqueeze(-1)
         out_scales = scaled_block_scales_fp8
 
         data_scaled = torch.clamp(data_scaled, -F4_E2M1_MAX, F4_E2M1_MAX)
```
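A self-contained numeric check (stand-in values, not the library code) of why the reciprocal convention leaves the quantize/dequantize math unchanged:

```python
import torch

data_hp = torch.randn(4, 16)
s = torch.tensor(2.0)         # decoded fp8 block scale (scaled_block_scales_fp32)
pts_old = torch.tensor(0.25)  # old convention: amax / (448 * 6)
pts_new = 1.0 / pts_old       # new convention: (448 * 6) / amax

# quantize side: divide-by-total-scale == multiply-by-reciprocal
old = data_hp / (pts_old * s)
new = data_hp * (pts_new / s)
assert torch.allclose(old, new)

# dequantize side (get_hp_scales): pts_old * s == s / pts_new
assert torch.allclose(pts_old * s, s / pts_new)
```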
