Commit a302c10

[nvfp4] Make per_tensor_scale optional for triton kernel path (#4188)
* [nvfp4] Make per_tensor_scale optional for triton kernel path

Summary: MSLK now supports an optional global scale in its triton quantize kernel (MSLK#233, commit c01f06c). This change relaxes the corresponding constraint in torchao so the triton kernel path can be used without a per_tensor_scale (single-level block-wise scaling only).

Changes:
- Remove `assert per_tensor_scale is not None` from the `to_nvfp4` triton branch
- Update `mslk_quantize_nvfp4` and its custom op to accept `Optional[torch.Tensor]`, passing `None` through to MSLK (which treats it as global_scale=1.0)
- Relax `_addmm_nvfp4_dispatch` to allow mixed per_tensor_scale states between operands (treating None as 1.0) instead of asserting both-or-neither

Test Plan: requires an SM100+ GPU with the MSLK nightly installed.

```
python -m pytest test/prototype/mx_formats/test_nvfp4_tensor.py::test_triton_nvfp4_quantize_equivalence -v
python -m pytest test/prototype/mx_formats/test_nvfp4_tensor.py::test_nvfp4_matmul_optional_per_tensor_scale -v
python -m pytest test/prototype/mx_formats/test_inference_workflow.py::test_inference_workflow_nvfp4 -k "test_inference_workflow_nvfp4" -v
```

Performance, with global scale:

```
python benchmarks/float8/float8_inference_roofline.py --recipe_name nvfp4 --enable_fusion_modeling True --skip_printing_detailed_metrics True

Parameter               Value
----------------------  ------------------------
GPU                     NVIDIA GB200
torch version           2.12.0.dev20260316+cu128
torchao version         0.17.0+git95281b63b
recipe_name             nvfp4
do_benchmarks           True
shape_gen_name          pow2
enable_fusion_modeling  True
op_name                 linear
MKN                     None None None
DHW                     None None None
kernel_size
stride                  1
padding                 0

bf16_gemm_time_sympy  Max(2.0e-6, 1.13960113960114e-15*K*M*N, 2.71739130434783e-13*K*M + 2.71739130434783e-13*K*N + 2.71739130434783e-13*M*N)
bf16_ovhd_time_sympy  Max(2.0e-6, 5.43478260869565e-13*K*M)
fp8_gemm_time_sympy   Max(2.0e-6, 2.84900284900285e-16*K*M*N, 6.79347826086956e-14*K*M + 6.79347826086956e-14*K*N + 2.71739130434783e-13*M*N + 6.79347826086956e-14*floor(K*M/16 + K*N/16))
fp8_ovhd_time_sympy   Max(2.0e-6, 6.11413043478261e-13*K*M + 1.35869565217391e-13*M*floor(K/16))

   fwd_M  fwd_K  fwd_N  r_fp8_gemm_and_ovhd_spdp  b_fp8_e2e_spdp
0   1024   1024   1024                      1.00            0.45
1   2048   2048   2048                      2.39            0.66
2   4096   4096   4096                      2.92            1.29
3   8192   8192   8192                      3.34            1.74
4  16384  16384  16384                      3.63            2.84
```

Performance, without global scale:

```
python benchmarks/float8/float8_inference_roofline.py --recipe_name nvfp4_no_global_scale --enable_fusion_modeling True --skip_printing_detailed_metrics True

Parameter               Value
----------------------  ------------------------
GPU                     NVIDIA GB200
torch version           2.12.0.dev20260316+cu128
torchao version         0.17.0+gitabb103d3b
recipe_name             nvfp4_no_global_scale
do_benchmarks           True
shape_gen_name          pow2
enable_fusion_modeling  True
op_name                 linear
MKN                     None None None
DHW                     None None None
kernel_size
stride                  1
padding                 0

bf16_gemm_time_sympy  Max(2.0e-6, 1.13960113960114e-15*K*M*N, 2.71739130434783e-13*K*M + 2.71739130434783e-13*K*N + 2.71739130434783e-13*M*N)
bf16_ovhd_time_sympy  Max(2.0e-6, 5.43478260869565e-13*K*M)
fp8_gemm_time_sympy   Max(2.0e-6, 2.84900284900285e-16*K*M*N, 6.79347826086956e-14*K*M + 6.79347826086956e-14*K*N + 2.71739130434783e-13*M*N + 6.79347826086956e-14*floor(K*M/16 + K*N/16))
fp8_ovhd_time_sympy   Max(2.0e-6, 3.39673913043478e-13*K*M + 1.35869565217391e-13*M*floor(K/16))

   fwd_M  fwd_K  fwd_N  r_fp8_gemm_and_ovhd_spdp  b_fp8_e2e_spdp
0   1024   1024   1024                      1.00            0.73
1   2048   2048   2048                      2.71            1.09
2   4096   4096   4096                      3.44            2.22
3   8192   8192   8192                      3.68            2.82
4  16384  16384  16384                      3.83            3.65
```

[ghstack-poisoned]
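For illustration, a minimal sketch of the call pattern this commit enables, mirroring the new test added below; it assumes an SM100+ GPU and an MSLK build with optional global-scale support (>= 1.1.0):

```python
# Minimal sketch (mirrors test_nvfp4_matmul_optional_per_tensor_scale below):
# quantize on the triton path without a per-tensor scale, so only the
# single-level 16-element block-wise scaling is applied.
import torch

from torchao.prototype.mx_formats.nvfp4_tensor import NVFP4Tensor

x = torch.randn(128, 64, dtype=torch.bfloat16, device="cuda")

x_nvfp4 = NVFP4Tensor.to_nvfp4(
    x,
    per_tensor_scale=None,  # previously asserted non-None on the triton path
    is_swizzled_scales=True,
    use_triton_kernel=True,
)

x_dq = x_nvfp4.dequantize(torch.bfloat16)  # round-trip back to bf16
```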
1 parent b1ddd15 commit a302c10

7 files changed

Lines changed: 107 additions & 24 deletions

benchmarks/float8/float8_inference_roofline.py

Lines changed: 12 additions & 3 deletions
```diff
@@ -112,7 +112,12 @@ def get_gemm_times(

     bf16_time_s = get_gpu_kernel_gemm_time_s(torch.mm, x_bf16, w_bf16)

-    if recipe_name in ("mxfp4_cutlass", "nvfp4", "nvfp4_static"):
+    if recipe_name in (
+        "mxfp4_cutlass",
+        "nvfp4",
+        "nvfp4_static",
+        "nvfp4_no_global_scale",
+    ):
         d1, d2, d3 = torch.float4_e2m1fn_x2, torch.float4_e2m1fn_x2, torch.bfloat16
         A = torch.randint(0, 255, (M, K // 2), device=device, dtype=torch.uint8).view(
             d1
@@ -151,7 +156,7 @@ def get_gemm_times(
         scale_b = torch.ones(N, K // 32, device=device, dtype=torch.float8_e8m0fnu)
         scale_a = to_blocked(scale_a)
         scale_b = to_blocked(scale_b)
-    elif recipe_name in ("nvfp4", "nvfp4_static"):
+    elif recipe_name in ("nvfp4", "nvfp4_static", "nvfp4_no_global_scale"):
         scale_a = torch.ones(M, K // 16, device=device, dtype=torch.float8_e4m3fn)
         scale_b = torch.ones(N, K // 16, device=device, dtype=torch.float8_e4m3fn)
         scale_a = to_blocked(scale_a)
@@ -177,7 +182,7 @@ def do_matmul(A, B):
             swizzle_b=SwizzleType.SWIZZLE_32_4_4,
             output_dtype=d3,
         )
-        if recipe_name in ("nvfp4", "nvfp4_static"):
+        if recipe_name in ("nvfp4", "nvfp4_static", "nvfp4_no_global_scale"):
             return torch._scaled_mm(
                 A, B, scale_a, scale_b, out_dtype=d3, use_fast_accum=False
             )
@@ -797,6 +802,10 @@ def run(
         config = NVFP4DynamicActivationNVFP4WeightConfig(
             use_dynamic_per_tensor_scale=True,
         )
+    elif recipe_name == "nvfp4_no_global_scale":
+        config = NVFP4DynamicActivationNVFP4WeightConfig(
+            use_dynamic_per_tensor_scale=False,
+        )
     elif recipe_name == "nvfp4_static":
         config_calib = NVFP4DynamicActivationNVFP4WeightConfig(
             step="prepare",
```
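Outside the benchmark script, the new recipe name corresponds to the config constructed above. A hedged sketch of applying it to a model follows; the import path for the prototype config is an assumption and may differ across torchao versions:

```python
# Sketch only: apply the equivalent of the "nvfp4_no_global_scale" recipe.
# The config's import path is an assumption, not taken from this commit.
import torch
import torch.nn as nn

from torchao.quantization import quantize_
from torchao.prototype.mx_formats import NVFP4DynamicActivationNVFP4WeightConfig

model = nn.Sequential(nn.Linear(4096, 4096, bias=False)).cuda().to(torch.bfloat16)

# use_dynamic_per_tensor_scale=False leaves per_tensor_scale as None, i.e.
# block-wise scaling only, which is what this commit makes work on triton.
quantize_(model, NVFP4DynamicActivationNVFP4WeightConfig(use_dynamic_per_tensor_scale=False))

with torch.no_grad():
    y = model(torch.randn(16, 4096, device="cuda", dtype=torch.bfloat16))
```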

test/prototype/mx_formats/test_inference_workflow.py

Lines changed: 0 additions & 2 deletions
```diff
@@ -181,8 +181,6 @@ def test_inference_workflow_nvfp4(
         pytest.skip("TODO: weight_only quant currently errors w/ compile")
     if quant_type == "weight_only" and use_triton_kernel:
         pytest.skip("unsupported configuration")
-    if use_triton_kernel and not use_dynamic_per_tensor_scale:
-        pytest.skip("unsupported configuration")

     if use_inference_mode and (
         shapes != (128, 64, 256) or inpt_dtype != torch.bfloat16 or use_triton_kernel
```

test/prototype/mx_formats/test_nvfp4_tensor.py

Lines changed: 58 additions & 3 deletions
```diff
@@ -369,9 +369,6 @@ def test_nvfp4_swizzled_scales_get_scales_method():
 @torch.no_grad()
 def test_triton_nvfp4_quantize_equivalence(M, N, use_per_tensor_scale, dtype):
     """Test that Triton and PyTorch NVFP4 quantization produce equivalent results."""
-    if not use_per_tensor_scale:
-        pytest.skip("MSLK triton kernel requires per_tensor_scale")
-
     torch.manual_seed(42)
     x = torch.randn(M, N, dtype=dtype, device="cuda")

@@ -657,3 +654,61 @@ def test_nvfp4_pin_memory(use_per_tensor_scale):
     assert torch.equal(
         x_cpu.dequantize(torch.float32), x_pinned.dequantize(torch.float32)
     )
+
+
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+@pytest.mark.skipif(
+    not is_sm_at_least_100(), reason="requires sm100+ for nvfp4 triton kernel"
+)
+@pytest.mark.parametrize(
+    "shapes",
+    [
+        (128, 64, 256),
+        (256, 128, 512),
+    ],
+    ids=lambda s: f"{s[0]}x{s[1]}x{s[2]}",
+)
+@pytest.mark.parametrize(
+    "a_has_scale",
+    [True, False],
+    ids=["a_scale", "no_a_scale"],
+)
+@pytest.mark.parametrize("use_triton_kernel", [True, False])
+@torch.no_grad()
+@skip_if_rocm("ROCm float4 gemm require gfx950")
+def test_nvfp4_matmul_optional_per_tensor_scale(shapes, a_has_scale, use_triton_kernel):
+    """Test NVFP4 matmul works when per_tensor_scale is None for activation but always set for weight."""
+    m, k, n = shapes
+
+    A = torch.randn(m, k, dtype=torch.bfloat16, device="cuda")
+    B = torch.randn(n, k, dtype=torch.bfloat16, device="cuda")
+
+    C_ref = F.linear(A, B)
+
+    a_scale = (
+        per_tensor_amax_to_scale(torch.amax(torch.abs(A))) if a_has_scale else None
+    )
+    b_scale = per_tensor_amax_to_scale(torch.amax(torch.abs(B)))
+
+    act_quant_kwargs = QuantizeTensorToNVFP4Kwargs()
+
+    A_nvfp4 = NVFP4Tensor.to_nvfp4(
+        A,
+        per_tensor_scale=a_scale,
+        is_swizzled_scales=True,
+        use_triton_kernel=use_triton_kernel,
+    )
+    B_nvfp4 = NVFP4Tensor.to_nvfp4(
+        B,
+        per_tensor_scale=b_scale,
+        is_swizzled_scales=True,
+        use_triton_kernel=use_triton_kernel,
+        act_quant_kwargs=act_quant_kwargs,
+    )
+
+    C_nvfp4 = F.linear(A_nvfp4, B_nvfp4)
+    assert C_nvfp4.dtype == torch.bfloat16
+
+    sqnr = compute_error(C_ref, C_nvfp4)
+    SQNR_THRESHOLD = 16.0
+    assert sqnr >= SQNR_THRESHOLD, f"SQNR {sqnr:.2f} < {SQNR_THRESHOLD}, {a_has_scale=}"
```

torchao/prototype/mx_formats/kernels.py

Lines changed: 18 additions & 7 deletions
```diff
@@ -6,7 +6,7 @@

 import importlib
 import logging
-from typing import Tuple
+from typing import Optional, Tuple

 import numpy as np
 import torch
@@ -22,6 +22,7 @@
 from torchao.utils import (
     is_cuda_version_at_least,
     is_MI350,
+    is_mslk_version_at_least,
     is_ROCM,
     is_sm_at_least_100,
     torch_version_at_least,
@@ -1175,30 +1176,34 @@ def mxfp8_quantize_cuda(


 def mslk_quantize_nvfp4(
-    x: torch.Tensor, per_tensor_scale: torch.Tensor
+    x: torch.Tensor, per_tensor_scale: Optional[torch.Tensor] = None
 ) -> Tuple[torch.Tensor, torch.Tensor]:
     """Quantize a tensor to NVFP4 using the MSLK triton kernel.

     Args:
         x: Input tensor to quantize.
-        per_tensor_scale: Per-tensor scale (TorchAO convention: amax / (F8E4M3_MAX * F4_E2M1_MAX)).
+        per_tensor_scale: Optional per-tensor scale (TorchAO convention: amax / (F8E4M3_MAX * F4_E2M1_MAX)).
+            If None, the global scale is not applied (single-level block-wise scaling only).

     Returns:
         Tuple of (blockwise_scales, quantized_data_uint8) matching TorchAO's convention.
     """
-    mslk_global_scale = per_tensor_scale.reciprocal()
+    mslk_global_scale = (
+        per_tensor_scale.reciprocal() if per_tensor_scale is not None else None
+    )
     return _mslk_quantize_nvfp4_custom_op(x, mslk_global_scale)


 @torch.library.custom_op("ao::mslk_quantize_nvfp4", mutates_args=())
 def _mslk_quantize_nvfp4_custom_op(
-    x: torch.Tensor, global_scale: torch.Tensor
+    x: torch.Tensor, global_scale: Optional[torch.Tensor] = None
 ) -> Tuple[torch.Tensor, torch.Tensor]:
     """Inner custom op for MSLK NVFP4 quantization.

     Args:
         x: Input tensor to quantize.
-        global_scale: Global scale in MSLK convention (1.0 / per_tensor_scale).
+        global_scale: Optional global scale in MSLK convention (1.0 / per_tensor_scale).
+            If None, the global scale is not applied (treated as 1.0).

     Returns:
         Tuple of (blockwise_scales, quantized_data_uint8) matching TorchAO's convention.
@@ -1211,12 +1216,18 @@ def _mslk_quantize_nvfp4_custom_op(
         triton_quantize_nvfp4 as _mslk_triton_quantize_nvfp4,
     )

+    if global_scale is None:
+        assert is_mslk_version_at_least("1.1.0"), (
+            "Optional global_scale support requires MSLK >= 1.1.0, "
+            "Please upgrade MSLK: https://github.com/pytorch/MSLK"
+        )
+
     data_lp, blockwise_scales = _mslk_triton_quantize_nvfp4(x, global_scale)
     return blockwise_scales, data_lp.view(torch.uint8)


 @_mslk_quantize_nvfp4_custom_op.register_fake
-def _(x, global_scale):
+def _(x, global_scale=None):
     # Mirror the reshape logic from the real MSLK kernel
     orig_leading_dims, orig_N = x.shape[:-2], x.shape[-1]
     x_2d = x.reshape(-1, orig_N)
```
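For context, the two scale conventions the wrapper translates between can be sketched as follows. The constants F8E4M3_MAX = 448.0 and F4_E2M1_MAX = 6.0 are the dtype maxima named in the docstring above; only the relationship, not the helper names, is taken from the code:

```python
# Relationship between the TorchAO and MSLK scale conventions.
import torch

F8E4M3_MAX = 448.0  # largest normal value of torch.float8_e4m3fn
F4_E2M1_MAX = 6.0   # largest representable value of fp4 e2m1

x = torch.randn(128, 64, dtype=torch.bfloat16, device="cuda")
amax = torch.amax(torch.abs(x)).to(torch.float32)

per_tensor_scale = amax / (F8E4M3_MAX * F4_E2M1_MAX)  # TorchAO convention
global_scale = per_tensor_scale.reciprocal()          # MSLK convention

# With per_tensor_scale=None, the wrapper now forwards global_scale=None,
# which MSLK >= 1.1.0 treats as global_scale=1.0.
```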

torchao/prototype/mx_formats/nvfp4_tensor.py

Lines changed: 10 additions & 8 deletions
```diff
@@ -155,9 +155,6 @@ def to_nvfp4(
             assert K % 16 == 0, (
                 f"Triton kernel requires K (dim -1) to be divisible by 16, got {K}"
             )
-            assert per_tensor_scale is not None, (
-                "Triton kernel requires per_tensor_scale"
-            )
             blockwise_scales, data_lp = mslk_quantize_nvfp4(data_hp, per_tensor_scale)
         else:
             blockwise_scales, data_lp = nvfp4_quantize(
@@ -497,11 +494,14 @@ def _addmm_nvfp4_dispatch(
     b_scale_blocked = to_blocked(b_scale)

     # Merge double quant scales into 1 scale for Scale_In^D
-    if a.per_tensor_scale is not None:
-        assert b.per_tensor_scale is not None
-        scale_result = a.per_tensor_scale * b.per_tensor_scale
+    # When per_tensor_scale is None for an operand, it's treated as 1.0
+    a_scale = a.per_tensor_scale
+    b_scale = b.per_tensor_scale
+    if a_scale is not None and b_scale is not None:
+        scale_result = a_scale * b_scale
+    elif a_scale is not None or b_scale is not None:
+        scale_result = a_scale if a_scale is not None else b_scale
     else:
-        assert b.per_tensor_scale is None and a.per_tensor_scale is None
         scale_result = None

     # THIS IS A WORKAROUND FOR TWO ERRORS:
@@ -720,7 +720,9 @@ def nvfp4_quantize(
             torch.float8_e4m3fn
         )
         block_scale_fp32 = block_scale_fp8.to(torch.float32)
-        data_scaled = data_hp / block_scale_fp32.unsqueeze(-1)
+        # Multiply by reciprocal instead of dividing to match MSLK triton kernel
+        # numerics (global_scale=None treated as 1.0): x * (1.0 / fp8_scale)
+        data_scaled = data_hp * (1.0 / block_scale_fp32).unsqueeze(-1)
         out_scales = block_scale_fp8
     else:
         # We are doing two level scaling,
```
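The divide-to-reciprocal change above is not a numerical no-op: `x / s` rounds once, while `x * (1.0 / s)` rounds the reciprocal and then the product, so the two forms can disagree in the last ulp. An illustrative check (not part of the commit):

```python
# Illustrative only: count elements where division and reciprocal-multiply
# round differently in float32.
import torch

torch.manual_seed(0)
x = torch.randn(1 << 16, dtype=torch.float32)
s = torch.rand(1 << 16, dtype=torch.float32) + 0.5  # avoid tiny divisors

div = x / s
recip_mul = x * (1.0 / s)

mismatches = (div != recip_mul).sum().item()
print(f"{mismatches} / {x.numel()} elements differ by at least one ulp")
```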

torchao/testing/training/roofline_utils.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -519,7 +519,7 @@ def get_inference_tensor_memory_traffic_ovhd_s(
             )
             res_bytes = [kernel_1_rw + kernel_3_rw]

-        case "nvfp4_static":
+        case "nvfp4_static" | "nvfp4_no_global_scale":
             # nvfp4 with static global scaling
             # x_b16 = ...
             # static_max_abs = ...
```

torchao/utils.py

Lines changed: 8 additions & 0 deletions
```diff
@@ -1277,6 +1277,14 @@ def _is_mslk_available():
     return True


+def is_mslk_version_at_least(min_version: str) -> bool:
+    if not _is_mslk_available():
+        return False
+    import mslk
+
+    return parse_version(mslk.__version__) >= parse_version(min_version)
+
+
 def _is_flashinfer_available():
     return (
         # flashinfer-python
```
