
Commit e654d74

Add pre-quantized activation support to MXFP8 grouped GEMM (_to_mxfp8_then_scaled_grouped_mm) (#3961)
* moe: support MXTensor helper for prequantized mxfp8 grouped mm
  # Conflicts:
  #   torchao/prototype/moe_training/mxfp8_grouped_mm.py
* lint: apply ruff format in mxfp8 grouped mm test
* mx: derive helper dtype from qdata
1 parent 95d366c commit e654d74
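
As context for review, here is a condensed, self-contained sketch of the workflow this commit enables, distilled from the added tests and README example. The shapes, the hand-built uniform `offs` (the tests use a jagged helper instead), and the minimal argument list are illustrative, not prescriptive:

```python
import torch

from torchao.prototype.moe_training import _to_mxfp8_then_scaled_grouped_mm
from torchao.prototype.mx_formats.mx_tensor import MXTensor, to_mx

M, K, N, num_experts = 4096, 1024, 2048, 8
x = torch.randn(M, K, dtype=torch.bfloat16, device="cuda")
w_t = torch.randn(num_experts, N, K, dtype=torch.bfloat16, device="cuda").transpose(-2, -1)
# Uniform group-end offsets for illustration; each a multiple of the MX block size.
offs = torch.arange(M // num_experts, M + 1, M // num_experts, device="cuda", dtype=torch.int32)

# Quantize activations to MXFP8 once, up front (block_size is always 32 for MXFP8)...
x_scale, x_qdata = to_mx(x, elem_dtype=torch.float8_e4m3fn, block_size=32)
# ...then wrap the raw (qdata, scale) pair so the grouped GEMM can consume it directly.
x_mx = MXTensor.from_qdata_and_scales(x_qdata, x_scale, orig_dtype=x.dtype, block_size=32)

# MXTensor inputs currently require the `wgrad_with_hp` recipe (asserted in forward).
out = _to_mxfp8_then_scaled_grouped_mm(x_mx, w_t, offs, wgrad_with_hp=True)
```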

5 files changed: 350 additions & 9 deletions


test/prototype/moe_training/test_mxfp8_grouped_mm.py

Lines changed: 174 additions & 1 deletion
@@ -40,7 +40,7 @@
     _to_mxfp8_per_group_rowwise,
     generate_jagged_offs,
 )
-from torchao.prototype.mx_formats.mx_tensor import to_mx
+from torchao.prototype.mx_formats.mx_tensor import MXTensor, to_mx
 from torchao.quantization.quantize_.common import KernelPreference
 from torchao.testing.utils import skip_if_rocm

@@ -225,3 +225,176 @@ def test_mxfp8_grouped_gemm_with_dq_fwd_bwd(
     assert sqnr >= min_weight_grad_sqnr, (
         f"Weight grad sqnr {sqnr} is too low, must be >= {min_weight_grad_sqnr}"
     )
+
+
+@skip_if_rocm("ROCm not supported")
+def test_mxfp8_grouped_gemm_from_qdata_and_scales_matches_dynamic():
+    block_size = 32
+    M, K, N, num_experts = 4096, 1024, 2048, 8
+    x = torch.randn(M, K, dtype=torch.bfloat16, device="cuda", requires_grad=True)
+    w = torch.randn(
+        num_experts,
+        N,
+        K,
+        dtype=torch.bfloat16,
+        device="cuda",
+    )
+    w_t = w.transpose(-2, -1).requires_grad_(True)
+    offs = generate_jagged_offs(num_experts, M, multiple_of=block_size)
+
+    x_ref = x.detach().clone().requires_grad_(True)
+    w_t_ref = w_t.detach().clone().requires_grad_(True)
+
+    x_scale, x_qdata = to_mx(
+        x.detach(),
+        elem_dtype=torch.float8_e4m3fn,
+        block_size=block_size,
+        scaling_mode=ScaleCalculationMode.RCEIL,
+    )
+    x_mx = MXTensor.from_qdata_and_scales(
+        x_qdata,
+        x_scale,
+        orig_dtype=x.dtype,
+        block_size=block_size,
+        is_swizzled_scales=False,
+    )
+    out = _to_mxfp8_then_scaled_grouped_mm(
+        x_mx,
+        w_t,
+        offs=offs,
+        block_size=block_size,
+        out_dtype=torch.bfloat16,
+        kernel_preference=KernelPreference.EMULATED,
+        wgrad_with_hp=True,
+        scale_calculation_mode=ScaleCalculationMode.RCEIL,
+    )
+    out_ref = _to_mxfp8_then_scaled_grouped_mm(
+        x_ref,
+        w_t_ref,
+        offs=offs,
+        block_size=block_size,
+        out_dtype=torch.bfloat16,
+        kernel_preference=KernelPreference.EMULATED,
+        wgrad_with_hp=True,
+        scale_calculation_mode=ScaleCalculationMode.RCEIL,
+    )
+
+    output_sqnr = compute_error(out_ref, out)
+    min_output_sqnr = 60.0
+    assert output_sqnr >= min_output_sqnr, (
+        f"Output sqnr {output_sqnr} is too low, must be >= {min_output_sqnr}"
+    )
+
+    labels = torch.ones_like(out_ref)
+    F.mse_loss(out_ref, labels).backward()
+    F.mse_loss(out, labels).backward()
+
+    assert x.grad is None, (
+        "MXTensor inputs are not connected back to the source HP tensor"
+    )
+
+    weight_grad_sqnr = compute_error(w_t_ref.grad, w_t.grad)
+    # MXTensor inputs dequantize for the `wgrad_with_hp` path, so the weight
+    # gradient is expected to be close to, but not identical to, the HP path.
+    min_weight_grad_sqnr = 30.0
+    assert weight_grad_sqnr >= min_weight_grad_sqnr, (
+        f"Weight grad sqnr {weight_grad_sqnr} is too low, must be >= {min_weight_grad_sqnr}"
+    )
+
+
+@skip_if_rocm("ROCm not supported")
+def test_mxfp8_grouped_gemm_from_qdata_and_scales_forward():
+    block_size = 32
+    M, K, N, num_experts = 4096, 1024, 2048, 8
+    x = torch.randn(M, K, dtype=torch.bfloat16, device="cuda")
+    w = torch.randn(
+        num_experts,
+        N,
+        K,
+        dtype=torch.bfloat16,
+        device="cuda",
+    )
+    w_t = w.transpose(-2, -1)
+    offs = generate_jagged_offs(num_experts, M, multiple_of=block_size)
+
+    x_scale, x_qdata = to_mx(
+        x.detach(),
+        elem_dtype=torch.float8_e4m3fn,
+        block_size=block_size,
+        scaling_mode=ScaleCalculationMode.RCEIL,
+    )
+    x_mx = MXTensor.from_qdata_and_scales(
+        x_qdata,
+        x_scale,
+        orig_dtype=x.dtype,
+        block_size=block_size,
+        is_swizzled_scales=False,
+    )
+    out_mx = _to_mxfp8_then_scaled_grouped_mm(
+        x_mx,
+        w_t,
+        offs=offs,
+        block_size=block_size,
+        out_dtype=torch.bfloat16,
+        kernel_preference=KernelPreference.EMULATED,
+        wgrad_with_hp=True,
+        scale_calculation_mode=ScaleCalculationMode.RCEIL,
+    )
+    out_ref = _to_mxfp8_then_scaled_grouped_mm(
+        x,
+        w_t,
+        offs=offs,
+        block_size=block_size,
+        out_dtype=torch.bfloat16,
+        kernel_preference=KernelPreference.EMULATED,
+        wgrad_with_hp=True,
+        scale_calculation_mode=ScaleCalculationMode.RCEIL,
+    )
+
+    output_sqnr = compute_error(out_ref, out_mx)
+    min_output_sqnr = 60.0
+    assert output_sqnr >= min_output_sqnr, (
+        f"Output sqnr {output_sqnr} is too low, must be >= {min_output_sqnr}"
+    )
+
+
+@skip_if_rocm("ROCm not supported")
+def test_mxfp8_grouped_gemm_mxtensor_requires_wgrad_with_hp():
+    block_size = 32
+    M, K, N, num_experts = 1024, 1024, 2048, 4
+    x = torch.randn(M, K, dtype=torch.bfloat16, device="cuda")
+    w = torch.randn(
+        num_experts,
+        N,
+        K,
+        dtype=torch.bfloat16,
+        device="cuda",
+    )
+    w_t = w.transpose(-2, -1)
+    offs = generate_jagged_offs(num_experts, M, multiple_of=block_size)
+
+    x_scale, x_qdata = to_mx(
+        x,
+        elem_dtype=torch.float8_e4m3fn,
+        block_size=block_size,
+        scaling_mode=ScaleCalculationMode.RCEIL,
+    )
+    x_mx = MXTensor.from_qdata_and_scales(
+        x_qdata,
+        x_scale,
+        orig_dtype=x.dtype,
+        block_size=block_size,
+        is_swizzled_scales=False,
+    )
+
+    with pytest.raises(AssertionError, match="wgrad_with_hp"):
+        _to_mxfp8_then_scaled_grouped_mm(
+            x_mx,
+            w_t,
+            offs=offs,
+            block_size=block_size,
+            out_dtype=torch.bfloat16,
+            kernel_preference=KernelPreference.EMULATED,
+            wgrad_with_hp=False,
+            scale_calculation_mode=ScaleCalculationMode.RCEIL,
+        )
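
Note on the assertions above: `compute_error` reports SQNR in dB, so the 60.0/30.0 thresholds bound how far the quantized results may drift from the reference. A self-contained equivalent matching my understanding of torchao's helper (an assumption, shown only to make the thresholds concrete):

```python
import torch

def sqnr_db(ref: torch.Tensor, test: torch.Tensor) -> torch.Tensor:
    # Signal-to-quantization-noise ratio in dB: 20 * log10(||ref|| / ||ref - test||).
    # Higher is better; 60 dB means the error norm is ~1000x smaller than the signal norm.
    return 20 * torch.log10(torch.linalg.norm(ref) / torch.linalg.norm(ref - test))
```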

test/prototype/mx_formats/test_mx_tensor.py

Lines changed: 60 additions & 0 deletions
@@ -412,6 +412,66 @@ def test_block_sizes(elem_dtype, B):
     _test_mx(tensor_hp, elem_dtype, B)


+@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+def test_from_qdata_and_scales_round_trip():
+    tensor_hp = torch.randn(128, 128, device="cuda", dtype=torch.bfloat16)
+    tensor_mx = MXTensor.to_mx(
+        tensor_hp,
+        torch.float8_e4m3fn,
+        32,
+        ScaleCalculationMode.RCEIL,
+    )
+    rebuilt = MXTensor.from_qdata_and_scales(
+        tensor_mx.qdata,
+        tensor_mx.scale,
+        orig_dtype=tensor_hp.dtype,
+        block_size=32,
+    )
+    torch.testing.assert_close(
+        rebuilt.dequantize(torch.float32),
+        tensor_mx.dequantize(torch.float32),
+    )
+    assert rebuilt.elem_dtype == tensor_mx.elem_dtype
+    assert rebuilt.block_size == tensor_mx.block_size
+    assert rebuilt.orig_dtype == tensor_mx.orig_dtype
+
+
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+def test_from_qdata_and_scales_requires_float8_e8m0_scale_dtype():
+    tensor_hp = torch.randn(128, 128, device="cuda", dtype=torch.bfloat16)
+    tensor_mx = MXTensor.to_mx(
+        tensor_hp,
+        torch.float8_e4m3fn,
+        32,
+        ScaleCalculationMode.RCEIL,
+    )
+    with pytest.raises(AssertionError, match="scale.dtype"):
+        MXTensor.from_qdata_and_scales(
+            tensor_mx.qdata,
+            tensor_mx.scale.view(torch.uint8),
+            orig_dtype=tensor_hp.dtype,
+            block_size=32,
+        )
+
+
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+def test_from_qdata_and_scales_rejects_packed_uint8_qdata():
+    tensor_hp = torch.randn(128, 128, device="cuda", dtype=torch.bfloat16)
+    tensor_mx = MXTensor.to_mx(
+        tensor_hp,
+        torch.float8_e4m3fn,
+        32,
+        ScaleCalculationMode.RCEIL,
+    )
+    with pytest.raises(AssertionError, match="typed MX qdata"):
+        MXTensor.from_qdata_and_scales(
+            torch.zeros_like(tensor_mx.qdata, dtype=torch.uint8),
+            tensor_mx.scale,
+            orig_dtype=tensor_hp.dtype,
+            block_size=32,
+        )
+
+
 @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
 @pytest.mark.parametrize("elem_dtype", SUPPORTED_ELEM_DTYPES)
 def test_transpose(elem_dtype):
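
Background for the `scale.dtype` test: MX block scales are E8M0 values, i.e. pure powers of two with an 8-bit exponent and no mantissa, so viewing them as `uint8` loses the dtype tag the constructor checks for. A hedged sketch of the decoding, assuming the standard E8M0 bias of 127:

```python
import torch

def decode_e8m0(scale_bits: torch.Tensor) -> torch.Tensor:
    # Hypothetical helper for illustration only; production code keeps scales in
    # torch.float8_e8m0fnu, which is exactly what from_qdata_and_scales asserts.
    assert scale_bits.dtype == torch.uint8
    return torch.exp2(scale_bits.to(torch.float32) - 127.0)

print(decode_e8m0(torch.tensor([126, 127, 128], dtype=torch.uint8)))  # tensor([0.5, 1., 2.])
```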

torchao/prototype/moe_training/README.md

Lines changed: 5 additions & 0 deletions
@@ -63,6 +63,7 @@ Training and model configurations for this run:
 ```python
 import torch
 from torch.nn import functional as F
+from torchao.prototype.mx_formats.mx_tensor import MXTensor, to_mx
 from torchao.prototype.moe_training import (
     _to_mxfp8_then_scaled_grouped_mm,
 )
@@ -83,6 +84,10 @@ out = _to_mxfp8_then_scaled_grouped_mm(
     B.transpose(-2, -1),
     offs,
 )
+# Optional: if you already have raw MXFP8 qdata/scales, wrap them as an MXTensor:
+# A_scale, A_qdata = to_mx(A, elem_dtype=torch.float8_e4m3fn, block_size=32)
+# A_mx = MXTensor.from_qdata_and_scales(A_qdata, A_scale, orig_dtype=A.dtype)
+# out = _to_mxfp8_then_scaled_grouped_mm(A_mx, B.transpose(-2, -1), offs, wgrad_with_hp=True)

 # (Fake labels for demonstration purposes)
 labels = torch.ones_like(out)
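
One layout detail worth restating next to this example: the grouped GEMM docstring requires `B_t` in "per group column-major memory" layout, which is exactly what transposing a contiguous weight tensor produces. A quick sanity check, with illustrative shapes:

```python
import torch

G, K, N = 8, 1024, 2048
B = torch.randn(G, N, K, dtype=torch.bfloat16)  # contiguous (G, N, K) weights
B_t = B.transpose(-2, -1)                       # view with shape (G, K, N), no copy
assert B_t.shape == (G, K, N)
# Per-group column-major: within each group, consecutive elements along the
# K axis are contiguous in memory (stride 1 on dim 1).
assert B_t.stride(1) == 1
```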

torchao/prototype/moe_training/mxfp8_grouped_mm.py

Lines changed: 49 additions & 8 deletions
@@ -49,6 +49,33 @@
 )


+def _validate_grouped_mm_input_act(
+    input_act: torch.Tensor,
+    block_size: int,
+) -> None:
+    if not isinstance(input_act, MXTensor):
+        return
+
+    assert input_act.elem_dtype == torch.float8_e4m3fn, (
+        f"Expected MXTensor with elem_dtype float8_e4m3fn, but got {input_act.elem_dtype}"
+    )
+    assert input_act.block_size == block_size, (
+        f"Expected MXTensor block_size={block_size}, but got {input_act.block_size}"
+    )
+    assert not input_act.is_swizzled_scales, (
+        "MXTensor input scales must be unswizzled for grouped GEMM"
+    )
+    assert input_act.qdata.ndim == 2, "MXTensor input_act data must be 2D"
+    assert input_act.scale.ndim == 2, "MXTensor input_act scale must be 2D"
+    assert input_act.scale.shape == (
+        input_act.shape[0],
+        input_act.shape[1] // block_size,
+    ), (
+        "MXTensor input scales must be rowwise with shape "
+        f"({input_act.shape[0]}, {input_act.shape[1] // block_size})"
+    )
+
+
 # Aliases for convenience/clarity
 # @conditional_nostrict_trace
 def _to_mxfp8_then_scaled_grouped_mm(
@@ -66,9 +93,11 @@ def _to_mxfp8_then_scaled_grouped_mm(
     Differentiable mxfp8 grouped gemm with dynamic mxfp8 quantization.

     Args:
-        A (bf16/float32 torch.Tensor): The first high-precision input tensor,
-            which must be a 2D tensor of shape (M * num_groups, K)
-            and in row-major memory layout.
+        A (torch.Tensor): Input activations. May be a high-precision 2D tensor of
+            shape (M * num_groups, K) in row-major memory layout, or an `MXTensor`
+            carrying pre-quantized MXFP8 activations. If you already have raw
+            `(qdata, scale)` tensors, wrap them first with
+            `MXTensor.from_qdata_and_scales(...)`.
         B_t (bf16/float32 torch.Tensor): The second high-precision input tensor
             which must be 3D, which must be shape (G, K, N)
             and in "per group column-major memory" layout (i.e., strides of (N*K, 1, N)).
@@ -85,6 +114,7 @@ def _to_mxfp8_then_scaled_grouped_mm(
     """
     # block_size is always 32 for MXFP8
     block_size = 32
+    _validate_grouped_mm_input_act(A, block_size)
     return _MXFP8GroupedMM.apply(
         A,
         B_t,
@@ -103,8 +133,8 @@ class _MXFP8GroupedMM(torch.autograd.Function):
     Differentiable implementation of grouped GEMM with dynamic MXFP8 quantization.

     This autograd function performs grouped matrix multiplication with MXFP8 quantization
-    for efficient MoE training. It supports both pre-quantized (MXTensor) and high-precision
-    inputs, with configurable quantization and layout conversion options.
+    for efficient MoE training. It supports both pre-quantized (`MXTensor`) and
+    high-precision inputs, with configurable quantization and layout conversion options.
     """

     @staticmethod
@@ -161,7 +191,9 @@ def forward(
         ), "out_dtype must be bfloat16 or float32"
         if isinstance(input_act, MXTensor):
             assert wgrad_with_hp, (
-                "only `wgrad_with_hp` recipe is supported for pre-quantized inputs, support for other recipes is still in progress"
+                "only `wgrad_with_hp` recipe is supported for MXTensor inputs because "
+                "backward needs the high-precision activations to quantize along dim1 "
+                "for weight gradients"
             )

         # Save original group_end_offsets and num_tokens before padding
@@ -347,8 +379,17 @@ def backward(ctx, grad_output: torch.Tensor):
             wgrad_with_hp,
             kernel_preference,
         )
-
-        return grad_input, grad_weight_t, None, None, None, None, None, None, None
+        return (
+            grad_input,
+            grad_weight_t,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+        )


 def _compute_dgrad(
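
To make the new `_validate_grouped_mm_input_act` contract concrete, here is a minimal standalone illustration of the invariants it asserts for MXTensor activations (shapes are hypothetical):

```python
import torch

M, K, block_size = 4096, 1024, 32
# Typed MX qdata (float8_e4m3fn), never packed uint8 bytes...
qdata = torch.zeros(M, K, dtype=torch.float8_e4m3fn)
# ...and one rowwise, unswizzled E8M0 scale per 32-element block along K.
scale = torch.zeros(M, K // block_size, dtype=torch.float8_e8m0fnu)

assert qdata.ndim == 2 and scale.ndim == 2
assert scale.shape == (qdata.shape[0], qdata.shape[1] // block_size)
```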
