Commit 4e18d87

rocm: scaled_grouped_mm support gfx942 fp8 data type (#3955)
1 parent 8d65522 commit 4e18d87

8 files changed: 82 additions & 44 deletions
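
Background for this change (a property of the hardware, not stated in the commit message): gfx942 (the MI300 series) implements the "fnuz" float8 variants rather than the OCP e4m3fn/e5m2 formats used on NVIDIA SM90. The fnuz formats have no inf, a single NaN encoding, no negative zero, and a shifted exponent bias, which halves the maximum representable value. A minimal sketch comparing the two e4m3 variants:

import torch

# e4m3fn (OCP, used on H100/SM90) vs e4m3fnuz (used on ROCm gfx942).
for dtype in (torch.float8_e4m3fn, torch.float8_e4m3fnuz):
    info = torch.finfo(dtype)
    print(f"{dtype}: max={info.max}")
# torch.float8_e4m3fn   -> max=448.0
# torch.float8_e4m3fnuz -> max=240.0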

benchmarks/prototype/moe_training/benchmark_scaled_grouped_mm_dq.py

Lines changed: 14 additions & 8 deletions
@@ -27,6 +27,7 @@
     MXFP8GroupedMMRecipe,
 )
 from torchao.prototype.moe_training.utils import generate_jagged_offs
+from torchao.utils import is_MI300, is_MI350, is_ROCM
 
 device = torch.device("cuda")
 
@@ -260,14 +261,19 @@ def main(args: argparse.Namespace):
     configs = get_configs()
     results = []
     for config in tqdm(configs):
-        if (
-            config.recipe == FP8GroupedMMRecipe.FP8_ROWWISE
-            and torch.cuda.get_device_capability() != (9, 0)
-        ):
-            logging.warning(
-                f"Skipping FP8 rowwise benchmarks, only supported on compute capability 9.0 and found {torch.cuda.get_device_capability()}"
-            )
-            continue
+        if config.recipe == FP8GroupedMMRecipe.FP8_ROWWISE:
+            if is_ROCM():
+                if not (is_MI300() or is_MI350()):
+                    logging.warning(
+                        "Skipping FP8 rowwise benchmarks, requires MI300 or MI350 on ROCm"
+                    )
+                    continue
+            else:
+                if torch.cuda.get_device_capability() != (9, 0):
+                    logging.warning(
+                        f"Skipping FP8 rowwise benchmarks, only supported on compute capability 9.0 and found {torch.cuda.get_device_capability()}"
+                    )
+                    continue
 
         elif config.recipe in (
             MXFP8GroupedMMRecipe.MXFP8_RCEIL,

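The same ROCm-vs-CUDA gate recurs in the two test files below. A consolidated check could look like the following sketch; fp8_rowwise_supported is a hypothetical helper, not part of this PR:

import torch
from torchao.utils import is_MI300, is_MI350, is_ROCM

def fp8_rowwise_supported() -> bool:
    # FP8 rowwise grouped GEMM: MI300/MI350 on ROCm, compute capability 9.0 (H100) on CUDA.
    if is_ROCM():
        return is_MI300() or is_MI350()
    return torch.cuda.get_device_capability() == (9, 0)
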
test/prototype/moe_training/test_scaled_grouped_mm.py

Lines changed: 33 additions & 11 deletions
@@ -30,6 +30,8 @@
 from torchao.float8.float8_training_tensor import LinearMMConfig
 from torchao.float8.float8_utils import compute_error, tensor_to_scale, to_fp8_saturated
 from torchao.prototype.moe_training.config import (
+    FP8GroupedMMConfig,
+    FP8GroupedMMRecipe,
     MXFP8GroupedMMConfig,
     MXFP8GroupedMMRecipe,
 )
@@ -47,6 +49,7 @@
 from torchao.prototype.mx_formats.mx_tensor import to_mx
 from torchao.quantization.quantize_.common import KernelPreference
 from torchao.testing.utils import skip_if_rocm
+from torchao.utils import is_MI300, is_MI350, is_ROCM
 
 # Needed since changing args to function causes recompiles
 torch._dynamo.config.cache_size_limit = 1000
@@ -56,14 +59,18 @@
     True,
     reason="Skipping FP8 rowwise test pending fix for https://github.com/pytorch/ao/issues/3788",
 )
-@skip_if_rocm("ROCm not supported")
 @pytest.mark.parametrize("m", [4096])
 @pytest.mark.parametrize("n", [8192])
 @pytest.mark.parametrize("k", [5120])
 @pytest.mark.parametrize("n_groups", [1, 2, 4, 8])
 def test_valid_scaled_grouped_mm_2d_3d(m, n, k, n_groups):
-    if not is_sm_version(9, 0):
-        pytest.skip("Skipping FP8 rowwise test, requires sm90")
+    if is_ROCM():
+        if not (is_MI300() or is_MI350()):
+            pytest.skip("FP8 rowwise test requires MI300 or MI350 on ROCm")
+    else:
+        if not is_sm_version(9, 0):
+            pytest.skip("FP8 rowwise test requires SM 9.0 on CUDA")
+
     out_dtype = torch.bfloat16
     device = "cuda"
     a = torch.randn(
@@ -86,7 +93,7 @@ def test_valid_scaled_grouped_mm_2d_3d(m, n, k, n_groups):
     b_t = b.contiguous().transpose(-2, -1).requires_grad_(True)
 
     # Compute output.
-    config = MXFP8GroupedMMConfig.from_recipe(MXFP8GroupedMMRecipe.MXFP8_EMULATED_RCEIL)
+    config = FP8GroupedMMConfig.from_recipe(FP8GroupedMMRecipe.FP8_ROWWISE)
     out = _quantize_then_scaled_grouped_mm(
         a,
         b_t,
@@ -105,15 +112,26 @@ def test_valid_scaled_grouped_mm_2d_3d(m, n, k, n_groups):
         out_dtype,
         offs,
     )
-    assert torch.equal(out, ref_out)
 
     # Run backward pass.
     out.sum().backward()
     ref_out.sum().backward()
 
     # Validate gradients.
-    assert torch.equal(a.grad, ref_a.grad)
-    assert torch.equal(b_t.grad, ref_b_t.grad)
+    if is_ROCM():
+        # ROCm: reference vs tested path use different backends:
+        # - `torch._scaled_mm` uses hipBLASLt
+        # - `_quantize_then_scaled_grouped_mm` uses CK
+        # Different backends can use different kernel implementations / accumulation order, so the
+        # outputs can differ slightly and we need tolerance.
+        # On MI300/MI325 we need rtol=atol=1e-2 for this FP8 test to pass.
+        assert torch.allclose(out, ref_out, rtol=1e-2, atol=1e-2)
+        assert torch.allclose(a.grad, ref_a.grad, rtol=1e-2, atol=1e-2)
+        assert torch.allclose(b_t.grad, ref_b_t.grad, rtol=1e-2, atol=1e-2)
+    else:
+        assert torch.equal(out, ref_out)
+        assert torch.equal(a.grad, ref_a.grad)
+        assert torch.equal(b_t.grad, ref_b_t.grad)
 
 
 @skip_if_rocm("ROCm not supported")
@@ -180,7 +198,7 @@ def compute_reference_forward(
         round_scales_to_power_of_2=float8_config.round_scales_to_power_of_2,
     )
     A_scaled = A.to(torch.float32) * A_scales
-    A_fp8 = to_fp8_saturated(A_scaled, torch.float8_e4m3fn)
+    A_fp8 = to_fp8_saturated(A_scaled, float8_config.cast_config_input.target_dtype)
 
     # Convert B^t to fp8.
     B_t_scales = tensor_to_scale(
@@ -193,7 +211,7 @@ def compute_reference_forward(
     B_t_scaled = B_t.to(torch.float32) * B_t_scales
     B_t_fp8 = to_fp8_saturated(
         B_t_scaled,
-        torch.float8_e4m3fn,
+        float8_config.cast_config_input.target_dtype,
     )
 
     # Split A and result into chunks, one for each group.
@@ -231,8 +249,12 @@ def compute_reference_forward(
             LinearMMConfig(),
             float8_config,
         )
-        assert torch.equal(result1, ref_group_result1)
-        assert torch.equal(result2, ref_group_result2)
+        if is_ROCM():
+            assert torch.allclose(result1, ref_group_result1, rtol=1e-2, atol=1e-2)
+            assert torch.allclose(result2, ref_group_result2, rtol=1e-2, atol=1e-2)
+        else:
+            assert torch.equal(result1, ref_group_result1)
+            assert torch.equal(result2, ref_group_result2)
         outputs.append(ref_group_result2)
 
     # Concatenate the outputs and verify the full result is correct.

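The tolerance switch above is the behavioral heart of this file's change: on CUDA the test keeps bitwise torch.equal, while on ROCm the reference path (torch._scaled_mm via hipBLASLt) and the tested path (CK) may accumulate in different orders. For reference, torch.allclose(a, b, rtol, atol) checks elementwise that |a - b| <= atol + rtol * |b|, so rtol=atol=1e-2 allows roughly percent-level relative error plus a small absolute floor:

import torch

a = torch.tensor([100.0, 0.001])
b = torch.tensor([100.9, 0.009])
# |a - b| <= atol + rtol * |b|: 0.9 <= 0.01 + 1.009 and 0.008 <= 0.01 + 0.00009
print(torch.allclose(a, b, rtol=1e-2, atol=1e-2))  # True
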
test/prototype/moe_training/test_training.py

Lines changed: 11 additions & 12 deletions
@@ -20,6 +20,7 @@
 )
 from torchao.quantization.quant_api import quantize_
 from torchao.quantization.quantize_.common import KernelPreference
+from torchao.utils import is_MI300, is_MI350, is_ROCM
 
 # Reference MoE implementation (copied from torchtitan to avoid external dependency)
 from .reference_moe import MoE, MoEArgs, set_token_group_alignment_size_m
@@ -97,18 +98,16 @@ def test_moe_training(
             "Skipping compile=True with kernel_preference=EMULATED, not currently supported"
         )
 
-    # FP8_ROWWISE hardware path requires SM90
-    if (
-        recipe == FP8GroupedMMRecipe.FP8_ROWWISE
-        and torch.cuda.get_device_capability()
-        != (
-            9,
-            0,
-        )
-    ):
-        pytest.skip(
-            f"Skipping FP8 rowwise tests, only supported on compute capability 9.0 and found {torch.cuda.get_device_capability()}"
-        )
+    # FP8_ROWWISE hardware path requires SM90 (CUDA) or MI300/MI350 (ROCm)
+    if recipe == FP8GroupedMMRecipe.FP8_ROWWISE:
+        if is_ROCM():
+            if not (is_MI300() or is_MI350()):
+                pytest.skip("FP8 rowwise test requires MI300 or MI350 on ROCm")
+        else:
+            if torch.cuda.get_device_capability() != (9, 0):
+                pytest.skip(
+                    f"Skipping FP8 rowwise tests, only supported on compute capability 9.0 and found {torch.cuda.get_device_capability()}"
+                )
 
     # MXFP8 hardware path requires SM100
     if recipe in (

torchao/prototype/moe_training/config.py

Lines changed: 5 additions & 1 deletion
@@ -15,7 +15,7 @@
 from torchao.prototype.mx_formats.config import ScaleCalculationMode
 from torchao.quantization.quantize_.common import KernelPreference
 from torchao.quantization.transform_module import register_quantize_module_handler
-from torchao.utils import register_as_pytree_constant
+from torchao.utils import is_MI300, register_as_pytree_constant
 
 
 class FP8GroupedMMRecipe(Enum):
@@ -45,6 +45,10 @@ class FP8GroupedMMConfig(GroupedMMConfig):
     Configuration for FP8 grouped matrix multiplication.
     """
 
+    # Float8 dtype for the FP8 grouped GEMMs.
+    float8_dtype: torch.dtype = (
+        torch.float8_e4m3fnuz if is_MI300() else torch.float8_e4m3fn
+    )
     # Output dtype for the FP8 grouped GEMMs.
     out_dtype: Optional[torch.dtype] = torch.bfloat16

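With the new field, a config built on an MI300 machine selects the fnuz dtype with no caller changes; from_recipe is the same constructor the tests above use. Note the default is evaluated once at class-definition time (it is a field default), which is fine since the GPU architecture cannot change within a process. A sketch; the printed value depends on the machine:

from torchao.prototype.moe_training.config import (
    FP8GroupedMMConfig,
    FP8GroupedMMRecipe,
)

config = FP8GroupedMMConfig.from_recipe(FP8GroupedMMRecipe.FP8_ROWWISE)
# torch.float8_e4m3fnuz on MI300 (gfx942), torch.float8_e4m3fn elsewhere.
print(config.float8_dtype)
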
torchao/prototype/moe_training/fp8_grouped_mm.py

Lines changed: 14 additions & 12 deletions
@@ -22,6 +22,7 @@ def _to_fp8_rowwise_then_scaled_grouped_mm(
     B_t: torch.Tensor,
     offs: torch.Tensor,
     out_dtype: Optional[torch.dtype] = torch.bfloat16,
+    float8_dtype: torch.dtype = torch.float8_e4m3fn,
 ) -> torch.Tensor:
     """
     Differentiable FP8 grouped matrix multiplication with dynamic FP8 rowwise quantization.
@@ -48,7 +49,7 @@
     - Scales are computed per-row and rounded to powers of 2 for efficiency
     - This function is fully differentiable via custom autograd implementation
     """
-    return _Float8GroupedMM.apply(A, B_t, offs, out_dtype)
+    return _Float8GroupedMM.apply(A, B_t, offs, out_dtype, float8_dtype)
 
 
 class _Float8GroupedMM(torch.autograd.Function):
@@ -61,6 +62,7 @@ def forward(
         B_t: torch.Tensor,
         offs: Optional[torch.Tensor] = None,
         out_dtype: Optional[torch.dtype] = torch.bfloat16,
+        float8_dtype: torch.dtype = torch.float8_e4m3fn,
     ) -> torch.Tensor:
         # torchao _quantize_then_scaled_grouped_mm only supports A=2D|3D and B=3D.
         assert A.ndim == 2 or A.ndim == 3, "A must be 2D or 3D"
@@ -100,31 +102,32 @@ def forward(
         # A_scales shape: (M,1) or (B, M, 1)
         A_scales = tensor_to_scale(
             A,
-            torch.float8_e4m3fn,
+            float8_dtype,
             scaling_granularity=ScalingGranularity.AXISWISE,
             axiswise_dim=-1,
             round_scales_to_power_of_2=True,
         )
         A_scaled = A.to(torch.float32) * A_scales
-        A_data_row_major = to_fp8_saturated(A_scaled, torch.float8_e4m3fn)
+        A_data_row_major = to_fp8_saturated(A_scaled, float8_dtype)
 
         # Convert B to float8, column-major for right operand of grouped GEMM.
         # B_t shape: (E, K, N)
         # B_t scales must be computed rowwise keeping the outer/final dim, so:
         # B_t_scales shape: (E, 1, N)
         B_t_scales = tensor_to_scale(
             B_t,
-            torch.float8_e4m3fn,
+            float8_dtype,
             scaling_granularity=ScalingGranularity.AXISWISE,
             axiswise_dim=-2,
             round_scales_to_power_of_2=True,
         )
         B_t_scaled = B_t.to(torch.float32) * B_t_scales
-        B_t_data_col_major = to_fp8_saturated(B_t_scaled, torch.float8_e4m3fn)
+        B_t_data_col_major = to_fp8_saturated(B_t_scaled, float8_dtype)
 
         # Store what we need for backward.
         ctx.save_for_backward(A, B_t, offs)
         ctx.out_dtype = out_dtype
+        ctx.float8_dtype = float8_dtype
 
         # Perform scaled grouped GEMM and return result.
         # output shape: scaled grouped mm of (M,K) @ (B,K,N) = (M,N)
@@ -154,6 +157,7 @@
     def backward(ctx, grad_output: torch.Tensor):
         A, B_t, offs = ctx.saved_tensors
         out_dtype = ctx.out_dtype
+        float8_dtype = ctx.float8_dtype
 
         # Convert grad_output to float8, row-major for left operand of grouped GEMM
         # needed for grad_A: grad_output @ B
@@ -162,21 +166,19 @@ def backward(ctx, grad_output: torch.Tensor):
         # grad_output_scale shape: (Mg, 1)
         grad_output_scales = tensor_to_scale(
             grad_output,
-            torch.float8_e4m3fn,
+            float8_dtype,
             scaling_granularity=ScalingGranularity.AXISWISE,
             axiswise_dim=-1,
             round_scales_to_power_of_2=True,
         )
         grad_output_scaled = grad_output.to(torch.float32) * grad_output_scales
-        grad_output_data_row_major = to_fp8_saturated(
-            grad_output_scaled, torch.float8_e4m3fn
-        )
+        grad_output_data_row_major = to_fp8_saturated(grad_output_scaled, float8_dtype)
 
         # Compute B fp8 column-major for right operand of grouped GEMM:
         # grad_A = grad_output @ B.
         B_data_col_major, B_scales = triton_fp8_rowwise_3d_transpose_rhs(
             B_t._data if hasattr(B_t, "_data") else B_t,
-            output_dtype=torch.float8_e4m3fn,
+            output_dtype=float8_dtype,
             round_scales_to_power_of_2=True,
         )
 
@@ -216,7 +218,7 @@ def backward(ctx, grad_output: torch.Tensor):
             .contiguous()
             .t(),  # Quantization is over 2x faster when input is col major, even with this transformation
             offs,
-            torch.float8_e4m3fn,
+            float8_dtype,
             round_scales_to_power_of_2=True,
         )
         grad_output_t_data_row_major = grad_out_data_colwise.t()
@@ -227,7 +229,7 @@ def backward(ctx, grad_output: torch.Tensor):
             .contiguous()
             .t(),  # Quantization is over 2x faster when input is col major, even with this transformation
             offs,
-            torch.float8_e4m3fn,
+            float8_dtype,
             round_scales_to_power_of_2=True,
         )

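End-to-end, the new parameter threads from the public wrapper into both forward and backward quantization sites. A sketch with illustrative shapes (sizes, group count, and offsets are made up for the example; on gfx942, config.float8_dtype now supplies the fnuz dtype shown here automatically):

import torch
from torchao.prototype.moe_training.fp8_grouped_mm import (
    _to_fp8_rowwise_then_scaled_grouped_mm,
)

# A: (M, K) token activations; B_t: (E, K, N) per-expert weights, transposed.
A = torch.randn(128, 256, device="cuda", dtype=torch.bfloat16, requires_grad=True)
B_t = torch.randn(2, 256, 512, device="cuda", dtype=torch.bfloat16, requires_grad=True)
offs = torch.tensor([64, 128], device="cuda", dtype=torch.int32)  # row offsets per group

out = _to_fp8_rowwise_then_scaled_grouped_mm(
    A,
    B_t,
    offs,
    out_dtype=torch.bfloat16,
    float8_dtype=torch.float8_e4m3fnuz,  # fnuz on gfx942; torch.float8_e4m3fn on SM90
)
out.sum().backward()  # differentiable via the custom autograd.Function above
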
torchao/prototype/moe_training/kernels/float8_rowwise.py

Lines changed: 2 additions & 0 deletions
@@ -24,7 +24,9 @@
     torch.int32: tl.int32,
     torch.int64: tl.int64,
     torch.float8_e4m3fn: tl.float8e4nv,
+    torch.float8_e4m3fnuz: tl.float8e4b8,
     torch.float8_e5m2: tl.float8e5,
+    torch.float8_e5m2fnuz: tl.float8e5b16,
     torch.float16: tl.float16,
     torch.bfloat16: tl.bfloat16,
     torch.float32: tl.float32,

torchao/prototype/moe_training/kernels/jagged_float8_scales.py

Lines changed: 2 additions & 0 deletions
@@ -29,7 +29,9 @@
     torch.int32: tl.int32,
     torch.int64: tl.int64,
     torch.float8_e4m3fn: tl.float8e4nv,
+    torch.float8_e4m3fnuz: tl.float8e4b8,
     torch.float8_e5m2: tl.float8e5,
+    torch.float8_e5m2fnuz: tl.float8e5b16,
     torch.float16: tl.float16,
     torch.bfloat16: tl.bfloat16,
     torch.float32: tl.float32,

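Both kernel files extend the same torch-to-Triton dtype table. Triton's names encode the exponent bias: tl.float8e4b8 is e4m3 with bias 8 and tl.float8e5b16 is e5m2 with bias 16 (the fnuz variants used on gfx942), versus tl.float8e4nv for the OCP/NVIDIA e4m3. A sketch of the lookup; the dict name is hypothetical since the real table's name sits outside the hunks shown:

import torch
import triton.language as tl

TORCH_TO_TRITON_DTYPE = {  # hypothetical name for the table extended above
    torch.float8_e4m3fn: tl.float8e4nv,     # OCP e4m3 (NVIDIA)
    torch.float8_e4m3fnuz: tl.float8e4b8,   # fnuz e4m3, exponent bias 8 (gfx942)
    torch.float8_e5m2: tl.float8e5,
    torch.float8_e5m2fnuz: tl.float8e5b16,  # fnuz e5m2, exponent bias 16 (gfx942)
}

tl_dtype = TORCH_TO_TRITON_DTYPE[torch.float8_e4m3fnuz]
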
torchao/prototype/moe_training/tensor.py

Lines changed: 1 addition & 0 deletions
@@ -264,6 +264,7 @@ def _quantize_then_scaled_grouped_mm(
             B_t,
             offs,
             config.out_dtype,
+            config.float8_dtype,
         )
     elif isinstance(config, MXFP8GroupedMMConfig):
         return _to_mxfp8_then_scaled_grouped_mm(

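Finally, _quantize_then_scaled_grouped_mm forwards config.float8_dtype into the rowwise path, so the module-swap flow picks the platform-appropriate format. A hedged sketch of that flow, assuming the standard torchao quantize_ filter_fn convention; the model and name filter are illustrative:

import torch.nn as nn
from torchao.quantization.quant_api import quantize_
from torchao.prototype.moe_training.config import (
    FP8GroupedMMConfig,
    FP8GroupedMMRecipe,
)

model = nn.Sequential(nn.Linear(8, 8))  # stand-in; real usage targets MoE expert weights
config = FP8GroupedMMConfig.from_recipe(FP8GroupedMMRecipe.FP8_ROWWISE)
# Convert only modules whose fully-qualified name marks them as experts.
quantize_(model, config, filter_fn=lambda mod, fqn: "experts" in fqn)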