
Commit ba33d18

Author: shekhar.pandey@amd.com
[ROCm] MXFP8 MoE: persistent grouped kernel, F.scaled_mm dense, correctness + tests
- Persistent grouped-MM kernel (grid = num_CUs * ctas_per_cu): walks experts in-kernel via a global tile counter. This avoids the silent row-dropping that per-expert (M + E - 1) // E tile bounds can cause on jagged groups, and keeps the dispatcher torch.compile-clean.
- Dense MXFP8 path: dispatch to F.scaled_mm with BlockWise1x32 scaling.
- Wgrad: retune the default tile to (BN=256, BK=256, BM=64, nw=8).
- K-tail and scale-tail masking; m_mask is bounded by both group_end and the global M.
- torch.compile: register the pad/unpad helpers as torch.library.custom_op (with fake impls); skip nonstrict_trace on ROCm.
- mx_linear / MXFP8TrainingOpConfig: drop the is_ROCM() auto-switch; expose mxfp8_dim1_cast_kernel_choice as an explicit argument (CUDA default).
- bench_2d_3d_grouped_gemm.py: run on MI350+ via bench_mxfp8_grouped_mm_rocm; fix the FLOPs formula to 2 * M * N * K (M is already the total token count across all experts, so the old factor of E double-counted the work).

Tested on MI355X / gfx950 / ROCm 7.1 / Triton 3.7:
- Accuracy: test/prototype/moe_training/test_mxfp8_grouped_mm.py -> 129 passed, 16 skipped. SQNR margins: out >= 27.6 (threshold 27), in_grad >= 25.2 (threshold 25), w_grad >= 25.5 (threshold 24).
- Perf: benchmarks/prototype/moe_training/bench_2d_3d_grouped_gemm.py
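A host-side sketch of the tile walk described above (plain Python with hypothetical names: persistent_tile_walk, group_ends, BLOCK_M/BLOCK_N, num_programs; the real implementation is a Triton kernel, and the stride-by-grid claiming below stands in for its global atomic tile counter):

import itertools

def persistent_tile_walk(group_ends, N, BLOCK_M, BLOCK_N, num_programs):
    """Yield (program_id, expert, m_tile, n_tile) claims for a persistent grid
    of num_programs = num_CUs * ctas_per_cu programs. Every tile of every
    expert is visited exactly once, including ragged tail tiles, because the
    walk is driven by exact per-group tile counts rather than a uniform
    (M + E - 1) // E per-expert bound."""
    starts = [0] + list(group_ends[:-1])
    m_tiles = [(end - start + BLOCK_M - 1) // BLOCK_M
               for start, end in zip(starts, group_ends)]
    n_tiles = (N + BLOCK_N - 1) // BLOCK_N
    # Exclusive prefix sum of tiles per expert, for flat-id -> expert lookup.
    tile_starts, total = [], 0
    for mt in m_tiles:
        tile_starts.append(total)
        total += mt * n_tiles
    for pid in range(num_programs):
        tile = pid                       # first tile claimed by this program
        while tile < total:
            expert = max(e for e, ts in enumerate(tile_starts) if ts <= tile)
            local = tile - tile_starts[expert]
            yield pid, expert, local // n_tiles, local % n_tiles
            tile += num_programs         # stride to this program's next tile

# Sanity check: jagged groups of 100/150/50 rows are fully covered by 8 programs.
claims = list(persistent_tile_walk([100, 250, 300], N=256, BLOCK_M=64, BLOCK_N=128, num_programs=8))
assert len(claims) == len(set((c[1], c[2], c[3]) for c in claims))  # no tile claimed twice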
1 parent 42ff8ed commit ba33d18

13 files changed

Lines changed: 774 additions & 30 deletions


benchmarks/prototype/moe_training/bench_2d_3d_grouped_gemm.py

Lines changed: 35 additions & 7 deletions
@@ -23,6 +23,7 @@
 )
 from torchao.prototype.moe_training.utils import generate_jagged_offs
 from torchao.prototype.mx_formats.mx_tensor import to_mx
+from torchao.utils import is_MI350

 device = torch.device("cuda")

@@ -115,13 +116,17 @@ def run_experiment(
     fp8_rowwise_us = bench_fp8_rowwise_grouped_mm(A, B_t, offs)

     # benchmark mxfp8 grouped mm
-    if torch.cuda.get_device_capability() != (10, 0):
+    if torch.cuda.get_device_capability() == (10, 0):
+        mxfp8_us = bench_mxfp8_grouped_mm(A, B_t, offs)
+    elif is_MI350():
+        mxfp8_us = bench_mxfp8_grouped_mm_rocm(A, B_t, offs)
+    else:
         logging.warning(
-            f"Skipping MXFP8 benchmarks, only supported on compute capability 10.0 and found {torch.cuda.get_device_capability()}"
+            f"Skipping MXFP8 benchmarks, only supported on CUDA SM 10.0 or MI350+ "
+            f"(found device_capability={torch.cuda.get_device_capability()}, "
+            f"hip={torch.version.hip})"
         )
         mxfp8_us = float("inf")
-    else:
-        mxfp8_us = bench_mxfp8_grouped_mm(A, B_t, offs)

     return ExperimentResult(
         bf16_us=round(bf16_us, 3),
@@ -148,13 +153,12 @@ def print_results(experiments: List[Experiment]):
     rows = []
     for experiment in experiments:
         # calculate tflops
-        e, m, n, k = (
-            experiment.config.e,
+        m, n, k = (
             experiment.config.m,
             experiment.config.n,
             experiment.config.k,
         )
-        flops = 2 * e * m * n * k
+        flops = 2 * m * n * k
         bf16_tflops = (flops / 1e12) / (experiment.result.bf16_us / 1e6)
         fp8_rowwise_tflops = (flops / 1e12) / (experiment.result.fp8_rowwise_us / 1e6)
         mxfp8_tflops = (flops / 1e12) / (experiment.result.mxfp8_us / 1e6)
@@ -247,6 +251,30 @@ def bench_mxfp8_grouped_mm(A, B_t, offs, block_size=32) -> float:
     return mxfp8_us


+def bench_mxfp8_grouped_mm_rocm(A, B_t, offs, block_size=32) -> float:
+    from torchao.prototype.moe_training.kernels.mxfp8.rocm_mxfp8_mm import (
+        triton_mxfp8_grouped_mm,
+    )
+
+    A_scales, A_fp8 = to_mx(A, elem_dtype=torch.float8_e4m3fn, block_size=block_size)
+    B_nkK = B_t.transpose(-2, -1).contiguous()
+    B_scales, B_fp8 = to_mx(B_nkK, elem_dtype=torch.float8_e4m3fn, block_size=block_size)
+
+    E = offs.shape[0]
+    Mg = A.shape[0]
+    offs_mxfp8 = generate_jagged_offs(E, Mg, multiple_of=block_size)
+
+    mxfp8_us = benchmark_cuda_function_in_microseconds(
+        triton_mxfp8_grouped_mm,
+        A_fp8,
+        B_fp8,
+        A_scales,
+        B_scales,
+        offs_mxfp8,
+    )
+    return mxfp8_us
+
+
 def main(args: argparse.Namespace):
     torch.random.manual_seed(123)
     configs = get_configs()
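The FLOPs fix above follows from the shapes the benchmark uses: A stacks every expert's tokens into one 2D matrix, so its row count already sums over experts. A small sanity check (shape values are illustrative, not the benchmark's defaults):

import torch

E, total_M, K, N = 8, 4096, 1024, 2048   # total_M = tokens across ALL experts
A = torch.randn(total_M, K)              # 2D activations, experts stacked
B_t = torch.randn(E, K, N)               # 3D per-expert weights

# Each of the total_M rows hits exactly one expert's (K, N) weight, so the
# grouped GEMM does 2 * total_M * N * K FLOPs; multiplying by E again, as the
# old formula did, overcounts the work E-fold.
flops = 2 * total_M * N * K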

benchmarks/prototype/moe_training/benchmark_scaled_grouped_mm_dq.py

Lines changed: 6 additions & 2 deletions
@@ -249,9 +249,13 @@ def main(args: argparse.Namespace):
         elif config.recipe in (
             MXFP8TrainingRecipe.MXFP8_RCEIL,
             MXFP8TrainingRecipe.MXFP8_RCEIL_WGRAD_WITH_HP,
-        ) and torch.cuda.get_device_capability() != (10, 0):
+        ) and not (
+            torch.cuda.get_device_capability() == (10, 0) or is_MI350()
+        ):
             logging.warning(
-                f"Skipping MXFP8 benchmarks, only supported on compute capability 10.0 and found {torch.cuda.get_device_capability()}"
+                f"Skipping MXFP8 benchmarks, only supported on CUDA SM 10.0 or MI350+ "
+                f"(found device_capability={torch.cuda.get_device_capability()}, "
+                f"hip={torch.version.hip})"
             )
             continue

test/prototype/moe_training/test_mxfp8_grouped_mm.py

Lines changed: 1 addition & 8 deletions
@@ -42,13 +42,11 @@
 )
 from torchao.prototype.mx_formats.mx_tensor import MXTensor, to_mx
 from torchao.quantization.quantize_.common import KernelPreference
-from torchao.testing.utils import skip_if_rocm

 # Needed since changing args to function causes recompiles
 torch._dynamo.config.cache_size_limit = 1000


-@skip_if_rocm("ROCm not supported")
 @pytest.mark.parametrize("M,K,N", [(1024, 1024, 1024), (1024, 2048, 4096)])
 @pytest.mark.parametrize("num_experts", (1, 8, 16))
 def test_emulate_mxfp8_grouped_gemm_2d_3d(M, K, N, num_experts):
@@ -80,7 +78,6 @@ def test_emulate_mxfp8_grouped_gemm_2d_3d(M, K, N, num_experts):
     assert sqnr >= min_sqnr, f"sqnr {sqnr} is too low, must be >= {min_sqnr}"


-@skip_if_rocm("ROCm not supported")
 @pytest.mark.parametrize("M", (1024, 4096))
 @pytest.mark.parametrize("N", (1024, 4096))
 @pytest.mark.parametrize("num_experts", (8, 16))
@@ -128,7 +125,6 @@ def test_emulate_mxfp8_grouped_gemm_2d_2d(M, N, num_experts):
     assert sqnr >= min_sqnr, f"sqnr {sqnr} is too low, must be >= {min_sqnr}"


-@skip_if_rocm("ROCm not supported")
 @pytest.mark.parametrize("M,K,N", [(32768, 5120, 8192), (16640, 7168, 2048)])
 @pytest.mark.parametrize("num_experts", (1, 8))
 @pytest.mark.parametrize("wgrad_with_hp", (True, False))
@@ -152,7 +148,7 @@ def test_mxfp8_grouped_gemm_with_dq_fwd_bwd(
     pad_token_groups_for_grouped_mm,
 ):
     # MXFP8 hardware path requires SM100
-    if kernel_preference != KernelPreference.EMULATED and not is_sm_version(10, 0):
+    if kernel_preference != KernelPreference.EMULATED and not (is_sm_version(10, 0) or is_MI350()):
         pytest.skip(
             f"Skipping MXFP8 hardware mode tests, only supported on compute capability 10.0 and found {torch.cuda.get_device_capability()}"
         )
@@ -225,7 +221,6 @@ def test_mxfp8_grouped_gemm_with_dq_fwd_bwd(
     )


-@skip_if_rocm("ROCm not supported")
 def test_mxfp8_grouped_gemm_from_qdata_and_scales_matches_dynamic():
     block_size = 32
     M, K, N, num_experts = 4096, 1024, 2048, 8
@@ -298,7 +293,6 @@ def test_mxfp8_grouped_gemm_from_qdata_and_scales_matches_dynamic():
     )


-@skip_if_rocm("ROCm not supported")
 def test_mxfp8_grouped_gemm_from_qdata_and_scales_forward():
     block_size = 32
     M, K, N, num_experts = 4096, 1024, 2048, 8
@@ -352,7 +346,6 @@ def test_mxfp8_grouped_gemm_from_qdata_and_scales_forward():
     )


-@skip_if_rocm("ROCm not supported")
 def test_mxfp8_grouped_gemm_mxtensor_requires_wgrad_with_hp():
     block_size = 32
     M, K, N, num_experts = 1024, 1024, 2048, 4

torchao/prototype/moe_training/config.py

Lines changed: 14 additions & 1 deletion
@@ -12,7 +12,10 @@
 from torch import nn

 from torchao.core.config import AOBaseConfig
-from torchao.prototype.mx_formats.config import ScaleCalculationMode
+from torchao.prototype.mx_formats.config import (
+    MXFP8Dim1CastKernelChoice,
+    ScaleCalculationMode,
+)
 from torchao.quantization.quantize_.common import KernelPreference
 from torchao.quantization.transform_module import register_quantize_module_handler
 from torchao.utils import is_MI300, register_as_pytree_constant
@@ -131,6 +134,13 @@ class MXFP8TrainingOpConfig(TrainingOpBaseConfig):
     # Whether to pad the token group sizes to multiples of 32 (MXFP8 scaling block size).
     pad_token_groups_for_grouped_mm: bool = False

+    # Kernel used for the MXFP8 dim1 cast in backward (wgrad path). Default is
+    # CUDA (best on CUDA SM100+). On backends without the CUDA kernel (e.g.
+    # ROCm), set to MXFP8Dim1CastKernelChoice.TRITON.
+    mxfp8_dim1_cast_kernel_choice: MXFP8Dim1CastKernelChoice = (
+        MXFP8Dim1CastKernelChoice.CUDA
+    )
+
     @classmethod
     def from_recipe(
         cls,
@@ -173,6 +183,8 @@ def __eq__(self, other):
             and self.scale_calculation_mode == other.scale_calculation_mode
             and self.pad_token_groups_for_grouped_mm
             == other.pad_token_groups_for_grouped_mm
+            and self.mxfp8_dim1_cast_kernel_choice
+            == other.mxfp8_dim1_cast_kernel_choice
         )
         return NotImplemented
@@ -184,6 +196,7 @@ def __hash__(self):
                 self.wgrad_with_hp,
                 self.scale_calculation_mode,
                 self.pad_token_groups_for_grouped_mm,
+                self.mxfp8_dim1_cast_kernel_choice,
             )
         )
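A usage sketch for the new field, assuming the config's other fields keep the defaults shown in this diff:

from torchao.prototype.moe_training.config import MXFP8TrainingOpConfig
from torchao.prototype.mx_formats.config import MXFP8Dim1CastKernelChoice

# On ROCm there is no CUDA dim1-cast kernel, so opt into the Triton one
# explicitly; on CUDA SM100+ the default (CUDA) is the fast path.
config = MXFP8TrainingOpConfig(
    mxfp8_dim1_cast_kernel_choice=MXFP8Dim1CastKernelChoice.TRITON,
)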

torchao/prototype/moe_training/kernels/mxfp8/__init__.py

Lines changed: 4 additions & 0 deletions
@@ -15,3 +15,7 @@
     triton_mx_block_rearrange_2d_M_groups,  # noqa: F401
     triton_mx_block_rearrange_per_group_3d,  # noqa: F401
 )
+from torchao.prototype.moe_training.kernels.mxfp8.rocm_mxfp8_mm import (
+    triton_mxfp8_grouped_mm,  # noqa: F401
+    triton_mxfp8_wgrad,  # noqa: F401
+)

torchao/prototype/moe_training/kernels/mxfp8/quant.py

Lines changed: 34 additions & 0 deletions
@@ -260,6 +260,7 @@ def compute_blocked_scale_offsets_for_K_groups(
     return group_sizes, starting_col_after_padding


+@torch.library.custom_op("torchao::torch_pad_token_groups", mutates_args=())
 def torch_pad_token_groups(
     inputs: torch.Tensor, group_offsets: torch.Tensor, alignment_size: int
 ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
@@ -323,6 +324,27 @@ def torch_pad_token_groups(
     return padded_tokens, padded_start_offsets, padded_offsets


+@torch_pad_token_groups.register_fake
+def _torch_pad_token_groups_fake(
+    inputs: torch.Tensor, group_offsets: torch.Tensor, alignment_size: int
+) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+    num_tokens, dim = inputs.shape
+    num_groups = group_offsets.shape[0]
+    output_rows = num_tokens + num_groups * alignment_size
+    output_rows = (
+        (output_rows + alignment_size - 1) // alignment_size
+    ) * alignment_size
+    padded_tokens = inputs.new_empty((output_rows, dim))
+    padded_group_start_offsets = torch.empty(
+        (num_groups,), dtype=torch.int32, device=inputs.device
+    )
+    padded_group_end_offsets = torch.empty(
+        (num_groups,), dtype=torch.int32, device=inputs.device
+    )
+    return padded_tokens, padded_group_start_offsets, padded_group_end_offsets
+
+
+@torch.library.custom_op("torchao::torch_unpad_token_groups", mutates_args=())
 def torch_unpad_token_groups(
     padded_inputs: torch.Tensor,
     group_offsets: torch.Tensor,
@@ -373,6 +395,18 @@ def torch_unpad_token_groups(
     return unpadded_tokens


+@torch_unpad_token_groups.register_fake
+def _torch_unpad_token_groups_fake(
+    padded_inputs: torch.Tensor,
+    group_offsets: torch.Tensor,
+    padded_group_start_offsets: torch.Tensor,
+    num_tokens: int,
+    alignment_size: int,
+) -> torch.Tensor:
+    dim = padded_inputs.shape[1]
+    return padded_inputs.new_empty((num_tokens, dim))
+
+
 if torch_version_at_least("2.7.0") and has_triton():
     import triton
     import triton.language as tl
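The register_fake implementations above are what let torch.compile trace through these ops: at trace time dynamo never runs the real kernel, it only asks the fake for output shapes and dtypes. A minimal self-contained illustration of the same pattern (hypothetical demo::pad_rows op, not the torchao one):

import torch

@torch.library.custom_op("demo::pad_rows", mutates_args=())
def pad_rows(x: torch.Tensor, multiple: int) -> torch.Tensor:
    rows = ((x.shape[0] + multiple - 1) // multiple) * multiple
    out = x.new_zeros((rows, x.shape[1]))   # real eager implementation
    out[: x.shape[0]] = x
    return out

@pad_rows.register_fake
def _(x: torch.Tensor, multiple: int) -> torch.Tensor:
    # Shape-only computation; runs under FakeTensor during tracing.
    rows = ((x.shape[0] + multiple - 1) // multiple) * multiple
    return x.new_empty((rows, x.shape[1]))

@torch.compile(fullgraph=True)
def f(x):
    return pad_rows(x, 32).sum()

print(f(torch.randn(100, 8)))  # traces via the fake, executes the real op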
