
Commit b6bf1a5

[mxfp8 moe training] default pad_token_groups_for_grouped_mm to False
1 parent: eb64bfb · commit: b6bf1a5

4 files changed

Lines changed: 36 additions & 12 deletions
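
In short: torchao's MXFP8 MoE training path can pad each token group so its size is a multiple of the MX block size (32) before calling the scaled grouped GEMM. This commit flips pad_token_groups_for_grouped_mm from True to False by default, so callers whose token groups are already aligned (e.g. a HybridEP-style flow, per the new --aligned benchmark flag below) don't pay for runtime padding, and unaligned callers opt in explicitly. As a hedged illustration of what such padding does at the offsets level, here is a minimal sketch; pad_group_offsets is a hypothetical name, not torchao's actual pad_token_groups helper:

# Minimal sketch: round each token group size up to a multiple of the MX block
# size (32). Illustrative only; torchao's pad_token_groups/unpad_token_groups
# helpers (imported in the last diff below) are the real implementation.
import torch

def pad_group_offsets(offs: torch.Tensor, multiple_of: int = 32) -> torch.Tensor:
    # offs holds cumulative end indices of each token group, e.g. [40, 100, 128]
    zero = torch.zeros(1, dtype=offs.dtype, device=offs.device)
    sizes = torch.diff(offs, prepend=zero)
    padded_sizes = ((sizes + multiple_of - 1) // multiple_of) * multiple_of
    return torch.cumsum(padded_sizes, dim=0)

print(pad_group_offsets(torch.tensor([40, 100, 128])))  # tensor([ 64, 128, 160])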


benchmarks/prototype/moe_training/benchmark_scaled_grouped_mm_dq.py

Lines changed: 17 additions & 7 deletions
@@ -116,7 +116,17 @@ def run_experiment(
         requires_grad=True,
     ).transpose(-2, -1)
 
-    offs = generate_jagged_offs(G, total_M, multiple_of=1)
+    # Create config object from recipe
+    if isinstance(config.recipe, Float8TrainingRecipe):
+        quant_config = Float8TrainingOpConfig.from_recipe(config.recipe)
+        alignment_size = 16 if args.aligned else 1
+        # TODO: support pad_token_groups_for_grouped_mm option in Float8TrainingOpConfig
+    else:
+        quant_config = MXFP8TrainingOpConfig.from_recipe(config.recipe)
+        quant_config.pad_token_groups_for_grouped_mm = not args.aligned
+        alignment_size = 32 if args.aligned else 1
+
+    offs = generate_jagged_offs(G, total_M, multiple_of=alignment_size)
 
     # fwd_bwd bf16 benchmark + profiling
     bf16_fwd_bwd_us = bench_fwd_bwd_microseconds(
@@ -138,12 +148,6 @@ def run_experiment(
         profile_name="bf16_profile",
     )
 
-    # Create config object from recipe
-    if isinstance(config.recipe, Float8TrainingRecipe):
-        quant_config = Float8TrainingOpConfig.from_recipe(config.recipe)
-    else:
-        quant_config = MXFP8TrainingOpConfig.from_recipe(config.recipe)
-
     # fwd_bwd scaled benchmark + profiling
     scaled_fwd_bwd_us = bench_fwd_bwd_microseconds(
         _quantize_then_scaled_grouped_mm,
@@ -262,5 +266,11 @@ def main(args: argparse.Namespace):
     arg_parser = argparse.ArgumentParser()
     arg_parser.add_argument("--compile", action="store_true")
     arg_parser.add_argument("--profile", action="store_true")
+    arg_parser.add_argument(
+        "--aligned",
+        action="store_true",
+        help="If true, token group sizes are pre-aligned, to simulate flow with HybridEP or similar",
+    )
+
     args = arg_parser.parse_args()
     main(args)
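
The new --aligned flag drives both knobs from one place: when set, offsets are generated with multiple_of equal to the recipe's alignment size (16 for FP8 rowwise, 32 for MXFP8) and runtime padding is disabled; when unset, group boundaries are arbitrary and the MXFP8 config pads at runtime. For example: python benchmarks/prototype/moe_training/benchmark_scaled_grouped_mm_dq.py --aligned --compile. generate_jagged_offs itself isn't shown in this commit; below is a hedged sketch of the behavior its signature implies, not the torchao implementation:

# Hypothetical sketch of generate_jagged_offs(G, total_M, multiple_of=k):
# produce G cumulative group offsets summing to total_M, with every group
# size a positive multiple of k. Illustrative only.
import torch

def generate_jagged_offs_sketch(G: int, total_M: int, multiple_of: int = 1) -> torch.Tensor:
    assert total_M % multiple_of == 0 and total_M // multiple_of >= G
    units = total_M // multiple_of
    # choose G-1 distinct cut points in [1, units-1], so every group gets >= 1 unit
    cuts = torch.sort(torch.randperm(units - 1)[: G - 1] + 1).values
    zero = torch.zeros(1, dtype=cuts.dtype)
    end = torch.tensor([units], dtype=cuts.dtype)
    sizes = torch.diff(cuts, prepend=zero, append=end)
    return torch.cumsum(sizes * multiple_of, dim=0)

torch.manual_seed(0)
print(generate_jagged_offs_sketch(G=4, total_M=256, multiple_of=32))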

test/prototype/moe_training/test_training.py

Lines changed: 14 additions & 1 deletion
@@ -37,6 +37,7 @@
 @pytest.mark.parametrize(
     "kernel_preference", [KernelPreference.AUTO, KernelPreference.EMULATED]
 )
+@pytest.mark.parametrize("token_groups_aligned", [False])
 @pytest.mark.parametrize(
     "recipe_config",
     [
@@ -74,6 +75,7 @@ def test_moe_training(
     target_fqns: list[str],
     compile: bool,
     kernel_preference: KernelPreference,
+    token_groups_aligned: bool,
     recipe_config: dict,
 ):
     (
@@ -110,6 +112,8 @@ def test_moe_training(
         pytest.skip(
             f"Skipping FP8 rowwise tests, only supported on compute capability 9.0 and found {torch.cuda.get_device_capability()}"
         )
+        if not token_groups_aligned:
+            pytest.skip("FP8 rowwise doesn't support per group token padding yet")
 
     # MXFP8 hardware path requires SM100
     if recipe in (
@@ -123,7 +127,11 @@ def test_moe_training(
             f"Skipping MXFP8 hardware mode tests, only supported on compute capability 10.0 and found {torch.cuda.get_device_capability()}"
         )
 
-    set_token_group_alignment_size_m(1)
+    alignment_size = 32 if isinstance(recipe, MXFP8TrainingRecipe) else 16
+    if not token_groups_aligned:
+        alignment_size = 1
+    set_token_group_alignment_size_m(alignment_size)
+
     model_args = MoEArgs(
         num_experts=8,
         num_shared_experts=1,
@@ -159,6 +167,11 @@ def moe_module_filter_fn(mod: nn.Module, cur_fqn: str) -> bool:
         else Float8TrainingOpConfig
     )
     config = config_cls.from_recipe(recipe)
+
+    # TODO: support pad_token_groups_for_grouped_mm in Float8TrainingOpConfig
+    if isinstance(recipe, MXFP8TrainingRecipe) and not token_groups_aligned:
+        config.pad_token_groups_for_grouped_mm = True
+
     quantize_(model, config=config, filter_fn=moe_module_filter_fn)
 
     # validate that only the experts were converted
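
The alignment sizes the test chooses mirror the kernel requirements: MXFP8 scales along 1x32 blocks, so pre-aligned token groups must be multiples of 32, while the FP8 rowwise path requires multiples of 16 (a kernel alignment requirement). A trivial restatement of the rule encoded above, with an illustrative function name:

# Restates the alignment rule from the test diff above; illustrative only.
def required_alignment_m(is_mxfp8: bool, token_groups_aligned: bool) -> int:
    if not token_groups_aligned:
        return 1  # unaligned groups; MXFP8 relies on runtime padding instead
    return 32 if is_mxfp8 else 16

assert required_alignment_m(True, True) == 32
assert required_alignment_m(False, True) == 16
assert required_alignment_m(True, False) == 1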

torchao/prototype/moe_training/config.py

Lines changed: 3 additions & 3 deletions
@@ -119,23 +119,23 @@ def from_recipe(
                 out_dtype=torch.bfloat16,
                 wgrad_with_hp=False,
                 scale_calculation_mode=ScaleCalculationMode.RCEIL,
-                pad_token_groups_for_grouped_mm=True,
+                pad_token_groups_for_grouped_mm=False,
             )
         elif recipe == MXFP8TrainingRecipe.MXFP8_RCEIL_WGRAD_WITH_HP:
             return cls(
                 kernel_preference=KernelPreference.AUTO,
                 out_dtype=torch.bfloat16,
                 wgrad_with_hp=True,
                 scale_calculation_mode=ScaleCalculationMode.RCEIL,
-                pad_token_groups_for_grouped_mm=True,
+                pad_token_groups_for_grouped_mm=False,
             )
         elif recipe == MXFP8TrainingRecipe.MXFP8_EMULATED_RCEIL:
             return cls(
                 kernel_preference=KernelPreference.EMULATED,
                 out_dtype=torch.bfloat16,
                 wgrad_with_hp=False,
                 scale_calculation_mode=ScaleCalculationMode.RCEIL,
-                pad_token_groups_for_grouped_mm=True,
+                pad_token_groups_for_grouped_mm=False,
             )
         else:
             raise ValueError(f"Unsupported MXFP8 recipe: {recipe}")
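
All three MXFP8 recipes now construct their op config with pad_token_groups_for_grouped_mm=False, so runtime padding becomes opt-in. A short sketch of opting back in, assuming MXFP8TrainingOpConfig and MXFP8TrainingRecipe are both importable from this module (only the config class is confirmed to live here by this diff):

# Sketch: re-enable runtime token-group padding on top of the new default.
from torchao.prototype.moe_training.config import (
    MXFP8TrainingOpConfig,
    MXFP8TrainingRecipe,
)

config = MXFP8TrainingOpConfig.from_recipe(MXFP8TrainingRecipe.MXFP8_EMULATED_RCEIL)
assert config.pad_token_groups_for_grouped_mm is False  # new default
config.pad_token_groups_for_grouped_mm = True  # opt in when groups are unaligned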

torchao/prototype/moe_training/mxfp8_grouped_mm.py

Lines changed: 2 additions & 1 deletion
@@ -19,6 +19,7 @@
     triton_mx_block_rearrange_per_group_3d,
 )
 from torchao.prototype.moe_training.utils import (
+    conditional_nostrict_trace,
     pad_token_groups,
     unpad_token_groups,
 )
@@ -77,7 +78,7 @@ def _validate_grouped_mm_input_act(
 
 
 # Aliases for convenience/clarity
-# @conditional_nostrict_trace
+@conditional_nostrict_trace
 def _to_mxfp8_then_scaled_grouped_mm(
     A: torch.Tensor,
     B_t: torch.Tensor,
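
This hunk also re-enables the conditional_nostrict_trace decorator on _to_mxfp8_then_scaled_grouped_mm (it was previously commented out) and adds the matching import. The helper's body isn't part of this diff; below is a hypothetical sketch of what a decorator with this name typically does, NOT the torchao implementation:

# Hypothetical sketch: wrap the function with torch._dynamo.nonstrict_trace
# when that API is available (PyTorch 2.7+), so torch.compile traces it
# non-strictly; otherwise return the function unchanged.
import torch
import torch._dynamo

def conditional_nostrict_trace_sketch(fn):
    nonstrict_trace = getattr(torch._dynamo, "nonstrict_trace", None)
    if nonstrict_trace is None:
        return fn  # older PyTorch: no-op
    return nonstrict_trace(fn)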
