
Commit 10362fc

[moe training] default pad_token_groups_for_grouped_mm=False
1 parent 77f23d0 commit 10362fc

3 files changed, with 34 additions and 11 deletions


benchmarks/prototype/moe_training/benchmark_scaled_grouped_mm_dq.py

Lines changed: 17 additions & 7 deletions
@@ -116,7 +116,17 @@ def run_experiment(
         requires_grad=True,
     ).transpose(-2, -1)
 
-    offs = generate_jagged_offs(G, total_M, multiple_of=1)
+    # Create config object from recipe
+    if isinstance(config.recipe, Float8TrainingRecipe):
+        quant_config = Float8TrainingOpConfig.from_recipe(config.recipe)
+        alignment_size = 16 if args.aligned else 1
+        # TODO: support pad_token_groups_for_grouped_mm option in Float8TrainingOpConfig
+    else:
+        quant_config = MXFP8TrainingOpConfig.from_recipe(config.recipe)
+        quant_config.pad_token_groups_for_grouped_mm = not args.aligned
+        alignment_size = 32 if args.aligned else 1
+
+    offs = generate_jagged_offs(G, total_M, multiple_of=alignment_size)
 
     # fwd_bwd bf16 benchmark + profiling
     bf16_fwd_bwd_us = bench_fwd_bwd_microseconds(
@@ -138,12 +148,6 @@ def run_experiment(
         profile_name="bf16_profile",
     )
 
-    # Create config object from recipe
-    if isinstance(config.recipe, Float8TrainingRecipe):
-        quant_config = Float8TrainingOpConfig.from_recipe(config.recipe)
-    else:
-        quant_config = MXFP8TrainingOpConfig.from_recipe(config.recipe)
-
     # fwd_bwd scaled benchmark + profiling
     scaled_fwd_bwd_us = bench_fwd_bwd_microseconds(
         _quantize_then_scaled_grouped_mm,
@@ -262,5 +266,11 @@ def main(args: argparse.Namespace):
     arg_parser = argparse.ArgumentParser()
     arg_parser.add_argument("--compile", action="store_true")
     arg_parser.add_argument("--profile", action="store_true")
+    arg_parser.add_argument(
+        "--aligned",
+        action="store_true",
+        help="If true, token group sizes are pre-aligned, to simulate flow with HybridEP or similar",
+    )
+
     args = arg_parser.parse_args()
     main(args)
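
The benchmark now derives the offset alignment from the new --aligned flag before calling generate_jagged_offs. As a hypothetical, self-contained illustration of what generate_jagged_offs(G, total_M, multiple_of=alignment_size) is expected to produce (this is not torchao's implementation), the sketch below builds end-offsets for G token groups whose sizes are each a multiple of the alignment:

import torch

# Hypothetical stand-in for generate_jagged_offs: returns cumulative end-offsets
# of G token groups whose sizes are each a multiple of `multiple_of`.
def jagged_offs_sketch(G: int, total_M: int, multiple_of: int = 1) -> torch.Tensor:
    assert total_M % multiple_of == 0, "total tokens must be divisible by the alignment"
    num_blocks = total_M // multiple_of
    blocks = torch.ones(G, dtype=torch.int64)        # at least one block per group
    extra = torch.randint(0, G, (num_blocks - G,))   # scatter the remaining blocks randomly
    blocks += torch.bincount(extra, minlength=G)
    group_sizes = blocks * multiple_of               # every group size is a multiple
    return torch.cumsum(group_sizes, dim=0)          # offsets fed to the grouped GEMM

# With --aligned, the benchmark uses multiple_of=16 (FP8) or 32 (MXFP8), so the grouped
# GEMM needs no per-group padding; without it, multiple_of=1 and padding is left to the
# MXFP8 op config (pad_token_groups_for_grouped_mm = not args.aligned).
offs = jagged_offs_sketch(G=8, total_M=8192, multiple_of=32)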

test/prototype/moe_training/test_training.py

Lines changed: 14 additions & 1 deletion
@@ -37,6 +37,7 @@
 @pytest.mark.parametrize(
     "kernel_preference", [KernelPreference.AUTO, KernelPreference.EMULATED]
 )
+@pytest.mark.parametrize("token_groups_aligned", [False])
 @pytest.mark.parametrize(
     "recipe_config",
     [
@@ -74,6 +75,7 @@ def test_moe_training(
     target_fqns: list[str],
     compile: bool,
     kernel_preference: KernelPreference,
+    token_groups_aligned: bool,
     recipe_config: dict,
 ):
     (
@@ -105,6 +107,8 @@
         pytest.skip(
             f"Skipping FP8 rowwise tests, only supported on compute capability 9.0 and found {torch.cuda.get_device_capability()}"
         )
+    if not token_groups_aligned:
+        pytest.skip("FP8 rowwise doesn't support per group token padding yet")
 
     # MXFP8 hardware path requires SM100
     if recipe in (
@@ -118,7 +122,11 @@
             f"Skipping MXFP8 hardware mode tests, only supported on compute capability 10.0 and found {torch.cuda.get_device_capability()}"
         )
 
-    set_token_group_alignment_size_m(1)
+    alignment_size = 32 if isinstance(recipe, MXFP8TrainingRecipe) else 16
+    if not token_groups_aligned:
+        alignment_size = 1
+    set_token_group_alignment_size_m(alignment_size)
+
     model_args = MoEArgs(
         num_experts=8,
         num_shared_experts=1,
@@ -154,6 +162,11 @@ def moe_module_filter_fn(mod: nn.Module, cur_fqn: str) -> bool:
         else Float8TrainingOpConfig
     )
     config = config_cls.from_recipe(recipe)
+
+    # TODO: support pad_token_groups_for_grouped_mm in Float8TrainingOpConfig
+    if isinstance(recipe, MXFP8TrainingRecipe) and not token_groups_aligned:
+        config.pad_token_groups_for_grouped_mm = True
+
     quantize_(model, config=config, filter_fn=moe_module_filter_fn)
 
     # validate that only the experts were converted
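
The updated test chooses between two ways to satisfy the grouped-GEMM alignment requirement. A minimal sketch of that choice, assuming set_token_group_alignment_size_m is importable from the torchao moe_training prototype (the exact module path is an assumption):

# Mirrors the decision made in test_moe_training above; import path is an assumption.
from torchao.prototype.moe_training.utils import set_token_group_alignment_size_m

token_groups_aligned = False  # the new parametrization currently only covers False
use_mxfp8 = True

if token_groups_aligned:
    # Pre-aligned groups: every group size is already a multiple of the quantization
    # block size (32 for MXFP8, 16 for FP8 rowwise), e.g. via HybridEP-style dispatch.
    set_token_group_alignment_size_m(32 if use_mxfp8 else 16)
else:
    # Unaligned groups: alignment stays 1 and the MXFP8 op config must pad token groups
    # itself, which is why the test sets config.pad_token_groups_for_grouped_mm = True.
    set_token_group_alignment_size_m(1)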

torchao/prototype/moe_training/config.py

Lines changed: 3 additions & 3 deletions
@@ -117,23 +117,23 @@ def from_recipe(
                 out_dtype=torch.bfloat16,
                 wgrad_with_hp=False,
                 scale_calculation_mode=ScaleCalculationMode.RCEIL,
-                pad_token_groups_for_grouped_mm=True,
+                pad_token_groups_for_grouped_mm=False,
             )
         elif recipe == MXFP8TrainingRecipe.MXFP8_RCEIL_WGRAD_WITH_HP:
             return cls(
                 kernel_preference=KernelPreference.AUTO,
                 out_dtype=torch.bfloat16,
                 wgrad_with_hp=True,
                 scale_calculation_mode=ScaleCalculationMode.RCEIL,
-                pad_token_groups_for_grouped_mm=True,
+                pad_token_groups_for_grouped_mm=False,
             )
         elif recipe == MXFP8TrainingRecipe.MXFP8_EMULATED_RCEIL:
             return cls(
                 kernel_preference=KernelPreference.EMULATED,
                 out_dtype=torch.bfloat16,
                 wgrad_with_hp=False,
                 scale_calculation_mode=ScaleCalculationMode.RCEIL,
-                pad_token_groups_for_grouped_mm=True,
+                pad_token_groups_for_grouped_mm=False,
             )
         else:
             raise ValueError(f"Unsupported MXFP8 recipe: {recipe}")
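
With the recipe defaults above flipped to False, from_recipe no longer enables padding, so callers whose token groups are not pre-aligned have to opt back in explicitly. A minimal sketch of that opt-in, assuming the config and recipe classes are importable from torchao.prototype.moe_training.config (the file modified above):

# Import path is an assumption based on the file modified in this commit.
from torchao.prototype.moe_training.config import (
    MXFP8TrainingOpConfig,
    MXFP8TrainingRecipe,
)

config = MXFP8TrainingOpConfig.from_recipe(MXFP8TrainingRecipe.MXFP8_EMULATED_RCEIL)
assert config.pad_token_groups_for_grouped_mm is False  # new default as of this commit
config.pad_token_groups_for_grouped_mm = True           # explicit opt-in for unaligned groups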
