Skip to content

Commit c0da952

Browse files
[mxfp8 moe training] _permute_bf16 -> permute_and_pad (pytorch#4083)
[mxfp8 moe training] _permute_bf16 -> permute_and_pad
1 parent eb64bfb commit c0da952

5 files changed

Lines changed: 11 additions & 11 deletions

File tree

benchmarks/prototype/moe_training/mxfp8/bench_ep_pipeline.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@
3636
permute_mxfp8_fwd_hp_bwd,
3737
unpermute_hp_fwd_mxfp8_bwd,
3838
)
39-
from torchao.prototype.moe_training.ep.permute import _permute_bf16
39+
from torchao.prototype.moe_training.ep.permute import permute_and_pad
4040
from torchao.prototype.moe_training.ep.unpermute import _unpermute_bf16
4141
from torchao.prototype.moe_training.mxfp8_grouped_mm import (
4242
_to_mxfp8_then_scaled_grouped_mm,
@@ -144,7 +144,7 @@ def standard_pipeline(
144144

145145
# Step 2: Permute (BF16)
146146
input_shape, permuted, permuted_indices, num_tokens_per_expert_padded, offsets = (
147-
_permute_bf16(
147+
permute_and_pad(
148148
dispatched,
149149
num_tokens_per_expert_group,
150150
ep_degree,

test/prototype/moe_training/ep/test_compile.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@
3737
permute_mxfp8_fwd_hp_bwd,
3838
unpermute_hp_fwd_mxfp8_bwd,
3939
)
40-
from torchao.prototype.moe_training.ep.permute import _permute_bf16
40+
from torchao.prototype.moe_training.ep.permute import permute_and_pad
4141
from torchao.prototype.moe_training.ep.unpermute import _unpermute_bf16
4242
from torchao.prototype.moe_training.mxfp8_grouped_mm import (
4343
_to_mxfp8_then_scaled_grouped_mm,
@@ -72,7 +72,7 @@ def standard_pipeline(
7272

7373
# Step 2: Permute (BF16)
7474
input_shape, permuted, permuted_indices, num_tokens_per_expert_padded, offsets = (
75-
_permute_bf16(
75+
permute_and_pad(
7676
dispatched,
7777
num_tokens_per_expert_group,
7878
ep_degree,

test/prototype/moe_training/ep/test_integration.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@
3636
permute_mxfp8_fwd_hp_bwd,
3737
unpermute_hp_fwd_mxfp8_bwd,
3838
)
39-
from torchao.prototype.moe_training.ep.permute import _permute_bf16
39+
from torchao.prototype.moe_training.ep.permute import permute_and_pad
4040
from torchao.prototype.moe_training.ep.unpermute import _unpermute_bf16
4141
from torchao.prototype.moe_training.mxfp8_grouped_mm import (
4242
_to_mxfp8_then_scaled_grouped_mm,
@@ -181,7 +181,7 @@ def test_full_pipeline(self):
181181
bf16_permuted_indices,
182182
bf16_num_tokens_per_expert_padded,
183183
bf16_group_offsets,
184-
) = _permute_bf16(
184+
) = permute_and_pad(
185185
bf16_dispatched,
186186
num_tokens_per_expert_group,
187187
ep_degree,

test/prototype/moe_training/ep/test_permute.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
pytest.skip("Test requires CUDA 12.8+ with SM >= 100", allow_module_level=True)
1212

1313
from torchao.prototype.moe_training.ep import permute_mxfp8_fwd_hp_bwd
14-
from torchao.prototype.moe_training.ep.permute import _permute_bf16
14+
from torchao.prototype.moe_training.ep.permute import permute_and_pad
1515
from torchao.prototype.mx_formats.mx_tensor import MXTensor
1616
from torchao.quantization.utils import compute_error
1717

@@ -57,7 +57,7 @@ def test_mxfp8_permute_forward():
5757
_,
5858
_,
5959
_,
60-
) = _permute_bf16(
60+
) = permute_and_pad(
6161
input_tensor,
6262
num_tokens_per_expert,
6363
ep_degree,

torchao/prototype/moe_training/ep/permute.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -164,16 +164,16 @@ def backward(
164164
return grad_input, None, None, None, None, None
165165

166166

167-
# Reference impl for testing
168-
def _permute_bf16(
167+
def permute_and_pad(
169168
x: torch.Tensor,
170169
num_tokens_per_expert: torch.Tensor,
171170
ep_degree: int,
172171
num_local_experts: int,
173172
alignment: int,
174173
):
175174
"""
176-
BF16 permute operation used for testing and benchmarking.
175+
Permute token groups from rank-major to expert-major order, and pad group sizes to alignment size,
176+
in preparation for grouped GEMM.
177177
178178
Args:
179179
x: BF16 input tensor

0 commit comments

Comments (0)