 from torchao.prototype.moe_training.kernels.mxfp8 import (
     mx_block_rearrange_2d_M_groups_cuda,
     torch_to_blocked_2d_M_groups,
-    torch_to_blocked_per_group_3d,
     triton_mx_block_rearrange_2d_K_groups,
     triton_mx_block_rearrange_per_group_3d,
 )
@@ -238,33 +237,27 @@ def compute_mxfp8_2d_2d_gemm_time(self, N, M, K):
         return time_s

     def compute_mxfp8_fwd_bwd_time(self, M, K, N, G):
-        """Compute time for MXFP8 forward + backward pass including scale rearrangement overhead"""
+        """Compute time for MXFP8 forward + backward pass."""
         block_size = 32

         # Forward: (M, K) @ (G, K, N)^T -> (M, N) [2D-3D]
         fwd_quant_time = self.compute_mxfp8_fwd_quant_time(M, K, G, N)
         # Forward scale rearrangement:
         # - Input scales (M, K//32) -> M-groups rearrangement
-        # - Weight scales (G, N, K//32) -> 3D per-group rearrangement
+        # - Weight scales are emitted directly in blocked layout by the 3D kernel
         fwd_input_scale_rearrange_time = self.compute_rearrange_2d_M_groups_time(
             M, K // block_size
         )
-        fwd_weight_scale_rearrange_time = self.compute_rearrange_3d_per_group_time(
-            G, N, K // block_size
-        )
         fwd_gemm_time = self.compute_mxfp8_2d_3d_gemm_time(M, K, N)

         # Backward input: (M, N) @ (G, N, K) -> (M, K) [2D-3D]
         bwd_input_quant_time = self.compute_mxfp8_bwd_input_quant_time(M, K, G, N)
         # Backward input scale rearrangement:
         # - grad_output scales (M, N//32) -> M-groups rearrangement
-        # - Weight scales (G, K, N//32) -> 3D per-group rearrangement (transposed weight)
+        # - Weight scales are emitted directly in blocked layout by the 3D kernel
         bwd_input_grad_scale_rearrange_time = self.compute_rearrange_2d_M_groups_time(
             M, N // block_size
         )
-        bwd_input_weight_scale_rearrange_time = (
-            self.compute_rearrange_3d_per_group_time(G, K, N // block_size)
-        )
         bwd_input_gemm_time = self.compute_mxfp8_2d_3d_gemm_time(M, N, K)

         # Backward weight: (N, M) @ (M, K) -> G separate (N, K) [2D-2D]
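The three stages named in the comments above follow the usual token-grouped MoE GEMM shapes. Below is a minimal, self-contained sketch (hypothetical sizes and group offsets; plain bf16 matmuls stand in for the MXFP8 grouped kernels) that checks the forward (M, K) x (G, K, N)^T, backward-input (M, N) x (G, N, K), and per-group backward-weight (N, M_g) x (M_g, K) bookkeeping:

```python
import torch

# Hypothetical sizes and cumulative group offsets along M (not from the benchmark).
G, M, K, N = 4, 1024, 512, 256
offs = torch.tensor([256, 512, 768, 1024])

x = torch.randn(M, K, dtype=torch.bfloat16)      # activations, grouped along M
w = torch.randn(G, N, K, dtype=torch.bfloat16)   # per-expert weights

group_sizes = torch.diff(offs, prepend=torch.tensor([0])).tolist()
outs, grad_ws = [], []
for g, x_g in enumerate(torch.split(x, group_sizes, dim=0)):
    out_g = x_g @ w[g].t()      # forward:         (M_g, K) @ (K, N) -> (M_g, N)
    # out_g is reused below as a stand-in grad_output, purely for shape checking
    grad_x_g = out_g @ w[g]     # backward input:  (M_g, N) @ (N, K) -> (M_g, K)
    grad_w_g = out_g.t() @ x_g  # backward weight: (N, M_g) @ (M_g, K) -> (N, K)
    outs.append(out_g)
    grad_ws.append(grad_w_g)

assert torch.cat(outs).shape == (M, N)
assert grad_ws[0].shape == (N, K)
```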
@@ -283,11 +276,9 @@ def compute_mxfp8_fwd_bwd_time(self, M, K, N, G):
         total_time = (
             fwd_quant_time
             + fwd_input_scale_rearrange_time
-            + fwd_weight_scale_rearrange_time
             + fwd_gemm_time
             + bwd_input_quant_time
             + bwd_input_grad_scale_rearrange_time
-            + bwd_input_weight_scale_rearrange_time
             + bwd_input_gemm_time
             + bwd_weight_quant_time
             + bwd_weight_grad_scale_rearrange_time
@@ -441,8 +432,6 @@ def benchmark_mxfp8_grouped_mm_fwd_bwd(x, w_t, offs, labels):
     x_clone = x.clone().requires_grad_(True)
     w_t_clone = w_t.clone().requires_grad_(True)

-    fn = torch.compile(_to_mxfp8_then_scaled_grouped_mm, fullgraph=True)
-
     # Set all parameters explicitly as variables for positional args
     A = x_clone
     B_t = w_t_clone
@@ -453,7 +442,7 @@ def benchmark_mxfp8_grouped_mm_fwd_bwd(x, w_t, offs, labels):
     scale_calculation_mode = MoEScaleCalculationMode.RCEIL

     def wrapper():
-        out = fn(
+        out = _to_mxfp8_then_scaled_grouped_mm(
             A,
             B_t,
             offs_arg,
@@ -492,7 +481,7 @@ def benchmark_to_mxfp8_dim1_cuda(tensor, block_size=32):


 def benchmark_mxfp8_quantize_cuda_3d(tensor, block_size=32):
-    """Benchmark mxfp8_quantize_cuda_3d kernel"""
+    """Benchmark the 3D 32x1 quantizer on its input tensor."""
     return benchmark_cuda_function_in_microseconds(
         lambda: mxfp8_quantize_cuda_3d(
             tensor,
@@ -715,7 +704,7 @@ def run(
     # 3. 3D Quantization Kernel Analysis
     # =============================================================================
     print("\n" + "=" * 80)
-    print("3D QUANTIZATION KERNELS (Backward Pass - Weight Quantization)")
+    print("3D QUANTIZATION KERNELS (Direct Transposed-Weight Quantization)")
     print("=" * 80)

     quant_3d_results = []
@@ -741,8 +730,10 @@ def run(

         print(f"\nBenchmarking {desc}...")

-        # Create test tensor
-        tensor = torch.randn(G_val, N_val, K_val, dtype=torch.bfloat16, device="cuda")
+        # Benchmark the direct grouped-GEMM weight contract: w_t has shape
+        # (G, K, N), and the existing 3D 32x1 kernel quantizes it directly.
+        weight = torch.randn(G_val, N_val, K_val, dtype=torch.bfloat16, device="cuda")
+        tensor = weight.transpose(-2, -1)

         # Benchmark mxfp8_quantize_cuda_3d
         cuda_3d_time_us = benchmark_mxfp8_quantize_cuda_3d(tensor)
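A quick sketch (assumed sizes) of what the benchmark now hands to the 3D quantizer: the (G, N, K) weight is passed as a transposed (G, K, N) view, so the kernel consumes the grouped-GEMM weight contract directly without an intermediate copy:

```python
import torch

# Assumed sizes for illustration; the benchmark sweeps its own G/N/K values.
G_val, N_val, K_val = 8, 8192, 5120

weight = torch.randn(G_val, N_val, K_val, dtype=torch.bfloat16)
tensor = weight.transpose(-2, -1)

print(tensor.shape)                            # torch.Size([8, 5120, 8192]) == (G, K, N)
print(tensor.is_contiguous())                  # False: a strided view, not a copy
print(tensor.data_ptr() == weight.data_ptr())  # True: same underlying storage
```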
@@ -1030,21 +1021,25 @@ def run(
10301021 f" BF16 Grouped GEMM: Roofline={ model .bf16_tflops :.1f} TFLOPS, Actual={ bf16_actual_tflops :.1f} TFLOPS, Efficiency={ result_dict ['bf16_tflops_efficiency_pct' ]:.1f} %"
10311022 )
10321023
1033- # Convert to MXFP8 format using triton_to_mxfp8_dim0 (blocks along dim0)
1024+ # Convert activations to MXFP8 format using triton_to_mxfp8_dim0
10341025 x_fp8 , x_scales = triton_to_mxfp8_dim0 (x , inner_block_size = 32 )
1035- w_fp8 , w_scales = triton_to_mxfp8_dim0 (
1036- w_t .transpose (- 2 , - 1 ), inner_block_size = 32
1026+ w_fp8 , w_scales_blocked = mxfp8_quantize_cuda_3d (
1027+ w_t ,
1028+ block_size = 32 ,
1029+ scale_block_n = 32 ,
1030+ scale_block_k = 1 ,
1031+ scaling_mode = "rceil" ,
10371032 )
10381033
1039- # Convert scales to blocked format
1034+ # Convert only activation scales to blocked format. Weight scales are
1035+ # already produced in blocked layout by mxfp8_quantize_cuda_3d.
10401036 x_scales_blocked , _ = torch_to_blocked_2d_M_groups (
10411037 x_scales , offs , block_size = 32
10421038 )
1043- w_scales_blocked = torch_to_blocked_per_group_3d (w_scales )
10441039
10451040 # Benchmark the MXFP8 grouped GEMM kernel
10461041 mxfp8_gemm_time_us = benchmark_mxfp8_grouped_gemm (
1047- x_fp8 , w_fp8 . transpose ( - 2 , - 1 ) , x_scales_blocked , w_scales_blocked , offs
1042+ x_fp8 , w_fp8 , x_scales_blocked , w_scales_blocked , offs
10481043 )
10491044
10501045 # Calculate MXFP8 actual TFLOPS
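For context on the `scaling_mode="rceil"` argument above, here is a hedged per-block sketch of MXFP8 scaling with a round-to-ceiling exponent: one power-of-two scale per 32 values, chosen so the scaled block stays within float8_e4m3 range. This illustrates the general scheme only, not the torchao CUDA kernel's implementation:

```python
import torch

F8_E4M3_MAX = 448.0  # largest finite float8_e4m3fn value

def quantize_block_rceil(block: torch.Tensor):
    """Toy per-block MXFP8 quantization (ignores the amax == 0 edge case)."""
    amax = block.abs().amax()
    # "rceil": round the scale exponent up so no element overflows after scaling
    exp = torch.ceil(torch.log2(amax / F8_E4M3_MAX))
    scale = torch.exp2(exp)  # power-of-two scale, as in the E8M0 shared-exponent format
    return (block / scale).to(torch.float8_e4m3fn), scale

block = torch.randn(32, dtype=torch.float32) * 10
q, scale = quantize_block_rceil(block)
print(scale, q.dtype)  # e.g. tensor(0.1250) torch.float8_e4m3fn
```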
@@ -1068,7 +1063,7 @@ def run(
         grouped_gemm_results.append(result_dict)

         # Clean up tensors to free GPU memory
-        del x, w, w_t, offs, x_fp8, x_scales, w_fp8, w_scales
+        del x, w, w_t, offs, x_fp8, x_scales, w_fp8
         del x_scales_blocked, w_scales_blocked
         torch.cuda.empty_cache()

@@ -1436,7 +1431,8 @@ def run(
     # Input quantization: use triton_to_mxfp8_dim0 for (M, K)
     fwd_input_quant_ms = df_quant_2d.loc[idx_large, "triton_to_mxfp8_dim0_us"] / 1000

-    # Weight quantization: use mxfp8_quantize_cuda_3d for (G, N, K)
+    # Weight quantization: use mxfp8_quantize_cuda_3d directly on w_t, shape
+    # (G, K, N), with no separate 3D scale rearrangement step.
     idx_3d_large = df_quant_3d[df_quant_3d["description"] == f"M={M_large}"].index[0]
     fwd_weight_quant_ms = (
         df_quant_3d.loc[idx_3d_large, "mxfp8_quantize_cuda_3d_us"] / 1000
@@ -1450,14 +1446,8 @@ def run(
         df_rearrange.loc[idx_m_groups, "mx_block_rearrange_2d_M_groups_cuda_us"] / 1000
     )

-    # Weight scale rearrangement: 3D per-group for (G, N, K//32)
-    idx_3d_rearrange = df_rearrange_3d[df_rearrange_3d["M"] == M_large].index[0]
-    fwd_weight_scale_rearrange_ms = (
-        df_rearrange_3d.loc[
-            idx_3d_rearrange, "triton_mx_block_rearrange_per_group_3d_us"
-        ]
-        / 1000
-    )
+    # Weight scales are emitted directly in blocked layout by the 3D quantizer.
+    fwd_weight_scale_rearrange_ms = 0.0

     # GEMM: use actual MXFP8 2D/3D grouped GEMM time
     idx_gemm = df_grouped_gemm[df_grouped_gemm["M"] == M_large].index[0]