pytorch
diff --git a/benchmarks/prototype/moe_training/mxfp8/bench_quantize_3d.py b/benchmarks/prototype/moe_training/mxfp8/bench_quantize_3d.py
Lines changed: 54 additions & 22 deletions
diff --git a/benchmarks/prototype/moe_training/mxfp8/roofline_unified.py b/benchmarks/prototype/moe_training/mxfp8/roofline_unified.py
Lines changed: 5 additions & 1 deletion
diff --git a/test/prototype/moe_training/test_kernels.py b/test/prototype/moe_training/test_kernels.py
Lines changed: 89 additions & 24 deletions
@@ -31,6 +31,7 @@
 class ExperimentConfig:
     input_shape: tuple[int]
     scaling_mode: ScaleCalculationMode
+    scale_block_k: int
 
 
 @dataclass(frozen=True)
@@ -62,19 +63,24 @@ def get_configs() -> List[ExperimentConfig]:
         (32, 8192, 5120),
     ]
     round_modes = [ScaleCalculationMode.FLOOR, ScaleCalculationMode.RCEIL]
+    scale_block_ks = [1, 32]
     configs = []
-    for shape, scaling_mode in itertools.product(input_shapes, round_modes):
+    for shape, scaling_mode, scale_block_k in itertools.product(
+        input_shapes, round_modes, scale_block_ks
+    ):
         configs.append(
             ExperimentConfig(
                 input_shape=shape,
                 scaling_mode=scaling_mode,
+                scale_block_k=scale_block_k,
             )
         )
     return configs
 
 
 def run_experiment(config: ExperimentConfig) -> ExperimentResult:
     block_size = 32
+    scale_block_k = config.scale_block_k
     input_shape = config.input_shape
     input_tensor = torch.randn(
         *input_shape,
@@ -83,20 +89,37 @@ def run_experiment(config: ExperimentConfig) -> ExperimentResult:
     )
 
     def using_to_mx(x: torch.Tensor) -> torch.Tensor:
-        # Reference implementation
-        s_d1_ref, y_d1_ref = to_mx(
-            # Transpose (E,N,K) to (E,K,N) so N is final dim,
-            # since to_mx scales along that dim
-            x.transpose(-2, -1).contiguous(),
+        if scale_block_k == 1:
+            s_ref, y_ref = to_mx(
+                x.transpose(-2, -1).contiguous(),
+                elem_dtype=torch.float8_e4m3fn,
+                block_size=block_size,
+            )
+            return y_ref.transpose(-2, -1), s_ref.transpose(-2, -1)
+
+        assert scale_block_k == 32
+        E, N, K = x.shape
+        x_tiles = (
+            x.view(E, N // block_size, block_size, K // block_size, block_size)
+            .permute(0, 1, 3, 2, 4)
+            .contiguous()
+            .view(E, N // block_size, K // block_size, block_size * block_size)
+        )
+        s_ref, y_tiles_ref = to_mx(
+            x_tiles,
             elem_dtype=torch.float8_e4m3fn,
-            block_size=block_size,
+            block_size=block_size * block_size,
         )
-
-        # Transpose tensors and scales back so we have effectively
-        # quantized input shape (E, N, K) along N
-        y_d1_ref = y_d1_ref.transpose(-2, -1)
-        s_d1_ref = s_d1_ref.transpose(-2, -1)
-        return y_d1_ref, s_d1_ref
+        y_ref = (
+            y_tiles_ref.view(
+                E, N // block_size, K // block_size, block_size, block_size
+            )
+            .permute(0, 1, 3, 2, 4)
+            .contiguous()
+            .view(E, N, K)
+        )
+        y_ref = y_ref.transpose(-2, -1).contiguous().transpose(-2, -1)
+        return y_ref, s_ref
 
     # bench to_mx
     using_to_mx_c = torch.compile(using_to_mx)
@@ -106,26 +129,33 @@ def using_to_mx(x: torch.Tensor) -> torch.Tensor:
         input_tensor,
     )
 
-    # bench 2d dim1 kernel then transforming to col major
-    using_cuda_2d_c = torch.compile(_to_mxfp8_dim1_3d)
-    scales_cuda_2d, data_cuda_2d = using_cuda_2d_c(input_tensor)
-    time_cuda_2d_us = benchmark_cuda_function_in_microseconds(
-        using_cuda_2d_c,
-        input_tensor,
-        block_size=block_size,
-        scaling_mode=config.scaling_mode,
-    )
+    if scale_block_k == 1:
+        # bench 2d dim1 kernel then transforming to col major
+        using_cuda_2d_c = torch.compile(_to_mxfp8_dim1_3d)
+        using_cuda_2d_c(input_tensor)
+        time_cuda_2d_us = benchmark_cuda_function_in_microseconds(
+            using_cuda_2d_c,
+            input_tensor,
+            block_size=block_size,
+            scaling_mode=config.scaling_mode,
+        )
+    else:
+        time_cuda_2d_us = float("nan")
 
     # bench 3d CuTeDSL kernel
     data_cuda_3d, scales_cuda_3d = mxfp8_quantize_cuda_3d(
         input_tensor,
         block_size=block_size,
+        scale_block_n=block_size,
+        scale_block_k=scale_block_k,
         scaling_mode=str(config.scaling_mode.value),
     )
     time_cutedsl_3d_us = benchmark_cuda_function_in_microseconds(
         mxfp8_quantize_cuda_3d,
         input_tensor,
         block_size=block_size,
+        scale_block_n=block_size,
+        scale_block_k=scale_block_k,
         scaling_mode=str(config.scaling_mode.value),
     )
 
@@ -159,6 +189,7 @@ def print_results(experiments: List[Experiment]):
     headers = [
         "input_shape",
         "scaling_mode",
+        "scale_block_k",
         "cuda_2d_us",
         "cutedsl_3d_us",
         "to_mx_us",
@@ -172,6 +203,7 @@ def print_results(experiments: List[Experiment]):
             [
                 str(experiment.config.input_shape),
                 str(experiment.config.scaling_mode),
+                str(experiment.config.scale_block_k),
                 experiment.result.cuda_2d_us,
                 experiment.result.cutedsl_3d_us,
                 experiment.result.to_mx_us,
 
@@ -495,7 +495,11 @@ def benchmark_mxfp8_quantize_cuda_3d(tensor, block_size=32):
     """Benchmark mxfp8_quantize_cuda_3d kernel"""
     return benchmark_cuda_function_in_microseconds(
         lambda: mxfp8_quantize_cuda_3d(
-            tensor, block_size=block_size, scaling_mode="rceil"
+            tensor,
+            block_size=block_size,
+            scale_block_n=block_size,
+            scale_block_k=1,
+            scaling_mode="rceil",
         )
     )
 
 
@@ -392,7 +392,12 @@ def test_triton_mx_block_rearrange_2d_K_groups(
 @pytest.mark.parametrize(
     "scaling_mode", (ScaleCalculationMode.FLOOR, ScaleCalculationMode.RCEIL)
 )
-def test_cuda_mx_dim1_3d_numerics(E, N, K, input_dtype, scaling_mode):
+@pytest.mark.parametrize(
+    "scale_block_k",
+    (1, 32),
+    ids=("32x1", "32x32"),
+)
+def test_cuda_mx_3d_cutedsl_numerics(E, N, K, input_dtype, scaling_mode, scale_block_k):
     if not _mxfp8_cutedsl_kernels_available:
         pytest.skip("mxfp8_quantize_3d is unavailable")
 
@@ -408,37 +413,97 @@ def test_cuda_mx_dim1_3d_numerics(E, N, K, input_dtype, scaling_mode):
         .contiguous()
     )
 
-    # Reference implementation
-    s_d1_ref, y_d1_ref = to_mx(
-        # Transpose so N is final dim, since to_mx scales along that dim
-        x.transpose(-2, -1).contiguous(),
-        elem_dtype=torch.float8_e4m3fn,
-        block_size=block_size,
-        scaling_mode=scaling_mode,
-    )
+    if scale_block_k == 1:
+        s_ref, y_ref = to_mx(
+            x.transpose(-2, -1).contiguous(),
+            elem_dtype=torch.float8_e4m3fn,
+            block_size=block_size,
+            scaling_mode=scaling_mode,
+        )
+        y_ref = y_ref.transpose(-2, -1)
+        s_ref = s_ref.transpose(-2, -1)
+        s_rows, s_cols = K, N // block_size
+        undo_scale = (
+            lambda scale: from_blocked(scale, s_rows, s_cols)
+            .transpose(-2, -1)
+            .contiguous()
+        )
+    else:
+        x_tiles = (
+            x.view(E, N // block_size, block_size, K // block_size, block_size)
+            .permute(0, 1, 3, 2, 4)
+            .contiguous()
+            .view(E, N // block_size, K // block_size, block_size * block_size)
+        )
+        s_ref, y_tiles_ref = to_mx(
+            x_tiles,
+            elem_dtype=torch.float8_e4m3fn,
+            block_size=block_size * block_size,
+            scaling_mode=scaling_mode,
+        )
+        s_ref = s_ref.squeeze(-1)
+        y_ref = (
+            y_tiles_ref.view(
+                E, N // block_size, K // block_size, block_size, block_size
+            )
+            .permute(0, 1, 3, 2, 4)
+            .contiguous()
+            .view(E, N, K)
+        )
+        y_ref = y_ref.transpose(-2, -1).contiguous().transpose(-2, -1)
+        s_rows, s_cols = K, N // block_size
+        undo_scale = lambda scale: from_blocked(scale, s_rows, s_cols)[
+            ::block_size
+        ].transpose(-2, -1)
 
-    # Transpose tensors and scales back so we have effectively
-    # quantized input shape (E, N, K) along N
-    y_d1_ref = y_d1_ref.transpose(-2, -1)
-    s_d1_ref = s_d1_ref.transpose(-2, -1)
-    y_d1, s_d1 = mxfp8_quantize_cuda_3d(
+    y, s = mxfp8_quantize_cuda_3d(
         x,
         block_size=block_size,
+        scale_block_n=block_size,
+        scale_block_k=scale_block_k,
         scaling_mode=scaling_mode_str,
+        blocked_scale_output=True,
+    )
+    if scale_block_k == 32:
+        s_blocked_full = (
+            torch.stack(
+                [
+                    from_blocked(s[e], s_rows, s_cols).view(torch.uint8)
+                    for e in range(E)
+                ],
+                dim=0,
+            )
+            .view(torch.float8_e8m0fnu)
+            .to(s_ref.dtype)
+        )
+        s_ref_replicated = s_ref.transpose(-2, -1).repeat_interleave(block_size, dim=1)
+        torch.testing.assert_close(s_blocked_full, s_ref_replicated, rtol=0, atol=0)
+    s = (
+        torch.stack([undo_scale(s[e]).view(torch.uint8) for e in range(E)], dim=0)
+        .view(torch.float8_e8m0fnu)
+        .to(s_ref.dtype)
     )
-    s_d1 = torch.stack(
-        [
-            from_blocked(s_d1[e], K, N // block_size).transpose(-2, -1).contiguous()
-            for e in range(E)
-        ],
-        dim=0,
-    ).to(s_d1_ref.dtype)
     # Check scales
-    torch.testing.assert_close(s_d1, s_d1_ref, rtol=0, atol=0)
+    torch.testing.assert_close(s, s_ref, rtol=0, atol=0)
 
     # Check quantized values
-    torch.testing.assert_close(y_d1, y_d1_ref, rtol=0, atol=0)
-    assert y_d1.stride() == y_d1_ref.stride(), "quantized tensor strides do not match"
+    torch.testing.assert_close(y, y_ref, rtol=0, atol=0)
+    assert y.stride() == y_ref.stride(), "quantized tensor strides do not match"
+
+    y_unblocked, s_unblocked = mxfp8_quantize_cuda_3d(
+        x,
+        block_size=block_size,
+        scale_block_n=block_size,
+        scale_block_k=scale_block_k,
+        scaling_mode=scaling_mode_str,
+        blocked_scale_output=False,
+    )
+    s_unblocked = s_unblocked.to(s_ref.dtype)
+    torch.testing.assert_close(s_unblocked, s_ref, rtol=0, atol=0)
+    torch.testing.assert_close(y_unblocked, y_ref, rtol=0, atol=0)
+    assert y_unblocked.stride() == y_ref.stride(), (
+        "unblocked quantized tensor strides do not match"
+    )
 
 
 @pytest.mark.skipif(
Original file line number	Diff line number	Diff line change
`@@ -495,7 +495,11 @@ def benchmark_mxfp8_quantize_cuda_3d(tensor, block_size=32):`
`495`	`495`	`"""Benchmark mxfp8_quantize_cuda_3d kernel"""`
`496`	`496`	`return benchmark_cuda_function_in_microseconds(`
`497`	`497`	`lambda: mxfp8_quantize_cuda_3d(`
`498`		`- tensor, block_size=block_size, scaling_mode="rceil"`
	`498`	`+ tensor,`
	`499`	`+ block_size=block_size,`
	`500`	`+ scale_block_n=block_size,`
	`501`	`+ scale_block_k=1,`
	`502`	`+ scaling_mode="rceil",`
`499`	`503`	`)`
`500`	`504`	`)`
`501`	`505`