[mxfp8 training] on-device validation of group sizes in cutedsl quant kernels (#4253)

danielvegamyhre · web-flow · commit 4c6dbea396aa · 2026-04-13T20:31:55.000-07:00
diff --git a/benchmarks/prototype/moe_training/mxfp8/bench_cutedsl_quantize_2d_1x32.py b/benchmarks/prototype/moe_training/mxfp8/bench_cutedsl_quantize_2d_1x32.py
@@ -94,9 +94,7 @@ def run_experiment(config: ExperimentConfig) -> ExperimentResult:
 
     # Generate jagged offsets with multiples of 128
     # TODO: we use multiple of 128 here to avoid per-group padding requirement in blocked scales layout, which cutedsl doesn't support yet.
-    group_end_offsets = generate_jagged_offs(
-        num_groups, M, multiple_of=128, device=device
-    )
+    offs = generate_jagged_offs(num_groups, M, multiple_of=128, device=device)
 
     # Benchmark 1: CuTeDSL kernel with blocked scale output
     data_cutedsl, scales_cutedsl = mxfp8_quantize_cutedsl_2d_1x32(
@@ -127,11 +125,11 @@ def triton_plus_rearrange(x, group_offs):
         )
         return data, scales_blocked
 
-    data_triton, scales_triton = triton_plus_rearrange(input_tensor, group_end_offsets)
+    data_triton, scales_triton = triton_plus_rearrange(input_tensor, offs)
     triton_plus_rearrange_time_us = benchmark_cuda_function_in_microseconds(
         triton_plus_rearrange,
         input_tensor,
-        group_end_offsets,
+        offs,
     )
 
     # Memory bandwidth calculations
diff --git a/benchmarks/prototype/moe_training/mxfp8/bench_cutedsl_quantize_2d_32x1.py b/benchmarks/prototype/moe_training/mxfp8/bench_cutedsl_quantize_2d_32x1.py
@@ -94,9 +94,7 @@ def run_experiment(config: ExperimentConfig) -> ExperimentResult:
 
     # Generate jagged offsets with multiples of 128 for K dimension
     # TODO: we use multiple of 128 here to avoid per-group padding requirement in blocked scales layout, which cutedsl doesn't support yet.
-    group_end_offsets = generate_jagged_offs(
-        num_groups, K, multiple_of=128, device=device
-    )
+    offs = generate_jagged_offs(num_groups, K, multiple_of=128, device=device)
 
     # Benchmark 1: CuTeDSL kernel with blocked scale output
     data_cutedsl, scales_cutedsl = mxfp8_quantize_cutedsl_2d_32x1(
@@ -128,11 +126,11 @@ def cuda_plus_rearrange(x, group_offs):
         )
         return output_colwise, scales_blocked
 
-    data_cuda, scales_cuda = cuda_plus_rearrange(input_tensor, group_end_offsets)
+    data_cuda, scales_cuda = cuda_plus_rearrange(input_tensor, offs)
     cuda_plus_rearrange_time_us = benchmark_cuda_function_in_microseconds(
         cuda_plus_rearrange,
         input_tensor,
-        group_end_offsets,
+        offs,
     )
 
     # Memory bandwidth calculations
diff --git a/test/prototype/moe_training/test_kernels.py b/test/prototype/moe_training/test_kernels.py
@@ -65,6 +65,8 @@ def _is_sm_10x() -> bool:
 from torchao.prototype.mx_formats.utils import from_blocked
 from torchao.testing.utils import skip_if_rocm
 
+from .testing_utils import generate_split_sizes
+
 
 @pytest.mark.parametrize("round_scales_to_power_of_2", [True, False])
 def test_row_major_with_jagged_rowwise_scales(round_scales_to_power_of_2: bool):
@@ -636,8 +638,8 @@ def test_cuda_fused_unpad_token_groups(
     )
 
     # First pad the tokens to create padded inputs
-    padded_tokens, padded_group_start_offsets, padded_group_end_offsets = (
-        torch_pad_token_groups(inputs, group_offsets, alignment_size)
+    padded_tokens, padded_group_start_offsets, padded_offsets = torch_pad_token_groups(
+        inputs, group_offsets, alignment_size
     )
 
     # Get reference output using torch implementation
@@ -704,3 +706,115 @@ def test_triton_fp8_rowwise_2d_scale_and_cast(
     assert ref_scales.shape == triton_scales.shape, "scale shapes not equal"
     assert torch.allclose(ref_fp8, triton_fp8, rtol=0, atol=0), "fp8 data not equal"
     assert torch.allclose(ref_scales, triton_scales, rtol=0, atol=0), "scales not equal"
+
+
+@pytest.mark.skipif(
+    not _is_sm_10x(),
+    reason="MXFP8 requires CUDA SM 10.x",
+)
+@pytest.mark.skipif(
+    not _mxfp8_cutedsl_kernels_available,
+    reason="MXFP8 cutedsl kernels not available",
+)
+@skip_if_rocm("ROCm enablement in progress")
+def test_cutedsl_1x32_group_validation_error():
+    """Test that 1x32 CuTeDSL kernel raises error for non-128-multiple group sizes."""
+    device = "cuda"
+    M, K = 512, 1024
+    x = torch.randn(M, K, dtype=torch.bfloat16, device=device)
+    num_groups = 4
+
+    # Generate group sizes and guarantee the first one is not a multiple of 128
+    group_sizes = generate_split_sizes(num_groups, M, device)
+    if group_sizes[0] % 128 == 0:
+        group_sizes[0] = group_sizes[0] - 1  # Make it not a multiple of 128 (NOTE(review): gives -1 if this size can be 0 — confirm)
+        group_sizes[1] = group_sizes[1] + 1  # Compensate to maintain total sum
+
+    invalid_offsets = torch.cumsum(group_sizes, dim=0, dtype=torch.int32)
+
+    # The failed device-side assert is expected to reach the host as a RuntimeError carrying the generic CUDA launch-failure text
+    with pytest.raises(
+        RuntimeError,
+        match=r"unspecified launch failure",
+    ):
+        _ = mxfp8_quantize_2d_1x32_cutedsl(
+            x, block_size=32, scaling_mode="rceil", offs=invalid_offsets
+        )
+        # Synchronize so a device error not raised by the call itself still surfaces inside this block
+        torch.cuda.synchronize()
+
+
+@pytest.mark.skipif(
+    not _is_sm_10x(),
+    reason="MXFP8 requires CUDA SM 10.x",
+)
+@pytest.mark.skipif(
+    not _mxfp8_cutedsl_kernels_available,
+    reason="MXFP8 cutedsl kernels not available",
+)
+@skip_if_rocm("ROCm enablement in progress")
+def test_cutedsl_32x1_group_validation_error():
+    """Test that 32x1 CuTeDSL kernel raises error for non-128-multiple group sizes."""
+    device = "cuda"
+    M, K = 512, 1024
+    x = torch.randn(M, K, dtype=torch.bfloat16, device=device)
+    num_groups = 4
+
+    # Generate group sizes (summing to M) and guarantee the first one is not a multiple of 128
+    group_sizes = generate_split_sizes(num_groups, M, device)  # NOTE(review): the 32x1 benchmark builds offs over K — confirm the intended dim
+    if group_sizes[0] % 128 == 0:
+        group_sizes[0] = group_sizes[0] - 1  # Make it not a multiple of 128 (NOTE(review): gives -1 if this size can be 0 — confirm)
+        group_sizes[1] = group_sizes[1] + 1  # Compensate to maintain total sum
+
+    invalid_offsets = torch.cumsum(group_sizes, dim=0, dtype=torch.int32)
+
+    # The failed device-side assert is expected to reach the host as a RuntimeError carrying the generic CUDA launch-failure text
+    with pytest.raises(RuntimeError, match=r"unspecified launch failure"):
+        _ = mxfp8_quantize_2d_32x1_cutedsl(
+            x, block_size=32, scaling_mode="rceil", offs=invalid_offsets
+        )
+        # Synchronize so a device error not raised by the call itself still surfaces inside this block
+        torch.cuda.synchronize()
+
+
+@pytest.mark.skipif(
+    not _is_sm_10x(),
+    reason="MXFP8 requires CUDA SM 10.x",
+)
+@pytest.mark.skipif(
+    not _mxfp8_cutedsl_kernels_available,
+    reason="MXFP8 cutedsl kernels not available",
+)
+@skip_if_rocm("ROCm enablement in progress")
+def test_cutedsl_kernels_work_with_valid_128_multiple_groups():
+    """Test that both CuTeDSL kernels work correctly with valid 128-multiple group sizes."""
+    device = "cuda"
+    M, K = 512, 1024
+    x = torch.randn(M, K, dtype=torch.bfloat16, device=device)
+
+    # Build end offsets from hand-picked group sizes via a cumulative sum
+    valid_group_sizes = [128, 256, 128]  # Multiples of 128 that sum to M (512)
+    valid_offsets = torch.cumsum(
+        torch.tensor(valid_group_sizes, dtype=torch.int32), dim=0
+    ).to(device)
+
+    # Sanity-check the setup: recover group sizes from the offsets and confirm each is a multiple of 128
+    group_sizes = torch.diff(
+        torch.cat([torch.zeros(1, device=device, dtype=torch.int32), valid_offsets])
+    )
+    assert torch.all(group_sizes % 128 == 0), (
+        "Test setup failed: not all group sizes are multiples of 128"
+    )
+
+    # Both kernels should launch without error (NOTE(review): no synchronize follows, so a late device error may not surface here)
+    y_1x32, s_1x32 = mxfp8_quantize_2d_1x32_cutedsl(
+        x, block_size=32, scaling_mode="rceil", offs=valid_offsets
+    )
+
+    y_32x1, s_32x1 = mxfp8_quantize_2d_32x1_cutedsl(
+        x, block_size=32, scaling_mode="rceil", offs=valid_offsets
+    )
+
+    # Only the quantized-data shapes are checked; values and scales are not compared
+    assert y_1x32.shape == (M, K)
+    assert y_32x1.shape == (M, K)
diff --git a/torchao/prototype/moe_training/kernels/mxfp8/cute_utils.py b/torchao/prototype/moe_training/kernels/mxfp8/cute_utils.py
@@ -225,3 +225,26 @@ def load_vals_chunk_tail(
             else:
                 vals_chunk[j] = cutlass.Float32(0.0)
         return vals_chunk
+
+    @cute.jit
+    def validate_group_sizes(offs: cute.Tensor):
+        # Asserts that every group size implied by the end offsets is a multiple of 128; no thread guard here, the caller picks the thread
+        num_groups = offs.shape[0]
+
+        # First group spans 0 to offs[0], so its size is offs[0] itself
+        if num_groups > 0:
+            first_group_size = offs[0]
+            cute.testing.assert_(
+                first_group_size % 128 == 0,
+                "Group sizes must be multiples of 128",
+            )
+
+        # Each later group's size is the difference between consecutive end offsets
+        for i in range(1, num_groups):
+            prev_end = offs[i - 1]
+            curr_end = offs[i]
+            group_size = curr_end - prev_end  # NOTE(review): only divisibility is asserted; offset ordering is not checked
+            cute.testing.assert_(
+                group_size % 128 == 0,
+                "Group sizes must be multiples of 128",
+            )
diff --git a/torchao/prototype/moe_training/kernels/mxfp8/cutedsl_quantize_2d_1x32.py b/torchao/prototype/moe_training/kernels/mxfp8/cutedsl_quantize_2d_1x32.py
@@ -5,7 +5,7 @@
 # LICENSE file in the root directory of this source tree.
 
 import functools
-from typing import Tuple
+from typing import Optional, Tuple
 
 import torch
 
@@ -17,6 +17,7 @@
     compute_scale_from_amax,
     load_vals_chunk_full,
     load_vals_chunk_tail,
+    validate_group_sizes,
 )
 
 
@@ -83,6 +84,7 @@ def _compile_mxfp8_quantize_2d_cutedsl(
     k_tiles_per_cta: int,
     is_full_k_tiles: bool,
     blocked_scale_output: bool,
+    offs: Optional[torch.Tensor] = None,
 ):
     """Compile the 2D MXFP8 quantization kernel using CuTeDSL.
 
@@ -531,6 +533,7 @@ def kernel(
             m_cta_tiles: cutlass.Int64,
             k_cta_tiles: cutlass.Int64,
             blocked_scale_layout: cute.Layout,
+            offs: Optional[cute.Tensor],
             SCALE_DIM_K: cutlass.Constexpr[int],
             USE_RCEIL: cutlass.Constexpr[bool],
             IS_FULL_K_TILES: cutlass.Constexpr[bool],
@@ -560,6 +563,7 @@ def kernel(
                 m_cta_tiles: Number of tiles in M dimension
                 k_cta_tiles: Number of tile groups in K dimension
                 blocked_scale_layout: Layout for blocked scale output
+                offs: Optional tensor of group end offsets; group sizes are validated only when it is not None
                 SCALE_DIM_K: Block size (32)
                 USE_RCEIL: Whether using RCEIL mode
                 IS_FULL_K_TILES: Whether K is perfectly tiled
@@ -575,6 +579,11 @@ def kernel(
             warp_idx = cute.arch.make_warp_uniform(warp_idx)
             bidx, bidy, _ = cute.arch.block_idx()
 
+            # Validate group sizes are multiples of 128 if offs is provided
+            if cutlass.const_expr(offs is not None):
+                if tidx == 0:
+                    validate_group_sizes(offs)
+
             smem_allocator = utils.SmemAllocator()
             storage = smem_allocator.allocate(SharedStorage)
             # The tuned contract keeps STAGE_COUNT <= 2.
@@ -812,6 +821,7 @@ def __call__(
             m_cta_tiles: cutlass.Int64,
             k_cta_tiles: cutlass.Int64,
             stream: cuda.CUstream,
+            offs: Optional[cute.Tensor],
         ):
             """Kernel launcher that sets up TMA descriptors and blocked scale layout.
 
@@ -825,6 +835,7 @@ def __call__(
                 m_cta_tiles: Number of tiles in M dimension
                 k_cta_tiles: Number of tile groups in K dimension
                 stream: CUDA stream
+                offs: Tensor of group end offsets for validation (group sizes must be multiples of 128)
 
             Storage locations:
                 All tensors in global memory
@@ -874,6 +885,7 @@ def __call__(
                 m_cta_tiles,
                 k_cta_tiles,
                 blocked_scale_layout,
+                offs,
                 SCALE_DIM_K=SCALE_DIM_K_VALUE,
                 USE_RCEIL=(scaling_mode == "rceil"),
                 IS_FULL_K_TILES=IS_FULL_K_TILES_VALUE,
@@ -923,6 +935,21 @@ def __call__(
         )
     fake_stream = make_fake_stream()
 
+    if offs is not None:
+        offs_stride = cute.sym_int()
+        fake_offs = make_fake_tensor(
+            cutlass.Int32,
+            (cute.sym_int(),),
+            stride=(offs_stride,),
+        )
+    else:
+        fake_offs = None
+
+    compile_options = (
+        "--enable-tvm-ffi"
+        if fake_offs is None
+        else "--enable-tvm-ffi --enable-assertions"
+    )
     return cute.compile(
         kernel,
         inp_mk=fake_inp,
@@ -934,7 +961,8 @@ def __call__(
         m_cta_tiles=1,
         k_cta_tiles=1,
         stream=fake_stream,
-        options="--enable-tvm-ffi",
+        offs=fake_offs,
+        options=compile_options,
     )
 
 
@@ -944,6 +972,7 @@ def mxfp8_quantize_cutedsl_2d_1x32(
     scaling_mode: str = "rceil",
     stage_count: int = 2,
     blocked_scale_output: bool = False,
+    offs: Optional[torch.Tensor] = None,
 ) -> Tuple[torch.Tensor, torch.Tensor]:
     """
     Quantize a 2D tensor to MXFP8 format using CuTe DSL kernel.
@@ -956,6 +985,7 @@ def mxfp8_quantize_cutedsl_2d_1x32(
         scaling_mode: Scaling mode ("floor" or "rceil")
         stage_count: Number of pipeline stages (1 or 2)
         blocked_scale_output: Whether to output scales in blocked layout
+        offs: Optional 1D int32 CUDA tensor of group end offsets; when given, group sizes are validated and must be multiples of 128
 
     Returns:
         q_data: Quantized data in row-major layout with shape (M, K)
@@ -970,6 +1000,11 @@ def mxfp8_quantize_cutedsl_2d_1x32(
     M, K = x.shape
     assert K % block_size == 0, "K must be divisible by block_size"
 
+    if offs is not None:
+        assert offs.is_cuda, "offs tensor must be CUDA"
+        assert offs.dtype == torch.int32, "offs must be int32 tensor"
+        assert offs.dim() == 1, "offs must be 1D tensor"
+
     _, config = _select_cutedsl_config(x.dtype, scaling_mode)
     compute_warps, tile_m, tile_k, k_tiles_per_cta = config
     # B200 sweeps over representative shapes showed no
@@ -1019,6 +1054,7 @@ def mxfp8_quantize_cutedsl_2d_1x32(
         k_tiles_per_cta,
         is_full_k_tiles,
         blocked_scale_output,
+        offs,
     )
 
     import cuda.bindings.driver as cuda
@@ -1037,6 +1073,7 @@ def mxfp8_quantize_cutedsl_2d_1x32(
         int(m_cta_tiles),
         int(k_cta_tiles),
         stream,
+        offs,
     )
 
     return q_data, scales_u8.view(torch.float8_e8m0fnu)
diff --git a/torchao/prototype/moe_training/kernels/mxfp8/cutedsl_quantize_2d_32x1.py b/torchao/prototype/moe_training/kernels/mxfp8/cutedsl_quantize_2d_32x1.py
diff --git a/torchao/prototype/moe_training/kernels/mxfp8/quant.py b/torchao/prototype/moe_training/kernels/mxfp8/quant.py