pytorch
diff --git a/benchmarks/prototype/moe_training/mxfp8/bench_quantize_3d.py b/benchmarks/prototype/moe_training/mxfp8/bench_quantize_3d.py
Lines changed: 54 additions & 22 deletions
diff --git a/benchmarks/prototype/moe_training/mxfp8/roofline_unified.py b/benchmarks/prototype/moe_training/mxfp8/roofline_unified.py
Lines changed: 5 additions & 1 deletion
diff --git a/test/prototype/blockwise_fp8_training/_distributed_test_utils.py b/test/prototype/blockwise_fp8_training/_distributed_test_utils.py
Lines changed: 186 additions & 0 deletions
@@ -31,6 +31,7 @@
 class ExperimentConfig:
     input_shape: tuple[int]
     scaling_mode: ScaleCalculationMode
+    scale_block_k: int
 
 
 @dataclass(frozen=True)
@@ -62,19 +63,24 @@ def get_configs() -> List[ExperimentConfig]:
         (32, 8192, 5120),
     ]
     round_modes = [ScaleCalculationMode.FLOOR, ScaleCalculationMode.RCEIL]
+    scale_block_ks = [1, 32]
     configs = []
-    for shape, scaling_mode in itertools.product(input_shapes, round_modes):
+    for shape, scaling_mode, scale_block_k in itertools.product(
+        input_shapes, round_modes, scale_block_ks
+    ):
         configs.append(
             ExperimentConfig(
                 input_shape=shape,
                 scaling_mode=scaling_mode,
+                scale_block_k=scale_block_k,
             )
         )
     return configs
 
 
 def run_experiment(config: ExperimentConfig) -> ExperimentResult:
     block_size = 32
+    scale_block_k = config.scale_block_k
     input_shape = config.input_shape
     input_tensor = torch.randn(
         *input_shape,
@@ -83,20 +89,37 @@ def run_experiment(config: ExperimentConfig) -> ExperimentResult:
     )
 
     def using_to_mx(x: torch.Tensor) -> torch.Tensor:
-        # Reference implementation
-        s_d1_ref, y_d1_ref = to_mx(
-            # Transpose (E,N,K) to (E,K,N) so N is final dim,
-            # since to_mx scales along that dim
-            x.transpose(-2, -1).contiguous(),
+        if scale_block_k == 1:
+            s_ref, y_ref = to_mx(
+                x.transpose(-2, -1).contiguous(),
+                elem_dtype=torch.float8_e4m3fn,
+                block_size=block_size,
+            )
+            return y_ref.transpose(-2, -1), s_ref.transpose(-2, -1)
+
+        assert scale_block_k == 32
+        E, N, K = x.shape
+        x_tiles = (
+            x.view(E, N // block_size, block_size, K // block_size, block_size)
+            .permute(0, 1, 3, 2, 4)
+            .contiguous()
+            .view(E, N // block_size, K // block_size, block_size * block_size)
+        )
+        s_ref, y_tiles_ref = to_mx(
+            x_tiles,
             elem_dtype=torch.float8_e4m3fn,
-            block_size=block_size,
+            block_size=block_size * block_size,
         )
-
-        # Transpose tensors and scales back so we have effectively
-        # quantized input shape (E, N, K) along N
-        y_d1_ref = y_d1_ref.transpose(-2, -1)
-        s_d1_ref = s_d1_ref.transpose(-2, -1)
-        return y_d1_ref, s_d1_ref
+        y_ref = (
+            y_tiles_ref.view(
+                E, N // block_size, K // block_size, block_size, block_size
+            )
+            .permute(0, 1, 3, 2, 4)
+            .contiguous()
+            .view(E, N, K)
+        )
+        y_ref = y_ref.transpose(-2, -1).contiguous().transpose(-2, -1)
+        return y_ref, s_ref
 
     # bench to_mx
     using_to_mx_c = torch.compile(using_to_mx)
@@ -106,26 +129,33 @@ def using_to_mx(x: torch.Tensor) -> torch.Tensor:
         input_tensor,
     )
 
-    # bench 2d dim1 kernel then transforming to col major
-    using_cuda_2d_c = torch.compile(_to_mxfp8_dim1_3d)
-    scales_cuda_2d, data_cuda_2d = using_cuda_2d_c(input_tensor)
-    time_cuda_2d_us = benchmark_cuda_function_in_microseconds(
-        using_cuda_2d_c,
-        input_tensor,
-        block_size=block_size,
-        scaling_mode=config.scaling_mode,
-    )
+    if scale_block_k == 1:
+        # bench 2d dim1 kernel then transforming to col major
+        using_cuda_2d_c = torch.compile(_to_mxfp8_dim1_3d)
+        using_cuda_2d_c(input_tensor)
+        time_cuda_2d_us = benchmark_cuda_function_in_microseconds(
+            using_cuda_2d_c,
+            input_tensor,
+            block_size=block_size,
+            scaling_mode=config.scaling_mode,
+        )
+    else:
+        time_cuda_2d_us = float("nan")
 
     # bench 3d CuTeDSL kernel
     data_cuda_3d, scales_cuda_3d = mxfp8_quantize_cuda_3d(
         input_tensor,
         block_size=block_size,
+        scale_block_n=block_size,
+        scale_block_k=scale_block_k,
         scaling_mode=str(config.scaling_mode.value),
     )
     time_cutedsl_3d_us = benchmark_cuda_function_in_microseconds(
         mxfp8_quantize_cuda_3d,
         input_tensor,
         block_size=block_size,
+        scale_block_n=block_size,
+        scale_block_k=scale_block_k,
         scaling_mode=str(config.scaling_mode.value),
     )
 
@@ -159,6 +189,7 @@ def print_results(experiments: List[Experiment]):
     headers = [
         "input_shape",
         "scaling_mode",
+        "scale_block_k",
         "cuda_2d_us",
         "cutedsl_3d_us",
         "to_mx_us",
@@ -172,6 +203,7 @@ def print_results(experiments: List[Experiment]):
             [
                 str(experiment.config.input_shape),
                 str(experiment.config.scaling_mode),
+                str(experiment.config.scale_block_k),
                 experiment.result.cuda_2d_us,
                 experiment.result.cutedsl_3d_us,
                 experiment.result.to_mx_us,
 
@@ -495,7 +495,11 @@ def benchmark_mxfp8_quantize_cuda_3d(tensor, block_size=32):
     """Benchmark mxfp8_quantize_cuda_3d kernel"""
     return benchmark_cuda_function_in_microseconds(
         lambda: mxfp8_quantize_cuda_3d(
-            tensor, block_size=block_size, scaling_mode="rceil"
+            tensor,
+            block_size=block_size,
+            scale_block_n=block_size,
+            scale_block_k=1,
+            scaling_mode="rceil",
         )
     )
 
 
@@ -0,0 +1,186 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD 3-Clause license found in the
+# LICENSE file in the root directory of this source tree.
+
+import copy
+from collections.abc import Iterable
+
+import torch
+import torch.distributed as dist
+from torch.distributed._tensor import DTensor
+
+from packaging import version
+from torchao.prototype.blockwise_fp8_training.linear import (
+    Float8BlockwiseLinear,
+    Float8BlockwiseLinearConfig,
+)
+from torchao.quantization import quantize_
+from torchao.testing.training.dtensor_utils import ToyModel
+from torchao.utils import is_sm_at_least_90
+
+
+def get_blockwise_linear_skip_reason(
+    *,
+    triton_module,
+    min_cuda_devices: int,
+) -> str | None:
+    """Return the reason Float8BlockwiseLinear distributed tests must skip.
+
+    The requirements are probed in a fixed order (CUDA available, at least
+    ``min_cuda_devices`` devices, ``is_sm_at_least_90()``, and
+    ``triton_module.__version__`` >= 3.3.0). The message for the first unmet
+    requirement is returned and later probes are not evaluated. ``None`` means
+    every requirement is met.
+
+    This gating is deliberately separate from the lower-level kernel test
+    gating because the module swap currently requires SM90+ and the newer
+    scaled_mm/Triton path.
+    """
+    # Single-exit elif chain: each branch is only reached when all earlier
+    # requirements passed, so probe order matches the documented order.
+    reason = None
+    if not torch.cuda.is_available():
+        reason = "CUDA not available"
+    elif torch.cuda.device_count() < min_cuda_devices:
+        reason = f"Need at least {min_cuda_devices} CUDA devices"
+    elif not is_sm_at_least_90():
+        reason = "Float8BlockwiseLinear currently requires CUDA SM90+"
+    elif version.parse(triton_module.__version__) < version.parse("3.3.0"):
+        reason = "Triton version < 3.3.0"
+    return reason
+
+
+def full_tensor(tensor: torch.Tensor) -> torch.Tensor:
+    """Return ``tensor.full_tensor()`` for a DTensor, else ``tensor`` unchanged.
+
+    Used so parity checks can treat DTensors and regular tensors uniformly.
+    """
+    if isinstance(tensor, DTensor):
+        return tensor.full_tensor()
+    return tensor
+
+
+def assert_close(
+    actual: torch.Tensor,
+    expected: torch.Tensor,
+    *,
+    atol: float = 2e-2,
+    rtol: float = 2e-2,
+) -> None:
+    """Check that ``actual`` and ``expected`` agree within ``atol``/``rtol``.
+
+    Both operands go through ``full_tensor`` and are cast with ``.float()``
+    before being handed to ``torch.testing.assert_close``, so eager tensors
+    and DTensors share one comparison path.
+    """
+    actual_fp32 = full_tensor(actual).float()
+    expected_fp32 = full_tensor(expected).float()
+    torch.testing.assert_close(actual_fp32, expected_fp32, atol=atol, rtol=rtol)
+
+
+def set_blockwise_linear_use_triton(
+    model: torch.nn.Module,
+    use_triton: bool,
+) -> None:
+    """Set ``use_triton`` on every Float8BlockwiseLinear found in ``model``.
+
+    Raises:
+        AssertionError: if ``model.modules()`` yields no Float8BlockwiseLinear.
+    """
+    blockwise_linears = [
+        submodule
+        for submodule in model.modules()
+        if isinstance(submodule, Float8BlockwiseLinear)
+    ]
+    for blockwise_linear in blockwise_linears:
+        blockwise_linear.use_triton = use_triton
+    # An empty match means there was nothing to configure, so fail loudly
+    # instead of returning as if the flag had been applied.
+    if not blockwise_linears:
+        raise AssertionError("Expected at least one Float8BlockwiseLinear module")
+
+
+def broadcast_module(module: torch.nn.Module) -> None:
+    """Call ``dist.broadcast`` with ``src=0`` on each parameter of ``module``.
+
+    Only ``module.parameters()`` is visited; buffers are not broadcast.
+    """
+    source_rank = 0
+    for weight in module.parameters():
+        dist.broadcast(weight, src=source_rank)
+
+
+def init_toy_model(
+    *,
+    size: int = 128,
+    seed: int = 42,
+    device: str | torch.device = "cuda",
+    broadcast_weights: bool = False,
+) -> torch.nn.Module:
+    """Build a seeded bfloat16 ``ToyModel(size)`` on ``device``.
+
+    ``torch.manual_seed(seed)`` is called first, so this reseeds the global
+    torch RNG as a side effect. When ``broadcast_weights`` is true the
+    parameters are additionally passed through ``broadcast_module``.
+    """
+    torch.manual_seed(seed)
+    toy = ToyModel(size)
+    toy = toy.to(device=device, dtype=torch.bfloat16)
+    if not broadcast_weights:
+        return toy
+    broadcast_module(toy)
+    return toy
+
+
+def make_quantized_toy_model_pair(
+    *,
+    size: int = 128,
+    seed: int = 42,
+    device: str | torch.device = "cuda",
+    use_triton: bool,
+    broadcast_weights: bool = False,
+) -> tuple[torch.nn.Module, torch.nn.Module]:
+    """Return ``(ref_model, dist_model)``: two identically converted toy models.
+
+    The reference model comes from ``init_toy_model`` and the second model is
+    a ``copy.deepcopy`` of it taken before conversion. Each model is then
+    passed through ``quantize_`` with a fresh ``Float8BlockwiseLinearConfig()``
+    and has ``use_triton`` applied via ``set_blockwise_linear_use_triton``.
+    """
+    init_kwargs = dict(
+        size=size,
+        seed=seed,
+        device=device,
+        broadcast_weights=broadcast_weights,
+    )
+    ref_model = init_toy_model(**init_kwargs)
+    # Copy before conversion so both models start from the same weights.
+    dist_model = copy.deepcopy(ref_model)
+
+    def _apply_blockwise_config(model: torch.nn.Module) -> None:
+        quantize_(model, Float8BlockwiseLinearConfig())
+        set_blockwise_linear_use_triton(model, use_triton)
+
+    _apply_blockwise_config(ref_model)
+    _apply_blockwise_config(dist_model)
+    return ref_model, dist_model
+
+
+def get_replicated_local_batch(
+    *,
+    replica_count: int,
+    replica_index: int,
+    iter_idx: int,
+    size: int = 128,
+    device: str | torch.device = "cuda",
+) -> tuple[torch.Tensor, torch.Tensor]:
+    """Build one global batch and return this replica's ``(input, target)`` slice.
+
+    A bfloat16 input of shape ``(replica_count, 1, size, size)`` and a
+    same-shaped target are drawn after seeding with ``100 + iter_idx``
+    (which reseeds the global torch RNG), then both are broadcast from
+    rank 0. The slice at ``replica_index`` of each is returned contiguous.
+
+    Intent: TP peers pass the same ``replica_index`` and so see the same
+    sample, while different data-parallel replicas see different samples.
+    """
+    torch.manual_seed(100 + iter_idx)
+    batch_shape = (replica_count, 1, size, size)
+    global_input = torch.randn(*batch_shape, device=device, dtype=torch.bfloat16)
+    global_target = torch.randn_like(global_input)
+    # Rank 0's draw is authoritative for every rank.
+    for global_tensor in (global_input, global_target):
+        dist.broadcast(global_tensor, src=0)
+    local_input = global_input[replica_index].contiguous()
+    local_target = global_target[replica_index].contiguous()
+    return local_input, local_target
+
+
+def assert_parameters_are_dtensors(parameters: Iterable[torch.Tensor]) -> None:
+    """Raise ``AssertionError`` unless every item of ``parameters`` is a DTensor.
+
+    The check raises explicitly instead of using a bare ``assert`` so it still
+    runs under ``python -O`` and reports which entry failed. An empty iterable
+    passes.
+
+    Raises:
+        AssertionError: on the first item that is not a ``DTensor``.
+    """
+    for index, param in enumerate(parameters):
+        if not isinstance(param, DTensor):
+            raise AssertionError(
+                f"Expected parameter {index} to be a DTensor, "
+                f"got {type(param).__name__}"
+            )
+
+
+def allreduce_reference_grads(
+    model: torch.nn.Module,
+    *,
+    world_size: int,
+    group=None,
+) -> None:
+    """All-reduce each parameter gradient of ``model`` and divide by ``world_size``.
+
+    Every parameter must already have a gradient. The gradient tensor is
+    modified in place: ``dist.all_reduce`` over ``group`` with its default
+    reduce op, followed by ``div_(world_size)``.
+    """
+    for weight in model.parameters():
+        grad = weight.grad
+        assert grad is not None
+        dist.all_reduce(grad, group=group)
+        grad.div_(world_size)
+
+
+def assert_dtensor_parameter_grads_match(
+    ref_parameters: Iterable[torch.nn.Parameter],
+    dist_parameters: Iterable[torch.nn.Parameter],
+) -> None:
+    """Check pairwise that distributed gradients match the reference gradients.
+
+    The two iterables are zipped with ``strict=True``, so a length mismatch
+    raises ``ValueError``. For each pair both gradients must be present, the
+    distributed parameter and its gradient must be DTensors, and the
+    gradients must satisfy ``assert_close``.
+    """
+    pairs = zip(ref_parameters, dist_parameters, strict=True)
+    for reference, sharded in pairs:
+        reference_grad = reference.grad
+        assert reference_grad is not None
+        sharded_grad = sharded.grad
+        assert sharded_grad is not None
+        assert isinstance(sharded, DTensor)
+        assert isinstance(sharded_grad, DTensor)
+        assert_close(sharded_grad, reference_grad)
+
+
+def assert_dtensor_parameter_values_match(
+    ref_parameters: Iterable[torch.nn.Parameter],
+    dist_parameters: Iterable[torch.nn.Parameter],
+) -> None:
+    """Check pairwise that distributed parameter values match the reference.
+
+    The two iterables are zipped with ``strict=True``, so a length mismatch
+    raises ``ValueError``. Each distributed parameter must be a DTensor and
+    must satisfy ``assert_close`` against its reference counterpart.
+    """
+    pairs = zip(ref_parameters, dist_parameters, strict=True)
+    for reference, sharded in pairs:
+        assert isinstance(sharded, DTensor)
+        assert_close(sharded, reference)
Original file line number	Diff line number	Diff line change
`@@ -495,7 +495,11 @@ def benchmark_mxfp8_quantize_cuda_3d(tensor, block_size=32):`
`495`	`495`	`"""Benchmark mxfp8_quantize_cuda_3d kernel"""`
`496`	`496`	`return benchmark_cuda_function_in_microseconds(`
`497`	`497`	`lambda: mxfp8_quantize_cuda_3d(`
`498`		`- tensor, block_size=block_size, scaling_mode="rceil"`
	`498`	`+ tensor,`
	`499`	`+ block_size=block_size,`
	`500`	`+ scale_block_n=block_size,`
	`501`	`+ scale_block_k=1,`
	`502`	`+ scaling_mode="rceil",`
`499`	`503`	`)`
`500`	`504`	`)`
`501`	`505`