|
15 | 15 | pytest.skip("CUDA and PyTorch 2.7.0+ required", allow_module_level=True) |
16 | 16 |
|
17 | 17 | from torchao.prototype.moe_training.config import ( |
| 18 | + Float8TrainingOpConfig, |
18 | 19 | MXFP8TrainingOpConfig, |
19 | 20 | MXFP8TrainingRecipe, |
20 | 21 | ) |
21 | 22 | from torchao.prototype.moe_training.kernels.mxfp8.quant import ( |
22 | 23 | _mxfp8_cutedsl_kernels_available, |
23 | 24 | ) |
24 | | -from torchao.prototype.moe_training.tensor import MXFP8TrainingWeightWrapperTensor |
| 25 | +from torchao.prototype.moe_training.tensor import ( |
| 26 | + Float8TrainingWeightWrapperTensor, |
| 27 | + MXFP8TrainingWeightWrapperTensor, |
| 28 | +) |
25 | 29 | from torchao.prototype.mx_formats.config import ( |
26 | 30 | MXFP8Dim1CastKernelChoice, |
27 | 31 | ) |
@@ -183,3 +187,104 @@ def test_mxfp8_training_tensor_ops_preserve_subclass(): |
183 | 187 | assert isinstance(result, MXFP8TrainingWeightWrapperTensor), ( |
184 | 188 | "slice should preserve subclass" |
185 | 189 | ) |
| 190 | + |
| 191 | + |
@pytest.mark.parametrize("op_name", ["mm", "matmul", "linear"])
@pytest.mark.parametrize("batch_size", [None, 2])
@pytest.mark.parametrize(
    "float8_linear_recipe", ["tensorwise", "rowwise", "rowwise_with_gw_hp"]
)
def test_float8_training_tensor_ops_fwd_bwd(op_name, batch_size, float8_linear_recipe):
    """Forward + backward numerics test for Float8TrainingWeightWrapperTensor.

    For each (op, batch shape, FP8 recipe) combination, runs the op once with
    plain bf16 tensors (reference) and once with the weight wrapped in
    Float8TrainingWeightWrapperTensor, then checks that:

    * the FP8 result matches the reference shape, is bf16, and is a plain
      tensor (the wrapper must not leak into op outputs);
    * forward output, input gradient, and weight gradient each clear a fixed
      SQNR threshold versus the bf16 reference.
    """
    # mm doesn't support batching
    if op_name == "mm" and batch_size is not None:
        pytest.skip("mm doesn't support batching")

    # All FP8 linear recipes require SM89+ (torch._scaled_mm)
    if torch.cuda.get_device_capability() < (8, 9):
        pytest.skip("FP8 linear requires SM89+")

    # rowwise and rowwise_with_gw_hp require SM90+ (CUTLASS axiswise kernels)
    if float8_linear_recipe in (
        "rowwise",
        "rowwise_with_gw_hp",
    ) and torch.cuda.get_device_capability() < (9, 0):
        pytest.skip("Rowwise FP8 requires SM90+")

    config = Float8TrainingOpConfig(float8_linear_recipe=float8_linear_recipe)

    # Weight B is stored (N, K), linear-style; A is (M, K) or (batch, M, K).
    M, K, N = 1024, 1024, 2048
    if batch_size is None:
        A_shape = (M, K)
    else:
        A_shape = (batch_size, M, K)

    A = torch.randn(*A_shape, dtype=torch.bfloat16, device="cuda", requires_grad=True)
    B = torch.randn(N, K, dtype=torch.bfloat16, device="cuda", requires_grad=True)
    # Bias only applies to F.linear; mm/matmul take no bias.
    bias = (
        torch.randn(N, dtype=torch.bfloat16, device="cuda")
        if op_name == "linear"
        else None
    )

    # Reference computation with bf16
    # clone().detach().requires_grad_(True) gives independent leaves so the
    # reference backward does not accumulate into A.grad / B.grad.
    A_ref = A.clone().detach().requires_grad_(True)
    B_ref = B.clone().detach().requires_grad_(True)

    if op_name == "mm":
        result_ref = torch.mm(A_ref, B_ref.t())
    elif op_name == "matmul":
        result_ref = torch.matmul(A_ref, B_ref.t())
    elif op_name == "linear":
        result_ref = F.linear(A_ref, B_ref, bias)

    # FP8 computation
    B_fp8 = Float8TrainingWeightWrapperTensor(B, config)

    # NOTE(review): unlike the reference path, mm/matmul here receive B_fp8
    # WITHOUT an explicit .t() — presumably the wrapper's dispatch handles the
    # (N, K) weight orientation itself; confirm against the wrapper's op
    # overrides, since a plain (M, K) @ (N, K) mm would be shape-invalid.
    if op_name == "mm":
        result_fp8 = torch.mm(A, B_fp8)
    elif op_name == "matmul":
        result_fp8 = torch.matmul(A, B_fp8)
    elif op_name == "linear":
        result_fp8 = F.linear(A, B_fp8, bias)

    # Validate forward pass
    assert result_fp8.shape == result_ref.shape, "Shape mismatch"
    assert result_fp8.dtype == torch.bfloat16, "Dtype should be bfloat16"
    assert not isinstance(result_fp8, Float8TrainingWeightWrapperTensor), (
        "Result should be unwrapped"
    )

    # Check forward SQNR
    sqnr_fwd = compute_error(result_ref, result_fp8)
    min_sqnr_fwd = 25.0
    assert sqnr_fwd >= min_sqnr_fwd, (
        f"Forward SQNR {sqnr_fwd} is too low, must be >= {min_sqnr_fwd}"
    )

    # Backward pass
    # MSE against an all-ones target just provides a scalar loss to drive
    # autograd; the specific target value is irrelevant to the SQNR checks.
    labels_ref = torch.ones_like(result_ref)
    labels_fp8 = torch.ones_like(result_fp8)
    loss_ref = F.mse_loss(result_ref, labels_ref)
    loss_fp8 = F.mse_loss(result_fp8, labels_fp8)
    loss_ref.backward()
    loss_fp8.backward()

    # Verify gradients exist
    # NOTE(review): asserting on B_fp8.grad (not B.grad) assumes the wrapper
    # itself is the autograd leaf that accumulates the weight gradient —
    # verify against the wrapper's autograd integration.
    assert A.grad is not None, "A.grad should be computed"
    assert A_ref.grad is not None, "A_ref.grad should be computed"
    assert B_fp8.grad is not None, "B_fp8.grad should be computed"
    assert B_ref.grad is not None, "B_ref.grad should be computed"

    # Check input gradient SQNR
    sqnr_input_grad = compute_error(A_ref.grad, A.grad)
    min_sqnr_input_grad = 24.0
    assert sqnr_input_grad >= min_sqnr_input_grad, (
        f"Input grad SQNR {sqnr_input_grad} is too low, must be >= {min_sqnr_input_grad}"
    )

    # Check weight gradient SQNR
    sqnr_weight_grad = compute_error(B_ref.grad, B_fp8.grad)
    min_sqnr_weight_grad = 23.0
    assert sqnr_weight_grad >= min_sqnr_weight_grad, (
        f"Weight grad SQNR {sqnr_weight_grad} is too low, must be >= {min_sqnr_weight_grad}"
    )
0 commit comments