
Commit c32dea9

Add FP8 FA3 low-precision attention with monkey-patch SDPA path (#3959)
## Summary

- Added new folder for low-precision attention APIs in torchao/prototype/attention
- New API for FP8 FA3 low-precision attention with two components:
  - Elementary block: fp8_fa3_sdpa — a direct drop-in replacement for F.scaled_dot_product_attention that users can integrate into their model manually. Performs per-head FP8 quantization of Q, K, V followed by low-precision SDPA.
  - Simple wrapper: apply_low_precision_attention — wraps any model to automatically replace all SDPA calls with the FP8 variant. No torch.compile required.
- New Triton kernel for fused QKV FP8 quantization (3-phase: absmax reduction, scale computation, quantize)
- Causal mask detection: a pre-flight forward pass identifies HuggingFace-style materialized causal masks so the wrapper can strip them and use is_causal=True instead.
- Flash attention activation is handled internally by the wrapper — no manual activate_flash_attention_impl / restore_flash_attention_impl calls needed.
- Added new test folder for low-precision attention APIs in test/prototype/attention

### Folder Breakdown

- torchao/prototype/attention: new folder for low-precision attention APIs
  - __init__.py: public exports (apply_low_precision_attention, AttentionBackend)
  - api.py: user-facing entry point that validates the requested backend and dispatches to the matching setup
  - config.py: AttentionBackend enum and LowPrecisionAttentionConfig dataclass
  - utils.py: hardware capability checks, backend availability detection
  - shared_utils/: shared infrastructure used by backend implementations
    - attention.py: shared _fp8_sdpa implementation (quantize + SDPA)
    - wrapper.py: _FP8FlashAttentionMonkeyPatchWrapper — replaces F.scaled_dot_product_attention during forward, manages flash activation internally
    - setup.py: setup_fp8_backend — builds the wrapper with causal mask detection
  - fp8_fa3/: FA3-specific backend
    - attention.py: fp8_fa3_sdpa elementary block
    - setup.py: thin wrapper calling setup_fp8_backend with FA3 parameters
  - quantization/: shared FP8 quantization kernels
    - quantization.py: _fp8_sdpa_quantize — calls fused Triton kernels for per-head Q, K, V quantization
    - triton_qkv_quantization.py: fused QKV FP8 quantization Triton kernel
- test/prototype/attention: new folder for low-precision attention API tests
  - test_fp8_attention.py: numerical accuracy tests (eager SDPA) and model-level API tests (simple wrapper)

## Test Plan

`python -m pytest test/prototype/attention/test_fp8_attention.py -v`

## Example Usage

```python
from torchao.prototype.attention import (
    AttentionBackend,
    apply_low_precision_attention,
)

model = MyModel()

# Simple SDPA replacement — no torch.compile needed
model = apply_low_precision_attention(model, backend=AttentionBackend.FP8_FA3)

# Flash activation is handled internally by the wrapper
output = model(inputs)
```

---

## Results

#### Single-Layer Results

Results directly comparing FA3 SDPA versus FA3 FP8 SDPA (including quantization time):

<img width="645" height="373" alt="image" src="https://github.com/user-attachments/assets/f64cba6d-4ac9-41b7-b0f7-bf93c67f6c13" />

#### Llama3 Model Results

Results comparing the Llama3 model with FA3 SDPA versus Llama3 using the FA3 FP8 wrapper. Does not use RoPE fusion. Perplexity: 6.19 -> 6.25

<img width="368" height="171" alt="image" src="https://github.com/user-attachments/assets/f7534b8b-6914-4e6b-b568-55e38bdfcb2b" />
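For context, here is a minimal sketch of the monkey-patch pattern the summary describes: temporarily rebinding `F.scaled_dot_product_attention` around a forward call. The class name `_SDPAPatch` is hypothetical; the real wrapper in `shared_utils/wrapper.py` (not shown in this excerpt) also manages flash activation and causal-mask stripping.

```python
import torch.nn.functional as F


class _SDPAPatch:
    """Illustrative context manager (hypothetical): swap SDPA for a
    replacement during a forward pass, then restore the original."""

    def __init__(self, replacement):
        self.replacement = replacement
        self._orig = None

    def __enter__(self):
        self._orig = F.scaled_dot_product_attention
        F.scaled_dot_product_attention = self.replacement
        return self

    def __exit__(self, exc_type, exc, tb):
        F.scaled_dot_product_attention = self._orig
        return False
```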
1 parent 67e5358 commit c32dea9

15 files changed

Lines changed: 1081 additions & 0 deletions

test/prototype/attention/__init__.py

Whitespace-only changes.
test/prototype/attention/test_fp8_attention.py

Lines changed: 127 additions & 0 deletions
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD 3-Clause license found in the
# LICENSE file in the root directory of this source tree.

"""Tests for FP8 low-precision attention (FA3 backend on Hopper)."""

import unittest

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.testing._internal import common_utils
from torch.testing._internal.common_utils import TestCase, run_tests

from torchao.quantization.utils import compute_error
from torchao.utils import torch_version_at_least

if torch_version_at_least("2.11.0"):
    from torchao.prototype.attention.utils import _is_fa3_available, _is_hopper

    if _is_hopper() and _is_fa3_available():
        from torch.nn.attention import (
            activate_flash_attention_impl,
            restore_flash_attention_impl,
        )

        from torchao.prototype.attention import (
            AttentionBackend,
            apply_low_precision_attention,
        )
        from torchao.prototype.attention.fp8_fa3.attention import fp8_fa3_sdpa


class SimpleAttentionModel(nn.Module):
    def __init__(self, embed_dim, num_heads):
        super().__init__()
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads
        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=False)
        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=False)
        self.v_proj = nn.Linear(embed_dim, embed_dim, bias=False)
        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=False)

    def forward(self, x):
        B, S, _ = x.shape
        q = self.q_proj(x).view(B, S, self.num_heads, self.head_dim).transpose(1, 2)
        k = self.k_proj(x).view(B, S, self.num_heads, self.head_dim).transpose(1, 2)
        v = self.v_proj(x).view(B, S, self.num_heads, self.head_dim).transpose(1, 2)
        attn_out = F.scaled_dot_product_attention(q, k, v, is_causal=True)
        return self.out_proj(attn_out.transpose(1, 2).contiguous().view(B, S, -1))


@common_utils.instantiate_parametrized_tests
class TestFP8FA3Attention(TestCase):
    @unittest.skipUnless(
        torch_version_at_least("2.11.0") and _is_hopper() and _is_fa3_available(),
        "Requires PyTorch >= 2.11, Hopper GPU, and FA3",
    )
    @common_utils.parametrize("shape", [(2, 8, 1024, 64), (1, 16, 1024, 128)])
    @common_utils.parametrize("dtype", [torch.bfloat16, torch.float16])
    def test_sdpa_accuracy(self, shape, dtype):
        B, H, S, D = shape
        q = torch.randn(B, H, S, D, device="cuda", dtype=dtype)
        k = torch.randn(B, H, S, D, device="cuda", dtype=dtype)
        v = torch.randn(B, H, S, D, device="cuda", dtype=dtype)

        with torch.no_grad():
            out_ref = F.scaled_dot_product_attention(q, k, v, is_causal=False)

        activate_flash_attention_impl("FA3")
        try:
            with torch.no_grad():
                out_fp8 = fp8_fa3_sdpa(q, k, v, is_causal=False)
        finally:
            restore_flash_attention_impl()

        sqnr = compute_error(out_ref, out_fp8)
        self.assertGreater(
            sqnr.item(),
            25.0,
            f"SQNR {sqnr.item():.2f} dB below 25 dB for shape={shape}, dtype={dtype}",
        )

    @unittest.skipUnless(
        torch_version_at_least("2.11.0") and _is_hopper() and _is_fa3_available(),
        "Requires PyTorch >= 2.11, Hopper GPU, and FA3",
    )
    @common_utils.parametrize("dtype", [torch.bfloat16, torch.float16])
    def test_monkey_patch_model(self, dtype):
        embed_dim, num_heads = 512, 8
        model = (
            SimpleAttentionModel(embed_dim, num_heads)
            .to(device="cuda", dtype=dtype)
            .eval()
        )
        x = torch.randn(2, 128, embed_dim, device="cuda", dtype=dtype)

        with torch.no_grad():
            out_ref = model(x)

        fp8_model = (
            SimpleAttentionModel(embed_dim, num_heads)
            .to(device="cuda", dtype=dtype)
            .eval()
        )
        fp8_model.load_state_dict(model.state_dict())
        fp8_model = apply_low_precision_attention(
            fp8_model,
            backend=AttentionBackend.FP8_FA3,
            fuse_rope_using_torch_compile=False,
        )

        with torch.no_grad():
            out_fp8 = fp8_model(x)

        sqnr = compute_error(out_ref, out_fp8)
        self.assertGreater(
            sqnr.item(),
            20.0,
            f"SQNR {sqnr.item():.2f} dB below 20 dB for dtype={dtype}",
        )


if __name__ == "__main__":
    run_tests()
torchao/prototype/attention/__init__.py

Lines changed: 21 additions & 0 deletions
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD 3-Clause license found in the
# LICENSE file in the root directory of this source tree.

"""
Low-precision attention for inference.

Only supports forward pass — backward is not supported by the underlying backends.
"""

from torchao.prototype.attention.api import (
    AttentionBackend,
    apply_low_precision_attention,
)

__all__ = [
    "AttentionBackend",
    "apply_low_precision_attention",
]
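Since the package docstring limits support to the forward pass, call sites run the wrapped model under `torch.no_grad()`, as this commit's tests do. A minimal sketch (`wrapped_model` and `inputs` are placeholders):

```python
import torch

with torch.no_grad():  # forward only; backward is unsupported by these backends
    output = wrapped_model(inputs)
```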

torchao/prototype/attention/api.py

Lines changed: 88 additions & 0 deletions
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD 3-Clause license found in the
# LICENSE file in the root directory of this source tree.

"""User-facing API for low-precision attention."""

from enum import Enum
from typing import Optional

import torch
import torch._dynamo
import torch.nn as nn

from torchao.prototype.attention.utils import _is_fa3_available, _is_hopper
from torchao.utils import torch_version_at_least

if torch_version_at_least("2.11.0"):
    from torchao.prototype.attention.shared_utils.setup import setup_fp8_backend
    from torchao.prototype.attention.shared_utils.wrapper import (
        _LowPrecisionAttentionWrapper,
    )
else:
    raise ImportError("Low-precision attention requires PyTorch 2.11+.")


class AttentionBackend(str, Enum):
    """Backend kernel for computing attention."""

    FP8_FA3 = "FP8_FA3"  # Requires SM90+ (Hopper)


def _get_available_backend() -> AttentionBackend:
    if not torch.cuda.is_available():
        raise RuntimeError("Low-precision attention requires CUDA.")
    capability = torch.cuda.get_device_capability()
    if _is_hopper() and _is_fa3_available():
        return AttentionBackend.FP8_FA3
    raise RuntimeError(f"No compatible backend for SM{capability[0]}{capability[1]}.")


def _check_backend_available(backend: AttentionBackend) -> None:
    if not torch.cuda.is_available():
        raise RuntimeError(f"{backend} backend requires CUDA.")
    capability = torch.cuda.get_device_capability()
    if backend == AttentionBackend.FP8_FA3:
        if not _is_hopper():
            raise RuntimeError(
                f"FP8_FA3 requires Hopper (SM 9.x), got SM{capability[0]}{capability[1]}."
            )
        if not _is_fa3_available():
            raise RuntimeError(
                "FP8_FA3 requires the flash-attn package with FA3 support."
            )
    else:
        raise ValueError(f"Unknown backend: {backend}")


def apply_low_precision_attention(
    model: nn.Module,
    backend: Optional[AttentionBackend] = None,
    fuse_rope_using_torch_compile: bool = False,
) -> nn.Module:
    """Apply low-precision attention to a model.

    Must be called before ``torch.compile``. KV caching should be
    disabled before calling (e.g., ``config.use_cache = False`` for
    HuggingFace models).
    """
    if isinstance(model, _LowPrecisionAttentionWrapper):
        raise RuntimeError(
            "apply_low_precision_attention has already been applied to this module."
        )
    if isinstance(model, torch._dynamo.OptimizedModule):
        raise RuntimeError(
            "apply_low_precision_attention must be called before torch.compile."
        )

    if backend is None:
        backend = _get_available_backend()
    else:
        _check_backend_available(backend)

    if backend == AttentionBackend.FP8_FA3:
        return setup_fp8_backend(model, "FA3", fuse_rope_using_torch_compile)

    raise ValueError(f"Unknown backend: {backend}")
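Since `backend` defaults to `None`, the entry point can also auto-select via `_get_available_backend`. A minimal call sketch against the signature above (`model` is a placeholder):

```python
from torchao.prototype.attention import apply_low_precision_attention

# backend=None: picks FP8_FA3 on a Hopper GPU with FA3 available,
# otherwise raises RuntimeError.
model = apply_low_precision_attention(model)
```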
torchao/prototype/attention/fp8_fa3/__init__.py

Lines changed: 15 additions & 0 deletions
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD 3-Clause license found in the
# LICENSE file in the root directory of this source tree.

"""FP8 attention using FA3 backend."""

from torchao.prototype.attention.fp8_fa3.attention import fp8_fa3_sdpa
from torchao.prototype.attention.quantization import _fp8_sdpa_quantize

__all__ = [
    "fp8_fa3_sdpa",
    "_fp8_sdpa_quantize",
]
torchao/prototype/attention/fp8_fa3/attention.py

Lines changed: 22 additions & 0 deletions
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD 3-Clause license found in the
# LICENSE file in the root directory of this source tree.

"""FP8 SDPA using FA3 backend.

Thin wrapper around ``shared_utils/attention.py``. When using directly,
activate the FA3 flash attention implementation before calling.
"""

from functools import partial

from torchao.prototype.attention.shared_utils.attention import (
    _fp8_sdpa,
)

fp8_fa3_sdpa = partial(_fp8_sdpa, backend_name="FA3")
fp8_fa3_sdpa.__doc__ = _fp8_sdpa.__doc__
fp8_fa3_sdpa.__name__ = "fp8_fa3_sdpa"
fp8_fa3_sdpa.__qualname__ = "fp8_fa3_sdpa"
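Per the module docstring, direct use of `fp8_fa3_sdpa` requires activating the FA3 implementation first. The tests in this commit follow exactly this pattern; a sketch assuming `q`, `k`, `v` are 4D CUDA tensors:

```python
import torch
from torch.nn.attention import (
    activate_flash_attention_impl,
    restore_flash_attention_impl,
)

activate_flash_attention_impl("FA3")
try:
    with torch.no_grad():
        out = fp8_fa3_sdpa(q, k, v, is_causal=True)
finally:
    restore_flash_attention_impl()
```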
torchao/prototype/attention/fp8_fa3/setup.py

Lines changed: 27 additions & 0 deletions
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD 3-Clause license found in the
# LICENSE file in the root directory of this source tree.

"""FP8 FA3 backend setup."""

import torch.nn as nn

from torchao.prototype.attention.config import LowPrecisionAttentionConfig
from torchao.prototype.attention.shared_utils.setup import setup_fp8_backend


def setup_fp8_fa3(
    model: nn.Module,
    config: LowPrecisionAttentionConfig,
) -> nn.Module:
    """Set up FP8 FA3 attention on *model* and wrap it."""
    from torchao.prototype.attention.fp8_fa3.attention import fp8_fa3_sdpa

    return setup_fp8_backend(
        model,
        config,
        flash_impl_name="FA3",
        sdpa_fn=fp8_fa3_sdpa,
    )
torchao/prototype/attention/quantization/__init__.py

Lines changed: 15 additions & 0 deletions
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD 3-Clause license found in the
# LICENSE file in the root directory of this source tree.

"""Shared FP8 quantization kernels for low-precision attention."""

from torchao.prototype.attention.quantization.quantization import (
    _fp8_sdpa_quantize,
)

__all__ = [
    "_fp8_sdpa_quantize",
]
torchao/prototype/attention/quantization/quantization.py

Lines changed: 50 additions & 0 deletions
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD 3-Clause license found in the
# LICENSE file in the root directory of this source tree.

"""
FP8 quantization for attention inputs.
"""

from typing import Tuple

import torch


def _fp8_sdpa_quantize(
    q: torch.Tensor,
    k: torch.Tensor,
    v: torch.Tensor,
) -> Tuple[
    torch.Tensor,
    torch.Tensor,
    torch.Tensor,
    torch.Tensor,
    torch.Tensor,
    torch.Tensor,
]:
    """Quantize Q, K, V to FP8 with per-head scaling."""
    if q.dim() != 4:
        raise ValueError(f"Expected 4D tensor for q, got {q.dim()}D")
    if k.dim() != 4:
        raise ValueError(f"Expected 4D tensor for k, got {k.dim()}D")
    if v.dim() != 4:
        raise ValueError(f"Expected 4D tensor for v, got {v.dim()}D")
    if k.shape != v.shape:
        raise ValueError(f"K and V shape mismatch: {k.shape} vs {v.shape}")
    if q.shape[0] != k.shape[0]:
        raise ValueError(f"Batch size mismatch: {q.shape[0]} vs {k.shape[0]}")
    if q.shape[1] % k.shape[1] != 0:
        raise ValueError(
            f"Q head count ({q.shape[1]}) must be a multiple of K head count ({k.shape[1]})"
        )
    if q.shape[3] != k.shape[3]:
        raise ValueError(f"Head dim mismatch: {q.shape[3]} vs {k.shape[3]}")

    from torchao.prototype.attention.quantization.triton_qkv_quantization import (
        triton_fp8_sdpa_quantize,
    )

    return triton_fp8_sdpa_quantize(q, k, v)
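The fused Triton kernel itself is not shown in this excerpt. As a rough reference for what per-head FP8 quantization computes on one tensor, here is a pure-PyTorch sketch of the kernel's three phases; the `float8_e4m3fn` dtype, the epsilon, and the exact scale convention are assumptions, not taken from the diff:

```python
import torch


def _per_head_fp8_quantize_ref(t: torch.Tensor):
    """Reference per-head quantization for a (B, H, S, D) tensor.

    Mirrors the kernel's three phases: absmax reduction over each head,
    scale computation, then quantization. Dtype and eps are assumed.
    """
    fp8_max = torch.finfo(torch.float8_e4m3fn).max
    # Phase 1: per-head absmax over each (S, D) slice
    absmax = t.abs().amax(dim=(-2, -1), keepdim=True).float()
    # Phase 2: per-head scale (guard against all-zero heads)
    scale = (absmax / fp8_max).clamp_min(1e-12)
    # Phase 3: scale and cast to FP8
    t_fp8 = (t.float() / scale).clamp(-fp8_max, fp8_max).to(torch.float8_e4m3fn)
    return t_fp8, scale
```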
