 
 import argparse
 import gc
+from contextlib import nullcontext
 
 import torch
 import torch._dynamo
 from lm_eval.models.huggingface import HFLM
 from torch._inductor.compile_fx import compile_fx
 from torch.nn.attention import (
+    SDPBackend,
     activate_flash_attention_impl,
     restore_flash_attention_impl,
+    sdpa_kernel,
 )
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
         "flash_impl": None,
         "fp8": False,
         "label": "FA2 BF16",
+        "sdpa_backend": SDPBackend.FLASH_ATTENTION,
     },
     "fa3": {
         "flash_impl": "FA3",
         "fp8": False,
         "label": "FA3 BF16",
+        "sdpa_backend": SDPBackend.FLASH_ATTENTION,
     },
     "fa3_fp8": {
         "flash_impl": "FA3",
@@ -120,8 +125,9 @@ def mask_strip_backend(gm, example_inputs):
 
 
 def setup_backend(orig_model, backend_name, compile_flag):
-    """Set up a backend and return (model, flash_impl)."""
+    """Set up a backend and return (model, flash_impl, sdpa_backend)."""
     cfg = BACKENDS[backend_name]
+    sdpa_backend = cfg.get("sdpa_backend")
 
     if cfg["fp8"]:
         print(f"  Applying low-precision FP8 attention ({backend_name})...")
@@ -136,7 +142,7 @@ def setup_backend(orig_model, backend_name, compile_flag):
         if compile_flag:
             print(f"  Compiling model with torch.compile ({backend_name})...")
             model = torch.compile(model)
-        return model, cfg["flash_impl"]
+        return model, cfg["flash_impl"], sdpa_backend
     else:
         if compile_flag:
             print(f"  Compiling model with torch.compile ({backend_name})...")
@@ -146,22 +152,24 @@ def setup_backend(orig_model, backend_name, compile_flag):
             model = _compile_with_mask_strip(
                 orig_model, flash_impl_name=cfg["flash_impl"]
             )
-            return model, cfg["flash_impl"]
+            return model, cfg["flash_impl"], sdpa_backend
         # Restore use_cache in case a prior setup disabled it.
         orig_model.config.use_cache = True
-        return orig_model, cfg["flash_impl"]
+        return orig_model, cfg["flash_impl"], sdpa_backend
 
 
-def evaluate_perplexity(model, tokenizer, flash_impl) -> float:
+def evaluate_perplexity(model, tokenizer, flash_impl, sdpa_backend=None) -> float:
     # Evaluate perplexity on WikiText-2 using lm_eval.
     if flash_impl:
         activate_flash_attention_impl(flash_impl)
+    ctx = sdpa_kernel(sdpa_backend) if sdpa_backend is not None else nullcontext()
     try:
-        results = evaluator.simple_evaluate(
-            HFLM(pretrained=model, tokenizer=tokenizer),
-            tasks=["wikitext"],
-            batch_size=1,
-        )
+        with ctx:
+            results = evaluator.simple_evaluate(
+                HFLM(pretrained=model, tokenizer=tokenizer),
+                tasks=["wikitext"],
+                batch_size=1,
+            )
     finally:
         if flash_impl:
             restore_flash_attention_impl()
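The pattern above leans on `torch.nn.attention.sdpa_kernel` (available in recent PyTorch releases) to pin `scaled_dot_product_attention` to a single backend, with `contextlib.nullcontext` as a no-op fallback when a backend config carries no `sdpa_backend`. A minimal standalone sketch of that pinning pattern, separate from the benchmark script itself (the tensor shapes, bfloat16 dtype, and CUDA device are illustrative assumptions, not taken from the commit):

```python
# Minimal sketch of the sdpa_kernel / nullcontext pinning pattern; shapes,
# dtype, and the CUDA device are illustrative assumptions, not part of the
# benchmark script above.
from contextlib import nullcontext

import torch
import torch.nn.functional as F
from torch.nn.attention import SDPBackend, sdpa_kernel


def attend(q, k, v, backend=None):
    # Restrict SDPA to one backend when requested; otherwise let PyTorch pick.
    ctx = sdpa_kernel(backend) if backend is not None else nullcontext()
    with ctx:
        return F.scaled_dot_product_attention(q, k, v)


q = k = v = torch.randn(1, 8, 128, 64, device="cuda", dtype=torch.bfloat16)
out = attend(q, k, v, backend=SDPBackend.FLASH_ATTENTION)
```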
@@ -178,26 +186,33 @@ def benchmark_runtime(
     flash_impl,
     num_warmup,
     num_iters,
+    sdpa_backend=None,
 ) -> float:
     """Benchmark forward-pass latency at a given sequence length. Returns median ms."""
     input_ids = torch.randint(0, vocab_size, (1, seq_len), device=device)
 
     if flash_impl:
         activate_flash_attention_impl(flash_impl)
+    ctx = sdpa_kernel(sdpa_backend) if sdpa_backend is not None else nullcontext()
     try:
-        # Warmup
-        for _ in range(num_warmup):
-            model(input_ids)
-        torch.cuda.synchronize()
-
-        start_events = [torch.cuda.Event(enable_timing=True) for _ in range(num_iters)]
-        end_events = [torch.cuda.Event(enable_timing=True) for _ in range(num_iters)]
-
-        for i in range(num_iters):
-            start_events[i].record()
-            model(input_ids)
-            end_events[i].record()
-        torch.cuda.synchronize()
+        with ctx:
+            # Warmup
+            for _ in range(num_warmup):
+                model(input_ids)
+            torch.cuda.synchronize()
+
+            start_events = [
+                torch.cuda.Event(enable_timing=True) for _ in range(num_iters)
+            ]
+            end_events = [
+                torch.cuda.Event(enable_timing=True) for _ in range(num_iters)
+            ]
+
+            for i in range(num_iters):
+                start_events[i].record()
+                model(input_ids)
+                end_events[i].record()
+            torch.cuda.synchronize()
     finally:
         if flash_impl:
             restore_flash_attention_impl()
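`benchmark_runtime` times each iteration with paired CUDA events inside the pinned-backend context; the hunk ends before the reduction, but turning those events into the median latency the docstring promises would look roughly like the helper below (an illustrative sketch, not the script's own code):

```python
# Illustrative reduction from paired CUDA events to a median latency in
# milliseconds; the actual script presumably does something equivalent after
# the timing loop shown above.
import statistics

import torch


def median_ms(start_events, end_events):
    torch.cuda.synchronize()  # ensure every recorded event has completed
    times_ms = [s.elapsed_time(e) for s, e in zip(start_events, end_events)]
    return statistics.median(times_ms)
```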
@@ -258,22 +273,24 @@ def run_benchmark(
 
     # --- Baseline perplexity ---
     print(f"\nComputing perplexity with {baseline_label}...")
-    baseline_model, baseline_flash = setup_backend(
+    baseline_model, baseline_flash, baseline_sdpa = setup_backend(
         orig_model,
         baseline_backend,
         compile,
     )
-    baseline_ppl = evaluate_perplexity(baseline_model, tokenizer, baseline_flash)
+    baseline_ppl = evaluate_perplexity(
+        baseline_model, tokenizer, baseline_flash, baseline_sdpa
+    )
     print(f"  {baseline_label} perplexity: {baseline_ppl:.2f}")
 
     # --- Test perplexity ---
     print(f"\nComputing perplexity with {test_label}...")
-    test_model, test_flash = setup_backend(
+    test_model, test_flash, test_sdpa = setup_backend(
         orig_model,
         test_backend,
         compile,
     )
-    test_ppl = evaluate_perplexity(test_model, tokenizer, test_flash)
+    test_ppl = evaluate_perplexity(test_model, tokenizer, test_flash, test_sdpa)
     print(f"  {test_label} perplexity: {test_ppl:.2f}")
 
     print(f"\nDelta: {test_ppl - baseline_ppl:+.2f}")
@@ -289,7 +306,7 @@ def run_benchmark(
 
     # --- Baseline runtime (all sequence lengths) ---
     print(f"\nRunning baseline ({baseline_label})...")
-    baseline_model, baseline_flash = setup_backend(
+    baseline_model, baseline_flash, baseline_sdpa = setup_backend(
         orig_model,
         baseline_backend,
         compile,
@@ -305,6 +322,7 @@ def run_benchmark(
             baseline_flash,
             num_warmup,
             num_runtime_iters,
+            sdpa_backend=baseline_sdpa,
         )
         baseline_runtimes[S] = ms
         print(f"  seq_len={S:>6}: {ms:.1f} ms")
@@ -319,7 +337,7 @@ def run_benchmark(
 
     # --- Test runtime (all sequence lengths) ---
     print(f"\nRunning test ({test_label})...")
-    test_model, test_flash = setup_backend(
+    test_model, test_flash, test_sdpa = setup_backend(
         orig_model,
         test_backend,
         compile,
@@ -335,6 +353,7 @@ def run_benchmark(
             test_flash,
             num_warmup,
             num_runtime_iters,
+            sdpa_backend=test_sdpa,
         )
         test_runtimes[S] = ms
         print(f"  seq_len={S:>6}: {ms:.1f} ms")
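With these changes, `setup_backend` returns a three-tuple and the third element simply threads through to the evaluation helpers. A hedged sketch of the resulting call pattern (`orig_model` and `tokenizer` stand for whatever the script loads earlier; `"fa3"` is just one of the `BACKENDS` keys shown above):

```python
# Sketch of the call pattern after this commit; orig_model and tokenizer are
# assumed to be the HF model/tokenizer the script loads elsewhere, and "fa3"
# is one of the BACKENDS entries shown above.
model, flash_impl, sdpa_backend = setup_backend(orig_model, "fa3", False)
ppl = evaluate_perplexity(model, tokenizer, flash_impl, sdpa_backend)
print(f"FA3 BF16 perplexity: {ppl:.2f}")
```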