
Commit 716c91c

fix: prevent inductor from fusing away bf16→fp32 cast in RoPE (pytorch#2575)
With compile.enable=true, Qwen3 produces different outputs from eager (max diff ~1.56 in bfloat16). Inductor traces the whole transformer block as one graph and legally eliminates the .to(dtype=xq.dtype) downcast between q_norm/k_norm and RoPE, keeping the multiply-add in fp32. The algebra is valid, but the dtype boundary no longer matches eager.

The fix is borrowed from apply_rotary_emb_complex: upcast xq/xk to float32 before the multiply-add instead of downcasting cos/sin to match. The fp32 compute is now unconditional in the graph, so Inductor has nothing to fuse away. Results are cast back with type_as at the end, as before.

Fixes Qwen3 and GPT-OSS (the only callers of apply_rotary_emb_cos_sin). RoPE now always computes in fp32 in eager too: slightly more accurate, matches HF Qwen3 behavior, and has no checkpoint impact.
1 parent 766f181 commit 716c91c

2 files changed

Lines changed: 74 additions & 4 deletions
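A quick way to sanity-check the fix is to compare eager and compiled outputs of the rotary embedding directly. Below is a minimal sketch, assuming apply_rotary_emb_cos_sin takes (xq, xk, rope_cache) as in the new test and that the cache packs the cos and sin halves along the last dimension; note the original divergence required q_norm/k_norm and RoPE in one Inductor graph, so this isolated check is a smoke test rather than a full reproduction of the bug.

import torch

from torchtitan.models.common.rope import apply_rotary_emb_cos_sin

torch.manual_seed(0)
xq = torch.randn(2, 16, 4, 64, dtype=torch.bfloat16)
xk = torch.randn(2, 16, 4, 64, dtype=torch.bfloat16)
rope_cache = torch.randn(16, 128, dtype=torch.float32)  # cos | sin along last dim

# Eager reference.
eager_q, eager_k = apply_rotary_emb_cos_sin(xq, xk, rope_cache)

# Compiled version. With the fp32 compute made unconditional, Inductor has no
# dtype boundary left to optimize away, so outputs should match eager exactly.
compiled_fn = torch.compile(apply_rotary_emb_cos_sin)
compiled_q, compiled_k = compiled_fn(xq, xk, rope_cache)

print((eager_q - compiled_q).abs().max().item())  # expect 0.0
print((eager_k - compiled_k).abs().max().item())  # expect 0.0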


tests/unit_tests/test_rope.py

Lines changed: 68 additions & 0 deletions
@@ -0,0 +1,68 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

import unittest

import torch

from torchtitan.models.common.rope import apply_rotary_emb_cos_sin


class TestApplyRotaryEmbCosSin(unittest.TestCase):
    def setUp(self):
        torch.manual_seed(42)
        self.bsz = 2
        self.seqlen = 16
        self.n_heads = 4
        self.head_dim = 64
        self.xq = torch.randn(
            self.bsz, self.seqlen, self.n_heads, self.head_dim, dtype=torch.bfloat16
        )
        self.xk = torch.randn(
            self.bsz, self.seqlen, self.n_heads, self.head_dim, dtype=torch.bfloat16
        )
        self.rope_cache = torch.randn(
            self.seqlen, self.head_dim * 2, dtype=torch.float32
        )

    def test_output_dtype_matches_input(self):
        xq_out, xk_out = apply_rotary_emb_cos_sin(self.xq, self.xk, self.rope_cache)
        self.assertEqual(xq_out.dtype, self.xq.dtype)
        self.assertEqual(xk_out.dtype, self.xk.dtype)

    def test_output_shape_matches_input(self):
        xq_out, xk_out = apply_rotary_emb_cos_sin(self.xq, self.xk, self.rope_cache)
        self.assertEqual(xq_out.shape, self.xq.shape)
        self.assertEqual(xk_out.shape, self.xk.shape)

    def test_computes_in_fp32(self):
        """Output must match a reference computed entirely in float32.

        Ensures inductor cannot fuse away the fp32 upcast when compiling
        adjacent ops (e.g. q_norm/k_norm) with the RoPE computation.
        """
        xq_out, xk_out = apply_rotary_emb_cos_sin(self.xq, self.xk, self.rope_cache)

        cos = self.rope_cache[..., : self.head_dim].unsqueeze(0).unsqueeze(2)
        sin = self.rope_cache[..., self.head_dim :].unsqueeze(0).unsqueeze(2)

        def rotate_half(x):
            half = x.shape[-1] // 2
            return torch.cat([-x[..., half:], x[..., :half]], dim=-1)

        xq_ref = (
            (self.xq.float() * cos) + (rotate_half(self.xq.float()) * sin)
        ).bfloat16()
        xk_ref = (
            (self.xk.float() * cos) + (rotate_half(self.xk.float()) * sin)
        ).bfloat16()

        self.assertEqual((xq_out - xq_ref).abs().max().item(), 0.0)
        self.assertEqual((xk_out - xk_ref).abs().max().item(), 0.0)


if __name__ == "__main__":
    unittest.main()
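Note that the fp32 test asserts exact equality (max abs diff of 0.0) rather than a tolerance: once the upcast is explicit, the function and the reference should follow the same fp32 multiply-add and bfloat16 rounding path, so any nonzero difference would indicate the dtype boundary has moved again.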

torchtitan/models/common/rope.py

Lines changed: 6 additions & 4 deletions
@@ -347,8 +347,10 @@ def apply_rotary_emb_cos_sin(
     """
     head_dim = xq.shape[-1]
     rope_cache = _reshape_for_broadcast_cos_sin(rope_cache, xq, positions)
-    cos = rope_cache[..., :head_dim].to(dtype=xq.dtype, device=xq.device)
-    sin = rope_cache[..., head_dim:].to(dtype=xq.dtype, device=xq.device)
-    xq_out = (xq * cos) + (_rotate_half(xq) * sin)
-    xk_out = (xk * cos) + (_rotate_half(xk) * sin)
+    cos = rope_cache[..., :head_dim].to(device=xq.device)
+    sin = rope_cache[..., head_dim:].to(device=xq.device)
+    xq_f = xq.float()
+    xk_f = xk.float()
+    xq_out = (xq_f * cos) + (_rotate_half(xq_f) * sin)
+    xk_out = (xk_f * cos) + (_rotate_half(xk_f) * sin)
     return xq_out.type_as(xq), xk_out.type_as(xk)
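For intuition on why the dtype boundary matters, here is a small standalone sketch (not part of the commit) contrasting the old eager path, which rounds cos/sin and the multiply-add to bfloat16, with the fp32-compute-then-downcast path that fused compilation effectively produced and that the fix now uses everywhere.

import torch

torch.manual_seed(0)
x = torch.randn(4, 64, dtype=torch.bfloat16)
cos = torch.randn(64, dtype=torch.float32)
sin = torch.randn(64, dtype=torch.float32)

def rotate_half(t):
    half = t.shape[-1] // 2
    return torch.cat([-t[..., half:], t[..., :half]], dim=-1)

# Old eager path: downcast cos/sin to bf16, do the multiply-add in bf16.
old = (x * cos.bfloat16()) + (rotate_half(x) * sin.bfloat16())

# What fused compilation effectively computed (and what the fix now does
# unconditionally): keep the multiply-add in fp32, downcast only at the end.
new = ((x.float() * cos) + (rotate_half(x.float()) * sin)).bfloat16()

# The two dtype boundaries round differently, so the bf16 outputs diverge.
print((old - new).abs().max().item())  # typically nonzero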
