
Commit eec4f53

[Compile] Reduce guards and recompiles for TensorDict under torch.compile
Four optimizations to make tensordict more compile-friendly:

1. Per-field property descriptors for tensor_only TensorClasses: bypass the generic __getattr__ dispatch, reducing Dynamo guards on attribute access.
2. Lighter clone(recurse=False): uses a dict.update() fast path under is_compiling() to avoid the dict comprehension and _clone_value overhead.
3. Lighter UnbatchedTensor.clone(): bypasses TensorClass.__init__ tracing under compile via __new__ plus direct attribute setting.
4. allow_in_graph wrapper for _foreach_copy_ in update_(): treats the bulk copy as a single graph node, reducing per-tensor guards.

Includes guard-count tests in TestGuardCount that verify no recompilation after warm-up for each optimized path; a usage sketch of the targeted paths follows below.

Made-with: Cursor
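For orientation, here is a minimal sketch of the kind of workload these changes target (names are illustrative, not taken from the diff): a compiled step that hits the shallow-clone fast path (2) and the bulk in-place update (4).

```python
# Illustrative only: a compiled function exercising clone(recurse=False)
# and update_(), two of the paths optimized in this commit.
import torch
from tensordict import TensorDict

def step(td, src):
    out = td.clone(recurse=False)  # dict.update() fast path under compile
    out.update_(src)               # _foreach_copy_ as a single graph node
    return out["x"] + 1

td = TensorDict({"x": torch.zeros(4)}, batch_size=[4])
src = TensorDict({"x": torch.ones(4)}, batch_size=[4])
step_c = torch.compile(step, fullgraph=True)
step_c(td, src)  # first call compiles; later calls should reuse the cache
```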
1 parent c7d9cc0 commit eec4f53

6 files changed

Lines changed: 316 additions & 2 deletions


benchmarks/compile/compile_td_test.py

Lines changed: 116 additions & 0 deletions
```diff
@@ -400,6 +400,122 @@ def test_compile_replace(mode, variant, benchmark):
     benchmark(func, s)
 
 
+
+# ── Attribute-access benchmarks ──────────────────────────────────────────
+
+
+@tensorclass(tensor_only=True)
+class BigTC20:
+    f0: torch.Tensor
+    f1: torch.Tensor
+    f2: torch.Tensor
+    f3: torch.Tensor
+    f4: torch.Tensor
+    f5: torch.Tensor
+    f6: torch.Tensor
+    f7: torch.Tensor
+    f8: torch.Tensor
+    f9: torch.Tensor
+    f10: torch.Tensor
+    f11: torch.Tensor
+    f12: torch.Tensor
+    f13: torch.Tensor
+    f14: torch.Tensor
+    f15: torch.Tensor
+    f16: torch.Tensor
+    f17: torch.Tensor
+    f18: torch.Tensor
+    f19: torch.Tensor
+
+
+def _get_big_tc20():
+    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+    kwargs = {f"f{i}": torch.randn(4, device=device) for i in range(20)}
+    return BigTC20(**kwargs, batch_size=[4], device=device)
+
+
+def tc_getattr_sum(tc):
+    total = tc.f0
+    for i in range(1, 20):
+        total = total + getattr(tc, f"f{i}")
+    return total
+
+
+@pytest.mark.skipif(
+    TORCH_VERSION < version.parse("2.4.0"), reason="requires torch>=2.4"
+)
+@pytest.mark.parametrize("mode", ["eager", "compile"])
+def test_compile_tc_getattr_20(mode, benchmark):
+    func = tc_getattr_sum
+    if mode == "compile":
+        func = torch.compile(func, fullgraph=True, mode="reduce-overhead")
+    tc = _get_big_tc20()
+    func(tc)
+    func(tc)
+    benchmark(func, tc)
+
+
+# ── Shallow clone benchmarks ────────────────────────────────────────────
+
+def clone_shallow(td):
+    return td.clone(recurse=False)
+
+
+def _get_flat_td_n(n):
+    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+    return TensorDict(
+        {f"k{i}": torch.randn(4, device=device) for i in range(n)},
+        batch_size=[4],
+        device=device,
+    )
+
+
+@pytest.mark.skipif(
+    TORCH_VERSION < version.parse("2.4.0"), reason="requires torch>=2.4"
+)
+@pytest.mark.parametrize("mode", ["eager", "compile"])
+@pytest.mark.parametrize("n_fields", [20, 40, 80])
+def test_compile_clone_shallow(mode, n_fields, benchmark):
+    td = _get_flat_td_n(n_fields)
+    func = clone_shallow
+    if mode == "compile":
+        func = torch.compile(func, fullgraph=True, mode="reduce-overhead")
+    func(td)
+    func(td)
+    benchmark(func, td)
+
+
+# ── update_ benchmarks ──────────────────────────────────────────────────
+
+def update_inplace(td, src):
+    td.update_(src)
+    return td
+
+
+@pytest.mark.skipif(
+    TORCH_VERSION < version.parse("2.4.0"), reason="requires torch>=2.4"
+)
+@pytest.mark.parametrize("mode", ["eager", "compile"])
+def test_compile_update_inplace(mode, benchmark):
+    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+    td = TensorDict(
+        {f"k{i}": torch.randn(4, device=device) for i in range(20)},
+        batch_size=[4],
+        device=device,
+    )
+    src = TensorDict(
+        {f"k{i}": torch.ones(4, device=device) for i in range(20)},
+        batch_size=[4],
+        device=device,
+    )
+    func = update_inplace
+    if mode == "compile":
+        func = torch.compile(func, fullgraph=True, mode="reduce-overhead")
+    func(td, src)
+    func(td, src)
+    benchmark(func, td, src)
+
+
 if __name__ == "__main__":
     args, unknown = argparse.ArgumentParser().parse_known_args()
     pytest.main([__file__, "--capture", "no", "--exitfirst"] + unknown)
```
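The module's own __main__ block runs everything through pytest. A narrower run of just the new benchmarks might look like the snippet below; the -k expression is an assumption, not part of the diff, and the benchmark fixture used above requires pytest-benchmark.

```python
# Hypothetical narrower invocation of the new benchmarks only.
import pytest

pytest.main(
    [
        "benchmarks/compile/compile_td_test.py",
        "-k",
        "tc_getattr or clone_shallow or update_inplace",
        "--capture",
        "no",
    ]
)
```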

tensordict/_td.py

Lines changed: 5 additions & 0 deletions
```diff
@@ -3397,6 +3397,11 @@ def _clone(self, recurse: bool = True) -> Self:
         if recurse and self.device is not None:
             return self._clone_recurse()
 
+        if not recurse and is_compiling():
+            result = TensorDict(batch_size=self.batch_size, device=self.device)
+            result._tensordict.update(self._tensordict)
+            return result
+
         result = self._new_unsafe(
             source={key: _clone_value(value, recurse) for key, value in self.items()},
             batch_size=self.batch_size,
```
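The fast path replaces the per-value _clone_value comprehension with a single dict.update() over the internal storage, which is sound because a shallow clone shares its leaves anyway. A small eager-mode sketch of the invariant being preserved (illustrative, not part of the diff):

```python
# A shallow clone shares leaf tensors but owns its own key -> value map.
import torch
from tensordict import TensorDict

td = TensorDict({"a": torch.zeros(3)}, batch_size=[3])
shallow = td.clone(recurse=False)
assert shallow["a"] is td["a"]    # leaves are shared, not copied
shallow["b"] = torch.ones(3)      # new keys do not leak back to the source
assert "b" not in td.keys()
```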

tensordict/_unbatched.py

Lines changed: 13 additions & 0 deletions
```diff
@@ -9,9 +9,15 @@
 from typing import Any, Callable, TYPE_CHECKING
 
 import torch
+from tensordict._td import TensorDict
 from tensordict._tensorcollection import TensorCollection
 from tensordict.base import TensorDictBase
 
+try:
+    from torch.compiler import is_compiling
+except ImportError:
+    from torch._dynamo import is_compiling
+
 from tensordict.tensorclass import (
     _arg_to_tensordict,
     _from_tensordict_with_copy,
@@ -438,6 +444,13 @@ def flatten(self, start_dim: int = 0, end_dim=-1): ...
     def clone(self, recurse: bool = True):
         """Clones the UnbatchedTensor, preserving the batch_size."""
         data = self.data.clone() if recurse else self.data
+        if is_compiling():
+            result = UnbatchedTensor.__new__(UnbatchedTensor)
+            td = TensorDict(source={"data": data}, batch_size=[])
+            td._batch_size = self.batch_size
+            object.__setattr__(result, "_tensordict", td)
+            object.__setattr__(result, "_non_tensordict", {})
+            return result
         result = type(self)(data=data)
         result.batch_size = self.batch_size
         return result
```
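The __new__ plus object.__setattr__ combination is a general trick for skipping __init__ logic that Dynamo would otherwise have to trace. A standalone sketch of the pattern, with illustrative names rather than tensordict internals:

```python
# Allocate with __new__ and set fields directly, bypassing __init__.
class Box:
    __slots__ = ("payload",)

    def __init__(self, payload):
        # imagine validation or bookkeeping here that is costly to trace
        self.payload = payload

def fast_copy(box: Box) -> Box:
    out = Box.__new__(Box)                       # skip __init__ entirely
    object.__setattr__(out, "payload", box.payload)
    return out

b = fast_copy(Box(42))
assert b.payload == 42
```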

tensordict/base.py

Lines changed: 16 additions & 1 deletion
```diff
@@ -127,6 +127,16 @@
 except ImportError:
     _foreach_copy_ = None
 
+try:
+    from torch.compiler import allow_in_graph as _allow_in_graph
+except (ImportError, AttributeError):
+    _allow_in_graph = None
+
+if _foreach_copy_ is not None and _allow_in_graph is not None:
+    _foreach_copy_compiled = _allow_in_graph(_foreach_copy_)
+else:
+    _foreach_copy_compiled = _foreach_copy_
+
 try:
     from torch.nn.parameter import Buffer
 except ImportError:
@@ -8295,7 +8305,12 @@ def inplace_update(name, source, dest):
                     if len(other_val) != len(vals):
                         vals = dict(zip(keys, vals))
                         vals = [vals[k] for k in new_keys]
-                    _foreach_copy_(vals, other_val, non_blocking=non_blocking)
+                    copy_fn = (
+                        _foreach_copy_compiled
+                        if is_compiling()
+                        else _foreach_copy_
+                    )
+                    copy_fn(vals, other_val, non_blocking=non_blocking)
                     return self
             named = True
 
```
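allow_in_graph tells Dynamo to treat a callable as one opaque graph node instead of tracing into its Python body, so the per-tensor loop inside the foreach dispatch never generates guards. A minimal sketch of the wrapping mechanics, assuming torch.compiler.allow_in_graph and torch._foreach_copy_ are available (the diff above guards both behind try/except):

```python
import torch

def bulk_copy_(dests, srcs):
    # one fused op over the whole list of tensors
    torch._foreach_copy_(dests, srcs)

# Wrapped, the helper appears in the graph as a single call node.
bulk_copy_opaque = torch.compiler.allow_in_graph(bulk_copy_)

@torch.compile
def refresh(dests, srcs):
    bulk_copy_opaque(dests, srcs)
    return dests[0] + 0

dests = [torch.zeros(2) for _ in range(3)]
srcs = [torch.ones(2) for _ in range(3)]
refresh(dests, srcs)
assert all(d.eq(1).all() for d in dests)
```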

tensordict/tensorclass.py

Lines changed: 28 additions & 1 deletion
```diff
@@ -995,6 +995,27 @@ def __torch_function__(
             delattr(cls, field.name)
         except AttributeError:
             pass
+
+    if tensor_only:
+        for field in cls.fields():
+            name = field.name
+
+            def _make_prop(key):
+                def _getter(self):
+                    out = self._tensordict._get_str(key, _UNSET)
+                    if out is _UNSET:
+                        out = self._non_tensordict.get(key, _UNSET)
+                        if out is _UNSET:
+                            raise AttributeError(key)
+                        return out
+                    if _is_unbatched(out):
+                        return out.data
+                    return out
+
+                return property(_getter)
+
+            setattr(cls, name, _make_prop(name))
+
     _get_type_hints(cls, tensor_only=tensor_only)
     # Detect user-defined __setattr__ that must be called during init.
     # After dataclass(), frozen=True adds a guard __setattr__, which is not
@@ -1903,7 +1924,13 @@ def _setattr_tensor_only(self, key: str, value: Any) -> None:  # noqa: D417
         or "_non_tensordict" not in __dict__
         or (
             not self._shadow
-            and (key in SET_ATTRIBUTES or key in type(self).__dict__)
+            and (
+                key in SET_ATTRIBUTES
+                or (
+                    key in type(self).__dict__
+                    and key not in self.__expected_keys__
+                )
+            )
         )
     ):
         return self.__setattr_parent__(key, value)
```
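Generating one property per field gives Dynamo a concrete descriptor to specialize on instead of the generic __getattr__ path. The _make_prop factory matters: it captures each key by value, avoiding Python's late-binding-in-loops pitfall. A standalone sketch of the technique with illustrative names:

```python
# Per-field property generation via a closure factory.
class Record:
    def __init__(self, **fields):
        self._storage = dict(fields)

def _make_prop(key):
    def _getter(self):
        return self._storage[key]
    return property(_getter)  # `key` is bound per call, not per loop

for name in ("x", "y"):
    setattr(Record, name, _make_prop(name))

r = Record(x=1, y=2)
assert (r.x, r.y) == (1, 2)
```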

test/test_compile.py

Lines changed: 138 additions & 0 deletions
```diff
@@ -37,10 +37,12 @@
     TensorDictSequential as Seq,
 )
 
+from tensordict._unbatched import UnbatchedTensor
 from tensordict.nn.functional_modules import _exclude_td_from_pytree
 
 from tensordict.tensorclass import TensorClass
 
+from torch._dynamo.testing import CompileCounterWithBackend
 from torch.utils._pytree import SUPPORTED_NODES, tree_map
 
 TORCH_VERSION = version.parse(version.parse(torch.__version__).base_version)
@@ -1623,6 +1625,142 @@ def fn(a):
         torch.testing.assert_close(result, inp * 2)
 
 
+def _count_compiles(fn, *args):
+    """Compile fn, run it twice, return (frame_count_first, frame_count_second).
+
+    Uses CompileCounterWithBackend("eager") so the function actually executes.
+    """
+    torch._dynamo.reset_code_caches()
+    cnt = CompileCounterWithBackend("eager")
+    compiled = torch.compile(fn, backend=cnt)
+    compiled(*args)
+    first = cnt.frame_count
+    compiled(*args)
+    second = cnt.frame_count
+    return first, second
+
+
+@pytest.mark.skipif(
+    TORCH_VERSION < version.parse("2.4.0"), reason="requires torch>=2.4"
+)
+class TestGuardCount:
+    """Tests that verify compile guard/recompile counts for optimized paths."""
+
+    def test_clone_recurse_false_no_recompile(self):
+        def fn(td):
+            c = td.clone(recurse=False)
+            return c["a"] + 1
+
+        td = TensorDict(
+            {"a": torch.randn(4), **{f"key_{i}": torch.randn(4) for i in range(20)}},
+            batch_size=[4],
+        )
+        first, second = _count_compiles(fn, td)
+        assert first == 1, f"Expected 1 compile frame, got {first}"
+        assert second == 1, f"Recompilation detected: {second} frames"
+
+    def test_tc_getattr_no_recompile(self):
+        class BigTC(TensorClass["nocast"]):
+            a: torch.Tensor
+            b: torch.Tensor
+            c: torch.Tensor
+            d: torch.Tensor
+            e: torch.Tensor
+
+        def fn(tc):
+            return tc.a + tc.b + tc.c + tc.d + tc.e
+
+        tc = BigTC(
+            a=torch.randn(4),
+            b=torch.randn(4),
+            c=torch.randn(4),
+            d=torch.randn(4),
+            e=torch.randn(4),
+            batch_size=[4],
+        )
+        first, second = _count_compiles(fn, tc)
+        assert first == 1, f"Expected 1 compile frame, got {first}"
+        assert second == 1, f"Recompilation detected: {second} frames"
+
+    def test_replace_no_recompile(self):
+        class State(TensorClass["nocast"]):
+            x: torch.Tensor
+            y: torch.Tensor
+            z: torch.Tensor
+
+        def fn(s):
+            s = s.replace(x=s.x + 1)
+            s = s.replace(y=s.y + 2)
+            s = s.replace(x=s.x + s.y, z=s.z + 1)
+            return s
+
+        s = State(
+            x=torch.randn(4),
+            y=torch.randn(4),
+            z=torch.randn(4),
+            batch_size=[4],
+        )
+        first, second = _count_compiles(fn, s)
+        assert first == 1, f"Expected 1 compile frame, got {first}"
+        assert second == 1, f"Recompilation detected: {second} frames"
+
+    def test_update_inplace_no_recompile(self):
+        def fn(td, src):
+            td.update_(src)
+            return td["a"] + 0
+
+        td = TensorDict(
+            {"a": torch.randn(4), "b": torch.randn(4)},
+            batch_size=[4],
+        )
+        src = TensorDict(
+            {"a": torch.ones(4), "b": torch.ones(4)},
+            batch_size=[4],
+        )
+        first, second = _count_compiles(fn, td, src)
+        assert first == 1, f"Expected 1 compile frame, got {first}"
+        assert second == 1, f"Recompilation detected: {second} frames"
+
+    def test_unbatched_clone_no_recompile(self):
+        def fn(td):
+            c = td.clone()
+            return c["a"] + 0
+
+        td = TensorDict(
+            {
+                "a": torch.randn(4, 3),
+                "unbatched": UnbatchedTensor(data=torch.randn(5)),
+            },
+            batch_size=[4],
+        )
+        first, second = _count_compiles(fn, td)
+        assert first == 1, f"Expected 1 compile frame, got {first}"
+        assert second == 1, f"Recompilation detected: {second} frames"
+
+    def test_unbatched_clone_preserves_semantics(self):
+        """Cloning an UnbatchedTensor must produce independent data."""
+        torch._dynamo.reset_code_caches()
+
+        def fn(td):
+            cloned = td.clone()
+            return cloned
+
+        td = TensorDict(
+            {
+                "a": torch.randn(4, 3),
+                "unbatched": UnbatchedTensor(data=torch.randn(5)),
+            },
+            batch_size=[4],
+        )
+        fn_c = torch.compile(fn, fullgraph=True)
+        result = fn_c(td)
+        ut_orig = td.get("unbatched")
+        ut_clone = result.get("unbatched")
+        assert ut_clone.data.data_ptr() != ut_orig.data.data_ptr(), (
+            "clone() must produce independent data"
+        )
+
+
 if __name__ == "__main__":
     args, unknown = argparse.ArgumentParser().parse_known_args()
     pytest.main([__file__, "--capture", "no", "--exitfirst"] + unknown)
```
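CompileCounterWithBackend wraps a real backend and counts compiled frames, so a stable frame_count across calls is what these tests use as a proxy for "no recompilation". A minimal sketch of that mechanism (the behavior on the third call depends on dynamic-shape settings and torch version, hence no assert there):

```python
import torch
from torch._dynamo.testing import CompileCounterWithBackend

cnt = CompileCounterWithBackend("eager")  # real backend, outputs stay correct

@torch.compile(backend=cnt)
def f(x):
    return x * 2

f(torch.ones(2))
assert cnt.frame_count == 1   # first call compiles one frame
f(torch.ones(2))
assert cnt.frame_count == 1   # guards hold: cache hit, no recompile
f(torch.ones(3))              # new shape may trigger a recompile...
print(cnt.frame_count)        # ...depending on dynamic-shape settings
```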
