Commit 28e6aca
fix(utils): propagate non_blocking in TorchAOBaseTensor._to_copy and _get_to_kwargs (#4297)
* fix(utils): propagate non_blocking in TorchAOBaseTensor._to_copy and _get_to_kwargs
## Problem
`_get_to_kwargs` explicitly discarded the `non_blocking` argument parsed from
`torch._C._nn._parse_to`, with a comment saying it is "not very useful for
most tensor subclasses". As a result, any call to `tensor.to(device,
non_blocking=True)` on a `TorchAOBaseTensor` subclass silently became a
blocking transfer at the inner-tensor level.
This matters in practice for async CPU→GPU offloading workflows such as
`diffusers` `enable_group_offload(use_stream=True)`: the diffusers hook
schedules copies with `non_blocking=True` so that the transfer stream and
the compute stream can overlap. Because the flag was dropped, all copies
became blocking, negating the overlap benefit.
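The calling pattern at issue looks roughly like this (a sketch; `offload_copy` is a hypothetical stand-in for the diffusers hook, and the copy is only truly asynchronous for pinned-CPU→CUDA transfers):

```python
import torch

# Sketch of the group-offload calling pattern: the hook issues the copy with
# non_blocking=True so a transfer stream can overlap the compute stream.
def offload_copy(t: torch.Tensor, device: str) -> torch.Tensor:
    # This is the flag TorchAOBaseTensor subclasses were silently dropping.
    return t.to(device, non_blocking=True)

x = torch.randn(8, 8)
device = "cuda" if torch.cuda.is_available() else "cpu"
y = offload_copy(x, device)
```

With the flag dropped at the inner-tensor level, each such copy serialized against the host, so the transfer/compute overlap never materialized.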
On AMD ROCm (gfx1xxx) the missing non_blocking also interacts with a
separate stream-ordering race (fixed in huggingface/diffusers#13502): the
default stream can race ahead of "blocking" copies that the OS scheduler
hasn't committed yet, producing device-mismatch errors in the first matmul.
## Fix
1. `_get_to_kwargs`: include `non_blocking` in the returned kwargs dict.
2. `TorchAOBaseTensor._to_copy.default`: pop `non_blocking` from kwargs and
forward it to every inner `.to()` call for both `tensor_data_names` and
`optional_tensor_data_names`.
The change is backward-compatible: when `non_blocking=False` (the default),
behaviour is identical to before.
## Tested on
- 5× AMD RX 7800 XT (gfx1101), ROCm 7.1, PyTorch 2.7
- FLUX.1-dev int8 (`Int8WeightOnlyConfig`) with `enable_group_offload(use_stream=True)`
- Companion fix in diffusers: huggingface/diffusers#13502
* test(utils): add non_blocking propagation test for _get_to_kwargs
Verifies the contract change in TorchAOBaseTensor._get_to_kwargs:
the returned kwargs dict now includes `non_blocking`, propagated
from the original `.to(device, non_blocking=...)` call.
Covers three cases: explicit True, explicit False, and default
(unspecified). Runs on CPU only, no @skip_if_no_cuda needed.
Addresses review feedback on PR #4297.
2 files changed: 43 additions & 10 deletions