Skip to content

Commit 712c565

Browse files
committed
Update on "Remove WeightTensorWithLinearActivationScaleMetadata and related code"
Summary: Delete `linear_activation_scale.py` which defined `WeightTensorWithLinearActivationScaleMetadata` and its helper `to_weight_tensor_with_linear_activation_scale_metadata`. Remove the import and `__all__` entry from `torchao/quantization/__init__.py`. Test Plan: python -c "import torchao.quantization" [ghstack-poisoned]
2 parents 0e1ceff + 30a1614 commit 712c565

File tree

3 files changed

+94
-6
lines changed

3 files changed

+94
-6
lines changed

.github/scripts/ci_test_xpu.sh

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,8 @@ python3 -c "import torch; import torchao; print(f'Torch version: {torch.__versio
1414

1515
pip install pytest expecttest parameterized accelerate hf_transfer 'modelscope!=1.15.0' transformers tabulate fire
1616

17-
pytest -v -s torchao/test/quantization/pt2e/ \
17+
pytest -v -s --ignore=torchao/test/quantization/pt2e/test_x86inductor_fusion.py \
18+
torchao/test/quantization/pt2e/ \
1819
torchao/test/quantization/*.py \
1920
torchao/test/dtypes/ \
2021
torchao/test/float8/ \

test/quantization/pt2e/test_quantize_pt2e.py

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3330,6 +3330,74 @@ def has_inplace_ops(graph_module: torch.fx.GraphModule) -> bool:
33303330
result = m(*example_inputs)
33313331
self.assertIsNotNone(result)
33323332

3333+
def test_quantize_in_place_index_put(self):
    """In-place ``index_put_`` on a named buffer must survive convert_pt2e.

    Builds a quantizer that annotates ``aten.index_put_`` (sharing one
    qspec between the destination buffer, the written values, and the
    output), exports/prepares/converts a tiny module, and checks that the
    mutated buffer is NOT constant-folded away (it would otherwise be
    renamed ``_frozen_param0``).
    """

    class IndexPutQuantizer(Quantizer):
        def __init__(self) -> None:
            super().__init__()
            # Symmetric int8 per-tensor spec used for every annotated edge.
            self.qspec = QuantizationSpec(
                dtype=torch.int8,
                observer_or_fake_quant_ctr=observer.default_observer,
                quant_min=-128,
                quant_max=127,
                qscheme=torch.per_tensor_symmetric,
            )

        def annotate(self, model: torch.fx.GraphModule) -> torch.fx.GraphModule:
            for node in model.graph.nodes:
                if node.op != "call_function":
                    continue
                if node.target != torch.ops.aten.index_put_.default:
                    continue

                # index_put_(dst, indices, values): quantize dst directly,
                # and tie values + output to dst's quantization params.
                dst = node.args[0]
                value = node.args[2]
                node.meta["quantization_annotation"] = QuantizationAnnotation(
                    input_qspec_map={
                        dst: self.qspec,
                        value: SharedQuantizationSpec((dst, node)),
                    },
                    output_qspec=SharedQuantizationSpec((dst, node)),
                    _annotated=True,
                )
            return model

        def transform_for_annotation(
            self, model: torch.fx.GraphModule
        ) -> torch.fx.GraphModule:
            # No pre-annotation graph transform needed for this test.
            return model

        def validate(self, model: torch.fx.GraphModule) -> None:
            return None

    class M(torch.nn.Module):
        """Writes ``x`` into a registered buffer at ``idx`` in place."""

        def __init__(self) -> None:
            super().__init__()
            self.register_buffer("buf", torch.zeros(4, dtype=torch.float32))

        def forward(self, x: torch.Tensor, idx: torch.Tensor) -> torch.Tensor:
            updated = self.buf.index_put_((idx,), x)
            # clone() so the graph output is not aliased to the buffer.
            return updated.clone()

    m = M().eval()
    quantizer = IndexPutQuantizer()
    example_inputs = (
        torch.tensor([1.0, 2.0], dtype=torch.float32),
        torch.tensor([1, 3], dtype=torch.int64),
    )
    m = torch.export.export(m, example_inputs, strict=True).module()

    m = prepare_pt2e(m, quantizer)
    # Run once so observers record calibration statistics.
    m(*example_inputs)
    m = convert_pt2e(m, fold_quantize=True)

    # Check that the named buffer is not folded
    # If it folded it will be named _frozen_param0
    self.assertTrue("buf" in dict(m.named_buffers()))

    # Verify the quantized model works
    result = m(*example_inputs)
    self.assertIsNotNone(result)
3400+
33333401
def test_scan_op_quantization(self):
33343402
"""Test that prepare_pt2e and convert_pt2e correctly quantize ops
33353403
inside the combine_fn subgraph of torch._higher_order_ops.scan.

torchao/quantization/pt2e/constant_fold.py

Lines changed: 24 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
import torch
1111
import torch.utils._pytree as pytree
1212
from torch._inductor.freezing_utils import maybe_set_is_frozen_param
13+
from torch.ao.quantization.fx.utils import collect_producer_nodes
1314
from torch.utils._ordered_set import OrderedSet
1415

1516
aten = torch.ops.aten
@@ -94,19 +95,37 @@ def __init__(
9495
# Identify mutable buffers by finding copy_ operations
9596
self.mutable_buffers = self._find_mutable_buffers()
9697

98+
def _is_mutable_buffer(self, node: torch.fx.Node) -> bool:
99+
"""Check if a node is a mutable buffer."""
100+
named_buffers = dict(self.module.named_buffers())
101+
if node.op == "placeholder":
102+
return True
103+
104+
if node.op == "get_attr" and str(node.target) in named_buffers:
105+
return True
106+
107+
return False
108+
97109
def _find_mutable_buffers(self) -> set[torch.fx.Node]:
    """Find mutable buffers by identifying copy_ or put_ operations.
    The graph then traces all nodes that lead to a mutable buffer.

    Returns the set of FX nodes that must be excluded from constant
    folding because they feed (directly or transitively) the mutated
    first argument of an in-place ``copy_`` / ``*put_`` op.
    """
    mutable_buffers = set()
    for node in self.module.graph.nodes:
        # Substring match on the op name: "put_" also catches ops such
        # as index_put_, not just a literal "put_" target.
        if (
            node.op == "call_function"
            and hasattr(node.target, "_schema")
            and ("copy_" in str(node.target) or "put_" in str(node.target))
        ):
            # The first argument of copy_ or put_ is the mutable input.
            # If any producer in the chain is a mutable buffer, mark
            # all producers as mutable to prevent constant folding.
            if len(node.args) > 0 and isinstance(node.args[0], torch.fx.Node):
                # collect_producer_nodes returns None when the chain
                # cannot be fully traced; in that case nothing is marked.
                producer_nodes = collect_producer_nodes(node.args[0])
                if producer_nodes is not None and any(
                    self._is_mutable_buffer(p) for p in producer_nodes
                ):
                    mutable_buffers.update(producer_nodes)

    return mutable_buffers
111130

112131
def _support_dynamic_shape(self) -> bool:

0 commit comments

Comments
 (0)