
Commit 9472d7d

[X86] Enable FP8 patterns for AOT Inductor (#4099)
* enable fp8 qlinear patterns for AOT Inductor
* update ut
* update comments
* fix import format
* fix format
* remove metadata_assertion
* fix conflicts
* add fp8 support for concat_dq_q_pattern
* update
* update ut
* update comments
* update
* skip UT when torch < 2.13.0
1 parent bbe615c commit 9472d7d

3 files changed: 259 additions & 68 deletions

File tree

test/quantization/pt2e/test_x86inductor_fusion.py (156 additions & 17 deletions)
@@ -8,11 +8,15 @@
 import contextlib
 import copy
 import functools
+import glob
 import itertools
+import os
+import tempfile
 import unittest
 from typing import NamedTuple

 import torch
+import torch._export.utils as eu
 from torch._dynamo import config as dynamo_config
 from torch._dynamo.testing import make_test_cls_with_patches
 from torch._dynamo.utils import counters
@@ -101,6 +105,37 @@
 )


+def _get_fp8_aoti_options():
+    """
+    Detect whether AOTI can exercise the FP8 fusion path.
+
+    AOTI constant-folds lifted tensors by default, which folds away the
+    dequant scale and breaks pattern matching. We need to call
+    ``add_dont_constant_fold`` on the FP8 dequant op so that the scale
+    survives into the inductor graph and the fusion pattern can still be
+    matched. If the API is not yet available we fall back to compile-only
+    testing.
+
+    Returns ``[False]`` when AOTI is not exercisable, ``[False, True]``
+    when it is. Float8 AOTI execution is additionally gated on torch >= 2.13.
+    """
+    aoti_options = [False]
+    if not torch_version_at_least("2.13.0"):
+        return aoti_options
+
+    try:
+        import torch._inductor.constant_folding as cf
+
+        if hasattr(cf, "add_dont_constant_fold"):
+            cf.add_dont_constant_fold(
+                torch.ops.torchao.dequantize_affine_float8_non_decomposed.default
+            )
+            aoti_options = [False, True]
+    except ImportError:
+        pass
+    return aoti_options
+
+
 def cal_conv_generated_kernel_number(mod, input, dtype, dim=4, device="cpu"):
     # this function is to decide how many kernels are generated
     # while testing conv2d/3d/deconv2d
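To make the docstring's failure mode concrete, here is a hedged sketch (not part of the commit) of the kind of graph AOTI would otherwise fold: after torch.export, both the FP8 weight and its scale are lifted constants, so the dequant output is itself compile-time constant, and without the `add_dont_constant_fold` registration the dq node the x86 patterns match would be folded away.

import torch


class DequantOnly(torch.nn.Module):
    # Illustrative module only; requires torchao's non-decomposed FP8 ops.
    def __init__(self):
        super().__init__()
        # Plain tensor attributes are lifted as constants by torch.export.
        self.scale = torch.tensor([0.5])
        self.weight = torch.randn(4, 4).to(torch.float8_e4m3fn)

    def forward(self, x):
        # dq(constant weight, constant scale) is fully constant-foldable,
        # which erases the node the fusion pattern needs to see.
        w = torch.ops.torchao.dequantize_affine_float8_non_decomposed.default(
            tensor=self.weight,
            scale=self.scale,
            output_dtype=torch.float32,
        )
        return x @ w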
@@ -168,6 +203,7 @@ def _test_common(
         quantizer=None,
         compile_options={},  # noqa: B006
         is_fp8=False,
+        use_aoti=False,
         include_ops=None,
         exclude_ops=None,
         check_dynamic=None,
@@ -185,6 +221,33 @@ def _test_common(
         mod = mod.to(device=device)
         counters.clear()
         torch._dynamo.reset()
+
+        if use_aoti:
+
+            def aoti_compile(model, inputs, get_source_code=False):
+                with eu._disable_aten_to_metadata_assertions():
+                    exported = torch.export.export(model, inputs)
+                with tempfile.TemporaryDirectory() as tmpdir:
+                    package_path = os.path.join(tmpdir, "model.pt2")
+                    with config.patch({"aot_inductor.output_path": tmpdir}):
+                        torch._inductor.aoti_compile_and_package(
+                            exported,
+                            package_path=package_path,
+                        )
+
+                    if get_source_code:
+                        cpp_paths = glob.glob(
+                            os.path.join(tmpdir, "**/*.wrapper.cpp"), recursive=True
+                        )
+                        assert cpp_paths, "Failed to find generated .wrapper.cpp"
+                        with open(cpp_paths[0]) as f:
+                            source_code = f.read()
+
+                    compiled_mod = torch._inductor.aoti_load_package(package_path)
+                    if get_source_code:
+                        return compiled_mod, source_code
+                    return compiled_mod
+
         if check_autocast == torch.bfloat16 and (
             torch.ops.mkldnn._is_mkldnn_bf16_supported() or device == "xpu"
         ):
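For orientation, a stripped-down sketch of the export → package → load round trip that `aoti_compile` performs, minus the metadata-assertion workaround and wrapper-source capture; `TinyModel` is a placeholder, not from the commit.

import os
import tempfile

import torch


class TinyModel(torch.nn.Module):
    def forward(self, x):
        return torch.relu(x)


model, inputs = TinyModel().eval(), (torch.randn(2, 4),)
exported = torch.export.export(model, inputs)

with tempfile.TemporaryDirectory() as tmpdir:
    # Compile the exported program into a self-contained .pt2 package,
    # then load it back and call it like a regular module.
    package_path = os.path.join(tmpdir, "model.pt2")
    torch._inductor.aoti_compile_and_package(exported, package_path=package_path)
    compiled = torch._inductor.aoti_load_package(package_path)
    out = compiled(*inputs)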
@@ -213,16 +276,29 @@ def _test_common(
             mod, inputs, is_qat, is_dynamic, quantizer, is_fp8
         )

+        # Dynamic aoti check is not supported for now.
+        # Support can be added in the future if needed (e.g. via dynamic_shapes in
+        # torch.export.export for the dynamic case).
+        assert not (
+            use_aoti and (check_dynamic is not None or compile_options.get("dynamic"))
+        ), "Dynamic AOTI check is not supported for now"
+
         with torch.no_grad(), maybe_autocast:
             if check_code:
                 expected = mod(*inputs)
-                code_compile_options = dict(compile_options)
-                if check_dynamic is not None:
-                    code_compile_options["dynamic"] = check_dynamic
-                actual, (source_code,) = run_and_get_code(
-                    torch.compile(mod, **code_compile_options),
-                    *inputs,
-                )
+                if use_aoti:
+                    compiled_mod, source_code = aoti_compile(
+                        mod, inputs, get_source_code=True
+                    )
+                    actual = compiled_mod(*inputs)
+                else:
+                    code_compile_options = dict(compile_options)
+                    if check_dynamic is not None:
+                        code_compile_options["dynamic"] = check_dynamic
+                    actual, (source_code,) = run_and_get_code(
+                        torch.compile(mod, **code_compile_options),
+                        *inputs,
+                    )
                 for op in include_ops:
                     self.assertIn(op, source_code)
                 if num_include_ops is not None:
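The assertion above rules dynamic shapes out for now; the adjacent comment points at `dynamic_shapes` in `torch.export.export` as the likely future route. A hedged sketch of that API, not wired into these tests:

import torch
from torch.export import Dim


class TinyLinear(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.linear = torch.nn.Linear(4, 4)

    def forward(self, x):
        return self.linear(x)


# Mark dim 0 of the input as symbolic so the exported program (and an AOTI
# package built from it) would accept a range of batch sizes.
batch = Dim("batch", min=1, max=64)
exported = torch.export.export(
    TinyLinear(), (torch.randn(2, 4),), dynamic_shapes={"x": {0: batch}}
)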
@@ -239,10 +315,17 @@ def _test_common(
                 # Skip due to reduce range setting for Quantization on preCI system.
                 torch.testing.assert_close(actual, expected, atol=atol, rtol=rtol)
             elif check_quantization:
-                _ = torch.compile(mod, **compile_options)(*inputs)
+                if use_aoti:
+                    _ = aoti_compile(mod, inputs)(*inputs)
+                else:
+                    _ = torch.compile(mod, **compile_options)(*inputs)
             else:
                 expected = mod(*inputs)
-                actual = torch.compile(mod, **compile_options)(*inputs)
+                if use_aoti:
+                    compiled_mod = aoti_compile(mod, inputs)
+                    actual = compiled_mod(*inputs)
+                else:
+                    actual = torch.compile(mod, **compile_options)(*inputs)
                 torch.testing.assert_close(
                     actual.float(), expected.float(), atol=atol, rtol=rtol
                 )
@@ -1488,6 +1571,7 @@ def _qlinear_test_helper(
         is_dynamic=False,
         is_qat=False,
         is_fp8=False,
+        use_aoti=False,
     ):
         class M(torch.nn.Module):
             def __init__(self, use_bias, do_permute=False):
@@ -1527,6 +1611,7 @@ def _default_matcher_check_fn():
             is_qat=is_qat,
             is_dynamic=is_dynamic,
             is_fp8=is_fp8,
+            use_aoti=use_aoti,
             include_ops=[] if is_fp8 else None,
             # ensure quantize_affine_float8_non_decomposed is lowered
             exclude_ops=[
@@ -1552,8 +1637,10 @@ def test_fp8_qlinear_cpu(self):
         r"""
         This testcase will quantize a single Linear Module.
         """
-        for bias in [True, False]:
-            self._qlinear_test_helper((torch.randn((2, 4)),), bias=bias, is_fp8=True)
+        for use_aoti, bias in itertools.product(_get_fp8_aoti_options(), [True, False]):
+            self._qlinear_test_helper(
+                (torch.randn((2, 4)),), bias=bias, is_fp8=True, use_aoti=use_aoti
+            )

     @skipIfNoDynamoSupport
     @skipIfNoONEDNN
@@ -3076,7 +3163,9 @@ def test_fp8_q_attention_block(self):
             annotate_matmul=annotate_matmul, is_fp8=True
         )

-    def _test_scaled_embedding_bag_helper(self, dtype, with_output_quant=False):
+    def _test_scaled_embedding_bag_helper(
+        self, dtype, with_output_quant=False, use_aoti=False
+    ):
         class FP8QDQEmbeddingBag(torch.nn.Module):
             def __init__(self):
                 super().__init__()
@@ -3086,13 +3175,13 @@ def __init__(self):
             def _dequantize(self, weight):
                 if dtype == torch.float8_e4m3fn:
                     res = torch.ops.torchao.dequantize_affine_float8_non_decomposed.default(
-                        tensor=weight.data,
+                        tensor=weight,
                         scale=torch.tensor([self.weight_scale]),
                         output_dtype=torch.float,
                     )
                 else:
                     res = torch.ops.quantized_decomposed.dequantize_per_tensor.default(
-                        weight.data,
+                        weight,
                         self.weight_scale,
                         0,
                         -128,
@@ -3171,6 +3260,7 @@ def matcher_check_fn():
             mod,
             (weight, indices, offsets),
             matcher_check_fn,
+            use_aoti=use_aoti,
         )

     @skipIfNoDynamoSupport
@@ -3181,7 +3271,10 @@ def matcher_check_fn():
         reason="cpp kernels not built",
     )
     def test_fp8_scaled_embedding_bag(self):
-        self._test_scaled_embedding_bag_helper(torch.float8_e4m3fn)
+        for use_aoti in _get_fp8_aoti_options():
+            self._test_scaled_embedding_bag_helper(
+                torch.float8_e4m3fn, use_aoti=use_aoti
+            )

     @skipIfNoDynamoSupport
     @skipIfNoONEDNN
@@ -3252,15 +3345,61 @@ def matcher_check_fn():
         shape = (128, 3)

         mod = Mod()
-        for len in input_len_list:
-            inputs = [torch.randn(shape) for _ in range(len)]
+        for length in input_len_list:
+            inputs = [torch.randn(shape) for _ in range(length)]
             int8_inputs = [quant_input(x) for x in inputs]
             self._test_common(
                 mod,
                 (int8_inputs,),
                 matcher_check_fn,
             )

+    @skipIfNoDynamoSupport
+    @skipIfNoONEDNN
+    @skipIfNoFloat8Support
+    @unittest.skipIf(
+        "CPU" not in torch._C._dispatch_dump("torchao::_scaled_embedding_bag"),
+        reason="cpp kernels not built",
+    )
+    def test_fp8_concat_dequant_quant(self):
+        class Mod(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.scale = 0.5
+
+            def forward(self, fp8_inputs):
+                res = torch.cat(fp8_inputs, dim=1)
+                res = torch.ops.torchao.dequantize_affine_float8_non_decomposed.default(
+                    tensor=res,
+                    scale=torch.tensor([self.scale]),
+                    output_dtype=torch.float32,
+                )
+                res = torch.ops.torchao.quantize_affine_float8_non_decomposed.default(
+                    tensor=res,
+                    scale=torch.tensor([self.scale]),
+                    float8_dtype=torch.float8_e4m3fn,
+                )
+                return res
+
+        def quant_input(x):
+            scale = x.abs().max() / 448.0
+            return (x / scale).to(torch.float8_e4m3fn)
+
+        def matcher_check_fn():
+            self.assertEqual(counters["inductor"]["concat_dq_q_matcher_count"], 1)
+
+        shape = (128, 3)
+        mod = Mod()
+        for use_aoti, length in itertools.product(_get_fp8_aoti_options(), [2, 3]):
+            inputs = [torch.randn(shape) for _ in range(length)]
+            fp8_inputs = [quant_input(x) for x in inputs]
+            self._test_common(
+                mod,
+                (fp8_inputs,),
+                matcher_check_fn,
+                use_aoti=use_aoti,
+            )
+

 @unittest.skipIf(not torch_version_at_least("2.8.0"), "Requires torch 2.8+")
 @unittest.skipIf(torch.version.hip is not None, "Not applicable to ROCm")
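One note on the `quant_input` helper in the hunk above: 448.0 is the largest finite value of `torch.float8_e4m3fn`, so dividing by `x.abs().max() / 448.0` stretches each input onto the full representable range. A quick sketch (not part of the commit) verifying the constant and the quantize/dequantize round trip:

import torch

# 448.0 is the e4m3 saturation point: torch.finfo(torch.float8_e4m3fn).max.
assert torch.finfo(torch.float8_e4m3fn).max == 448.0

x = torch.randn(128, 3)
scale = x.abs().max() / 448.0                # same per-tensor scale as quant_input
x_fp8 = (x / scale).to(torch.float8_e4m3fn)  # quantize
x_rt = x_fp8.to(torch.float32) * scale       # dequantize
print((x - x_rt).abs().max())                # small, dtype-limited rounding error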
