Commit d9d14aa
fix: bugs and code quality improvements across prototype modules
- Replace print() with logger.info() in fusion_utils.py (unconsoleable output during torch.compile)
- Add kernel availability guard to Int4OpaqueTensor.from_hp_da8w4() with a clear error message
- Add kernel availability guard to QuantizedLinear._forward_2d() to prevent AttributeError
- Replace mutable default kwargs={} in _replace_embedding_with_quantized_embedding()
- Fix fragile stdout capture in test_rope_fusion_detection.py to use logger capture
- Add public API exports to embedding/__init__.py (EmbeddingQuantizer, QuantizedLinear, etc.)
- Remove unused _is_blackwell() from attention/utils.py
- Remove misconfigured @triton.autotune decorators (empty configs, constexpr key)
1 parent: a5da06e

8 files changed: 47 additions & 39 deletions

test/prototype/attention/test_rope_fusion_detection.py

Lines changed: 17 additions & 5 deletions
@@ -11,6 +11,7 @@
 """
 
 import contextlib
+import logging
 import io
 import unittest
 from functools import partial
@@ -118,18 +119,29 @@ def tearDown(self):
         torch._dynamo.reset()
 
     def _run_fusion_pass(self, model, *args):
-        """Compile model with fusion pass, return captured stdout."""
+        """Compile model with fusion pass, return captured logger output."""
         inductor_config.pre_grad_custom_pass = partial(
             rope_sdpa_fusion_pass,
             rope_sdpa_op=_ops.rope_sdpa_op,
             fp8_sdpa_op=_ops.fp8_sdpa_op,
             backend_name="TEST",
         )
         compiled = torch.compile(model)
-        buf = io.StringIO()
-        with torch.no_grad(), contextlib.redirect_stdout(buf):
-            compiled(*args)
-        return buf.getvalue()
+        fusion_logger = logging.getLogger(
+            "torchao.prototype.attention.shared_utils.fusion_utils"
+        )
+        old_level = fusion_logger.level
+        fusion_logger.setLevel(logging.DEBUG)
+        handler = logging.StreamHandler(io.StringIO())
+        handler.setLevel(logging.DEBUG)
+        fusion_logger.addHandler(handler)
+        try:
+            with torch.no_grad():
+                compiled(*args)
+            return handler.stream.getvalue()
+        finally:
+            fusion_logger.removeHandler(handler)
+            fusion_logger.setLevel(old_level)
 
     def _assert_fused(self, model, *extra_args):
         """Create BSHD inputs, run fusion pass, assert 1 node was fused."""

torchao/prototype/attention/quantization/triton_hadamard_qkv_quantization.py

Lines changed: 0 additions & 8 deletions
@@ -38,14 +38,6 @@
 )
 
 
-@triton.autotune(
-    configs=[
-        triton.Config({}, num_warps=2),
-        triton.Config({}, num_warps=4),
-        triton.Config({}, num_warps=8),
-    ],
-    key=["D"],
-)
 @triton.jit
 def hadamard_single_phase1_kernel(
     # Input tensor [B, H, S, D]
torchao/prototype/attention/quantization/triton_hadamard_rope_qkv_quantization.py

Lines changed: 0 additions & 16 deletions
@@ -33,14 +33,6 @@
 )
 
 
-@triton.autotune(
-    configs=[
-        triton.Config({}, num_warps=2),
-        triton.Config({}, num_warps=4),
-        triton.Config({}, num_warps=8),
-    ],
-    key=["D"],
-)
 @triton.jit
 def hadamard_rope_single_phase1_kernel(
     # Input tensor [B, S, H, D]
@@ -160,14 +152,6 @@ def hadamard_rope_single_phase1_kernel(
     tl.store(partial_max_ptr + chunk_idx, x_max_scalar)
 
 
-@triton.autotune(
-    configs=[
-        triton.Config({}, num_warps=2),
-        triton.Config({}, num_warps=4),
-        triton.Config({}, num_warps=8),
-    ],
-    key=["D"],
-)
 @triton.jit
 def hadamard_v_phase1_kernel(
     # Input tensor [B, S, H, D]
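
For reference, the decorators removed from both Hadamard kernel files paired empty triton.Config({}) entries with key=["D"], where D is a tl.constexpr: constexpr arguments already force a separate kernel specialization, so keying the autotune cache on one adds nothing, and configs that differ only in num_warps leave little to tune. A hedged sketch of a conventionally configured @triton.autotune on a toy kernel (scale_kernel is illustrative, not one of the kernels above):

    import torch
    import triton
    import triton.language as tl

    @triton.autotune(
        configs=[
            # Each config varies a real tunable (the block size) alongside
            # num_warps, so the autotuner has meaningfully distinct candidates.
            triton.Config({"BLOCK_SIZE": 256}, num_warps=2),
            triton.Config({"BLOCK_SIZE": 512}, num_warps=4),
            triton.Config({"BLOCK_SIZE": 1024}, num_warps=8),
        ],
        # Key on a runtime argument whose value changes the best config.
        key=["n_elements"],
    )
    @triton.jit
    def scale_kernel(x_ptr, out_ptr, scale, n_elements, BLOCK_SIZE: tl.constexpr):
        pid = tl.program_id(axis=0)
        offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
        mask = offsets < n_elements
        x = tl.load(x_ptr + offsets, mask=mask)
        tl.store(out_ptr + offsets, x * scale, mask=mask)

    def scale(x: torch.Tensor, s: float) -> torch.Tensor:
        out = torch.empty_like(x)
        n = x.numel()
        # BLOCK_SIZE is chosen by the autotuner, so it is not passed here;
        # the grid reads the selected value from the config metadata.
        grid = lambda meta: (triton.cdiv(n, meta["BLOCK_SIZE"]),)
        scale_kernel[grid](x, out, s, n)
        return out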

torchao/prototype/attention/shared_utils/fusion_utils.py

Lines changed: 2 additions & 2 deletions
@@ -964,7 +964,7 @@ def rope_sdpa_fusion_pass(
     fp8_sdpa_nodes = [n for n in graph.nodes if _is_fp8_sdpa_node(n, fp8_sdpa_op)]
 
     if not fp8_sdpa_nodes:
-        print(
+        logger.info(
             f"[low_precision_attention] RoPE fusion pass ({backend_name}): "
             f"found 0 FP8 SDPA nodes in graph"
         )
@@ -1102,7 +1102,7 @@ def rope_sdpa_fusion_pass(
             fused_count += 1
             continue
 
-    print(
+    logger.info(
         f"[low_precision_attention] RoPE fusion pass ({backend_name}): "
         f"found {len(fp8_sdpa_nodes)} FP8 SDPA node(s), "
         f"{fused_count} fused with RoPE"

torchao/prototype/attention/utils.py

Lines changed: 0 additions & 7 deletions
@@ -16,13 +16,6 @@ def _is_hopper() -> bool:
     return major == 9
 
 
-def _is_blackwell() -> bool:
-    if not torch.cuda.is_available():
-        return False
-    major, _ = torch.cuda.get_device_capability()
-    return major == 10
-
-
 def _is_fa3_available() -> bool:
     try:
         importlib.import_module("flash_attn_interface")

torchao/prototype/quantization/embedding/__init__.py

Lines changed: 17 additions & 0 deletions

@@ -0,0 +1,17 @@
+from .api import (
+    EmbeddingQuantizer,
+    QuantizedEmbedding,
+    QuantizedEmbeddingFallback,
+    QuantizedLinear,
+    QuantizedTiedEmbedding,
+    TiedEmbeddingQuantizer,
+)
+
+__all__ = [
+    "EmbeddingQuantizer",
+    "QuantizedEmbedding",
+    "QuantizedEmbeddingFallback",
+    "QuantizedLinear",
+    "QuantizedTiedEmbedding",
+    "TiedEmbeddingQuantizer",
+]
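
With these re-exports in place, callers can import the quantization entry points from the package itself instead of reaching into the api submodule:

    from torchao.prototype.quantization.embedding import (
        EmbeddingQuantizer,
        QuantizedLinear,
    )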

torchao/prototype/quantization/embedding/api.py

Lines changed: 7 additions & 1 deletion
@@ -142,9 +142,11 @@ def forward(self, x):
 
 def _replace_embedding_with_quantized_embedding(
     module: nn.Module,
-    kwargs={},
+    kwargs=None,
     fqn: str = "",
 ):
+    if kwargs is None:
+        kwargs = {}
     group_size = kwargs.get("group_size", None)
     bit_width = kwargs.get("bit_width", None)
     use_fallback = kwargs.get("use_fallback", None)
@@ -254,6 +256,10 @@ def _forward_2d(self, x):
         assert x.dim() == 2
         m, k = x.shape
         assert k == self.k
+        assert _is_kernel_library_loaded(), (
+            "QuantizedLinear requires the torchao kernel library to be loaded. "
+            "Please build torchao with C++ extensions enabled (USE_CPP=1)."
+        )
         return getattr(
             torch.ops.torchao, f"_linear_8bit_act_{self.bit_width}bit_weight"
         )(x, self.packed_weight, self.group_size, self.n, self.k)
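
The kwargs={} change in the first hunk addresses Python's mutable-default pitfall: a default dict is created once, when the def statement runs, and is then shared across every call that omits the argument. In the lines shown, kwargs is only read via .get, so the bug was latent, but the None-sentinel form removes the hazard. A standalone illustration of the failure mode (not torchao code):

    def record_bad(item, acc=[]):
        # `acc` is one shared list, created when the def statement ran
        acc.append(item)
        return acc

    def record_good(item, acc=None):
        # None sentinel: each call that omits `acc` gets its own fresh list
        if acc is None:
            acc = []
        acc.append(item)
        return acc

    record_bad(1)
    print(record_bad(2))   # [1, 2] -- state leaked from the first call
    record_good(1)
    print(record_good(2))  # [2]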

torchao/prototype/quantization/int4/int4_opaque_tensor.py

Lines changed: 4 additions & 0 deletions
@@ -233,6 +233,10 @@ def from_hp_da8w4(
             act_mapping_type: MappingType.ASYMMETRIC (uint8 activation, default) or
                 MappingType.SYMMETRIC (int8 activation, requires PyTorch >= 2.8)
         """
+        assert "CPU" in torch._C._dispatch_dump("torchao::da8w4_linear_prepack_cpu"), (
+            "DA8W4 on CPU requires the da8w4_linear_cpu kernel to be built and available. "
+            "Please build torchao with C++ extensions enabled (USE_CPP=1)."
+        )
         assert w.ndim == 2 and w.device.type == "cpu", (
             f"Expecting 2D tensor on CPU, but got: {w.shape} on {w.device.type}"
         )
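
Both availability guards probe the dispatcher before the first call into a C++ op, turning a confusing AttributeError into an actionable build message. A hypothetical helper distilling the da8w4 guard above (_cpu_kernel_registered is not part of this commit, and _dispatch_dump's behavior for unregistered operators may vary by PyTorch version):

    import torch

    def _cpu_kernel_registered(qualname: str) -> bool:
        # torch._C._dispatch_dump returns a textual dump of an operator's
        # dispatch table; "CPU" appears in it when a CPU kernel is registered.
        try:
            return "CPU" in torch._C._dispatch_dump(qualname)
        except RuntimeError:
            # The operator itself was never registered, e.g. a build
            # without C++ extensions (USE_CPP=0).
            return False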
