
Commit 07a5f54

Add UIntxBitPackedTensor, UIntxWeightOnlyConfig, and Int8DynamicActivationUIntxWeightConfig
Add v2 tensor subclass UIntxBitPackedTensor(TorchAOBaseTensor) using gemlite bit-packing and Triton GEMM kernels, replacing the old AQT-based GemliteUIntXWeightOnlyConfig path.

- UIntxBitPackedTensor: tensor subclass with from_hp(), dequantize(), and aten.linear/t/slice dispatch implementations
- UIntxWeightOnlyConfig: weight-only quantization (4-bit/8-bit)
- Int8DynamicActivationUIntxWeightConfig: int8 dynamic activation + uintx weight
- Tests for both configs covering 4-bit, 8-bit, slice, and non-standard shapes

Test Plan:
- python test/prototype/test_uintx_bit_packed_tensor.py
- Tests cover UIntxWeightOnlyConfig: 4-bit (group64/128, pack32/8), 8-bit (perchannel, pack32/8)
- Tests cover Int8DynamicActivationUIntxWeightConfig: same bit_width/group_size/packing combos
- Tests cover slice dim0/dim1 for tensor parallelism
- Tests cover non-standard shapes (1024x1025)
- Verified backward compat: old GemliteUIntXWeightOnlyConfig still works

ghstack-source-id: de8cbe1
Pull Request resolved: #4082
1 parent ab4a336 commit 07a5f54

4 files changed

Lines changed: 699 additions & 0 deletions
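
For context, a minimal usage sketch of the two configs added in this commit, mirroring the pattern used in the new tests below; the import paths come from this diff and the layer shapes are illustrative. Requires CUDA and gemlite, as in the tests.

import torch

from torchao.prototype.quantization.quant_api import (
    Int8DynamicActivationUIntxWeightConfig,
    UIntxWeightOnlyConfig,
)
from torchao.quantization import quantize_

# Weight-only: 4-bit grouped weights, packed into 32-bit words.
m = torch.nn.Linear(512, 256, bias=False).to(device="cuda", dtype=torch.float16)
quantize_(m, UIntxWeightOnlyConfig(group_size=64, bit_width=4, packing_bitwidth=32))

# Int8 dynamic activations + 8-bit per-channel weights (group_size=None).
m2 = torch.nn.Linear(512, 256, bias=False).to(device="cuda", dtype=torch.float16)
quantize_(m2, Int8DynamicActivationUIntxWeightConfig(group_size=None, bit_width=8))

x = torch.randn(2, 512, device="cuda", dtype=torch.float16)
print(m(x).shape, m2(x).shape)  # torch.Size([2, 256]) torch.Size([2, 256])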


test/prototype/test_uintx_bit_packed_tensor.py

Lines changed: 207 additions & 0 deletions
@@ -0,0 +1,207 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

import unittest

import torch
from torch.testing._internal.common_utils import TestCase, run_tests

from torchao.quantization import quantize_

try:
    import gemlite  # noqa: F401

    has_gemlite = True
except ModuleNotFoundError:
    has_gemlite = False


@unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
@unittest.skipIf(not has_gemlite, "gemlite not available")
class TestUIntxBitPackedTensor(TestCase):
    def _test_quantize_and_linear(self, bit_width, group_size, packing_bitwidth):
        """Helper: quantize a linear layer and verify forward pass produces valid output."""
        from torchao.prototype.quantization.quant_api import UIntxWeightOnlyConfig

        in_features = 512
        out_features = 256
        model = torch.nn.Linear(in_features, out_features, bias=False).to(
            device="cuda", dtype=torch.float16
        )

        config = UIntxWeightOnlyConfig(
            group_size=group_size,
            bit_width=bit_width,
            packing_bitwidth=packing_bitwidth,
        )
        quantize_(model, config)

        # Verify weight is now UIntxBitPackedTensor
        from torchao.prototype.quantization.uintx.uintx_bit_packed_tensor import (
            UIntxBitPackedTensor,
        )

        self.assertIsInstance(model.weight, UIntxBitPackedTensor)

        # Verify forward pass works
        x = torch.randn(2, in_features, device="cuda", dtype=torch.float16)
        out = model(x)
        self.assertEqual(out.shape, (2, out_features))
        self.assertFalse(torch.isnan(out).any())
        self.assertFalse(torch.isinf(out).any())

    def test_4bit_group64_pack32(self):
        self._test_quantize_and_linear(bit_width=4, group_size=64, packing_bitwidth=32)

    def test_4bit_group128_pack32(self):
        self._test_quantize_and_linear(bit_width=4, group_size=128, packing_bitwidth=32)

    def test_4bit_group64_pack8(self):
        self._test_quantize_and_linear(bit_width=4, group_size=64, packing_bitwidth=8)

    def test_8bit_perchannel_pack32(self):
        self._test_quantize_and_linear(
            bit_width=8, group_size=None, packing_bitwidth=32
        )

    def test_8bit_perchannel_pack8(self):
        self._test_quantize_and_linear(bit_width=8, group_size=None, packing_bitwidth=8)

    def _test_dynamic_quantize_and_linear(
        self, bit_width, group_size, packing_bitwidth
    ):
        """Helper: quantize with dynamic activation and verify forward pass."""
        from torchao.prototype.quantization.quant_api import (
            Int8DynamicActivationUIntxWeightConfig,
        )

        in_features = 512
        out_features = 256
        model = torch.nn.Linear(in_features, out_features, bias=False).to(
            device="cuda", dtype=torch.float16
        )

        config = Int8DynamicActivationUIntxWeightConfig(
            group_size=group_size,
            bit_width=bit_width,
            packing_bitwidth=packing_bitwidth,
        )
        quantize_(model, config)

        from torchao.prototype.quantization.uintx.uintx_bit_packed_tensor import (
            UIntxBitPackedTensor,
        )

        self.assertIsInstance(model.weight, UIntxBitPackedTensor)

        x = torch.randn(2, in_features, device="cuda", dtype=torch.float16)
        out = model(x)
        self.assertEqual(out.shape, (2, out_features))
        self.assertFalse(torch.isnan(out).any())
        self.assertFalse(torch.isinf(out).any())

    def test_dynamic_4bit_group64_pack32(self):
        self._test_dynamic_quantize_and_linear(
            bit_width=4, group_size=64, packing_bitwidth=32
        )

    def test_dynamic_4bit_group128_pack32(self):
        self._test_dynamic_quantize_and_linear(
            bit_width=4, group_size=128, packing_bitwidth=32
        )

    def test_dynamic_4bit_group64_pack8(self):
        self._test_dynamic_quantize_and_linear(
            bit_width=4, group_size=64, packing_bitwidth=8
        )

    def test_dynamic_8bit_perchannel_pack32(self):
        self._test_dynamic_quantize_and_linear(
            bit_width=8, group_size=None, packing_bitwidth=32
        )

    def test_dynamic_8bit_perchannel_pack8(self):
        self._test_dynamic_quantize_and_linear(
            bit_width=8, group_size=None, packing_bitwidth=8
        )

    def test_slice_dim0(self):
        """Test narrow/slice on dim 0 (out_features) for tensor parallelism."""
        from torchao.prototype.quantization.quant_api import UIntxWeightOnlyConfig

        model = torch.nn.Linear(512, 256, bias=False).to(
            device="cuda", dtype=torch.float16
        )
        quantize_(
            model,
            UIntxWeightOnlyConfig(group_size=64, bit_width=4, packing_bitwidth=32),
        )

        weight = model.weight
        sliced = weight.narrow(0, 0, 64)
        self.assertEqual(sliced.shape[0], 64)

        # Verify internal tensors match direct slicing
        # Data is stored transposed (K x N), so logical dim 0 -> data dim 1
        self.assertEqual(
            sliced.packed_weight,
            weight.packed_weight.narrow(1, 0, 64),
        )
        self.assertEqual(
            sliced.scale,
            weight.scale.narrow(1, 0, 64),
        )

    def test_slice_dim1(self):
        """Test narrow/slice on dim 1 (in_features) for tensor parallelism."""
        from torchao.prototype.quantization.quant_api import UIntxWeightOnlyConfig

        model = torch.nn.Linear(512, 256, bias=False).to(
            device="cuda", dtype=torch.float16
        )
        quantize_(
            model,
            UIntxWeightOnlyConfig(group_size=64, bit_width=4, packing_bitwidth=32),
        )

        weight = model.weight
        sliced = weight.narrow(1, 0, 128)
        self.assertEqual(sliced.shape[1], 128)

        # Verify internal tensors match direct slicing
        # Data is stored transposed (K x N), so logical dim 1 -> data dim 0
        # packed_weight dim 0 is packed by elements_per_sample
        eps = weight.gemlite_kwargs["elements_per_sample"]
        self.assertEqual(
            sliced.packed_weight,
            weight.packed_weight.narrow(0, 0, 128 // eps),
        )
        # scale dim 0 corresponds to groups along in_features
        scale_ratio = 128 // 64  # in_features_slice / group_size
        self.assertEqual(
            sliced.scale,
            weight.scale.narrow(0, 0, scale_ratio),
        )

    def test_non_standard_shapes(self):
        """Test shapes not divisible by 128 but divisible by 32 (gemlite requirement)."""
        from torchao.prototype.quantization.quant_api import UIntxWeightOnlyConfig

        # gemlite requires in_features divisible by 32 or group_size
        model = torch.nn.Linear(1024, 1025, bias=False).to(
            device="cuda", dtype=torch.float16
        )
        config = UIntxWeightOnlyConfig(
            group_size=None, bit_width=4, packing_bitwidth=32
        )
        quantize_(model, config)

        x = torch.randn(1, 1024, device="cuda", dtype=torch.float16)
        out = model(x)
        self.assertEqual(out.shape, (1, 1025))


if __name__ == "__main__":
    run_tests()
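
A side note on the slice tests above: a small sketch of the dim-1 bookkeeping, assuming elements_per_sample works out to packing_bitwidth // bit_width (which is what the 4-bit / pack-32 case used in these tests implies); the numbers are illustrative, not an extra assertion from this diff.

# Hypothetical arithmetic behind test_slice_dim1 (illustrative values only).
bit_width, packing_bitwidth, group_size = 4, 32, 64
in_features_slice = 128

# Assumption: one packed word holds packing_bitwidth // bit_width quantized values.
elements_per_sample = packing_bitwidth // bit_width  # 8

# Weight data is stored transposed (K x N), so an in_features slice maps to dim 0.
packed_rows = in_features_slice // elements_per_sample  # 128 // 8 = 16 packed_weight rows
scale_groups = in_features_slice // group_size  # 128 // 64 = 2 scale rows

print(packed_rows, scale_groups)  # 16 2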

torchao/prototype/quantization/quant_api.py

Lines changed: 105 additions & 0 deletions
@@ -112,6 +112,111 @@ def _gemlite_uintx_weight_only_transform(
    return module


@dataclass
class UIntxWeightOnlyConfig(AOBaseConfig):
    """Weight-only uintx quantization using bit-packed format with gemlite Triton kernels.

    Supports 4-bit (asymmetric, grouped) and 8-bit (symmetric, per-channel) quantization.
    Uses gemlite library for efficient Triton-based GEMM.

    Args:
        group_size: quantization group size. Use None for per-channel (required for 8-bit).
            Valid values: 32, 64, 128, 256, 512, 1024, None. Default: 128.
        bit_width: quantization bit width, 4 or 8. Default: 4.
        packing_bitwidth: bit width for packing, 8/16/32/None (auto). Default: None.
        set_inductor_config: if True, set recommended torchinductor config. Default: True.
    """

    group_size: Optional[int] = 128
    bit_width: int = 4
    packing_bitwidth: Optional[int] = None
    set_inductor_config: bool = True

    def __post_init__(self):
        torch._C._log_api_usage_once("torchao.quantization.UIntxWeightOnlyConfig")
        assert self.bit_width in [4, 8], (
            f"bit_width must be 4 or 8, got {self.bit_width}"
        )


@register_quantize_module_handler(UIntxWeightOnlyConfig)
def _uintx_weight_only_transform(
    module: torch.nn.Module,
    config: UIntxWeightOnlyConfig,
) -> torch.nn.Module:
    from torchao.prototype.quantization.uintx.uintx_bit_packed_tensor import (
        UIntxBitPackedTensor,
    )

    if config.set_inductor_config:
        torchao.quantization.utils.recommended_inductor_config_setter()

    weight = module.weight
    quantized_weight = UIntxBitPackedTensor.from_hp(
        weight,
        bit_width=config.bit_width,
        group_size=config.group_size,
        packing_bitwidth=config.packing_bitwidth,
    )
    module.weight = torch.nn.Parameter(quantized_weight, requires_grad=False)
    module.extra_repr = types.MethodType(_linear_extra_repr, module)
    return module


@dataclass
class Int8DynamicActivationUIntxWeightConfig(AOBaseConfig):
    """Dynamic activation + uintx weight quantization using gemlite Triton kernels.

    Activations are quantized dynamically at runtime (int8). Weights use bit-packed
    uintx format. Supports 4-bit and 8-bit weight quantization.

    Args:
        group_size: quantization group size. Use None for per-channel (required for 8-bit).
            Valid values: 32, 64, 128, 256, 512, 1024, None. Default: 128.
        bit_width: weight quantization bit width, 4 or 8. Default: 4.
        packing_bitwidth: bit width for packing, 8/16/32/None (auto). Default: None.
        set_inductor_config: if True, set recommended torchinductor config. Default: True.
    """

    group_size: Optional[int] = 128
    bit_width: int = 4
    packing_bitwidth: Optional[int] = None
    set_inductor_config: bool = True

    def __post_init__(self):
        torch._C._log_api_usage_once(
            "torchao.quantization.Int8DynamicActivationUIntxWeightConfig"
        )
        assert self.bit_width in [4, 8], (
            f"bit_width must be 4 or 8, got {self.bit_width}"
        )


@register_quantize_module_handler(Int8DynamicActivationUIntxWeightConfig)
def _int8_dynamic_activation_uintx_weight_transform(
    module: torch.nn.Module,
    config: Int8DynamicActivationUIntxWeightConfig,
) -> torch.nn.Module:
    from torchao.prototype.quantization.uintx.uintx_bit_packed_tensor import (
        UIntxBitPackedTensor,
    )

    if config.set_inductor_config:
        torchao.quantization.utils.recommended_inductor_config_setter()

    weight = module.weight
    quantized_weight = UIntxBitPackedTensor.from_hp(
        weight,
        bit_width=config.bit_width,
        group_size=config.group_size,
        packing_bitwidth=config.packing_bitwidth,
        mode="dynamic",
    )
    module.weight = torch.nn.Parameter(quantized_weight, requires_grad=False)
    module.extra_repr = types.MethodType(_linear_extra_repr, module)
    return module


@dataclass
class Float8StaticActivationFloat8WeightConfig(AOBaseConfig):
    """
Lines changed: 4 additions & 0 deletions
@@ -0,0 +1,4 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
