Skip to content

Commit 01d3a2d

Browse files
[metal] Enable bias in metal lowbit-quantized linear kernels (#3745)
Enhances the quantization API in torchao/experimental/quant_api.py by adding support for quantized linear layers with bias and improving error handling for non-contiguous weights. The changes ensure that quantized layers can correctly handle bias terms and provide clearer error messages when input weights do not meet required conditions.
1 parent 90e7ca0 commit 01d3a2d

2 files changed

Lines changed: 28 additions & 13 deletions

File tree

torchao/experimental/ops/mps/test/test_quantizer.py

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -13,11 +13,7 @@
1313

1414
# Need to import to load the ops
1515
import torchao.experimental.ops.mps # noqa: F401
16-
from torchao.experimental.quant_api import (
17-
UIntxWeightOnlyConfig,
18-
_linear_int_weight_mps_check,
19-
_quantize,
20-
)
16+
from torchao.experimental.quant_api import UIntxWeightOnlyConfig, _quantize
2117
from torchao.quantization.quant_api import quantize_
2218

2319

@@ -50,7 +46,7 @@ def _quantize_model(self, model, precision, nbit, group_size):
5046
)
5147
quantized_model = copy.deepcopy(model)
5248
quantized_model = quantized_model.to(device="mps", dtype=precision)
53-
quantize_(quantized_model, config, filter_fn=_linear_int_weight_mps_check)
49+
quantize_(quantized_model, config)
5450
return quantized_model
5551

5652
@parameterized.expand(BITWIDTHS)

torchao/experimental/quant_api.py

Lines changed: 26 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -77,10 +77,15 @@ def __init__(
7777
self,
7878
pack_weight_op,
7979
linear_op,
80+
bias: Optional[torch.Tensor] = None,
8081
):
8182
super().__init__()
8283
self._pack_weights_op = pack_weight_op
8384
self._linear_op = linear_op
85+
if bias is not None:
86+
self.bias = nn.Parameter(bias, requires_grad=False)
87+
else:
88+
self.register_parameter("bias", None)
8489

8590
def quantize_and_pack_weights(self, weights, nbit, group_size):
8691
self.nbit = nbit
@@ -100,24 +105,30 @@ def quantize_and_pack_weights(self, weights, nbit, group_size):
100105
def forward(self, x):
101106
assert x.dim() >= 2
102107
if x.dim() == 2:
103-
return self._linear_op(
108+
output = self._linear_op(
104109
x,
105110
self.packed_weights,
106111
self.group_size,
107112
self.weight_scales,
108113
self.weight_zeros,
109114
)
115+
if self.bias is not None:
116+
output = output + self.bias
117+
return output
110118

111119
lead_shape = x.shape[0:-1]
112120
k = x.shape[-1]
113121
n = self.weight_scales.shape[0]
114-
return self._linear_op(
122+
output = self._linear_op(
115123
x.reshape(-1, k),
116124
self.packed_weights,
117125
self.group_size,
118126
self.weight_scales,
119127
self.weight_zeros,
120128
).reshape(*lead_shape, n)
129+
if self.bias is not None:
130+
output = output + self.bias
131+
return output
121132

122133

123134
# TODO(mcandales): Consolidate with _replace_linear_with_quantized_linear
@@ -132,12 +143,17 @@ def _replace_linear_with_quantized_linear_mps(module: nn.Module, kwargs={}):
132143
if not isinstance(child, nn.Linear):
133144
_replace_linear_with_quantized_linear_mps(child, kwargs)
134145
else:
135-
assert child.bias is None
146+
if not child.weight.is_contiguous():
147+
raise ValueError(
148+
f"UIntxWeightOnlyQuantizedLinear requires contiguous weights for layer '{name}'. "
149+
"Please call .contiguous() on the weight tensor before quantization."
150+
)
136151
qlinear = UIntxWeightOnlyQuantizedLinear(
137152
pack_weight_op=getattr(torch.ops.torchao, f"_pack_weight_{nbit}bit"),
138153
linear_op=getattr(
139154
torch.ops.torchao, f"_linear_fp_act_{nbit}bit_weight"
140155
),
156+
bias=child.bias,
141157
)
142158
setattr(module, name, qlinear)
143159
qlinear.quantize_and_pack_weights(child.weight, nbit, group_size)
@@ -232,20 +248,23 @@ def __post_init__(self):
232248
)
233249

234250

235-
def _linear_int_weight_mps_check(module: nn.Module, fqn: str) -> bool:
236-
return isinstance(module, nn.Linear) and module.bias is None
237-
238-
239251
@register_quantize_module_handler(UIntxWeightOnlyConfig)
240252
def _uintx_weight_only_mps_transform(
241253
module: torch.nn.Module, config: UIntxWeightOnlyConfig
242254
) -> torch.nn.Module:
243255
nbit = config.bitwidth
244256
group_size = config.group_size
245257

258+
if not module.weight.is_contiguous():
259+
raise ValueError(
260+
"UIntxWeightOnlyQuantizedLinear requires contiguous weights. "
261+
"Please call .contiguous() on the weight tensor before quantization."
262+
)
263+
246264
qlinear = UIntxWeightOnlyQuantizedLinear(
247265
pack_weight_op=getattr(torch.ops.torchao, f"_pack_weight_{nbit}bit"),
248266
linear_op=getattr(torch.ops.torchao, f"_linear_fp_act_{nbit}bit_weight"),
267+
bias=module.bias,
249268
)
250269
qlinear.quantize_and_pack_weights(module.weight, nbit, group_size)
251270
return qlinear

0 commit comments

Comments (0)