Commit e841b07

nvfp4 gptq for bmm
Summary:

Extend GPTQ coverage to bmm, formulating bmm as the 3D case of mm. This involves:

1. refactoring the 2D code so it extends easily to 3D
2. fixing the existing bmm logic, which was numerically incorrect (it used a single Hessian); it now uses E K-by-K Hessians for an `E, N, K` input shape and routes to the 2D Hessian logic once per expert. This is slow, but we can optimize it later.

We test numerical correctness by bitwise-matching E separate 2D Hessian calculations against the 3D one.

Test Plan:

```
torchao/prototype/gptq/gptq_nvfp4_llama3_2_1b_nonsequential_wikitext.sh
// gptq accuracy unchanged
```

ghstack-source-id: 85a5964
ghstack-comment-id: 4313324694
Pull-Request: #4327
1 parent 052725e commit e841b07
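
As a rough illustration of the approach described in the summary (a sketch only, not the torchao implementation; the helper name, shapes, and values below are made up for the example), each expert of an `E, N, K` activation gets its own `K x K` Hessian, and every observed batch is routed through the same 2D running-average update once per expert:

```python
import torch

def update_hessians(hessians: torch.Tensor, counts: torch.Tensor, x: torch.Tensor) -> None:
    """hessians: (E, K, K), counts: (E,), x: (E, N, K); all updated in-place."""
    for e in range(x.shape[0]):  # route each expert through the same 2D update
        x_e = x[e].reshape(-1, x.shape[-1]).float()  # (N, K) activations for expert e
        n = 1                                        # this call observes one batch
        tb = int(counts[e])
        if tb > 0:
            hessians[e] *= tb / (tb + n)             # rescale the running average
        counts[e] += n
        tb = int(counts[e])
        x_t = ((2 / tb) ** 0.5) * x_e.t()            # (K, N)
        hessians[e] += x_t @ x_t.t()                 # add (2 / tb) * x_e^T x_e

E, N, K = 4, 8, 12
hessians = torch.zeros(E, K, K)
counts = torch.zeros(E, dtype=torch.int64)
update_hessians(hessians, counts, torch.randn(E, N, K))
```

Looping over experts mirrors the per-expert routing this commit adds; batching the per-expert updates is left for a follow-up.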

3 files changed

Lines changed: 117 additions & 75 deletions

test/prototype/gptq/test_gptqv2.py

Lines changed: 38 additions & 46 deletions

```diff
@@ -108,31 +108,7 @@ def test_observer_tensor_creation(self):
         )
 
         # Check total_batches is initialized as 0
-        assert observer.total_batches == 0
-
-    @pytest.mark.skipif(not torch.cuda.is_available(), reason="Need CUDA available")
-    def test_observer_tensor_attributes(self):
-        """Test GPTQObserverTensor attributes are correctly set."""
-        weight = torch.randn(16, 32, dtype=torch.bfloat16, device="cuda")
-        observer = GPTQObserverTensor.from_hp(weight)
-
-        # Test hp_data attribute
-        assert hasattr(observer, "hp_data")
-        assert isinstance(observer.hp_data, torch.Tensor)
-
-        # Test hessian attribute
-        assert hasattr(observer, "hessian")
-        assert torch.equal(
-            observer.hessian, torch.zeros(32, 32, dtype=torch.float32, device="cuda")
-        )
-
-        # Test total_batches attribute
-        assert hasattr(observer, "total_batches")
-        assert observer.total_batches == 0
-
-        # Test update method exists
-        assert hasattr(observer, "update")
-        assert callable(observer.update)
+        assert (observer.total_batches == 0).all()
 
     @pytest.mark.skipif(not torch.cuda.is_available(), reason="Need CUDA available")
     def test_linear_operation_with_observer(self):
@@ -161,7 +137,7 @@ def test_linear_operation_with_observer(self):
         # Check that Hessian was initialized and updated
         assert observer_weight.hessian is not None
         assert observer_weight.hessian.shape == (in_features, in_features)
-        assert observer_weight.total_batches == 1
+        assert (observer_weight.total_batches == 1).all()
 
         # Verify output is correct
         expected_output = F.linear(input_tensor, weight)
@@ -195,36 +171,47 @@ def test_multiple_observations(self):
         assert observer_weight.hessian.shape == (in_features, in_features)
 
         # Check total_batches matches total samples
-        assert observer_weight.total_batches == total_samples
+        assert (observer_weight.total_batches == total_samples).all()
 
-    @pytest.mark.skip(reason="bmm math is incorrect, will fix in next PR")
     @pytest.mark.skipif(not torch.cuda.is_available(), reason="Need CUDA available")
     def test_bmm_operation_with_observer(self):
         """Test torch.bmm with GPTQObserverTensor updates Hessian correctly."""
-        batch = 4
+        num_experts = 4
         m = 8
         n = 16
         k = 12
+        num_passes = 4
 
         # Create input and weight tensors
-        input_tensor = torch.randn(batch, m, k, dtype=torch.float32, device="cuda")
-        weight = torch.randn(batch, k, n, dtype=torch.float32, device="cuda")
-        observer_weight = GPTQObserverTensor.from_hp(weight)
+        weight = torch.randn(num_experts, n, k, dtype=torch.float32, device="cuda")
 
-        # Perform bmm operation
-        output = torch.bmm(input_tensor, observer_weight)
+        inputs = [
+            torch.randn(num_experts, m, k, dtype=torch.float32, device="cuda")
+            for _ in range(num_passes)
+        ]
 
-        # Check output shape
-        assert output.shape == (batch, m, n)
+        # 3D path: single observer with bmm
+        observer_3d = GPTQObserverTensor.from_hp(weight)
+        for x in inputs:
+            torch.bmm(x, observer_3d.transpose(-2, -1))
 
-        # Check Hessian was initialized and updated
-        assert observer_weight.hessian is not None
-        # For bmm with batch dimension, the Hessian is computed on the last dimension
-        assert observer_weight.total_batches == batch
-
-        # Verify output is correct
-        expected_output = torch.bmm(input_tensor, weight)
-        torch.testing.assert_close(output, expected_output)
+        # 2D path: per-expert observers with F.linear
+        observers_2d = [
+            GPTQObserverTensor.from_hp(weight[e]) for e in range(num_experts)
+        ]
+        for x in inputs:
+            for e in range(num_experts):
+                F.linear(x[e], observers_2d[e])
+
+        # Verify per-expert hessians match bitwise to calculating each expert's
+        # hessian individually
+        for e in range(num_experts):
+            assert torch.equal(observer_3d.hessian[e], observers_2d[e].hessian), (
+                f"Expert {e} hessian mismatch"
+            )
+            assert torch.equal(
+                observer_3d.total_batches[e : e + 1], observers_2d[e].total_batches
+            ), f"Expert {e} total_batches mismatch"
 
     @pytest.mark.skipif(not torch.cuda.is_available(), reason="Need CUDA available")
     @pytest.mark.parametrize(
@@ -260,15 +247,15 @@ def test_observer_config_transform(self, base_config):
             linear.weight.hessian,
             torch.zeros(64, 64, dtype=torch.float32, device="cuda"),
         )
-        assert linear.weight.total_batches == 0
+        assert (linear.weight.total_batches == 0).all()
 
         # Perform a forward pass
         input_tensor = torch.randn(4, 64, dtype=torch.float32, device="cuda")
        output = linear(input_tensor)
 
         # Check Hessian was initialized after forward pass
         assert linear.weight.hessian is not None
-        assert linear.weight.total_batches == 1
+        assert (linear.weight.total_batches == 1).all()
 
         # Check output shape
         assert output.shape == (4, 32)
@@ -384,6 +371,9 @@ def test_unified_config_two_phase(self, base_config):
     )
     def test_gptq_quantize_function(self, base_config):
         """Test gptq_quantize function with synthetic Hessian and weights."""
+        if isinstance(base_config, Int4WeightOnlyConfig) and is_sm_at_least_100():
+            pytest.skip("int4 kernels do not work on sm100")
+
         torch.manual_seed(42)
 
         # Create synthetic weight matrix
@@ -556,6 +546,8 @@ def test_gptq_sqnr(self, base_config):
             and not is_sm_at_least_100()
         ):
            pytest.skip("CUDA capability >= 10.0 required for nvfp4")
+        if isinstance(base_config, Int4WeightOnlyConfig) and is_sm_at_least_100():
+            pytest.skip("int4 kernels do not work on sm100")
 
         torch.manual_seed(43)
 
```
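The test above leans on the fact that a bmm against a weight transposed on its last two dims computes, per expert, the same thing as `F.linear` on that expert's slice, so both observation paths see identical per-expert activations. A small self-contained check of that identity (illustrative only; shapes are made up):

```python
import torch
import torch.nn.functional as F

E, M, N, K = 4, 8, 16, 12
x = torch.randn(E, M, K)   # activations, one slice per expert
w = torch.randn(E, N, K)   # weights, one (N, K) matrix per expert

# bmm against the weight transposed on its last two dims ...
bmm_out = torch.bmm(x, w.transpose(-2, -1))            # (E, M, N)
# ... matches a per-expert linear on each slice
for e in range(E):
    torch.testing.assert_close(bmm_out[e], F.linear(x[e], w[e]))
```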
torchao/prototype/gptq/api.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -141,7 +141,7 @@ def _gptq_config_transform(
     )
 
     # Validate that observations were recorded
-    if tensor.total_batches == 0:
+    if (tensor.total_batches == 0).any():
         raise ValueError(
             f"No observations recorded for {parameter_name}. "
             f"total_batches is 0. Did you run forward passes during the observe step?"
```
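For a quick sense of what the new check does: with the 3D observer, `total_batches` holds one counter per expert, so `(tensor.total_batches == 0).any()` raises if any single expert never saw calibration data. A toy illustration (values made up):

```python
import torch

# hypothetical per-expert counters after calibration; expert 1 saw no data
total_batches = torch.tensor([3, 0, 5, 2])
if (total_batches == 0).any():
    raise ValueError("at least one expert recorded no observations")
```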

torchao/prototype/gptq/observer.py

Lines changed: 78 additions & 28 deletions

```diff
@@ -11,54 +11,92 @@
 
 
 class GPTQObserverTensor(TorchAOBaseTensor):
-    tensor_data_names = ["hp_data"]
+    tensor_data_names = ["hp_data", "total_batches"]
     optional_tensor_data_names = ["hessian"]
-    tensor_attribute_names = ["total_batches"]
+    tensor_attribute_names = []
 
-    def __new__(cls, hp_data: torch.Tensor, total_batches: int, hessian=None):
+    def __new__(cls, hp_data: torch.Tensor, total_batches, hessian=None):
         shape = hp_data.shape
         kwargs = {}
         kwargs["device"] = hp_data.device
         kwargs["dtype"] = hp_data.dtype
         kwargs["requires_grad"] = False
         return torch.Tensor._make_wrapper_subclass(cls, shape, **kwargs)  # type: ignore[attr-defined]
 
-    def __init__(self, hp_data: torch.Tensor, total_batches: int, hessian=None):
+    def __init__(self, hp_data: torch.Tensor, total_batches, hessian=None):
         super().__init__()
         self.hp_data = hp_data
         self.hessian = hessian
-        self.total_batches = total_batches
+        if isinstance(total_batches, torch.Tensor):
+            self.total_batches = total_batches
+        elif len(self.hp_data.shape) == 3:
+            self.total_batches = torch.zeros(
+                self.hp_data.shape[0], dtype=torch.int64, device=self.hp_data.device
+            )
+        else:
+            self.total_batches = torch.zeros(
+                1, dtype=torch.int64, device=self.hp_data.device
+            )
 
         # initialize hessian
-        assert self.hp_data.is_contiguous()
         if self.hessian is None:
+            assert self.hp_data.is_contiguous()
             feature_dim = self.hp_data.shape[-1]
-            self.hessian = torch.zeros(
-                feature_dim,
-                feature_dim,
-                dtype=torch.float32,
-                device=self.hp_data.device,
-            )
-
-    def update(self, input: torch.Tensor):
-        """Incrementally update Hessian matrix from input activations."""
-        # Move input to same device as hp_data and convert to float
-        x = input.float().to(self.hp_data.device)
+            if len(self.hp_data.shape) == 2:
+                self.hessian = torch.zeros(
+                    feature_dim,
+                    feature_dim,
+                    dtype=torch.float32,
+                    device=self.hp_data.device,
+                )
+            else:
+                assert len(self.hp_data.shape) == 3, "unsupported"
+                expert_dim = self.hp_data.shape[0]
+                self.hessian = torch.zeros(
+                    expert_dim,
+                    feature_dim,
+                    feature_dim,
+                    dtype=torch.float32,
+                    device=self.hp_data.device,
+                )
+
+    @staticmethod
+    def _update_single_hessian(
+        x: torch.Tensor, hessian: torch.Tensor, total_batches: torch.Tensor
+    ):
+        """Update a single 2D Hessian and total_batches in-place."""
         shape = x.shape
-
-        # Calculate batch size
         n = 1 if len(shape) == 2 else shape[0]
         x = x.reshape(-1, shape[-1])
 
-        # Apply running average formula
-        if self.total_batches > 0:
-            self.hessian *= self.total_batches / (self.total_batches + n)
+        # cast to Python int64 for optimal type promotion semantics
+        # Note: there is definitely a better way to get ^, saving for
+        # a follow-up PR. For now, this preserves numerics.
+        tb = total_batches.item()
+        if tb > 0:
+            hessian *= tb / (tb + n)
 
-        self.total_batches += n
+        total_batches += n
+        # cast to Python int64 for optimal type promotion semantics
+        # Note: there is definitely a better way to get ^, saving for
+        # a follow-up PR. For now, this preserves numerics.
+        tb = total_batches.item()
 
-        # Update Hessian: x = ((2 / total_batches) ** (1 / 2)) * x.t()
-        x = ((2 / self.total_batches) ** (1 / 2)) * x.t()
-        self.hessian += x.matmul(x.t())
+        x = ((2 / tb) ** (1 / 2)) * x.t()
+        hessian += x.matmul(x.t())
+
+    def update_2d(self, input: torch.Tensor):
+        x = input.float().to(self.hp_data.device)
+        self._update_single_hessian(x, self.hessian, self.total_batches[0:1])
+
+    def update_3d(self, input: torch.Tensor):
+        x = input.float().to(self.hp_data.device)
+        # TODO(future PR): optimize if this is too slow
+        for e_idx in range(self.hessian.shape[0]):
+            x_cur = x[e_idx]
+            h_cur = self.hessian[e_idx]
+            total_batches = self.total_batches[e_idx : e_idx + 1]
+            self._update_single_hessian(x_cur, h_cur, total_batches)
 
     @classmethod
     def from_hp(cls, hp_tensor):
@@ -79,19 +117,31 @@ def _(func, types, args, kwargs):
         args[2] if len(args) > 2 else None,
     )
     if isinstance(weight_tensor, GPTQObserverTensor):
-        weight_tensor.update(input_tensor.detach())
+        weight_tensor.update_2d(input_tensor.detach())
         return F.linear(input_tensor, weight_tensor.hp_data, bias)
     else:
         raise ValueError(
             f"Expected weight_tensor to be GPTQObserverTensor, got: {type(weight_tensor)}"
         )
 
 
+@implements(aten.transpose.int)
+def _(func, types, args, kwargs):
+    self, dim0, dim1 = args[0], args[1], args[2]
+    assert {dim0, dim1} == {-2, -1} or {dim0, dim1} == {
+        self.hp_data.ndim - 2,
+        self.hp_data.ndim - 1,
+    }, f"only transpose of last two dims is supported, got dims {dim0}, {dim1}"
+    new_data = func(self.hp_data, dim0, dim1)
+    new_hessian = func(self.hessian, dim0, dim1)
+    return GPTQObserverTensor(new_data, self.total_batches, new_hessian)
+
+
 @implements(aten.bmm.default)
 def _(func, types, args, kwargs):
     input_tensor, weight_tensor = (
         args[0],
         args[1],
     )
-    weight_tensor.update(input_tensor.detach())
+    weight_tensor.update_3d(input_tensor.detach())
     return func(input_tensor, weight_tensor.hp_data)
```
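
For reference, a standalone numerical check (a sketch, not part of the PR) that the incremental rescale-then-add update in `_update_single_hessian` accumulates the same quantity as computing `H = (2 / num_batches) * sum_b X_b^T X_b` directly over all batches:

```python
import torch

torch.manual_seed(0)
K, M, num_batches = 12, 8, 5
batches = [torch.randn(M, K, dtype=torch.float64) for _ in range(num_batches)]

# incremental path, mirroring the running-average update above
hessian = torch.zeros(K, K, dtype=torch.float64)
tb = 0
for x in batches:
    n = 1
    if tb > 0:
        hessian *= tb / (tb + n)      # rescale previous contributions
    tb += n
    xt = ((2 / tb) ** 0.5) * x.t()
    hessian += xt @ xt.t()            # adds (2 / tb) * x^T x

# direct path
direct = (2 / num_batches) * sum(x.t() @ x for x in batches)
torch.testing.assert_close(hessian, direct)
```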
