extend GPTQ coverage to grouped_mm

vkuzo · vkuzo · commit 7a3445534375 · 2026-04-24T20:30:55.000Z
Summary: Extend GPTQ for grouped_mm. Punting the redefinition of counting batches vs tokens to a future PR. Test Plan: ``` pytest test/prototype/gptq/test_gptqv2.py -s ``` ghstack-source-id: d93a8c3 ghstack-comment-id: 4313533590 Pull-Request: #4328
diff --git a/test/prototype/gptq/test_gptqv2.py b/test/prototype/gptq/test_gptqv2.py
@@ -213,6 +213,85 @@ def test_bmm_operation_with_observer(self):
                 observer_3d.total_batches[e : e + 1], observers_2d[e].total_batches
             ), f"Expert {e} total_batches mismatch"
 
+    @pytest.mark.skipif(not torch.cuda.is_available(), reason="Need CUDA available")
+    @pytest.mark.skipif(
+        not is_sm_at_least_100(),
+        reason="CUDA capability >= 10.0 required for _grouped_mm",
+    )
+    def test_grouped_mm_operation_with_observer(self):
+        """Check torch._grouped_mm on a 3D observer against per-expert F.linear.
+
+        One 3D GPTQObserverTensor is driven through torch._grouped_mm, and one
+        2D GPTQObserverTensor per expert is driven through F.linear on the
+        matching row slice. Both routes must end with identical Hessians and
+        identical batch counters.
+        """
+        num_experts, n, k = 4, 16, 12
+        weight = torch.randn(num_experts, n, k, dtype=torch.float32, device="cuda")
+
+        # Rows routed to each expert, one inner list per forward pass. Three of
+        # the four passes leave one expert without rows so that the observer's
+        # empty-slice skip is exercised.
+        tokens_per_expert = [
+            [1, 3, 4, 16],  # every expert receives rows
+            [0, 3, 4, 13],  # expert 0 receives none
+            [5, 5, 0, 5],  # expert 2 receives none
+            [2, 0, 6, 4],  # expert 1 receives none
+        ]
+
+        # grouped_mm takes cumulative end offsets rather than per-expert counts.
+        offs_list = []
+        for counts in tokens_per_expert:
+            running_total = 0
+            ends = []
+            for count in counts:
+                running_total += count
+                ends.append(running_total)
+            offs_list.append(torch.tensor(ends, device="cuda", dtype=torch.int32))
+
+        inputs = [
+            torch.randn(sum(counts), k, dtype=torch.float32, device="cuda")
+            for counts in tokens_per_expert
+        ]
+
+        # Route A: one 3D observer fed through _grouped_mm.
+        observer_3d = GPTQObserverTensor.from_hp(weight)
+        for x, offs in zip(inputs, offs_list):
+            torch._grouped_mm(x, observer_3d.transpose(-2, -1), offs=offs)
+
+        # Route B: one 2D observer per expert, fed through F.linear on the
+        # rows that belong to that expert. Experts without rows are skipped.
+        observers_2d = []
+        for e in range(num_experts):
+            observers_2d.append(GPTQObserverTensor.from_hp(weight[e]))
+        for x, offs in zip(inputs, offs_list):
+            ends = offs.tolist()
+            starts = [0] + ends[:-1]
+            for observer_2d, start, end in zip(observers_2d, starts, ends):
+                if end > start:
+                    F.linear(x[start:end], observer_2d)
+
+        # Both routes must agree bitwise, expert by expert.
+        for e, observer_2d in enumerate(observers_2d):
+            hessians_match = torch.equal(observer_3d.hessian[e], observer_2d.hessian)
+            assert hessians_match, f"Expert {e} hessian mismatch"
+            counters_match = torch.equal(
+                observer_3d.total_batches[e : e + 1], observer_2d.total_batches
+            )
+            assert counters_match, f"Expert {e} total_batches mismatch"
+
+        # Independent cross-check computed straight from the routing table:
+        # the counter is expected to grow by one for every forward pass in
+        # which the expert received at least one row.
+        active_passes = [
+            sum(count > 0 for count in per_pass_counts)
+            for per_pass_counts in zip(*tokens_per_expert)
+        ]
+        expected_total_batches = torch.tensor(
+            active_passes, dtype=torch.int64, device="cuda"
+        )
+        assert torch.equal(observer_3d.total_batches, expected_total_batches), (
+            f"total_batches {observer_3d.total_batches.tolist()} "
+            f"does not match expected {expected_total_batches.tolist()}"
+        )
+
     @pytest.mark.skipif(not torch.cuda.is_available(), reason="Need CUDA available")
     @pytest.mark.parametrize(
         "base_config",
diff --git a/torchao/prototype/gptq/observer.py b/torchao/prototype/gptq/observer.py
@@ -30,6 +30,9 @@ def __init__(self, hp_data: torch.Tensor, total_batches, hessian=None):
         if isinstance(total_batches, torch.Tensor):
             self.total_batches = total_batches
         elif len(self.hp_data.shape) == 3:
+            # TODO(future PR): audit whether we need to change this
+            # from `total_batches` (current) to something like `total_tokens`,
+            # to ensure that each token is weighted equally in the 3d case.
             self.total_batches = torch.zeros(
                 self.hp_data.shape[0], dtype=torch.int64, device=self.hp_data.device
             )
@@ -98,6 +101,23 @@ def update_3d(self, input: torch.Tensor):
             total_batches = self.total_batches[e_idx : e_idx + 1]
             self._update_single_hessian(x_cur, h_cur, total_batches)
 
+    def update_3d_with_offs(self, input: torch.Tensor, offs: torch.Tensor):
+        """Update each expert's Hessian from its slice of a grouped input.
+
+        Args:
+            input: rows for all experts stacked along dim 0; cast to float and
+                moved to ``hp_data``'s device before slicing.
+            offs: cumulative end indices, one per expert. Expert ``e`` owns the
+                rows from the previous expert's end (0 for the first expert)
+                up to ``offs[e]``.
+
+        Experts whose slice is empty are left untouched.
+        """
+        rows = input.float().to(self.hp_data.device)
+        # Fetch all boundaries in a single transfer rather than paying a
+        # GPU->CPU sync per expert.
+        # TODO(future PR): optimize if this is too slow
+        boundaries = offs.tolist()
+        start = 0
+        for expert in range(self.hessian.shape[0]):
+            stop = boundaries[expert]
+            if stop != start:
+                expert_rows = rows[start:stop]
+                expert_hessian = self.hessian[expert]
+                # Length-1 slice of the counter tensor, not a scalar.
+                expert_batches = self.total_batches[expert : expert + 1]
+                self._update_single_hessian(
+                    expert_rows, expert_hessian, expert_batches
+                )
+            start = stop
+
     @classmethod
     def from_hp(cls, hp_tensor):
+        """Wrap ``hp_tensor`` in a GPTQObserverTensor, passing 0 for the batch count and no Hessian."""
         return GPTQObserverTensor(hp_tensor, 0, None)
@@ -145,3 +165,13 @@ def _(func, types, args, kwargs):
     )
     weight_tensor.update_3d(input_tensor.detach())
     return func(input_tensor, weight_tensor.hp_data)
+
+
+@implements([aten._grouped_mm.default])
+def _(func, types, args, kwargs):
+    """Observe the activations of a grouped matmul, then run it on the plain weight.
+
+    ``args[0]`` is the activation matrix and ``args[1]`` must be the
+    GPTQObserverTensor weight. ``offs`` may be given as the third positional
+    argument or as a keyword and is required. Any further positional or
+    keyword arguments supplied by the caller (e.g. ``bias`` / ``out_dtype``)
+    are forwarded unchanged to the underlying op.
+    """
+    # Copy so that popping ``offs`` never mutates the caller's dict; also
+    # tolerates ``kwargs`` being None.
+    kwargs = dict(kwargs) if kwargs else {}
+    mat_a, mat_b = args[0], args[1]
+    offs = args[2] if len(args) > 2 else kwargs.pop("offs", None)
+    assert offs is not None, "offs is required for grouped_mm"
+    assert isinstance(mat_b, GPTQObserverTensor)
+    # Detach so the Hessian update does not take part in autograd.
+    mat_b.update_3d_with_offs(mat_a.detach(), offs)
+    # Re-dispatch on the unwrapped weight, keeping every argument after
+    # ``offs`` instead of silently dropping it.
+    return func(mat_a, mat_b.hp_data, offs, *args[3:], **kwargs)