Commit 12cd338

make gptq convert work for moe
Summary: Makes gptq + moe + nvfp4 work e2e, results as expected on tiny model + tiny dataset

Test Plan:

```
> TRITON_ALLOW_NON_CONSTEXPR_GLOBALS=1 torchao/prototype/gptq/gptq_nvfp4_olmoe_1b_7b_nonsequential_wikitext.sh

bf16

| Tasks  |Version|Filter|n-shot|    Metric     |   |Value |   |Stderr|
|--------|------:|------|-----:|---------------|---|-----:|---|------|
|wikitext|      2|none  |     0|bits_per_byte  |↓  |0.5895|±  |   N/A|
|        |       |none  |     0|byte_perplexity|↓  |1.5047|±  |   N/A|
|        |       |none  |     0|word_perplexity|↓  |8.8910|±  |   N/A|

real    0m59.219s
user    0m42.554s
sys     0m20.534s

nvfp4-rtn

| Tasks  |Version|Filter|n-shot|    Metric     |   |Value |   |Stderr|
|--------|------:|------|-----:|---------------|---|-----:|---|------|
|wikitext|      2|none  |     0|bits_per_byte  |↓  |0.6024|±  |   N/A|
|        |       |none  |     0|byte_perplexity|↓  |1.5183|±  |   N/A|
|        |       |none  |     0|word_perplexity|↓  |9.3277|±  |   N/A|

real    0m42.528s
user    0m41.217s
sys     0m12.817s

nvfp4-nonsequential with 4096 calibration samples on c4

| Tasks  |Version|Filter|n-shot|    Metric     |   |Value |   |Stderr|
|--------|------:|------|-----:|---------------|---|-----:|---|------|
|wikitext|      2|none  |     0|bits_per_byte  |↓  |0.6019|±  |   N/A|
|        |       |none  |     0|byte_perplexity|↓  |1.5177|±  |   N/A|
|        |       |none  |     0|word_perplexity|↓  |9.3087|±  |   N/A|

real    22m28.505s
user    22m36.008s
sys     0m13.872s
```

ghstack-source-id: 43538e0
ghstack-comment-id: 4315147581
Pull-Request: #4330
1 parent ace3d95 commit 12cd338

5 files changed

Lines changed: 124 additions & 10 deletions
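
For orientation before the diffs, here is a minimal sketch of the observe-then-convert GPTQ flow that this commit extends to 3D MoE expert weights. `GPTQConfig(step="convert", ...)` and the NVFP4 base config appear in the diffs below; the `"observe"` step name, the `NVFP4DynamicActivationNVFP4WeightConfig` import path, and the toy model/data are assumptions for illustration, not the confirmed API.

```python
import torch
from torch import nn

from torchao.prototype.gptq import GPTQConfig
from torchao.prototype.mx_formats.inference_workflow import (
    NVFP4DynamicActivationNVFP4WeightConfig,  # import path assumed from the test file
)
from torchao.quantization import quantize_

model = nn.Sequential(
    nn.Linear(128, 128, dtype=torch.bfloat16, device="cuda"),
)
base_config = NVFP4DynamicActivationNVFP4WeightConfig(
    use_dynamic_per_tensor_scale=True,
    use_triton_kernel=True,
)

# Step 1: swap weights for observers that accumulate Hessians during forward
# passes ("observe" step name is an assumption; the diffs only show "convert").
quantize_(model, GPTQConfig(step="observe", base_config=base_config))
for _ in range(8):
    model(torch.randn(4, 128, dtype=torch.bfloat16, device="cuda"))

# Step 2: convert each observed weight with GPTQ. With this commit, 3D
# (E, N, K) stacked MoE expert weights dispatch to the new gptq_quantize_3d.
quantize_(model, GPTQConfig(step="convert", base_config=base_config))
```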

test/prototype/gptq/test_gptqv2.py

Lines changed: 61 additions & 0 deletions
```diff
@@ -20,6 +20,7 @@
 from torchao.prototype.gptq import (
     GPTQConfig,
     gptq_quantize,
+    gptq_quantize_3d,
 )
 from torchao.prototype.gptq.observer import GPTQObserverTensor
 from torchao.prototype.mx_formats.inference_workflow import (
@@ -595,6 +596,66 @@ def test_gptq_quantize_better_than_naive(self, base_config):
         assert gptq_loss is not None
         assert naive_loss is not None
 
+    @pytest.mark.skipif(not torch.cuda.is_available(), reason="Need CUDA available")
+    @pytest.mark.skipif(
+        not is_sm_at_least_100(), reason="CUDA capability >= 10.0 required for nvfp4"
+    )
+    def test_gptq_quantize_2d_matches_3d(self):
+        """Verify per-expert gptq_quantize and gptq_quantize_3d produce bitwise-identical outputs."""
+        torch.manual_seed(43)
+
+        E = 4
+        out_features = 64
+        in_features = 128
+        num_samples = 10
+
+        base_config = NVFP4DynamicActivationNVFP4WeightConfig(
+            use_dynamic_per_tensor_scale=True,
+            use_triton_kernel=True,
+        )
+        config = GPTQConfig(step="convert", base_config=base_config)
+
+        # Per-expert weights (E, N, K) and per-expert Hessians (E, K, K)
+        weight_3d = torch.randn(
+            E, out_features, in_features, dtype=torch.bfloat16, device="cuda"
+        )
+        hessians = []
+        for _ in range(E):
+            activations = [
+                torch.randn(4, in_features, dtype=torch.float32, device="cuda")
+                for _ in range(num_samples)
+            ]
+            hessians.append(_calculate_hessian(activations, device="cuda"))
+        hessian_3d = torch.stack(hessians, dim=0)
+
+        # gptq_quantize mutates its weight/Hessian arguments in place, so clone
+        # per-experiment to keep the two paths independent.
+        weight_a = weight_3d.clone()
+        weight_b = weight_3d.clone()
+        hessian_a = hessian_3d.clone()
+        hessian_b = hessian_3d.clone()
+
+        # Experiment A: E separate 2D gptq_quantize calls
+        per_expert_2d = [
+            gptq_quantize(hessian_a[e], weight_a[e], config) for e in range(E)
+        ]
+
+        # Experiment B: single 3D gptq_quantize_3d call
+        stacked_3d = gptq_quantize_3d(hessian_b, weight_b, config)
+
+        # Bitwise match per expert
+        for e in range(E):
+            assert torch.equal(per_expert_2d[e].qdata, stacked_3d.qdata[e]), (
+                f"Expert {e}: qdata mismatch"
+            )
+            assert torch.equal(per_expert_2d[e].scale, stacked_3d.scale[e]), (
+                f"Expert {e}: scale mismatch"
+            )
+            assert torch.equal(
+                per_expert_2d[e].per_tensor_scale.view(1, 1),
+                stacked_3d.per_tensor_scale[e],
+            ), f"Expert {e}: per_tensor_scale mismatch"
+
     @pytest.mark.skipif(not torch.cuda.is_available(), reason="Need CUDA available")
     @pytest.mark.parametrize(
         "base_config",
```

torchao/prototype/gptq/__init__.py

Lines changed: 2 additions & 2 deletions
```diff
@@ -4,6 +4,6 @@
 # This source code is licensed under the BSD 3-Clause license found in the
 # LICENSE file in the root directory of this source tree.
 
-from .api import GPTQConfig, gptq_quantize
+from .api import GPTQConfig, gptq_quantize, gptq_quantize_3d
 
-__all__ = ["GPTQConfig", "gptq_quantize"]
+__all__ = ["GPTQConfig", "gptq_quantize", "gptq_quantize_3d"]
```

torchao/prototype/gptq/api.py

Lines changed: 50 additions & 1 deletion
```diff
@@ -149,7 +149,12 @@ def _gptq_config_transform(
 
     # Use pre-computed Hessian directly
     hessian = tensor.hessian
-    new_tensor = gptq_quantize(hessian, tensor.hp_data, config)
+    if len(tensor.shape) == 2:
+        new_tensor = gptq_quantize(hessian, tensor.hp_data, config)
+    else:
+        assert len(tensor.shape) == 3, "unsupported"
+        new_tensor = gptq_quantize_3d(hessian, tensor.hp_data, config)
+
     new_quantized_tensor = nn.Parameter(new_tensor, requires_grad=False)
     setattr(module, parameter_name, new_quantized_tensor)
     return module
@@ -592,7 +597,51 @@ def gptq_quantize(H: torch.Tensor, W_t: torch.Tensor, config: GPTQConfig):
     return result
 
 
+def gptq_quantize_3d(H: torch.Tensor, W_t: torch.Tensor, config: GPTQConfig):
+    """3D variant of gptq_quantize for MoE expert weights.
+
+    Args:
+        H: per-expert Hessian of shape (E, K, K)
+        W_t: stacked expert weights of shape (E, N, K)
+        config: GPTQ configuration (NVFP4 only)
+
+    Returns:
+        NVFP4Tensor of shape (E, N, K) assembled from per-expert 2D results.
+    """
+    assert H.dim() == 3 and W_t.dim() == 3
+    assert H.shape[0] == W_t.shape[0]
+    base_config = config.base_config
+    assert isinstance(base_config, NVFP4DynamicActivationNVFP4WeightConfig), (
+        "gptq_quantize_3d only supports NVFP4"
+    )
+
+    E = W_t.shape[0]
+    pieces = [gptq_quantize(H[e], W_t[e], config) for e in range(E)]
+
+    # Stack inner NVFP4Tensor fields along a new expert dim 0. These are plain
+    # tensors (uint8 / float8_e4m3fn / float32), so torch.stack goes through
+    # normal aten dispatch, not NVFP4Tensor.
+    qdata_3d = torch.stack([p.qdata for p in pieces], dim=0)
+    scale_3d = torch.stack([p.scale for p in pieces], dim=0)
+    per_tensor_scale_3d = torch.stack(
+        [p.per_tensor_scale.view(1, 1) for p in pieces], dim=0
+    )
+
+    return NVFP4Tensor(
+        qdata_3d,
+        scale_3d,
+        block_size=pieces[0].block_size,
+        orig_dtype=pieces[0].orig_dtype,
+        per_tensor_scale=per_tensor_scale_3d,
+        act_per_tensor_scale=None,
+        is_swizzled_scales=True,
+        use_triton_kernel=pieces[0].use_triton_kernel,
+        act_quant_kwargs=pieces[0].act_quant_kwargs,
+    )
+
+
 __all__ = [
     "GPTQConfig",
     "gptq_quantize",
+    "gptq_quantize_3d",
 ]
```
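
A hedged usage sketch of the new entry point. Shapes follow the docstring above and the test; the `NVFP4DynamicActivationNVFP4WeightConfig` import path is assumed from the test file, and per the test's skip conditions this requires a CUDA device with capability >= 10.0.

```python
import torch

from torchao.prototype.gptq import GPTQConfig, gptq_quantize_3d
from torchao.prototype.mx_formats.inference_workflow import (
    NVFP4DynamicActivationNVFP4WeightConfig,  # import path assumed
)

E, N, K = 4, 64, 128  # experts, out_features, in_features

# Synthetic PSD per-expert Hessians standing in for calibrated ones.
x = torch.randn(E, 256, K, dtype=torch.float32, device="cuda")
hessian = 2.0 * x.transpose(-1, -2) @ x  # (E, K, K)

weight = torch.randn(E, N, K, dtype=torch.bfloat16, device="cuda")

config = GPTQConfig(
    step="convert",
    base_config=NVFP4DynamicActivationNVFP4WeightConfig(
        use_dynamic_per_tensor_scale=True,
        use_triton_kernel=True,
    ),
)

# gptq_quantize mutates its weight/Hessian arguments in place (see the test
# above), so pass clones if the bf16 originals are still needed.
quantized = gptq_quantize_3d(hessian.clone(), weight.clone(), config)
assert quantized.qdata.shape[0] == E and quantized.scale.shape[0] == E
```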

torchao/prototype/gptq/gptq_example.py

Lines changed: 8 additions & 2 deletions
```diff
@@ -296,9 +296,13 @@ def main():
     args = parse_args()
 
     is_olmoe = args.model_id == OLMOE_MODEL_ID
-    if is_olmoe and args.quantization not in ("nvfp4-rtn", "nvfp4-gptq-nonsequential"):
+    if is_olmoe and args.quantization not in (
+        "none",
+        "nvfp4-rtn",
+        "nvfp4-gptq-nonsequential",
+    ):
         raise ValueError(
-            f"model {args.model_id} only supports 'nvfp4-rtn' or "
+            f"model {args.model_id} only supports 'none', 'nvfp4-rtn', or "
             f"'nvfp4-gptq-nonsequential', got '{args.quantization}'"
         )
 
@@ -403,6 +407,7 @@ def skip_lm_head_o_proj(module, fqn):
             _verify_olmoe_experts_quantized(model)
         else:
             quantize_(model, config, filter_fn=filter_fn_to_use)
+            print(model)
 
     elif args.quantization in [
         "int4-gptq-sequential",
@@ -449,6 +454,7 @@ def skip_lm_head_o_proj(module, fqn):
         )
         else:
             quantize_(model, observe_config, filter_fn=filter_fn_to_use)
+            print(model)
 
     # Prepare calibration dataset
     print(
```

torchao/prototype/gptq/gptq_nvfp4_olmoe_1b_7b_nonsequential_wikitext.sh

Lines changed: 3 additions & 5 deletions
```diff
@@ -8,18 +8,16 @@ COMMON_ARGS="--output-dir-prefix /home/dev/tmp/20260421 --model-id allenai/OLMoE
 
 # baseline (bf16)
 echo -e "\n\nbaseline (bf16)\n\n"
-# python -u torchao/prototype/gptq/gptq_example.py $COMMON_ARGS --quantization none
+time python -u torchao/prototype/gptq/gptq_example.py $COMMON_ARGS --quantization none
 echo -e "done"
 
 # nvfp4-rtn
 echo -e "\n\nnvfp4-rtn\n\n"
-# python -u torchao/prototype/gptq/gptq_example.py $COMMON_ARGS --quantization nvfp4-rtn
+time python -u torchao/prototype/gptq/gptq_example.py $COMMON_ARGS --quantization nvfp4-rtn
 echo -e "done"
 
 # nvfp4-gptq-nonsequential
 echo -e "\n\nnvfp4-gptq-nonsequential\n\n"
-# TODO(future PR): fix https://gist.github.com/vkuzo/51b2bfcee77fc193253faf007d99d694
-# and enable this
-# python -u torchao/prototype/gptq/gptq_example.py $COMMON_ARGS --quantization nvfp4-gptq-nonsequential --dataset-id c4 --dataset-split train
+time python -u torchao/prototype/gptq/gptq_example.py $COMMON_ARGS --quantization nvfp4-gptq-nonsequential --dataset-id c4 --dataset-split train --num-calibration-samples 4096
 echo -e "done"
```
