Skip to content

Commit 1c3b180

Browse files
bbeckca and facebook-github-bot
authored and committed
Add Sparse2x4HIPSPARSELTFloat8Tensor (#4277)
Summary: X-link: pytorch/pytorch#180312 What: Adding a new tensor subclass for FP8 2:4 sparsity via hipSPARSELt (ROCm only). Packs compressed values + metadata into a single tensor with `_cslt_compress` and dispatches through `_cslt_sparse_mm` with `A_scale * B_scale` as `alpha`. Why: This hipSPARSELt path differs enough in packing and kernel routing from CUTLASS to warrant a dedicated path. Reference: https://rocm.blogs.amd.com/artificial-intelligence/introduce_hipsparselt/README.html Differential Revision: D100640267
1 parent 6529fca commit 1c3b180

5 files changed

Lines changed: 450 additions & 0 deletions

File tree

Lines changed: 154 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,154 @@
1+
# Copyright (c) Meta Platforms, Inc. and affiliates.
2+
# All rights reserved.
3+
#
4+
# This source code is licensed under the BSD 3-Clause license found in the
5+
# LICENSE file in the root directory of this source tree.
6+
import copy
7+
import logging
8+
import unittest
9+
10+
import torch
11+
from torch import nn
12+
from torch.testing._internal import common_utils
13+
14+
try:
15+
from torch.testing._internal.common_cuda import PLATFORM_SUPPORTS_FP8_SPARSE
16+
except ImportError:
17+
PLATFORM_SUPPORTS_FP8_SPARSE = False
18+
from torchao.quantization import (
19+
Float8DynamicActivationFloat8WeightConfig,
20+
)
21+
from torchao.quantization.granularity import PerTensor
22+
from torchao.quantization.quant_api import (
23+
quantize_,
24+
)
25+
from torchao.quantization.quantize_.workflows import (
26+
Float8PackingFormat,
27+
)
28+
from torchao.quantization.utils import compute_error
29+
from torchao.sparsity import apply_fake_sparsity
30+
from torchao.utils import (
31+
torch_version_at_least,
32+
)
33+
34+
# Configure root logging once at import time so test runs emit timestamped,
# logger-named INFO records (useful when diagnosing skips/failures on ROCm CI).
logging.basicConfig(
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", level=logging.INFO
)
37+
38+
39+
@unittest.skipIf(
    not torch_version_at_least("2.10.0"),
    "Need torch >= 2.10.0",
)
class TestFloat8Sparse2x4_1DData1DMetadataTensor(common_utils.TestCase):
    """Exercises the FP8 2:4-sparse tensor subclass packed as 1D data + 1D metadata.

    The packing format under test routes matmuls through hipSPARSELt, so every
    test is gated on a ROCm build with FP8 sparse support.
    """

    def setUp(self):
        # Guard clauses: bail out early on any environment that cannot run
        # the hipSPARSELt FP8 2:4 sparse path.
        if not torch.cuda.is_available():
            self.skipTest("Need CUDA available")
        if not torch.version.hip:
            self.skipTest("hipSPARSELt path is ROCm-only")
        if not PLATFORM_SUPPORTS_FP8_SPARSE:
            self.skipTest("Need platform with FP8 sparse support (hipSPARSELt)")

    @common_utils.parametrize("compile", [True, False])
    def test_fp8_hipsparselt_sparse(self, compile):
        """Compare SQNR of dense-FP8 vs sparse-FP8 quantization on a 2:4-pruned model."""
        with torch.inference_mode():
            activations = torch.rand((256, 256), dtype=torch.bfloat16, device="cuda")
            net = (
                nn.Sequential(
                    nn.Linear(256, 1024),
                    nn.Linear(1024, 256),
                )
                .bfloat16()
                .cuda()
                .eval()
            )

            # Prune the weights to a 2:4 pattern so the sparse path is applicable,
            # and capture the (pruned, unquantized) reference output.
            apply_fake_sparsity(net)
            reference_out = net(activations)
            dense_net = copy.deepcopy(net)

            # Path 1: plain dense FP8 dynamic-activation / FP8 weight quantization.
            dense_cfg = Float8DynamicActivationFloat8WeightConfig(
                granularity=PerTensor(),
            )
            quantize_(dense_net, dense_cfg)
            dense_out = dense_net(activations)
            dense_sqnr = compute_error(reference_out, dense_out)

            # Path 2: FP8 quantization with the hipSPARSELt 2:4 sparse packing.
            sparse_cfg = Float8DynamicActivationFloat8WeightConfig(
                version=2,
                packing_format=Float8PackingFormat.SPARSE_1D_DATA_1D_METADATA,
                granularity=PerTensor(),
            )
            quantize_(net, sparse_cfg)
            if compile:
                net = torch.compile(net)
            sparse_out = net(activations)
            sparse_sqnr = compute_error(reference_out, sparse_out)

            # Sparse and dense quantization should agree in quality (within
            # TestCase's default floating-point tolerance).
            self.assertEqual(dense_sqnr, sparse_sqnr)

    def test_fp8_hipsparselt_sparse_lowering_op_clone(self):
        """Validates clone dispatch correctly copies both sparse data and scale metadata."""
        with torch.inference_mode():
            net = nn.Linear(256, 1024).half().cuda().eval()
            apply_fake_sparsity(net)
            quantize_(
                net,
                Float8DynamicActivationFloat8WeightConfig(
                    version=2,
                    packing_format=Float8PackingFormat.SPARSE_1D_DATA_1D_METADATA,
                    granularity=PerTensor(),
                ),
            )

            # A clone must round-trip through dequantize to the same values
            # as the original subclass tensor.
            before = net.weight.dequantize()
            after = net.weight.clone().dequantize()

            for lhs, rhs in zip(before, after):
                self.assertEqual(lhs, rhs)

    def test_fp8_hipsparselt_sparse_lowering_op_to(self):
        """Validates both to.dtype_layout and to.dtype dispatch paths correctly dequantize the sparse tensor."""
        with torch.inference_mode():
            net = nn.Linear(256, 1024).half().cuda().eval()
            apply_fake_sparsity(net)
            pristine = copy.deepcopy(net)
            # Expected float weights come from the un-quantized copy.
            expected = pristine.weight.to(dtype=torch.float)

            quantize_(
                net,
                Float8DynamicActivationFloat8WeightConfig(
                    version=2,
                    packing_format=Float8PackingFormat.SPARSE_1D_DATA_1D_METADATA,
                    granularity=PerTensor(),
                ),
            )

            # Dispatch path 1: aten.to.dtype_layout.
            via_dtype_layout = torch.ops.aten.to.dtype_layout(
                net.weight,
                dtype=torch.float,
                layout=torch.strided,
            )
            torch.testing.assert_close(
                expected, via_dtype_layout, atol=1e-1, rtol=1e-1
            )

            # Dispatch path 2: aten.to.dtype.
            via_dtype = torch.ops.aten.to.dtype(
                net.weight,
                torch.float,
            )
            torch.testing.assert_close(
                expected, via_dtype, atol=1e-1, rtol=1e-1
            )
149+
150+
151+
common_utils.instantiate_parametrized_tests(TestFloat8Sparse2x4_1DData1DMetadataTensor)
152+
153+
if __name__ == "__main__":
154+
unittest.main()

torchao/quantization/quant_api.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@
5252
)
5353
from torchao.quantization.quantize_.workflows import (
5454
Float8PackingFormat,
55+
Float8Sparse2x4_1DData1DMetadataTensor,
5556
Float8Tensor,
5657
Int4ChooseQParamsAlgorithm,
5758
Int4PackingFormat,
@@ -1268,6 +1269,17 @@ def _float8_dynamic_activation_float8_weight_quantize_tensor(weight, config):
12681269
act_quant_kwargs=act_quant_kwargs,
12691270
)
12701271
return quantized_weight
1272+
elif packing_format == Float8PackingFormat.SPARSE_1D_DATA_1D_METADATA:
1273+
assert isinstance(weight_granularity, PerTensor), (
1274+
"Sparse 1D data 1D metadata packing format only supports per-tensor quantization"
1275+
)
1276+
quantized_weight = Float8Sparse2x4_1DData1DMetadataTensor.from_hp(
1277+
weight,
1278+
float8_dtype=weight_dtype,
1279+
granularity=weight_granularity,
1280+
act_quant_kwargs=act_quant_kwargs,
1281+
)
1282+
return quantized_weight
12711283

12721284

12731285
@register_quantize_module_handler(Float8DynamicActivationFloat8WeightConfig)

torchao/quantization/quantize_/workflows/__init__.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,9 @@
11
from .float8.float8_packing_format import (
22
Float8PackingFormat,
33
)
4+
from .float8.float8_sparse_2x4_1d_data_1d_metadata_tensor import (
5+
Float8Sparse2x4_1DData1DMetadataTensor,
6+
)
47
from .float8.float8_tensor import (
58
Float8Tensor,
69
QuantizeTensorToFloat8Kwargs,
@@ -45,6 +48,7 @@
4548
"QuantizeTensorToInt8Kwargs",
4649
"Float8Tensor",
4750
"Sparse2x4CUTLASSFloat8Tensor",
51+
"Float8Sparse2x4_1DData1DMetadataTensor",
4852
"Float8PackingFormat",
4953
"QuantizeTensorToFloat8Kwargs",
5054
"Int8Tensor",

torchao/quantization/quantize_/workflows/float8/float8_packing_format.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,23 @@ class Float8PackingFormat(str, Enum):
3333
This packing format will dispatch to `rowwise_scaled_linear_sparse_cutlass_f8f8`, which will fuse the per-row scaling into the sparse matmul.
3434
"""
3535
SPARSE_CUTLASS = "sparse_cutlass"
36+
"""
37+
Sparse packing format for 2:4 sparsity + FP8 quantization using hipSPARSELt (ROCm/AMD only).
38+
39+
SPARSE_1D_DATA_1D_METADATA will pack the quantized_data into a single tensor containing both the quantized data and metadata
40+
as a 1D tensor of r*c/2 + r*c/8 bytes with the following layout: [compressed_data | metadata]
41+
42+
- compressed_data: r*c/2 bytes
43+
The 2 non-zero FP8 values per group of 4 elements, stored row-major:
44+
row0_group0_val0, row0_group0_val1, row0_group1_val0, row0_group1_val1, ..., row1_group0_val0, ...
45+
- metadata: r*c/8 bytes
46+
4 bits per group of 4 elements encoding the positions of the 2 kept values
47+
(2 bits per kept element index), groups packed contiguously row-major:
48+
row0_group0_meta, row0_group1_meta, ..., row1_group0_meta, ...
49+
50+
This packing format will dispatch to torch._cslt_sparse_mm for matmul, with per-tensor scaling passed as alpha.
51+
"""
52+
SPARSE_1D_DATA_1D_METADATA = "sparse_1d_data_1d_metadata"
3653

3754

3855
torch.serialization.add_safe_globals([Float8PackingFormat])

0 commit comments

Comments
 (0)