
Commit f0135d7

bbeckca authored and facebook-github-bot committed
Rename Sparse2x4CUTLASSFloat8Tensor to Float8Sparse2x4_2DData2DMetadataTensor (pytorch#4343)
Summary: Rename the CUTLASS float8 sparse tensor class to describe the memory layout:

- Class: Sparse2x4CUTLASSFloat8Tensor → Float8Sparse2x4_2DData2DMetadataTensor
- Enum: SPARSE_CUTLASS → SPARSE_2D_DATA_2D_METADATA (old value kept for backward compatibility)

The old Sparse2x4CUTLASSFloat8Tensor identifier remains importable via a backward-compatible alias.

Reviewed By: RandySheriff

Differential Revision: D102374347
1 parent e8a2ccc commit f0135d7

5 files changed

Lines changed: 19 additions & 16 deletions


test/quantization/quantize_/workflows/float8/test_sparse_2x4_cutlass_float8_tensor.py

Lines changed: 3 additions & 3 deletions
@@ -68,7 +68,7 @@ def test_fp8_cutlass_sparse(self, compile):
             model,
             Float8DynamicActivationFloat8WeightConfig(
                 version=2,
-                packing_format=Float8PackingFormat.SPARSE_CUTLASS,
+                packing_format=Float8PackingFormat.SPARSE_2D_DATA_2D_METADATA,
                 granularity=PerRow(),
             ),
         )
@@ -89,7 +89,7 @@ def test_fp8_cutlass_sparse_lowering_op_clone(self):
             model,
             Float8DynamicActivationFloat8WeightConfig(
                 version=2,
-                packing_format=Float8PackingFormat.SPARSE_CUTLASS,
+                packing_format=Float8PackingFormat.SPARSE_2D_DATA_2D_METADATA,
                 granularity=PerRow(),
             ),
         )
@@ -114,7 +114,7 @@ def test_fp8_cutlass_sparse_lowering_op_to(self):
             model,
             Float8DynamicActivationFloat8WeightConfig(
                 version=2,
-                packing_format=Float8PackingFormat.SPARSE_CUTLASS,
+                packing_format=Float8PackingFormat.SPARSE_2D_DATA_2D_METADATA,
                 granularity=PerRow(),
             ),
         )
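For orientation, a minimal end-to-end sketch of what these tests exercise, written against the names visible in this diff; the toy model, shapes, and the assumption of a CUTLASS-capable (sm9x) CUDA GPU are illustrative, not from the commit:

import torch
from torchao.quantization import (
    Float8DynamicActivationFloat8WeightConfig,
    PerRow,
    quantize_,
)
from torchao.quantization.quantize_.workflows import Float8PackingFormat

# Hypothetical toy model standing in for the test fixture.
model = torch.nn.Sequential(torch.nn.Linear(128, 256)).cuda().to(torch.bfloat16)

# Swap the Linear weight for a float8 + 2:4 sparse tensor using the renamed
# packing format; activations are quantized dynamically with per-row scales.
quantize_(
    model,
    Float8DynamicActivationFloat8WeightConfig(
        version=2,
        packing_format=Float8PackingFormat.SPARSE_2D_DATA_2D_METADATA,
        granularity=PerRow(),
    ),
)

out = model(torch.randn(16, 128, dtype=torch.bfloat16, device="cuda"))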

torchao/quantization/quant_api.py

Lines changed: 3 additions & 3 deletions
@@ -67,7 +67,7 @@
     IntxUnpackedToInt8Tensor,
     QuantizeTensorToFloat8Kwargs,
     QuantizeTensorToInt8Kwargs,
-    Sparse2x4CUTLASSFloat8Tensor,
+    Float8Sparse2x4_2DData2DMetadataTensor,
 )
 from torchao.quantization.transform_module import (
     _QUANTIZE_CONFIG_HANDLER,
@@ -1258,11 +1258,11 @@ def _float8_dynamic_activation_float8_weight_quantize_tensor(weight, config):
             act_quant_kwargs=act_quant_kwargs,
         )
         return quantized_weight
-    elif packing_format == Float8PackingFormat.SPARSE_CUTLASS:
+    elif packing_format == Float8PackingFormat.SPARSE_2D_DATA_2D_METADATA:
         assert isinstance(weight_granularity, PerRow), (
             "Sparse packing format only supports per-row quantization"
         )
-        quantized_weight = Sparse2x4CUTLASSFloat8Tensor.from_hp(
+        quantized_weight = Float8Sparse2x4_2DData2DMetadataTensor.from_hp(
             weight,
             float8_dtype=weight_dtype,
             granularity=weight_granularity,
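The handler above funnels the config into the tensor subclass's from_hp constructor. Where the config machinery is not needed, the hunk suggests direct construction is possible; a sketch assuming only the keyword names visible in the hunk (float8_dtype, granularity), a hypothetical dense weight, and the same sm9x GPU requirement:

import torch
from torchao.quantization import PerRow
from torchao.quantization.quantize_.workflows import (
    Float8Sparse2x4_2DData2DMetadataTensor,
)

# Hypothetical weight; per the class docstring, from_hp produces a float8
# 2:4-sparse weight packed as qdata + sparse_metadata.
weight = torch.randn(256, 128, dtype=torch.bfloat16, device="cuda")

qweight = Float8Sparse2x4_2DData2DMetadataTensor.from_hp(
    weight,
    float8_dtype=torch.float8_e4m3fn,
    granularity=PerRow(),
)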

torchao/quantization/quantize_/workflows/__init__.py

Lines changed: 5 additions & 2 deletions
@@ -8,8 +8,8 @@
     Float8Tensor,
     QuantizeTensorToFloat8Kwargs,
 )
-from .float8.sparse_2x4_cutlass_float8_tensor import (
-    Sparse2x4CUTLASSFloat8Tensor,
+from .float8.sparse_2x4_2d_data_2d_metadata_float8_tensor import (
+    Float8Sparse2x4_2DData2DMetadataTensor,
 )
 from .int4.int4_choose_qparams_algorithm import Int4ChooseQParamsAlgorithm
 from .int4.int4_packing_format import Int4PackingFormat
@@ -39,6 +39,8 @@
 )
 from .nf4.nf4_tensor import NF4Tensor, to_nf4
 
+Sparse2x4CUTLASSFloat8Tensor = Float8Sparse2x4_2DData2DMetadataTensor
+
 __all__ = [
     "Int4Tensor",
     "Int4PreshuffledTensor",
@@ -47,6 +49,7 @@
     "Int8Tensor",
     "QuantizeTensorToInt8Kwargs",
     "Float8Tensor",
+    "Float8Sparse2x4_2DData2DMetadataTensor",
     "Sparse2x4CUTLASSFloat8Tensor",
     "Float8Sparse2x4_1DData1DMetadataTensor",
     "Float8PackingFormat",

torchao/quantization/quantize_/workflows/float8/float8_packing_format.py

Lines changed: 2 additions & 2 deletions
@@ -29,10 +29,10 @@ class Float8PackingFormat(str, Enum):
     """
     Sparse packing format for 2:4 sparsity + FP8 quantization
 
-    SPARSE_CUTLASS will pack the quantized_data into two tensors, qdata and sparse_metadata, for the specified values and metadata respectively.
+    SPARSE_2D_DATA_2D_METADATA will pack the quantized_data into two tensors, qdata and sparse_metadata, for the specified values and metadata respectively.
     This packing format will dispatch to `rowwise_scaled_linear_sparse_cutlass_f8f8`, which will fuse the per-row scaling into the sparse matmul.
     """
-    SPARSE_CUTLASS = "sparse_cutlass"
+    SPARSE_2D_DATA_2D_METADATA = "sparse_2d_data_2d_metadata"
     """
     Sparse packing format for 2:4 sparsity + FP8 quantization using hipSPARSELt (ROCm/AMD only).
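Because Float8PackingFormat subclasses str, configs that serialize the raw string value round-trip through the enum constructor. A small check against the member added above (the commit summary says the old SPARSE_CUTLASS value is also kept for backward compatibility, though that alias falls outside this hunk):

from torchao.quantization.quantize_.workflows import Float8PackingFormat

# str-backed enum members compare equal to their raw value and can be
# reconstructed from it, which keeps serialized configs readable.
pf = Float8PackingFormat("sparse_2d_data_2d_metadata")
assert pf is Float8PackingFormat.SPARSE_2D_DATA_2D_METADATA
assert pf == "sparse_2d_data_2d_metadata"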

torchao/quantization/quantize_/workflows/float8/sparse_2x4_cutlass_float8_tensor.py renamed to torchao/quantization/quantize_/workflows/float8/sparse_2x4_2d_data_2d_metadata_float8_tensor.py

Lines changed: 6 additions & 6 deletions
@@ -31,7 +31,7 @@
 )
 
 __all__ = [
-    "Sparse2x4CUTLASSFloat8Tensor",
+    "Float8Sparse2x4_2DData2DMetadataTensor",
 ]
 
 aten = torch.ops.aten
@@ -40,7 +40,7 @@
 from .float8_tensor import QuantizeTensorToFloat8Kwargs
 
 
-class Sparse2x4CUTLASSFloat8Tensor(TorchAOBaseTensor):
+class Float8Sparse2x4_2DData2DMetadataTensor(TorchAOBaseTensor):
     """
     Float8 Quantized + 2:4 sparse (weight) Tensor using CUTLASS kernels, with float8 dynamic quantization for activation.
 
@@ -176,7 +176,7 @@ def from_hp(
         # Use CUTLASS rowwise fp8 + 2:4 sparse mm kernel
         qdata, sparse_metadata = to_sparse_semi_structured_cutlass_sm9x_f8(data)
 
-        return Sparse2x4CUTLASSFloat8Tensor(
+        return Float8Sparse2x4_2DData2DMetadataTensor(
             qdata,
             sparse_metadata,
             scale,
@@ -186,8 +186,8 @@ def from_hp(
         )
 
 
-implements = Sparse2x4CUTLASSFloat8Tensor.implements
-implements_torch_function = Sparse2x4CUTLASSFloat8Tensor.implements_torch_function
+implements = Float8Sparse2x4_2DData2DMetadataTensor.implements
+implements_torch_function = Float8Sparse2x4_2DData2DMetadataTensor.implements_torch_function
 
 
 @implements(aten.linear.default)
@@ -251,4 +251,4 @@ def _(func, types, args, kwargs):
 
 
 # Allow a model with Float8Tensor weights to be loaded with `weights_only=True`
-torch.serialization.add_safe_globals([Sparse2x4CUTLASSFloat8Tensor])
+torch.serialization.add_safe_globals([Float8Sparse2x4_2DData2DMetadataTensor])
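The add_safe_globals registration is what lets checkpoints containing this tensor subclass pass torch.load's weights_only allowlist. A sketch of the round trip, assuming a model quantized as in the test diff above and an illustrative file path:

import torch

# Saving the quantized model's state_dict serializes the tensor subclass.
torch.save(model.state_dict(), "fp8_sparse_checkpoint.pt")

# Because the class is registered via add_safe_globals, weights_only
# loading does not reject it.
state_dict = torch.load("fp8_sparse_checkpoint.pt", weights_only=True)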
