
Commit 8572ca8

Update on "Refactor use_triton_kernel to use nvfp4_quantize_kernel_choice"
Summary: This prepares for the addition of the flashinfer quantize kernel path in the next PR.
Test Plan: python test/prototype/mx_formats/test_inference_workflow.py
Reviewers:
Subscribers:
Tasks:
Tags:

[ghstack-poisoned]
2 parents 9681579 + 1815c88 commit 8572ca8

28 files changed: 131 additions & 1111 deletions

benchmarks/benchmark_e2e_fp8_sparse_linear.py

Lines changed: 0 additions & 16 deletions
@@ -14,7 +14,6 @@
 )
 from torchao.prototype.sparsity.activation.utils import SquaredReLU
 from torchao.quantization import (
-    Float8DynamicActivationFloat8SemiSparseWeightConfig,
     Float8DynamicActivationFloat8WeightConfig,
     Float8MMConfig,
     PerRow,
@@ -84,20 +83,6 @@ def benchmark(num_tokens, hidden_size=8192, intermediate_size=8192):
     ffn_clone.forward = torch.compile(ffn_clone.forward, fullgraph=True)
     fp8_c_time = benchmark_microseconds(ffn_clone, input_tensor)

-    # fp8 sparse
-    ffn_clone = (
-        nn.Sequential(
-            nn.Linear(hidden_size, intermediate_size, bias=False),
-            SquaredReLU(),
-            nn.Linear(intermediate_size, hidden_size, bias=False),
-        )
-        .to(torch.bfloat16)
-        .cuda()
-    )
-    quantize_(ffn_clone, Float8DynamicActivationFloat8SemiSparseWeightConfig())
-    ffn_clone.forward = torch.compile(ffn_clone.forward, fullgraph=True)
-    fp8_c_sparse_time = benchmark_microseconds(ffn_clone, input_tensor)
-
     # activation fp8 sparse
     ffn_clone = (
         nn.Sequential(
@@ -127,7 +112,6 @@ def benchmark(num_tokens, hidden_size=8192, intermediate_size=8192):
         "bf16_latency (us)": fp16_time,
         "bf16_c_latency (us)": fp16_c_time,
         "fp8_c_time (us)": fp8_c_time,
-        "fp8_c_sparse_time (us)": fp8_c_sparse_time,
         "fp8_c_activation_sparse_time (us)": fp8_c_activation_sparse_time,
         "ao_fast_sparsification_time (us)": ao_fast_sparsification_time,
         "cusparselt_compress_time (us)": cusparselt_time,

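For orientation, here is a minimal sketch of the dense fp8 path that this benchmark keeps (the removed block above measured the same squared-ReLU FFN with the semi-sparse config). The `PerRow` granularity and the exact `Float8DynamicActivationFloat8WeightConfig` arguments are assumptions, not read from the unchanged parts of the script; `benchmark_microseconds` is the script's own helper.

```python
import torch
import torch.nn as nn

from torchao.prototype.sparsity.activation.utils import SquaredReLU
from torchao.quantization import (
    Float8DynamicActivationFloat8WeightConfig,
    PerRow,
    quantize_,
)

hidden_size, intermediate_size = 8192, 8192

# Same squared-ReLU FFN that the removed semi-sparse block built.
ffn_clone = (
    nn.Sequential(
        nn.Linear(hidden_size, intermediate_size, bias=False),
        SquaredReLU(),
        nn.Linear(intermediate_size, hidden_size, bias=False),
    )
    .to(torch.bfloat16)
    .cuda()
)

# Dense fp8 dynamic-activation quantization: the path the benchmark retains.
quantize_(ffn_clone, Float8DynamicActivationFloat8WeightConfig(granularity=PerRow()))
ffn_clone.forward = torch.compile(ffn_clone.forward, fullgraph=True)
```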
benchmarks/microbenchmarks/test/test_utils.py

Lines changed: 0 additions & 7 deletions
@@ -13,7 +13,6 @@
     BenchmarkConfig,
     BenchmarkResult,
     BlockSparseWeightConfig,
-    Float8DynamicActivationFloat8SemiSparseWeightConfig,
     Int4WeightOnlyConfig,
     SemiSparseWeightConfig,
     clean_caches,
@@ -112,12 +111,6 @@ def test_string_to_config_sparsity(self):
         config = string_to_config("marlin", "semi-sparse")
         self.assertIsInstance(config, Int4WeightOnlyConfig)

-        # Test float8 with semi-sparse
-        config = string_to_config("float8dq", "semi-sparse")
-        self.assertIsInstance(
-            config, Float8DynamicActivationFloat8SemiSparseWeightConfig
-        )
-
     def test_block_sparsity_with_baseline_quantization(self):
         """Test that block sparsity with baseline quantization returns BlockSparseWeightConfig"""
         config = string_to_config("baseline", "block")

benchmarks/microbenchmarks/utils.py

Lines changed: 1 addition & 28 deletions
@@ -14,7 +14,6 @@

 from torchao.core.config import AOBaseConfig
 from torchao.quantization import (
-    Float8DynamicActivationFloat8SemiSparseWeightConfig,
     Float8DynamicActivationFloat8WeightConfig,
     Float8WeightOnlyConfig,
     GemliteUIntXWeightOnlyConfig,
@@ -23,7 +22,6 @@
     MappingType,
     PerRow,
     PerTensor,
-    UIntXWeightOnlyConfig,
 )
 from torchao.sparsity.sparse_api import BlockSparseWeightConfig, SemiSparseWeightConfig

@@ -192,30 +190,7 @@ def string_to_config(
         return Int8DynamicActivationInt8WeightConfig(weight_only_decode=True)
     else:
         return Int8DynamicActivationInt8WeightConfig()
-    if "uintx" in quantization:
-        # uintx-nbits-group_size, e.g. "uintx-2-64"
-        if "hqq" in quantization:
-            # uintx-nbits-group_size-hqq
-            use_hqq = True
-        else:
-            use_hqq = False
-        _quant_args = quantization.split("-")
-        nbits = int(_quant_args[1])
-        assert nbits >= 1 and nbits <= 8, "nbits must be 1 to 8"
-        _NBITS_TO_DTYPE = {
-            1: torch.uint1,
-            2: torch.uint2,
-            3: torch.uint3,
-            4: torch.uint4,
-            5: torch.uint5,
-            6: torch.uint6,
-            7: torch.uint7,
-            8: torch.uint8,
-        }
-        dtype = _NBITS_TO_DTYPE[nbits]
-        group_size = int(_quant_args[2])
-        return UIntXWeightOnlyConfig(dtype, group_size, use_hqq=use_hqq)
-    elif "int8_dynamic_activation_intx_weight" in quantization:
+    if "int8_dynamic_activation_intx_weight" in quantization:
         assert high_precision_dtype == torch.float32, (
             "int8_dynamic_activation_intx_weight requires using high_precision_dtype=torch.float32"
         )
@@ -242,8 +217,6 @@ def string_to_config(
     elif "float8wo" in quantization:
         return Float8WeightOnlyConfig()
     elif "float8dq" in quantization:
-        if sparsity and "semi" in sparsity:
-            return Float8DynamicActivationFloat8SemiSparseWeightConfig()
         granularity = str(quantization.split("-")[-1])
         if granularity == "tensor":
             granularity = PerTensor()
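
To make the effect of the trimmed `string_to_config` concrete, here is a small sketch of how a quantization string is resolved after this change. The import path is assumed from the file layout, and the positional `(quantization, sparsity)` call shape follows the tests in the previous file; the `"float8dq-tensor"` string follows the granularity parsing visible in the last hunk.

```python
# Sketch only: resolve a quantization/sparsity string pair to an AOBaseConfig.
from benchmarks.microbenchmarks.utils import string_to_config

# Still supported: dense float8 dynamic-activation quantization, per-tensor granularity.
config = string_to_config("float8dq-tensor", None)

# No longer special-cased after this commit:
#   - "float8dq" + "semi-sparse" no longer returns
#     Float8DynamicActivationFloat8SemiSparseWeightConfig
#   - "uintx-<nbits>-<group_size>" strings are no longer recognized at all
print(type(config).__name__)  # expected: Float8DynamicActivationFloat8WeightConfig
```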

docs/source/eager_tutorials/finetuning.rst

Lines changed: 5 additions & 5 deletions
@@ -76,7 +76,7 @@ is optional:

    # Fine-tuning with QAT, by default:
    # activations are fake quantized to asymmetric per token int8
-   # weights are fake quantized to symmetric per group int4
+   # weights are fake quantized to symmetric per group int4
    # configurable through "quantizer._component_" in the command
    tune run --nnodes 1 --nproc_per_node 4 qat_distributed --config llama3_2/3B_qat_full batch_size=16

@@ -205,13 +205,13 @@ because we are not actually casting the fake quantized values.

 .. code:: py

-    from torchao.quantization import quantize_, Int8DynamicActivationInt4WeightConfig
+    from torchao.quantization import quantize_, Int4WeightOnlyConfig
     from torchao.quantization.qat import QATConfig

     model = get_model()

     # prepare: swap `torch.nn.Linear` -> `FakeQuantizedLinear`
-    base_config = Int8DynamicActivationInt4WeightConfig(group_size=32)
+    base_config = Int4WeightOnlyConfig(group_size=32)
     quantize_(model, QATConfig(base_config, step="prepare"))

     # fine-tune
@@ -225,7 +225,7 @@ The next step is to actually quantize the model:

 .. code:: py

-    from torchao.quantization import Int8DynamicActivationInt4WeightConfig
+    from torchao.quantization import Int4WeightOnlyConfig

     # convert: swap `FakeQuantizedLinear` -> `torch.nn.Linear`, then quantize using `base_config`
     quantize_(model, QATConfig(base_config, step="convert"))
@@ -381,7 +381,7 @@ for fine-tuning Llama3.2-3B in float8:
     fp8_tensorwise            7222.198 (+11.074%)    30.010 (-0.266%)
     fp8_rowwise               6387.968 (-1.756%)     29.158 (-3.096%)
     fp8_rowwise_with_gw_hp    7573.698 (+16.480%)    29.516 (-1.908%)
-
+
     experiment_name           hellaswag_acc      wikitext_word_perplexity
     ----------------------    ---------------    --------------------------
     bf16                      0.533 (+0.000)     12.407 (+0.000)
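
Putting the two updated snippets together, the full prepare → fine-tune → convert flow described in this tutorial now reads as follows (`get_model` and `train_loop` are placeholders, as in the docs themselves):

```python
from torchao.quantization import Int4WeightOnlyConfig, quantize_
from torchao.quantization.qat import QATConfig

model = get_model()  # placeholder: any model with torch.nn.Linear layers

# prepare: swap `torch.nn.Linear` -> `FakeQuantizedLinear`
base_config = Int4WeightOnlyConfig(group_size=32)
quantize_(model, QATConfig(base_config, step="prepare"))

# fine-tune with fake quantization in the loop
train_loop(model)  # placeholder training loop

# convert: swap `FakeQuantizedLinear` -> `torch.nn.Linear`, then quantize using `base_config`
quantize_(model, QATConfig(base_config, step="convert"))
```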
Lines changed: 15 additions & 3 deletions
@@ -1,7 +1,19 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD 3-Clause license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
 import torch.nn as nn

-from torchao.prototype.quantization import Int8DynamicActivationInt4WeightConfig
-from torchao.quantization import quantize_
+from torchao.quantization import Int8DynamicActivationIntxWeightConfig, quantize_
+from torchao.quantization.granularity import PerGroup

 model = nn.Sequential(nn.Linear(2048, 2048, device="cuda"))
-quantize_(model, Int8DynamicActivationInt4WeightConfig())
+quantize_(
+    model,
+    Int8DynamicActivationIntxWeightConfig(
+        weight_dtype=torch.int4, weight_granularity=PerGroup(32)
+    ),
+)
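
A quick usage sketch for the rewritten example script above. The input shape is arbitrary, and it assumes a CUDA device is available and that the int8-dynamic-activation/int4-weight path supports a plain forward on that device:

```python
import torch
import torch.nn as nn

from torchao.quantization import Int8DynamicActivationIntxWeightConfig, quantize_
from torchao.quantization.granularity import PerGroup

model = nn.Sequential(nn.Linear(2048, 2048, device="cuda"))
quantize_(
    model,
    Int8DynamicActivationIntxWeightConfig(
        weight_dtype=torch.int4, weight_granularity=PerGroup(32)
    ),
)

# int8 dynamically-quantized activations, int4 per-group(32) weights
x = torch.randn(16, 2048, device="cuda")
with torch.no_grad():
    out = model(x)
print(out.shape)  # torch.Size([16, 2048])
```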

docs/source/workflows/qat.md

Lines changed: 3 additions & 4 deletions
@@ -78,19 +78,18 @@ the corresponding fake quantization configs to use.
 2. **Convert:** quantize the model using the base config provided

 Currently only the following PTQ base configs are supported:
-- [`Int8DynamicActivationInt4WeightConfig`](https://docs.pytorch.org/ao/main/generated/torchao.quantization.Int8DynamicActivationInt4WeightConfig.html)
 - [`Int4WeightOnlyConfig`](https://docs.pytorch.org/ao/main/generated/torchao.quantization.Int4WeightOnlyConfig.html)

 For example (most use cases):

 ```python
-from torchao.quantization import quantize_, Int8DynamicActivationInt4WeightConfig
+from torchao.quantization import quantize_, Int4WeightOnlyConfig
 from torchao.quantization.qat import QATConfig

 model = get_model()

 # prepare: swap `torch.nn.Linear` -> `FakeQuantizedLinear`
-base_config = Int8DynamicActivationInt4WeightConfig(group_size=32)
+base_config = Int4WeightOnlyConfig(group_size=32)
 quantize_(model, QATConfig(base_config, step="prepare"))

 # train
@@ -109,7 +108,7 @@ and/or weights. For example, the following usage is numerically equivalent
 to the above:

 ```python
-from torchao.quantization import quantize_, Int8DynamicActivationInt4WeightConfig
+from torchao.quantization import quantize_, Int4WeightOnlyConfig
 from torchao.quantization.qat import IntxFakeQuantizeConfig, QATConfig

 model = get_model()
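
The hunk above only shows the import change for the customized flow in qat.md. For context, here is a hedged sketch of what that flow looks like when built from explicit fake-quantization configs instead of a PTQ base config. The keyword names (`weight_config`, `step`) and the `IntxFakeQuantizeConfig` arguments follow the torchao QAT documentation and are assumptions as far as this diff is concerned; the int4 group size of 32 mirrors `Int4WeightOnlyConfig(group_size=32)` above.

```python
import torch

from torchao.quantization import quantize_
from torchao.quantization.qat import IntxFakeQuantizeConfig, QATConfig

model = get_model()  # placeholder from the docs above

# Weight-only int4 fake quantization with per-group size 32, specified
# directly rather than derived from a PTQ base config.
weight_config = IntxFakeQuantizeConfig(torch.int4, group_size=32)
quantize_(model, QATConfig(weight_config=weight_config, step="prepare"))
```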

test/core/test_config.py

Lines changed: 1 addition & 12 deletions
@@ -34,11 +34,9 @@
     Float8WeightOnlyConfig,
     GemliteUIntXWeightOnlyConfig,
     Int4WeightOnlyConfig,
-    Int8DynamicActivationInt4WeightConfig,
     Int8DynamicActivationInt8WeightConfig,
     Int8WeightOnlyConfig,
     ModuleFqnToConfig,
-    UIntXWeightOnlyConfig,
     quantize_,
 )
 from torchao.quantization.quantize_.common.quantization_step import QuantizationStep
@@ -56,7 +54,6 @@
     Float8WeightOnlyConfig(
         weight_dtype=torch.float8_e4m3fn,
     ),
-    UIntXWeightOnlyConfig(dtype=torch.uint1),
     Float8DynamicActivationInt4WeightConfig(),
     Int4WeightOnlyConfig(
         group_size=32,
@@ -67,19 +64,11 @@
         int4_choose_qparams_algorithm="hqq",
         version=2,
     ),
-    Int8DynamicActivationInt4WeightConfig(
-        group_size=64,
-    ),
     Int8DynamicActivationInt8WeightConfig(),
     # Int8DynamicActivationInt8WeightConfig(layout=SemiSparseLayout()),
     Int8WeightOnlyConfig(
         group_size=128,
     ),
-    UIntXWeightOnlyConfig(
-        dtype=torch.uint3,
-        group_size=32,
-        use_hqq=True,
-    ),
     GemliteUIntXWeightOnlyConfig(
         group_size=128,  # Optional, has default of 64
         bit_width=8,  # Optional, has default of 4
@@ -92,7 +81,7 @@
     ModuleFqnToConfig(
         {
             "linear1": Int4WeightOnlyConfig(),
-            "linear2": Int8DynamicActivationInt4WeightConfig(),
+            "linear2": Int8DynamicActivationInt8WeightConfig(),
         }
     ),
     AWQConfig(
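
test/core/test_config.py drives each config in this list through torchao's config (de)serialization. As a minimal sketch of that round-trip for one of the configs that remains in the list; the helper names `config_to_dict`/`config_from_dict` come from `torchao.core.config` and are assumed here rather than shown in this diff:

```python
from torchao.core.config import config_from_dict, config_to_dict
from torchao.quantization import Int8DynamicActivationInt8WeightConfig

original = Int8DynamicActivationInt8WeightConfig()

# Serialize to a plain (JSON-friendly) dict and reconstruct the config from it.
as_dict = config_to_dict(original)
reloaded = config_from_dict(as_dict)

assert isinstance(reloaded, Int8DynamicActivationInt8WeightConfig)
```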

test/dtypes/test_affine_quantized.py

Lines changed: 0 additions & 2 deletions
@@ -24,7 +24,6 @@
 from torchao.quantization import (
     Float8WeightOnlyConfig,
     GemliteUIntXWeightOnlyConfig,
-    Int8DynamicActivationInt4WeightConfig,
     Int8DynamicActivationInt8WeightConfig,
     Int8WeightOnlyConfig,
     quantize_,
@@ -49,7 +48,6 @@ def get_quantization_functions(
 ):
     base_functions = [
         Int8WeightOnlyConfig(),
-        Int8DynamicActivationInt4WeightConfig(),
         Int8DynamicActivationInt8WeightConfig(),
         Int8DynamicActivationInt8WeightConfig(act_mapping_type=MappingType.ASYMMETRIC),
     ]
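
For reference, a sketch of how the trimmed `base_functions` list is typically consumed in this test file: each remaining config is applied to a fresh linear via `quantize_` and a forward pass is run as a smoke test. The shapes, dtype, and CUDA device below are assumptions, not taken from the diff.

```python
import torch
import torch.nn as nn

from torchao.quantization import (
    Int8DynamicActivationInt8WeightConfig,
    Int8WeightOnlyConfig,
    quantize_,
)

base_functions = [
    Int8WeightOnlyConfig(),
    Int8DynamicActivationInt8WeightConfig(),
]

for config in base_functions:
    linear = nn.Linear(128, 256, dtype=torch.bfloat16, device="cuda")
    quantize_(linear, config)
    # smoke test: the quantized linear still runs
    linear(torch.randn(1, 128, dtype=torch.bfloat16, device="cuda"))
```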

test/dtypes/test_uintx.py

Lines changed: 1 addition & 82 deletions
@@ -10,7 +10,7 @@
 import torch

 from torchao.prototype.dtypes.uintx.uintx_layout import to_uintx
-from torchao.quantization.quant_api import UIntXWeightOnlyConfig, quantize_
+from torchao.quantization.quant_api import quantize_  # noqa: F401
 from torchao.quantization.quant_primitives import (
     MappingType,
     choose_qparams_affine,
@@ -60,38 +60,6 @@ def forward(self, x):
         return self.net(x)


-@pytest.mark.parametrize("dtype", dtypes)
-@pytest.mark.parametrize("group_size", group_sizes)
-@pytest.mark.skipif(not torch.accelerator.is_available(), reason="GPU not available")
-def test_uintx_quant_on_cpu_then_move_to_cuda(dtype, group_size):
-    scale = 512
-    fp16_mod_on_cpu = Linear16(scale, "cpu")
-    device = get_current_accelerator_device()
-    quantize_(fp16_mod_on_cpu, UIntXWeightOnlyConfig(dtype, group_size=group_size))
-    test_input_on_cpu = torch.randn(scale * 2, dtype=torch.float16, device="cpu")
-    output_on_cpu = fp16_mod_on_cpu(test_input_on_cpu)
-    fp16_mod_on_cuda = fp16_mod_on_cpu.to(device)
-    test_input_on_cuda = test_input_on_cpu.to(device)
-    output_on_cuda = fp16_mod_on_cuda(test_input_on_cuda)
-    assert torch.allclose(output_on_cpu, output_on_cuda.cpu(), atol=1.0e-3), (
-        "The output of the model on CPU and CUDA should be close"
-    )
-
-
-@pytest.mark.parametrize("dtype", dtypes)
-@pytest.mark.parametrize("group_size", group_sizes)
-@pytest.mark.parametrize("device", devices)
-@pytest.mark.skipif(not torch.accelerator.is_available(), reason="GPU not available")
-def test_uintx_weight_only_model_quant(dtype, group_size, device):
-    scale = 512
-    fp16 = Linear16(scale, device)
-    quantize_(fp16, UIntXWeightOnlyConfig(dtype, group_size=group_size))
-    uintx = torch.compile(fp16, fullgraph=True)
-    test_input = torch.randn(scale * 2, dtype=torch.float16, device=device)
-    output = uintx.forward(test_input)
-    assert output is not None, "model quantization failed"
-
-
 @pytest.mark.parametrize("dtype", dtypes)
 @pytest.mark.parametrize("group_size", group_sizes)
 @pytest.mark.parametrize("device", devices)
@@ -128,55 +96,6 @@ def test_uintx_weight_only_quant(dtype, group_size, device):
     assert deqaunt is not None, "deqauntization failed"


-@pytest.mark.parametrize("dtype", dtypes)
-@pytest.mark.skipif(not torch.accelerator.is_available(), reason="Need GPU available")
-def test_uintx_target_dtype(dtype):
-    device = get_current_accelerator_device()
-    linear = torch.nn.Linear(128, 256, dtype=torch.bfloat16, device=device)
-    # make sure it runs
-    quantize_(linear, UIntXWeightOnlyConfig(dtype))
-    linear(torch.randn(1, 128, dtype=torch.bfloat16, device=device))
-
-
-@pytest.mark.parametrize("dtype", dtypes)
-@pytest.mark.skipif(not torch.accelerator.is_available(), reason="Need GPU available")
-def test_uintx_target_dtype_compile(dtype):
-    device = get_current_accelerator_device()
-    linear = torch.nn.Linear(128, 256, dtype=torch.bfloat16, device=device)
-    # make sure it runs
-    quantize_(linear, UIntXWeightOnlyConfig(dtype))
-    linear = torch.compile(linear)
-    linear(torch.randn(1, 128, dtype=torch.bfloat16, device=device))
-
-
-@pytest.mark.parametrize("dtype", dtypes)
-@pytest.mark.skipif(not torch.accelerator.is_available(), reason="Need GPU available")
-def test_uintx_model_size(dtype):
-    from torchao.utils import get_model_size_in_bytes
-
-    # scale size = 1/64 * 2 bytes = 1/32 bytes
-    # zero_point size = 1/64 * 4 bytes = 1/16 bytes
-    # dtype data size = 1 * bit_width/8 = bit_width/8 bytes
-    _dtype_to_ratio = {
-        torch.uint1: (1 / 8 + 1 / 16 + 1 / 32) / 2,
-        torch.uint2: (2 / 8 + 1 / 16 + 1 / 32) / 2,
-        torch.uint3: (3 / 8 + 1 / 16 + 1 / 32) / 2,
-        torch.uint4: (4 / 8 + 1 / 16 + 1 / 32) / 2,
-        torch.uint5: (5 / 8 + 1 / 16 + 1 / 32) / 2,
-        torch.uint6: (6 / 8 + 1 / 16 + 1 / 32) / 2,
-        torch.uint7: (7 / 8 + 1 / 16 + 1 / 32) / 2,
-    }
-    device = get_current_accelerator_device()
-    linear = torch.nn.Sequential(
-        torch.nn.Linear(128, 256, bias=False, dtype=torch.bfloat16, device=device)
-    )
-    bf16_size = get_model_size_in_bytes(linear)
-    # make sure it runs
-    quantize_(linear[0], UIntXWeightOnlyConfig(dtype))
-    quantized_size = get_model_size_in_bytes(linear)
-    assert bf16_size * _dtype_to_ratio[dtype] == quantized_size
-
-
 def test_uintx_api_deprecation():
     """
     Test that deprecated uintx APIs trigger deprecation warnings on import.
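
The only uintx-config test kept in this file is the deprecation check. Below is a hypothetical sketch of what such a check can look like; the exact import path, the warning category, and whether the warning fires on module import or on attribute access are assumptions, not taken from this diff.

```python
import warnings

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    # Importing the legacy config is expected to warn rather than fail outright.
    from torchao.quantization.quant_api import UIntXWeightOnlyConfig  # noqa: F401

saw_deprecation = any(issubclass(w.category, DeprecationWarning) for w in caught)
print(f"deprecation warning observed: {saw_deprecation}")
```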
