Commit 325585d

Update on "Add FP8-INT4 checkpoint upload code"
Summary: as titled; the support was added in #3714.
Checkpoint: https://huggingface.co/jerryzh168/Qwen3-8B-FP8-INT4

Test Plan:
```
sh release.sh --model_id $MODEL --push_to_hub --populate_model_card_template --quants FP8-INT4
```
Produced checkpoint: https://huggingface.co/jerryzh168/Qwen3-8B-FP8-INT4

Benchmark:
```
vllm bench throughput --model jerryzh168/Qwen3-8B-FP8-INT4
```
```
Throughput: 33.03 requests/s, 38055.86 total tokens/s, 4228.43 output tokens/s
Total num prompt tokens: 1024000
Total num output tokens: 128000
```

Reviewers:

Subscribers:

Tasks:

Tags:

[ghstack-poisoned]
1 parent 83d1561 commit 325585d
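
For reference, a minimal sketch of how the uploaded FP8-INT4 checkpoint would typically be consumed. This mirrors the standard transformers + torchao loading flow rather than anything in this commit; the prompt, dtype, and device choices are illustrative assumptions.

```
# Sketch (not part of this commit): loading the produced checkpoint.
# Assumes torchao is installed and that transformers' torchao integration
# picks up the quantization_config stored in the checkpoint.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "jerryzh168/Qwen3-8B-FP8-INT4"
model = AutoModelForCausalLM.from_pretrained(
    model_id, torch_dtype=torch.bfloat16, device_map="cuda"
)
tokenizer = AutoTokenizer.from_pretrained(model_id)

inputs = tokenizer("What does FP8-INT4 quantization change?", return_tensors="pt").to(model.device)
out = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(out[0], skip_special_tokens=True))
```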

2 files changed: 3 additions & 2 deletions

.github/scripts/torchao_model_releases/quantize_and_upload.py

Lines changed: 2 additions & 1 deletion
@@ -19,7 +19,6 @@
 
 _huggingface_hub_version = str(huggingface_hub.__version__)
 
-from torchao._models._eval import TransformerEvalWrapper
 from torchao.prototype.awq import (
     AWQConfig,
 )
@@ -783,6 +782,8 @@ def filter_fn_skip_lmhead(module, fqn):
     else:
         quantize_(model, awq_config)
 
+    from torchao._models._eval import TransformerEvalWrapper
+
     TransformerEvalWrapper(
         model=model,
         tokenizer=tokenizer,
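
For context, this hunk defers the TransformerEvalWrapper import until after quantization in the upload script. Below is a minimal sketch of the resulting pattern; the model ID is a placeholder, the quantization step is elided, and the constructor/`run_eval` arguments other than `model` and `tokenizer` (which appear in the diff) are assumptions about how torchao's eval wrapper is commonly used, not taken from this commit.

```
# Sketch of the post-quantization eval pattern; keyword arguments beyond
# model/tokenizer, the task list, and the limit are assumptions.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "Qwen/Qwen3-8B"  # placeholder; the script takes --model_id
model = AutoModelForCausalLM.from_pretrained(
    model_id, torch_dtype=torch.bfloat16, device_map="cuda"
)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# ... quantize_(model, awq_config) or another quantization config runs here ...

# Deferred import: only needed once the post-quantization eval actually runs.
from torchao._models._eval import TransformerEvalWrapper

TransformerEvalWrapper(
    model=model,
    tokenizer=tokenizer,
    max_seq_length=2048,  # assumed
    device="cuda",        # assumed
).run_eval(
    tasks=["wikitext"],   # assumed sanity-check task
    limit=10,             # assumed small limit
)
```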

torchao/quantization/quant_api.py

Lines changed: 1 addition & 1 deletion
@@ -877,7 +877,7 @@ def _float8_dynamic_activation_int4_weight_transform(
     )
     weight = module.weight
     group_size = 128
-    block_size = tuple([1 for _ in range(weight.ndim - 1)] + [group_size])
+    block_size = list([1 for _ in range(weight.ndim - 1)] + [group_size])
 
     if int4_packing_format == "preshuffled":
         new_weight = Int4PreshuffledTensor.from_hp(
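
To make the one-line change concrete: for a weight of rank n, the expression builds a per-dimension block size of [1, ..., 1, group_size], and the fix only swaps the container from a tuple to a list. A small stand-alone illustration follows; the weight shape is made up.

```
import torch

# Hypothetical 2-D linear weight of shape [out_features, in_features].
weight = torch.randn(4096, 11008)
group_size = 128

# Before: tuple([...]) -> (1, 128); after: list([...]) -> [1, 128].
block_size = list([1 for _ in range(weight.ndim - 1)] + [group_size])
print(block_size)  # [1, 128] -> group-wise blocks of 128 along the last dim
```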
