Commit 325585d

Update on "Add FP8-INT4 checkpoint upload code"
Summary: as titled; the support was added in #3714.
Checkpoint: https://huggingface.co/jerryzh168/Qwen3-8B-FP8-INT4

Test Plan:
```
sh release.sh --model_id $MODEL --push_to_hub --populate_model_card_template --quants FP8-INT4
```
Produced checkpoint: https://huggingface.co/jerryzh168/Qwen3-8B-FP8-INT4

Benchmark:
```
vllm bench throughput --model jerryzh168/Qwen3-8B-FP8-INT4
```
```
Throughput: 33.03 requests/s, 38055.86 total tokens/s, 4228.43 output tokens/s
Total num prompt tokens: 1024000
Total num output tokens: 128000
```

Reviewers:

Subscribers:

Tasks:

Tags:

[ghstack-poisoned]
1 parent 83d1561 commit 325585d
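
For reference, a minimal sketch of how the uploaded FP8-INT4 checkpoint would typically be consumed. This mirrors the standard transformers + torchao loading flow rather than anything in this commit; the prompt, dtype, and device choices are illustrative assumptions.

```
# Sketch (not part of this commit): loading the produced checkpoint.
# Assumes torchao is installed and that transformers' torchao integration
# picks up the quantization_config stored in the checkpoint.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "jerryzh168/Qwen3-8B-FP8-INT4"
model = AutoModelForCausalLM.from_pretrained(
    model_id, torch_dtype=torch.bfloat16, device_map="cuda"
)
tokenizer = AutoTokenizer.from_pretrained(model_id)

inputs = tokenizer("What does FP8-INT4 quantization change?", return_tensors="pt").to(model.device)
out = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(out[0], skip_special_tokens=True))
```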

2 files changed: 3 additions & 2 deletions

.github/scripts/torchao_model_releases/quantize_and_upload.py

Lines changed: 2 additions & 1 deletion
@@ -19,7 +19,6 @@
 
 _huggingface_hub_version = str(huggingface_hub.__version__)
 
-from torchao._models._eval import TransformerEvalWrapper
 from torchao.prototype.awq import (
     AWQConfig,
 )
@@ -783,6 +782,8 @@ def filter_fn_skip_lmhead(module, fqn):
     else:
         quantize_(model, awq_config)
 
+    from torchao._models._eval import TransformerEvalWrapper
+
     TransformerEvalWrapper(
         model=model,
         tokenizer=tokenizer,
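
For context, this hunk defers the TransformerEvalWrapper import until after quantization in the upload script. Below is a minimal sketch of the resulting pattern; the model ID is a placeholder, the quantization step is elided, and the constructor/`run_eval` arguments other than `model` and `tokenizer` (which appear in the diff) are assumptions about how torchao's eval wrapper is commonly used, not taken from this commit.

```
# Sketch of the post-quantization eval pattern; keyword arguments beyond
# model/tokenizer, the task list, and the limit are assumptions.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "Qwen/Qwen3-8B"  # placeholder; the script takes --model_id
model = AutoModelForCausalLM.from_pretrained(
    model_id, torch_dtype=torch.bfloat16, device_map="cuda"
)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# ... quantize_(model, awq_config) or another quantization config runs here ...

# Deferred import: only needed once the post-quantization eval actually runs.
from torchao._models._eval import TransformerEvalWrapper

TransformerEvalWrapper(
    model=model,
    tokenizer=tokenizer,
    max_seq_length=2048,  # assumed
    device="cuda",        # assumed
).run_eval(
    tasks=["wikitext"],   # assumed sanity-check task
    limit=10,             # assumed small limit
)
```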

torchao/quantization/quant_api.py

Lines changed: 1 addition & 1 deletion
@@ -877,7 +877,7 @@ def _float8_dynamic_activation_int4_weight_transform(
     )
     weight = module.weight
     group_size = 128
-    block_size = tuple([1 for _ in range(weight.ndim - 1)] + [group_size])
+    block_size = list([1 for _ in range(weight.ndim - 1)] + [group_size])
 
     if int4_packing_format == "preshuffled":
         new_weight = Int4PreshuffledTensor.from_hp(
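
To make the one-line change concrete: for a weight of rank n, the expression builds a per-dimension block size of [1, ..., 1, group_size], and the fix only swaps the container from a tuple to a list. A small stand-alone illustration follows; the weight shape is made up.

```
import torch

# Hypothetical 2-D linear weight of shape [out_features, in_features].
weight = torch.randn(4096, 11008)
group_size = 128

# Before: tuple([...]) -> (1, 128); after: list([...]) -> [1, 128].
block_size = list([1 for _ in range(weight.ndim - 1)] + [group_size])
print(block_size)  # [1, 128] -> group-wise blocks of 128 along the last dim
```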
