Skip to content
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
Show all changes
45 commits
Select commit Hold shift + click to select a range
ee19523
Continue optimizing AutoScheme RAM consumption
lvliang-intel Apr 17, 2026
f19224e
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Apr 17, 2026
f0d183c
fix: add missing run_mllm entry point alias (#1695)
JGSphaela Apr 17, 2026
fe0a541
rename scheme INT8_W8A8 to INT8 (#1687)
thuang6 Apr 19, 2026
3a9575f
update mtp quant for special cases (#1691)
xin3he Apr 20, 2026
68396c0
Update gaudi-docker to v1.24.0 & fix CUDA UT (#1708)
XuehaoSun Apr 21, 2026
cae2d80
add support for gemma4 model (#1655)
n1ck-guo Apr 21, 2026
7f65d03
ignore mtp.fc for qwen3_5 due to vllm failure (#1710)
xin3he Apr 22, 2026
59f3639
[research feature] Introduce INT4 support at the algorithm level (#1641)
wenhuach21 Apr 22, 2026
dd52cd2
refine int4 doc (#1720)
wenhuach21 Apr 22, 2026
8073fa7
Support new model Qwen/Qwen3.6-35B-A3B (#1705)
lvliang-intel Apr 22, 2026
318b3b3
Revert "ignore mtp.fc for qwen3_5 due to vllm failure (#1710)" (#1730)
xin3he Apr 23, 2026
507f3ef
skip quantizing mtp.fc since vLLM doesn't support (#1731)
xin3he Apr 23, 2026
d8d332a
Update pull_request_template.md (#1727)
xin3he Apr 24, 2026
107485d
Create model_support_request.yml (#1738)
xin3he Apr 24, 2026
69cae58
fix gemma3 gguf ut fail (#1735)
n1ck-guo Apr 24, 2026
1643ce1
Remove threaded packing from exporters (#1719)
yiliu30 Apr 24, 2026
26c7574
add small zimage test and fix bug (#1734)
xin3he Apr 24, 2026
8bced5f
Enhance llmc CI on XPU (#1483)
chensuyue Apr 25, 2026
4c2238f
Reduce xpu memory usage with patch_xpu_sdpa_drop_causal_mask (#1716)
xin3he Apr 25, 2026
145847b
[Experimental]Add MLX format export support and AutoScheme for vlm …
wenhuach21 Apr 26, 2026
cc66be7
add warnings for lm_head activation scale fallback (#1728)
n1ck-guo Apr 27, 2026
a4f9bf9
add support for MiMo-V2-Flash (#1718)
n1ck-guo Apr 27, 2026
38ef946
New architecture for auto_round (#1542)
n1ck-guo Apr 28, 2026
c369070
Fix vllm CUDA CI (#1750)
XuehaoSun Apr 28, 2026
d9e0f6a
delete unreproduced results for now (#1760)
ZaneMark Apr 29, 2026
9324bdf
Fix hpu error (#1766)
n1ck-guo Apr 30, 2026
4d99174
[MTP]split gate_up_proj and fix accu gap in rtn quantization (#1758)
xin3he Apr 30, 2026
74594eb
clean and fix for new arch (#1761)
n1ck-guo Apr 30, 2026
66ed80d
support gptqmodel 7.0.0 and fix bug in CI (#1772)
xin3he May 8, 2026
f518956
Optimize CUDA CI and Code Scan workflows (#1770)
XuehaoSun May 8, 2026
8573308
fix accuracy regression and check it in CUDA CI (#1785)
xin3he May 8, 2026
2b47583
fix amp (#1768)
wenhuach21 May 8, 2026
75325d2
fix amp (#1767)
wenhuach21 May 8, 2026
a97e334
Fix incompatible weight names (#1759)
mengniwang95 May 9, 2026
1295774
add notes (#1795)
wenhuach21 May 9, 2026
4c77a98
remove IPEX related code, doc, and test (#1787)
xin3he May 11, 2026
a7d01a2
support model_free WOQ quantization (#1699)
xin3he May 11, 2026
82a7b99
Integrate AutoRound Lib (#1723)
Zhenzhong1 May 11, 2026
bd935e4
fix new arch bug for llmc (#1781)
n1ck-guo May 12, 2026
330bd78
fix bug of gguf alg ext (#1796)
n1ck-guo May 12, 2026
976f90d
Continue optimizing AutoScheme RAM consumption
lvliang-intel Apr 17, 2026
a15b825
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Apr 17, 2026
410a4e4
Merge branch 'main' into lvl/autoscheme_ram_opt
lvliang-intel May 12, 2026
07e784b
Merge branch 'main' into lvl/autoscheme_ram_opt
lvliang-intel May 12, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 31 additions & 10 deletions auto_round/auto_scheme/delta_loss.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@
to_device,
)
from auto_round.utils.device import MemoryMonitor
from auto_round.utils.offload import OffloadManager
from auto_round.utils.offload import OffloadManager, load_model_meta_skeleton, materialize_non_block_layers
from auto_round.wrapper import WrapperLinear

__all__ = ["gen_layer_config"]
Expand Down Expand Up @@ -420,8 +420,8 @@ def backward_pre_hook(module, grad_input):

for block_name in reversed(block_names):

# Retrieve stored inputs for the block
block_input_info = block_inputs.get(block_name, {})
# Retrieve stored inputs for the block (pop to free memory immediately)
block_input_info = block_inputs.pop(block_name, {})

block_input_args = to_device(block_input_info.get("args", []), major_device)
block_input_kwargs = to_device(block_input_info.get("kwargs", {}), major_device)
Expand Down Expand Up @@ -687,6 +687,15 @@ def _gen_layer_config(
model.eval()

block_name = get_block_names(model)[0] # TODO need change to support vlm

# When model was loaded as meta skeleton, materialize non-block layers
# from checkpoint now. Block weights stay as empty tensors and will be
# loaded on demand by OffloadManager hooks.
if offload_context is not None and model_name is not None:
_is_meta_skeleton = any(p.is_meta or p.numel() == 0 for p in model.parameters())
if _is_meta_skeleton:
materialize_non_block_layers(model, model_name, block_name)

for name in block_name:
module = get_module(model, name)
module.in_block = True
Expand Down Expand Up @@ -972,8 +981,14 @@ def gen_layer_config(
model_name = None
if isinstance(model, str):
model_name = model
# Load model on CPU only; do not apply automatic device map or tuning-aware placement at load time.
model, tokenizer, _ = llm_load_model(model_name, device_map="cpu")
if low_gpu_mem_usage and auto_scheme.low_cpu_mem_usage:
# Load model as meta skeleton (no real weights) to minimize peak RAM.
# Non-block layers will be materialized from checkpoint below;
# block weights are loaded on demand by OffloadManager hooks.
model, tokenizer, _ = load_model_meta_skeleton(model_name)
else:
# Load model on CPU only; do not apply automatic device map or tuning-aware placement at load time.
model, tokenizer, _ = llm_load_model(model_name, device_map="cpu")
# Get major device
major_device = get_major_device(device_map)
if not low_gpu_mem_usage:
Expand All @@ -982,11 +997,17 @@ def gen_layer_config(
else:
model = dispatch_model_by_all_available_devices(model, device_map)
else:
model.to("cpu")
if hasattr(model, "hf_device_map") and len(model.hf_device_map) > 1:
import accelerate

accelerate.hooks.remove_hook_from_submodules(model)
# Skip model.to("cpu") when model was loaded as meta skeleton --
# non-block layers are already on CPU and block weights are empty tensors.
_is_meta_loaded = model_name is not None and auto_scheme.low_cpu_mem_usage
if not _is_meta_loaded:
model.to("cpu")
if hasattr(model, "hf_device_map"):
if _is_meta_loaded or len(model.hf_device_map) > 1:
import accelerate

accelerate.hooks.remove_hook_from_submodules(model)
delattr(model, "hf_device_map")
if (isinstance(device_map, str) and "," in device_map) or device_map == "auto":
set_avg_auto_device_map(model, device_map)
else:
Expand Down
45 changes: 45 additions & 0 deletions auto_round/compressors/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -633,6 +633,17 @@ def _gen_auto_scheme(self) -> dict[str, dict]:

if not self.enable_torch_compile and self.super_bits is None and not self.orig_scheme.low_gpu_mem_usage:
logger.warning("we strongly recommend to set `enable_torch_compile` to True for AutoScheme to save VRAM")

# When low_cpu_mem_usage is enabled, pass the model path (string) to
# AutoScheme so it can load a meta skeleton instead of keeping the full
# model in RAM. The loaded model is freed here and reloaded afterward.
_need_reload = False
_model_path = None
if self.orig_scheme.low_cpu_mem_usage and self.orig_scheme.low_gpu_mem_usage:
_model_path = getattr(self.model.config, "_name_or_path", None)
if _model_path is not None and os.path.isdir(_model_path):
_need_reload = True

Comment on lines +751 to +753
Copy link

Copilot AI Apr 17, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The low-CPU-memory reload path is gated on `os.path.isdir(_model_path)`. For many Hugging Face loads, `config._name_or_path` is a repo id rather than a local directory, so this optimization silently never activates. Consider resolving repo ids to a local snapshot directory (e.g., via `huggingface_hub.snapshot_download(..., local_files_only=True)` or an existing helper) rather than requiring `_name_or_path` to already be a local directory.

Suggested change
if _model_path is not None and os.path.isdir(_model_path):
_need_reload = True
if isinstance(_model_path, str) and _model_path:
if os.path.isdir(_model_path):
_need_reload = True
else:
try:
from huggingface_hub import snapshot_download
_resolved_model_path = snapshot_download(_model_path, local_files_only=True)
if os.path.isdir(_resolved_model_path):
_model_path = _resolved_model_path
_need_reload = True
except Exception:
pass

Copilot uses AI. Check for mistakes.
self.scheme_generator = GenScheme(
self.orig_scheme,
self.model,
Expand All @@ -643,7 +654,41 @@ def _gen_auto_scheme(self) -> dict[str, dict]:
tokenizer=self.tokenizer,
enable_torch_compile=self.enable_torch_compile,
)

if _need_reload:
# GenScheme.__init__ has computed avg bit ranges using the model.
# Now swap the model reference with the path string so that
# gen_layer_config will load a meta skeleton instead.
self.scheme_generator.model = _model_path
del self.model
self.model = None
import gc

gc.collect()
clear_memory(device_list=self.device_list)
logger.info("Released loaded model before AutoScheme (will reload after)")

layer_config = self.scheme_generator.get_layer_config()

if _need_reload:
logger.info("Reloading model after AutoScheme")
self.model, self.tokenizer = llm_load_model(
_model_path,
device="cpu",
trust_remote_code=self.trust_remote_code,
)
self.model = self.model.eval()
check_and_mark_quantized_module(self.model)
# Re-apply module structure updates that quantize() applied before AutoScheme
formats = self.formats if hasattr(self, "formats") else None
if not self.diffusion and formats is not None:
self.model = update_module(
self.model, formats=formats, trust_remote_code=self.trust_remote_code, cleanup_original=False
)
for n, m in self.model.named_modules():
m.global_name = n
self.shared_cache_keys = get_shared_keys(self.model)
Comment on lines 779 to +801
Copy link

Copilot AI Apr 17, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When `_need_reload` is true, `self.model` is deleted and set to `None` before calling `get_layer_config()`, and reloaded afterward. If `get_layer_config()` raises (OOM, dataset error, etc.), the reload block is skipped and the compressor instance is left in a broken state (`self.model` is `None`). Wrap the AutoScheme call in a `try`/`finally` so the model is reliably restored (or at least the object ends up in a consistent state) even on exceptions.

Suggested change
layer_config = self.scheme_generator.get_layer_config()
if _need_reload:
logger.info("Reloading model after AutoScheme")
self.model, self.tokenizer = llm_load_model(
_model_path,
device="cpu",
trust_remote_code=self.trust_remote_code,
)
self.model = self.model.eval()
check_and_mark_quantized_module(self.model)
# Re-apply module structure updates that quantize() applied before AutoScheme
formats = self.formats if hasattr(self, "formats") else None
if not self.diffusion and formats is not None:
self.model = update_module(
self.model, formats=formats, trust_remote_code=self.trust_remote_code, cleanup_original=False
)
for n, m in self.model.named_modules():
m.global_name = n
self.shared_cache_keys = get_shared_keys(self.model)
try:
layer_config = self.scheme_generator.get_layer_config()
finally:
if _need_reload:
logger.info("Reloading model after AutoScheme")
self.model, self.tokenizer = llm_load_model(
_model_path,
device="cpu",
trust_remote_code=self.trust_remote_code,
)
self.model = self.model.eval()
check_and_mark_quantized_module(self.model)
# Re-apply module structure updates that quantize() applied before AutoScheme
formats = self.formats if hasattr(self, "formats") else None
if not self.diffusion and formats is not None:
self.model = update_module(
self.model, formats=formats, trust_remote_code=self.trust_remote_code, cleanup_original=False
)
for n, m in self.model.named_modules():
m.global_name = n
self.shared_cache_keys = get_shared_keys(self.model)

Copilot uses AI. Check for mistakes.

return layer_config

def _set_device(self, device_map: Union[str, torch.device, int, dict]) -> None:
Expand Down
Loading
Loading