intel · lvliang-intel · Apr 17, 2026 · Apr 17, 2026 · Apr 17, 2026 · Apr 19, 2026
diff --git a/auto_round/auto_scheme/delta_loss.py b/auto_round/auto_scheme/delta_loss.py
@@ -56,7 +56,7 @@
     to_device,
 )
 from auto_round.utils.device import MemoryMonitor
-from auto_round.utils.offload import OffloadManager
+from auto_round.utils.offload import OffloadManager, load_model_meta_skeleton, materialize_non_block_layers
 from auto_round.wrapper import WrapperLinear
 
 __all__ = ["gen_layer_config"]
@@ -420,8 +420,8 @@ def backward_pre_hook(module, grad_input):
 
         for block_name in reversed(block_names):
 
-            # Retrieve stored inputs for the block
-            block_input_info = block_inputs.get(block_name, {})
+            # Retrieve stored inputs for the block (pop to free memory immediately)
+            block_input_info = block_inputs.pop(block_name, {})
 
             block_input_args = to_device(block_input_info.get("args", []), major_device)
             block_input_kwargs = to_device(block_input_info.get("kwargs", {}), major_device)
@@ -687,6 +687,15 @@ def _gen_layer_config(
     model.eval()
 
     block_name = get_block_names(model)[0]  # TODO need change to support vlm
+
+    # When model was loaded as meta skeleton, materialize non-block layers
+    # from checkpoint now.  Block weights stay as empty tensors and will be
+    # loaded on demand by OffloadManager hooks.
+    if offload_context is not None and model_name is not None:
+        _is_meta_skeleton = any(p.is_meta or p.numel() == 0 for p in model.parameters())
+        if _is_meta_skeleton:
+            materialize_non_block_layers(model, model_name, block_name)
+
     for name in block_name:
         module = get_module(model, name)
         module.in_block = True
@@ -972,8 +981,14 @@ def gen_layer_config(
     model_name = None
     if isinstance(model, str):
         model_name = model
-        # Load model on CPU only; do not apply automatic device map or tuning-aware placement at load time.
-        model, tokenizer, _ = llm_load_model(model_name, device_map="cpu")
+        if low_gpu_mem_usage and auto_scheme.low_cpu_mem_usage:
+            # Load the model as a meta skeleton (no real weights) to minimize peak RAM.
+            # Non-block layers are materialized from the checkpoint later, in `_gen_layer_config`;
+            # block weights are loaded on demand by OffloadManager hooks.
+            model, tokenizer, _ = load_model_meta_skeleton(model_name)
+        else:
+            # Load model on CPU only; do not apply automatic device map or tuning-aware placement at load time.
+            model, tokenizer, _ = llm_load_model(model_name, device_map="cpu")
     # Get major device
     major_device = get_major_device(device_map)
     if not low_gpu_mem_usage:
@@ -982,11 +997,17 @@ def gen_layer_config(
         else:
             model = dispatch_model_by_all_available_devices(model, device_map)
     else:
-        model.to("cpu")
-        if hasattr(model, "hf_device_map") and len(model.hf_device_map) > 1:
-            import accelerate
-
-            accelerate.hooks.remove_hook_from_submodules(model)
+        # Skip model.to("cpu") when model was loaded as meta skeleton --
+        # non-block layers are already on CPU and block weights are empty tensors.
+        _is_meta_loaded = model_name is not None and auto_scheme.low_cpu_mem_usage
+        if not _is_meta_loaded:
+            model.to("cpu")
+        if hasattr(model, "hf_device_map"):
+            if _is_meta_loaded or len(model.hf_device_map) > 1:
+                import accelerate
+
+                accelerate.hooks.remove_hook_from_submodules(model)
+                delattr(model, "hf_device_map")
         if (isinstance(device_map, str) and "," in device_map) or device_map == "auto":
             set_avg_auto_device_map(model, device_map)
         else:

diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py
@@ -633,6 +633,17 @@ def _gen_auto_scheme(self) -> dict[str, dict]:
 
         if not self.enable_torch_compile and self.super_bits is None and not self.orig_scheme.low_gpu_mem_usage:
             logger.warning("we strongly recommend to set `enable_torch_compile` to True for AutoScheme to save VRAM")
+
+        # When both low_cpu_mem_usage and low_gpu_mem_usage are enabled, pass the model path (string)
+        # to AutoScheme so it can load a meta skeleton instead of keeping the full model in RAM.
+        # If the path resolves to a local directory, the loaded model is released once GenScheme has been constructed and is reloaded afterward.
+        _need_reload = False
+        _model_path = None
+        if self.orig_scheme.low_cpu_mem_usage and self.orig_scheme.low_gpu_mem_usage:
+            _model_path = getattr(self.model.config, "_name_or_path", None)
+            if _model_path is not None and os.path.isdir(_model_path):
+                _need_reload = True
+
-            if _model_path is not None and os.path.isdir(_model_path):
-                _need_reload = True
+            if isinstance(_model_path, str) and _model_path:
+                if os.path.isdir(_model_path):
+                    _need_reload = True
+                else:
+                    try:
+                        from huggingface_hub import snapshot_download
+
+                        _resolved_model_path = snapshot_download(_model_path, local_files_only=True)
+                        if os.path.isdir(_resolved_model_path):
+                            _model_path = _resolved_model_path
+                            _need_reload = True
+                    except Exception:
+                        pass
-            if _model_path is not None and os.path.isdir(_model_path):
-                _need_reload = True
+            if isinstance(_model_path, str) and _model_path:
+                if os.path.isdir(_model_path):
+                    _need_reload = True
+                else:
+                    try:
+                        from huggingface_hub import snapshot_download
+
+                        _resolved_model_path = snapshot_download(_model_path, local_files_only=True)
+                        if os.path.isdir(_resolved_model_path):
+                            _model_path = _resolved_model_path
+                            _need_reload = True
+                    except Exception:
+                        pass
         self.scheme_generator = GenScheme(
             self.orig_scheme,
             self.model,
@@ -643,7 +654,41 @@ def _gen_auto_scheme(self) -> dict[str, dict]:
             tokenizer=self.tokenizer,
             enable_torch_compile=self.enable_torch_compile,
         )
+
+        if _need_reload:
+            # GenScheme.__init__ has computed avg bit ranges using the model.
+            # Now swap the model reference with the path string so that
+            # gen_layer_config will load a meta skeleton instead.
+            self.scheme_generator.model = _model_path
+            del self.model
+            self.model = None
+            import gc
+
+            gc.collect()
+            clear_memory(device_list=self.device_list)
+            logger.info("Released loaded model before AutoScheme (will reload after)")
+
         layer_config = self.scheme_generator.get_layer_config()
+
+        if _need_reload:
+            logger.info("Reloading model after AutoScheme")
+            self.model, self.tokenizer = llm_load_model(
+                _model_path,
+                device="cpu",
+                trust_remote_code=self.trust_remote_code,
+            )
+            self.model = self.model.eval()
+            check_and_mark_quantized_module(self.model)
+            # Re-apply module structure updates that quantize() applied before AutoScheme
+            formats = self.formats if hasattr(self, "formats") else None
+            if not self.diffusion and formats is not None:
+                self.model = update_module(
+                    self.model, formats=formats, trust_remote_code=self.trust_remote_code, cleanup_original=False
+                )
+            for n, m in self.model.named_modules():
+                m.global_name = n
+            self.shared_cache_keys = get_shared_keys(self.model)
-        layer_config = self.scheme_generator.get_layer_config()
-
-        if _need_reload:
-            logger.info("Reloading model after AutoScheme")
-            self.model, self.tokenizer = llm_load_model(
-                _model_path,
-                device="cpu",
-                trust_remote_code=self.trust_remote_code,
-            )
-            self.model = self.model.eval()
-            check_and_mark_quantized_module(self.model)
-            # Re-apply module structure updates that quantize() applied before AutoScheme
-            formats = self.formats if hasattr(self, "formats") else None
-            if not self.diffusion and formats is not None:
-                self.model = update_module(
-                    self.model, formats=formats, trust_remote_code=self.trust_remote_code, cleanup_original=False
-                )
-            for n, m in self.model.named_modules():
-                m.global_name = n
-            self.shared_cache_keys = get_shared_keys(self.model)
+        try:
+            layer_config = self.scheme_generator.get_layer_config()
+        finally:
+            if _need_reload:
+                logger.info("Reloading model after AutoScheme")
+                self.model, self.tokenizer = llm_load_model(
+                    _model_path,
+                    device="cpu",
+                    trust_remote_code=self.trust_remote_code,
+                )
+                self.model = self.model.eval()
+                check_and_mark_quantized_module(self.model)
+                # Re-apply module structure updates that quantize() applied before AutoScheme
+                formats = self.formats if hasattr(self, "formats") else None
+                if not self.diffusion and formats is not None:
+                    self.model = update_module(
+                        self.model, formats=formats, trust_remote_code=self.trust_remote_code, cleanup_original=False
+                    )
+                for n, m in self.model.named_modules():
+                    m.global_name = n
+                self.shared_cache_keys = get_shared_keys(self.model)
-        layer_config = self.scheme_generator.get_layer_config()
-
-        if _need_reload:
-            logger.info("Reloading model after AutoScheme")
-            self.model, self.tokenizer = llm_load_model(
-                _model_path,
-                device="cpu",
-                trust_remote_code=self.trust_remote_code,
-            )
-            self.model = self.model.eval()
-            check_and_mark_quantized_module(self.model)
-            # Re-apply module structure updates that quantize() applied before AutoScheme
-            formats = self.formats if hasattr(self, "formats") else None
-            if not self.diffusion and formats is not None:
-                self.model = update_module(
-                    self.model, formats=formats, trust_remote_code=self.trust_remote_code, cleanup_original=False
-                )
-            for n, m in self.model.named_modules():
-                m.global_name = n
-            self.shared_cache_keys = get_shared_keys(self.model)
+        try:
+            layer_config = self.scheme_generator.get_layer_config()
+        finally:
+            if _need_reload:
+                logger.info("Reloading model after AutoScheme")
+                self.model, self.tokenizer = llm_load_model(
+                    _model_path,
+                    device="cpu",
+                    trust_remote_code=self.trust_remote_code,
+                )
+                self.model = self.model.eval()
+                check_and_mark_quantized_module(self.model)
+                # Re-apply module structure updates that quantize() applied before AutoScheme
+                formats = self.formats if hasattr(self, "formats") else None
+                if not self.diffusion and formats is not None:
+                    self.model = update_module(
+                        self.model, formats=formats, trust_remote_code=self.trust_remote_code, cleanup_original=False
+                    )
+                for n, m in self.model.named_modules():
+                    m.global_name = n
+                self.shared_cache_keys = get_shared_keys(self.model)
+
         return layer_config
 
     def _set_device(self, device_map: Union[str, torch.device, int, dict]) -> None: