diff --git a/d2go/runner/config_defaults.py b/d2go/runner/config_defaults.py index 6c192bb4..00fd8bcc 100644 --- a/d2go/runner/config_defaults.py +++ b/d2go/runner/config_defaults.py @@ -138,10 +138,10 @@ def _add_detectron2go_runner_default_cfg(_C: CN) -> None: # Profiler _C.PROFILERS = ["default_flop_counter"] - # GPU memory profiler + # Snapshot memory profiling add_memory_profiler_configs(_C) - # Zoomer memory profiling + # Zoomer Kineto memory profiling add_zoomer_default_config(_C) # Checkpointing-specific config diff --git a/d2go/runner/default_runner.py b/d2go/runner/default_runner.py index 1409e676..244d4768 100644 --- a/d2go/runner/default_runner.py +++ b/d2go/runner/default_runner.py @@ -11,6 +11,7 @@ import detectron2.utils.comm as comm import torch +from aiplatform.monitoring.unitrace.memory_snapshot import attach_oom_logger from d2go.checkpoint.api import is_distributed_checkpoint from d2go.checkpoint.fsdp_checkpoint import FSDPCheckpointer from d2go.config import CfgNode, CONFIG_SCALING_METHOD_REGISTRY, temp_defrost @@ -38,11 +39,7 @@ get_generalized_rcnn_runner_default_cfg, ) -from d2go.runner.training_hooks import ( - D2GoGpuMemorySnapshot, - TRAINER_HOOKS_REGISTRY, - update_hooks_from_registry, -) +from d2go.runner.training_hooks import update_hooks_from_registry from d2go.trainer.fsdp import get_grad_scaler from d2go.trainer.helper import parse_precision_from_string from d2go.utils.abnormal_checker import ( @@ -51,7 +48,6 @@ get_writers, ) from d2go.utils.flop_calculator import attach_profilers -from d2go.utils.gpu_memory_profiler import attach_oom_logger from d2go.utils.helper import D2Trainer, TensorboardXWriter from d2go.utils.misc import get_tensorboard_log_dir from d2go.utils.visualization import DataLoaderVisWrapper, VisualizationEvaluator @@ -150,20 +146,6 @@ def default_scale_quantization_configs(cfg, new_world_size): ) -@TRAINER_HOOKS_REGISTRY.register() -def add_memory_profiler_hook(hooks, cfg: CfgNode): - # Add GPU memory snapshot profiler to diagnose GPU OOM issues and benchmark memory usage during model training - if cfg.get("MEMORY_PROFILER", CfgNode()).get("ENABLED", False): - hooks.append( - D2GoGpuMemorySnapshot( - cfg.OUTPUT_DIR, - log_n_steps=cfg.MEMORY_PROFILER.LOG_N_STEPS, - log_during_train_at=cfg.MEMORY_PROFILER.LOG_DURING_TRAIN_AT, - trace_max_entries=cfg.MEMORY_PROFILER.TRACE_MAX_ENTRIES, - ) - ) - - @fb_overwritable() def prepare_fb_model(cfg: CfgNode, model: torch.nn.Module) -> torch.nn.Module: return model @@ -345,9 +327,7 @@ def _build_model(self, cfg, eval_only=False): def build_model(self, cfg, eval_only=False): # Attach memory profiler to GPU OOM events if cfg.get("MEMORY_PROFILER", CfgNode()).get("ENABLED", False): - attach_oom_logger( - cfg.OUTPUT_DIR, trace_max_entries=cfg.MEMORY_PROFILER.TRACE_MAX_ENTRIES - ) + attach_oom_logger(bucket="d2go_traces") model = self._build_model(cfg, eval_only) model = prepare_fb_model(cfg, model) diff --git a/d2go/runner/training_hooks.py b/d2go/runner/training_hooks.py index 6dd7b6d7..31aa65a0 100644 --- a/d2go/runner/training_hooks.py +++ b/d2go/runner/training_hooks.py @@ -3,12 +3,17 @@ import logging from typing import List -from d2go.config import CfgNode +from aiplatform.monitoring.unitrace.memory_snapshot import ( + export_memory_snapshot, + start_record_memory_history, + stop_record_memory_history, +) -from d2go.utils.gpu_memory_profiler import log_memory_snapshot, record_memory_history +from d2go.config import CfgNode from detectron2.engine.train_loop import HookBase from detectron2.utils.registry import Registry +from mobile_cv.torch.utils_pytorch import comm logger = logging.getLogger(__name__) @@ -41,29 +46,34 @@ class D2GoGpuMemorySnapshot(HookBase): def __init__( self, - output_dir, log_n_steps: int = 3, log_during_train_at: int = 550, - trace_max_entries: int = 1000000, + manifold_bucket: str = "d2go_traces", + root_manifold_path: str = "tree/memory_snapshot", ) -> None: - self.output_dir = output_dir - self.step = 0 self.log_n_steps = log_n_steps self.log_during_train_at = log_during_train_at - self.trace_max_entries = trace_max_entries + self.manifold_bucket = manifold_bucket + self.root_manifold_path = root_manifold_path logger.warning( "WARNING: Memory snapshot profiler is enabled. This may cause ranks to die and training jobs to get stuck. Please use with caution." ) def before_step(self): if self.trainer.iter == self.log_during_train_at: - record_memory_history(self.trace_max_entries) + logger.info( + f"[itrn-{self.trainer.iter}] Starting memory snapshot recording" + ) + start_record_memory_history() def after_step(self): - if self.step == self.log_n_steps - 1: - log_memory_snapshot(self.output_dir, file_prefix=f"iter{self.trainer.iter}") - if self.trainer.iter == self.log_during_train_at + self.log_n_steps - 1: - log_memory_snapshot(self.output_dir, file_prefix=f"iter{self.trainer.iter}") - - self.step += 1 + export_memory_snapshot( + worker_name=f"rank-{comm.get_rank()}", + bucket=self.manifold_bucket, + root_manifold_path=self.root_manifold_path, + ) + logger.info( + f"[itrn-{self.trainer.iter}] Stopping memory snapshot recording" + ) + stop_record_memory_history() diff --git a/d2go/utils/gpu_memory_profiler.py b/d2go/utils/gpu_memory_profiler.py index cd0ec10f..24c3968d 100644 --- a/d2go/utils/gpu_memory_profiler.py +++ b/d2go/utils/gpu_memory_profiler.py @@ -1,12 +1,6 @@ import logging -import os -import pickle -import torch from d2go.config import CfgNode as CN -from detectron2.utils.file_io import PathManager -from mobile_cv.torch.utils_pytorch import comm -from torch.cuda._memory_viz import segment_plot, trace_plot logger: logging.Logger = logging.getLogger(__name__) @@ -29,84 +23,3 @@ def add_zoomer_default_config(_C: CN): False # Do not enable by default, since it may cause performance regression ) _C.ZOOMER.ENABLE_MEMORY_PROFILING = False - - -def omm_logger_wrapper(output_dir): - def oom_logger( - device: int, alloc: int, device_alloc: int, device_free: int - ) -> None: - """ - Log memory snapshot in the event of CUDA OOM. - """ - logger.info( - f"Saving memory snapshot device: {device}, alloc: {alloc}, device_alloc: {device_alloc}, device_free: {device_free}" - ) - try: - log_memory_snapshot(output_dir, file_prefix="oom") - except Exception as e: - logger.error(f"Failed to log memory snapshot during OOM {e}") - - return oom_logger - - -def log_memory_snapshot(output_dir: str, file_prefix: str = "") -> None: - """ - Log memory snapshots to output_dir - """ - if not torch.cuda.is_available(): - logger.info("CUDA unavailable. Not logging snapshot") - return - - try: - rank = comm.get_rank() - save_dir = os.path.join( - output_dir, "memory_snapshot", f"{file_prefix}_rank{rank}" - ) - logger.info(f"Logging memory snapshot to {save_dir}") - snapshot = torch.cuda.memory._snapshot() - dump_snapshot(save_dir, snapshot) - except Exception as e: - logger.error(f"Failed to log memory snapshot to {save_dir}: {e}") - - -def dump_snapshot(save_dir: str, snapshot): - """ - Dump memory snapshot and useful plots to save_dir. - This is a rewrite of torch.cuda.memory._dump_snapshot() with PathManager. - """ - if not PathManager.exists(save_dir): - PathManager.mkdirs(save_dir) - with PathManager.open(os.path.join(save_dir, "snapshot.pickle"), "wb") as f: - pickle.dump(snapshot, f) - with PathManager.open(os.path.join(save_dir, "trace_plot.html"), "w") as f: - f.write(trace_plot(snapshot)) - with PathManager.open(os.path.join(save_dir, "segment_plot.html"), "w") as f: - f.write(segment_plot(snapshot)) - logger.info(f"Saved memory snapshot to {save_dir}") - - -def record_memory_history(trace_max_entries=1000000) -> None: - """ - Start recording memory history and stack traces. - """ - if not torch.cuda.is_available(): - logger.info("CUDA unavailable. Not recording memory history") - return - - torch.cuda.memory._record_memory_history( - enabled="all", max_entries=trace_max_entries - ) - logger.info("Started recording memory history") - - -def attach_oom_logger(output_dir, trace_max_entries=1000000) -> None: - """ - Start recording memory history and attach the OOM logger. - """ - if not torch.cuda.is_available(): - logger.info("CUDA unavailable. Not attaching OOM logger") - return - - record_memory_history(trace_max_entries) - torch._C._cuda_attach_out_of_memory_observer(omm_logger_wrapper(output_dir)) - logger.info("Attached GPU OOM logger")