Skip to content
This repository was archived by the owner on Jan 22, 2025. It is now read-only.
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions d2go/runner/config_defaults.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,10 +138,10 @@ def _add_detectron2go_runner_default_cfg(_C: CN) -> None:
# Profiler
_C.PROFILERS = ["default_flop_counter"]

# GPU memory profiler
# Snapshot memory profiling
add_memory_profiler_configs(_C)

# Zoomer memory profiling
# Zoomer Kineto memory profiling
add_zoomer_default_config(_C)

# Checkpointing-specific config
Expand Down
26 changes: 3 additions & 23 deletions d2go/runner/default_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

import detectron2.utils.comm as comm
import torch
from aiplatform.monitoring.unitrace.memory_snapshot import attach_oom_logger
from d2go.checkpoint.api import is_distributed_checkpoint
from d2go.checkpoint.fsdp_checkpoint import FSDPCheckpointer
from d2go.config import CfgNode, CONFIG_SCALING_METHOD_REGISTRY, temp_defrost
Expand Down Expand Up @@ -38,11 +39,7 @@
get_generalized_rcnn_runner_default_cfg,
)

from d2go.runner.training_hooks import (
D2GoGpuMemorySnapshot,
TRAINER_HOOKS_REGISTRY,
update_hooks_from_registry,
)
from d2go.runner.training_hooks import update_hooks_from_registry
from d2go.trainer.fsdp import get_grad_scaler
from d2go.trainer.helper import parse_precision_from_string
from d2go.utils.abnormal_checker import (
Expand All @@ -51,7 +48,6 @@
get_writers,
)
from d2go.utils.flop_calculator import attach_profilers
from d2go.utils.gpu_memory_profiler import attach_oom_logger
from d2go.utils.helper import D2Trainer, TensorboardXWriter
from d2go.utils.misc import get_tensorboard_log_dir
from d2go.utils.visualization import DataLoaderVisWrapper, VisualizationEvaluator
Expand Down Expand Up @@ -150,20 +146,6 @@ def default_scale_quantization_configs(cfg, new_world_size):
)


@TRAINER_HOOKS_REGISTRY.register()
def add_memory_profiler_hook(hooks, cfg: CfgNode):
    """Append a GPU memory snapshot hook to ``hooks`` when profiling is enabled.

    The snapshot profiler helps diagnose GPU OOM issues and benchmark memory
    usage during model training. Nothing is appended unless
    ``cfg.MEMORY_PROFILER.ENABLED`` is present and truthy.

    Args:
        hooks: list of trainer hooks, modified in place.
        cfg: runner config; ``OUTPUT_DIR`` and the ``MEMORY_PROFILER`` node
            (``LOG_N_STEPS``, ``LOG_DURING_TRAIN_AT``, ``TRACE_MAX_ENTRIES``)
            are read from it.
    """
    profiler_enabled = cfg.get("MEMORY_PROFILER", CfgNode()).get("ENABLED", False)
    if not profiler_enabled:
        return

    snapshot_hook = D2GoGpuMemorySnapshot(
        cfg.OUTPUT_DIR,
        log_n_steps=cfg.MEMORY_PROFILER.LOG_N_STEPS,
        log_during_train_at=cfg.MEMORY_PROFILER.LOG_DURING_TRAIN_AT,
        trace_max_entries=cfg.MEMORY_PROFILER.TRACE_MAX_ENTRIES,
    )
    hooks.append(snapshot_hook)


@fb_overwritable()
def prepare_fb_model(cfg: CfgNode, model: torch.nn.Module) -> torch.nn.Module:
    """Return ``model`` unchanged; ``cfg`` is not used by this implementation.

    NOTE(review): the ``fb_overwritable`` decorator suggests an alternative
    implementation may replace this one -- confirm against its definition.
    """
    return model
Expand Down Expand Up @@ -345,9 +327,7 @@ def _build_model(self, cfg, eval_only=False):
def build_model(self, cfg, eval_only=False):
# Attach memory profiler to GPU OOM events
if cfg.get("MEMORY_PROFILER", CfgNode()).get("ENABLED", False):
attach_oom_logger(
cfg.OUTPUT_DIR, trace_max_entries=cfg.MEMORY_PROFILER.TRACE_MAX_ENTRIES
)
attach_oom_logger(bucket="d2go_traces")

model = self._build_model(cfg, eval_only)
model = prepare_fb_model(cfg, model)
Expand Down
38 changes: 24 additions & 14 deletions d2go/runner/training_hooks.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,17 @@
import logging
from typing import List

from d2go.config import CfgNode
from aiplatform.monitoring.unitrace.memory_snapshot import (
export_memory_snapshot,
start_record_memory_history,
stop_record_memory_history,
)

from d2go.utils.gpu_memory_profiler import log_memory_snapshot, record_memory_history
from d2go.config import CfgNode

from detectron2.engine.train_loop import HookBase
from detectron2.utils.registry import Registry
from mobile_cv.torch.utils_pytorch import comm


logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -41,29 +46,34 @@ class D2GoGpuMemorySnapshot(HookBase):

def __init__(
self,
output_dir,
log_n_steps: int = 3,
log_during_train_at: int = 550,
trace_max_entries: int = 1000000,
manifold_bucket: str = "d2go_traces",
root_manifold_path: str = "tree/memory_snapshot",
) -> None:
self.output_dir = output_dir
self.step = 0
self.log_n_steps = log_n_steps
self.log_during_train_at = log_during_train_at
self.trace_max_entries = trace_max_entries
self.manifold_bucket = manifold_bucket
self.root_manifold_path = root_manifold_path
logger.warning(
"WARNING: Memory snapshot profiler is enabled. This may cause ranks to die and training jobs to get stuck. Please use with caution."
)

def before_step(self):
if self.trainer.iter == self.log_during_train_at:
record_memory_history(self.trace_max_entries)
logger.info(
f"[itrn-{self.trainer.iter}] Starting memory snapshot recording"
)
start_record_memory_history()

def after_step(self):
if self.step == self.log_n_steps - 1:
log_memory_snapshot(self.output_dir, file_prefix=f"iter{self.trainer.iter}")

if self.trainer.iter == self.log_during_train_at + self.log_n_steps - 1:
log_memory_snapshot(self.output_dir, file_prefix=f"iter{self.trainer.iter}")

self.step += 1
export_memory_snapshot(
worker_name=f"rank-{comm.get_rank()}",
bucket=self.manifold_bucket,
root_manifold_path=self.root_manifold_path,
)
logger.info(
f"[itrn-{self.trainer.iter}] Stopping memory snapshot recording"
)
stop_record_memory_history()
87 changes: 0 additions & 87 deletions d2go/utils/gpu_memory_profiler.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,6 @@
import logging
import os
import pickle

import torch
from d2go.config import CfgNode as CN
from detectron2.utils.file_io import PathManager
from mobile_cv.torch.utils_pytorch import comm
from torch.cuda._memory_viz import segment_plot, trace_plot

logger: logging.Logger = logging.getLogger(__name__)

Expand All @@ -29,84 +23,3 @@ def add_zoomer_default_config(_C: CN):
False # Do not enable by default, since it may cause performance regression
)
_C.ZOOMER.ENABLE_MEMORY_PROFILING = False


def omm_logger_wrapper(output_dir):
    """Build an OOM callback that dumps a memory snapshot under ``output_dir``.

    Args:
        output_dir: directory handed to ``log_memory_snapshot`` by the callback.

    Returns:
        A callable taking ``(device, alloc, device_alloc, device_free)`` that
        logs those values and then attempts to save a snapshot.
    """

    def oom_logger(
        device: int, alloc: int, device_alloc: int, device_free: int
    ) -> None:
        """
        Log memory snapshot in the event of CUDA OOM.
        """
        oom_details = f"Saving memory snapshot device: {device}, alloc: {alloc}, device_alloc: {device_alloc}, device_free: {device_free}"
        logger.info(oom_details)
        # Best effort: a failure to write the snapshot is logged, not raised.
        try:
            log_memory_snapshot(output_dir, file_prefix="oom")
        except Exception as err:
            logger.error(f"Failed to log memory snapshot during OOM {err}")

    return oom_logger


def log_memory_snapshot(output_dir: str, file_prefix: str = "") -> None:
    """
    Log memory snapshots to output_dir.

    The snapshot is written to
    ``<output_dir>/memory_snapshot/<file_prefix>_rank<rank>``. This is a
    best-effort operation: any failure is logged and swallowed so that
    profiling never interrupts the caller.

    Args:
        output_dir: root directory under which the snapshot folder is created.
        file_prefix: prefix for the per-rank snapshot folder name.
    """
    if not torch.cuda.is_available():
        logger.info("CUDA unavailable. Not logging snapshot")
        return

    # Bind save_dir before the try block so the error message below can always
    # be formatted, even when the rank lookup or path join itself fails.
    # Previously that case raised UnboundLocalError from inside the handler.
    save_dir = output_dir
    try:
        rank = comm.get_rank()
        save_dir = os.path.join(
            output_dir, "memory_snapshot", f"{file_prefix}_rank{rank}"
        )
        logger.info(f"Logging memory snapshot to {save_dir}")
        snapshot = torch.cuda.memory._snapshot()
        dump_snapshot(save_dir, snapshot)
    except Exception as e:
        logger.error(f"Failed to log memory snapshot to {save_dir}: {e}")


def dump_snapshot(save_dir: str, snapshot):
    """
    Write a memory snapshot and its visualizations into save_dir.

    Three files are produced: ``snapshot.pickle`` (the pickled snapshot),
    ``trace_plot.html`` and ``segment_plot.html``. The directory is created
    first if it does not exist.
    This is a rewrite of torch.cuda.memory._dump_snapshot() with PathManager.

    Args:
        save_dir: destination directory, accessed through PathManager.
        snapshot: snapshot object passed to pickle and the plot renderers.
    """
    if not PathManager.exists(save_dir):
        PathManager.mkdirs(save_dir)

    pickle_path = os.path.join(save_dir, "snapshot.pickle")
    with PathManager.open(pickle_path, "wb") as pickle_file:
        pickle.dump(snapshot, pickle_file)

    # Each HTML plot is rendered only after its output file has been opened,
    # in the same order as before: trace plot first, then segment plot.
    html_outputs = (
        ("trace_plot.html", trace_plot),
        ("segment_plot.html", segment_plot),
    )
    for file_name, render in html_outputs:
        with PathManager.open(os.path.join(save_dir, file_name), "w") as html_file:
            html_file.write(render(snapshot))

    logger.info(f"Saved memory snapshot to {save_dir}")


def record_memory_history(trace_max_entries=1000000) -> None:
    """
    Turn on CUDA memory history recording (with stack traces).

    Does nothing except log a message when CUDA is unavailable.

    Args:
        trace_max_entries: forwarded as ``max_entries`` to
            ``torch.cuda.memory._record_memory_history``.
    """
    if torch.cuda.is_available():
        torch.cuda.memory._record_memory_history(
            enabled="all", max_entries=trace_max_entries
        )
        logger.info("Started recording memory history")
    else:
        logger.info("CUDA unavailable. Not recording memory history")


def attach_oom_logger(output_dir, trace_max_entries=1000000) -> None:
    """
    Begin memory history recording and register the OOM snapshot callback.

    Does nothing except log a message when CUDA is unavailable.

    Args:
        output_dir: directory the OOM callback writes snapshots under.
        trace_max_entries: forwarded to ``record_memory_history``.
    """
    cuda_ready = torch.cuda.is_available()
    if not cuda_ready:
        logger.info("CUDA unavailable. Not attaching OOM logger")
        return

    # History must be recording before the observer fires for the snapshot
    # to contain allocation traces.
    record_memory_history(trace_max_entries)
    oom_observer = omm_logger_wrapper(output_dir)
    torch._C._cuda_attach_out_of_memory_observer(oom_observer)
    logger.info("Attached GPU OOM logger")