From 8565a95a0685c62b4b9820c0b39e059ceb5a55b5 Mon Sep 17 00:00:00 2001
From: Advait Jayant
Date: Fri, 12 Jun 2026 00:47:59 +0100
Subject: [PATCH] Fix checkpoint directory race condition in DDP training

With torchrun and --overwrite, every rank deleted and recreated the
checkpoint directory simultaneously, so ranks could crash with
FileNotFoundError when another rank removed the directory first. Only
the main process now performs the rmtree and mkdir, and all ranks
synchronize on a barrier before training continues.

Fixes #868
---
 scripts/train_pytorch.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/scripts/train_pytorch.py b/scripts/train_pytorch.py
index c7ddd2b595..89226d9e9d 100644
--- a/scripts/train_pytorch.py
+++ b/scripts/train_pytorch.py
@@ -329,19 +329,23 @@ def train_loop(config: _config.TrainConfig):
         else:
             raise FileNotFoundError(f"Experiment checkpoint directory {exp_checkpoint_dir} does not exist for resume")
     elif config.overwrite and config.checkpoint_dir.exists():
-        shutil.rmtree(config.checkpoint_dir)
-        logging.info(f"Overwriting checkpoint directory: {config.checkpoint_dir}")
+        if is_main:
+            shutil.rmtree(config.checkpoint_dir)
+            logging.info(f"Overwriting checkpoint directory: {config.checkpoint_dir}")
 
     # Create checkpoint directory with experiment name
-    if not resuming:
+    if not resuming and is_main:
         # For new runs, create experiment-specific checkpoint directory
         exp_checkpoint_dir = config.checkpoint_dir
         exp_checkpoint_dir.mkdir(parents=True, exist_ok=True)
         logging.info(f"Created experiment checkpoint directory: {exp_checkpoint_dir}")
-    else:
+    elif resuming:
         # For resume, checkpoint_dir is already set to the experiment directory
         logging.info(f"Using existing experiment checkpoint directory: {config.checkpoint_dir}")
 
+    if use_ddp:
+        dist.barrier()
+
     # Initialize wandb (only on main process)
     if is_main:
         init_wandb(config, resuming=resuming, enabled=config.wandb_enabled)