From 8565a95a0685c62b4b9820c0b39e059ceb5a55b5 Mon Sep 17 00:00:00 2001
From: Advait Jayant <advaitjk@gmail.com>
Date: Fri, 12 Jun 2026 00:47:59 +0100
Subject: [PATCH] Fix checkpoint directory race condition in DDP training

With torchrun and --overwrite, every rank deleted and recreated the
checkpoint directory simultaneously, so ranks could crash with
FileNotFoundError when another rank removed the directory first. Only
the main process now performs the rmtree and mkdir, and all ranks
synchronize on a barrier before training continues.

Fixes #868
---
 scripts/train_pytorch.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/scripts/train_pytorch.py b/scripts/train_pytorch.py
index c7ddd2b595..89226d9e9d 100644
--- a/scripts/train_pytorch.py
+++ b/scripts/train_pytorch.py
@@ -329,19 +329,23 @@ def train_loop(config: _config.TrainConfig):
         else:
             raise FileNotFoundError(f"Experiment checkpoint directory {exp_checkpoint_dir} does not exist for resume")
     elif config.overwrite and config.checkpoint_dir.exists():
-        shutil.rmtree(config.checkpoint_dir)
-        logging.info(f"Overwriting checkpoint directory: {config.checkpoint_dir}")
+        if is_main:
+            shutil.rmtree(config.checkpoint_dir)
+            logging.info(f"Overwriting checkpoint directory: {config.checkpoint_dir}")
 
     # Create checkpoint directory with experiment name
-    if not resuming:
+    if not resuming and is_main:
         # For new runs, create experiment-specific checkpoint directory
         exp_checkpoint_dir = config.checkpoint_dir
         exp_checkpoint_dir.mkdir(parents=True, exist_ok=True)
         logging.info(f"Created experiment checkpoint directory: {exp_checkpoint_dir}")
-    else:
+    elif resuming:
         # For resume, checkpoint_dir is already set to the experiment directory
         logging.info(f"Using existing experiment checkpoint directory: {config.checkpoint_dir}")
 
+    if use_ddp:
+        dist.barrier()
+
     # Initialize wandb (only on main process)
     if is_main:
         init_wandb(config, resuming=resuming, enabled=config.wandb_enabled)