From 997c63b0bd2d65cab0d634e10f094c7b93a84f9b Mon Sep 17 00:00:00 2001
From: Betsy Cannon
Date: Fri, 1 May 2026 12:14:00 -0400
Subject: [PATCH] Set default ResUNet width to 64 and avoid OOM at that width

Width sweep on the 3K dataset_4 anchor (W32/W48/W64, matched recipe)
shows monotonic improvement with width and a stable 0.33% absolute NMAE
gap that holds across resumes.

train_workers reduced from 8 to 4 to mitigate the pymatgen/loky
semaphore leak that drove host-RAM OOMs at higher widths.

use_checkpoint enabled because activation checkpointing is required for
n_channels=64 to fit on an A100 80GB at batch_size=1.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 src/electrai/configs/MP/config_resunet.yaml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/electrai/configs/MP/config_resunet.yaml b/src/electrai/configs/MP/config_resunet.yaml
index 7fef6ff..627a9d6 100644
--- a/src/electrai/configs/MP/config_resunet.yaml
+++ b/src/electrai/configs/MP/config_resunet.yaml
@@ -5,7 +5,7 @@ data:
   split_file: null #/scratch/gpfs/ROSENGROUP/common/globus_share_OA/mp/dataset_1/split.json
   precision: f32
   batch_size: 1
-  train_workers: 8
+  train_workers: 4
   val_workers: 2
   pin_memory: false
   val_frac: 0.005
@@ -20,11 +20,11 @@ model:
   _target_: electrai.model.resunet.ResUNet3D
   in_channels: 1
   out_channels: 1
-  n_channels: 32
+  n_channels: 64
   n_residual_blocks: 1
   kernel_size: 5
   depth: 2
-  use_checkpoint: False
+  use_checkpoint: True
 
 # Training parameters
 precision: 32