# Manually dispatched GPU benchmark. Runs modal/benchmark.py on a Modal GPU,
# parses the BENCHMARK_<KEY>=<value> lines it prints, and renders them in the
# job summary.
name: GPU Benchmark (Modal)

on:
  workflow_dispatch:
    inputs:
      gpu:
        description: 'GPU type (L4, A10G, A100, H100)'
        default: 'L4'
        type: string
      epochs:
        description: 'Number of training epochs'
        default: '5'
        type: string
      channels:
        description: 'Number of model channels (8=tiny, 32=prod, 64=large)'
        default: '32'
        type: string
      residual_blocks:
        description: 'Number of residual blocks (2=tiny, 16=prod)'
        default: '16'
        type: string
      samples:
        description: 'Number of samples (0=all eligible)'
        default: '50'
        type: string
      dataset:
        description: 'Dataset: s3 (205 samples, matches EC2) or dataset_4 (2885 samples)'
        default: 's3'
        type: string
      wandb_project:
        description: 'WandB project name'
        default: 'elf-net-ci'
        type: string

jobs:
  benchmark:
    runs-on: ubuntu-latest
    env:
      MODAL_TOKEN_ID: ${{ secrets.MODAL_TOKEN_ID }}
      MODAL_TOKEN_SECRET: ${{ secrets.MODAL_TOKEN_SECRET }}
    steps:
      - uses: actions/checkout@v4
      - uses: astral-sh/setup-uv@v5
      - uses: actions/setup-python@v5
        with:
          python-version: "3.12"

      - name: Install Modal
        run: uv pip install --system modal

      - name: Run benchmark
        id: bench
        env:
          GPU: ${{ inputs.gpu || 'L4' }}
          EPOCHS: ${{ inputs.epochs || '5' }}
          CHANNELS: ${{ inputs.channels || '32' }}
          RESIDUAL_BLOCKS: ${{ inputs.residual_blocks || '16' }}
          SAMPLES: ${{ inputs.samples || '50' }}
          DATASET: ${{ inputs.dataset || 's3' }}
          WANDB_PROJECT: ${{ inputs.wandb_project || 'elf-net-ci' }}
        run: |
          # The step does not set `shell:`, so it runs under `bash -e {0}`
          # WITHOUT pipefail. The pipeline below would then report tee's exit
          # status and a failed `modal run` would leave this step green.
          set -o pipefail

          # Inputs are passed through env vars (not inlined ${{ }}) so their
          # values are never interpreted as shell syntax.
          OUTPUT=$(modal run modal/benchmark.py \
            --gpu "$GPU" \
            --epochs "$EPOCHS" \
            --channels "$CHANNELS" \
            --residual-blocks "$RESIDUAL_BLOCKS" \
            --samples "$SAMPLES" \
            --dataset "$DATASET" \
            --wandb-project "$WANDB_PROJECT" 2>&1 | tee /dev/stderr)

          # Parse metrics from output
          for key in VAL_LOSS TRAIN_LOSS WALLCLOCK GPU DATASET SAMPLES WANDB_URL; do
            # `|| true`: a missing key (e.g. no WandB URL) is not an error.
            val=$(echo "$OUTPUT" | grep -oP "BENCHMARK_${key}=\K.*" || true)
            if [ -n "$val" ]; then
              echo "BENCHMARK_${key}=${val}" >> "$GITHUB_OUTPUT"
            fi
          done

      - name: Summary
        env:
          EPOCHS: ${{ inputs.epochs || '5' }}
          CHANNELS: ${{ inputs.channels || '32' }}
          RESIDUAL_BLOCKS: ${{ inputs.residual_blocks || '16' }}
          VAL_LOSS: ${{ steps.bench.outputs.BENCHMARK_VAL_LOSS }}
          TRAIN_LOSS: ${{ steps.bench.outputs.BENCHMARK_TRAIN_LOSS }}
          WALLCLOCK: ${{ steps.bench.outputs.BENCHMARK_WALLCLOCK }}
          GPU: ${{ steps.bench.outputs.BENCHMARK_GPU }}
          DATASET: ${{ steps.bench.outputs.BENCHMARK_DATASET }}
          SAMPLES: ${{ steps.bench.outputs.BENCHMARK_SAMPLES }}
          WANDB_URL: ${{ steps.bench.outputs.BENCHMARK_WANDB_URL }}
          WANDB_PROJECT: ${{ inputs.wandb_project || 'elf-net-ci' }}
        run: |
          echo "## GPU Benchmark Results (Modal)" >> "$GITHUB_STEP_SUMMARY"
          echo "" >> "$GITHUB_STEP_SUMMARY"
          echo "### Configuration" >> "$GITHUB_STEP_SUMMARY"
          echo "| Parameter | Value |" >> "$GITHUB_STEP_SUMMARY"
          echo "|-----------|-------|" >> "$GITHUB_STEP_SUMMARY"
          echo "| Platform | Modal ($GPU) |" >> "$GITHUB_STEP_SUMMARY"
          echo "| Epochs | $EPOCHS |" >> "$GITHUB_STEP_SUMMARY"
          echo "| Channels | $CHANNELS |" >> "$GITHUB_STEP_SUMMARY"
          echo "| Residual Blocks | $RESIDUAL_BLOCKS |" >> "$GITHUB_STEP_SUMMARY"
          echo "| Dataset | $DATASET ($SAMPLES samples) |" >> "$GITHUB_STEP_SUMMARY"
          COMMIT_LINK="[\`${GITHUB_SHA::7}\`](${GITHUB_SERVER_URL}/${GITHUB_REPOSITORY}/commit/${GITHUB_SHA})"
          echo "| Commit | $COMMIT_LINK |" >> "$GITHUB_STEP_SUMMARY"
          if [ -n "$WANDB_URL" ]; then
            # The run id is the last path segment of the run URL.
            WANDB_RUN_ID=$(echo "$WANDB_URL" | grep -oP '[^/]+$')
            echo "| WandB | [$WANDB_PROJECT](https://wandb.ai/PrinceOA/$WANDB_PROJECT) run [$WANDB_RUN_ID]($WANDB_URL) |" >> "$GITHUB_STEP_SUMMARY"
          fi
          echo "" >> "$GITHUB_STEP_SUMMARY"
          echo "### Results" >> "$GITHUB_STEP_SUMMARY"
          echo "| Metric | Value |" >> "$GITHUB_STEP_SUMMARY"
          echo "|--------|-------|" >> "$GITHUB_STEP_SUMMARY"
          echo "| val_loss | $VAL_LOSS |" >> "$GITHUB_STEP_SUMMARY"
          echo "| train_loss | $TRAIN_LOSS |" >> "$GITHUB_STEP_SUMMARY"
          echo "| Wallclock | ${WALLCLOCK}s |" >> "$GITHUB_STEP_SUMMARY"
+ +Two data sources on the Volume: +- "s3" (default): 205 samples from s3://openathena/electrai/ (≤25MB, matches EC2 benchmark) +- "dataset_4": 2,885 samples from Globus/Della dataset_4 (large grids, needs A100 for prod config) + +Usage: + # Default: 50 samples from S3 set, 5 epochs, 32ch/16blk, L4 (matches EC2 benchmark) + modal run modal/benchmark.py + + # Dataset_4 with tiny model on L4 + modal run modal/benchmark.py --dataset dataset_4 --channels 8 --residual-blocks 2 + + # Production model on A100 with dataset_4 + modal run modal/benchmark.py --dataset dataset_4 --gpu A100 --channels 32 --residual-blocks 16 + + # All S3 samples + modal run modal/benchmark.py --samples 0 + + # Quick smoke test + modal run modal/benchmark.py --samples 10 --epochs 2 --channels 8 --residual-blocks 2 +""" + +from __future__ import annotations + +from pathlib import Path + +import modal + +ROOT = Path(__file__).parent.parent + +data_volume = modal.Volume.from_name("electrai-data") + +image = ( + modal.Image.debian_slim(python_version="3.12") + .apt_install("git") + .pip_install_from_pyproject( + str(ROOT / "pyproject.toml"), optional_dependencies=["dev"] + ) + .add_local_dir(str(ROOT / "src"), remote_path="/root/electrai/src", copy=True) + .add_local_dir( + str(ROOT / "scripts"), remote_path="/root/electrai/scripts", copy=True + ) + .add_local_file( + str(ROOT / "pyproject.toml"), + remote_path="/root/electrai/pyproject.toml", + copy=True, + ) + .run_commands("cd /root/electrai && pip install --no-deps -e .") +) + +app = modal.App("electrai-benchmark", image=image) + +# Data roots on the Volume +DATASETS = { + # Mirrors s3://openathena/electrai/ — same data as EC2 gpu-benchmark.yml + # Note: S3 uses input/ but RhoRead expects data/, so we symlink + "s3": { + "root": "/data/s3/openathena/electrai", + "input_dir": "input", # S3 naming + "max_file_size": 25, # matches EC2 benchmark default + }, + # Globus/Della dataset_4 — 2,885 samples, large grids + "dataset_4": { + "root": 
"/data/mp/chg_datasets/dataset_4", + "input_dir": "data", # Della naming + "max_file_size": 100, + }, +} + + +@app.function( + gpu="L4", + volumes={"/data": data_volume}, + secrets=[modal.Secret.from_name("wandb-credentials")], + timeout=7200, + retries=0, +) +def run_benchmark( + epochs: int = 5, + channels: int = 32, + residual_blocks: int = 16, + samples: int = 50, + max_file_size: float = -1, + seed: int = 42, + wandb_project: str = "elf-net-ci", + gpu_type: str = "L4", + dataset: str = "s3", + local_copy: bool = False, +): + """Run benchmark and return results.""" + import logging + import sys + + log = logging.getLogger(__name__) + logging.basicConfig(level=logging.INFO) + + sys.path.insert(0, "/root/electrai/scripts") + from e2e_train import run_training + + ds = DATASETS[dataset] + ds_root = Path(ds["root"]) + input_dir = ds["input_dir"] + + # Use dataset-specific default if max_file_size not explicitly set + if max_file_size < 0: + max_file_size = ds["max_file_size"] + + # Build filelist from files on disk (S3 set has no mp_filelist.txt) + data_dir = ds_root / input_dir + all_ids = sorted(p.stem for p in data_dir.glob("*.CHGCAR")) + log.info("Dataset %r: %d total samples in %s", dataset, len(all_ids), data_dir) + + # Filter by file size (avoid OOM on large grids) + if max_file_size > 0: + max_bytes = int(max_file_size * 1024 * 1024) + eligible = [ + sid + for sid in all_ids + if (data_dir / f"{sid}.CHGCAR").stat().st_size <= max_bytes + ] + log.info( + "File size filter: %d/%d eligible (<=%.0fMB)", + len(eligible), + len(all_ids), + max_file_size, + ) + else: + eligible = all_ids + + # Select samples: first N (lexicographic, matching s3_sync.py behavior) + if 0 < samples < len(eligible): + subset = eligible[:samples] + log.info("Selected first %d/%d eligible samples", samples, len(eligible)) + else: + subset = eligible + samples = len(subset) + log.info("Using all %d eligible samples", samples) + + if not subset: + raise ValueError( + f"No eligible samples 
(dataset={dataset}, total={len(all_ids)}, " + f"max_file_size={max_file_size}MB)" + ) + + data_root = "/tmp/benchmark_data" + Path(data_root).mkdir(parents=True, exist_ok=True) + + if local_copy: + # Copy selected samples to local disk (Volume I/O is ~15x slower) + import shutil + + local_data = Path(data_root) / "data" + local_label = Path(data_root) / "label" + local_data.mkdir(parents=True, exist_ok=True) + local_label.mkdir(parents=True, exist_ok=True) + for sid in subset: + shutil.copy2( + ds_root / input_dir / f"{sid}.CHGCAR", local_data / f"{sid}.CHGCAR" + ) + shutil.copy2( + ds_root / "label" / f"{sid}.CHGCAR", local_label / f"{sid}.CHGCAR" + ) + log.info("Copied %d samples to local disk", len(subset)) + else: + # Symlink to volume (slower I/O but no copy overhead for large datasets) + data_link = Path(data_root) / "data" + if not data_link.exists(): + data_link.symlink_to(ds_root / input_dir) + label_link = Path(data_root) / "label" + if not label_link.exists(): + label_link.symlink_to(ds_root / "label") + log.info("Using volume directly (no local copy)") + + Path(data_root, "mp_filelist.txt").write_text("\n".join(subset) + "\n") + + # Always use gradient checkpointing (32ch/16blk needs it even for ≤25MB files) + use_grad_ckpt = True + + log.info( + "Benchmark: gpu=%s, epochs=%d, channels=%d, blocks=%d, samples=%d, " + "dataset=%s, grad_ckpt=%s", + gpu_type, + epochs, + channels, + residual_blocks, + samples, + dataset, + use_grad_ckpt, + ) + + # Set WandB run name and env vars for platform tagging + import os + import time + + os.environ["INSTANCE_TYPE"] = f"modal-{gpu_type}" + # Set workflow-like name so WandB run name is descriptive. + # GHA sets GITHUB_RUN_NUMBER; for local runs, use timestamp. 
+ os.environ["GITHUB_WORKFLOW"] = "Modal Benchmark" + if "GITHUB_RUN_NUMBER" not in os.environ: + os.environ["GITHUB_RUN_NUMBER"] = time.strftime("%y%m%d-%H%M") + + results = run_training( + channels=channels, + residual_blocks=residual_blocks, + epochs=epochs, + seed=seed, + gpu=True, + gradient_checkpoint=use_grad_ckpt, + data_root=data_root, + max_file_size=0, # already filtered above + wandb_project=wandb_project, + verbose=True, + ) + + log.info("val_loss: %.6f", results["final_val_loss"]) + log.info("train_loss: %.6f", results["final_train_loss"]) + log.info("Wallclock: %.1fs", results["wallclock_s"]) + log.info("GPU: %s (Modal)", gpu_type) + + return results + + +@app.local_entrypoint() +def main( + gpu: str = "L4", + epochs: int = 5, + channels: int = 32, + residual_blocks: int = 16, + samples: int = 50, + max_file_size: float = -1, + seed: int = 42, + wandb_project: str = "elf-net-ci", + dataset: str = "s3", + local_copy: bool = False, +): + import logging + + logging.basicConfig(level=logging.INFO) + log = logging.getLogger(__name__) + + benchmark_fn = run_benchmark + if gpu != "L4": + benchmark_fn = run_benchmark.with_options(gpu=gpu) + + results = benchmark_fn.remote( + epochs=epochs, + channels=channels, + residual_blocks=residual_blocks, + samples=samples, + max_file_size=max_file_size, + seed=seed, + wandb_project=wandb_project, + gpu_type=gpu, + dataset=dataset, + local_copy=local_copy, + ) + + log.info( + "Benchmark complete: val_loss=%.6f, wallclock=%.1fs on %s", + results["final_val_loss"], + results["wallclock_s"], + gpu, + ) + + # Print parseable output for GHA summary + wandb_url = results.get("wandb_run_url") or "" + print(f"BENCHMARK_VAL_LOSS={results['final_val_loss']:.6f}") # noqa: T201 + print(f"BENCHMARK_TRAIN_LOSS={results['final_train_loss']:.6f}") # noqa: T201 + print(f"BENCHMARK_WALLCLOCK={results['wallclock_s']:.0f}") # noqa: T201 + print(f"BENCHMARK_GPU={gpu}") # noqa: T201 + print(f"BENCHMARK_DATASET={dataset}") # noqa: T201 + 
print(f"BENCHMARK_SAMPLES={samples}") # noqa: T201 + if wandb_url: + print(f"BENCHMARK_WANDB_URL={wandb_url}") # noqa: T201 diff --git a/modal/ci.py b/modal/ci.py new file mode 100644 index 00000000..0cd9da96 --- /dev/null +++ b/modal/ci.py @@ -0,0 +1,93 @@ +"""Modal GPU CI for electrai e2e training test.""" + +from __future__ import annotations + +from pathlib import Path + +import modal + +ROOT = Path(__file__).parent.parent + +# Dependencies read from pyproject.toml (shared with train.py, populate_volume.py) +image = ( + modal.Image.debian_slim(python_version="3.12") + .apt_install("git") + .pip_install_from_pyproject( + str(ROOT / "pyproject.toml"), optional_dependencies=["dev"] + ) + .add_local_dir(str(ROOT / "src"), remote_path="/root/electrai/src", copy=True) + .add_local_dir( + str(ROOT / "scripts"), remote_path="/root/electrai/scripts", copy=True + ) + .add_local_dir(str(ROOT / "tests"), remote_path="/root/electrai/tests", copy=True) + .add_local_dir(str(ROOT / "data"), remote_path="/root/electrai/data", copy=True) + .add_local_file( + str(ROOT / "pyproject.toml"), + remote_path="/root/electrai/pyproject.toml", + copy=True, + ) + .run_commands("cd /root/electrai && pip install --no-deps -e .") +) + +app = modal.App("electrai-ci", image=image) + + +@app.function(gpu="L4", timeout=600, retries=0) +def run_e2e_test(epochs: int = 5, check: bool = True): + """Run e2e training test on GPU.""" + import json + import logging + import sys + + log = logging.getLogger(__name__) + + sys.path.insert(0, "/root/electrai/scripts") + from e2e_train import run_training + + results = run_training(epochs=epochs, gpu=True, verbose=True) + log.info("Platform: %s", results["platform"]) + log.info("Final val_loss: %.6f", results["final_val_loss"]) + if results["final_train_loss"] is not None: + log.info("Final train_loss: %.6f", results["final_train_loss"]) + log.info("Wallclock: %.1fs", results["wallclock_s"]) + + if check: + expected_file = 
"""Populate Modal Volume with training data from S3."""

from __future__ import annotations

import modal

app = modal.App("electrai-populate")
volume = modal.Volume.from_name("electrai-data", create_if_missing=True)


@app.function(
    image=modal.Image.debian_slim(python_version="3.12").pip_install("boto3"),
    volumes={"/data": volume},
    secrets=[modal.Secret.from_name("aws-credentials")],
    timeout=7200,
    retries=0,
)
def sync_s3(
    bucket: str = "openathena",
    prefix: str = "electrai/mp/chg_datasets/dataset_4",
    dest: str = "/data/mp/chg_datasets/dataset_4",
):
    """Sync dataset from S3 to Modal Volume.

    Objects under ``prefix`` are downloaded to ``dest`` with the same relative
    layout. A file that already exists locally with the same size is skipped,
    so the sync can be re-run to resume.

    Args:
        bucket: S3 bucket name.
        prefix: Key prefix, treated as a directory (a trailing ``/`` is
            optional). An empty prefix syncs the whole bucket.
        dest: Destination directory inside the mounted Volume.
    """
    import logging
    from pathlib import Path

    import boto3

    log = logging.getLogger(__name__)
    logging.basicConfig(level=logging.INFO)

    s3 = boto3.client("s3")
    paginator = s3.get_paginator("list_objects_v2")

    # Anchor the prefix on a "/" so "…/dataset_4" does not also match sibling
    # prefixes such as "…/dataset_40/".
    list_prefix = (prefix.rstrip("/") + "/") if prefix.strip("/") else ""

    # List all objects, leaving out zero-byte "folder" placeholder keys that
    # end in "/". Downloading one would create a regular file where a
    # directory is needed and break mkdir() for everything beneath it.
    objects = [
        obj
        for page in paginator.paginate(Bucket=bucket, Prefix=list_prefix)
        for obj in page.get("Contents", [])
        if not obj["Key"].endswith("/")
    ]

    total_bytes = sum(o["Size"] for o in objects)
    log.info("Found %d objects, %.1f GiB total", len(objects), total_bytes / (1024**3))

    dest_root = Path(dest)
    downloaded = 0
    skipped = 0
    for count, obj in enumerate(objects, start=1):
        key = obj["Key"]
        local_path = dest_root / key[len(list_prefix) :]
        local_path.parent.mkdir(parents=True, exist_ok=True)

        # Skip if already exists with same size
        if local_path.exists() and local_path.stat().st_size == obj["Size"]:
            skipped += 1
        else:
            s3.download_file(bucket, key, str(local_path))
            downloaded += 1

        # Report every 100 objects whether or not this one was skipped, so a
        # resumed sync still shows progress.
        if count % 100 == 0:
            log.info(
                "Progress: %d/%d (downloaded %d, skipped %d)",
                count,
                len(objects),
                downloaded,
                skipped,
            )

    log.info("Done: %d downloaded, %d skipped (already existed)", downloaded, skipped)
    volume.commit()


@app.local_entrypoint()
def main(
    bucket: str = "openathena",
    prefix: str = "electrai/mp/chg_datasets/dataset_4",
    dest: str = "/data/mp/chg_datasets/dataset_4",
):
    """Run ``sync_s3`` remotely with the given bucket, prefix and destination."""
    import logging

    logging.basicConfig(level=logging.INFO)
    sync_s3.remote(bucket=bucket, prefix=prefix, dest=dest)
    logging.getLogger(__name__).info("Volume populated.")
+ +Usage: + # Default config (ResUNet, dataset_4, 50 epochs, L4) + modal run modal/train.py + + # Custom config file + modal run modal/train.py --config examples/MP/experiments/experiment_0/config.yaml + + # Override GPU, epochs, channels + modal run modal/train.py --gpu A100 --epochs 10 --channels 64 + + # Resume from last checkpoint + modal run modal/train.py --gpu A100 --resume +""" + +from __future__ import annotations + +from pathlib import Path + +import modal + +ROOT = Path(__file__).parent.parent + +# Persistent volumes +data_volume = modal.Volume.from_name("electrai-data") +ckpt_volume = modal.Volume.from_name("electrai-checkpoints", create_if_missing=True) + +# Dependencies read from pyproject.toml (shared with ci.py, populate_volume.py) +image = ( + modal.Image.debian_slim(python_version="3.12") + .apt_install("git") + .pip_install_from_pyproject( + str(ROOT / "pyproject.toml"), optional_dependencies=["dev"] + ) + .add_local_dir(str(ROOT / "src"), remote_path="/root/electrai/src", copy=True) + .add_local_dir( + str(ROOT / "examples"), remote_path="/root/electrai/examples", copy=True + ) + .add_local_file( + str(ROOT / "pyproject.toml"), + remote_path="/root/electrai/pyproject.toml", + copy=True, + ) + .run_commands("cd /root/electrai && pip install --no-deps -e .") +) + +app = modal.App("electrai-train", image=image) + +DATA_ROOT = "/data/mp/chg_datasets/dataset_4" +CKPT_ROOT = "/checkpoints" + + +@app.function( + gpu="L4", + volumes={"/data": data_volume, CKPT_ROOT: ckpt_volume}, + secrets=[modal.Secret.from_name("wandb-credentials")], + timeout=14400, # 4 hours + retries=0, +) +def train(config_json: str, gpu_type: str = "L4"): + """Run training with the given config (as JSON, converted to YAML remotely).""" + import json + import logging + import subprocess + import sys + + import yaml + + log = logging.getLogger(__name__) + logging.basicConfig(level=logging.INFO) + + cfg = json.loads(config_json) + + # Write config as YAML for the training entrypoint 
+ config_path = Path("/tmp/config.yaml") + with config_path.open("w") as f: + yaml.dump(cfg, f, default_flow_style=False) + + log.info("GPU: %s", gpu_type) + log.info("Data root: %s", DATA_ROOT) + log.info("Checkpoint dir: %s", CKPT_ROOT) + + # Verify data exists + filelist = Path(DATA_ROOT) / "mp_filelist.txt" + if not filelist.exists(): + raise FileNotFoundError(f"Filelist not found: {filelist}") + n_samples = len(filelist.read_text().strip().splitlines()) + log.info("Dataset: %d samples", n_samples) + + # Check for existing checkpoint to resume from + ckpt_path = Path(cfg.get("ckpt_path", CKPT_ROOT)) + last_ckpt = ckpt_path / "last.ckpt" + if last_ckpt.exists(): + log.info("Found checkpoint: %s", last_ckpt) + else: + log.info("No checkpoint found, starting from scratch") + + # Run training + result = subprocess.run( + [ + sys.executable, + "-m", + "electrai.entrypoints.main", + "train", + "--config", + str(config_path), + ], + cwd="/root/electrai", + check=False, + ) + + # Persist checkpoints + ckpt_volume.commit() + + if result.returncode != 0: + raise RuntimeError(f"Training failed with exit code {result.returncode}") + + log.info("Training complete. 
Checkpoints saved to electrai-checkpoints volume.") + + +@app.local_entrypoint() +def main( + config: str = "", + gpu: str = "L4", + epochs: int = 50, + channels: int = 32, + residual_blocks: int = 1, + depth: int = 2, + kernel_size: int = 5, + lr: float = 0.01, + batch_size: int = 1, + val_frac: float = 0.005, + wandb_project: str = "mp-experiment", + resume: bool = False, +): + import json + import logging + + logging.basicConfig(level=logging.INFO) + log = logging.getLogger(__name__) + + if config: + import subprocess + + result = subprocess.run( + [ + "python3", + "-c", + f"import yaml, json; print(json.dumps(yaml.safe_load(open('{config}'))))", + ], + capture_output=True, + text=True, + check=True, + ) + cfg = json.loads(result.stdout) + else: + cfg = { + "data": { + "_target_": "electrai.dataloader.dataset.RhoRead", + "root": f"{DATA_ROOT}/mp_filelist.txt", + "split_file": None, + "precision": "f32", + "batch_size": batch_size, + "train_workers": 4, + "val_workers": 2, + "pin_memory": False, + "val_frac": val_frac, + "drop_last": False, + "augmentation": False, + "random_seed": 42, + }, + "model": { + "_target_": "electrai.model.resunet.ResUNet3D", + "in_channels": 1, + "out_channels": 1, + "n_channels": channels, + "n_residual_blocks": residual_blocks, + "kernel_size": kernel_size, + "depth": depth, + "use_checkpoint": False, + }, + "precision": 32, + "epochs": epochs, + "lr": lr, + "weight_decay": 0.0, + "warmup_length": 1, + "beta1": 0.9, + "beta2": 0.99, + "wandb_mode": "online", + "entity": "PrinceOA", + "wb_pname": wandb_project, + "ckpt_path": CKPT_ROOT, + } + + # Always override data root and checkpoint path for Modal + if "data" in cfg: + cfg["data"]["root"] = f"{DATA_ROOT}/mp_filelist.txt" + cfg["ckpt_path"] = CKPT_ROOT + + if not resume: + cfg.pop("resume_from_checkpoint", None) + + config_json = json.dumps(cfg, indent=2) + log.info("Config:\n%s", config_json) + + train_fn = train + if gpu != "L4": + train_fn = train.with_options(gpu=gpu) + + 
train_fn.remote(config_json=config_json, gpu_type=gpu) + log.info("Done.") diff --git a/specs/modal-ci.md b/specs/modal-ci.md new file mode 100644 index 00000000..dcfa408b --- /dev/null +++ b/specs/modal-ci.md @@ -0,0 +1,177 @@ +# Modal GPU CI for electrai + +## Context + +electrai currently runs GPU e2e tests on EC2 via `ec2-gha` (`gpu-e2e.yml`). This spec adds a parallel Modal-based GPU CI workflow, following the helico pattern: run tests directly inside a Modal `@app.function` rather than provisioning a self-hosted runner. + +## Approach + +Like helico's `modal/ci.py`: define a Modal app with the project's deps baked into the image, copy source/tests in, run `e2e_train.py` inside the function. The GHA workflow just calls `modal run` from `ubuntu-latest`. + +## Files to Create + +### 1. `modal/ci.py` + +```python +from pathlib import Path +import modal + +ROOT = Path(__file__).parent.parent + +image = ( + modal.Image.debian_slim(python_version="3.12") + .apt_install("git") + .pip_install( + "torch>=2.9", + "torchvision>=0.24", + "lightning~=2.5", + "numpy~=2.3", + "scikit-learn>=1.7", + "pymatgen>=2025.10", + "pyyaml>=6.0", + "zarr>=3.1", + "hydra-core>=1.3", + "wandb>=0.12", + "click", + ) + # Source code + tests + data (changes most, last layer) + .add_local_dir(str(ROOT / "src"), remote_path="/root/electrai/src") + .add_local_dir(str(ROOT / "tests"), remote_path="/root/electrai/tests") + .add_local_dir(str(ROOT / "scripts"), remote_path="/root/electrai/scripts") + .add_local_dir(str(ROOT / "data"), remote_path="/root/electrai/data") + .add_local_file( + str(ROOT / "pyproject.toml"), remote_path="/root/electrai/pyproject.toml" + ) +) + +app = modal.App("electrai-ci", image=image) + + +@app.function(gpu="L4", timeout=600) +def run_e2e_test(epochs: int = 5, check: bool = True): + """Run e2e training test on GPU.""" + import subprocess + + cmd = [ + "python", + "scripts/e2e_train.py", + "--gpu", + "--verbose", + "--epochs", + str(epochs), + ] + if not check: + 
cmd.append("--no-check") + result = subprocess.run(cmd, cwd="/root/electrai", check=True) + return result.returncode + + +@app.local_entrypoint() +def main(epochs: int = 5, check: bool = True): + run_e2e_test.remote(epochs=epochs, check=check) +``` + +Notes: +- GPU type: `L4` matches current EC2 `g6.xlarge` (also L4). Could use fallback list `gpu=["L4", "A10G"]`. +- `data/MP/chgcars/` has the small test dataset (~5 samples) checked into the repo; bake it in. +- `scripts/e2e_train.py` handles GPU detection, deterministic seeding, and expected value checking internally. +- No `uv` needed inside the container — deps are pre-installed by `.pip_install()`. +- `PYTHONPATH` needs to include `src/` for the `electrai` package. Either `pip install -e .` inside the function, or set env. The simplest: add `.run_commands("cd /root/electrai && pip install -e .")` as the last image layer. + +### 2. `.github/workflows/gpu-e2e-modal.yml` + +```yaml +name: GPU E2E (Modal) + +on: + pull_request: + branches: [main] + workflow_dispatch: + inputs: + epochs: + description: 'Number of training epochs' + default: '5' + type: string + +jobs: + test: + runs-on: ubuntu-latest + env: + MODAL_TOKEN_ID: ${{ secrets.MODAL_TOKEN_ID }} + MODAL_TOKEN_SECRET: ${{ secrets.MODAL_TOKEN_SECRET }} + steps: + - uses: actions/checkout@v4 + - uses: astral-sh/setup-uv@v5 + - uses: actions/setup-python@v5 + with: + python-version: "3.12" + - run: uv pip install --system modal + - run: modal run modal/ci.py --epochs ${{ inputs.epochs || '5' }} +``` + +## What this does NOT replicate from `gpu-e2e.yml` + +- **CPU baseline test** — could add a second `@app.function(gpu=None)` call, but CPU tests already run in `gen-expected.yml` +- **`update_expected` mode** — would need to get the file back out of Modal (print to stdout and capture, or use a Modal Volume). Defer for now. 
+- **Artifact upload** — same challenge; not needed for the basic CI pass/fail + +## Training on Modal (`modal/train.py`) + +Full training entrypoint for running real experiments on Modal, replacing Lambda Labs. + +### Data: `electrai-data` Volume + +`dataset_4` (2,885 samples, ~205 GiB) synced from Della (Globus source of truth) → S3 → Modal Volume: +- S3: `s3://openathena/electrai/mp/chg_datasets/dataset_4/` +- Volume mount: `/data/mp/chg_datasets/dataset_4/{data,label}/` + +Populate script: `modal/populate_volume.py` (S3 → Volume via `boto3`). + +### Checkpoints: `electrai-checkpoints` Volume + +Persists across runs. Mounted at `/checkpoints`. + +### Usage + +```bash +# Default: ResUNet, dataset_4, 50 epochs, L4 +modal run modal/train.py + +# A100, custom hyperparams +modal run modal/train.py --gpu A100 --channels 64 --epochs 50 + +# Use existing config file +modal run modal/train.py --config path/to/config.yaml --gpu A100 +``` + +### Data provenance + +``` +Globus (ROSENGROUP share) + └── /mp/chg_datasets/dataset_4/ (canonical, on Della) + ├── Della → S3 (aws s3 sync, one-time) + │ └── s3://openathena/electrai/mp/chg_datasets/dataset_4/ + │ └── S3 → Modal Volume (modal/populate_volume.py) + └── Della → Lambda LLFS (Globus transfer, Betsy's prior setup) + └── /home/ubuntu/betsy-electrai-2/dataset2/ +``` + +## Secrets required + +- `MODAL_TOKEN_ID` / `MODAL_TOKEN_SECRET` — repo secrets for GHA workflow +- `wandb-credentials` — Modal secret with `WANDB_API_KEY` (for training) +- `aws-credentials` — Modal secret with AWS creds (for `populate_volume.py`) + +No `GH_SA_TOKEN` needed (no runner registration). 
+ +## Comparison: EC2 vs Modal for this workload + +| | EC2 (`gpu-e2e.yml`) | Modal (`gpu-e2e-modal.yml`) | +|---|---|---| +| Setup time | ~3-5 min (instance boot) | ~30s (image cached) | +| GPU | L4 (g6.xlarge) | L4 | +| Deps install | `uv sync` on every run | Baked into image (cached) | +| Checkout | `actions/checkout` | `.add_local_dir()` | +| Artifacts | Native GHA | Not supported (print to stdout) | +| Cost | EC2 on-demand pricing | Modal per-second billing | +| Extra secrets | `GH_SA_TOKEN`, AWS OIDC | Modal tokens only | diff --git a/src/electrai/dataloader/dataset.py b/src/electrai/dataloader/dataset.py index 1999c0f9..5b9b2353 100644 --- a/src/electrai/dataloader/dataset.py +++ b/src/electrai/dataloader/dataset.py @@ -103,7 +103,7 @@ def __init__(self, datapath: str, precision: str, augmentation: bool, **kwargs): if isinstance(datapath, str) and Path(datapath).is_file(): with Path(datapath).open() as f: lines = f.readlines() - member_list = [line.replace("\n", "") for line in lines] + member_list = [line.strip() for line in lines if line.strip()] else: raise ValueError("No filename found.")