diff --git a/.github/workflows/gpu-benchmark-modal.yml b/.github/workflows/gpu-benchmark-modal.yml
new file mode 100644
index 00000000..988ea8fa
--- /dev/null
+++ b/.github/workflows/gpu-benchmark-modal.yml
@@ -0,0 +1,115 @@
+name: GPU Benchmark (Modal)
+
+on:
+  workflow_dispatch:
+    inputs:
+      gpu:
+        description: 'GPU type (L4, A10G, A100, H100)'
+        default: 'L4'
+        type: string
+      epochs:
+        description: 'Number of training epochs'
+        default: '5'
+        type: string
+      channels:
+        description: 'Number of model channels (8=tiny, 32=prod, 64=large)'
+        default: '32'
+        type: string
+      residual_blocks:
+        description: 'Number of residual blocks (2=tiny, 16=prod)'
+        default: '16'
+        type: string
+      samples:
+        description: 'Number of samples (0=all eligible)'
+        default: '50'
+        type: string
+      dataset:
+        description: 'Dataset: s3 (205 samples, matches EC2) or dataset_4 (2885 samples)'
+        default: 's3'
+        type: string
+      wandb_project:
+        description: 'WandB project name'
+        default: 'elf-net-ci'
+        type: string
+
+jobs:
+  benchmark:
+    runs-on: ubuntu-latest
+    env:
+      MODAL_TOKEN_ID: ${{ secrets.MODAL_TOKEN_ID }}
+      MODAL_TOKEN_SECRET: ${{ secrets.MODAL_TOKEN_SECRET }}
+    steps:
+      - uses: actions/checkout@v4
+      - uses: astral-sh/setup-uv@v5
+      - uses: actions/setup-python@v5
+        with:
+          python-version: "3.12"
+
+      - name: Install Modal
+        run: uv pip install --system modal
+
+      - name: Run benchmark
+        id: bench
+        env:
+          GPU: ${{ inputs.gpu || 'L4' }}
+          EPOCHS: ${{ inputs.epochs || '5' }}
+          CHANNELS: ${{ inputs.channels || '32' }}
+          RESIDUAL_BLOCKS: ${{ inputs.residual_blocks || '16' }}
+          SAMPLES: ${{ inputs.samples || '50' }}
+          DATASET: ${{ inputs.dataset || 's3' }}
+          WANDB_PROJECT: ${{ inputs.wandb_project || 'elf-net-ci' }}
+        run: |
+          # The default `bash -e` shell has no pipefail, so `tee` would mask a failed run.
+          set -o pipefail
+          OUTPUT=$(modal run modal/benchmark.py \
+            --gpu "$GPU" \
+            --epochs "$EPOCHS" \
+            --channels "$CHANNELS" \
+            --residual-blocks "$RESIDUAL_BLOCKS" \
+            --samples "$SAMPLES" \
+            --dataset "$DATASET" \
+            --wandb-project "$WANDB_PROJECT" 2>&1 | tee /dev/stderr)
+
+          # Export each BENCHMARK_<KEY>=<value> line as a step output (last match wins).
+          for key in VAL_LOSS TRAIN_LOSS WALLCLOCK GPU DATASET SAMPLES WANDB_URL; do
+            val=$(echo "$OUTPUT" | grep -oP "BENCHMARK_${key}=\K.*" | tail -n 1 || true)
+            if [ -n "$val" ]; then echo "BENCHMARK_${key}=${val}" >> "$GITHUB_OUTPUT"; fi
+          done
+
+      - name: Summary
+        env:
+          EPOCHS: ${{ inputs.epochs || '5' }}
+          CHANNELS: ${{ inputs.channels || '32' }}
+          RESIDUAL_BLOCKS: ${{ inputs.residual_blocks || '16' }}
+          VAL_LOSS: ${{ steps.bench.outputs.BENCHMARK_VAL_LOSS }}
+          TRAIN_LOSS: ${{ steps.bench.outputs.BENCHMARK_TRAIN_LOSS }}
+          WALLCLOCK: ${{ steps.bench.outputs.BENCHMARK_WALLCLOCK }}
+          GPU: ${{ steps.bench.outputs.BENCHMARK_GPU }}
+          DATASET: ${{ steps.bench.outputs.BENCHMARK_DATASET }}
+          SAMPLES: ${{ steps.bench.outputs.BENCHMARK_SAMPLES }}
+          WANDB_URL: ${{ steps.bench.outputs.BENCHMARK_WANDB_URL }}
+          WANDB_PROJECT: ${{ inputs.wandb_project || 'elf-net-ci' }}
+        run: |
+          # Emit the whole Markdown report through one grouped append to the job summary.
+          {
+            printf '%s\n' "## GPU Benchmark Results (Modal)" "" "### Configuration"
+            printf '%s\n' "| Parameter | Value |" "|-----------|-------|"
+            echo "| Platform | Modal ($GPU) |"
+            echo "| Epochs | $EPOCHS |"
+            echo "| Channels | $CHANNELS |"
+            echo "| Residual Blocks | $RESIDUAL_BLOCKS |"
+            echo "| Dataset | $DATASET ($SAMPLES samples) |"
+            COMMIT_LINK="[\`${GITHUB_SHA::7}\`](${GITHUB_SERVER_URL}/${GITHUB_REPOSITORY}/commit/${GITHUB_SHA})"
+            echo "| Commit | $COMMIT_LINK |"
+            if [ -n "$WANDB_URL" ]; then
+              WANDB_RUN_ID=$(echo "$WANDB_URL" | grep -oP '[^/]+$')
+              echo "| WandB | [$WANDB_PROJECT](https://wandb.ai/PrinceOA/$WANDB_PROJECT) run [$WANDB_RUN_ID]($WANDB_URL) |"
+            fi
+            echo ""
+            echo "### Results"
+            echo "| Metric | Value |"
+            echo "|--------|-------|"
+            echo "| val_loss | $VAL_LOSS |"
+            echo "| train_loss | $TRAIN_LOSS |"
+            echo "| Wallclock | ${WALLCLOCK}s |"
+          } >> "$GITHUB_STEP_SUMMARY"
diff --git a/.github/workflows/gpu-e2e-modal.yml b/.github/workflows/gpu-e2e-modal.yml
new file mode 100644
index 00000000..b68c03b6
--- /dev/null
+++ b/.github/workflows/gpu-e2e-modal.yml
@@ -0,0 +1,32 @@
+name: GPU E2E (Modal)
+
+on:
+  pull_request:
+    branches: [main]
+  workflow_dispatch:
+    inputs:
+      epochs:
+        description: 'Number of training epochs'
+        default: '5'
+        type: string
+
+jobs:
+  test:
+    runs-on: ubuntu-latest  # only drives `modal run`; the GPU function executes on Modal
+    env:
+      MODAL_TOKEN_ID: ${{ secrets.MODAL_TOKEN_ID }}
+      MODAL_TOKEN_SECRET: ${{ secrets.MODAL_TOKEN_SECRET }}  # NOTE(review): secrets are empty on fork PRs
+    steps:
+      - uses: actions/checkout@v4
+      - uses: astral-sh/setup-uv@v5
+      - uses: actions/setup-python@v5
+        with:
+          python-version: "3.12"
+
+      - name: Install Modal
+        run: uv pip install --system modal  # no version pin: resolves to the newest client
+
+      - name: Run e2e test on Modal GPU
+        env:
+          EPOCHS: ${{ inputs.epochs || '5' }}  # inputs are empty on pull_request, hence the fallback
+        run: modal run modal/ci.py --epochs "$EPOCHS"
diff --git a/modal/benchmark.py b/modal/benchmark.py
new file mode 100644
index 00000000..bcf82777
--- /dev/null
+++ b/modal/benchmark.py
@@ -0,0 +1,278 @@
+"""Modal GPU benchmark for electrai.
+
+Runs a configurable training benchmark on Modal GPUs with data from the
+electrai-data Volume, logs metrics to WandB, and reports wall-clock time.
+
+Two data sources on the Volume:
+- "s3" (default): 205 samples from s3://openathena/electrai/ (≤25MB, matches EC2 benchmark)
+- "dataset_4": 2,885 samples from Globus/Della dataset_4 (large grids, needs A100 for prod config)
+
+Usage:
+    # Default: 50 samples from S3 set, 5 epochs, 32ch/16blk, L4 (matches EC2 benchmark)
+    modal run modal/benchmark.py
+
+    # Dataset_4 with tiny model on L4
+    modal run modal/benchmark.py --dataset dataset_4 --channels 8 --residual-blocks 2
+
+    # Production model on A100 with dataset_4
+    modal run modal/benchmark.py --dataset dataset_4 --gpu A100 --channels 32 --residual-blocks 16
+
+    # All S3 samples
+    modal run modal/benchmark.py --samples 0
+
+    # Quick smoke test
+    modal run modal/benchmark.py --samples 10 --epochs 2 --channels 8 --residual-blocks 2
+"""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+import modal
+
+ROOT = Path(__file__).parent.parent
+
+data_volume = modal.Volume.from_name("electrai-data")
+
+image = (
+    modal.Image.debian_slim(python_version="3.12")
+    .apt_install("git")
+    .pip_install_from_pyproject(
+        str(ROOT / "pyproject.toml"), optional_dependencies=["dev"]
+    )
+    .add_local_dir(str(ROOT / "src"), remote_path="/root/electrai/src", copy=True)
+    .add_local_dir(
+        str(ROOT / "scripts"), remote_path="/root/electrai/scripts", copy=True
+    )
+    .add_local_file(
+        str(ROOT / "pyproject.toml"),
+        remote_path="/root/electrai/pyproject.toml",
+        copy=True,
+    )
+    .run_commands("cd /root/electrai && pip install --no-deps -e .")
+)
+
+app = modal.App("electrai-benchmark", image=image)
+
+# Data roots on the Volume
+DATASETS = {
+    # Mirrors s3://openathena/electrai/ — same data as EC2 gpu-benchmark.yml
+    # Note: S3 uses input/ but RhoRead expects data/, so we symlink
+    "s3": {
+        "root": "/data/s3/openathena/electrai",
+        "input_dir": "input",  # S3 naming
+        "max_file_size": 25,  # matches EC2 benchmark default
+    },
+    # Globus/Della dataset_4 — 2,885 samples, large grids
+    "dataset_4": {
+        "root": "/data/mp/chg_datasets/dataset_4",
+        "input_dir": "data",  # Della naming
+        "max_file_size": 100,
+    },
+}
+
+
+@app.function(
+    gpu="L4",
+    volumes={"/data": data_volume},
+    secrets=[modal.Secret.from_name("wandb-credentials")],
+    timeout=7200,
+    retries=0,
+)
+def run_benchmark(
+    epochs: int = 5,
+    channels: int = 32,
+    residual_blocks: int = 16,
+    samples: int = 50,  # <= 0 or >= the eligible count selects every eligible sample
+    max_file_size: float = -1,  # MiB; negative -> dataset default, 0 -> no size filter
+    seed: int = 42,
+    wandb_project: str = "elf-net-ci",
+    gpu_type: str = "L4",  # label for logs / INSTANCE_TYPE only; does not pick hardware
+    dataset: str = "s3",  # key into DATASETS
+    local_copy: bool = False,  # copy samples to local disk instead of symlinking
+):
+    """Stage the selected samples, run training on them, and return the results."""
+    import logging
+    import shutil
+    import sys
+
+    log = logging.getLogger(__name__)
+    logging.basicConfig(level=logging.INFO)
+
+    sys.path.insert(0, "/root/electrai/scripts")
+    from e2e_train import run_training
+
+    if dataset not in DATASETS:
+        raise ValueError(f"Unknown dataset {dataset!r}; valid: {sorted(DATASETS)}")
+    ds = DATASETS[dataset]
+    ds_root = Path(ds["root"])
+    input_dir = ds["input_dir"]
+
+    # Use dataset-specific default if max_file_size not explicitly set
+    if max_file_size < 0:
+        max_file_size = ds["max_file_size"]
+
+    # Build filelist from files on disk (S3 set has no mp_filelist.txt)
+    data_dir = ds_root / input_dir
+    all_ids = sorted(p.stem for p in data_dir.glob("*.CHGCAR"))
+    log.info("Dataset %r: %d total samples in %s", dataset, len(all_ids), data_dir)
+
+    # Filter by file size (avoid OOM on large grids)
+    if max_file_size > 0:
+        max_bytes = int(max_file_size * 1024 * 1024)
+        eligible = [
+            sid
+            for sid in all_ids
+            if (data_dir / f"{sid}.CHGCAR").stat().st_size <= max_bytes
+        ]
+        log.info(
+            "File size filter: %d/%d eligible (<=%.0fMB)",
+            len(eligible),
+            len(all_ids),
+            max_file_size,
+        )
+    else:
+        eligible = all_ids
+
+    # Select samples: first N (lexicographic, matching s3_sync.py behavior)
+    if 0 < samples < len(eligible):
+        subset = eligible[:samples]
+        log.info("Selected first %d/%d eligible samples", samples, len(eligible))
+    else:
+        subset = eligible
+        samples = len(subset)
+        log.info("Using all %d eligible samples", samples)
+
+    if not subset:
+        raise ValueError(
+            f"No eligible samples (dataset={dataset}, total={len(all_ids)}, "
+            f"max_file_size={max_file_size}MB)"
+        )
+
+    # Rebuild the staging dir from scratch: entries left by an earlier call in a
+    # reused container could point at another dataset (symlinks are only unlinked).
+    data_root = "/tmp/benchmark_data"
+    shutil.rmtree(data_root, ignore_errors=True)
+    Path(data_root).mkdir(parents=True, exist_ok=True)
+
+    if local_copy:
+        # Copy selected samples to local disk (Volume I/O is ~15x slower)
+        local_data = Path(data_root) / "data"
+        local_label = Path(data_root) / "label"
+        local_data.mkdir(parents=True, exist_ok=True)
+        local_label.mkdir(parents=True, exist_ok=True)
+        for sid in subset:
+            shutil.copy2(
+                ds_root / input_dir / f"{sid}.CHGCAR", local_data / f"{sid}.CHGCAR"
+            )
+            shutil.copy2(
+                ds_root / "label" / f"{sid}.CHGCAR", local_label / f"{sid}.CHGCAR"
+            )
+        log.info("Copied %d samples to local disk", len(subset))
+    else:
+        # Symlink to volume (slower I/O but no copy overhead for large datasets)
+        (Path(data_root) / "data").symlink_to(ds_root / input_dir)
+        (Path(data_root) / "label").symlink_to(ds_root / "label")
+        log.info("Using volume directly (no local copy)")
+
+    Path(data_root, "mp_filelist.txt").write_text("\n".join(subset) + "\n")
+
+    # Always use gradient checkpointing (32ch/16blk needs it even for ≤25MB files)
+    use_grad_ckpt = True
+
+    log.info(
+        "Benchmark: gpu=%s, epochs=%d, channels=%d, blocks=%d, samples=%d, "
+        "dataset=%s, grad_ckpt=%s",
+        gpu_type,
+        epochs,
+        channels,
+        residual_blocks,
+        samples,
+        dataset,
+        use_grad_ckpt,
+    )
+
+    # Set WandB run name and env vars for platform tagging
+    import os
+    import time
+
+    os.environ["INSTANCE_TYPE"] = f"modal-{gpu_type}"
+    # Workflow-like env vars, presumably consumed downstream to name the WandB run.
+    # Fall back to a timestamp when GITHUB_RUN_NUMBER is absent from this container.
+    os.environ["GITHUB_WORKFLOW"] = "Modal Benchmark"
+    if "GITHUB_RUN_NUMBER" not in os.environ:
+        os.environ["GITHUB_RUN_NUMBER"] = time.strftime("%y%m%d-%H%M")
+
+    results = run_training(
+        channels=channels,
+        residual_blocks=residual_blocks,
+        epochs=epochs,
+        seed=seed,
+        gpu=True,
+        gradient_checkpoint=use_grad_ckpt,
+        data_root=data_root,
+        max_file_size=0,  # already filtered above
+        wandb_project=wandb_project,
+        verbose=True,
+    )
+
+    log.info("val_loss: %.6f", results["final_val_loss"])
+    log.info("train_loss: %.6f", results["final_train_loss"])
+    log.info("Wallclock: %.1fs", results["wallclock_s"])
+    log.info("GPU: %s (Modal)", gpu_type)
+
+    return results
+
+
+@app.local_entrypoint()
+def main(
+    gpu: str = "L4",
+    epochs: int = 5,
+    channels: int = 32,
+    residual_blocks: int = 16,
+    samples: int = 50,
+    max_file_size: float = -1,
+    seed: int = 42,
+    wandb_project: str = "elf-net-ci",
+    dataset: str = "s3",
+    local_copy: bool = False,
+):
+    """Run ``run_benchmark`` remotely on ``gpu`` and print BENCHMARK_* lines for CI."""
+    import logging
+
+    logging.basicConfig(level=logging.INFO)
+    log = logging.getLogger(__name__)
+    # Reject a bad --dataset here, before a GPU container is provisioned for the call.
+    if dataset not in DATASETS:
+        raise ValueError(f"Unknown dataset {dataset!r}; valid: {sorted(DATASETS)}")
+    benchmark_fn = run_benchmark if gpu == "L4" else run_benchmark.with_options(gpu=gpu)
+
+    results = benchmark_fn.remote(
+        epochs=epochs,
+        channels=channels,
+        residual_blocks=residual_blocks,
+        samples=samples,
+        max_file_size=max_file_size,
+        seed=seed,
+        wandb_project=wandb_project,
+        gpu_type=gpu,
+        dataset=dataset,
+        local_copy=local_copy,
+    )
+
+    log.info(
+        "Benchmark complete: val_loss=%.6f, wallclock=%.1fs on %s",
+        results["final_val_loss"],
+        results["wallclock_s"],
+        gpu,
+    )
+
+    # Lines parsed by the GHA workflow. NOTE: SAMPLES is the requested count (0 = all).
+    print(f"BENCHMARK_VAL_LOSS={results['final_val_loss']:.6f}")  # noqa: T201
+    print(f"BENCHMARK_TRAIN_LOSS={results['final_train_loss']:.6f}")  # noqa: T201
+    print(f"BENCHMARK_WALLCLOCK={results['wallclock_s']:.0f}")  # noqa: T201
+    print(f"BENCHMARK_GPU={gpu}")  # noqa: T201
+    print(f"BENCHMARK_DATASET={dataset}")  # noqa: T201
+    print(f"BENCHMARK_SAMPLES={samples}")  # noqa: T201
+    if wandb_url := results.get("wandb_run_url"):
+        print(f"BENCHMARK_WANDB_URL={wandb_url}")  # noqa: T201
diff --git a/modal/ci.py b/modal/ci.py
new file mode 100644
index 00000000..0cd9da96
--- /dev/null
+++ b/modal/ci.py
@@ -0,0 +1,93 @@
+"""Modal GPU CI for electrai e2e training test."""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+import modal
+
+ROOT = Path(__file__).parent.parent
+
+# Dependencies read from pyproject.toml (shared with train.py, populate_volume.py)
+image = (
+    modal.Image.debian_slim(python_version="3.12")
+    .apt_install("git")
+    .pip_install_from_pyproject(
+        str(ROOT / "pyproject.toml"), optional_dependencies=["dev"]
+    )
+    .add_local_dir(str(ROOT / "src"), remote_path="/root/electrai/src", copy=True)
+    .add_local_dir(
+        str(ROOT / "scripts"), remote_path="/root/electrai/scripts", copy=True
+    )
+    .add_local_dir(str(ROOT / "tests"), remote_path="/root/electrai/tests", copy=True)
+    .add_local_dir(str(ROOT / "data"), remote_path="/root/electrai/data", copy=True)
+    .add_local_file(
+        str(ROOT / "pyproject.toml"),
+        remote_path="/root/electrai/pyproject.toml",
+        copy=True,
+    )
+    .run_commands("cd /root/electrai && pip install --no-deps -e .")
+)
+
+app = modal.App("electrai-ci", image=image)
+
+
+@app.function(gpu="L4", timeout=600, retries=0)
+def run_e2e_test(epochs: int = 5, check: bool = True):
+    """Run e2e training on GPU and, if ``check``, verify val_loss against expected."""
+    import json
+    import logging
+    import sys
+
+    log = logging.getLogger(__name__)
+    logging.basicConfig(level=logging.INFO)  # no-op if configured; else INFO is dropped
+
+    sys.path.insert(0, "/root/electrai/scripts")
+    from e2e_train import run_training
+
+    results = run_training(epochs=epochs, gpu=True, verbose=True)
+    log.info("Platform: %s", results["platform"])
+    log.info("Final val_loss: %.6f", results["final_val_loss"])
+    if results["final_train_loss"] is not None:
+        log.info("Final train_loss: %.6f", results["final_train_loss"])
+    log.info("Wallclock: %.1fs", results["wallclock_s"])
+
+    if check:
+        expected_file = Path("/root/electrai/tests/expected_values.json")
+        expected_values = json.loads(expected_file.read_text())
+        platform = results["platform"]
+        if platform not in expected_values:
+            raise ValueError(
+                f"No expected values for platform {platform!r}, "
+                f"available: {list(expected_values.keys())}"
+            )
+        expected_val_loss = expected_values[platform].get("final_val_loss")
+        if expected_val_loss is None:
+            raise ValueError(f"Expected values for {platform!r} are null")
+        diff = abs(results["final_val_loss"] - expected_val_loss)
+        tolerance = 0.001  # absolute tolerance on val_loss
+        if diff > tolerance:
+            raise AssertionError(
+                f"val_loss {results['final_val_loss']:.6f} differs from expected "
+                f"{expected_val_loss:.6f} by {diff:.6f} (tolerance: {tolerance})"
+            )
+        log.info(
+            "PASS: val_loss matches expected within tolerance (%.6f <= %f)",
+            diff,
+            tolerance,
+        )
+
+    return results
+
+
+@app.local_entrypoint()
+def main(epochs: int = 5, check: bool = True):
+    """Launch the remote e2e test and log its val_loss and wall-clock time locally."""
+    import logging
+
+    logging.basicConfig(level=logging.INFO)
+    outcome = run_e2e_test.remote(epochs=epochs, check=check)
+    val_loss, elapsed_s = outcome["final_val_loss"], outcome["wallclock_s"]
+    logging.getLogger(__name__).info(
+        "Results: val_loss=%.6f in %.1fs", val_loss, elapsed_s
+    )
diff --git a/modal/populate_volume.py b/modal/populate_volume.py
new file mode 100644
index 00000000..7309f894
--- /dev/null
+++ b/modal/populate_volume.py
@@ -0,0 +1,84 @@
+"""Populate Modal Volume with training data from S3."""
+
+from __future__ import annotations
+
+import modal
+
+app = modal.App("electrai-populate")
+volume = modal.Volume.from_name("electrai-data", create_if_missing=True)
+
+
+@app.function(
+    image=modal.Image.debian_slim(python_version="3.12").pip_install("boto3"),
+    volumes={"/data": volume},
+    secrets=[modal.Secret.from_name("aws-credentials")],
+    timeout=7200,
+    retries=0,
+)
+def sync_s3(
+    bucket: str = "openathena",
+    prefix: str = "electrai/mp/chg_datasets/dataset_4",
+    dest: str = "/data/mp/chg_datasets/dataset_4",
+):
+    """Copy objects under s3://bucket/prefix to ``dest``, skipping same-size files."""
+    import logging
+    from pathlib import Path
+
+    import boto3
+
+    log = logging.getLogger(__name__)
+    logging.basicConfig(level=logging.INFO)
+
+    s3 = boto3.client("s3")
+    paginator = s3.get_paginator("list_objects_v2")
+
+    # List objects under the prefix; keys ending in "/" are folder markers, not files.
+    objects = [
+        obj
+        for page in paginator.paginate(Bucket=bucket, Prefix=prefix)
+        for obj in page.get("Contents", [])
+        if not obj["Key"].endswith("/")
+    ]
+
+    total_bytes = sum(o["Size"] for o in objects)
+    log.info("Found %d objects, %.1f GiB total", len(objects), total_bytes / (1024**3))
+
+    downloaded = 0
+    skipped = 0
+    for i, obj in enumerate(objects, start=1):
+        key = obj["Key"]
+        rel = key[len(prefix) :].lstrip("/")
+        local_path = Path(dest) / rel
+        # A same-size file already at the destination counts as synced (no checksum).
+        if local_path.exists() and local_path.stat().st_size == obj["Size"]:
+            skipped += 1
+        else:
+            local_path.parent.mkdir(parents=True, exist_ok=True)
+            s3.download_file(bucket, key, str(local_path))
+            downloaded += 1
+
+        # Report every 100 objects, skipped ones included, so re-runs show progress.
+        if i % 100 == 0:
+            log.info(
+                "Progress: %d/%d (downloaded %d, skipped %d)",
+                i,
+                len(objects),
+                downloaded,
+                skipped,
+            )
+
+    log.info("Done: %d downloaded, %d skipped (already existed)", downloaded, skipped)
+    volume.commit()
+
+
+@app.local_entrypoint()
+def main(
+    bucket: str = "openathena",
+    prefix: str = "electrai/mp/chg_datasets/dataset_4",
+    dest: str = "/data/mp/chg_datasets/dataset_4",
+):
+    """Run ``sync_s3`` remotely with the given arguments, then log completion."""
+    import logging
+    logging.basicConfig(level=logging.INFO)
+    sync_s3.remote(bucket=bucket, prefix=prefix, dest=dest)
+    logging.getLogger(__name__).info("Volume populated.")
diff --git a/modal/train.py b/modal/train.py
new file mode 100644
index 00000000..881557d1
--- /dev/null
+++ b/modal/train.py
@@ -0,0 +1,215 @@
+"""Modal training entrypoint for electrai.
+
+Run real training experiments on Modal GPUs with data from the
+electrai-data Volume and checkpoints persisted to electrai-checkpoints.
+
+Usage:
+    # Default config (ResUNet, dataset_4, 50 epochs, L4)
+    modal run modal/train.py
+
+    # Custom config file
+    modal run modal/train.py --config examples/MP/experiments/experiment_0/config.yaml
+
+    # Override GPU, epochs, channels
+    modal run modal/train.py --gpu A100 --epochs 10 --channels 64
+
+    # Resume from last checkpoint
+    modal run modal/train.py --gpu A100 --resume
+"""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+import modal
+
+ROOT = Path(__file__).parent.parent
+
+# Persistent volumes
+data_volume = modal.Volume.from_name("electrai-data")
+ckpt_volume = modal.Volume.from_name("electrai-checkpoints", create_if_missing=True)
+
+# Dependencies read from pyproject.toml (shared with ci.py, populate_volume.py)
+image = (
+    modal.Image.debian_slim(python_version="3.12")
+    .apt_install("git")
+    .pip_install_from_pyproject(
+        str(ROOT / "pyproject.toml"), optional_dependencies=["dev"]
+    )
+    .add_local_dir(str(ROOT / "src"), remote_path="/root/electrai/src", copy=True)
+    .add_local_dir(
+        str(ROOT / "examples"), remote_path="/root/electrai/examples", copy=True
+    )
+    .add_local_file(
+        str(ROOT / "pyproject.toml"),
+        remote_path="/root/electrai/pyproject.toml",
+        copy=True,
+    )
+    .run_commands("cd /root/electrai && pip install --no-deps -e .")
+)
+
+app = modal.App("electrai-train", image=image)
+
+DATA_ROOT = "/data/mp/chg_datasets/dataset_4"
+CKPT_ROOT = "/checkpoints"
+
+
+@app.function(
+    gpu="L4",
+    volumes={"/data": data_volume, CKPT_ROOT: ckpt_volume},
+    secrets=[modal.Secret.from_name("wandb-credentials")],
+    timeout=14400,  # 4 hours
+    retries=0,
+)
+def train(config_json: str, gpu_type: str = "L4") -> None:
+    """Write ``config_json`` out as YAML and run the electrai ``train`` CLI on it."""
+    import json
+    import logging
+    import subprocess
+    import sys
+
+    import yaml
+
+    log = logging.getLogger(__name__)
+    logging.basicConfig(level=logging.INFO)
+
+    cfg = json.loads(config_json)
+
+    # The CLI below takes a config file path, so dump the decoded config as YAML
+    config_path = Path("/tmp/config.yaml")
+    with config_path.open("w") as f:
+        yaml.dump(cfg, f, default_flow_style=False)
+
+    log.info("GPU: %s", gpu_type)  # label only; it does not select the hardware
+    log.info("Data root: %s", DATA_ROOT)
+    log.info("Checkpoint dir: %s", CKPT_ROOT)
+
+    # Fail early if the data Volume has no filelist under the hard-coded DATA_ROOT
+    filelist = Path(DATA_ROOT) / "mp_filelist.txt"
+    if not filelist.exists():
+        raise FileNotFoundError(f"Filelist not found: {filelist}")
+    n_samples = len(filelist.read_text().strip().splitlines())
+    log.info("Dataset: %d samples", n_samples)
+
+    # Report whether last.ckpt exists; this function itself only logs the finding
+    ckpt_path = Path(cfg.get("ckpt_path", CKPT_ROOT))
+    last_ckpt = ckpt_path / "last.ckpt"
+    if last_ckpt.exists():
+        log.info("Found checkpoint: %s", last_ckpt)
+    else:
+        log.info("No checkpoint found, starting from scratch")
+
+    # Run the CLI as a child process; check=False defers failure handling to below
+    result = subprocess.run(
+        [
+            sys.executable,
+            "-m",
+            "electrai.entrypoints.main",
+            "train",
+            "--config",
+            str(config_path),
+        ],
+        cwd="/root/electrai",
+        check=False,
+    )
+
+    # Commit before raising so anything a failed run wrote to the volume is kept
+    ckpt_volume.commit()
+
+    if result.returncode != 0:
+        raise RuntimeError(f"Training failed with exit code {result.returncode}")
+
+    log.info("Training complete. Checkpoints saved to electrai-checkpoints volume.")
+
+
+@app.local_entrypoint()
+def main(
+    config: str = "",
+    gpu: str = "L4",
+    epochs: int = 50,
+    channels: int = 32,
+    residual_blocks: int = 1,
+    depth: int = 2,
+    kernel_size: int = 5,
+    lr: float = 0.01,
+    batch_size: int = 1,
+    val_frac: float = 0.005,
+    wandb_project: str = "mp-experiment",
+    resume: bool = False,
+):
+    """Build or load the config locally, then run ``train`` remotely on ``gpu``."""
+    import json
+    import logging
+
+    logging.basicConfig(level=logging.INFO)
+    log = logging.getLogger(__name__)
+
+    if config:
+        import subprocess
+        # Hand the path over as argv[1]; it must never be spliced into the code string.
+        snippet = (
+            "import sys, yaml, json; "
+            "print(json.dumps(yaml.safe_load(open(sys.argv[1]))))"
+        )
+        result = subprocess.run(
+            ["python3", "-c", snippet, config],
+            capture_output=True,
+            text=True,
+            check=True,
+        )
+        cfg = json.loads(result.stdout)
+    else:
+        cfg = {
+            "data": {
+                "_target_": "electrai.dataloader.dataset.RhoRead",
+                "root": f"{DATA_ROOT}/mp_filelist.txt",
+                "split_file": None,
+                "precision": "f32",
+                "batch_size": batch_size,
+                "train_workers": 4,
+                "val_workers": 2,
+                "pin_memory": False,
+                "val_frac": val_frac,
+                "drop_last": False,
+                "augmentation": False,
+                "random_seed": 42,
+            },
+            "model": {
+                "_target_": "electrai.model.resunet.ResUNet3D",
+                "in_channels": 1,
+                "out_channels": 1,
+                "n_channels": channels,
+                "n_residual_blocks": residual_blocks,
+                "kernel_size": kernel_size,
+                "depth": depth,
+                "use_checkpoint": False,
+            },
+            "precision": 32,
+            "epochs": epochs,
+            "lr": lr,
+            "weight_decay": 0.0,
+            "warmup_length": 1,
+            "beta1": 0.9,
+            "beta2": 0.99,
+            "wandb_mode": "online",
+            "entity": "PrinceOA",
+            "wb_pname": wandb_project,
+            "ckpt_path": CKPT_ROOT,
+        }
+
+    # Always override data root and checkpoint path for Modal
+    if "data" in cfg:
+        cfg["data"]["root"] = f"{DATA_ROOT}/mp_filelist.txt"
+    cfg["ckpt_path"] = CKPT_ROOT
+
+    # NOTE(review): --resume merely keeps an existing key; nothing here ever sets one.
+    if not resume:
+        cfg.pop("resume_from_checkpoint", None)
+
+    config_json = json.dumps(cfg, indent=2)
+    log.info("Config:\n%s", config_json)
+
+    train_fn = train if gpu == "L4" else train.with_options(gpu=gpu)
+
+    train_fn.remote(config_json=config_json, gpu_type=gpu)
+    log.info("Done.")
diff --git a/specs/modal-ci.md b/specs/modal-ci.md
new file mode 100644
index 00000000..dcfa408b
--- /dev/null
+++ b/specs/modal-ci.md
@@ -0,0 +1,177 @@
+# Modal GPU CI for electrai
+
+## Context
+
+electrai currently runs GPU e2e tests on EC2 via `ec2-gha` (`gpu-e2e.yml`). This spec adds a parallel Modal-based GPU CI workflow, following the helico pattern: run tests directly inside a Modal `@app.function` rather than provisioning a self-hosted runner.
+
+## Approach
+
+As in helico's `modal/ci.py`, we define a Modal app with the project's dependencies baked into the image, copy the source and tests in, and run `e2e_train.py` inside the function. The GHA workflow just calls `modal run` from `ubuntu-latest`.
+
+## Files to Create
+
+### 1. `modal/ci.py`
+
+```python
+from pathlib import Path
+import modal
+
+ROOT = Path(__file__).parent.parent
+
+image = (
+    modal.Image.debian_slim(python_version="3.12")
+    .apt_install("git")
+    .pip_install(
+        "torch>=2.9",
+        "torchvision>=0.24",
+        "lightning~=2.5",
+        "numpy~=2.3",
+        "scikit-learn>=1.7",
+        "pymatgen>=2025.10",
+        "pyyaml>=6.0",
+        "zarr>=3.1",
+        "hydra-core>=1.3",
+        "wandb>=0.12",
+        "click",
+    )
+    # Source code + tests + data (changes most, last layer)
+    .add_local_dir(str(ROOT / "src"), remote_path="/root/electrai/src")
+    .add_local_dir(str(ROOT / "tests"), remote_path="/root/electrai/tests")
+    .add_local_dir(str(ROOT / "scripts"), remote_path="/root/electrai/scripts")
+    .add_local_dir(str(ROOT / "data"), remote_path="/root/electrai/data")
+    .add_local_file(
+        str(ROOT / "pyproject.toml"), remote_path="/root/electrai/pyproject.toml"
+    )
+)
+
+app = modal.App("electrai-ci", image=image)
+
+
+@app.function(gpu="L4", timeout=600)
+def run_e2e_test(epochs: int = 5, check: bool = True):
+    """Run e2e training test on GPU."""
+    import subprocess
+
+    cmd = [
+        "python",
+        "scripts/e2e_train.py",
+        "--gpu",
+        "--verbose",
+        "--epochs",
+        str(epochs),
+    ]
+    if not check:
+        cmd.append("--no-check")
+    result = subprocess.run(cmd, cwd="/root/electrai", check=True)
+    return result.returncode
+
+
+@app.local_entrypoint()
+def main(epochs: int = 5, check: bool = True):
+    run_e2e_test.remote(epochs=epochs, check=check)
+```
+
+Notes:
+- GPU type: `L4` matches current EC2 `g6.xlarge` (also L4). Could use fallback list `gpu=["L4", "A10G"]`.
+- `data/MP/chgcars/` has the small test dataset (~5 samples) checked into the repo; bake it in.
+- `scripts/e2e_train.py` handles GPU detection, deterministic seeding, and expected value checking internally.
+- No `uv` needed inside the container — deps are pre-installed by `.pip_install()`.
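+- The `electrai` package under `src/` must be importable inside the container: either run `pip install -e .` or set `PYTHONPATH`. The simplest option is to add `.run_commands("cd /root/electrai && pip install -e .")` as the last image layer; because that build step runs after the `add_local_*` calls, those calls must then pass `copy=True`.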
+
+### 2. `.github/workflows/gpu-e2e-modal.yml`
+
+```yaml
+name: GPU E2E (Modal)
+
+on:
+  pull_request:
+    branches: [main]
+  workflow_dispatch:
+    inputs:
+      epochs:
+        description: 'Number of training epochs'
+        default: '5'
+        type: string
+
+jobs:
+  test:
+    runs-on: ubuntu-latest
+    env:
+      MODAL_TOKEN_ID: ${{ secrets.MODAL_TOKEN_ID }}
+      MODAL_TOKEN_SECRET: ${{ secrets.MODAL_TOKEN_SECRET }}
+    steps:
+      - uses: actions/checkout@v4
+      - uses: astral-sh/setup-uv@v5
+      - uses: actions/setup-python@v5
+        with:
+          python-version: "3.12"
+      - run: uv pip install --system modal
+      - run: modal run modal/ci.py --epochs ${{ inputs.epochs || '5' }}
+```
+
+## What this does NOT replicate from `gpu-e2e.yml`
+
+- **CPU baseline test** — could add a second `@app.function(gpu=None)` call, but CPU tests already run in `gen-expected.yml`
+- **`update_expected` mode** — would need to get the file back out of Modal (print to stdout and capture, or use a Modal Volume). Defer for now.
+- **Artifact upload** — same challenge; not needed for the basic CI pass/fail
+
+## Training on Modal (`modal/train.py`)
+
+Full training entrypoint for running real experiments on Modal, replacing Lambda Labs.
+
+### Data: `electrai-data` Volume
+
+`dataset_4` (2,885 samples, ~205 GiB) synced from Della (Globus source of truth) → S3 → Modal Volume:
+- S3: `s3://openathena/electrai/mp/chg_datasets/dataset_4/`
+- Volume mount: `/data/mp/chg_datasets/dataset_4/{data,label}/`
+
+Populate script: `modal/populate_volume.py` (S3 → Volume via `boto3`).
+
+### Checkpoints: `electrai-checkpoints` Volume
+
+Persists across runs. Mounted at `/checkpoints`.
+
+### Usage
+
+```bash
+# Default: ResUNet, dataset_4, 50 epochs, L4
+modal run modal/train.py
+
+# A100, custom hyperparams
+modal run modal/train.py --gpu A100 --channels 64 --epochs 50
+
+# Use existing config file
+modal run modal/train.py --config path/to/config.yaml --gpu A100
+```
+
+### Data provenance
+
+```
+Globus (ROSENGROUP share)
+  └── /mp/chg_datasets/dataset_4/   (canonical, on Della)
+        ├── Della → S3 (aws s3 sync, one-time)
+        │     └── s3://openathena/electrai/mp/chg_datasets/dataset_4/
+        │           └── S3 → Modal Volume (modal/populate_volume.py)
+        └── Della → Lambda LLFS (Globus transfer, Betsy's prior setup)
+              └── /home/ubuntu/betsy-electrai-2/dataset2/
+```
+
+## Secrets required
+
+- `MODAL_TOKEN_ID` / `MODAL_TOKEN_SECRET` — repo secrets for GHA workflow
+- `wandb-credentials` — Modal secret with `WANDB_API_KEY` (for training)
+- `aws-credentials` — Modal secret with AWS creds (for `populate_volume.py`)
+
+No `GH_SA_TOKEN` needed (no runner registration).
+
+## Comparison: EC2 vs Modal for this workload
+
+| | EC2 (`gpu-e2e.yml`) | Modal (`gpu-e2e-modal.yml`) |
+|---|---|---|
+| Setup time | ~3-5 min (instance boot) | ~30s (image cached) |
+| GPU | L4 (g6.xlarge) | L4 |
+| Deps install | `uv sync` on every run | Baked into image (cached) |
+| Checkout | `actions/checkout` | `.add_local_dir()` |
+| Artifacts | Native GHA | Not supported (print to stdout) |
+| Cost | EC2 on-demand pricing | Modal per-second billing |
+| Extra secrets | `GH_SA_TOKEN`, AWS OIDC | Modal tokens only |
diff --git a/src/electrai/dataloader/dataset.py b/src/electrai/dataloader/dataset.py
index 1999c0f9..5b9b2353 100644
--- a/src/electrai/dataloader/dataset.py
+++ b/src/electrai/dataloader/dataset.py
@@ -103,7 +103,7 @@ def __init__(self, datapath: str, precision: str, augmentation: bool, **kwargs):
         if isinstance(datapath, str) and Path(datapath).is_file():
             with Path(datapath).open() as f:
                 lines = f.readlines()
-            member_list = [line.replace("\n", "") for line in lines]
+            member_list = [line.strip() for line in lines if line.strip()]
         else:
             raise ValueError("No filename found.")