PrimeIntellect-ai · mikasenghaas · Jun 19, 2026 · Jun 19, 2026
diff --git a/bench/persistent_runtime.py b/bench/persistent_runtime.py
@@ -0,0 +1,119 @@
+"""Benchmark the persistent runtime + warm scoring against the per-rollout default.
+
+Isolates the runtime/scoring overhead (no model generation, which would mask it): runs gsm8k's
+`verify.py` N times at a fixed concurrency, both ways, through the real framework API.
+
+  ephemeral:  make_runtime + start() + run_uv_script(cold) + stop() per call
+              — exactly today's per-rollout scoring path.
+  persistent: RuntimePool.acquire() + run_uv_script(warm=True) + release() per call
+              — runtimes (and their warm workers) reused; the math_verify import is paid once
+              per pooled runtime, not per call.
+
+Usage: uv run python bench/persistent_runtime.py [N=1000] [CONCURRENCY=128]
+"""
+
+import asyncio
+import sys
+import time
+from pathlib import Path
+
+from verifiers.v1.runtimes import RuntimePool, SubprocessConfig, make_runtime
+
+# Scorer source, read once at import so file I/O stays outside the timed regions.
+VERIFY = (
+    Path(__file__).resolve().parents[1] / "environments/gsm8k_v1/gsm8k_v1/verify.py"
+).read_bytes()
+# Fixed gold/prediction pair that both modes assert scores 1.0.
+GOLD, PRED = "42", "#### 42"
+
+
+async def ephemeral(n: int, concurrency: int) -> float:
+    """Time `n` cold scorings, each on a runtime started and stopped for that one call.
+
+    At most `concurrency` calls are in flight at once. Returns the elapsed seconds.
+    """
+    sem = asyncio.Semaphore(concurrency)
+
+    async def one() -> None:
+        async with sem:
+            rt = make_runtime(SubprocessConfig())
+            await rt.start()
+            try:
+                r = await rt.run_uv_script(VERIFY, args=[GOLD, PRED])
+                # NOTE(review): suffix match here vs. exact match in `persistent` — confirm
+                # whether cold stdout can carry a prefix before tightening this check.
+                assert r.stdout.strip().endswith("1.0"), r
+            finally:
+                await rt.stop()
+
+    # perf_counter is monotonic; time.time() follows the wall clock, which can be adjusted.
+    t0 = time.perf_counter()
+    await asyncio.gather(*(one() for _ in range(n)))
+    return time.perf_counter() - t0
+
+
+async def persistent(n: int, concurrency: int) -> float:
+    """Time `n` warm scorings on runtimes acquired from, and released to, one `RuntimePool`.
+
+    At most `concurrency` calls are in flight at once. Returns the elapsed seconds; the clock
+    is read before the pool's context manager exits, so pool teardown is not included.
+    """
+    sem = asyncio.Semaphore(concurrency)
+    cfg = SubprocessConfig(persistent=True)
+
+    async def one(pool: RuntimePool) -> None:
+        async with sem:
+            rt = await pool.acquire(cfg)
+            try:
+                r = await rt.run_uv_script(VERIFY, args=[GOLD, PRED], warm=True)
+                assert r.stdout.strip() == "1.0", r
+            finally:
+                await pool.release(rt)
+
+    async with RuntimePool() as pool:
+        # monotonic clock: unaffected by wall-clock adjustments during the run
+        t0 = time.perf_counter()
+        await asyncio.gather(*(one(pool) for _ in range(n)))
+        return time.perf_counter() - t0
+
+
+async def main(n: int, concurrency: int) -> None:
+    """Run both modes back to back and print totals, per-call latency, and the speedup.
+
+    Raises:
+        ValueError: if `n` or `concurrency` is below 1.
+    """
+    # n=0 would divide by zero in the report; Semaphore(0) would leave every call waiting.
+    if n < 1 or concurrency < 1:
+        raise ValueError(
+            f"n and concurrency must be >= 1 (got n={n}, concurrency={concurrency})"
+        )
+
+    # warm the class-level interpreter cache once so neither mode pays uv-resolve
+    rt = make_runtime(SubprocessConfig())
+    await rt.start()
+    try:
+        await rt.run_uv_script(VERIFY, args=[GOLD, PRED])
+    finally:
+        # stop the warm-up runtime even when the warm-up script fails
+        await rt.stop()
+
+    ephemeral_dt = await ephemeral(n, concurrency)
+    persistent_dt = await persistent(n, concurrency)
+
+    print(f"gsm8k verify scoring — n={n} concurrency={concurrency}")
+    print(
+        f"  ephemeral (runtime + import per call): {ephemeral_dt:7.2f}s  ({1000 * ephemeral_dt / n:6.2f} ms/call)"
+    )
+    print(
+        f"  persistent (pool + warm worker):       {persistent_dt:7.2f}s  ({1000 * persistent_dt / n:6.2f} ms/call)"
+    )
+    print(
+        f"  speedup:                               {ephemeral_dt / persistent_dt:6.1f}x"
+    )
+
+
+if __name__ == "__main__":
+    n = int(sys.argv[1]) if len(sys.argv) > 1 else 1000
+    concurrency = int(sys.argv[2]) if len(sys.argv) > 2 else 128
+    asyncio.run(main(n, concurrency))
diff --git a/environments/aime24_v1/aime24_v1/taskset.py b/environments/aime24_v1/aime24_v1/taskset.py
@@ -59,9 +59,12 @@ async def correct(
         prediction = (
             trace.assistant_messages[-1].content if trace.assistant_messages else ""
         )
+        # warm=True: on a persistent runtime, reuse a warm worker that imports math_verify once
+        # instead of execing a fresh interpreter per rollout (no-op on an ephemeral runtime).
         result = await runtime.run_uv_script(
             VERIFY,
             args=[task.answer, prediction or "", str(self.config.math_verify_timeout)],
+            warm=True,
         )
         if result.exit_code != 0:
             raise RuntimeError(f"verify.py failed: {result.stderr.strip()[-500:]}")

diff --git a/environments/aime24_v1/aime24_v1/verify.py b/environments/aime24_v1/aime24_v1/verify.py
@@ -1,24 +1,22 @@
 # /// script
 # dependencies = ["math-verify"]
 # ///
-"""Score one math answer by math-verify equivalence of the model's boxed answer vs the
-gold, run inside the rollout's runtime via `uv run`. uv installs `math-verify` into its
-own cache here — the dependency never touches the eval process. Takes the gold answer
-(argv[1]), the model's prediction (argv[2]), and a timeout in seconds (argv[3]); prints
-1.0 if they're equivalent, else 0.0.
+"""Score one math answer by math-verify equivalence of the model's boxed answer vs the gold,
+run inside the rollout's runtime via `uv run` (or a warm worker). uv installs `math-verify` into
+its own cache here — the dependency never touches the eval process. `main(argv)` takes the gold
+answer (argv[0]), the model's prediction (argv[1]), and a timeout in seconds (argv[2]); returns
+"1.0" if they're equivalent, else "0.0".
+
+Exposing `main(argv) -> str` (plus the `__main__` footer) lets the runtime keep this as a warm
+worker — `import math_verify` paid once, not per call (see `Runtime.run_uv_script(warm=True)`) —
+while staying `uv run verify.py <gold> <pred> <timeout>`-able cold. `main` must `return` (never
+`sys.exit`, which would kill a reused worker).
 """
 
 import sys
 
 from math_verify import parse, verify
 
-gold, pred, timeout = sys.argv[1], sys.argv[2], int(sys.argv[3])
-
-if "<think>" in pred and "</think>" not in pred:
-    print(0.0)
-    sys.exit(0)
-pred = pred.split("</think>")[-1]
-
 
 def extract_boxed(text: str) -> str:
     """Content of the last ``\\boxed{...}`` in ``text``, or "" if there is none."""
@@ -32,20 +30,28 @@ def extract_boxed(text: str) -> str:
     return text[start + len("\\boxed{") : i - 1] if depth == 0 else ""
 
 
-answer = extract_boxed(pred)
-if not answer:
-    print(0.0)
-    sys.exit(0)
-try:
-    score = (
-        1.0
-        if verify(
-            parse("\\boxed{" + gold + "}", parsing_timeout=timeout),
-            parse("\\boxed{" + answer + "}", parsing_timeout=timeout),
-            timeout_seconds=timeout,
+def main(argv: list[str]) -> str:
+    """Return "1.0" if the last boxed answer in argv[1] is equivalent to argv[0], else "0.0"."""
+    gold, pred, timeout = argv[0], argv[1], int(argv[2])
+    # An opened-but-unclosed <think> block scores zero outright.
+    if "<think>" in pred and "</think>" not in pred:
+        return "0.0"
+    # Only the text after the last closing think tag is searched for a boxed answer.
+    answer = extract_boxed(pred.split("</think>")[-1])
+    if not answer:
+        return "0.0"
+    try:
+        # Both sides are boxed before parsing; `timeout` is handed to each parse and to the
+        # comparison itself.
+        matched = verify(
+            parse("\\boxed{" + gold + "}", parsing_timeout=timeout),
+            parse("\\boxed{" + answer + "}", parsing_timeout=timeout),
+            timeout_seconds=timeout,
         )
-        else 0.0
-    )
-except Exception:
-    score = 0.0
-print(score)
+    except Exception:
+        return "0.0"
+    return "1.0" if matched else "0.0"
+
+
+if __name__ == "__main__":
+    print(main(sys.argv[1:]))
diff --git a/environments/gsm8k_v1/gsm8k_v1/taskset.py b/environments/gsm8k_v1/gsm8k_v1/taskset.py
@@ -50,8 +50,10 @@ async def correct(
         prediction = (
             trace.assistant_messages[-1].content if trace.assistant_messages else ""
         )
+        # warm=True: on a persistent runtime, reuse a warm worker that imports math_verify once
+        # instead of execing a fresh interpreter per rollout (no-op on an ephemeral runtime).
         result = await runtime.run_uv_script(
-            VERIFY, args=[task.answer, prediction or ""]
+            VERIFY, args=[task.answer, prediction or ""], warm=True
         )
         if result.exit_code != 0:
             raise RuntimeError(f"verify.py failed: {result.stderr.strip()[-500:]}")

diff --git a/environments/gsm8k_v1/gsm8k_v1/verify.py b/environments/gsm8k_v1/gsm8k_v1/verify.py
@@ -1,31 +1,42 @@
 # /// script
 # dependencies = ["math-verify"]
 # ///
-"""Score one GSM8K answer, run inside the rollout's runtime via `uv run`.
+"""Score one GSM8K answer, run inside the rollout's runtime via `uv run` (or a warm worker).
 
-uv installs `math-verify` into its own cache here — the dependency never touches
-the eval process. Takes the gold answer (`argv[1]`) and the model's prediction
-(`argv[2]`) and prints 1.0 if they match the same number, else 0.0.
+uv installs `math-verify` into its own cache here — the dependency never touches the eval
+process. `main(argv)` takes the gold answer (`argv[0]`) and the model's prediction (`argv[1]`)
+and returns "1.0" if they match the same number, else "0.0".
 
-The model is asked for its answer after '####'. Both the gold and the prediction
-are wrapped in \\boxed{} before math-verify (matching the math-env scorer) so
-parsing is robust; a malformed prediction fails to verify rather than crashing.
+Exposing `main(argv) -> str` (plus the `__main__` footer) lets the runtime keep this as a warm
+worker — `import math_verify` is paid once, not per call (see `Runtime.run_uv_script(warm=True)`)
+— while staying `uv run verify.py <gold> <pred>`-able cold. The model is asked for its answer
+after '####'. Both the gold and the prediction are wrapped in \\boxed{} before math-verify
+(matching the math-env scorer) so parsing is robust; a malformed prediction fails to verify
+rather than crashing.
 """
 
 import re
 import sys
 
 from math_verify import parse, verify
 
-gold, pred = sys.argv[1], sys.argv[2]
-matches = re.findall(r"####\s*(.+)", pred)
-prediction = matches[-1].strip() if matches else pred
-try:
-    score = (
-        1.0
-        if verify(parse("\\boxed{" + gold + "}"), parse("\\boxed{" + prediction + "}"))
-        else 0.0
-    )
-except Exception:
-    score = 0.0
-print(score)
+
+def main(argv: list[str]) -> str:
+    """Return "1.0" if the prediction's number matches the gold answer, else "0.0"."""
+    gold, pred = argv[0], argv[1]
+    # Score the text after the last '####' marker; fall back to the whole prediction.
+    tagged = re.findall(r"####\s*(.+)", pred)
+    candidate = tagged[-1].strip() if tagged else pred
+    try:
+        # Box both sides so math-verify receives them in the same form.
+        gold_expr = parse("\\boxed{" + gold + "}")
+        candidate_expr = parse("\\boxed{" + candidate + "}")
+        matched = verify(gold_expr, candidate_expr)
+    except Exception:
+        # Any parse/verify failure scores zero instead of crashing the scorer.
+        return "0.0"
+    return "1.0" if matched else "0.0"
+
+
+if __name__ == "__main__":
+    print(main(sys.argv[1:]))
diff --git a/environments/math_env_v1/math_env_v1/taskset.py b/environments/math_env_v1/math_env_v1/taskset.py
@@ -63,9 +63,12 @@ async def correct(
         prediction = (
             trace.assistant_messages[-1].content if trace.assistant_messages else ""
         )
+        # warm=True: on a persistent runtime, reuse a warm worker that imports math_verify once
+        # instead of execing a fresh interpreter per rollout (no-op on an ephemeral runtime).
         result = await runtime.run_uv_script(
             VERIFY,
             args=[task.answer, prediction or "", str(self.config.math_verify_timeout)],
+            warm=True,
         )
         if result.exit_code != 0:
             raise RuntimeError(f"verify.py failed: {result.stderr.strip()[-500:]}")

diff --git a/environments/math_env_v1/math_env_v1/verify.py b/environments/math_env_v1/math_env_v1/verify.py
@@ -1,24 +1,22 @@
 # /// script
 # dependencies = ["math-verify"]
 # ///
-"""Score one math answer by math-verify equivalence of the model's boxed answer vs the
-gold, run inside the rollout's runtime via `uv run`. uv installs `math-verify` into its
-own cache here — the dependency never touches the eval process. Takes the gold answer
-(argv[1]), the model's prediction (argv[2]), and a timeout in seconds (argv[3]); prints
-1.0 if they're equivalent, else 0.0.
+"""Score one math answer by math-verify equivalence of the model's boxed answer vs the gold,
+run inside the rollout's runtime via `uv run` (or a warm worker). uv installs `math-verify` into
+its own cache here — the dependency never touches the eval process. `main(argv)` takes the gold
+answer (argv[0]), the model's prediction (argv[1]), and a timeout in seconds (argv[2]); returns
+"1.0" if they're equivalent, else "0.0".
+
+Exposing `main(argv) -> str` (plus the `__main__` footer) lets the runtime keep this as a warm
+worker — `import math_verify` paid once, not per call (see `Runtime.run_uv_script(warm=True)`) —
+while staying `uv run verify.py <gold> <pred> <timeout>`-able cold. `main` must `return` (never
+`sys.exit`, which would kill a reused worker).
 """
 
 import sys
 
 from math_verify import parse, verify
 
-gold, pred, timeout = sys.argv[1], sys.argv[2], int(sys.argv[3])
-
-if "<think>" in pred and "</think>" not in pred:
-    print(0.0)
-    sys.exit(0)
-pred = pred.split("</think>")[-1]
-
 
 def extract_boxed(text: str) -> str:
     """Content of the last ``\\boxed{...}`` in ``text``, or "" if there is none."""
@@ -32,20 +30,28 @@ def extract_boxed(text: str) -> str:
     return text[start + len("\\boxed{") : i - 1] if depth == 0 else ""
 
 
-answer = extract_boxed(pred)
-if not answer:
-    print(0.0)
-    sys.exit(0)
-try:
-    score = (
-        1.0
-        if verify(
-            parse("\\boxed{" + gold + "}", parsing_timeout=timeout),
-            parse("\\boxed{" + answer + "}", parsing_timeout=timeout),
-            timeout_seconds=timeout,
+def main(argv: list[str]) -> str:
+    """Return "1.0" if the last boxed answer in argv[1] is equivalent to argv[0], else "0.0"."""
+    gold, pred, timeout = argv[0], argv[1], int(argv[2])
+    # An opened-but-unclosed <think> block scores zero outright.
+    if "<think>" in pred and "</think>" not in pred:
+        return "0.0"
+    # Only the text after the last closing think tag is searched for a boxed answer.
+    answer = extract_boxed(pred.split("</think>")[-1])
+    if not answer:
+        return "0.0"
+    try:
+        # Both sides are boxed before parsing; `timeout` is handed to each parse and to the
+        # comparison itself.
+        matched = verify(
+            parse("\\boxed{" + gold + "}", parsing_timeout=timeout),
+            parse("\\boxed{" + answer + "}", parsing_timeout=timeout),
+            timeout_seconds=timeout,
         )
-        else 0.0
-    )
-except Exception:
-    score = 0.0
-print(score)
+    except Exception:
+        return "0.0"
+    return "1.0" if matched else "0.0"
+
+
+if __name__ == "__main__":
+    print(main(sys.argv[1:]))
diff --git a/verifiers/v1/GUIDE.md b/verifiers/v1/GUIDE.md
@@ -224,6 +224,25 @@ async def verified(self, task, trace, runtime) -> float:
     return float(r.stdout.strip() == "1.0")
 ```
 
+By default each `run_uv_script` execs a fresh interpreter, so a heavy import (`math-verify` pulls
+in sympy, ~0.2s) is paid every rollout. Pass `warm=True` to instead route to a long-lived worker
+that imports once and answers many calls — provided the script exposes `main(argv) -> str` (its
+result is the call's stdout) and stays `uv run`-able cold via an
+`if __name__ == "__main__": print(main(sys.argv[1:]))` footer:
+
+```python
+r = await runtime.run_uv_script(VERIFY, args=[...], warm=True)
+```
+
+The worker lives as long as the runtime, so warm only pays off on a **persistent** runtime
+(`--harness.runtime.persistent true`): runtimes are then taken from an eval/train-level pool and
+reused across rollouts (workspace `reset` between uses, torn down only at the end), so the import
+is paid once per pooled runtime instead of per rollout — ~12x faster gsm8k scoring at 1000
+rollouts / 128 concurrency, more as the run gets longer (`bench/persistent_runtime.py`). On an
+ephemeral runtime `warm=True` is a no-op (runs cold). Currently only the subprocess runtime
+supports warm workers; persistence itself works on every runtime (and on remote runtimes it
+also skips the per-rollout sandbox+tunnel provisioning).
+
 ## Stop conditions
 
 A rollout ends when the harness finishes, a framework budget trips (`--max-turns`, token caps), or
@@ -265,7 +284,7 @@ isolated environment. On a `runtime` you can call:
 | method | what |
 | --- | --- |
 | `run(argv, env)` | exec a command to completion → `ProgramResult(exit_code, stdout, stderr)` |
-| `run_uv_script(src, args, env)` | run a PEP 723 script (inline deps resolve in-runtime); `args` are shell-`"$@"`-safe |
+| `run_uv_script(src, args, env, warm=False)` | run a PEP 723 script (inline deps resolve in-runtime); `args` are shell-`"$@"`-safe. `warm=True` reuses a long-lived worker (import once) on a persistent runtime — see [In-runtime scoring](#in-runtime-scoring) |
 | `run_background(argv, env, log)` | start a long-lived process (e.g. a colocated server) |
 | `read(path)` / `write(path, data)` | workspace files (bytes), across the container/sandbox boundary |
 | `expose(port)` | publish a port *inside* the runtime to a host-reachable URL (`None` when local) |